diff --git a/go.mod b/go.mod
index 16036b725ca..856f9a14b0e 100644
--- a/go.mod
+++ b/go.mod
@@ -51,7 +51,7 @@ require (
 	github.com/spf13/afero v1.11.0
 	github.com/stretchr/testify v1.10.0
 	github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97
-	github.com/thanos-io/promql-engine v0.0.0-20250220213456-fab1185f8c6c
+	github.com/thanos-io/promql-engine v0.0.0-20250302135832-accbf0891a16
 	github.com/thanos-io/thanos v0.37.3-0.20250212101700-346d18bb0f80
 	github.com/uber/jaeger-client-go v2.30.0+incompatible
 	github.com/weaveworks/common v0.0.0-20230728070032-dd9e68f319d5
diff --git a/go.sum b/go.sum
index b943cc8638d..efa1dc38805 100644
--- a/go.sum
+++ b/go.sum
@@ -1687,8 +1687,8 @@ github.com/thanos-community/galaxycache v0.0.0-20211122094458-3a32041a1f1e h1:f1
 github.com/thanos-community/galaxycache v0.0.0-20211122094458-3a32041a1f1e/go.mod h1:jXcofnrSln/cLI6/dhlBxPQZEEQHVPCcFaH75M+nSzM=
 github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97 h1:VjG0mwhN1DkncwDHFvrpd12/2TLfgYNRmEQA48ikp+0=
 github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97/go.mod h1:vyzFrBXgP+fGNG2FopEGWOO/zrIuoy7zt3LpLeezRsw=
-github.com/thanos-io/promql-engine v0.0.0-20250220213456-fab1185f8c6c h1:STCm5S4Aht3hOR0WQ0B3daZv21GQC13uPYIfkcN762U=
-github.com/thanos-io/promql-engine v0.0.0-20250220213456-fab1185f8c6c/go.mod h1:aHSV5hL94fNb7PklN9L0V10j+/RGIlzqbw7OLdNgZFs=
+github.com/thanos-io/promql-engine v0.0.0-20250302135832-accbf0891a16 h1:ezd8hNCWiGQr4kdfCHFa0VCSi+LAO/28Mna264nDs2c=
+github.com/thanos-io/promql-engine v0.0.0-20250302135832-accbf0891a16/go.mod h1:aHSV5hL94fNb7PklN9L0V10j+/RGIlzqbw7OLdNgZFs=
 github.com/thanos-io/thanos v0.37.3-0.20250212101700-346d18bb0f80 h1:mOCRYn9SLBWJCXAdP+qDfgZDc0eqDxDc2HZGKTZ5vzk=
 github.com/thanos-io/thanos v0.37.3-0.20250212101700-346d18bb0f80/go.mod h1:Y7D8la8B5rpzRVKq2HCR4hbYZ4LGroSPqIJjtizgQg8=
 github.com/tjhop/slog-gokit v0.1.2 h1:pmQI4SvU9h4gA0vIQsdhJQSqQg4mOmsPykG2/PM3j1I=
diff --git a/pkg/querier/engine_factory.go b/pkg/querier/engine_factory.go
new file mode 100644
index 00000000000..c1060ef766f
--- /dev/null
+++ b/pkg/querier/engine_factory.go
@@ -0,0 +1,88 @@
+package querier
+
+import (
+	"context"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+	"github.com/prometheus/prometheus/promql"
+	"github.com/prometheus/prometheus/storage"
+	"github.com/thanos-io/promql-engine/engine"
+	"github.com/thanos-io/promql-engine/logicalplan"
+)
+
+// EngineFactory creates PromQL queries, trying the Thanos engine first when one is configured.
+type EngineFactory struct {
+	prometheusEngine     *promql.Engine     // always set; used directly or as the fallback
+	thanosEngine         *engine.Engine     // nil unless the Thanos engine was enabled
+	fallbackQueriesTotal prometheus.Counter // queries handed over to the Prometheus engine
+}
+
+// NewEngineFactory returns an EngineFactory whose Prometheus engine is built from opts.
+// When enableThanosEngine is true, a Thanos engine is built from the same opts with all
+// logical optimizers and query analysis enabled. The fallback counter is created with reg.
+func NewEngineFactory(opts promql.EngineOpts, enableThanosEngine bool, reg prometheus.Registerer) *EngineFactory {
+	factory := &EngineFactory{prometheusEngine: promql.NewEngine(opts)}
+
+	// The Thanos engine is optional; thanosEngine stays nil when it is disabled.
+	if enableThanosEngine {
+		factory.thanosEngine = engine.New(engine.Opts{
+			EngineOpts:        opts,
+			LogicalOptimizers: logicalplan.AllOptimizers,
+			EnableAnalysis:    true,
+		})
+	}
+
+	factory.fallbackQueriesTotal = promauto.With(reg).NewCounter(prometheus.CounterOpts{
+		Name: "cortex_thanos_engine_fallback_queries_total",
+		Help: "Total number of fallback queries due to not implementation in thanos engine",
+	})
+	return factory
+}
+
+// NewInstantQuery builds an instant query for qs evaluated at ts. When the Thanos engine
+// is enabled it is tried first; if it reports the expression as unimplemented, the
+// fallback counter is incremented and the Prometheus engine builds the query instead.
+func (qf *EngineFactory) NewInstantQuery(ctx context.Context, q storage.Queryable, opts promql.QueryOpts, qs string, ts time.Time) (promql.Query, error) {
+	if qf.thanosEngine != nil {
+		query, err := qf.thanosEngine.MakeInstantQuery(ctx, q, fromPromQLOpts(opts), qs, ts)
+		if err == nil {
+			return query, nil
+		}
+		if !engine.IsUnimplemented(err) {
+			return nil, err
+		}
+		// Unimplemented in the Thanos engine: count it and use the Prometheus engine.
+		qf.fallbackQueriesTotal.Inc()
+	}
+	return qf.prometheusEngine.NewInstantQuery(ctx, q, opts, qs, ts)
+}
+
+// NewRangeQuery builds a range query for qs from start to end at the given interval. When
+// the Thanos engine is enabled it is tried first; if it reports the expression as
+// unimplemented, the fallback counter is incremented and the Prometheus engine is used.
+func (qf *EngineFactory) NewRangeQuery(ctx context.Context, q storage.Queryable, opts promql.QueryOpts, qs string, start, end time.Time, interval time.Duration) (promql.Query, error) {
+	if qf.thanosEngine != nil {
+		query, err := qf.thanosEngine.MakeRangeQuery(ctx, q, fromPromQLOpts(opts), qs, start, end, interval)
+		if err == nil {
+			return query, nil
+		}
+		if !engine.IsUnimplemented(err) {
+			return nil, err
+		}
+		// Unimplemented in the Thanos engine: count it and use the Prometheus engine.
+		qf.fallbackQueriesTotal.Inc()
+	}
+	return qf.prometheusEngine.NewRangeQuery(ctx, q, opts, qs, start, end, interval)
+}
+
+// fromPromQLOpts converts Prometheus query options to Thanos engine options; nil yields empty options.
+func fromPromQLOpts(opts promql.QueryOpts) *engine.QueryOpts {
+	converted := &engine.QueryOpts{}
+	if opts != nil {
+		converted.LookbackDeltaParam = opts.LookbackDelta()
+		converted.EnablePerStepStatsParam = opts.EnablePerStepStats()
+	}
+	return converted
+}
diff --git a/pkg/querier/engine_factory_test.go b/pkg/querier/engine_factory_test.go
new file mode 100644
index 00000000000..b15e155085f
--- /dev/null
+++ b/pkg/querier/engine_factory_test.go
@@ -0,0 +1,59 @@
+package querier
+
+import (
+	"bytes"
+	"context"
+	"testing"
+	"time"
+
+	"github.com/go-kit/log"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/prometheus/prometheus/promql/parser"
+	"github.com/stretchr/testify/require"
+
+	"github.com/cortexproject/cortex/pkg/util/flagext"
+	"github.com/cortexproject/cortex/pkg/util/validation"
+)
+
+// TestEngineFactory_Fallback checks that the fallback counter grows by one per instant and range query.
+func TestEngineFactory_Fallback(t *testing.T) {
+	// Register a function with no engine implementation; undo the global map change afterwards.
+	parser.Functions["unimplemented"] = &parser.Function{
+		Name:       "unimplemented",
+		ArgTypes:   []parser.ValueType{parser.ValueTypeVector},
+		ReturnType: parser.ValueTypeVector,
+	}
+	t.Cleanup(func() { delete(parser.Functions, "unimplemented") })
+
+	cfg := Config{}
+	flagext.DefaultValues(&cfg)
+	cfg.ThanosEngine = true
+	ctx := context.Background()
+	reg := prometheus.NewRegistry()
+	chunkStore := &emptyChunkStore{}
+	distributor := &errDistributor{}
+	overrides, err := validation.NewOverrides(DefaultLimitsConfig(), nil)
+	require.NoError(t, err)
+
+	now := time.Now()
+	start := now.Add(-5 * time.Minute)
+	step := time.Minute
+	queryable, _, queryEngine := New(cfg, overrides, distributor, []QueryableWithFilter{UseAlwaysQueryable(NewMockStoreQueryable(chunkStore))}, reg, log.NewNopLogger(), nil)
+
+	// instant query, should go to fallback; only the counter is asserted, so the result is ignored
+	_, _ = queryEngine.NewInstantQuery(ctx, queryable, nil, "unimplemented(foo)", now)
+	require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
+		# HELP cortex_thanos_engine_fallback_queries_total Total number of fallback queries due to not implementation in thanos engine
+		# TYPE cortex_thanos_engine_fallback_queries_total counter
+		cortex_thanos_engine_fallback_queries_total 1
+	`), "cortex_thanos_engine_fallback_queries_total"))
+
+	// range query, should go to fallback; only the counter is asserted, so the result is ignored
+	_, _ = queryEngine.NewRangeQuery(ctx, queryable, nil, "unimplemented(foo)", start, now, step)
+	require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
+		# HELP cortex_thanos_engine_fallback_queries_total Total number of fallback queries due to not implementation in thanos engine
+		# TYPE cortex_thanos_engine_fallback_queries_total counter
+		cortex_thanos_engine_fallback_queries_total 2
+	`), "cortex_thanos_engine_fallback_queries_total"))
+}
diff --git a/pkg/querier/querier.go b/pkg/querier/querier.go
index 2915f01fb1f..277f9903e24 100644
--- a/pkg/querier/querier.go
+++ b/pkg/querier/querier.go
@@ -19,8 +19,6 @@ import (
 	"github.com/prometheus/prometheus/promql/parser"
 	"github.com/prometheus/prometheus/storage"
 	"github.com/prometheus/prometheus/util/annotations"
-	"github.com/thanos-io/promql-engine/engine"
-	"github.com/thanos-io/promql-engine/logicalplan"
 	"github.com/thanos-io/thanos/pkg/strutil"
 	"golang.org/x/sync/errgroup"
 
@@ -208,7 +206,6 @@ func New(cfg Config, limits *validation.Overrides, distributor Distributor, stor
 	// The cortex supports holt_winters for users using this function.
 	EnableExperimentalPromQLFunctions(cfg.EnablePromQLExperimentalFunctions, true)
 
-	var queryEngine promql.QueryEngine
 	opts := promql.EngineOpts{
 		Logger:               util_log.GoKitLogToSlog(logger),
 		Reg:                  reg,
@@ -223,15 +220,7 @@ func New(cfg Config, limits *validation.Overrides, distributor Distributor, stor
 			return cfg.DefaultEvaluationInterval.Milliseconds()
 		},
 	}
-	if cfg.ThanosEngine {
-		queryEngine = engine.New(engine.Opts{
-			EngineOpts:        opts,
-			LogicalOptimizers: logicalplan.AllOptimizers,
-			EnableAnalysis:    true,
-		})
-	} else {
-		queryEngine = promql.NewEngine(opts)
-	}
+	queryEngine := NewEngineFactory(opts, cfg.ThanosEngine, reg)
 	return NewSampleAndChunkQueryable(lazyQueryable), exemplarQueryable, queryEngine
 }
 
diff --git a/vendor/github.com/thanos-io/promql-engine/engine/distributed.go b/vendor/github.com/thanos-io/promql-engine/engine/distributed.go
index a7c73782091..29e1248545e 100644
--- a/vendor/github.com/thanos-io/promql-engine/engine/distributed.go
+++ b/vendor/github.com/thanos-io/promql-engine/engine/distributed.go
@@ -50,64 +50,71 @@ func (l remoteEngine) NewRangeQuery(ctx context.Context, opts promql.QueryOpts,
 }
 
 type DistributedEngine struct {
-	endpoints    api.RemoteEndpoints
-	remoteEngine *Engine
+	engine *Engine
 }
 
-func NewDistributedEngine(opts Opts, endpoints api.RemoteEndpoints) *DistributedEngine {
-	opts.LogicalOptimizers = []logicalplan.Optimizer{
-		logicalplan.PassthroughOptimizer{Endpoints: endpoints},
-		logicalplan.DistributedExecutionOptimizer{Endpoints: endpoints},
-	}
-
+func NewDistributedEngine(opts Opts) *DistributedEngine {
 	return &DistributedEngine{
-		endpoints:    endpoints,
-		remoteEngine: New(opts),
+		engine: New(opts),
 	}
 }
 
-func (l DistributedEngine) SetQueryLogger(log promql.QueryLogger) {}
-
-func (l DistributedEngine) NewInstantQuery(ctx context.Context, q storage.Queryable, opts promql.QueryOpts, qs string, ts time.Time) (promql.Query, error) {
-	return l.MakeInstantQuery(ctx, q, fromPromQLOpts(opts), qs, ts)
-}
-
-func (l DistributedEngine) NewRangeQuery(ctx context.Context, q storage.Queryable, opts promql.QueryOpts, qs string, start, end time.Time, interval time.Duration) (promql.Query, error) {
-	return l.MakeRangeQuery(ctx, q, fromPromQLOpts(opts), qs, start, end, interval)
-}
-
-func (l DistributedEngine) MakeInstantQueryFromPlan(ctx context.Context, q storage.Queryable, opts *QueryOpts, plan logicalplan.Node, ts time.Time) (promql.Query, error) {
+func (l DistributedEngine) MakeInstantQueryFromPlan(ctx context.Context, q storage.Queryable, e api.RemoteEndpoints, opts promql.QueryOpts, plan logicalplan.Node, ts time.Time) (promql.Query, error) {
 	// Truncate milliseconds to avoid mismatch in timestamps between remote and local engines.
 	// Some clients might only support second precision when executing queries.
 	ts = ts.Truncate(time.Second)
 
-	return l.remoteEngine.MakeInstantQueryFromPlan(ctx, q, opts, plan, ts)
+	qOpts := fromPromQLOpts(opts)
+	qOpts.LogicalOptimizers = []logicalplan.Optimizer{
+		logicalplan.PassthroughOptimizer{Endpoints: e},
+		logicalplan.DistributedExecutionOptimizer{Endpoints: e},
+	}
+
+	return l.engine.MakeInstantQueryFromPlan(ctx, q, qOpts, plan, ts)
 }
 
-func (l DistributedEngine) MakeRangeQueryFromPlan(ctx context.Context, q storage.Queryable, opts *QueryOpts, plan logicalplan.Node, start, end time.Time, interval time.Duration) (promql.Query, error) {
+func (l DistributedEngine) MakeRangeQueryFromPlan(ctx context.Context, q storage.Queryable, e api.RemoteEndpoints, opts promql.QueryOpts, plan logicalplan.Node, start, end time.Time, interval time.Duration) (promql.Query, error) {
 	// Truncate milliseconds to avoid mismatch in timestamps between remote and local engines.
 	// Some clients might only support second precision when executing queries.
 	start = start.Truncate(time.Second)
 	end = end.Truncate(time.Second)
 	interval = interval.Truncate(time.Second)
 
-	return l.remoteEngine.MakeRangeQueryFromPlan(ctx, q, opts, plan, start, end, interval)
+	qOpts := fromPromQLOpts(opts)
+	qOpts.LogicalOptimizers = []logicalplan.Optimizer{
+		logicalplan.PassthroughOptimizer{Endpoints: e},
+		logicalplan.DistributedExecutionOptimizer{Endpoints: e},
+	}
+
+	return l.engine.MakeRangeQueryFromPlan(ctx, q, qOpts, plan, start, end, interval)
 }
 
-func (l DistributedEngine) MakeInstantQuery(ctx context.Context, q storage.Queryable, opts *QueryOpts, qs string, ts time.Time) (promql.Query, error) {
+func (l DistributedEngine) MakeInstantQuery(ctx context.Context, q storage.Queryable, e api.RemoteEndpoints, opts promql.QueryOpts, qs string, ts time.Time) (promql.Query, error) {
 	// Truncate milliseconds to avoid mismatch in timestamps between remote and local engines.
 	// Some clients might only support second precision when executing queries.
 	ts = ts.Truncate(time.Second)
 
-	return l.remoteEngine.MakeInstantQuery(ctx, q, opts, qs, ts)
+	qOpts := fromPromQLOpts(opts)
+	qOpts.LogicalOptimizers = []logicalplan.Optimizer{
+		logicalplan.PassthroughOptimizer{Endpoints: e},
+		logicalplan.DistributedExecutionOptimizer{Endpoints: e},
+	}
+
+	return l.engine.MakeInstantQuery(ctx, q, qOpts, qs, ts)
 }
 
-func (l DistributedEngine) MakeRangeQuery(ctx context.Context, q storage.Queryable, opts *QueryOpts, qs string, start, end time.Time, interval time.Duration) (promql.Query, error) {
+func (l DistributedEngine) MakeRangeQuery(ctx context.Context, q storage.Queryable, e api.RemoteEndpoints, opts promql.QueryOpts, qs string, start, end time.Time, interval time.Duration) (promql.Query, error) {
 	// Truncate milliseconds to avoid mismatch in timestamps between remote and local engines.
 	// Some clients might only support second precision when executing queries.
 	start = start.Truncate(time.Second)
 	end = end.Truncate(time.Second)
 	interval = interval.Truncate(time.Second)
 
-	return l.remoteEngine.MakeRangeQuery(ctx, q, opts, qs, start, end, interval)
+	qOpts := fromPromQLOpts(opts)
+	qOpts.LogicalOptimizers = []logicalplan.Optimizer{
+		logicalplan.PassthroughOptimizer{Endpoints: e},
+		logicalplan.DistributedExecutionOptimizer{Endpoints: e},
+	}
+
+	return l.engine.MakeRangeQuery(ctx, q, qOpts, qs, start, end, interval)
 }
diff --git a/vendor/github.com/thanos-io/promql-engine/engine/engine.go b/vendor/github.com/thanos-io/promql-engine/engine/engine.go
index f74ff74f316..8fe141e5d01 100644
--- a/vendor/github.com/thanos-io/promql-engine/engine/engine.go
+++ b/vendor/github.com/thanos-io/promql-engine/engine/engine.go
@@ -8,11 +8,10 @@ import (
 	"log/slog"
 	"math"
 	"runtime"
+	"slices"
 	"sort"
 	"time"
 
-	"github.com/thanos-io/promql-engine/execution/telemetry"
-
 	"github.com/efficientgo/core/errors"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
@@ -27,6 +26,7 @@ import (
 	"github.com/thanos-io/promql-engine/execution/function"
 	"github.com/thanos-io/promql-engine/execution/model"
 	"github.com/thanos-io/promql-engine/execution/parse"
+	"github.com/thanos-io/promql-engine/execution/telemetry"
 	"github.com/thanos-io/promql-engine/execution/warnings"
 	"github.com/thanos-io/promql-engine/extlabels"
 	"github.com/thanos-io/promql-engine/logicalplan"
@@ -39,7 +39,7 @@ type QueryType int
 
 type engineMetrics struct {
 	currentQueries prometheus.Gauge
-	queries        *prometheus.CounterVec
+	totalQueries   prometheus.Counter
 }
 
 const (
@@ -50,16 +50,16 @@ const (
 	stepsBatch             = 10
 )
 
+func IsUnimplemented(err error) bool {
+	return errors.Is(err, parse.ErrNotSupportedExpr) || errors.Is(err, parse.ErrNotImplemented)
+}
+
 type Opts struct {
 	promql.EngineOpts
 
 	// LogicalOptimizers are optimizers that are run if the value is not nil. If it is nil then the default optimizers are run. Default optimizer list is available in the logicalplan package.
 	LogicalOptimizers []logicalplan.Optimizer
 
-	// DisableFallback enables mode where engine returns error if some expression of feature is not yet implemented
-	// in the new engine, instead of falling back to prometheus engine.
-	DisableFallback bool
-
 	// ExtLookbackDelta specifies what time range to use to determine valid previous sample for extended range functions.
 	// Defaults to 1 hour if not specified.
 	ExtLookbackDelta time.Duration
@@ -67,40 +67,22 @@ type Opts struct {
 	// DecodingConcurrency is the maximum number of goroutines that can be used to decode samples. Defaults to GOMAXPROCS / 2.
 	DecodingConcurrency int
 
+	// SelectorBatchSize specifies the maximum number of samples to be returned by selectors in a single batch.
+	SelectorBatchSize int64
+
 	// EnableXFunctions enables custom xRate, xIncrease and xDelta functions.
 	// This will default to false.
 	EnableXFunctions bool
 
-	// FallbackEngine
-	Engine promql.QueryEngine
-
 	// EnableAnalysis enables query analysis.
 	EnableAnalysis bool
 
-	// EnablePartialResponses enables partial responses in distributed mode.
-	EnablePartialResponses bool
-
-	// SelectorBatchSize specifies the maximum number of samples to be returned by selectors in a single batch.
-	SelectorBatchSize int64
-
 	// The Prometheus engine has internal check for duplicate labels produced by functions, aggregations or binary operators.
 	// This check can produce false positives when querying time-series data which does not conform to the Prometheus data model,
 	// and can be disabled if it leads to false positives.
 	DisableDuplicateLabelChecks bool
 }
 
-func (o Opts) getLogicalOptimizers() []logicalplan.Optimizer {
-	var optimizers []logicalplan.Optimizer
-	if o.LogicalOptimizers == nil {
-		optimizers = make([]logicalplan.Optimizer, len(logicalplan.DefaultOptimizers))
-		copy(optimizers, logicalplan.DefaultOptimizers)
-	} else {
-		optimizers = make([]logicalplan.Optimizer, len(o.LogicalOptimizers))
-		copy(optimizers, o.LogicalOptimizers)
-	}
-	return optimizers
-}
-
 // QueryOpts implements promql.QueryOpts but allows to override more engine default options.
 type QueryOpts struct {
 	// These values are used to implement promql.QueryOpts, they have weird "Param" suffix because
@@ -111,8 +93,11 @@ type QueryOpts struct {
 	// DecodingConcurrency can be used to override the DecodingConcurrency engine setting.
 	DecodingConcurrency int
 
-	// EnablePartialResponses can be used to override the EnablePartialResponses engine setting.
-	EnablePartialResponses bool
+	// SelectorBatchSize can be used to override the SelectorBatchSize engine setting.
+	SelectorBatchSize int64
+
+	// LogicalOptimizers can be used to override the LogicalOptimizers engine setting.
+	LogicalOptimizers []logicalplan.Optimizer
 }
 
 func (opts QueryOpts) LookbackDelta() time.Duration { return opts.LookbackDeltaParam }
@@ -151,10 +136,9 @@ func NewWithScanners(opts Opts, scanners engstorage.Scanners) *Engine {
 		opts.ExtLookbackDelta = 1 * time.Hour
 		opts.Logger.Debug("external lookback delta is zero, setting to default value", "value", 1*24*time.Hour)
 	}
-	if opts.SelectorBatchSize != 0 {
+	if len(opts.LogicalOptimizers) == 0 {
 		opts.LogicalOptimizers = append(
-			[]logicalplan.Optimizer{logicalplan.SelectorBatchSize{Size: opts.SelectorBatchSize}},
-			opts.LogicalOptimizers...,
+			opts.LogicalOptimizers, logicalplan.DefaultOptimizers...,
 		)
 	}
 
@@ -177,23 +161,16 @@ func NewWithScanners(opts Opts, scanners engstorage.Scanners) *Engine {
 				Help:      "The current number of queries being executed or waiting.",
 			},
 		),
-		queries: promauto.With(opts.Reg).NewCounterVec(
+		totalQueries: promauto.With(opts.Reg).NewCounter(
 			prometheus.CounterOpts{
 				Namespace: namespace,
 				Subsystem: subsystem,
 				Name:      "queries_total",
 				Help:      "Number of PromQL queries.",
-			}, []string{"fallback"},
+			},
 		),
 	}
 
-	var engine promql.QueryEngine
-	if opts.Engine == nil {
-		engine = promql.NewEngine(opts.EngineOpts)
-	} else {
-		engine = opts.Engine
-	}
-
 	decodingConcurrency := opts.DecodingConcurrency
 	if opts.DecodingConcurrency < 1 {
 		decodingConcurrency = runtime.GOMAXPROCS(0) / 2
@@ -201,6 +178,7 @@ func NewWithScanners(opts Opts, scanners engstorage.Scanners) *Engine {
 			decodingConcurrency = 1
 		}
 	}
+	selectorBatchSize := opts.SelectorBatchSize
 
 	var queryTracker promql.QueryTracker = nopQueryTracker{}
 	if opts.ActiveQueryTracker != nil {
@@ -208,27 +186,25 @@ func NewWithScanners(opts Opts, scanners engstorage.Scanners) *Engine {
 	}
 
 	return &Engine{
-		prom:               engine,
 		functions:          functions,
 		scanners:           scanners,
 		activeQueryTracker: queryTracker,
 
 		disableDuplicateLabelChecks: opts.DisableDuplicateLabelChecks,
-		disableFallback:             opts.DisableFallback,
-
-		logger:                 opts.Logger,
-		lookbackDelta:          opts.LookbackDelta,
-		enablePerStepStats:     opts.EnablePerStepStats,
-		logicalOptimizers:      opts.getLogicalOptimizers(),
-		timeout:                opts.Timeout,
-		metrics:                metrics,
-		extLookbackDelta:       opts.ExtLookbackDelta,
-		enableAnalysis:         opts.EnableAnalysis,
-		enablePartialResponses: opts.EnablePartialResponses,
+
+		logger:             opts.Logger,
+		lookbackDelta:      opts.LookbackDelta,
+		enablePerStepStats: opts.EnablePerStepStats,
+		logicalOptimizers:  opts.LogicalOptimizers,
+		timeout:            opts.Timeout,
+		metrics:            metrics,
+		extLookbackDelta:   opts.ExtLookbackDelta,
+		enableAnalysis:     opts.EnableAnalysis,
 		noStepSubqueryIntervalFn: func(d time.Duration) time.Duration {
 			return time.Duration(opts.NoStepSubqueryIntervalFn(d.Milliseconds()) * 1000000)
 		},
 		decodingConcurrency: decodingConcurrency,
+		selectorBatchSize:   selectorBatchSize,
 	}
 }
 
@@ -240,13 +216,11 @@ var (
 )
 
 type Engine struct {
-	prom               promql.QueryEngine
 	functions          map[string]*parser.Function
 	scanners           engstorage.Scanners
 	activeQueryTracker promql.QueryTracker
 
 	disableDuplicateLabelChecks bool
-	disableFallback             bool
 
 	logger             *slog.Logger
 	lookbackDelta      time.Duration
@@ -257,12 +231,18 @@ type Engine struct {
 
 	extLookbackDelta         time.Duration
 	decodingConcurrency      int
+	selectorBatchSize        int64
 	enableAnalysis           bool
-	enablePartialResponses   bool
 	noStepSubqueryIntervalFn func(time.Duration) time.Duration
 }
 
 func (e *Engine) MakeInstantQuery(ctx context.Context, q storage.Queryable, opts *QueryOpts, qs string, ts time.Time) (promql.Query, error) {
+	idx, err := e.activeQueryTracker.Insert(ctx, qs)
+	if err != nil {
+		return nil, err
+	}
+	defer e.activeQueryTracker.Delete(idx)
+
 	expr, err := parser.NewParser(qs, parser.WithFunctions(e.functions)).ParseExpr()
 	if err != nil {
 		return nil, err
@@ -280,7 +260,7 @@ func (e *Engine) MakeInstantQuery(ctx context.Context, q storage.Queryable, opts
 	planOpts := logicalplan.PlanOptions{
 		DisableDuplicateLabelCheck: e.disableDuplicateLabelChecks,
 	}
-	lplan, warns := logicalplan.NewFromAST(expr, qOpts, planOpts).Optimize(e.logicalOptimizers)
+	lplan, warns := logicalplan.NewFromAST(expr, qOpts, planOpts).Optimize(e.getLogicalOptimizers(opts))
 
 	scanners, err := e.storageScanners(q, qOpts, lplan)
 	if err != nil {
@@ -290,20 +270,16 @@ func (e *Engine) MakeInstantQuery(ctx context.Context, q storage.Queryable, opts
 	ctx = warnings.NewContext(ctx)
 	defer func() { warns.Merge(warnings.FromContext(ctx)) }()
 	exec, err := execution.New(ctx, lplan.Root(), scanners, qOpts)
-	if e.triggerFallback(err) {
-		e.metrics.queries.WithLabelValues("true").Inc()
-		return e.prom.NewInstantQuery(ctx, q, opts, qs, ts)
-	}
-	e.metrics.queries.WithLabelValues("false").Inc()
 	if err != nil {
 		return nil, err
 	}
+	e.metrics.totalQueries.Inc()
 	return &compatibilityQuery{
 		Query:      &Query{exec: exec, opts: opts},
 		engine:     e,
 		plan:       lplan,
-		ts:         ts,
 		warns:      warns,
+		ts:         ts,
 		t:          InstantQuery,
 		resultSort: resultSort,
 		scanners:   scanners,
@@ -327,7 +303,7 @@ func (e *Engine) MakeInstantQueryFromPlan(ctx context.Context, q storage.Queryab
 	planOpts := logicalplan.PlanOptions{
 		DisableDuplicateLabelCheck: e.disableDuplicateLabelChecks,
 	}
-	lplan, warns := logicalplan.New(root, qOpts, planOpts).Optimize(e.logicalOptimizers)
+	lplan, warns := logicalplan.New(root, qOpts, planOpts).Optimize(e.getLogicalOptimizers(opts))
 
 	ctx = warnings.NewContext(ctx)
 	defer func() { warns.Merge(warnings.FromContext(ctx)) }()
@@ -338,21 +314,17 @@ func (e *Engine) MakeInstantQueryFromPlan(ctx context.Context, q storage.Queryab
 	}
 
 	exec, err := execution.New(ctx, lplan.Root(), scnrs, qOpts)
-	if e.triggerFallback(err) {
-		e.metrics.queries.WithLabelValues("true").Inc()
-		return e.prom.NewInstantQuery(ctx, q, opts, root.String(), ts)
-	}
-	e.metrics.queries.WithLabelValues("false").Inc()
 	if err != nil {
 		return nil, err
 	}
+	e.metrics.totalQueries.Inc()
 
 	return &compatibilityQuery{
 		Query:  &Query{exec: exec, opts: opts},
 		engine: e,
 		plan:   lplan,
-		ts:     ts,
 		warns:  warns,
+		ts:     ts,
 		t:      InstantQuery,
 		// TODO(fpetkovski): Infer the sort order from the plan, ideally without copying the newResultSort function.
 		resultSort: noSortResultSort{},
@@ -386,7 +358,7 @@ func (e *Engine) MakeRangeQuery(ctx context.Context, q storage.Queryable, opts *
 	planOpts := logicalplan.PlanOptions{
 		DisableDuplicateLabelCheck: e.disableDuplicateLabelChecks,
 	}
-	lplan, warns := logicalplan.NewFromAST(expr, qOpts, planOpts).Optimize(e.logicalOptimizers)
+	lplan, warns := logicalplan.NewFromAST(expr, qOpts, planOpts).Optimize(e.getLogicalOptimizers(opts))
 
 	ctx = warnings.NewContext(ctx)
 	defer func() { warns.Merge(warnings.FromContext(ctx)) }()
@@ -396,14 +368,10 @@ func (e *Engine) MakeRangeQuery(ctx context.Context, q storage.Queryable, opts *
 	}
 
 	exec, err := execution.New(ctx, lplan.Root(), scnrs, qOpts)
-	if e.triggerFallback(err) {
-		e.metrics.queries.WithLabelValues("true").Inc()
-		return e.prom.NewRangeQuery(ctx, q, opts, qs, start, end, step)
-	}
-	e.metrics.queries.WithLabelValues("false").Inc()
 	if err != nil {
 		return nil, err
 	}
+	e.metrics.totalQueries.Inc()
 
 	return &compatibilityQuery{
 		Query:    &Query{exec: exec, opts: opts},
@@ -432,7 +400,7 @@ func (e *Engine) MakeRangeQueryFromPlan(ctx context.Context, q storage.Queryable
 	planOpts := logicalplan.PlanOptions{
 		DisableDuplicateLabelCheck: e.disableDuplicateLabelChecks,
 	}
-	lplan, warns := logicalplan.New(root, qOpts, planOpts).Optimize(e.logicalOptimizers)
+	lplan, warns := logicalplan.New(root, qOpts, planOpts).Optimize(e.getLogicalOptimizers(opts))
 
 	scnrs, err := e.storageScanners(q, qOpts, lplan)
 	if err != nil {
@@ -442,14 +410,11 @@ func (e *Engine) MakeRangeQueryFromPlan(ctx context.Context, q storage.Queryable
 	ctx = warnings.NewContext(ctx)
 	defer func() { warns.Merge(warnings.FromContext(ctx)) }()
 	exec, err := execution.New(ctx, lplan.Root(), scnrs, qOpts)
-	if e.triggerFallback(err) {
-		e.metrics.queries.WithLabelValues("true").Inc()
-		return e.prom.NewRangeQuery(ctx, q, opts, lplan.Root().String(), start, end, step)
-	}
-	e.metrics.queries.WithLabelValues("false").Inc()
 	if err != nil {
 		return nil, err
 	}
+	e.metrics.totalQueries.Inc()
+
 	return &compatibilityQuery{
 		Query:    &Query{exec: exec, opts: opts},
 		engine:   e,
@@ -485,7 +450,6 @@ func (e *Engine) makeQueryOpts(start time.Time, end time.Time, step time.Duratio
 		EnablePerStepStats:       e.enablePerStepStats,
 		ExtLookbackDelta:         e.extLookbackDelta,
 		EnableAnalysis:           e.enableAnalysis,
-		EnablePartialResponses:   e.enablePartialResponses,
 		NoStepSubqueryIntervalFn: e.noStepSubqueryIntervalFn,
 		DecodingConcurrency:      e.decodingConcurrency,
 	}
@@ -503,12 +467,24 @@ func (e *Engine) makeQueryOpts(start time.Time, end time.Time, step time.Duratio
 	if opts.DecodingConcurrency != 0 {
 		res.DecodingConcurrency = opts.DecodingConcurrency
 	}
-	if opts.EnablePartialResponses {
-		res.EnablePartialResponses = opts.EnablePartialResponses
-	}
+
 	return res
 }
 
+func (e *Engine) getLogicalOptimizers(opts *QueryOpts) []logicalplan.Optimizer {
+	var optimizers []logicalplan.Optimizer
+	if len(opts.LogicalOptimizers) != 0 {
+		optimizers = slices.Clone(opts.LogicalOptimizers)
+	} else {
+		optimizers = slices.Clone(e.logicalOptimizers)
+	}
+	selectorBatchSize := e.selectorBatchSize
+	if opts.SelectorBatchSize != 0 {
+		selectorBatchSize = opts.SelectorBatchSize
+	}
+	return append(optimizers, logicalplan.SelectorBatchSize{Size: selectorBatchSize})
+}
+
 func (e *Engine) storageScanners(queryable storage.Queryable, qOpts *query.Options, lplan logicalplan.Plan) (engstorage.Scanners, error) {
 	if e.scanners == nil {
 		return promstorage.NewPrometheusScanners(queryable, qOpts, lplan)
@@ -516,14 +492,6 @@ func (e *Engine) storageScanners(queryable storage.Queryable, qOpts *query.Optio
 	return e.scanners, nil
 }
 
-func (e *Engine) triggerFallback(err error) bool {
-	if e.disableFallback {
-		return false
-	}
-
-	return errors.Is(err, parse.ErrNotSupportedExpr) || errors.Is(err, parse.ErrNotImplemented)
-}
-
 type Query struct {
 	exec model.VectorOperator
 	opts promql.QueryOpts
diff --git a/vendor/github.com/thanos-io/promql-engine/execution/aggregate/accumulator.go b/vendor/github.com/thanos-io/promql-engine/execution/aggregate/accumulator.go
index 784c6827b76..5e643c353e8 100644
--- a/vendor/github.com/thanos-io/promql-engine/execution/aggregate/accumulator.go
+++ b/vendor/github.com/thanos-io/promql-engine/execution/aggregate/accumulator.go
@@ -378,7 +378,7 @@ func (a *avgAcc) Add(ctx context.Context, v float64, h *histogram.FloatHistogram
 	a.hasValue = true
 
 	if !a.incremental {
-		newSum, newC := kahanSumInc(v, a.kahanSum, a.kahanC)
+		newSum, newC := KahanSumInc(v, a.kahanSum, a.kahanC)
 
 		if !math.IsInf(newSum, 0) {
 			// The sum doesn't overflow, so we propagate it to the
@@ -414,7 +414,7 @@ func (a *avgAcc) Add(ctx context.Context, v float64, h *histogram.FloatHistogram
 		}
 	}
 	currentMean := a.avg + a.kahanC
-	a.avg, a.kahanC = kahanSumInc(
+	a.avg, a.kahanC = KahanSumInc(
 		// Divide each side of the `-` by `group.groupCount` to avoid float64 overflows.
 		v/float64(a.count)-currentMean/float64(a.count),
 		a.avg,
@@ -693,7 +693,8 @@ func SumCompensated(s []float64) float64 {
 	return sum + c
 }
 
-func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
+// KahanSumInc implements kahan summation, see https://en.wikipedia.org/wiki/Kahan_summation_algorithm.
+func KahanSumInc(inc, sum, c float64) (newSum, newC float64) {
 	t := sum + inc
 	switch {
 	case math.IsInf(t, 0):
diff --git a/vendor/github.com/thanos-io/promql-engine/execution/function/functions.go b/vendor/github.com/thanos-io/promql-engine/execution/function/functions.go
index c17e56adf31..7ca6a1df240 100644
--- a/vendor/github.com/thanos-io/promql-engine/execution/function/functions.go
+++ b/vendor/github.com/thanos-io/promql-engine/execution/function/functions.go
@@ -8,6 +8,7 @@ import (
 	"time"
 
 	"github.com/prometheus/prometheus/model/histogram"
+	"github.com/prometheus/prometheus/promql"
 	"github.com/prometheus/prometheus/promql/parser"
 )
 
@@ -143,7 +144,19 @@ var instantVectorFuncs = map[string]functionCall{
 		if h == nil || len(vargs) != 2 {
 			return 0., false
 		}
-		return histogramFraction(vargs[0], vargs[1], h), true
+		return promql.HistogramFraction(vargs[0], vargs[1], h), true
+	},
+	"histogram_stddev": func(f float64, h *histogram.FloatHistogram, vargs ...float64) (float64, bool) {
+		if h == nil {
+			return 0., false
+		}
+		return histogramStdDev(h), true
+	},
+	"histogram_stdvar": func(f float64, h *histogram.FloatHistogram, vargs ...float64) (float64, bool) {
+		if h == nil {
+			return 0., false
+		}
+		return histogramStdVar(h), true
 	},
 	// variants of date time functions with an argument
 	"days_in_month": func(f float64, h *histogram.FloatHistogram, vargs ...float64) (float64, bool) {
diff --git a/vendor/github.com/thanos-io/promql-engine/execution/function/histogram.go b/vendor/github.com/thanos-io/promql-engine/execution/function/histogram.go
index 64830cd2a3c..cbaea5fc4ce 100644
--- a/vendor/github.com/thanos-io/promql-engine/execution/function/histogram.go
+++ b/vendor/github.com/thanos-io/promql-engine/execution/function/histogram.go
@@ -15,6 +15,7 @@ import (
 
 	"github.com/cespare/xxhash/v2"
 	"github.com/prometheus/prometheus/model/labels"
+	"github.com/prometheus/prometheus/promql"
 	"github.com/prometheus/prometheus/promql/parser/posrange"
 	"github.com/prometheus/prometheus/util/annotations"
 
@@ -172,7 +173,7 @@ func (o *histogramOperator) processInputSeries(ctx context.Context, vectors []mo
 			// In that case, we reset the conventional buckets to avoid emitting a sample.
 			// TODO(fpetkovski): Prometheus is looking to solve these conflicts through warnings: https://github.com/prometheus/prometheus/issues/10839.
 			if len(o.seriesBuckets[outputSeriesID]) == 0 {
-				value := histogramQuantile(o.scalarPoints[stepIndex], vector.Histograms[i])
+				value := promql.HistogramQuantile(o.scalarPoints[stepIndex], vector.Histograms[i])
 				step.AppendSample(o.pool, uint64(outputSeriesID), value)
 			} else {
 				o.seriesBuckets[outputSeriesID] = o.seriesBuckets[outputSeriesID][:0]
diff --git a/vendor/github.com/thanos-io/promql-engine/execution/function/quantile.go b/vendor/github.com/thanos-io/promql-engine/execution/function/quantile.go
index 4a78629e0ed..44ce21f4295 100644
--- a/vendor/github.com/thanos-io/promql-engine/execution/function/quantile.go
+++ b/vendor/github.com/thanos-io/promql-engine/execution/function/quantile.go
@@ -16,6 +16,8 @@ import (
 
 	"github.com/prometheus/prometheus/model/histogram"
 	"github.com/prometheus/prometheus/util/almost"
+
+	"github.com/thanos-io/promql-engine/execution/aggregate"
 )
 
 // smallDeltaTolerance is the threshold for relative deltas between classic
@@ -196,280 +198,56 @@ func ensureMonotonicAndIgnoreSmallDeltas(buckets buckets, tolerance float64) (bo
 	return forcedMonotonic, fixedPrecision
 }
 
-// histogramQuantile calculates the quantile 'q' based on the given histogram.
-//
-// For custom buckets, the result is interpolated linearly, i.e. it is assumed
-// the observations are uniformly distributed within each bucket. (This is a
-// quite blunt assumption, but it is consistent with the interpolation method
-// used for classic histograms so far.)
-//
-// For exponential buckets, the interpolation is done under the assumption that
-// the samples within each bucket are distributed in a way that they would
-// uniformly populate the buckets in a hypothetical histogram with higher
-// resolution. For example, if the rank calculation suggests that the requested
-// quantile is right in the middle of the population of the (1,2] bucket, we
-// assume the quantile would be right at the bucket boundary between the two
-// buckets the (1,2] bucket would be divided into if the histogram had double
-// the resolution, which is 2**2**-1 = 1.4142... We call this exponential
-// interpolation.
-//
-// However, for a quantile that ends up in the zero bucket, this method isn't
-// very helpful (because there is an infinite number of buckets close to zero,
-// so we would have to assume zero as the result). Therefore, we return to
-// linear interpolation in the zero bucket.
-//
-// A natural lower bound of 0 is assumed if the histogram has only positive
-// buckets. Likewise, a natural upper bound of 0 is assumed if the histogram has
-// only negative buckets.
-//
-// There are a number of special cases:
-//
-// If the histogram has 0 observations, NaN is returned.
-//
-// If q<0, -Inf is returned.
-//
-// If q>1, +Inf is returned.
-//
-// If q is NaN, NaN is returned.
-func histogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
-	if q < 0 {
-		return math.Inf(-1)
-	}
-	if q > 1 {
-		return math.Inf(+1)
-	}
-
-	if h.Count == 0 || math.IsNaN(q) {
-		return math.NaN()
-	}
-
-	var (
-		bucket histogram.Bucket[float64]
-		count  float64
-		it     histogram.BucketIterator[float64]
-		rank   float64
-	)
-
-	// If there are NaN observations in the histogram (h.Sum is NaN), use the forward iterator.
-	// If q < 0.5, use the forward iterator.
-	// If q >= 0.5, use the reverse iterator.
-	if math.IsNaN(h.Sum) || q < 0.5 {
-		it = h.AllBucketIterator()
-		rank = q * h.Count
-	} else {
-		it = h.AllReverseBucketIterator()
-		rank = (1 - q) * h.Count
-	}
-
+// TODO: import from prometheus once exported there.
+func histogramStdDev(h *histogram.FloatHistogram) float64 {
+	mean := h.Sum / h.Count
+	var variance, cVariance float64
+	it := h.AllBucketIterator()
 	for it.Next() {
-		bucket = it.At()
+		bucket := it.At()
 		if bucket.Count == 0 {
 			continue
 		}
-		count += bucket.Count
-		if count >= rank {
-			break
-		}
-	}
-	if !h.UsesCustomBuckets() && bucket.Lower < 0 && bucket.Upper > 0 {
-		switch {
-		case len(h.NegativeBuckets) == 0 && len(h.PositiveBuckets) > 0:
-			// The result is in the zero bucket and the histogram has only
-			// positive buckets. So we consider 0 to be the lower bound.
-			bucket.Lower = 0
-		case len(h.PositiveBuckets) == 0 && len(h.NegativeBuckets) > 0:
-			// The result is in the zero bucket and the histogram has only
-			// negative buckets. So we consider 0 to be the upper bound.
-			bucket.Upper = 0
-		}
-	} else if h.UsesCustomBuckets() {
-		if bucket.Lower == math.Inf(-1) {
-			// first bucket, with lower bound -Inf
-			if bucket.Upper <= 0 {
-				return bucket.Upper
+		var val float64
+		if bucket.Lower <= 0 && 0 <= bucket.Upper {
+			val = 0
+		} else {
+			val = math.Sqrt(bucket.Upper * bucket.Lower)
+			if bucket.Upper < 0 {
+				val = -val
 			}
-			bucket.Lower = 0
-		} else if bucket.Upper == math.Inf(1) {
-			// last bucket, with upper bound +Inf
-			return bucket.Lower
 		}
+		delta := val - mean
+		variance, cVariance = aggregate.KahanSumInc(bucket.Count*delta*delta, variance, cVariance)
 	}
-	// Due to numerical inaccuracies, we could end up with a higher count
-	// than h.Count. Thus, make sure count is never higher than h.Count.
-	if count > h.Count {
-		count = h.Count
-	}
-	// We could have hit the highest bucket without even reaching the rank
-	// (this should only happen if the histogram contains observations of
-	// the value NaN), in which case we simply return the upper limit of the
-	// highest explicit bucket.
-	if count < rank {
-		return bucket.Upper
-	}
-
-	// NaN observations increase h.Count but not the total number of
-	// observations in the buckets. Therefore, we have to use the forward
-	// iterator to find percentiles. We recognize histograms containing NaN
-	// observations by checking if their h.Sum is NaN.
-	if math.IsNaN(h.Sum) || q < 0.5 {
-		rank -= count - bucket.Count
-	} else {
-		rank = count - rank
-	}
-
-	fraction := rank / bucket.Count
-
-	// Return linear interpolation for custom buckets and for quantiles that
-	// end up in the zero bucket.
-	if h.UsesCustomBuckets() || (bucket.Lower <= 0 && bucket.Upper >= 0) {
-		return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction
-	}
-
-	// For exponential buckets, we interpolate on a logarithmic scale. On a
-	// logarithmic scale, the exponential bucket boundaries (for any schema)
-	// become linear (every bucket has the same width). Therefore, after
-	// taking the logarithm of both bucket boundaries, we can use the
-	// calculated fraction in the same way as for linear interpolation (see
-	// above). Finally, we return to the normal scale by applying the
-	// exponential function to the result.
-	logLower := math.Log2(math.Abs(bucket.Lower))
-	logUpper := math.Log2(math.Abs(bucket.Upper))
-	if bucket.Lower > 0 { // Positive bucket.
-		return math.Exp2(logLower + (logUpper-logLower)*fraction)
-	}
-	// Otherwise, we are in a negative bucket and have to mirror things.
-	return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction))
+	variance += cVariance
+	variance /= h.Count
+	return math.Sqrt(variance)
 }
 
-// histogramFraction calculates the fraction of observations between the
-// provided lower and upper bounds, based on the provided histogram.
-//
-// histogramFraction is in a certain way the inverse of histogramQuantile.  If
-// histogramQuantile(0.9, h) returns 123.4, then histogramFraction(-Inf, 123.4, h)
-// returns 0.9.
-//
-// The same notes with regard to interpolation and assumptions about the zero
-// bucket boundaries apply as for histogramQuantile.
-//
-// Whether either boundary is inclusive or exclusive doesn’t actually matter as
-// long as interpolation has to be performed anyway. In the case of a boundary
-// coinciding with a bucket boundary, the inclusive or exclusive nature of the
-// boundary determines the exact behavior of the threshold. With the current
-// implementation, that means that lower is exclusive for positive values and
-// inclusive for negative values, while upper is inclusive for positive values
-// and exclusive for negative values.
-//
-// Special cases:
-//
-// If the histogram has 0 observations, NaN is returned.
-//
-// Use a lower bound of -Inf to get the fraction of all observations below the
-// upper bound.
-//
-// Use an upper bound of +Inf to get the fraction of all observations above the
-// lower bound.
-//
-// If lower or upper is NaN, NaN is returned.
-//
-// If lower >= upper and the histogram has at least 1 observation, zero is returned.
-func histogramFraction(lower, upper float64, h *histogram.FloatHistogram) float64 {
-	if h.Count == 0 || math.IsNaN(lower) || math.IsNaN(upper) {
-		return math.NaN()
-	}
-	if lower >= upper {
-		return 0
-	}
-
-	var (
-		rank, lowerRank, upperRank float64
-		lowerSet, upperSet         bool
-		it                         = h.AllBucketIterator()
-	)
+// TODO: import from prometheus once exported there.
+func histogramStdVar(h *histogram.FloatHistogram) float64 {
+	mean := h.Sum / h.Count
+	var variance, cVariance float64
+	it := h.AllBucketIterator()
 	for it.Next() {
-		b := it.At()
-
-		zeroBucket := false
-		// interpolateLinearly is used for custom buckets to be
-		// consistent with the linear interpolation known from classic
-		// histograms. It is also used for the zero bucket.
-		interpolateLinearly := func(v float64) float64 {
-			return rank + b.Count*(v-b.Lower)/(b.Upper-b.Lower)
-		}
-		// interpolateExponentially is using the same exponential
-		// interpolation method as above for histogramQuantile. This
-		// method is a better fit for exponential bucketing.
-		interpolateExponentially := func(v float64) float64 {
-			var (
-				logLower = math.Log2(math.Abs(b.Lower))
-				logUpper = math.Log2(math.Abs(b.Upper))
-				logV     = math.Log2(math.Abs(v))
-				fraction float64
-			)
-			if v > 0 {
-				fraction = (logV - logLower) / (logUpper - logLower)
-			} else {
-				fraction = 1 - ((logV - logUpper) / (logLower - logUpper))
-			}
-			return rank + b.Count*fraction
-		}
-
-		if b.Lower <= 0 && b.Upper >= 0 {
-			zeroBucket = true
-			switch {
-			case len(h.NegativeBuckets) == 0 && len(h.PositiveBuckets) > 0:
-				// This is the zero bucket and the histogram has only
-				// positive buckets. So we consider 0 to be the lower
-				// bound.
-				b.Lower = 0
-			case len(h.PositiveBuckets) == 0 && len(h.NegativeBuckets) > 0:
-				// This is in the zero bucket and the histogram has only
-				// negative buckets. So we consider 0 to be the upper
-				// bound.
-				b.Upper = 0
-			}
-		}
-		if !lowerSet && b.Lower >= lower {
-			// We have hit the lower value at the lower bucket boundary.
-			lowerRank = rank
-			lowerSet = true
-		}
-		if !upperSet && b.Lower >= upper {
-			// We have hit the upper value at the lower bucket boundary.
-			upperRank = rank
-			upperSet = true
-		}
-		if lowerSet && upperSet {
-			break
-		}
-		if !lowerSet && b.Lower < lower && b.Upper > lower {
-			// The lower value is in this bucket.
-			if h.UsesCustomBuckets() || zeroBucket {
-				lowerRank = interpolateLinearly(lower)
-			} else {
-				lowerRank = interpolateExponentially(lower)
-			}
-			lowerSet = true
+		bucket := it.At()
+		if bucket.Count == 0 {
+			continue
 		}
-		if !upperSet && b.Lower < upper && b.Upper > upper {
-			// The upper value is in this bucket.
-			if h.UsesCustomBuckets() || zeroBucket {
-				upperRank = interpolateLinearly(upper)
-			} else {
-				upperRank = interpolateExponentially(upper)
+		var val float64
+		if bucket.Lower <= 0 && 0 <= bucket.Upper {
+			val = 0
+		} else {
+			val = math.Sqrt(bucket.Upper * bucket.Lower)
+			if bucket.Upper < 0 {
+				val = -val
 			}
-			upperSet = true
 		}
-		if lowerSet && upperSet {
-			break
-		}
-		rank += b.Count
-	}
-	if !lowerSet || lowerRank > h.Count {
-		lowerRank = h.Count
+		delta := val - mean
+		variance, cVariance = aggregate.KahanSumInc(bucket.Count*delta*delta, variance, cVariance)
 	}
-	if !upperSet || upperRank > h.Count {
-		upperRank = h.Count
-	}
-
-	return (upperRank - lowerRank) / h.Count
+	variance += cVariance
+	variance /= h.Count
+	return variance
 }
diff --git a/vendor/github.com/thanos-io/promql-engine/execution/remote/operator.go b/vendor/github.com/thanos-io/promql-engine/execution/remote/operator.go
index e1efb98f21b..2655c35e5b8 100644
--- a/vendor/github.com/thanos-io/promql-engine/execution/remote/operator.go
+++ b/vendor/github.com/thanos-io/promql-engine/execution/remote/operator.go
@@ -128,12 +128,7 @@ func (s *storageAdapter) executeQuery(ctx context.Context) {
 		warnings.AddToContext(w, ctx)
 	}
 	if result.Err != nil {
-		err := errors.Wrapf(result.Err, "remote exec error [%s]", s.lbls)
-		if s.opts.EnablePartialResponses {
-			warnings.AddToContext(err, ctx)
-		} else {
-			s.err = err
-		}
+		s.err = errors.Wrapf(result.Err, "remote exec error [%s]", s.lbls)
 		return
 	}
 	switch val := result.Value.(type) {
diff --git a/vendor/github.com/thanos-io/promql-engine/logicalplan/plan.go b/vendor/github.com/thanos-io/promql-engine/logicalplan/plan.go
index 9e84b4f7c63..bb37e7eb98a 100644
--- a/vendor/github.com/thanos-io/promql-engine/logicalplan/plan.go
+++ b/vendor/github.com/thanos-io/promql-engine/logicalplan/plan.go
@@ -244,6 +244,12 @@ func replacePrometheusNodes(plan parser.Expr) Node {
 	case *parser.NumberLiteral:
 		return &NumberLiteral{Val: t.Val}
 	case *parser.StepInvariantExpr:
+		// We expect functions to be pushed down into matrix selectors. This means that
+		// parents of matrixselector nodes are always expected to be functions, not step invariant
+		// operators.
+		if m, ok := t.Expr.(*parser.MatrixSelector); ok {
+			return replacePrometheusNodes(m)
+		}
 		return &StepInvariantExpr{Expr: replacePrometheusNodes(t.Expr)}
 	case *parser.MatrixSelector:
 		return &MatrixSelector{
diff --git a/vendor/github.com/thanos-io/promql-engine/query/options.go b/vendor/github.com/thanos-io/promql-engine/query/options.go
index decdda36cd5..5dbb3c09718 100644
--- a/vendor/github.com/thanos-io/promql-engine/query/options.go
+++ b/vendor/github.com/thanos-io/promql-engine/query/options.go
@@ -17,7 +17,6 @@ type Options struct {
 	ExtLookbackDelta         time.Duration
 	NoStepSubqueryIntervalFn func(time.Duration) time.Duration
 	EnableAnalysis           bool
-	EnablePartialResponses   bool
 	DecodingConcurrency      int
 }
 
diff --git a/vendor/github.com/thanos-io/promql-engine/ringbuffer/functions.go b/vendor/github.com/thanos-io/promql-engine/ringbuffer/functions.go
index e85a3a9cefb..1d59ec04d28 100644
--- a/vendor/github.com/thanos-io/promql-engine/ringbuffer/functions.go
+++ b/vendor/github.com/thanos-io/promql-engine/ringbuffer/functions.go
@@ -6,17 +6,16 @@ package ringbuffer
 import (
 	"context"
 	"math"
+	"sort"
 
 	"github.com/efficientgo/core/errors"
-
-	"github.com/prometheus/prometheus/util/annotations"
-
-	"github.com/thanos-io/promql-engine/execution/warnings"
-
 	"github.com/prometheus/prometheus/model/histogram"
+	"github.com/prometheus/prometheus/util/annotations"
+	"gonum.org/v1/gonum/stat"
 
 	"github.com/thanos-io/promql-engine/execution/aggregate"
 	"github.com/thanos-io/promql-engine/execution/parse"
+	"github.com/thanos-io/promql-engine/execution/warnings"
 )
 
 type SamplesBuffer GenericRingBuffer
@@ -82,6 +81,12 @@ var rangeVectorFuncs = map[string]FunctionCall{
 		}
 		return sumOverTime(f.Samples), nil, true, nil
 	},
+	"mad_over_time": func(f FunctionArgs) (float64, *histogram.FloatHistogram, bool, error) {
+		if len(f.Samples) == 0 {
+			return 0., nil, false, nil
+		}
+		return madOverTime(f.Samples), nil, true, nil
+	},
 	"max_over_time": func(f FunctionArgs) (float64, *histogram.FloatHistogram, bool, error) {
 		if len(f.Samples) == 0 {
 			return 0., nil, false, nil
@@ -583,6 +588,23 @@ func histogramRate(ctx context.Context, points []Sample, isCounter bool) (*histo
 	return h.Compact(0), nil
 }
 
+func madOverTime(points []Sample) float64 {
+	values := make([]float64, 0, len(points))
+	for _, f := range points {
+		values = append(values, f.V.F)
+	}
+	sort.Float64s(values)
+
+	median := stat.Quantile(0.5, stat.LinInterp, values, nil)
+
+	for i, f := range points {
+		values[i] = math.Abs(f.V.F - median)
+	}
+	sort.Float64s(values)
+
+	return stat.Quantile(0.5, stat.LinInterp, values, nil)
+}
+
 func maxOverTime(points []Sample) float64 {
 	max := points[0].V.F
 	for _, v := range points {
diff --git a/vendor/github.com/thanos-io/promql-engine/storage/prometheus/matrix_selector.go b/vendor/github.com/thanos-io/promql-engine/storage/prometheus/matrix_selector.go
index 85bb46f051d..47251644b1f 100644
--- a/vendor/github.com/thanos-io/promql-engine/storage/prometheus/matrix_selector.go
+++ b/vendor/github.com/thanos-io/promql-engine/storage/prometheus/matrix_selector.go
@@ -11,21 +11,18 @@ import (
 	"sync"
 	"time"
 
-	"github.com/prometheus/prometheus/promql/parser/posrange"
-	"github.com/prometheus/prometheus/util/annotations"
-
-	"github.com/thanos-io/promql-engine/execution/warnings"
-
-	"github.com/thanos-io/promql-engine/execution/telemetry"
-
 	"github.com/efficientgo/core/errors"
 	"github.com/prometheus/prometheus/model/histogram"
 	"github.com/prometheus/prometheus/model/labels"
 	"github.com/prometheus/prometheus/model/value"
+	"github.com/prometheus/prometheus/promql/parser/posrange"
 	"github.com/prometheus/prometheus/tsdb/chunkenc"
+	"github.com/prometheus/prometheus/util/annotations"
 
 	"github.com/thanos-io/promql-engine/execution/model"
 	"github.com/thanos-io/promql-engine/execution/parse"
+	"github.com/thanos-io/promql-engine/execution/telemetry"
+	"github.com/thanos-io/promql-engine/execution/warnings"
 	"github.com/thanos-io/promql-engine/extlabels"
 	"github.com/thanos-io/promql-engine/query"
 	"github.com/thanos-io/promql-engine/ringbuffer"
diff --git a/vendor/github.com/thanos-io/promql-engine/storage/interface.go b/vendor/github.com/thanos-io/promql-engine/storage/scanners.go
similarity index 100%
rename from vendor/github.com/thanos-io/promql-engine/storage/interface.go
rename to vendor/github.com/thanos-io/promql-engine/storage/scanners.go
diff --git a/vendor/gonum.org/v1/gonum/blas/README.md b/vendor/gonum.org/v1/gonum/blas/README.md
new file mode 100644
index 00000000000..16d62bd3554
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/README.md
@@ -0,0 +1,51 @@
+# Gonum BLAS
+
+[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/blas)](https://pkg.go.dev/gonum.org/v1/gonum/blas)
+[![GoDoc](https://godocs.io/gonum.org/v1/gonum/blas?status.svg)](https://godocs.io/gonum.org/v1/gonum/blas)
+
+A collection of packages to provide BLAS functionality for the [Go programming
+language](http://golang.org).
+
+## Installation
+```sh
+  go get gonum.org/v1/gonum/blas/...
+```
+
+## Packages
+
+### blas
+
+Defines [BLAS API](http://www.netlib.org/blas/blast-forum/cinterface.pdf) split in several
+interfaces.
+
+### blas/gonum
+
+Go implementation of the BLAS API (incomplete, implements the `float32` and `float64` API).
+
+### blas/blas64 and blas/blas32
+
+Wrappers for an implementation of the double (i.e., `float64`) and single (`float32`)
+precision real parts of the BLAS API.
+
+```Go
+package main
+
+import (
+	"fmt"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+func main() {
+	v := blas64.Vector{Inc: 1, Data: []float64{1, 1, 1}}
+	v.N = len(v.Data)
+	fmt.Println("v has length:", blas64.Nrm2(v))
+}
+```
+
+### blas/cblas128 and blas/cblas64
+
+Wrappers for an implementation of the double (i.e., `complex128`) and single (`complex64`)
+precision complex parts of the BLAS API.
+
+Currently blas/cblas64 and blas/cblas128 require gonum.org/v1/netlib/blas.
diff --git a/vendor/gonum.org/v1/gonum/blas/blas.go b/vendor/gonum.org/v1/gonum/blas/blas.go
new file mode 100644
index 00000000000..9b933e3fc57
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/blas.go
@@ -0,0 +1,283 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:generate ./conversions.bash
+
+package blas
+
+// Flag constants indicate Givens transformation H matrix state.
+type Flag int
+
+const (
+	Identity    Flag = -2 // H is the identity matrix; no rotation is needed.
+	Rescaling   Flag = -1 // H specifies rescaling.
+	OffDiagonal Flag = 0  // Off-diagonal elements of H are non-unit.
+	Diagonal    Flag = 1  // Diagonal elements of H are non-unit.
+)
+
+// SrotmParams contains Givens transformation parameters returned
+// by the Float32 Srotm method.
+type SrotmParams struct {
+	Flag
+	H [4]float32 // Column-major 2 by 2 matrix.
+}
+
+// DrotmParams contains Givens transformation parameters returned
+// by the Float64 Drotm method.
+type DrotmParams struct {
+	Flag
+	H [4]float64 // Column-major 2 by 2 matrix.
+}
+
+// Transpose specifies the transposition operation of a matrix.
+type Transpose byte
+
+const (
+	NoTrans   Transpose = 'N'
+	Trans     Transpose = 'T'
+	ConjTrans Transpose = 'C'
+)
+
+// Uplo specifies whether a matrix is upper or lower triangular.
+type Uplo byte
+
+const (
+	Upper Uplo = 'U'
+	Lower Uplo = 'L'
+	All   Uplo = 'A'
+)
+
+// Diag specifies whether a matrix is unit triangular.
+type Diag byte
+
+const (
+	NonUnit Diag = 'N'
+	Unit    Diag = 'U'
+)
+
+// Side specifies from which side a multiplication operation is performed.
+type Side byte
+
+const (
+	Left  Side = 'L'
+	Right Side = 'R'
+)
+
+// Float32 implements the single precision real BLAS routines.
+type Float32 interface {
+	Float32Level1
+	Float32Level2
+	Float32Level3
+}
+
+// Float32Level1 implements the single precision real BLAS Level 1 routines.
+type Float32Level1 interface {
+	Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32
+	Dsdot(n int, x []float32, incX int, y []float32, incY int) float64
+	Sdot(n int, x []float32, incX int, y []float32, incY int) float32
+	Snrm2(n int, x []float32, incX int) float32
+	Sasum(n int, x []float32, incX int) float32
+	Isamax(n int, x []float32, incX int) int
+	Sswap(n int, x []float32, incX int, y []float32, incY int)
+	Scopy(n int, x []float32, incX int, y []float32, incY int)
+	Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int)
+	Srotg(a, b float32) (c, s, r, z float32)
+	Srotmg(d1, d2, b1, b2 float32) (p SrotmParams, rd1, rd2, rb1 float32)
+	Srot(n int, x []float32, incX int, y []float32, incY int, c, s float32)
+	Srotm(n int, x []float32, incX int, y []float32, incY int, p SrotmParams)
+	Sscal(n int, alpha float32, x []float32, incX int)
+}
+
+// Float32Level2 implements the single precision real BLAS Level 2 routines.
+type Float32Level2 interface {
+	Sgemv(tA Transpose, m, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Sgbmv(tA Transpose, m, n, kL, kU int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Strmv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int)
+	Stbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int)
+	Stpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int)
+	Strsv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int)
+	Stbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int)
+	Stpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int)
+	Ssymv(ul Uplo, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Ssbmv(ul Uplo, n, k int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Sspmv(ul Uplo, n int, alpha float32, ap []float32, x []float32, incX int, beta float32, y []float32, incY int)
+	Sger(m, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int)
+	Ssyr(ul Uplo, n int, alpha float32, x []float32, incX int, a []float32, lda int)
+	Sspr(ul Uplo, n int, alpha float32, x []float32, incX int, ap []float32)
+	Ssyr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int)
+	Sspr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32)
+}
+
+// Float32Level3 implements the single precision real BLAS Level 3 routines.
+type Float32Level3 interface {
+	Sgemm(tA, tB Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
+	Ssymm(s Side, ul Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
+	Ssyrk(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int)
+	Ssyr2k(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
+	Strmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int)
+	Strsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int)
+}
+
+// Float64 implements the double precision real BLAS routines.
+type Float64 interface {
+	Float64Level1
+	Float64Level2
+	Float64Level3
+}
+
+// Float64Level1 implements the double precision real BLAS Level 1 routines.
+type Float64Level1 interface {
+	Ddot(n int, x []float64, incX int, y []float64, incY int) float64
+	Dnrm2(n int, x []float64, incX int) float64
+	Dasum(n int, x []float64, incX int) float64
+	Idamax(n int, x []float64, incX int) int
+	Dswap(n int, x []float64, incX int, y []float64, incY int)
+	Dcopy(n int, x []float64, incX int, y []float64, incY int)
+	Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int)
+	Drotg(a, b float64) (c, s, r, z float64)
+	Drotmg(d1, d2, b1, b2 float64) (p DrotmParams, rd1, rd2, rb1 float64)
+	Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64)
+	Drotm(n int, x []float64, incX int, y []float64, incY int, p DrotmParams)
+	Dscal(n int, alpha float64, x []float64, incX int)
+}
+
+// Float64Level2 implements the double precision real BLAS Level 2 routines.
+type Float64Level2 interface {
+	Dgemv(tA Transpose, m, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dgbmv(tA Transpose, m, n, kL, kU int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dtrmv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int)
+	Dtbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int)
+	Dtpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int)
+	Dtrsv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int)
+	Dtbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int)
+	Dtpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int)
+	Dsymv(ul Uplo, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dsbmv(ul Uplo, n, k int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dspmv(ul Uplo, n int, alpha float64, ap []float64, x []float64, incX int, beta float64, y []float64, incY int)
+	Dger(m, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int)
+	Dsyr(ul Uplo, n int, alpha float64, x []float64, incX int, a []float64, lda int)
+	Dspr(ul Uplo, n int, alpha float64, x []float64, incX int, ap []float64)
+	Dsyr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int)
+	Dspr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64)
+}
+
+// Float64Level3 implements the double precision real BLAS Level 3 routines.
+type Float64Level3 interface {
+	Dgemm(tA, tB Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
+	Dsymm(s Side, ul Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
+	Dsyrk(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int)
+	Dsyr2k(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
+	Dtrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int)
+	Dtrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int)
+}
+
+// Complex64 implements the single precision complex BLAS routines.
+type Complex64 interface {
+	Complex64Level1
+	Complex64Level2
+	Complex64Level3
+}
+
+// Complex64Level1 implements the single precision complex BLAS Level 1 routines.
+type Complex64Level1 interface {
+	Cdotu(n int, x []complex64, incX int, y []complex64, incY int) (dotu complex64)
+	Cdotc(n int, x []complex64, incX int, y []complex64, incY int) (dotc complex64)
+	Scnrm2(n int, x []complex64, incX int) float32
+	Scasum(n int, x []complex64, incX int) float32
+	Icamax(n int, x []complex64, incX int) int
+	Cswap(n int, x []complex64, incX int, y []complex64, incY int)
+	Ccopy(n int, x []complex64, incX int, y []complex64, incY int)
+	Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int)
+	Cscal(n int, alpha complex64, x []complex64, incX int)
+	Csscal(n int, alpha float32, x []complex64, incX int)
+}
+
+// Complex64Level2 implements the single precision complex BLAS Level 2 routines.
+type Complex64Level2 interface {
+	Cgemv(tA Transpose, m, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Cgbmv(tA Transpose, m, n, kL, kU int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Ctrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int)
+	Ctbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int)
+	Ctpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int)
+	Ctrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int)
+	Ctbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int)
+	Ctpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int)
+	Chemv(ul Uplo, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Chbmv(ul Uplo, n, k int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Chpmv(ul Uplo, n int, alpha complex64, ap []complex64, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Cgeru(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
+	Cgerc(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
+	Cher(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64, lda int)
+	Chpr(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64)
+	Cher2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
+	Chpr2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, ap []complex64)
+}
+
+// Complex64Level3 declares the single precision complex (complex64) BLAS Level 3 routines.
+type Complex64Level3 interface {
+	Cgemm(tA, tB Transpose, m, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Csymm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Csyrk(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, beta complex64, c []complex64, ldc int)
+	Csyr2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Ctrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int)
+	Ctrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int)
+	Chemm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Cherk(ul Uplo, t Transpose, n, k int, alpha float32, a []complex64, lda int, beta float32, c []complex64, ldc int)
+	Cher2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta float32, c []complex64, ldc int)
+}
+
+// Complex128 groups the double precision complex BLAS Level 1, 2 and 3 interfaces.
+type Complex128 interface {
+	Complex128Level1
+	Complex128Level2
+	Complex128Level3
+}
+
+// Complex128Level1 declares the double precision complex (complex128) BLAS Level 1 routines.
+type Complex128Level1 interface {
+	Zdotu(n int, x []complex128, incX int, y []complex128, incY int) (dotu complex128)
+	Zdotc(n int, x []complex128, incX int, y []complex128, incY int) (dotc complex128)
+	Dznrm2(n int, x []complex128, incX int) float64
+	Dzasum(n int, x []complex128, incX int) float64
+	Izamax(n int, x []complex128, incX int) int
+	Zswap(n int, x []complex128, incX int, y []complex128, incY int)
+	Zcopy(n int, x []complex128, incX int, y []complex128, incY int)
+	Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int)
+	Zscal(n int, alpha complex128, x []complex128, incX int)
+	Zdscal(n int, alpha float64, x []complex128, incX int)
+}
+
+// Complex128Level2 declares the double precision complex (complex128) BLAS Level 2 routines.
+type Complex128Level2 interface {
+	Zgemv(tA Transpose, m, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zgbmv(tA Transpose, m, n int, kL int, kU int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Ztrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int)
+	Ztbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int)
+	Ztpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int)
+	Ztrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int)
+	Ztbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int)
+	Ztpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int)
+	Zhemv(ul Uplo, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zhbmv(ul Uplo, n, k int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zhpmv(ul Uplo, n int, alpha complex128, ap []complex128, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zgeru(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
+	Zgerc(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
+	Zher(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128, lda int)
+	Zhpr(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128)
+	Zher2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
+	Zhpr2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, ap []complex128)
+}
+
+// Complex128Level3 declares the double precision complex (complex128) BLAS Level 3 routines.
+type Complex128Level3 interface {
+	Zgemm(tA, tB Transpose, m, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Zsymm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Zsyrk(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, beta complex128, c []complex128, ldc int)
+	Zsyr2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Ztrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int)
+	Ztrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int)
+	Zhemm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Zherk(ul Uplo, t Transpose, n, k int, alpha float64, a []complex128, lda int, beta float64, c []complex128, ldc int)
+	Zher2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta float64, c []complex128, ldc int)
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go b/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go
new file mode 100644
index 00000000000..64ac985c1cf
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go
@@ -0,0 +1,533 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blas64
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/gonum"
+)
+
+var blas64 blas.Float64 = gonum.Implementation{} // backend used by every wrapper below; replaced by Use
+
+// Use sets the BLAS float64 implementation used by subsequent calls in this
+// package. The default is gonum.org/v1/gonum/blas/gonum.Implementation.
+// Use assigns a package-level variable without any synchronization.
+func Use(b blas.Float64) {
+	blas64 = b
+}
+
+// Implementation returns the BLAS float64 implementation currently in use:
+// the default, or the value most recently passed to Use.
+//
+// Calling it directly gives finer control of parameters than the wrappers here.
+func Implementation() blas.Float64 {
+	return blas64
+}
+
+// Vector represents a vector with an associated element increment.
+type Vector struct {
+	N    int       // number of elements; passed as n to the Level 1 routines
+	Data []float64 // backing storage
+	Inc  int       // element increment; passed alongside Data to every routine
+}
+
+// General represents a matrix using the conventional storage scheme.
+type General struct {
+	Rows, Cols int       // matrix dimensions
+	Data       []float64 // row-major: From reads element (i, j) at Data[i*Stride+j]
+	Stride     int       // offset in Data between the starts of consecutive rows
+}
+
+// Band represents a band matrix using the band storage scheme.
+type Band struct {
+	Rows, Cols int       // matrix dimensions
+	KL, KU     int       // number of sub-diagonals and super-diagonals
+	Data       []float64 // From reads element (i, j) at Data[i*Stride+KL+j-i]
+	Stride     int       // row stride; the From methods require Stride >= KL+KU+1
+}
+
+// Triangular represents a triangular matrix using the conventional storage scheme.
+type Triangular struct {
+	Uplo   blas.Uplo // stored part; From accepts blas.Upper, blas.Lower and blas.All
+	Diag   blas.Diag // forwarded unchanged as the BLAS diag argument
+	N      int       // order of the matrix
+	Data   []float64 // row-major: From reads element (i, j) at Data[i*Stride+j]
+	Stride int       // offset in Data between the starts of consecutive rows
+}
+
+// TriangularBand represents a triangular matrix using the band storage scheme.
+type TriangularBand struct {
+	Uplo   blas.Uplo // blas.Upper or blas.Lower; From rejects any other value
+	Diag   blas.Diag // forwarded unchanged as the BLAS diag argument
+	N, K   int       // order of the matrix and number of off-diagonals
+	Data   []float64 // band storage; From treats it as a Band with KU or KL set to K
+	Stride int       // row stride; the From methods require Stride >= K+1
+}
+
+// TriangularPacked represents a triangular matrix using the packed storage scheme.
+type TriangularPacked struct {
+	Uplo blas.Uplo // forwarded unchanged as the BLAS uplo argument
+	Diag blas.Diag // forwarded unchanged as the BLAS diag argument
+	N    int       // order of the matrix
+	Data []float64 // packed elements; passed to the BLAS routine without a stride
+}
+
+// Symmetric represents a symmetric matrix using the conventional storage scheme.
+type Symmetric struct {
+	Uplo   blas.Uplo // stored triangle; From accepts blas.Upper and blas.Lower only
+	N      int       // order of the matrix
+	Data   []float64 // row-major: From reads element (i, j) at Data[i*Stride+j]
+	Stride int       // offset in Data between the starts of consecutive rows
+}
+
+// SymmetricBand represents a symmetric matrix using the band storage scheme.
+type SymmetricBand struct {
+	Uplo   blas.Uplo // blas.Upper or blas.Lower; From rejects any other value
+	N, K   int       // order of the matrix and number of off-diagonals
+	Data   []float64 // band storage; From treats it as a Band with KU or KL set to K
+	Stride int       // row stride; the From methods require Stride >= K+1
+}
+
+// SymmetricPacked represents a symmetric matrix using the packed storage scheme.
+type SymmetricPacked struct {
+	Uplo blas.Uplo // forwarded unchanged as the BLAS uplo argument
+	N    int       // order of the matrix
+	Data []float64 // packed elements; passed to the BLAS routine without a stride
+}
+
+// Level 1
+
+const (
+	negInc    = "blas64: negative vector increment" // panic value when a Vector has Inc < 0
+	badLength = "blas64: vector length mismatch"    // panic value when x.N != y.N
+)
+
+// Dot computes the dot product of the two vectors by calling Ddot:
+//
+//	\sum_i x[i]*y[i].
+//
+// Dot will panic with badLength if x.N and y.N differ.
+func Dot(x, y Vector) float64 {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	return blas64.Ddot(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Nrm2 computes the Euclidean norm of the vector x by calling Dnrm2:
+//
+//	sqrt(\sum_i x[i]*x[i]).
+//
+// Nrm2 will panic with negInc if x.Inc is negative.
+func Nrm2(x Vector) float64 {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return blas64.Dnrm2(x.N, x.Data, x.Inc)
+}
+
+// Asum computes the sum of the absolute values of the elements of x by calling Dasum:
+//
+//	\sum_i |x[i]|.
+//
+// Asum will panic with negInc if x.Inc is negative.
+func Asum(x Vector) float64 {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return blas64.Dasum(x.N, x.Data, x.Inc)
+}
+
+// Iamax returns the index of an element of x with the largest absolute value,
+// as reported by Idamax. If there are multiple such indices the earliest is
+// returned. Iamax returns -1 if n == 0.
+//
+// Iamax will panic with negInc if x.Inc is negative.
+func Iamax(x Vector) int {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return blas64.Idamax(x.N, x.Data, x.Inc)
+}
+
+// Swap exchanges the elements of the two vectors by calling Dswap:
+//
+//	x[i], y[i] = y[i], x[i] for all i.
+//
+// Swap will panic with badLength if x.N and y.N differ.
+func Swap(x, y Vector) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Dswap(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Copy copies the elements of x into the elements of y by calling Dcopy:
+//
+//	y[i] = x[i] for all i.
+//
+// Copy will panic with badLength if x.N and y.N differ.
+func Copy(x, y Vector) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Dcopy(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Axpy adds x scaled by alpha to y by calling Daxpy:
+//
+//	y[i] += alpha*x[i] for all i.
+//
+// Axpy will panic with badLength if x.N and y.N differ.
+func Axpy(alpha float64, x, y Vector) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Daxpy(x.N, alpha, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Rotg computes the parameters of a Givens plane rotation so that
+//
+//	⎡ c s⎤   ⎡a⎤   ⎡r⎤
+//	⎣-s c⎦ * ⎣b⎦ = ⎣0⎦
+//
+// where a and b are the Cartesian coordinates of a given point.
+// c, s, and r are defined as
+//
+//	r = ±Sqrt(a^2 + b^2),
+//	c = a/r, the cosine of the rotation angle,
+//	s = b/r, the sine of the rotation angle,
+//
+// and z is defined such that
+//
+//	if |a| > |b|,        z = s,
+//	otherwise if c != 0, z = 1/c,
+//	otherwise            z = 1.
+func Rotg(a, b float64) (c, s, r, z float64) {
+	return blas64.Drotg(a, b)
+}
+
+// Rotmg computes the modified Givens rotation by calling Drotmg and returns
+// its four results unchanged. For more details see
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+func Rotmg(d1, d2, b1, b2 float64) (p blas.DrotmParams, rd1, rd2, rb1 float64) {
+	return blas64.Drotmg(d1, d2, b1, b2)
+}
+
+// Rot applies a plane transformation to n points represented by the vectors x
+// and y, panicking with badLength if x.N and y.N differ:
+//
+//	x[i] =  c*x[i] + s*y[i],
+//	y[i] = -s*x[i] + c*y[i], for all i.
+func Rot(x, y Vector, c, s float64) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Drot(x.N, x.Data, x.Inc, y.Data, y.Inc, c, s)
+}
+
+// Rotm applies the modified Givens rotation described by p to the vectors x
+// and y. Rotm will panic with badLength if x.N and y.N differ.
+func Rotm(x, y Vector, p blas.DrotmParams) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Drotm(x.N, x.Data, x.Inc, y.Data, y.Inc, p)
+}
+
+// Scal scales the vector x by alpha by calling Dscal:
+//
+//	x[i] *= alpha for all i.
+//
+// Scal will panic with negInc if x.Inc is negative.
+func Scal(alpha float64, x Vector) {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	blas64.Dscal(x.N, alpha, x.Data, x.Inc)
+}
+
+// Level 2
+
+// Gemv computes (by calling Dgemv; x.N and y.N are not consulted)
+//
+//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
+//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars.
+func Gemv(t blas.Transpose, alpha float64, a General, x Vector, beta float64, y Vector) {
+	blas64.Dgemv(t, a.Rows, a.Cols, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Gbmv computes (by calling Dgbmv; x.N and y.N are not consulted)
+//
+//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
+//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an m×n band matrix, x and y are vectors, and alpha and beta are scalars.
+func Gbmv(t blas.Transpose, alpha float64, a Band, x Vector, beta float64, y Vector) {
+	blas64.Dgbmv(t, a.Rows, a.Cols, a.KL, a.KU, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Trmv computes (by calling Dtrmv; x.N is not consulted)
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular matrix, and x is a vector.
+func Trmv(t blas.Transpose, a Triangular, x Vector) {
+	blas64.Dtrmv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbmv computes (by calling Dtbmv; x.N is not consulted)
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular band matrix, and x is a vector.
+func Tbmv(t blas.Transpose, a TriangularBand, x Vector) {
+	blas64.Dtbmv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpmv computes (by calling Dtpmv; x.N is not consulted)
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular matrix in packed format, and x is a vector.
+func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) {
+	blas64.Dtpmv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc)
+}
+
+// Trsv solves (by calling Dtrsv; x.N is not consulted)
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular matrix, and x and b are vectors.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Trsv(t blas.Transpose, a Triangular, x Vector) {
+	blas64.Dtrsv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbsv solves (by calling Dtbsv; x.N is not consulted)
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular band matrix, and x and b are vectors.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tbsv(t blas.Transpose, a TriangularBand, x Vector) {
+	blas64.Dtbsv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpsv solves (by calling Dtpsv; x.N is not consulted)
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular matrix in packed format, and x and b are
+// vectors.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) {
+	blas64.Dtpsv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc)
+}
+
+// Symv computes (by calling Dsymv; x.N and y.N are not consulted)
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n symmetric matrix, x and y are vectors, and alpha and
+// beta are scalars.
+func Symv(alpha float64, a Symmetric, x Vector, beta float64, y Vector) {
+	blas64.Dsymv(a.Uplo, a.N, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Sbmv computes (by calling Dsbmv; x.N and y.N are not consulted)
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n symmetric band matrix, x and y are vectors, and alpha
+// and beta are scalars.
+func Sbmv(alpha float64, a SymmetricBand, x Vector, beta float64, y Vector) {
+	blas64.Dsbmv(a.Uplo, a.N, a.K, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Spmv computes (by calling Dspmv; x.N and y.N are not consulted)
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n symmetric matrix in packed format, x and y are vectors,
+// and alpha and beta are scalars.
+func Spmv(alpha float64, a SymmetricPacked, x Vector, beta float64, y Vector) {
+	blas64.Dspmv(a.Uplo, a.N, alpha, a.Data, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Ger performs a rank-1 update (by calling Dger; x.N and y.N are not consulted)
+//
+//	A += alpha * x * yᵀ,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+func Ger(alpha float64, x, y Vector, a General) {
+	blas64.Dger(a.Rows, a.Cols, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Syr performs a rank-1 update (by calling Dsyr; x.N is not consulted)
+//
+//	A += alpha * x * xᵀ,
+//
+// where A is an n×n symmetric matrix, x is a vector, and alpha is a scalar.
+func Syr(alpha float64, x Vector, a Symmetric) {
+	blas64.Dsyr(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data, a.Stride)
+}
+
+// Spr performs a rank-1 update (by calling Dspr; x.N is not consulted)
+//
+//	A += alpha * x * xᵀ,
+//
+// where A is an n×n symmetric matrix in packed format, x is a vector, and
+// alpha is a scalar.
+func Spr(alpha float64, x Vector, a SymmetricPacked) {
+	blas64.Dspr(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data)
+}
+
+// Syr2 performs a rank-2 update (by calling Dsyr2; x.N and y.N are not consulted)
+//
+//	A += alpha * x * yᵀ + alpha * y * xᵀ,
+//
+// where A is a symmetric n×n matrix, x and y are vectors, and alpha is a scalar.
+func Syr2(alpha float64, x, y Vector, a Symmetric) {
+	blas64.Dsyr2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Spr2 performs a rank-2 update (by calling Dspr2; x.N and y.N are not consulted)
+//
+//	A += alpha * x * yᵀ + alpha * y * xᵀ,
+//
+// where A is an n×n symmetric matrix in packed format, x and y are vectors,
+// and alpha is a scalar.
+func Spr2(alpha float64, x, y Vector, a SymmetricPacked) {
+	blas64.Dspr2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data)
+}
+
+// Level 3
+
+// Gemm computes (by calling Dgemm)
+//
+//	C = alpha * A * B + beta * C,
+//
+// where A, B, and C are dense matrices, and alpha and beta are scalars.
+// tA and tB specify whether A or B are transposed.
+func Gemm(tA, tB blas.Transpose, alpha float64, a, b General, beta float64, c General) {
+	var m, n, k int
+	if tA == blas.NoTrans { // m and k come from a alone; b's inner dimension is not compared with k
+		m, k = a.Rows, a.Cols
+	} else {
+		m, k = a.Cols, a.Rows
+	}
+	if tB == blas.NoTrans { // n comes from b alone; c.Rows and c.Cols are never consulted
+		n = b.Cols
+	} else {
+		n = b.Rows
+	}
+	blas64.Dgemm(tA, tB, m, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Symm performs (by calling Dsymm)
+//
+//	C = alpha * A * B + beta * C  if s == blas.Left,
+//	C = alpha * B * A + beta * C  if s == blas.Right,
+//
+// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and
+// alpha and beta are scalars.
+func Symm(s blas.Side, alpha float64, a Symmetric, b General, beta float64, c General) {
+	var m, n int
+	if s == blas.Left { // A is m×m on the left, n×n on the right; c's dimensions are not consulted
+		m, n = a.N, b.Cols
+	} else {
+		m, n = b.Rows, a.N
+	}
+	blas64.Dsymm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Syrk performs a symmetric rank-k update (by calling Dsyrk)
+//
+//	C = alpha * A * Aᵀ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᵀ * A + beta * C  if t == blas.Trans or blas.ConjTrans,
+//
+// where C is an n×n symmetric matrix, A is an n×k matrix if t == blas.NoTrans and
+// a k×n matrix otherwise, and alpha and beta are scalars.
+func Syrk(t blas.Transpose, alpha float64, a General, beta float64, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans { // n and k come from a; c.N is not consulted
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	blas64.Dsyrk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
+}
+
+// Syr2k performs a symmetric rank-2k update (by calling Dsyr2k)
+//
+//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if t == blas.Trans or blas.ConjTrans,
+//
+// where C is an n×n symmetric matrix, A and B are n×k matrices if t == NoTrans
+// and k×n matrices otherwise, and alpha and beta are scalars.
+func Syr2k(t blas.Transpose, alpha float64, a, b General, beta float64, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans { // n and k come from a; b's dimensions and c.N are not consulted
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	blas64.Dsyr2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Trmm performs (by calling Dtrmm with m, n = b.Rows, b.Cols; a.N is not consulted)
+//
+//	B = alpha * A * B   if tA == blas.NoTrans and s == blas.Left,
+//	B = alpha * Aᵀ * B  if tA == blas.Trans or blas.ConjTrans, and s == blas.Left,
+//	B = alpha * B * A   if tA == blas.NoTrans and s == blas.Right,
+//	B = alpha * B * Aᵀ  if tA == blas.Trans or blas.ConjTrans, and s == blas.Right,
+//
+// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is
+// a scalar.
+func Trmm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) {
+	blas64.Dtrmm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride)
+}
+
+// Trsm solves (by calling Dtrsm with m, n = b.Rows, b.Cols; a.N is not consulted)
+//
+//	A * X = alpha * B   if tA == blas.NoTrans and s == blas.Left,
+//	Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and s == blas.Left,
+//	X * A = alpha * B   if tA == blas.NoTrans and s == blas.Right,
+//	X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and s == blas.Right,
+//
+// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and
+// alpha is a scalar.
+//
+// At entry to the function, b contains the values of B, and the result X is
+// stored in place into b.
+//
+// No check is made that A is invertible.
+func Trsm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) {
+	blas64.Dtrsm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride)
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/blas64/conv.go b/vendor/gonum.org/v1/gonum/blas/blas64/conv.go
new file mode 100644
index 00000000000..6cc6517f1b9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/conv.go
@@ -0,0 +1,277 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blas64
+
+import "gonum.org/v1/gonum/blas"
+
+// GeneralCols represents a matrix using the conventional column-major storage scheme.
+type GeneralCols General
+
+// From copies the row-major matrix a into the column-major receiver, writing
+// element (i, j) to t.Data[i+j*t.Stride]. It panics if the dimensions differ
+// or if t.Data is shorter than (t.Cols-1)*t.Stride+t.Rows.
+func (t GeneralCols) From(a General) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("blas64: mismatched dimension")
+	}
+	if len(t.Data) < (t.Cols-1)*t.Stride+t.Rows {
+		panic("blas64: short data slice")
+	}
+	for i := 0; i < a.Rows; i++ {
+		for j, v := range a.Data[i*a.Stride : i*a.Stride+a.Cols] { // row i of a
+			t.Data[i+j*t.Stride] = v
+		}
+	}
+}
+
+// From copies the column-major matrix a into the row-major receiver, writing
+// element (i, j) to t.Data[i*t.Stride+j]. It panics if the dimensions differ
+// or if t.Data is shorter than (t.Rows-1)*t.Stride+t.Cols.
+func (t General) From(a GeneralCols) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("blas64: mismatched dimension")
+	}
+	if len(t.Data) < (t.Rows-1)*t.Stride+t.Cols {
+		panic("blas64: short data slice")
+	}
+	for j := 0; j < a.Cols; j++ {
+		for i, v := range a.Data[j*a.Stride : j*a.Stride+a.Rows] { // column j of a
+			t.Data[i*t.Stride+j] = v
+		}
+	}
+}
+
+// TriangularCols represents a triangular matrix using the conventional column-major storage scheme.
+type TriangularCols Triangular
+
+// From copies the stored part of the row-major matrix a into the column-major
+// receiver. It panics if N, Uplo or Diag differ, or if Uplo is not blas.Upper,
+// blas.Lower or blas.All. The length of t.Data is not checked beforehand.
+func (t TriangularCols) From(a Triangular) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("blas64: mismatched BLAS diag")
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper: // entries with j >= i
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	case blas.Lower: // entries with j <= i
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	case blas.All: // every entry of the N×N square
+		for i := 0; i < a.N; i++ {
+			for j := 0; j < a.N; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	}
+}
+
+// From copies the stored part of the column-major matrix a into the row-major
+// receiver. It panics if N, Uplo or Diag differ, or if Uplo is not blas.Upper,
+// blas.Lower or blas.All. The length of t.Data is not checked beforehand.
+func (t Triangular) From(a TriangularCols) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("blas64: mismatched BLAS diag")
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper: // entries with j >= i
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	case blas.Lower: // entries with j <= i
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	case blas.All: // every entry of the N×N square
+		for i := 0; i < a.N; i++ {
+			for j := 0; j < a.N; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	}
+}
+
+// BandCols represents a matrix using the band column-major storage scheme.
+type BandCols Band
+
+// From copies the row-major band matrix a into the column-major receiver.
+// It panics unless the dimensions and bandwidths match and both strides are
+// at least KL+KU+1; the lengths of the Data slices are not checked beforehand.
+func (t BandCols) From(a Band) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("blas64: mismatched dimension")
+	}
+	if t.KL != a.KL || t.KU != a.KU {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.KL+a.KU+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.KL+t.KU+1 {
+		panic("blas64: short stride for destination")
+	}
+	for i := 0; i < a.Rows; i++ {
+		for j := max(0, i-a.KL); j < min(i+a.KU+1, a.Cols); j++ { // columns of row i inside the band
+			t.Data[i+t.KU-j+j*t.Stride] = a.Data[j+a.KL-i+i*a.Stride]
+		}
+	}
+}
+
+// From copies the column-major band matrix a into the row-major receiver.
+// It panics unless the dimensions and bandwidths match and both strides are
+// at least KL+KU+1. Each side is indexed with its own Stride; they may differ.
+func (t Band) From(a BandCols) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("blas64: mismatched dimension")
+	}
+	if t.KL != a.KL || t.KU != a.KU {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.KL+a.KU+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.KL+t.KU+1 {
+		panic("blas64: short stride for destination")
+	}
+	for j := 0; j < a.Cols; j++ {
+		for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ { // rows of column j inside the band
+			t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
+		}
+	}
+}
+
+// TriangularBandCols represents a triangular matrix using the band column-major storage scheme.
+type TriangularBandCols TriangularBand
+
+// From copies a into the receiver by viewing both as band matrices with K
+// super-diagonals (blas.Upper) or K sub-diagonals (blas.Lower). It panics if
+// N, K, Uplo or Diag differ, a stride is below K+1, or Uplo is any other value.
+func (t TriangularBandCols) From(a TriangularBand) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("blas64: mismatched BLAS diag")
+	}
+	dst := BandCols{ // shares t.Data; KL and KU start at zero and one is set below
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := Band{ // shares a.Data
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// From copies a into the receiver by viewing both as band matrices with K
+// super-diagonals (blas.Upper) or K sub-diagonals (blas.Lower). It panics if
+// N, K, Uplo or Diag differ, a stride is below K+1, or Uplo is any other value.
+func (t TriangularBand) From(a TriangularBandCols) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("blas64: mismatched BLAS diag")
+	}
+	dst := Band{ // shares t.Data; KL and KU start at zero and one is set below
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{ // shares a.Data
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+func min(a, b int) int {
+	if a < b {
+		return a // a is strictly smaller
+	}
+	return b // b <= a here, so b is a smallest value
+}
+
+func max(a, b int) int {
+	if a > b {
+		return a // a is strictly larger
+	}
+	return b // b >= a here, so b is a largest value
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go b/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go
new file mode 100644
index 00000000000..5146f1a1c3c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go
@@ -0,0 +1,153 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blas64
+
+import "gonum.org/v1/gonum/blas"
+
+// SymmetricCols represents a symmetric matrix using the conventional column-major storage scheme.
+type SymmetricCols Symmetric
+
+// From copies the stored triangle of the row-major matrix a into the
+// column-major receiver. It panics if N or Uplo differ, or if Uplo is not
+// blas.Upper or blas.Lower. The length of t.Data is not checked beforehand.
+func (t SymmetricCols) From(a Symmetric) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper: // entries with j >= i
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	case blas.Lower: // entries with j <= i
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	}
+}
+
+// From copies the stored triangle of the column-major matrix a into the
+// row-major receiver. It panics if N or Uplo differ, or if Uplo is not
+// blas.Upper or blas.Lower. The length of t.Data is not checked beforehand.
+func (t Symmetric) From(a SymmetricCols) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper: // entries with j >= i
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	case blas.Lower: // entries with j <= i
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	}
+}
+
+// SymmetricBandCols represents a symmetric matrix using the band column-major storage scheme.
+type SymmetricBandCols SymmetricBand
+
+// From copies a into the receiver by viewing both as band matrices with K
+// super-diagonals (blas.Upper) or K sub-diagonals (blas.Lower). It panics if
+// N, K or Uplo differ, a stride is below K+1, or Uplo is any other value.
+func (t SymmetricBandCols) From(a SymmetricBand) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	dst := BandCols{ // shares t.Data; KL and KU start at zero and one is set below
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := Band{ // shares a.Data
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// From copies a into the receiver by viewing both as band matrices with K
+// super-diagonals (blas.Upper) or K sub-diagonals (blas.Lower). It panics if
+// N, K or Uplo differ, a stride is below K+1, or Uplo is any other value.
+func (t SymmetricBand) From(a SymmetricBandCols) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	dst := Band{ // shares t.Data; KL and KU start at zero and one is set below
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{ // shares a.Data
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/blas64/doc.go b/vendor/gonum.org/v1/gonum/blas/blas64/doc.go
new file mode 100644
index 00000000000..7410cee486f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package blas64 provides a simple interface to the float64 BLAS API.
+package blas64 // import "gonum.org/v1/gonum/blas/blas64"
diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go b/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go
new file mode 100644
index 00000000000..82a6f22e2bc
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go
@@ -0,0 +1,600 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/gonum"
+)
+
+var cblas128 blas.Complex128 = gonum.Implementation{}
+
+// Use sets the BLAS complex128 implementation to be used by subsequent BLAS calls.
+// The default implementation is
+// gonum.org/v1/gonum/blas/gonum.Implementation.
+func Use(b blas.Complex128) {
+	cblas128 = b
+}
+
+// Implementation returns the current BLAS complex128 implementation.
+//
+// Implementation allows direct calls to the current BLAS complex128 implementation
+// giving finer control of parameters.
+func Implementation() blas.Complex128 {
+	return cblas128
+}
+
+// Vector represents a vector with an associated element increment.
+type Vector struct {
+	N    int
+	Inc  int
+	Data []complex128
+}
+
+// General represents a matrix using the conventional storage scheme.
+type General struct {
+	Rows, Cols int
+	Stride     int
+	Data       []complex128
+}
+
+// Band represents a band matrix using the band storage scheme.
+type Band struct {
+	Rows, Cols int
+	KL, KU     int
+	Stride     int
+	Data       []complex128
+}
+
+// Triangular represents a triangular matrix using the conventional storage scheme.
+type Triangular struct {
+	N      int
+	Stride int
+	Data   []complex128
+	Uplo   blas.Uplo
+	Diag   blas.Diag
+}
+
+// TriangularBand represents a triangular matrix using the band storage scheme.
+type TriangularBand struct {
+	N, K   int
+	Stride int
+	Data   []complex128
+	Uplo   blas.Uplo
+	Diag   blas.Diag
+}
+
+// TriangularPacked represents a triangular matrix using the packed storage scheme.
+type TriangularPacked struct {
+	N    int
+	Data []complex128
+	Uplo blas.Uplo
+	Diag blas.Diag
+}
+
+// Symmetric represents a symmetric matrix using the conventional storage scheme.
+type Symmetric struct {
+	N      int
+	Stride int
+	Data   []complex128
+	Uplo   blas.Uplo
+}
+
+// SymmetricBand represents a symmetric matrix using the band storage scheme.
+type SymmetricBand struct {
+	N, K   int
+	Stride int
+	Data   []complex128
+	Uplo   blas.Uplo
+}
+
+// SymmetricPacked represents a symmetric matrix using the packed storage scheme.
+type SymmetricPacked struct {
+	N    int
+	Data []complex128
+	Uplo blas.Uplo
+}
+
+// Hermitian represents an Hermitian matrix using the conventional storage scheme.
+type Hermitian Symmetric
+
+// HermitianBand represents an Hermitian matrix using the band storage scheme.
+type HermitianBand SymmetricBand
+
+// HermitianPacked represents an Hermitian matrix using the packed storage scheme.
+type HermitianPacked SymmetricPacked
+
+// Level 1
+
+const (
+	negInc    = "cblas128: negative vector increment"
+	badLength = "cblas128: vector length mismatch"
+)
+
+// Dotu computes the dot product of the two vectors without
+// complex conjugation:
+//
+//	xᵀ * y.
+//
+// Dotu will panic if the lengths of x and y do not match.
+func Dotu(x, y Vector) complex128 {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	return cblas128.Zdotu(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Dotc computes the dot product of the two vectors with
+// complex conjugation:
+//
+//	xᴴ * y.
+//
+// Dotc will panic if the lengths of x and y do not match.
+func Dotc(x, y Vector) complex128 {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	return cblas128.Zdotc(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Nrm2 computes the Euclidean norm of the vector x:
+//
+//	sqrt(\sum_i |x[i]|^2).
+//
+// Nrm2 will panic if the vector increment is negative.
+func Nrm2(x Vector) float64 {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return cblas128.Dznrm2(x.N, x.Data, x.Inc)
+}
+
+// Asum computes the sum of magnitudes of the real and imaginary parts of
+// elements of the vector x:
+//
+//	\sum_i (|Re x[i]| + |Im x[i]|).
+//
+// Asum will panic if the vector increment is negative.
+func Asum(x Vector) float64 {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return cblas128.Dzasum(x.N, x.Data, x.Inc)
+}
+
+// Iamax returns the index of an element of x with the largest sum of
+// magnitudes of the real and imaginary parts (|Re x[i]|+|Im x[i]|).
+// If there are multiple such indices, the earliest is returned.
+//
+// Iamax returns -1 if n == 0.
+//
+// Iamax will panic if the vector increment is negative.
+func Iamax(x Vector) int {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return cblas128.Izamax(x.N, x.Data, x.Inc)
+}
+
+// Swap exchanges the elements of two vectors:
+//
+//	x[i], y[i] = y[i], x[i] for all i.
+//
+// Swap will panic if the lengths of x and y do not match.
+func Swap(x, y Vector) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	cblas128.Zswap(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Copy copies the elements of x into the elements of y:
+//
+//	y[i] = x[i] for all i.
+//
+// Copy will panic if the lengths of x and y do not match.
+func Copy(x, y Vector) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	cblas128.Zcopy(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Axpy computes
+//
+//	y = alpha * x + y,
+//
+// where x and y are vectors, and alpha is a scalar.
+// Axpy will panic if the lengths of x and y do not match.
+func Axpy(alpha complex128, x, y Vector) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	cblas128.Zaxpy(x.N, alpha, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Scal computes
+//
+//	x = alpha * x,
+//
+// where x is a vector, and alpha is a scalar.
+//
+// Scal will panic if the vector increment is negative.
+func Scal(alpha complex128, x Vector) {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	cblas128.Zscal(x.N, alpha, x.Data, x.Inc)
+}
+
+// Dscal computes
+//
+//	x = alpha * x,
+//
+// where x is a vector, and alpha is a real scalar.
+//
+// Dscal will panic if the vector increment is negative.
+func Dscal(alpha float64, x Vector) {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	cblas128.Zdscal(x.N, alpha, x.Data, x.Inc)
+}
+
+// Level 2
+
+// Gemv computes
+//
+//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
+//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans,
+//	y = alpha * Aᴴ * x + beta * y  if t == blas.ConjTrans,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are
+// scalars.
+func Gemv(t blas.Transpose, alpha complex128, a General, x Vector, beta complex128, y Vector) {
+	cblas128.Zgemv(t, a.Rows, a.Cols, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Gbmv computes
+//
+//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
+//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans,
+//	y = alpha * Aᴴ * x + beta * y  if t == blas.ConjTrans,
+//
+// where A is an m×n band matrix, x and y are vectors, and alpha and beta are
+// scalars.
+func Gbmv(t blas.Transpose, alpha complex128, a Band, x Vector, beta complex128, y Vector) {
+	cblas128.Zgbmv(t, a.Rows, a.Cols, a.KL, a.KU, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Trmv computes
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans,
+//	x = Aᴴ * x  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular matrix, and x is a vector.
+func Trmv(t blas.Transpose, a Triangular, x Vector) {
+	cblas128.Ztrmv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbmv computes
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans,
+//	x = Aᴴ * x  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular band matrix, and x is a vector.
+func Tbmv(t blas.Transpose, a TriangularBand, x Vector) {
+	cblas128.Ztbmv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpmv computes
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans,
+//	x = Aᴴ * x  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular matrix in packed format, and x is a vector.
+func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) {
+	cblas128.Ztpmv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc)
+}
+
+// Trsv solves
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans,
+//	Aᴴ * x = b  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular matrix and x is a vector.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Trsv(t blas.Transpose, a Triangular, x Vector) {
+	cblas128.Ztrsv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbsv solves
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans,
+//	Aᴴ * x = b  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular band matrix, and x is a vector.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tbsv(t blas.Transpose, a TriangularBand, x Vector) {
+	cblas128.Ztbsv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpsv solves
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans,
+//	Aᴴ * x = b  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular matrix in packed format and x is a vector.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) {
+	cblas128.Ztpsv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc)
+}
+
+// Hemv computes
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n Hermitian matrix, x and y are vectors, and alpha and
+// beta are scalars.
+func Hemv(alpha complex128, a Hermitian, x Vector, beta complex128, y Vector) {
+	cblas128.Zhemv(a.Uplo, a.N, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Hbmv performs
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n Hermitian band matrix, x and y are vectors, and alpha
+// and beta are scalars.
+func Hbmv(alpha complex128, a HermitianBand, x Vector, beta complex128, y Vector) {
+	cblas128.Zhbmv(a.Uplo, a.N, a.K, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Hpmv performs
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n Hermitian matrix in packed format, x and y are vectors,
+// and alpha and beta are scalars.
+func Hpmv(alpha complex128, a HermitianPacked, x Vector, beta complex128, y Vector) {
+	cblas128.Zhpmv(a.Uplo, a.N, alpha, a.Data, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Geru performs a rank-1 update
+//
+//	A += alpha * x * yᵀ,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+func Geru(alpha complex128, x, y Vector, a General) {
+	cblas128.Zgeru(a.Rows, a.Cols, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Gerc performs a rank-1 update
+//
+//	A += alpha * x * yᴴ,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+func Gerc(alpha complex128, x, y Vector, a General) {
+	cblas128.Zgerc(a.Rows, a.Cols, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Her performs a rank-1 update
+//
+//	A += alpha * x * xᴴ,
+//
+// where A is an n×n Hermitian matrix, x is a vector, and alpha is a real scalar.
+func Her(alpha float64, x Vector, a Hermitian) {
+	cblas128.Zher(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data, a.Stride)
+}
+
+// Hpr performs a rank-1 update
+//
+//	A += alpha * x * xᴴ,
+//
+// where A is an n×n Hermitian matrix in packed format, x is a vector, and
+// alpha is a scalar.
+func Hpr(alpha float64, x Vector, a HermitianPacked) {
+	cblas128.Zhpr(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data)
+}
+
+// Her2 performs a rank-2 update
+//
+//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ,
+//
+// where A is an n×n Hermitian matrix, x and y are vectors, and alpha is a scalar.
+func Her2(alpha complex128, x, y Vector, a Hermitian) {
+	cblas128.Zher2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Hpr2 performs a rank-2 update
+//
+//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ,
+//
+// where A is an n×n Hermitian matrix in packed format, x and y are vectors,
+// and alpha is a scalar.
+func Hpr2(alpha complex128, x, y Vector, a HermitianPacked) {
+	cblas128.Zhpr2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data)
+}
+
+// Level 3
+
+// Gemm computes
+//
+//	C = alpha * A * B + beta * C,
+//
+// where A, B, and C are dense matrices, and alpha and beta are scalars.
+// tA and tB specify whether A or B are transposed or conjugated.
+func Gemm(tA, tB blas.Transpose, alpha complex128, a, b General, beta complex128, c General) {
+	var m, n, k int
+	if tA == blas.NoTrans {
+		m, k = a.Rows, a.Cols
+	} else {
+		m, k = a.Cols, a.Rows
+	}
+	if tB == blas.NoTrans {
+		n = b.Cols
+	} else {
+		n = b.Rows
+	}
+	cblas128.Zgemm(tA, tB, m, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Symm performs
+//
+//	C = alpha * A * B + beta * C  if s == blas.Left,
+//	C = alpha * B * A + beta * C  if s == blas.Right,
+//
+// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and
+// alpha and beta are scalars.
+func Symm(s blas.Side, alpha complex128, a Symmetric, b General, beta complex128, c General) {
+	var m, n int
+	if s == blas.Left {
+		m, n = a.N, b.Cols
+	} else {
+		m, n = b.Rows, a.N
+	}
+	cblas128.Zsymm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Syrk performs a symmetric rank-k update
+//
+//	C = alpha * A * Aᵀ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᵀ * A + beta * C  if t == blas.Trans,
+//
+// where C is an n×n symmetric matrix, A is an n×k matrix if t == blas.NoTrans
+// and a k×n matrix otherwise, and alpha and beta are scalars.
+func Syrk(t blas.Transpose, alpha complex128, a General, beta complex128, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zsyrk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
+}
+
+// Syr2k performs a symmetric rank-2k update
+//
+//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if t == blas.Trans,
+//
+// where C is an n×n symmetric matrix, A and B are n×k matrices if
+// t == blas.NoTrans and k×n otherwise, and alpha and beta are scalars.
+func Syr2k(t blas.Transpose, alpha complex128, a, b General, beta complex128, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zsyr2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Trmm performs
+//
+//	B = alpha * A * B   if tA == blas.NoTrans and s == blas.Left,
+//	B = alpha * Aᵀ * B  if tA == blas.Trans and s == blas.Left,
+//	B = alpha * Aᴴ * B  if tA == blas.ConjTrans and s == blas.Left,
+//	B = alpha * B * A   if tA == blas.NoTrans and s == blas.Right,
+//	B = alpha * B * Aᵀ  if tA == blas.Trans and s == blas.Right,
+//	B = alpha * B * Aᴴ  if tA == blas.ConjTrans and s == blas.Right,
+//
+// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is
+// a scalar.
+func Trmm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) {
+	cblas128.Ztrmm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride)
+}
+
+// Trsm solves
+//
+//	A * X = alpha * B   if tA == blas.NoTrans and s == blas.Left,
+//	Aᵀ * X = alpha * B  if tA == blas.Trans and s == blas.Left,
+//	Aᴴ * X = alpha * B  if tA == blas.ConjTrans and s == blas.Left,
+//	X * A = alpha * B   if tA == blas.NoTrans and s == blas.Right,
+//	X * Aᵀ = alpha * B  if tA == blas.Trans and s == blas.Right,
+//	X * Aᴴ = alpha * B  if tA == blas.ConjTrans and s == blas.Right,
+//
+// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and
+// alpha is a scalar.
+//
+// At entry to the function, b contains the values of B, and the result is
+// stored in-place into b.
+//
+// No check is made that A is invertible.
+func Trsm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) {
+	cblas128.Ztrsm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride)
+}
+
+// Hemm performs
+//
+//	C = alpha * A * B + beta * C  if s == blas.Left,
+//	C = alpha * B * A + beta * C  if s == blas.Right,
+//
+// where A is an n×n or m×m Hermitian matrix, B and C are m×n matrices, and
+// alpha and beta are scalars.
+func Hemm(s blas.Side, alpha complex128, a Hermitian, b General, beta complex128, c General) {
+	var m, n int
+	if s == blas.Left {
+		m, n = a.N, b.Cols
+	} else {
+		m, n = b.Rows, a.N
+	}
+	cblas128.Zhemm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Herk performs the Hermitian rank-k update
+//
+//	C = alpha * A * Aᴴ + beta*C  if t == blas.NoTrans,
+//	C = alpha * Aᴴ * A + beta*C  if t == blas.ConjTrans,
+//
+// where C is an n×n Hermitian matrix, A is an n×k matrix if t == blas.NoTrans
+// and a k×n matrix otherwise, and alpha and beta are scalars.
+func Herk(t blas.Transpose, alpha float64, a General, beta float64, c Hermitian) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zherk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
+}
+
+// Her2k performs the Hermitian rank-2k update
+//
+//	C = alpha * A * Bᴴ + conj(alpha) * B * Aᴴ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᴴ * B + conj(alpha) * Bᴴ * A + beta * C  if t == blas.ConjTrans,
+//
+// where C is an n×n Hermitian matrix, A and B are n×k matrices if t == NoTrans
+// and k×n matrices otherwise, and alpha and beta are scalars.
+func Her2k(t blas.Transpose, alpha complex128, a, b General, beta float64, c Hermitian) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zher2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go b/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go
new file mode 100644
index 00000000000..c459e1d87e3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go
@@ -0,0 +1,279 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import "gonum.org/v1/gonum/blas"
+
+// GeneralCols represents a matrix using the conventional column-major storage scheme.
+type GeneralCols General
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions as a and have adequate backing
+// data storage.
+func (t GeneralCols) From(a General) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("cblas128: mismatched dimension")
+	}
+	if len(t.Data) < (t.Cols-1)*t.Stride+t.Rows {
+		panic("cblas128: short data slice")
+	}
+	for i := 0; i < a.Rows; i++ {
+		for j, v := range a.Data[i*a.Stride : i*a.Stride+a.Cols] {
+			t.Data[i+j*t.Stride] = v
+		}
+	}
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions as a and have adequate backing
+// data storage.
+func (t General) From(a GeneralCols) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("cblas128: mismatched dimension")
+	}
+	if len(t.Data) < (t.Rows-1)*t.Stride+t.Cols {
+		panic("cblas128: short data slice")
+	}
+	for j := 0; j < a.Cols; j++ {
+		for i, v := range a.Data[j*a.Stride : j*a.Stride+a.Rows] {
+			t.Data[i*t.Stride+j] = v
+		}
+	}
+}
+
+// TriangularCols represents a matrix using the conventional column-major storage scheme.
+type TriangularCols Triangular
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, uplo and diag as a and have
+// adequate backing data storage.
+func (t TriangularCols) From(a Triangular) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("cblas128: mismatched BLAS diag")
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	case blas.Lower:
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	case blas.All:
+		for i := 0; i < a.N; i++ {
+			for j := 0; j < a.N; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	}
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, uplo and diag as a and have
+// adequate backing data storage.
+func (t Triangular) From(a TriangularCols) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("cblas128: mismatched BLAS diag")
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	case blas.Lower:
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	case blas.All:
+		for i := 0; i < a.N; i++ {
+			for j := 0; j < a.N; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	}
+}
+
+// BandCols represents a matrix using the band column-major storage scheme.
+type BandCols Band
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions and bandwidth as a and have
+// adequate backing data storage.
+func (t BandCols) From(a Band) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.KL != a.KL || t.KU != a.KU {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.KL+a.KU+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.KL+t.KU+1 {
+		panic("cblas128: short stride for destination")
+	}
+	for i := 0; i < a.Rows; i++ {
+		for j := max(0, i-a.KL); j < min(i+a.KU+1, a.Cols); j++ {
+			t.Data[i+t.KU-j+j*t.Stride] = a.Data[j+a.KL-i+i*a.Stride]
+		}
+	}
+}
+
+// From fills the receiver with elements from a. The receiver must
+// have the same dimensions and bandwidth as a and have adequate
+// backing data storage; the strides of t and a may differ.
+func (t Band) From(a BandCols) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.KL != a.KL || t.KU != a.KU {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.KL+a.KU+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.KL+t.KU+1 {
+		panic("cblas128: short stride for destination")
+	}
+	for j := 0; j < a.Cols; j++ {
+		for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ {
+			t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
+		}
+	}
+}
+
+// TriangularBandCols represents a triangular matrix using the band column-major storage scheme.
+type TriangularBandCols TriangularBand
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBandCols) From(a TriangularBand) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("cblas128: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("cblas128: mismatched BLAS diag")
+	}
+	dst := BandCols{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := Band{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBand) From(a TriangularBandCols) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("cblas128: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("cblas128: mismatched BLAS diag")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+func min(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+func max(a, b int) int {
+	if a > b {
+		return a
+	}
+	return b
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go
new file mode 100644
index 00000000000..51c3a5777bb
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go
@@ -0,0 +1,155 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import "gonum.org/v1/gonum/blas"
+
+// HermitianCols represents a matrix using the conventional column-major storage scheme.
+type HermitianCols Hermitian
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions and uplo as a and have adequate
+// backing data storage.
+func (t HermitianCols) From(a Hermitian) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	case blas.Lower:
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	}
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions and uplo as a and have adequate
+// backing data storage.
+func (t Hermitian) From(a HermitianCols) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	case blas.Lower:
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	}
+}
+
+// HermitianBandCols represents an Hermitian matrix using the band column-major storage scheme.
+type HermitianBandCols HermitianBand
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t HermitianBandCols) From(a HermitianBand) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("cblas128: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	dst := BandCols{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := Band{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t HermitianBand) From(a HermitianBandCols) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("cblas128: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go
new file mode 100644
index 00000000000..f1bf40c2083
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go
@@ -0,0 +1,155 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import "gonum.org/v1/gonum/blas"
+
+// SymmetricCols represents a symmetric matrix using the conventional column-major storage scheme.
+type SymmetricCols Symmetric
+
+// From copies the Uplo triangle of the row-major a into the column-major
+// receiver, which must have adequate backing data storage. It panics if N
+// or Uplo differ between t and a, or if Uplo is not blas.Upper or blas.Lower.
+func (t SymmetricCols) From(a Symmetric) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	case blas.Lower:
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+			}
+		}
+	}
+}
+
+// From copies the Uplo triangle of the column-major a into the row-major
+// receiver, which must have adequate backing data storage. It panics if N
+// or Uplo differ between t and a, or if Uplo is not blas.Upper or blas.Lower.
+func (t Symmetric) From(a SymmetricCols) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		for i := 0; i < a.N; i++ {
+			for j := i; j < a.N; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	case blas.Lower:
+		for i := 0; i < a.N; i++ {
+			for j := 0; j <= i; j++ {
+				t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	}
+}
+
+// SymmetricBandCols represents a symmetric matrix using the band column-major storage scheme.
+type SymmetricBandCols SymmetricBand
+
+// From fills the receiver, which must have adequate backing data storage, with
+// the elements of a via the general band conversion (K maps to KU or KL by uplo).
+// It panics if N, K or Uplo differ, a stride is below K+1, or Uplo is invalid.
+func (t SymmetricBandCols) From(a SymmetricBand) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("cblas128: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	dst := BandCols{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := Band{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// From fills the receiver, which must have adequate backing data storage, with
+// the elements of a via the general band conversion (K maps to KU or KL by uplo).
+// It panics if N, K or Uplo differ, a stride is below K+1, or Uplo is invalid.
+func (t SymmetricBand) From(a SymmetricBandCols) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("cblas128: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go b/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go
new file mode 100644
index 00000000000..09719b19e63
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package cblas128 provides a simple interface to the complex128 BLAS API.
+package cblas128 // import "gonum.org/v1/gonum/blas/cblas128"
diff --git a/vendor/gonum.org/v1/gonum/blas/conversions.bash b/vendor/gonum.org/v1/gonum/blas/conversions.bash
new file mode 100644
index 00000000000..d1c0ef0d995
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/conversions.bash
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+
+# Copyright ©2017 The Gonum Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Generate code for blas32.
+echo Generating blas32/conv.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv.go
+cat blas64/conv.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+\
+>> blas32/conv.go
+
+echo Generating blas32/conv_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_test.go
+cat blas64/conv_test.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+\
+>> blas32/conv_test.go
+
+echo Generating blas32/conv_symmetric.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+\
+>> blas32/conv_symmetric.go
+
+echo Generating blas32/conv_symmetric_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+\
+>> blas32/conv_symmetric_test.go
+
+
+# Generate code for cblas128.
+echo Generating cblas128/conv.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv.go
+cat blas64/conv.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+\
+>> cblas128/conv.go
+
+echo Generating cblas128/conv_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_test.go
+cat blas64/conv_test.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's_"math"_math "math/cmplx"_' \
+\
+>> cblas128/conv_test.go
+
+echo Generating cblas128/conv_symmetric.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+\
+>> cblas128/conv_symmetric.go
+
+echo Generating cblas128/conv_symmetric_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's_"math"_math "math/cmplx"_' \
+\
+>> cblas128/conv_symmetric_test.go
+
+echo Generating cblas128/conv_hermitian.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+\
+>> cblas128/conv_hermitian.go
+
+echo Generating cblas128/conv_hermitian_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+      -e 's_"math"_math "math/cmplx"_' \
+\
+>> cblas128/conv_hermitian_test.go
+
+
+# Generate code for cblas64.
+echo Generating cblas64/conv.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv.go
+cat blas64/conv.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+\
+>> cblas64/conv.go
+
+echo Generating cblas64/conv_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_test.go
+cat blas64/conv_test.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \
+\
+>> cblas64/conv_test.go
+
+echo Generating cblas64/conv_hermitian.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+\
+>> cblas64/conv_hermitian.go
+
+echo Generating cblas64/conv_hermitian_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \
+\
+>> cblas64/conv_hermitian_test.go
diff --git a/vendor/gonum.org/v1/gonum/blas/doc.go b/vendor/gonum.org/v1/gonum/blas/doc.go
new file mode 100644
index 00000000000..ea4b16c904d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/doc.go
@@ -0,0 +1,108 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package blas provides interfaces for the BLAS linear algebra standard.
+
+All methods must perform appropriate parameter checking and panic if
+provided parameters that do not conform to the requirements specified
+by the BLAS standard.
+
+Quick Reference Guide to the BLAS from http://www.netlib.org/lapack/lug/node145.html
+
+This version is modified to remove the "order" option. All matrix operations are
+on row-order matrices.
+
+Level 1 BLAS
+
+	        dim scalar vector   vector   scalars              5-element prefixes
+	                                                          struct
+
+	_rotg (                                      a, b )                S, D
+	_rotmg(                              d1, d2, a, b )                S, D
+	_rot  ( n,         x, incX, y, incY,               c, s )          S, D
+	_rotm ( n,         x, incX, y, incY,                      param )  S, D
+	_swap ( n,         x, incX, y, incY )                              S, D, C, Z
+	_scal ( n,  alpha, x, incX )                                       S, D, C, Z, Cs, Zd
+	_copy ( n,         x, incX, y, incY )                              S, D, C, Z
+	_axpy ( n,  alpha, x, incX, y, incY )                              S, D, C, Z
+	_dot  ( n,         x, incX, y, incY )                              S, D, Ds
+	_dotu ( n,         x, incX, y, incY )                              C, Z
+	_dotc ( n,         x, incX, y, incY )                              C, Z
+	__dot ( n,  alpha, x, incX, y, incY )                              Sds
+	_nrm2 ( n,         x, incX )                                       S, D, Sc, Dz
+	_asum ( n,         x, incX )                                       S, D, Sc, Dz
+	I_amax( n,         x, incX )                                       s, d, c, z
+
+Level 2 BLAS
+
+	        options                   dim   b-width scalar matrix  vector   scalar vector   prefixes
+
+	_gemv (        trans,      m, n,         alpha, a, lda, x, incX, beta,  y, incY ) S, D, C, Z
+	_gbmv (        trans,      m, n, kL, kU, alpha, a, lda, x, incX, beta,  y, incY ) S, D, C, Z
+	_hemv ( uplo,                 n,         alpha, a, lda, x, incX, beta,  y, incY ) C, Z
+	_hbmv ( uplo,                 n, k,      alpha, a, lda, x, incX, beta,  y, incY ) C, Z
+	_hpmv ( uplo,                 n,         alpha, ap,     x, incX, beta,  y, incY ) C, Z
+	_symv ( uplo,                 n,         alpha, a, lda, x, incX, beta,  y, incY ) S, D
+	_sbmv ( uplo,                 n, k,      alpha, a, lda, x, incX, beta,  y, incY ) S, D
+	_spmv ( uplo,                 n,         alpha, ap,     x, incX, beta,  y, incY ) S, D
+	_trmv ( uplo, trans, diag,    n,                a, lda, x, incX )                 S, D, C, Z
+	_tbmv ( uplo, trans, diag,    n, k,             a, lda, x, incX )                 S, D, C, Z
+	_tpmv ( uplo, trans, diag,    n,                ap,     x, incX )                 S, D, C, Z
+	_trsv ( uplo, trans, diag,    n,                a, lda, x, incX )                 S, D, C, Z
+	_tbsv ( uplo, trans, diag,    n, k,             a, lda, x, incX )                 S, D, C, Z
+	_tpsv ( uplo, trans, diag,    n,                ap,     x, incX )                 S, D, C, Z
+
+	        options                   dim   scalar vector   vector   matrix  prefixes
+
+	_ger  (                    m, n, alpha, x, incX, y, incY, a, lda ) S, D
+	_geru (                    m, n, alpha, x, incX, y, incY, a, lda ) C, Z
+	_gerc (                    m, n, alpha, x, incX, y, incY, a, lda ) C, Z
+	_her  ( uplo,                 n, alpha, x, incX,          a, lda ) C, Z
+	_hpr  ( uplo,                 n, alpha, x, incX,          ap )     C, Z
+	_her2 ( uplo,                 n, alpha, x, incX, y, incY, a, lda ) C, Z
+	_hpr2 ( uplo,                 n, alpha, x, incX, y, incY, ap )     C, Z
+	_syr  ( uplo,                 n, alpha, x, incX,          a, lda ) S, D
+	_spr  ( uplo,                 n, alpha, x, incX,          ap )     S, D
+	_syr2 ( uplo,                 n, alpha, x, incX, y, incY, a, lda ) S, D
+	_spr2 ( uplo,                 n, alpha, x, incX, y, incY, ap )     S, D
+
+Level 3 BLAS
+
+	        options                                 dim      scalar matrix  matrix  scalar matrix  prefixes
+
+	_gemm (             transA, transB,      m, n, k, alpha, a, lda, b, ldb, beta,  c, ldc ) S, D, C, Z
+	_symm ( side, uplo,                      m, n,    alpha, a, lda, b, ldb, beta,  c, ldc ) S, D, C, Z
+	_hemm ( side, uplo,                      m, n,    alpha, a, lda, b, ldb, beta,  c, ldc ) C, Z
+	_syrk (       uplo, trans,                  n, k, alpha, a, lda,         beta,  c, ldc ) S, D, C, Z
+	_herk (       uplo, trans,                  n, k, alpha, a, lda,         beta,  c, ldc ) C, Z
+	_syr2k(       uplo, trans,                  n, k, alpha, a, lda, b, ldb, beta,  c, ldc ) S, D, C, Z
+	_her2k(       uplo, trans,                  n, k, alpha, a, lda, b, ldb, beta,  c, ldc ) C, Z
+	_trmm ( side, uplo, transA,        diag, m, n,    alpha, a, lda, b, ldb )                S, D, C, Z
+	_trsm ( side, uplo, transA,        diag, m, n,    alpha, a, lda, b, ldb )                S, D, C, Z
+
+Meaning of prefixes
+
+	S - float32	C - complex64
+	D - float64	Z - complex128
+
+Matrix types
+
+	GE - GEneral 		GB - General Band
+	SY - SYmmetric 		SB - Symmetric Band 	SP - Symmetric Packed
+	HE - HErmitian 		HB - Hermitian Band 	HP - Hermitian Packed
+	TR - TRiangular 	TB - Triangular Band 	TP - Triangular Packed
+
+Options
+
+	trans 	= NoTrans, Trans, ConjTrans
+	uplo 	= Upper, Lower
+	diag 	= Nonunit, Unit
+	side 	= Left, Right (A or op(A) on the left, or A or op(A) on the right)
+
+For real matrices, Trans and ConjTrans have the same meaning.
+For Hermitian matrices, trans = Trans is not allowed.
+For complex symmetric matrices, trans = ConjTrans is not allowed.
+*/
+package blas // import "gonum.org/v1/gonum/blas"
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go b/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go
new file mode 100644
index 00000000000..9e74cc1dbf3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go
@@ -0,0 +1,297 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"runtime"
+	"sync"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// Dgemm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C
+//	C = alpha * Aᵀ * B + beta * C
+//	C = alpha * A * Bᵀ + beta * C
+//	C = alpha * Aᵀ * Bᵀ + beta * C
+//
+// where A is an m×k or k×m dense matrix, B is an n×k or k×n dense matrix, C is
+// an m×n matrix, and alpha and beta are scalars. tA and tB specify whether A or
+// B are transposed. Dgemm panics on invalid arguments or short slices.
+func (Implementation) Dgemm(tA, tB blas.Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	switch tA {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch tB {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	aTrans := tA == blas.Trans || tA == blas.ConjTrans
+	if aTrans {
+		if lda < max(1, m) {
+			panic(badLdA)
+		}
+	} else {
+		if lda < max(1, k) {
+			panic(badLdA)
+		}
+	}
+	bTrans := tB == blas.Trans || tB == blas.ConjTrans
+	if bTrans {
+		if ldb < max(1, k) {
+			panic(badLdB)
+		}
+	} else {
+		if ldb < max(1, n) {
+			panic(badLdB)
+		}
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if C has no elements.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if aTrans {
+		if len(a) < (k-1)*lda+m {
+			panic(shortA)
+		}
+	} else {
+		if len(a) < (m-1)*lda+k {
+			panic(shortA)
+		}
+	}
+	if bTrans {
+		if len(b) < (n-1)*ldb+k {
+			panic(shortB)
+		}
+	} else {
+		if len(b) < (k-1)*ldb+n {
+			panic(shortB)
+		}
+	}
+	if len(c) < (m-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if the product term vanishes and C is not rescaled.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	// Scale c by beta, zeroing it explicitly when beta is 0.
+	if beta != 1 {
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+		}
+	}
+
+	dgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+}
+
+func dgemmParallel(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// dgemmParallel computes a parallel matrix multiplication by partitioning
+	// a and b into sub-blocks, and updating c with the products of the sub-blocks.
+	// In all cases,
+	// A = [ 	A_11	A_12 ... 	A_1j
+	//			A_21	A_22 ...	A_2j
+	//				...
+	//			A_i1	A_i2 ...	A_ij]
+	//
+	// and same for B. All of the submatrix sizes are blockSize×blockSize except
+	// at the edges.
+	//
+	// In all cases, there is one dimension for each matrix along which
+	// C must be updated sequentially.
+	// Cij = \sum_k Aik Bkj,	(A * B)
+	// Cij = \sum_k Aki Bkj,	(Aᵀ * B)
+	// Cij = \sum_k Aik Bjk,	(A * Bᵀ)
+	// Cij = \sum_k Aki Bjk,	(Aᵀ * Bᵀ)
+	//
+	// This code computes one {i, j} block sequentially along the k dimension,
+	// and computes all of the {i, j} blocks concurrently. This
+	// partitioning allows Cij to be updated in-place without race-conditions.
+	// One goroutine is started per {i, j} block, but a buffered channel is used
+	// as a semaphore so that no more than GOMAXPROCS of them are in flight at
+	// any one time.
+	//
+	// http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix
+	// multiplies, though this code does not copy matrices to attempt to eliminate
+	// cache misses.
+
+	maxKLen := k
+	parBlocks := blocks(m, blockSize) * blocks(n, blockSize)
+	if parBlocks < minParBlock {
+		// The matrix multiplication is small in the dimensions where it can be
+		// computed concurrently. Just do it in serial.
+		dgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+
+	// workerLimit bounds the number of concurrently running workers,
+	// with the limit set to the number of procs available.
+	workerLimit := make(chan struct{}, runtime.GOMAXPROCS(0))
+
+	// wg is used to wait for all of the block workers to finish.
+	var wg sync.WaitGroup
+	wg.Add(parBlocks)
+	defer wg.Wait()
+
+	for i := 0; i < m; i += blockSize {
+		for j := 0; j < n; j += blockSize {
+			workerLimit <- struct{}{}
+			go func(i, j int) {
+				defer func() {
+					wg.Done()
+					<-workerLimit
+				}()
+
+				leni := blockSize
+				if i+leni > m {
+					leni = m - i
+				}
+				lenj := blockSize
+				if j+lenj > n {
+					lenj = n - j
+				}
+
+				cSub := sliceView64(c, ldc, i, j, leni, lenj)
+
+				// Compute A_ik B_kj for all k
+				for k := 0; k < maxKLen; k += blockSize {
+					lenk := blockSize
+					if k+lenk > maxKLen {
+						lenk = maxKLen - k
+					}
+					var aSub, bSub []float64
+					if aTrans {
+						aSub = sliceView64(a, lda, k, i, lenk, leni)
+					} else {
+						aSub = sliceView64(a, lda, i, k, leni, lenk)
+					}
+					if bTrans {
+						bSub = sliceView64(b, ldb, j, k, lenj, lenk)
+					} else {
+						bSub = sliceView64(b, ldb, k, j, lenk, lenj)
+					}
+					dgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha)
+				}
+			}(i, j)
+		}
+	}
+}
+
+// dgemmSerial is the serial matrix multiply; it dispatches on the transpose flags.
+func dgemmSerial(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	switch {
+	case !aTrans && !bTrans:
+		dgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	case aTrans && !bTrans:
+		dgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	case !aTrans && bTrans:
+		dgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	case aTrans && bTrans:
+		dgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	default:
+		panic("unreachable")
+	}
+}
+
+// dgemmSerialNotNot is dgemmSerial where neither a nor b is transposed.
+func dgemmSerialNotNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Row slices are used instead of literal [i*stride+j] indexing because
+	// this was approximately 5 times faster as of go 1.3.
+	for i := 0; i < m; i++ {
+		ctmp := c[i*ldc : i*ldc+n]
+		for l, v := range a[i*lda : i*lda+k] {
+			tmp := alpha * v
+			if tmp != 0 {
+				f64.AxpyUnitary(tmp, b[l*ldb:l*ldb+n], ctmp)
+			}
+		}
+	}
+}
+
+// dgemmSerialTransNot is dgemmSerial where a is transposed and b is not.
+func dgemmSerialTransNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Row slices are used instead of literal [i*stride+j] indexing because
+	// this was approximately 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		btmp := b[l*ldb : l*ldb+n]
+		for i, v := range a[l*lda : l*lda+m] {
+			tmp := alpha * v
+			if tmp != 0 {
+				ctmp := c[i*ldc : i*ldc+n]
+				f64.AxpyUnitary(tmp, btmp, ctmp)
+			}
+		}
+	}
+}
+
+// dgemmSerialNotTrans is dgemmSerial where a is not transposed and b is.
+func dgemmSerialNotTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Row slices are used instead of literal [i*stride+j] indexing because
+	// this was approximately 5 times faster as of go 1.3.
+	for i := 0; i < m; i++ {
+		atmp := a[i*lda : i*lda+k]
+		ctmp := c[i*ldc : i*ldc+n]
+		for j := 0; j < n; j++ {
+			ctmp[j] += alpha * f64.DotUnitary(atmp, b[j*ldb:j*ldb+k])
+		}
+	}
+}
+
+// dgemmSerialTransTrans is dgemmSerial where both a and b are transposed.
+func dgemmSerialTransTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Row slices are used instead of literal [i*stride+j] indexing because
+	// this was approximately 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		for i, v := range a[l*lda : l*lda+m] {
+			tmp := alpha * v
+			if tmp != 0 {
+				ctmp := c[i*ldc : i*ldc+n]
+				f64.AxpyInc(tmp, b[l:], ctmp, uintptr(n), uintptr(ldb), 1, 0, 0)
+			}
+		}
+	}
+}
+
+func sliceView64(a []float64, lda, i, j, r, c int) []float64 {
+	return a[i*lda+j : (i+r-1)*lda+j+c] // r×c window starting at row i, column j; rows remain lda apart
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/doc.go b/vendor/gonum.org/v1/gonum/blas/gonum/doc.go
new file mode 100644
index 00000000000..cbca601d90d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/doc.go
@@ -0,0 +1,99 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Ensure changes made to blas/native are reflected in blas/cgo where relevant.
+
+/*
+Package gonum is a Go implementation of the BLAS API. This implementation
+panics when the input arguments are invalid as per the standard, for example
+if a vector increment is zero. Note that the treatment of NaN values
+is not specified, and differs among the BLAS implementations.
+gonum.org/v1/gonum/blas/blas64 provides helpful wrapper functions to the BLAS
+interface. The rest of this text describes the layout of the data for the input types.
+
+Note that in the function documentation, x[i] refers to the i^th element
+of the vector, which will be different from the i^th element of the slice if
+incX != 1.
+
+See http://www.netlib.org/lapack/explore-html/d4/de1/_l_i_c_e_n_s_e_source.html
+for more license information.
+
+Vector arguments are effectively strided slices. They have two input arguments,
+a number of elements, n, and an increment, incX. The increment specifies the
+distance between elements of the vector. The actual Go slice may be longer
+than necessary.
+The increment may be positive or negative, except in functions with only
+a single vector argument where the increment may only be positive. If the increment
+is negative, s[0] is the last element in the slice. Note that this is not the same
+as counting backward from the end of the slice, as len(s) may be longer than
+necessary. So, for example, if n = 5 and incX = 3, the elements of s are
+
+	[0 * * 1 * * 2 * * 3 * * 4 * * * ...]
+
+where ∗ elements are never accessed. If incX = -3, the same elements are
+accessed, just in reverse order (4, 3, 2, 1, 0).
+
+Dense matrices are specified by a number of rows, a number of columns, and a stride.
+The stride specifies the number of entries in the slice between the first element
+of successive rows. The stride must be at least as large as the number of columns
+but may be longer.
+
+	[a00 ... a0n a0* ... a0stride-1 a10 ... amn am* ... amstride-1]
+
+Thus, dense[i*ld + j] refers to the {i, j}th element of the matrix.
+
+Symmetric and triangular matrices (non-packed) are stored identically to Dense,
+except that only elements in one triangle of the matrix are accessed.
+
+Packed symmetric and packed triangular matrices are laid out with the entries
+condensed such that all of the unreferenced elements are removed. So, the upper triangular
+matrix
+
+	[
+	  1  2  3
+	  0  4  5
+	  0  0  6
+	]
+
+and the lower-triangular matrix
+
+	[
+	  1  0  0
+	  2  3  0
+	  4  5  6
+	]
+
+will both be compacted as [1 2 3 4 5 6]. The (i, j) element of the original
+dense matrix can be found at element i*n - (i-1)*i/2 + j - i for upper triangular,
+and at element i * (i+1) /2 + j for lower triangular.
+
+Banded matrices are laid out in a compact format, constructed by removing the
+zeros in the rows and aligning the diagonals. For example, the matrix
+
+	[
+	  1  2  3  0  0  0
+	  4  5  6  7  0  0
+	  0  8  9 10 11  0
+	  0  0 12 13 14 15
+	  0  0  0 16 17 18
+	  0  0  0  0 19 20
+	]
+
+implicitly becomes (∗ entries are never accessed)
+
+	[
+	   *  1  2  3
+	   4  5  6  7
+	   8  9 10 11
+	  12 13 14 15
+	  16 17 18  *
+	  19 20  *  *
+	]
+
+which is given to the BLAS routine as [∗ 1 2 3 4 ...].
+
+See http://www.crest.iu.edu/research/mtl/reference/html/banded.html
+for more information
+*/
+package gonum // import "gonum.org/v1/gonum/blas/gonum"
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/errors.go b/vendor/gonum.org/v1/gonum/blas/gonum/errors.go
new file mode 100644
index 00000000000..e98575d0fa5
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/errors.go
@@ -0,0 +1,35 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Panic messages used by the parameter checks of the BLAS routines.
+// This list is duplicated in netlib/blas/netlib. Keep in sync.
+const (
+	zeroIncX = "blas: zero x index increment"
+	zeroIncY = "blas: zero y index increment"
+
+	mLT0  = "blas: m < 0"
+	nLT0  = "blas: n < 0"
+	kLT0  = "blas: k < 0"
+	kLLT0 = "blas: kL < 0"
+	kULT0 = "blas: kU < 0"
+
+	badUplo      = "blas: illegal triangle"
+	badTranspose = "blas: illegal transpose"
+	badDiag      = "blas: illegal diagonal"
+	badSide      = "blas: illegal side"
+	badFlag      = "blas: illegal rotm flag"
+
+	badLdA = "blas: bad leading dimension of A"
+	badLdB = "blas: bad leading dimension of B"
+	badLdC = "blas: bad leading dimension of C"
+
+	shortX  = "blas: insufficient length of x"
+	shortY  = "blas: insufficient length of y"
+	shortAP = "blas: insufficient length of ap"
+	shortA  = "blas: insufficient length of a"
+	shortB  = "blas: insufficient length of b"
+	shortC  = "blas: insufficient length of c"
+)
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go b/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go
new file mode 100644
index 00000000000..61a8b8b5d0b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go
@@ -0,0 +1,52 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:generate ./single_precision.bash
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/internal/math32"
+)
+
+type Implementation struct{} // stateless receiver on which the Go BLAS routines are defined
+
+// [SD]gemm behavior constants. These are kept here to keep them out of the
+// way during single precision code generation.
+const (
+	blockSize   = 64 // side length b of the b x b sub-blocks used by gemm
+	minParBlock = 4  // minimum number of blocks needed to go parallel
+)
+
+func max(a, b int) int {
+	if a > b {
+		return a
+	}
+	return b // b >= a here, so b is the larger (or equal) value
+}
+
+func min(a, b int) int {
+	if a > b {
+		return b
+	}
+	return a // a <= b here, so a is the smaller (or equal) value
+}
+
+// blocks returns the number of blocks of size bsize needed to cover a
+// dimension of length dim, rounding up for a final partial block.
+func blocks(dim, bsize int) int {
+	return (dim + bsize - 1) / bsize
+}
+
+// dcabs1 returns |real(z)|+|imag(z)| for a complex128 z.
+func dcabs1(z complex128) float64 {
+	return math.Abs(real(z)) + math.Abs(imag(z))
+}
+
+// scabs1 returns |real(z)|+|imag(z)| for a complex64 z.
+func scabs1(z complex64) float32 {
+	return math32.Abs(real(z)) + math32.Abs(imag(z))
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go
new file mode 100644
index 00000000000..3e3af0db138
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go
@@ -0,0 +1,454 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c128"
+)
+
+var _ blas.Complex128Level1 = Implementation{} // compile-time check that Implementation satisfies the interface
+
+// Dzasum returns the sum of the absolute values of the real and imaginary parts of x
+//
+//	\sum_i |Re(x[i])| + |Im(x[i])|
+//
+// Dzasum returns 0 if incX is negative and panics if n < 0, incX == 0 or x is too short.
+func (Implementation) Dzasum(n int, x []complex128, incX int) float64 {
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0 // negative increment: nothing is summed
+	}
+	var sum float64
+	if incX == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		for _, v := range x[:n] {
+			sum += dcabs1(v)
+		}
+		return sum
+	}
+	if (n-1)*incX >= len(x) { // index of the last element read must be in range
+		panic(shortX)
+	}
+	for i := 0; i < n; i++ {
+		v := x[i*incX]
+		sum += dcabs1(v)
+	}
+	return sum
+}
+
+// Dznrm2 computes the Euclidean norm of the complex vector x,
+//
+//	‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])).
+//
+// Dznrm2 returns 0 if n is 0 or incX is negative.
+func (Implementation) Dznrm2(n int, x []complex128, incX int) float64 {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if n < 1 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if (n-1)*incX >= len(x) { // index of the last element read must be in range
+		panic(shortX)
+	}
+	var (
+		scale float64     // largest |Re| or |Im| seen so far
+		ssq   float64 = 1 // scale*scale*ssq is the sum of squares accumulated so far
+	)
+	if incX == 1 {
+		for _, v := range x[:n] {
+			re, im := math.Abs(real(v)), math.Abs(imag(v))
+			if re != 0 { // a zero part adds nothing and would compute 0/0 while scale is still 0
+				if re > scale {
+					ssq = 1 + ssq*(scale/re)*(scale/re) // re-express the running sum relative to the new maximum
+					scale = re
+				} else {
+					ssq += (re / scale) * (re / scale) // only ratios <= 1 are squared
+				}
+			}
+			if im != 0 {
+				if im > scale {
+					ssq = 1 + ssq*(scale/im)*(scale/im)
+					scale = im
+				} else {
+					ssq += (im / scale) * (im / scale)
+				}
+			}
+		}
+		if math.IsInf(scale, 1) { // an infinite part gives +Inf regardless of what ssq holds
+			return math.Inf(1)
+		}
+		return scale * math.Sqrt(ssq)
+	}
+	for ix := 0; ix < n*incX; ix += incX { // same accumulation as above over every incX-th element
+		re, im := math.Abs(real(x[ix])), math.Abs(imag(x[ix]))
+		if re != 0 {
+			if re > scale {
+				ssq = 1 + ssq*(scale/re)*(scale/re)
+				scale = re
+			} else {
+				ssq += (re / scale) * (re / scale)
+			}
+		}
+		if im != 0 {
+			if im > scale {
+				ssq = 1 + ssq*(scale/im)*(scale/im)
+				scale = im
+			} else {
+				ssq += (im / scale) * (im / scale)
+			}
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(ssq)
+}
+
+// Izamax returns the index of the first element of x having largest |Re(·)|+|Im(·)|.
+// The index counts elements, not offsets into x. Izamax returns -1 if n is 0 or incX is negative.
+func (Implementation) Izamax(n int, x []complex128, incX int) int {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		// Return invalid index.
+		return -1
+	}
+	if n < 1 {
+		if n == 0 {
+			// Return invalid index.
+			return -1
+		}
+		panic(nLT0)
+	}
+	if len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	idx := 0
+	max := dcabs1(x[0]) // local value; shadows the package-level max helper in this function
+	if incX == 1 {
+		for i, v := range x[1:n] {
+			absV := dcabs1(v)
+			if absV > max { // strict comparison keeps the earliest index on ties
+				max = absV
+				idx = i + 1 // i is relative to x[1:n]
+			}
+		}
+		return idx
+	}
+	ix := incX
+	for i := 1; i < n; i++ {
+		absV := dcabs1(x[ix])
+		if absV > max {
+			max = absV
+			idx = i // element count, not the offset ix
+		}
+		ix += incX
+	}
+	return idx
+}
+
+// Zaxpy adds alpha times x to y, leaving y untouched when n == 0 or alpha == 0:
+//
+//	y[i] += alpha * x[i] for all i
+func (Implementation) Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { // (n-1)*|incX| is the largest index needed
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if alpha == 0 { // checked only after the arguments have been validated
+		return
+	}
+	if incX == 1 && incY == 1 {
+		c128.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (1 - n) * incX // start index for a negative increment
+	}
+	if incY < 0 {
+		iy = (1 - n) * incY
+	}
+	c128.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Zcopy copies n elements of x, spaced incX apart, into y, spaced incY apart.
+func (Implementation) Zcopy(n int, x []complex128, incX int, y []complex128, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { // (n-1)*|incX| is the largest index touched
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // a negative increment starts at the far end and walks down to 0
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		y[iy] = x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Zdotc computes the dot product
+//
+//	xᴴ · y
+//
+// of two complex vectors x and y. Zdotc returns 0 if n is 0.
+func (Implementation) Zdotc(n int, x []complex128, incX int, y []complex128, incY int) complex128 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return c128.DotcUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // start index for a negative increment
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || (n-1)*incX >= len(x) { // ix bounds a negative increment, (n-1)*incX a positive one
+		panic(shortX)
+	}
+	if iy >= len(y) || (n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return c128.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Zdotu computes the dot product
+//
+//	xᵀ · y
+//
+// of two complex vectors x and y. Zdotu returns 0 if n is 0.
+func (Implementation) Zdotu(n int, x []complex128, incX int, y []complex128, incY int) complex128 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return c128.DotuUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // start index for a negative increment
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || (n-1)*incX >= len(x) { // ix bounds a negative increment, (n-1)*incX a positive one
+		panic(shortX)
+	}
+	if iy >= len(y) || (n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return c128.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Zdscal scales the vector x by a real scalar alpha.
+// Zdscal has no effect if n == 0 or incX < 0.
+func (Implementation) Zdscal(n int, alpha float64, x []complex128, incX int) {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return
+	}
+	if (n-1)*incX >= len(x) { // never true for n <= 0, where the left-hand side is negative
+		panic(shortX)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if alpha == 0 { // store exact zeros instead of multiplying
+		if incX == 1 {
+			x = x[:n]
+			for i := range x {
+				x[i] = 0
+			}
+			return
+		}
+		for ix := 0; ix < n*incX; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if incX == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i] = complex(alpha*real(v), alpha*imag(v)) // scale each part separately by the real alpha
+		}
+		return
+	}
+	for ix := 0; ix < n*incX; ix += incX {
+		v := x[ix]
+		x[ix] = complex(alpha*real(v), alpha*imag(v))
+	}
+}
+
+// Zscal scales the vector x by a complex scalar alpha.
+// Zscal has no effect if n == 0 or incX < 0.
+func (Implementation) Zscal(n int, alpha complex128, x []complex128, incX int) {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return
+	}
+	if (n-1)*incX >= len(x) { // never true for n <= 0, where the left-hand side is negative
+		panic(shortX)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if alpha == 0 { // store exact zeros instead of multiplying
+		if incX == 1 {
+			x = x[:n]
+			for i := range x {
+				x[i] = 0
+			}
+			return
+		}
+		for ix := 0; ix < n*incX; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if incX == 1 {
+		c128.ScalUnitary(alpha, x[:n])
+		return
+	}
+	c128.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+}
+
+// Zswap exchanges n elements of x, spaced incX apart, with n elements of y, spaced incY apart.
+func (Implementation) Zswap(n int, x []complex128, incX int, y []complex128, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { // (n-1)*|incX| is the largest index touched
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i], y[i] = y[i], v
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // a negative increment starts at the far end and walks down to 0
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go
new file mode 100644
index 00000000000..249335cada4
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go
@@ -0,0 +1,476 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	math "gonum.org/v1/gonum/internal/math32"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c64"
+)
+
+var _ blas.Complex64Level1 = Implementation{} // compile-time check that Implementation satisfies the interface
+
+// Scasum returns the sum of the absolute values of the real and imaginary parts of x
+//
+//	\sum_i |Re(x[i])| + |Im(x[i])|
+//
+// Scasum returns 0 if incX is negative and panics if n < 0, incX == 0 or x is too short.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Scasum(n int, x []complex64, incX int) float32 {
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0 // negative increment: nothing is summed
+	}
+	var sum float32
+	if incX == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		for _, v := range x[:n] {
+			sum += scabs1(v)
+		}
+		return sum
+	}
+	if (n-1)*incX >= len(x) { // index of the last element read must be in range
+		panic(shortX)
+	}
+	for i := 0; i < n; i++ {
+		v := x[i*incX]
+		sum += scabs1(v)
+	}
+	return sum
+}
+
+// Scnrm2 computes the Euclidean norm of the complex vector x,
+//
+//	‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])).
+//
+// Scnrm2 returns 0 if n is 0 or incX is negative.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Scnrm2(n int, x []complex64, incX int) float32 {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if n < 1 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if (n-1)*incX >= len(x) { // index of the last element read must be in range
+		panic(shortX)
+	}
+	var (
+		scale float32     // largest |Re| or |Im| seen so far
+		ssq   float32 = 1 // scale*scale*ssq is the sum of squares accumulated so far
+	)
+	if incX == 1 {
+		for _, v := range x[:n] {
+			re, im := math.Abs(real(v)), math.Abs(imag(v))
+			if re != 0 { // a zero part adds nothing and would compute 0/0 while scale is still 0
+				if re > scale {
+					ssq = 1 + ssq*(scale/re)*(scale/re) // re-express the running sum relative to the new maximum
+					scale = re
+				} else {
+					ssq += (re / scale) * (re / scale) // only ratios <= 1 are squared
+				}
+			}
+			if im != 0 {
+				if im > scale {
+					ssq = 1 + ssq*(scale/im)*(scale/im)
+					scale = im
+				} else {
+					ssq += (im / scale) * (im / scale)
+				}
+			}
+		}
+		if math.IsInf(scale, 1) { // an infinite part gives +Inf regardless of what ssq holds
+			return math.Inf(1)
+		}
+		return scale * math.Sqrt(ssq)
+	}
+	for ix := 0; ix < n*incX; ix += incX { // same accumulation as above over every incX-th element
+		re, im := math.Abs(real(x[ix])), math.Abs(imag(x[ix]))
+		if re != 0 {
+			if re > scale {
+				ssq = 1 + ssq*(scale/re)*(scale/re)
+				scale = re
+			} else {
+				ssq += (re / scale) * (re / scale)
+			}
+		}
+		if im != 0 {
+			if im > scale {
+				ssq = 1 + ssq*(scale/im)*(scale/im)
+				scale = im
+			} else {
+				ssq += (im / scale) * (im / scale)
+			}
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(ssq)
+}
+
+// Icamax returns the index of the first element of x having largest |Re(·)|+|Im(·)|.
+// The index counts elements, not offsets into x. Icamax returns -1 if n is 0 or incX is negative.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Icamax(n int, x []complex64, incX int) int {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		// Return invalid index.
+		return -1
+	}
+	if n < 1 {
+		if n == 0 {
+			// Return invalid index.
+			return -1
+		}
+		panic(nLT0)
+	}
+	if len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	idx := 0
+	max := scabs1(x[0]) // local value; shadows the package-level max helper in this function
+	if incX == 1 {
+		for i, v := range x[1:n] {
+			absV := scabs1(v)
+			if absV > max { // strict comparison keeps the earliest index on ties
+				max = absV
+				idx = i + 1 // i is relative to x[1:n]
+			}
+		}
+		return idx
+	}
+	ix := incX
+	for i := 1; i < n; i++ {
+		absV := scabs1(x[ix])
+		if absV > max {
+			max = absV
+			idx = i // element count, not the offset ix
+		}
+		ix += incX
+	}
+	return idx
+}
+
+// Caxpy adds alpha times x to y, leaving y untouched when n == 0 or alpha == 0:
+//
+//	y[i] += alpha * x[i] for all i
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { // (n-1)*|incX| is the largest index needed
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if alpha == 0 { // checked only after the arguments have been validated
+		return
+	}
+	if incX == 1 && incY == 1 {
+		c64.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (1 - n) * incX // start index for a negative increment
+	}
+	if incY < 0 {
+		iy = (1 - n) * incY
+	}
+	c64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Ccopy copies n elements of x, spaced incX apart, into y, spaced incY apart.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ccopy(n int, x []complex64, incX int, y []complex64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { // (n-1)*|incX| is the largest index touched
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // a negative increment starts at the far end and walks down to 0
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		y[iy] = x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Cdotc computes the dot product
+//
+//	xᴴ · y
+//
+// of two complex vectors x and y. Cdotc returns 0 if n is 0.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cdotc(n int, x []complex64, incX int, y []complex64, incY int) complex64 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return c64.DotcUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // start index for a negative increment
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || (n-1)*incX >= len(x) { // ix bounds a negative increment, (n-1)*incX a positive one
+		panic(shortX)
+	}
+	if iy >= len(y) || (n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return c64.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Cdotu computes the dot product
+//
+//	xᵀ · y
+//
+// of two complex vectors x and y. Cdotu returns 0 if n is 0.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cdotu(n int, x []complex64, incX int, y []complex64, incY int) complex64 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return c64.DotuUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // start index for a negative increment
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || (n-1)*incX >= len(x) { // ix bounds a negative increment, (n-1)*incX a positive one
+		panic(shortX)
+	}
+	if iy >= len(y) || (n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return c64.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Csscal scales the vector x by a real scalar alpha.
+// Csscal has no effect if n == 0 or incX < 0.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Csscal(n int, alpha float32, x []complex64, incX int) {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return
+	}
+	if (n-1)*incX >= len(x) { // never true for n <= 0, where the left-hand side is negative
+		panic(shortX)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if alpha == 0 { // store exact zeros instead of multiplying
+		if incX == 1 {
+			x = x[:n]
+			for i := range x {
+				x[i] = 0
+			}
+			return
+		}
+		for ix := 0; ix < n*incX; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if incX == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i] = complex(alpha*real(v), alpha*imag(v)) // scale each part separately by the real alpha
+		}
+		return
+	}
+	for ix := 0; ix < n*incX; ix += incX {
+		v := x[ix]
+		x[ix] = complex(alpha*real(v), alpha*imag(v))
+	}
+}
+
+// Cscal scales the vector x by a complex scalar alpha.
+// Cscal has no effect if n == 0 or incX < 0.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cscal(n int, alpha complex64, x []complex64, incX int) {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return
+	}
+	if (n-1)*incX >= len(x) { // never true for n <= 0, where the left-hand side is negative
+		panic(shortX)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if alpha == 0 { // store exact zeros instead of multiplying
+		if incX == 1 {
+			x = x[:n]
+			for i := range x {
+				x[i] = 0
+			}
+			return
+		}
+		for ix := 0; ix < n*incX; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if incX == 1 {
+		c64.ScalUnitary(alpha, x[:n])
+		return
+	}
+	c64.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+}
+
+// Cswap exchanges n elements of x, spaced incX apart, with n elements of y, spaced incY apart.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cswap(n int, x []complex64, incX int, y []complex64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) { // (n-1)*|incX| is the largest index touched
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i], y[i] = y[i], v
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // a negative increment starts at the far end and walks down to 0
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go
new file mode 100644
index 00000000000..a90b88aceb3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go
@@ -0,0 +1,653 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	math "gonum.org/v1/gonum/internal/math32"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+var _ blas.Float32Level1 = Implementation{} // compile-time check that Implementation satisfies the interface
+
+// Snrm2 computes the Euclidean norm of a vector,
+//
+//	sqrt(\sum_i x[i] * x[i]).
+//
+// Snrm2 returns 0 if n is 0 or incX is negative.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Snrm2(n int, x []float32, incX int) float32 {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if len(x) <= (n-1)*incX { // never true for n <= 0, where the right-hand side is negative
+		panic(shortX)
+	}
+	if n < 2 {
+		if n == 1 {
+			return math.Abs(x[0]) // the norm of a single element is its absolute value
+		}
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 {
+		return f32.L2NormUnitary(x[:n])
+	}
+	return f32.L2NormInc(x, uintptr(n), uintptr(incX))
+}
+
+// Sasum computes the sum of the absolute values of the elements of x.
+//
+//	\sum_i |x[i]|
+//
+// Sasum returns 0 if incX is negative and panics if n < 0, incX == 0 or x is too short.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sasum(n int, x []float32, incX int) float32 {
+	var sum float32
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0 // negative increment: nothing is summed
+	}
+	if len(x) <= (n-1)*incX { // index of the last element read must be in range
+		panic(shortX)
+	}
+	if incX == 1 {
+		x = x[:n]
+		for _, v := range x {
+			sum += math.Abs(v)
+		}
+		return sum
+	}
+	for i := 0; i < n; i++ {
+		sum += math.Abs(x[i*incX])
+	}
+	return sum
+}
+
+// Isamax returns the index of an element of x with the largest absolute value.
+// If there are multiple such indices the earliest is returned. The index counts elements, not offsets into x.
+// Isamax returns -1 if n == 0 or incX is negative.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Isamax(n int, x []float32, incX int) int {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return -1
+	}
+	if len(x) <= (n-1)*incX { // never true for n <= 0, where the right-hand side is negative
+		panic(shortX)
+	}
+	if n < 2 {
+		if n == 1 {
+			return 0
+		}
+		if n == 0 {
+			return -1 // Netlib returns invalid index when n == 0.
+		}
+		panic(nLT0)
+	}
+	idx := 0
+	max := math.Abs(x[0]) // local value; shadows the package-level max helper in this function
+	if incX == 1 {
+		for i, v := range x[:n] { // element 0 is revisited but can never exceed the initial max
+			absV := math.Abs(v)
+			if absV > max { // strict comparison keeps the earliest index on ties
+				max = absV
+				idx = i
+			}
+		}
+		return idx
+	}
+	ix := incX
+	for i := 1; i < n; i++ {
+		v := x[ix]
+		absV := math.Abs(v)
+		if absV > max {
+			max = absV
+			idx = i // element count, not the offset ix
+		}
+		ix += incX
+	}
+	return idx
+}
+
+// Sswap exchanges n elements of x, spaced incX apart, with n elements of y, spaced incY apart.
+//
+//	x[i], y[i] = y[i], x[i] for all i
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sswap(n int, x []float32, incX int, y []float32, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // (n-1)*|incX| is the largest index touched
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i], y[i] = y[i], v
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // a negative increment starts at the far end and walks down to 0
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Scopy copies n elements of x, spaced incX apart, into y, spaced incY apart.
+//
+//	y[i] = x[i] for all i
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Scopy(n int, x []float32, incX int, y []float32, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // (n-1)*|incX| is the largest index touched
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // a negative increment starts at the far end and walks down to 0
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		y[iy] = x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Saxpy adds alpha times x to y, leaving y untouched when n == 0 or alpha == 0
+//
+//	y[i] += alpha * x[i] for all i
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // (n-1)*|incX| is the largest index needed
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if alpha == 0 { // checked only after the arguments have been validated
+		return
+	}
+	if incX == 1 && incY == 1 {
+		f32.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // start index for a negative increment
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	f32.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Srotg computes a plane rotation
+//
+//	⎡  c s ⎤ ⎡ a ⎤ = ⎡ r ⎤
+//	⎣ -s c ⎦ ⎣ b ⎦   ⎣ 0 ⎦
+//
+// satisfying c^2 + s^2 = 1.
+//
+// The computation uses the formulas
+//
+//	sigma = sgn(a)    if |a| >  |b|
+//	      = sgn(b)    if |b| >= |a|
+//	r = sigma*sqrt(a^2 + b^2)
+//	c = 1; s = 0      if r = 0
+//	c = a/r; s = b/r  if r != 0
+//	c >= 0            if |a| > |b|
+//
+// The subroutine also computes
+//
+//	z = s    if |a| > |b|,
+//	  = 1/c  if |b| >= |a| and c != 0
+//	  = 1    if c = 0
+//
+// This allows c and s to be reconstructed from z as follows:
+//
+//	If z = 1, set c = 0, s = 1.
+//	If |z| < 1, set c = sqrt(1 - z^2) and s = z.
+//	If |z| > 1, set c = 1/z and s = sqrt(1 - c^2).
+//
+// NOTE: There is a discrepancy between the reference implementation and the
+// BLAS technical manual regarding the sign for r when a or b are zero. Srotg
+// agrees with the definition in the manual and other common BLAS
+// implementations.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srotg(a, b float32) (c, s, r, z float32) {
+	// Implementation based on Supplemental Material to:
+	// Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS.
+	// ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages.
+	// DOI: https://doi.org/10.1145/3061665
+	const (
+		safmin = 0x1p-126 // 2^-126, the smallest positive normal float32
+		safmax = 1 / safmin
+	)
+	anorm := math.Abs(a)
+	bnorm := math.Abs(b)
+	switch {
+	case bnorm == 0: // b is zero: identity rotation, r keeps the sign of a
+		c = 1
+		s = 0
+		r = a
+		z = 0
+	case anorm == 0: // a is zero and b is not: r keeps the sign of b
+		c = 0
+		s = 1
+		r = b
+		z = 1
+	default:
+		maxab := math.Max(anorm, bnorm)
+		scl := math.Min(math.Max(safmin, maxab), safmax) // clamp the scale factor to [safmin, safmax]
+		var sigma float32
+		if anorm > bnorm {
+			sigma = math.Copysign(1, a)
+		} else {
+			sigma = math.Copysign(1, b)
+		}
+		ascl := a / scl
+		bscl := b / scl
+		r = sigma * (scl * math.Sqrt(ascl*ascl+bscl*bscl)) // norm taken on the scaled operands, then scaled back
+		c = a / r
+		s = b / r
+		switch {
+		case anorm > bnorm:
+			z = s
+		case c != 0:
+			z = 1 / c
+		default:
+			z = 1
+		}
+	}
+	return c, s, r, z
+}
+
+// Srotmg computes the modified Givens rotation. See
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+// for more details. It returns the rotation parameters p and the updated d1, d2 and x1.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srotmg(d1, d2, x1, y1 float32) (p blas.SrotmParams, rd1, rd2, rx1 float32) {
+	// The implementation of Drotmg used here is taken from Hopkins 1997
+	// Appendix A: https://doi.org/10.1145/289251.289253
+	// with the exception of the gam constants below.
+
+	const (
+		gam    = 4096.0
+		gamsq  = gam * gam
+		rgamsq = 1.0 / gamsq
+	)
+
+	if d1 < 0 {
+		p.Flag = blas.Rescaling // Error state.
+		return p, 0, 0, 0
+	}
+
+	if d2 == 0 || y1 == 0 { // nothing to rotate: inputs are returned unchanged
+		p.Flag = blas.Identity
+		return p, d1, d2, x1
+	}
+
+	var h11, h12, h21, h22 float32
+	if (d1 == 0 || x1 == 0) && d2 > 0 {
+		p.Flag = blas.Diagonal
+		h12 = 1
+		h21 = -1
+		x1 = y1
+		d1, d2 = d2, d1
+	} else {
+		p2 := d2 * y1
+		p1 := d1 * x1
+		q2 := p2 * y1
+		q1 := p1 * x1
+		if math.Abs(q1) > math.Abs(q2) {
+			p.Flag = blas.OffDiagonal
+			h11 = 1
+			h22 = 1
+			h21 = -y1 / x1
+			h12 = p2 / p1
+			u := 1 - float32(h12*h21) // the conversion rounds the product, so it cannot be fused with the subtraction
+			if u <= 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			d1 /= u
+			d2 /= u
+			x1 *= u
+		} else {
+			if q2 < 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			p.Flag = blas.Diagonal
+			h21 = -1
+			h12 = 1
+			h11 = p1 / p2
+			h22 = x1 / y1
+			u := 1 + float32(h11*h22)
+			d1, d2 = d2/u, d1/u
+			x1 = y1 * u
+		}
+	}
+
+	for d1 <= rgamsq && d1 != 0 { // grow a tiny d1 by gam^2 per pass, compensating in x1, h11 and h12
+		p.Flag = blas.Rescaling
+		d1 = (d1 * gam) * gam
+		x1 /= gam
+		h11 /= gam
+		h12 /= gam
+	}
+	for d1 > gamsq { // shrink a huge d1 by gam^2 per pass, compensating the other way
+		p.Flag = blas.Rescaling
+		d1 = (d1 / gam) / gam
+		x1 *= gam
+		h11 *= gam
+		h12 *= gam
+	}
+
+	for math.Abs(d2) <= rgamsq && d2 != 0 { // same rescaling for d2, compensating in h21 and h22
+		p.Flag = blas.Rescaling
+		d2 = (d2 * gam) * gam
+		h21 /= gam
+		h22 /= gam
+	}
+	for math.Abs(d2) > gamsq {
+		p.Flag = blas.Rescaling
+		d2 = (d2 / gam) / gam
+		h21 *= gam
+		h22 *= gam
+	}
+
+	switch p.Flag {
+	case blas.Diagonal:
+		p.H = [4]float32{0: h11, 3: h22} // only h11 and h22 are stored for this flag
+	case blas.OffDiagonal:
+		p.H = [4]float32{1: h21, 2: h12} // only h21 and h12 are stored for this flag
+	case blas.Rescaling:
+		p.H = [4]float32{h11, h21, h12, h22} // all four entries, in the order h11, h21, h12, h22
+	default:
+		panic(badFlag)
+	}
+
+	return p, d1, d2, x1
+}
+
+// Srot applies a plane transformation to n pairs of elements of x and y.
+//
+//	x[i] = c * x[i] + s * y[i]
+//	y[i] = c * y[i] - s * x[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srot(n int, x []float32, incX int, y []float32, incY int, c float32, s float32) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // (n-1)*|incX| is the largest index touched
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, vx := range x {
+			vy := y[i]
+			x[i], y[i] = c*vx+s*vy, c*vy-s*vx // both results use the values read before the update
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // a negative increment starts at the far end and walks down to 0
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		vx := x[ix]
+		vy := y[iy]
+		x[ix], y[iy] = c*vx+s*vy, c*vy-s*vx
+		ix += incX
+		iy += incY
+	}
+}
+
+// Srotm applies the modified Givens rotation described by p to the 2×n matrix whose rows are x and y.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srotm(n int, x []float32, incX int, y []float32, incY int, p blas.SrotmParams) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // (n-1)*|incX| is the largest index touched
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	if p.Flag == blas.Identity { // identity rotation: x and y are left unchanged
+		return
+	}
+
+	switch p.Flag { // a flag matching none of the cases below leaves x and y unchanged
+	case blas.Rescaling:
+		h11 := p.H[0] // H holds h11, h21, h12, h22 in that order
+		h12 := p.H[2]
+		h21 := p.H[1]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = float32(vx*h11)+float32(vy*h12), float32(vx*h21)+float32(vy*h22) // conversions round each product before the add
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX // a negative increment starts at the far end and walks down to 0
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = float32(vx*h11)+float32(vy*h12), float32(vx*h21)+float32(vy*h22)
+			ix += incX
+			iy += incY
+		}
+	case blas.OffDiagonal: // h11 and h22 are taken to be 1
+		h12 := p.H[2]
+		h21 := p.H[1]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = vx+float32(vy*h12), float32(vx*h21)+vy
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = vx+float32(vy*h12), float32(vx*h21)+vy
+			ix += incX
+			iy += incY
+		}
+	case blas.Diagonal: // h12 is taken to be 1 and h21 to be -1
+		h11 := p.H[0]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = float32(vx*h11)+vy, -vx+float32(vy*h22)
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = float32(vx*h11)+vy, -vx+float32(vy*h22)
+			ix += incX
+			iy += incY
+		}
+	}
+}
+
+// Sscal scales x by alpha.
+//
+//	x[i] *= alpha
+//
+// Sscal has no effect if n == 0 or incX < 0.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sscal(n int, alpha float32, x []float32, incX int) {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (n-1)*incX >= len(x) { // index of the last element written must be in range
+		panic(shortX)
+	}
+	if alpha == 0 { // store exact zeros instead of multiplying
+		if incX == 1 {
+			x = x[:n]
+			for i := range x {
+				x[i] = 0
+			}
+			return
+		}
+		for ix := 0; ix < n*incX; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if incX == 1 {
+		f32.ScalUnitary(alpha, x[:n])
+		return
+	}
+	f32.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go
new file mode 100644
index 00000000000..cd7df4110ad
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go
@@ -0,0 +1,54 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Dsdot computes the dot product of the two float32 vectors and returns it as a float64
+//
+//	\sum_i x[i]*y[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Dsdot(n int, x []float32, incX int, y []float32, incY int) float64 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0 // Empty sum.
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 { // Contiguous case.
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return f32.DdotUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // Negative stride: the start index is (n-1)*|incX|.
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) { // Both the start index and the final index must be in range.
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go
new file mode 100644
index 00000000000..c4cc166322d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go
@@ -0,0 +1,54 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Sdot computes the dot product of the two float32 vectors and returns it as a float32
+//
+//	\sum_i x[i]*y[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sdot(n int, x []float32, incX int, y []float32, incY int) float32 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0 // Empty sum.
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 { // Contiguous case.
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return f32.DotUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // Negative stride: the start index is (n-1)*|incX|.
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) { // Both the start index and the final index must be in range.
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return f32.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go
new file mode 100644
index 00000000000..eb6b73bd418
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go
@@ -0,0 +1,54 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Sdsdot computes the dot product of the two float32 vectors plus a constant
+//
+//	alpha + \sum_i x[i]*y[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0 // NOTE(review): alpha is not added when n == 0, unlike the formula above; confirm against upstream and reference SDSDOT.
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 { // Contiguous case.
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return alpha + float32(f32.DdotUnitary(x[:n], y[:n])) // The float64 result is converted to float32 before alpha is added.
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // Negative stride: the start index is (n-1)*|incX|.
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) { // Both the start index and the final index must be in range.
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return alpha + float32(f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)))
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go
new file mode 100644
index 00000000000..795769d9665
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go
@@ -0,0 +1,629 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+var _ blas.Float64Level1 = Implementation{} // Compile-time check that Implementation satisfies blas.Float64Level1.
+
+// Dnrm2 computes the Euclidean norm of the n elements of x visited with stride incX,
+//
+//	sqrt(\sum_i x[i] * x[i]).
+//
+// This function returns 0 if incX is negative or n == 0. It panics if incX == 0, n < 0, or x is too short.
+func (Implementation) Dnrm2(n int, x []float64, incX int) float64 {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if len(x) <= (n-1)*incX { // Cannot trigger for n <= 0, where the right-hand side is negative.
+		panic(shortX)
+	}
+	if n < 2 {
+		if n == 1 {
+			return math.Abs(x[0]) // The norm of a single element is its magnitude.
+		}
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 { // Contiguous case.
+		return f64.L2NormUnitary(x[:n])
+	}
+	return f64.L2NormInc(x, uintptr(n), uintptr(incX))
+}
+
+// Dasum computes the sum of the absolute values of the n elements of x visited with stride incX.
+//
+//	\sum_i |x[i]|
+//
+// Dasum returns 0 if incX is negative. It panics if n < 0, incX == 0, or x is too short.
+func (Implementation) Dasum(n int, x []float64, incX int) float64 {
+	var sum float64
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if len(x) <= (n-1)*incX { // Cannot trigger for n == 0, where the right-hand side is negative.
+		panic(shortX)
+	}
+	if incX == 1 { // Contiguous case; n == 0 yields an empty slice and a zero sum.
+		x = x[:n]
+		for _, v := range x {
+			sum += math.Abs(v)
+		}
+		return sum
+	}
+	for i := 0; i < n; i++ {
+		sum += math.Abs(x[i*incX])
+	}
+	return sum
+}
+
+// Idamax returns the index of an element of x with the largest absolute value.
+// If there are multiple such indices the earliest is returned. The index counts strided elements, not slice offsets.
+// Idamax returns -1 if n == 0 or incX < 0. It panics if incX == 0, n < 0, or x is too short.
+func (Implementation) Idamax(n int, x []float64, incX int) int {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return -1
+	}
+	if len(x) <= (n-1)*incX { // Cannot trigger for n <= 0, where the right-hand side is negative.
+		panic(shortX)
+	}
+	if n < 2 {
+		if n == 1 {
+			return 0
+		}
+		if n == 0 {
+			return -1 // Netlib returns invalid index when n == 0.
+		}
+		panic(nLT0)
+	}
+	idx := 0
+	max := math.Abs(x[0])
+	if incX == 1 {
+		for i, v := range x[:n] {
+			absV := math.Abs(v)
+			if absV > max { // Strict comparison keeps the earliest index on ties.
+				max = absV
+				idx = i
+			}
+		}
+		return idx
+	}
+	ix := incX // Element 0 already seeded max, so the scan starts at element 1.
+	for i := 1; i < n; i++ {
+		v := x[ix]
+		absV := math.Abs(v)
+		if absV > max {
+			max = absV
+			idx = i
+		}
+		ix += incX
+	}
+	return idx
+}
+
+// Dswap exchanges the n elements of x and y visited with strides incX and incY; it panics on a zero stride, n < 0, or a short slice.
+//
+//	x[i], y[i] = y[i], x[i] for all i
+func (Implementation) Dswap(n int, x []float64, incX int, y []float64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // The largest index used is (n-1)*|incX|.
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 { // Contiguous case.
+		x = x[:n]
+		for i, v := range x {
+			x[i], y[i] = y[i], v
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // Negative stride: start at index (n-1)*|incX| and step toward 0.
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Dcopy copies n elements of x into y using strides incX and incY; it panics on a zero stride, n < 0, or a short slice.
+//
+//	y[i] = x[i] for all i
+func (Implementation) Dcopy(n int, x []float64, incX int, y []float64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // The largest index used is (n-1)*|incX|.
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 { // Contiguous case: defer to the built-in copy.
+		copy(y[:n], x[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // Negative stride: start at index (n-1)*|incX| and step toward 0.
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		y[iy] = x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Daxpy adds alpha times x to y over n strided elements; it panics on a zero stride, n < 0, or a short slice.
+//
+//	y[i] += alpha * x[i] for all i
+func (Implementation) Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // The largest index used is (n-1)*|incX|.
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if alpha == 0 { // y is left untouched; x is not read.
+		return
+	}
+	if incX == 1 && incY == 1 { // Contiguous case.
+		f64.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // Negative stride: the start index is (n-1)*|incX|.
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	f64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Drotg computes a plane rotation
+//
+//	⎡  c s ⎤ ⎡ a ⎤ = ⎡ r ⎤
+//	⎣ -s c ⎦ ⎣ b ⎦   ⎣ 0 ⎦
+//
+// satisfying c^2 + s^2 = 1.
+//
+// The computation uses the formulas
+//
+//	sigma = sgn(a)    if |a| >  |b|
+//	      = sgn(b)    if |b| >= |a|
+//	r = sigma*sqrt(a^2 + b^2)
+//	c = 1; s = 0      if r = 0
+//	c = a/r; s = b/r  if r != 0
+//	c >= 0            if |a| > |b|
+//
+// The subroutine also computes
+//
+//	z = s    if |a| > |b|,
+//	  = 1/c  if |b| >= |a| and c != 0
+//	  = 1    if c = 0
+//
+// This allows c and s to be reconstructed from z as follows:
+//
+//	If z = 1, set c = 0, s = 1.
+//	If |z| < 1, set c = sqrt(1 - z^2) and s = z.
+//	If |z| > 1, set c = 1/z and s = sqrt(1 - c^2).
+//
+// NOTE: There is a discrepancy between the reference implementation and the
+// BLAS technical manual regarding the sign for r when a or b are zero. Drotg
+// agrees with the definition in the manual and other common BLAS
+// implementations.
+func (Implementation) Drotg(a, b float64) (c, s, r, z float64) {
+	// Implementation based on Supplemental Material to:
+	// Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS.
+	// ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages.
+	// DOI: https://doi.org/10.1145/3061665
+	const (
+		safmin = 0x1p-1022 // 2^-1022.
+		safmax = 1 / safmin
+	)
+	anorm := math.Abs(a)
+	bnorm := math.Abs(b)
+	switch {
+	case bnorm == 0: // Nothing to eliminate: identity rotation, r keeps the sign of a.
+		c = 1
+		s = 0
+		r = a
+		z = 0
+	case anorm == 0: // Pure swap: r keeps the sign of b.
+		c = 0
+		s = 1
+		r = b
+		z = 1
+	default:
+		maxab := math.Max(anorm, bnorm)
+		scl := math.Min(math.Max(safmin, maxab), safmax) // Clamp the scale factor to [safmin, safmax].
+		var sigma float64
+		if anorm > bnorm {
+			sigma = math.Copysign(1, a)
+		} else {
+			sigma = math.Copysign(1, b)
+		}
+		ascl := a / scl
+		bscl := b / scl
+		r = sigma * (scl * math.Sqrt(ascl*ascl+bscl*bscl)) // The squares are taken of the scaled values, not of a and b directly.
+		c = a / r
+		s = b / r
+		switch {
+		case anorm > bnorm:
+			z = s
+		case c != 0:
+			z = 1 / c
+		default:
+			z = 1
+		}
+	}
+	return c, s, r, z
+}
+
+// Drotmg computes the modified Givens rotation. See
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+// for more details. On the error paths it returns p.Flag == blas.Rescaling with zero rd1, rd2, rx1 and zero p.H.
+func (Implementation) Drotmg(d1, d2, x1, y1 float64) (p blas.DrotmParams, rd1, rd2, rx1 float64) {
+	// The implementation of Drotmg used here is taken from Hopkins 1997
+	// Appendix A: https://doi.org/10.1145/289251.289253
+	// with the exception of the gam constants below.
+
+	const (
+		gam    = 4096.0
+		gamsq  = gam * gam
+		rgamsq = 1.0 / gamsq
+	)
+
+	if d1 < 0 {
+		p.Flag = blas.Rescaling // Error state.
+		return p, 0, 0, 0
+	}
+
+	if d2 == 0 || y1 == 0 { // Inputs are returned unchanged with the Identity flag.
+		p.Flag = blas.Identity
+		return p, d1, d2, x1
+	}
+
+	var h11, h12, h21, h22 float64
+	if (d1 == 0 || x1 == 0) && d2 > 0 {
+		p.Flag = blas.Diagonal
+		h12 = 1
+		h21 = -1
+		x1 = y1
+		d1, d2 = d2, d1
+	} else {
+		p2 := d2 * y1
+		p1 := d1 * x1
+		q2 := p2 * y1
+		q1 := p1 * x1
+		if math.Abs(q1) > math.Abs(q2) {
+			p.Flag = blas.OffDiagonal
+			h11 = 1
+			h22 = 1
+			h21 = -y1 / x1
+			h12 = p2 / p1
+			u := 1 - float64(h12*h21) // The explicit conversion rounds the product before the subtraction (no fused multiply-add).
+			if u <= 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			d1 /= u
+			d2 /= u
+			x1 *= u
+		} else {
+			if q2 < 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			p.Flag = blas.Diagonal
+			h21 = -1
+			h12 = 1
+			h11 = p1 / p2
+			h22 = x1 / y1
+			u := 1 + float64(h11*h22)
+			d1, d2 = d2/u, d1/u
+			x1 = y1 * u
+		}
+	}
+
+	for d1 <= rgamsq && d1 != 0 { // Scale a tiny d1 up by gam^2 per pass, compensating in x1, h11 and h12.
+		p.Flag = blas.Rescaling
+		d1 = (d1 * gam) * gam
+		x1 /= gam
+		h11 /= gam
+		h12 /= gam
+	}
+	for d1 > gamsq { // Scale a huge d1 down by gam^2 per pass.
+		p.Flag = blas.Rescaling
+		d1 = (d1 / gam) / gam
+		x1 *= gam
+		h11 *= gam
+		h12 *= gam
+	}
+
+	for math.Abs(d2) <= rgamsq && d2 != 0 { // Same treatment for d2, compensating in h21 and h22.
+		p.Flag = blas.Rescaling
+		d2 = (d2 * gam) * gam
+		h21 /= gam
+		h22 /= gam
+	}
+	for math.Abs(d2) > gamsq {
+		p.Flag = blas.Rescaling
+		d2 = (d2 / gam) / gam
+		h21 *= gam
+		h22 *= gam
+	}
+
+	switch p.Flag { // p.H is laid out as {h11, h21, h12, h22}; entries not stored for a flag stay zero.
+	case blas.Diagonal:
+		p.H = [4]float64{0: h11, 3: h22}
+	case blas.OffDiagonal:
+		p.H = [4]float64{1: h21, 2: h12}
+	case blas.Rescaling:
+		p.H = [4]float64{h11, h21, h12, h22}
+	default:
+		panic(badFlag)
+	}
+
+	return p, d1, d2, x1
+}
+
+// Drot applies a plane transformation to n strided element pairs of x and y; both updates use the old values.
+//
+//	x[i] = c * x[i] + s * y[i]
+//	y[i] = c * y[i] - s * x[i]
+func (Implementation) Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // The largest index used is (n-1)*|incX|.
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 { // Contiguous case.
+		x = x[:n]
+		for i, vx := range x {
+			vy := y[i]
+			x[i], y[i] = c*vx+s*vy, c*vy-s*vx
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // Negative stride: start at index (n-1)*|incX| and step toward 0.
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		vx := x[ix]
+		vy := y[iy]
+		x[ix], y[iy] = c*vx+s*vy, c*vy-s*vx
+		ix += incX
+		iy += incY
+	}
+}
+
+// Drotm applies the modified Givens rotation described by p to the 2×n matrix whose rows are x and y.
+func (Implementation) Drotm(n int, x []float64, incX int, y []float64, incY int, p blas.DrotmParams) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) { // The largest index used is (n-1)*|incX|.
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	if p.Flag == blas.Identity { // Identity leaves x and y unchanged.
+		return
+	}
+
+	switch p.Flag { // A flag matching none of the cases below leaves x and y unchanged.
+	case blas.Rescaling: // All four entries of p.H are used.
+		h11 := p.H[0]
+		h12 := p.H[2]
+		h21 := p.H[1]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = float64(vx*h11)+float64(vy*h12), float64(vx*h21)+float64(vy*h22) // Explicit conversions round each product before the add.
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = float64(vx*h11)+float64(vy*h12), float64(vx*h21)+float64(vy*h22)
+			ix += incX
+			iy += incY
+		}
+	case blas.OffDiagonal: // Only h12 and h21 are read; the diagonal entries act as 1.
+		h12 := p.H[2]
+		h21 := p.H[1]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = vx+float64(vy*h12), float64(vx*h21)+vy
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = vx+float64(vy*h12), float64(vx*h21)+vy
+			ix += incX
+			iy += incY
+		}
+	case blas.Diagonal: // Only h11 and h22 are read; the off-diagonal entries act as +1 (h12) and -1 (h21).
+		h11 := p.H[0]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = float64(vx*h11)+vy, -vx+float64(vy*h22)
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = float64(vx*h11)+vy, -vx+float64(vy*h22)
+			ix += incX
+			iy += incY
+		}
+	}
+}
+
+// Dscal scales the n elements of x visited with stride incX by alpha.
+//
+//	x[i] *= alpha
+//
+// Dscal has no effect if incX < 0 or n == 0. It panics if incX == 0, if n < 0, or if x is too short.
+func (Implementation) Dscal(n int, alpha float64, x []float64, incX int) {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return // Negative stride: x is left untouched.
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (n-1)*incX >= len(x) { // The last element touched is x[(n-1)*incX].
+		panic(shortX)
+	}
+	if alpha == 0 { // Zeros are stored directly rather than computed by multiplication.
+		if incX == 1 {
+			x = x[:n]
+			for i := range x {
+				x[i] = 0
+			}
+			return
+		}
+		for ix := 0; ix < n*incX; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if incX == 1 { // Contiguous case.
+		f64.ScalUnitary(alpha, x[:n])
+		return
+	}
+	f64.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go
new file mode 100644
index 00000000000..1569656ef2c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go
@@ -0,0 +1,50 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// Ddot computes the dot product of the two vectors over n strided elements; it returns 0 if n == 0
+//
+//	\sum_i x[i]*y[i]
+func (Implementation) Ddot(n int, x []float64, incX int, y []float64, incY int) float64 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0 // Empty sum.
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 { // Contiguous case.
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return f64.DotUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX // Negative stride: the start index is (n-1)*|incX|.
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) { // Both the start index and the final index must be in range.
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return f64.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go
new file mode 100644
index 00000000000..fa076d5fb19
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go
@@ -0,0 +1,2940 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math/cmplx"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c128"
+)
+
+var _ blas.Complex128Level2 = Implementation{} // Compile-time check that Implementation satisfies blas.Complex128Level2.
+
+// Zgbmv performs one of the matrix-vector operations
+//
+//	y = alpha * A * x + beta * y   if trans = blas.NoTrans
+//	y = alpha * Aᵀ * x + beta * y  if trans = blas.Trans
+//	y = alpha * Aᴴ * x + beta * y  if trans = blas.ConjTrans
+//
+// where alpha and beta are scalars, x and y are vectors, and A is an m×n band matrix
+// with kL sub-diagonals and kU super-diagonals. Band row i is read from a[i*lda:]; lda must be at least kL+kU+1.
+func (Implementation) Zgbmv(trans blas.Transpose, m, n, kL, kU int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) {
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if kL < 0 {
+		panic(kLLT0)
+	}
+	if kU < 0 {
+		panic(kULT0)
+	}
+	if lda < kL+kU+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(min(m, n+kL)-1)+kL+kU+1 {
+		panic(shortA)
+	}
+	var lenX, lenY int
+	if trans == blas.NoTrans { // x has n elements and y has m; the transposed operations swap the two lengths.
+		lenX, lenY = n, m
+	} else {
+		lenX, lenY = m, n
+	}
+	if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	var kx int // Start index into x; non-zero only for a negative stride.
+	if incX < 0 {
+		kx = (1 - lenX) * incX
+	}
+	var ky int // Start index into y; non-zero only for a negative stride.
+	if incY < 0 {
+		ky = (1 - lenY) * incY
+	}
+
+	// Form y = beta*y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:lenY] {
+					y[i] = 0
+				}
+			} else {
+				c128.ScalUnitary(beta, y[:lenY])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < lenY; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				if incY > 0 {
+					c128.ScalInc(beta, y, uintptr(lenY), uintptr(incY))
+				} else {
+					c128.ScalInc(beta, y, uintptr(lenY), uintptr(-incY)) // ScalInc is given the stride magnitude when incY is negative.
+				}
+			}
+		}
+	}
+
+	nRow := min(m, n+kL) // Number of band rows visited by the loops below.
+	nCol := kL + 1 + kU // Width of one stored band row.
+	switch trans {
+	case blas.NoTrans:
+		iy := ky
+		if incX == 1 {
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i) // aRow below is the slice [l, u) of stored band row i.
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) // Index of the x element paired with aRow[0].
+				xtmp := x[off : off+u-l]
+				var sum complex128
+				for j, v := range aRow {
+					sum += xtmp[j] * v
+				}
+				y[iy] += alpha * sum
+				iy += incY
+			}
+		} else {
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incX
+				jx := kx
+				var sum complex128
+				for _, v := range aRow {
+					sum += x[off+jx] * v
+					jx += incX
+				}
+				y[iy] += alpha * sum
+				iy += incY
+			}
+		}
+	case blas.Trans: // Each band row i scatters alpha*x[i]*aRow into the matching elements of y.
+		if incX == 1 {
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incY
+				alphaxi := alpha * x[i]
+				jy := ky
+				for _, v := range aRow {
+					y[off+jy] += alphaxi * v
+					jy += incY
+				}
+			}
+		} else {
+			ix := kx
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incY
+				alphaxi := alpha * x[ix]
+				jy := ky
+				for _, v := range aRow {
+					y[off+jy] += alphaxi * v
+					jy += incY
+				}
+				ix += incX
+			}
+		}
+	case blas.ConjTrans: // Same traversal as blas.Trans, with each element of A conjugated.
+		if incX == 1 {
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incY
+				alphaxi := alpha * x[i]
+				jy := ky
+				for _, v := range aRow {
+					y[off+jy] += alphaxi * cmplx.Conj(v)
+					jy += incY
+				}
+			}
+		} else {
+			ix := kx
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incY
+				alphaxi := alpha * x[ix]
+				jy := ky
+				for _, v := range aRow {
+					y[off+jy] += alphaxi * cmplx.Conj(v)
+					jy += incY
+				}
+				ix += incX
+			}
+		}
+	}
+}
+
+// Zgemv performs one of the matrix-vector operations
+//
+//	y = alpha * A * x + beta * y   if trans = blas.NoTrans
+//	y = alpha * Aᵀ * x + beta * y  if trans = blas.Trans
+//	y = alpha * Aᴴ * x + beta * y  if trans = blas.ConjTrans
+//
+// where alpha and beta are scalars, x and y are vectors, and A is an m×n dense matrix whose row i is a[i*lda : i*lda+n].
+func (Implementation) Zgemv(trans blas.Transpose, m, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) {
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	var lenX, lenY int
+	if trans == blas.NoTrans { // x has n elements and y has m; the transposed operations swap the two lengths.
+		lenX = n
+		lenY = m
+	} else {
+		lenX = m
+		lenY = n
+	}
+	if len(a) < lda*(m-1)+n {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	var kx int // Start index into x; non-zero only for a negative stride.
+	if incX < 0 {
+		kx = (1 - lenX) * incX
+	}
+	var ky int // Start index into y; non-zero only for a negative stride.
+	if incY < 0 {
+		ky = (1 - lenY) * incY
+	}
+
+	// Form y = beta*y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:lenY] {
+					y[i] = 0
+				}
+			} else {
+				c128.ScalUnitary(beta, y[:lenY])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < lenY; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				if incY > 0 {
+					c128.ScalInc(beta, y, uintptr(lenY), uintptr(incY))
+				} else {
+					c128.ScalInc(beta, y, uintptr(lenY), uintptr(-incY)) // ScalInc is given the stride magnitude when incY is negative.
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // Only the beta*y term remains, and it has been formed above.
+		return
+	}
+
+	switch trans {
+	default: // Only blas.NoTrans reaches here; other values were rejected by the first switch.
+		// Form y = alpha*A*x + y.
+		iy := ky
+		if incX == 1 {
+			for i := 0; i < m; i++ {
+				y[iy] += alpha * c128.DotuUnitary(a[i*lda:i*lda+n], x[:n])
+				iy += incY
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			y[iy] += alpha * c128.DotuInc(a[i*lda:i*lda+n], x, uintptr(n), 1, uintptr(incX), 0, uintptr(kx))
+			iy += incY
+		}
+		return
+
+	case blas.Trans:
+		// Form y = alpha*Aᵀ*x + y.
+		ix := kx
+		if incY == 1 {
+			for i := 0; i < m; i++ {
+				c128.AxpyUnitary(alpha*x[ix], a[i*lda:i*lda+n], y[:n]) // Adds row i of A, scaled by alpha*x[ix], into y.
+				ix += incX
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			c128.AxpyInc(alpha*x[ix], a[i*lda:i*lda+n], y, uintptr(n), 1, uintptr(incY), 0, uintptr(ky))
+			ix += incX
+		}
+		return
+
+	case blas.ConjTrans:
+		// Form y = alpha*Aᴴ*x + y.
+		ix := kx
+		if incY == 1 {
+			for i := 0; i < m; i++ {
+				tmp := alpha * x[ix]
+				for j := 0; j < n; j++ {
+					y[j] += tmp * cmplx.Conj(a[i*lda+j])
+				}
+				ix += incX
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			tmp := alpha * x[ix]
+			jy := ky
+			for j := 0; j < n; j++ {
+				y[jy] += tmp * cmplx.Conj(a[i*lda+j])
+				jy += incY
+			}
+			ix += incX
+		}
+		return
+	}
+}
+
+// Zgerc performs the rank-one operation
+//
+//	A += alpha * x * yᴴ
+//
+// where A is an m×n dense matrix, alpha is a scalar, x is an m element vector,
+// and y is an n element vector. A is updated one column at a time; columns where y is zero are skipped.
+func (Implementation) Zgerc(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int) {
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(a) < lda*(m-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 {
+		return
+	}
+
+	var kx, jy int // Start indices into x and y; non-zero only for negative strides.
+	if incX < 0 {
+		kx = (1 - m) * incX
+	}
+	if incY < 0 {
+		jy = (1 - n) * incY
+	}
+	for j := 0; j < n; j++ {
+		if y[jy] != 0 {
+			tmp := alpha * cmplx.Conj(y[jy])
+			c128.AxpyInc(tmp, x, a[j:], uintptr(m), uintptr(incX), uintptr(lda), uintptr(kx), 0) // Column j of A is a[j:] with stride lda.
+		}
+		jy += incY
+	}
+}
+
+// Zgeru performs the rank-one operation
+//
+//	A += alpha * x * yᵀ
+//
+// where A is an m×n dense matrix, alpha is a scalar, x is an m element vector,
+// and y is an n element vector. A is updated one row at a time; rows where x is zero are skipped.
+func (Implementation) Zgeru(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int) {
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(a) < lda*(m-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 {
+		return
+	}
+
+	var kx int // Running index into x; starts at the far end for a negative stride.
+	if incX < 0 {
+		kx = (1 - m) * incX
+	}
+	if incY == 1 { // Contiguous y.
+		for i := 0; i < m; i++ {
+			if x[kx] != 0 {
+				tmp := alpha * x[kx]
+				c128.AxpyUnitary(tmp, y[:n], a[i*lda:i*lda+n]) // Adds tmp*y into row i of A.
+			}
+			kx += incX
+		}
+		return
+	}
+	var jy int // Start index into y; non-zero only for a negative stride.
+	if incY < 0 {
+		jy = (1 - n) * incY
+	}
+	for i := 0; i < m; i++ {
+		if x[kx] != 0 {
+			tmp := alpha * x[kx]
+			c128.AxpyInc(tmp, y, a[i*lda:i*lda+n], uintptr(n), uintptr(incY), 1, uintptr(jy), 0)
+		}
+		kx += incX
+	}
+}
+
+// Zhbmv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where alpha and beta are scalars, x and y are vectors, and A is an n×n
+// Hermitian band matrix with k super-diagonals. The imaginary parts of
+// the diagonal elements of A are ignored and assumed to be zero. Band row i is read from a[i*lda:]; lda must be at least k+1.
+func (Implementation) Zhbmv(uplo blas.Uplo, n, k int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Set up the start indices in X and Y.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	var ky int
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+
+	// Form y = beta*y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				for i, v := range y[:n] {
+					y[i] = beta * v
+				}
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					y[iy] = beta * y[iy]
+					iy += incY
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // Only the beta*y term remains, and it has been formed above.
+		return
+	}
+
+	// The elements of A are accessed sequentially with one pass through a.
+	switch uplo {
+	case blas.Upper: // The diagonal element of row i is aRow[0]; aRow[j] for j >= 1 pairs row i with row i+j.
+		iy := ky
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				aRow := a[i*lda:]
+				alphaxi := alpha * x[i]
+				sum := alphaxi * complex(real(aRow[0]), 0) // Only the real part of the diagonal is used.
+				u := min(k+1, n-i)
+				jy := incY
+				for j := 1; j < u; j++ {
+					v := aRow[j]
+					sum += alpha * x[i+j] * v
+					y[iy+jy] += alphaxi * cmplx.Conj(v) // The mirrored entry contributes its conjugate to y[i+j].
+					jy += incY
+				}
+				y[iy] += sum
+				iy += incY
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				aRow := a[i*lda:]
+				alphaxi := alpha * x[ix]
+				sum := alphaxi * complex(real(aRow[0]), 0)
+				u := min(k+1, n-i)
+				jx := incX
+				jy := incY
+				for j := 1; j < u; j++ {
+					v := aRow[j]
+					sum += alpha * x[ix+jx] * v
+					y[iy+jy] += alphaxi * cmplx.Conj(v)
+					jx += incX
+					jy += incY
+				}
+				y[iy] += sum
+				ix += incX
+				iy += incY
+			}
+		}
+	case blas.Lower: // The diagonal element of row i is aRow[k]; aRow[j] for j < k pairs row i with row i-k+j.
+		iy := ky
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				l := max(0, k-i) // Entries of aRow before index l are not read.
+				alphaxi := alpha * x[i]
+				jy := l * incY
+				aRow := a[i*lda:]
+				for j := l; j < k; j++ {
+					v := aRow[j]
+					y[iy] += alpha * v * x[i-k+j]
+					y[iy-k*incY+jy] += alphaxi * cmplx.Conj(v)
+					jy += incY
+				}
+				y[iy] += alphaxi * complex(real(aRow[k]), 0) // Only the real part of the diagonal is used.
+				iy += incY
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				l := max(0, k-i)
+				alphaxi := alpha * x[ix]
+				jx := l * incX
+				jy := l * incY
+				aRow := a[i*lda:]
+				for j := l; j < k; j++ {
+					v := aRow[j]
+					y[iy] += alpha * v * x[ix-k*incX+jx]
+					y[iy-k*incY+jy] += alphaxi * cmplx.Conj(v)
+					jx += incX
+					jy += incY
+				}
+				y[iy] += alphaxi * complex(real(aRow[k]), 0)
+				ix += incX
+				iy += incY
+			}
+		}
+	}
+}
+
+// Zhemv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where alpha and beta are scalars, x and y are n element vectors, and A is an
+// n×n Hermitian matrix of which only the triangle selected by uplo is read. The
+// imaginary parts of the diagonal of A are ignored. Invalid arguments cause a panic.
+func (Implementation) Zhemv(uplo blas.Uplo, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// An empty matrix leaves y untouched, so return before the slice length checks.
+	if n == 0 {
+		return
+	}
+
+	// The slices must reach the last element addressed: a[lda*(n-1)+n-1] and index (n-1)*|inc| of x and y.
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// With alpha == 0 and beta == 1 the operation reduces to y = y.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// With a negative increment a vector is walked backwards, so its first logical element sits at (1-n)*inc.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	var ky int
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+
+	// Form y = beta*y. beta == 0 stores zeros directly instead of multiplying the old contents of y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				for i, v := range y[:n] {
+					y[i] = beta * v
+				}
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					y[iy] = beta * y[iy]
+					iy += incY
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // only the scaling of y was requested
+		return
+	}
+
+	// Each stored element a[i*lda+j] of the triangle is read once and used for two
+	// entries of y: as A[i][j] in the sum for row i and, conjugated, as A[j][i] for row j.
+
+	if uplo == blas.Upper {
+		// Form y when A is stored in upper triangle: row i holds columns i..n-1.
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				tmp1 := alpha * x[i]
+				var tmp2 complex128 // sum of a[i][j]*x[j] over the columns j > i
+				for j := i + 1; j < n; j++ {
+					y[j] += tmp1 * cmplx.Conj(a[i*lda+j])
+					tmp2 += a[i*lda+j] * x[j]
+				}
+				aii := complex(real(a[i*lda+i]), 0) // imaginary part of the diagonal is discarded
+				y[i] += tmp1*aii + alpha*tmp2
+			}
+		} else {
+			ix := kx
+			iy := ky
+			for i := 0; i < n; i++ {
+				tmp1 := alpha * x[ix]
+				var tmp2 complex128
+				jx := ix // jx and jy are advanced before use, so they start at the diagonal position
+				jy := iy
+				for j := i + 1; j < n; j++ {
+					jx += incX
+					jy += incY
+					y[jy] += tmp1 * cmplx.Conj(a[i*lda+j])
+					tmp2 += a[i*lda+j] * x[jx]
+				}
+				aii := complex(real(a[i*lda+i]), 0)
+				y[iy] += tmp1*aii + alpha*tmp2
+				ix += incX
+				iy += incY
+			}
+		}
+		return
+	}
+
+	// Form y when A is stored in lower triangle: row i holds columns 0..i.
+	if incX == 1 && incY == 1 {
+		for i := 0; i < n; i++ {
+			tmp1 := alpha * x[i]
+			var tmp2 complex128 // sum of a[i][j]*x[j] over the columns j < i
+			for j := 0; j < i; j++ {
+				y[j] += tmp1 * cmplx.Conj(a[i*lda+j])
+				tmp2 += a[i*lda+j] * x[j]
+			}
+			aii := complex(real(a[i*lda+i]), 0)
+			y[i] += tmp1*aii + alpha*tmp2
+		}
+	} else {
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			tmp1 := alpha * x[ix]
+			var tmp2 complex128
+			jx := kx // columns start at 0, so the inner walk restarts at the first logical element
+			jy := ky
+			for j := 0; j < i; j++ {
+				y[jy] += tmp1 * cmplx.Conj(a[i*lda+j])
+				tmp2 += a[i*lda+j] * x[jx]
+				jx += incX
+				jy += incY
+			}
+			aii := complex(real(a[i*lda+i]), 0)
+			y[iy] += tmp1*aii + alpha*tmp2
+			ix += incX
+			iy += incY
+		}
+	}
+}
+
+// Zher performs the Hermitian rank-one operation
+//
+//	A += alpha * x * xᴴ
+//
+// where A is an n×n Hermitian matrix, alpha is a real scalar, and x is an n
+// element vector. On entry, the imaginary parts of the diagonal elements of A
+// are ignored and assumed to be zero, on return they will be set to zero.
+func (Implementation) Zher(uplo blas.Uplo, n int, alpha float64, x []complex128, incX int, a []complex128, lda int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 {
+		return
+	}
+
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	if uplo == blas.Upper {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				if x[i] != 0 {
+					tmp := complex(alpha*real(x[i]), alpha*imag(x[i]))
+					aii := real(a[i*lda+i])
+					xtmp := real(tmp * cmplx.Conj(x[i]))
+					a[i*lda+i] = complex(aii+xtmp, 0)
+					for j := i + 1; j < n; j++ {
+						a[i*lda+j] += tmp * cmplx.Conj(x[j])
+					}
+				} else {
+					aii := real(a[i*lda+i])
+					a[i*lda+i] = complex(aii, 0)
+				}
+			}
+			return
+		}
+
+		ix := kx
+		for i := 0; i < n; i++ {
+			if x[ix] != 0 {
+				tmp := complex(alpha*real(x[ix]), alpha*imag(x[ix]))
+				aii := real(a[i*lda+i])
+				xtmp := real(tmp * cmplx.Conj(x[ix]))
+				a[i*lda+i] = complex(aii+xtmp, 0)
+				jx := ix + incX
+				for j := i + 1; j < n; j++ {
+					a[i*lda+j] += tmp * cmplx.Conj(x[jx])
+					jx += incX
+				}
+			} else {
+				aii := real(a[i*lda+i])
+				a[i*lda+i] = complex(aii, 0)
+			}
+			ix += incX
+		}
+		return
+	}
+
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			if x[i] != 0 {
+				tmp := complex(alpha*real(x[i]), alpha*imag(x[i]))
+				for j := 0; j < i; j++ {
+					a[i*lda+j] += tmp * cmplx.Conj(x[j])
+				}
+				aii := real(a[i*lda+i])
+				xtmp := real(tmp * cmplx.Conj(x[i]))
+				a[i*lda+i] = complex(aii+xtmp, 0)
+			} else {
+				aii := real(a[i*lda+i])
+				a[i*lda+i] = complex(aii, 0)
+			}
+		}
+		return
+	}
+
+	ix := kx
+	for i := 0; i < n; i++ {
+		if x[ix] != 0 {
+			tmp := complex(alpha*real(x[ix]), alpha*imag(x[ix]))
+			jx := kx
+			for j := 0; j < i; j++ {
+				a[i*lda+j] += tmp * cmplx.Conj(x[jx])
+				jx += incX
+			}
+			aii := real(a[i*lda+i])
+			xtmp := real(tmp * cmplx.Conj(x[ix]))
+			a[i*lda+i] = complex(aii+xtmp, 0)
+
+		} else {
+			aii := real(a[i*lda+i])
+			a[i*lda+i] = complex(aii, 0)
+		}
+		ix += incX
+	}
+}
+
+// Zher2 applies the Hermitian rank-two update
+//
+//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ
+//
+// to the n×n Hermitian matrix A, where alpha is a complex scalar and x and y are
+// vectors of n elements. Only the triangle of A selected by uplo is accessed.
+// Imaginary parts of the diagonal elements of A are ignored on entry and set to
+// zero on return. When n or alpha is zero, A is left unmodified. Zher2 panics if
+// an argument is invalid or a slice is too short.
+func (Implementation) Zher2(uplo blas.Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int) {
+	if uplo != blas.Upper && uplo != blas.Lower {
+		panic(badUplo)
+	}
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	}
+
+	// An empty matrix needs neither slice checks nor work.
+	if n == 0 {
+		return
+	}
+
+	// x, y and a must reach the last element that the update addresses.
+	switch {
+	case incX > 0 && len(x) <= (n-1)*incX, incX < 0 && len(x) <= (1-n)*incX:
+		panic(shortX)
+	case incY > 0 && len(y) <= (n-1)*incY, incY < 0 && len(y) <= (1-n)*incY:
+		panic(shortY)
+	case len(a) < lda*(n-1)+n:
+		panic(shortA)
+	}
+
+	// A zero alpha leaves A unchanged.
+	if alpha == 0 {
+		return
+	}
+
+	// firstX and firstY locate the first logical elements of x and y; a
+	// negative increment walks the slice backwards from there.
+	var firstX, firstY int
+	if incX < 0 {
+		firstX = (1 - n) * incX
+	}
+	if incY < 0 {
+		firstY = (1 - n) * incY
+	}
+	unit := incX == 1 && incY == 1
+
+	// Each case below handles one (uplo, stride) combination. A row whose x and
+	// y elements are both zero only has its diagonal's imaginary part cleared.
+	switch {
+	case uplo == blas.Upper && unit:
+		// Upper triangle, unit strides: row r updates a[r][r..n-1].
+		for r := 0; r < n; r++ {
+			d := r*lda + r
+			if x[r] == 0 && y[r] == 0 {
+				a[d] = complex(real(a[d]), 0)
+				continue
+			}
+			sx := alpha * x[r]
+			sy := cmplx.Conj(alpha) * y[r]
+			// The diagonal stays real: only real parts of the products are added.
+			a[d] = complex(real(a[d])+real(sx*cmplx.Conj(y[r]))+real(sy*cmplx.Conj(x[r])), 0)
+			for c := r + 1; c < n; c++ {
+				a[r*lda+c] += sx*cmplx.Conj(y[c]) + sy*cmplx.Conj(x[c])
+			}
+		}
+
+	case uplo == blas.Upper:
+		// Upper triangle, general strides: px, py follow row r; qx, qy follow column c.
+		px, py := firstX, firstY
+		for r := 0; r < n; r++ {
+			d := r*lda + r
+			if x[px] != 0 || y[py] != 0 {
+				sx := alpha * x[px]
+				sy := cmplx.Conj(alpha) * y[py]
+				a[d] = complex(real(a[d])+real(sx*cmplx.Conj(y[py]))+real(sy*cmplx.Conj(x[px])), 0)
+				qx, qy := px+incX, py+incY
+				for c := r + 1; c < n; c++ {
+					a[r*lda+c] += sx*cmplx.Conj(y[qy]) + sy*cmplx.Conj(x[qx])
+					qx += incX
+					qy += incY
+				}
+			} else {
+				a[d] = complex(real(a[d]), 0)
+			}
+			px += incX
+			py += incY
+		}
+
+	case unit:
+		// Lower triangle, unit strides: row r updates a[r][0..r].
+		for r := 0; r < n; r++ {
+			d := r*lda + r
+			if x[r] == 0 && y[r] == 0 {
+				a[d] = complex(real(a[d]), 0)
+				continue
+			}
+			sx := alpha * x[r]
+			sy := cmplx.Conj(alpha) * y[r]
+			for c := 0; c < r; c++ {
+				a[r*lda+c] += sx*cmplx.Conj(y[c]) + sy*cmplx.Conj(x[c])
+			}
+			// The diagonal is updated after the off-diagonal elements of the row.
+			a[d] = complex(real(a[d])+real(sx*cmplx.Conj(y[r]))+real(sy*cmplx.Conj(x[r])), 0)
+		}
+
+	default:
+		// Lower triangle, general strides: qx and qy restart at the first logical
+		// elements of x and y for every row because the columns begin at 0.
+		px, py := firstX, firstY
+		for r := 0; r < n; r++ {
+			d := r*lda + r
+			if x[px] != 0 || y[py] != 0 {
+				sx := alpha * x[px]
+				sy := cmplx.Conj(alpha) * y[py]
+				qx, qy := firstX, firstY
+				for c := 0; c < r; c++ {
+					a[r*lda+c] += sx*cmplx.Conj(y[qy]) + sy*cmplx.Conj(x[qx])
+					qx += incX
+					qy += incY
+				}
+				a[d] = complex(real(a[d])+real(sx*cmplx.Conj(y[py]))+real(sy*cmplx.Conj(x[px])), 0)
+			} else {
+				a[d] = complex(real(a[d]), 0)
+			}
+			px += incX
+			py += incY
+		}
+	}
+}
+
+// Zhpmv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where alpha and beta are scalars, x and y are n element vectors, and A is an
+// n×n Hermitian matrix whose uplo triangle is packed row by row in ap. The imaginary
+// parts of the diagonal elements of A are ignored. Invalid arguments cause a panic.
+func (Implementation) Zhpmv(uplo blas.Uplo, n int, alpha complex128, ap []complex128, x []complex128, incX int, beta complex128, y []complex128, incY int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// An empty matrix leaves y untouched, so return before the slice length checks.
+	if n == 0 {
+		return
+	}
+
+	// ap must hold the n*(n+1)/2 packed elements; x and y must reach index (n-1)*|inc|.
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// With alpha == 0 and beta == 1 the operation reduces to y = y.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// With a negative increment a vector is walked backwards, so its first logical element sits at (1-n)*inc.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	var ky int
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+
+	// Form y = beta*y. beta == 0 stores zeros directly instead of multiplying the old contents of y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				for i, v := range y[:n] {
+					y[i] = beta * v
+				}
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					y[iy] *= beta
+					iy += incY
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // only the scaling of y was requested
+		return
+	}
+
+	// ap is read front to back once; each off-diagonal element serves A[i][j] and, conjugated, A[j][i].
+
+	var kk int
+	if uplo == blas.Upper {
+		// Form y when ap contains the upper triangle.
+		// Here, kk points to the current diagonal element in ap.
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				tmp1 := alpha * x[i]
+				y[i] += tmp1 * complex(real(ap[kk]), 0) // diagonal term, imaginary part discarded
+				var tmp2 complex128
+				k := kk + 1 // first off-diagonal element of packed row i
+				for j := i + 1; j < n; j++ {
+					y[j] += tmp1 * cmplx.Conj(ap[k])
+					tmp2 += ap[k] * x[j]
+					k++
+				}
+				y[i] += alpha * tmp2
+				kk += n - i // packed row i of the upper triangle holds n-i elements
+			}
+		} else {
+			ix := kx
+			iy := ky
+			for i := 0; i < n; i++ {
+				tmp1 := alpha * x[ix]
+				y[iy] += tmp1 * complex(real(ap[kk]), 0)
+				var tmp2 complex128
+				jx := ix // jx and jy are advanced before use, so they start at the diagonal position
+				jy := iy
+				for k := kk + 1; k < kk+n-i; k++ {
+					jx += incX
+					jy += incY
+					y[jy] += tmp1 * cmplx.Conj(ap[k])
+					tmp2 += ap[k] * x[jx]
+				}
+				y[iy] += alpha * tmp2
+				ix += incX
+				iy += incY
+				kk += n - i
+			}
+		}
+		return
+	}
+
+	// Form y when ap contains the lower triangle.
+	// Here, kk points to the beginning of current row in ap; the diagonal is at ap[kk+i].
+	if incX == 1 && incY == 1 {
+		for i := 0; i < n; i++ {
+			tmp1 := alpha * x[i]
+			var tmp2 complex128
+			k := kk
+			for j := 0; j < i; j++ {
+				y[j] += tmp1 * cmplx.Conj(ap[k])
+				tmp2 += ap[k] * x[j]
+				k++
+			}
+			aii := complex(real(ap[kk+i]), 0)
+			y[i] += tmp1*aii + alpha*tmp2
+			kk += i + 1 // packed row i of the lower triangle holds i+1 elements
+		}
+	} else {
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			tmp1 := alpha * x[ix]
+			var tmp2 complex128
+			jx := kx // columns start at 0, so the inner walk restarts at the first logical element
+			jy := ky
+			for k := kk; k < kk+i; k++ {
+				y[jy] += tmp1 * cmplx.Conj(ap[k])
+				tmp2 += ap[k] * x[jx]
+				jx += incX
+				jy += incY
+			}
+			aii := complex(real(ap[kk+i]), 0)
+			y[iy] += tmp1*aii + alpha*tmp2
+			ix += incX
+			iy += incY
+			kk += i + 1
+		}
+	}
+}
+
+// Zhpr performs the Hermitian rank-1 operation
+//
+//	A += alpha * x * xᴴ
+//
+// where alpha is a real scalar, x is an n element vector, and A is an n×n Hermitian
+// matrix whose uplo triangle is packed row by row in ap. On entry, the imaginary parts of the
+// diagonal elements are assumed to be zero, and on return they are set to zero.
+func (Implementation) Zhpr(uplo blas.Uplo, n int, alpha float64, x []complex128, incX int, ap []complex128) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// An empty matrix needs no work, so return before the slice length checks.
+	if n == 0 {
+		return
+	}
+
+	// x must reach index (n-1)*|incX| and ap must hold the n*(n+1)/2 packed elements.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+
+	// With alpha == 0 nothing is added; ap is returned unmodified.
+	if alpha == 0 {
+		return
+	}
+
+	// With a negative increment x is walked backwards, so its first logical element sits at (1-n)*incX.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through ap.
+
+	var kk int
+	if uplo == blas.Upper {
+		// Form A when upper triangle is stored in AP.
+		// Here, kk points to the current diagonal element in ap.
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				xi := x[i]
+				if xi != 0 {
+					aii := real(ap[kk]) + alpha*real(cmplx.Conj(xi)*xi)
+					ap[kk] = complex(aii, 0)
+
+					tmp := complex(alpha, 0) * xi
+					a := ap[kk+1 : kk+n-i] // off-diagonal part of packed row i
+					x := x[i+1 : n]
+					for j, v := range x {
+						a[j] += tmp * cmplx.Conj(v)
+					}
+				} else {
+					ap[kk] = complex(real(ap[kk]), 0) // row unchanged apart from clearing the diagonal's imaginary part
+				}
+				kk += n - i // packed row i of the upper triangle holds n-i elements
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				xi := x[ix]
+				if xi != 0 {
+					aii := real(ap[kk]) + alpha*real(cmplx.Conj(xi)*xi)
+					ap[kk] = complex(aii, 0)
+
+					tmp := complex(alpha, 0) * xi
+					jx := ix + incX // position in x of the element paired with column i+1
+					a := ap[kk+1 : kk+n-i]
+					for k := range a {
+						a[k] += tmp * cmplx.Conj(x[jx])
+						jx += incX
+					}
+				} else {
+					ap[kk] = complex(real(ap[kk]), 0)
+				}
+				ix += incX
+				kk += n - i
+			}
+		}
+		return
+	}
+
+	// Form A when lower triangle is stored in AP.
+	// Here, kk points to the beginning of current row in ap; the diagonal is at ap[kk+i].
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			xi := x[i]
+			if xi != 0 {
+				tmp := complex(alpha, 0) * xi
+				a := ap[kk : kk+i] // off-diagonal part of packed row i
+				for j, v := range x[:i] {
+					a[j] += tmp * cmplx.Conj(v)
+				}
+
+				aii := real(ap[kk+i]) + alpha*real(cmplx.Conj(xi)*xi)
+				ap[kk+i] = complex(aii, 0)
+			} else {
+				ap[kk+i] = complex(real(ap[kk+i]), 0)
+			}
+			kk += i + 1 // packed row i of the lower triangle holds i+1 elements
+		}
+	} else {
+		ix := kx
+		for i := 0; i < n; i++ {
+			xi := x[ix]
+			if xi != 0 {
+				tmp := complex(alpha, 0) * xi
+				a := ap[kk : kk+i]
+				jx := kx // columns start at 0, so the walk restarts at the first logical element of x
+				for k := range a {
+					a[k] += tmp * cmplx.Conj(x[jx])
+					jx += incX
+				}
+
+				aii := real(ap[kk+i]) + alpha*real(cmplx.Conj(xi)*xi)
+				ap[kk+i] = complex(aii, 0)
+			} else {
+				ap[kk+i] = complex(real(ap[kk+i]), 0)
+			}
+			ix += incX
+			kk += i + 1
+		}
+	}
+}
+
+// Zhpr2 performs the Hermitian rank-2 operation
+//
+//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ
+//
+// where alpha is a complex scalar, x and y are n element vectors, and A is an
+// n×n Hermitian matrix whose uplo triangle is packed row by row in ap. On entry, the imaginary
+// parts of the diagonal elements are assumed to be zero, and on return they are set to zero.
+func (Implementation) Zhpr2(uplo blas.Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, ap []complex128) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// An empty matrix needs no work, so return before the slice length checks.
+	if n == 0 {
+		return
+	}
+
+	// x and y must reach index (n-1)*|inc| and ap must hold the n*(n+1)/2 packed elements.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+
+	// With alpha == 0 nothing is added; ap is returned unmodified.
+	if alpha == 0 {
+		return
+	}
+
+	// With a negative increment a vector is walked backwards, so its first logical element sits at (1-n)*inc.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	var ky int
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+
+	// The elements of A are accessed sequentially with one pass through ap.
+
+	var kk int
+	if uplo == blas.Upper {
+		// Form A when upper triangle is stored in AP.
+		// Here, kk points to the current diagonal element in ap.
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				if x[i] != 0 || y[i] != 0 {
+					tmp1 := alpha * x[i]
+					tmp2 := cmplx.Conj(alpha) * y[i]
+					aii := real(ap[kk]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i]))
+					ap[kk] = complex(aii, 0)
+					k := kk + 1 // first off-diagonal element of packed row i
+					for j := i + 1; j < n; j++ {
+						ap[k] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j])
+						k++
+					}
+				} else {
+					ap[kk] = complex(real(ap[kk]), 0) // row unchanged apart from clearing the diagonal's imaginary part
+				}
+				kk += n - i // packed row i of the upper triangle holds n-i elements
+			}
+		} else {
+			ix := kx
+			iy := ky
+			for i := 0; i < n; i++ {
+				if x[ix] != 0 || y[iy] != 0 {
+					tmp1 := alpha * x[ix]
+					tmp2 := cmplx.Conj(alpha) * y[iy]
+					aii := real(ap[kk]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix]))
+					ap[kk] = complex(aii, 0)
+					jx := ix + incX // positions in x and y of the elements paired with column i+1
+					jy := iy + incY
+					for k := kk + 1; k < kk+n-i; k++ {
+						ap[k] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx])
+						jx += incX
+						jy += incY
+					}
+				} else {
+					ap[kk] = complex(real(ap[kk]), 0)
+				}
+				ix += incX
+				iy += incY
+				kk += n - i
+			}
+		}
+		return
+	}
+
+	// Form A when lower triangle is stored in AP.
+	// Here, kk points to the beginning of current row in ap; the diagonal is at ap[kk+i].
+	if incX == 1 && incY == 1 {
+		for i := 0; i < n; i++ {
+			if x[i] != 0 || y[i] != 0 {
+				tmp1 := alpha * x[i]
+				tmp2 := cmplx.Conj(alpha) * y[i]
+				k := kk
+				for j := 0; j < i; j++ {
+					ap[k] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j])
+					k++
+				}
+				aii := real(ap[kk+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i]))
+				ap[kk+i] = complex(aii, 0)
+			} else {
+				ap[kk+i] = complex(real(ap[kk+i]), 0)
+			}
+			kk += i + 1 // packed row i of the lower triangle holds i+1 elements
+		}
+	} else {
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			if x[ix] != 0 || y[iy] != 0 {
+				tmp1 := alpha * x[ix]
+				tmp2 := cmplx.Conj(alpha) * y[iy]
+				jx := kx // columns start at 0, so the walk restarts at the first logical elements of x and y
+				jy := ky
+				for k := kk; k < kk+i; k++ {
+					ap[k] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx])
+					jx += incX
+					jy += incY
+				}
+				aii := real(ap[kk+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix]))
+				ap[kk+i] = complex(aii, 0)
+			} else {
+				ap[kk+i] = complex(real(ap[kk+i]), 0)
+			}
+			ix += incX
+			iy += incY
+			kk += i + 1
+		}
+	}
+}
+
+// Ztbmv performs one of the matrix-vector operations
+//
+//	x = A * x   if trans = blas.NoTrans
+//	x = Aᵀ * x  if trans = blas.Trans
+//	x = Aᴴ * x  if trans = blas.ConjTrans
+//
+// where x is an n element vector, updated in place, and A is an n×n triangular band
+// matrix with (k+1) diagonals. With diag == blas.Unit the stored diagonal is not read.
+func (Implementation) Ztbmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, k int, a []complex128, lda int, x []complex128, incX int) {
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// An empty matrix leaves x untouched, so return before the slice length checks.
+	if n == 0 {
+		return
+	}
+
+	// a must hold n band rows of k+1 elements spaced lda apart; x must reach index (n-1)*|incX|.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// With a negative increment x is walked backwards, so its first logical element sits at (1-n)*incX.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	switch trans {
+	case blas.NoTrans: // each x[i] is replaced by the product of band row i with x
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ { // ascending: x[j] with j > i still holds its input value
+					xi := x[i]
+					if diag == blas.NonUnit {
+						xi *= a[i*lda] // the diagonal is the first element of an upper band row
+					}
+					kk := min(k, n-i-1) // number of stored elements to the right of the diagonal
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						xi += x[i+j+1] * aij
+					}
+					x[i] = xi
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					xi := x[ix]
+					if diag == blas.NonUnit {
+						xi *= a[i*lda]
+					}
+					kk := min(k, n-i-1)
+					jx := ix + incX
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						xi += x[jx] * aij
+						jx += incX
+					}
+					x[ix] = xi
+					ix += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- { // descending: x[j] with j < i still holds its input value
+					xi := x[i]
+					if diag == blas.NonUnit {
+						xi *= a[i*lda+k] // the diagonal is element k of a lower band row
+					}
+					kk := min(k, i) // number of stored elements to the left of the diagonal
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						xi += x[i-kk+j] * aij
+					}
+					x[i] = xi
+				}
+			} else {
+				ix := kx + (n-1)*incX // position in x of logical element n-1
+				for i := n - 1; i >= 0; i-- {
+					xi := x[ix]
+					if diag == blas.NonUnit {
+						xi *= a[i*lda+k]
+					}
+					kk := min(k, i)
+					jx := ix - kk*incX
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						xi += x[jx] * aij
+						jx += incX
+					}
+					x[ix] = xi
+					ix -= incX
+				}
+			}
+		}
+	case blas.Trans: // band row i is scattered: x[i]*a[i][j] is added into x[j]
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- { // descending: x[i] still holds its input value when read
+					kk := min(k, n-i-1)
+					xi := x[i]
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[i+j+1] += xi * aij
+					}
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda]
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					kk := min(k, n-i-1)
+					jx := ix + incX
+					xi := x[ix]
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[jx] += xi * aij
+						jx += incX
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda]
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := 0; i < n; i++ { // ascending: x[i] still holds its input value when read
+					kk := min(k, i)
+					xi := x[i]
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[i-kk+j] += xi * aij
+					}
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+k]
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					kk := min(k, i)
+					jx := ix - kk*incX
+					xi := x[ix]
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[jx] += xi * aij
+						jx += incX
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+k]
+					}
+					ix += incX
+				}
+			}
+		}
+	case blas.ConjTrans: // same traversal as blas.Trans with every element of A conjugated
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					kk := min(k, n-i-1)
+					xi := x[i]
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[i+j+1] += xi * cmplx.Conj(aij)
+					}
+					if diag == blas.NonUnit {
+						x[i] *= cmplx.Conj(a[i*lda])
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					kk := min(k, n-i-1)
+					jx := ix + incX
+					xi := x[ix]
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[jx] += xi * cmplx.Conj(aij)
+						jx += incX
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= cmplx.Conj(a[i*lda])
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					kk := min(k, i)
+					xi := x[i]
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[i-kk+j] += xi * cmplx.Conj(aij)
+					}
+					if diag == blas.NonUnit {
+						x[i] *= cmplx.Conj(a[i*lda+k])
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					kk := min(k, i)
+					jx := ix - kk*incX
+					xi := x[ix]
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[jx] += xi * cmplx.Conj(aij)
+						jx += incX
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= cmplx.Conj(a[i*lda+k])
+					}
+					ix += incX
+				}
+			}
+		}
+	}
+}
+
+// Ztbsv solves one of the systems of equations
+//
+//	A * x = b   if trans == blas.NoTrans
+//	Aᵀ * x = b  if trans == blas.Trans
+//	Aᴴ * x = b  if trans == blas.ConjTrans
+//
+// where b and x are n element vectors and A is an n×n triangular band matrix
+// with (k+1) diagonals. With diag == blas.Unit the stored diagonal is not read.
+//
+// On entry, x contains the values of b, and the solution is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func (Implementation) Ztbsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, k int, a []complex128, lda int, x []complex128, incX int) {
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// An empty system leaves x untouched, so return before the slice length checks.
+	if n == 0 {
+		return
+	}
+
+	// a must hold n band rows of k+1 elements spaced lda apart; x must reach index (n-1)*|incX|.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// With a negative increment x is walked backwards, so its first logical element sits at (1-n)*incX.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	switch trans {
+	case blas.NoTrans: // row-oriented substitution: subtract the solved terms of row i, then divide by the diagonal
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- { // descending: x[j] with j > i already holds its solved value
+					kk := min(k, n-i-1) // number of stored elements to the right of the diagonal
+					var sum complex128
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						sum += x[i+1+j] * aij
+					}
+					x[i] -= sum
+					if diag == blas.NonUnit {
+						x[i] /= a[i*lda] // the diagonal is the first element of an upper band row
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX // position in x of logical element n-1
+				for i := n - 1; i >= 0; i-- {
+					kk := min(k, n-i-1)
+					var sum complex128
+					jx := ix + incX
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						sum += x[jx] * aij
+						jx += incX
+					}
+					x[ix] -= sum
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda]
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := 0; i < n; i++ { // ascending: x[j] with j < i already holds its solved value
+					kk := min(k, i) // number of stored elements to the left of the diagonal
+					var sum complex128
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						sum += x[i-kk+j] * aij
+					}
+					x[i] -= sum
+					if diag == blas.NonUnit {
+						x[i] /= a[i*lda+k] // the diagonal is element k of a lower band row
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					kk := min(k, i)
+					var sum complex128
+					jx := ix - kk*incX
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						sum += x[jx] * aij
+						jx += incX
+					}
+					x[ix] -= sum
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda+k]
+					}
+					ix += incX
+				}
+			}
+		}
+	case blas.Trans: // x[i] is finished first, then x[i]*a[i][j] is subtracted from each x[j] in band row i
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[i] /= a[i*lda]
+					}
+					kk := min(k, n-i-1)
+					xi := x[i] // solved value of element i
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[i+1+j] -= xi * aij
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda]
+					}
+					kk := min(k, n-i-1)
+					xi := x[ix]
+					jx := ix + incX
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[jx] -= xi * aij
+						jx += incX
+					}
+					ix += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[i] /= a[i*lda+k]
+					}
+					kk := min(k, i)
+					xi := x[i] // solved value of element i
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[i-kk+j] -= xi * aij
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda+k]
+					}
+					kk := min(k, i)
+					xi := x[ix]
+					jx := ix - kk*incX
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[jx] -= xi * aij
+						jx += incX
+					}
+					ix -= incX
+				}
+			}
+		}
+	case blas.ConjTrans: // same traversal as blas.Trans with every element of A conjugated
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[i] /= cmplx.Conj(a[i*lda])
+					}
+					kk := min(k, n-i-1)
+					xi := x[i]
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[i+1+j] -= xi * cmplx.Conj(aij)
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[ix] /= cmplx.Conj(a[i*lda])
+					}
+					kk := min(k, n-i-1)
+					xi := x[ix]
+					jx := ix + incX
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[jx] -= xi * cmplx.Conj(aij)
+						jx += incX
+					}
+					ix += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[i] /= cmplx.Conj(a[i*lda+k])
+					}
+					kk := min(k, i)
+					xi := x[i]
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[i-kk+j] -= xi * cmplx.Conj(aij)
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[ix] /= cmplx.Conj(a[i*lda+k])
+					}
+					kk := min(k, i)
+					xi := x[ix]
+					jx := ix - kk*incX
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[jx] -= xi * cmplx.Conj(aij)
+						jx += incX
+					}
+					ix -= incX
+				}
+			}
+		}
+	}
+}
+
+// Ztpmv performs one of the matrix-vector operations
+//
+//	x = A * x   if trans = blas.NoTrans
+//	x = Aᵀ * x  if trans = blas.Trans
+//	x = Aᴴ * x  if trans = blas.ConjTrans
+//
+// where x is an n element vector and A is an n×n triangular matrix, supplied in
+// packed form in ap. The result overwrites x. Ztpmv panics on invalid arguments.
+func (Implementation) Ztpmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, ap []complex128, x []complex128, incX int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(ap) < n*(n+1)/2 { // A packed triangle of order n holds n*(n+1)/2 elements.
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X. It is non-zero only when incX is negative.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through ap.
+
+	if trans == blas.NoTrans {
+		// Form x = A*x.
+		if uplo == blas.Upper {
+			// kk points to the current diagonal element in ap.
+			kk := 0
+			if incX == 1 {
+				x = x[:n] // Reslice so that range visits exactly n elements.
+				for i := range x {
+					if diag == blas.NonUnit {
+						x[i] *= ap[kk]
+					}
+					if n-i-1 > 0 {
+						x[i] += c128.DotuUnitary(ap[kk+1:kk+n-i], x[i+1:])
+					}
+					kk += n - i // Row i of the packed upper triangle has n-i elements.
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[ix] *= ap[kk]
+					}
+					if n-i-1 > 0 {
+						x[ix] += c128.DotuInc(ap[kk+1:kk+n-i], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX))
+					}
+					ix += incX
+					kk += n - i
+				}
+			}
+		} else {
+			// kk points to the beginning of current row in ap, starting at the last row.
+			kk := n*(n+1)/2 - n
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[i] *= ap[kk+i]
+					}
+					if i > 0 {
+						x[i] += c128.DotuUnitary(ap[kk:kk+i], x[:i])
+					}
+					kk -= i // Row i-1 of the packed lower triangle has i elements.
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[ix] *= ap[kk+i]
+					}
+					if i > 0 {
+						x[ix] += c128.DotuInc(ap[kk:kk+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					ix -= incX
+					kk -= i
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.Trans {
+		// Form x = Aᵀ*x.
+		if uplo == blas.Upper {
+			// kk points to the current diagonal element in ap.
+			kk := n*(n+1)/2 - 1
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					xi := x[i] // Unscaled x[i], still needed after the diagonal scaling.
+					if diag == blas.NonUnit {
+						x[i] *= ap[kk]
+					}
+					if n-i-1 > 0 {
+						c128.AxpyUnitary(xi, ap[kk+1:kk+n-i], x[i+1:n])
+					}
+					kk -= n - i + 1 // Step back to the diagonal element of row i-1.
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					xi := x[ix]
+					if diag == blas.NonUnit {
+						x[ix] *= ap[kk]
+					}
+					if n-i-1 > 0 {
+						c128.AxpyInc(xi, ap[kk+1:kk+n-i], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX))
+					}
+					ix -= incX
+					kk -= n - i + 1
+				}
+			}
+		} else {
+			// kk points to the beginning of current row in ap.
+			kk := 0
+			if incX == 1 {
+				x = x[:n]
+				for i := range x {
+					if i > 0 {
+						c128.AxpyUnitary(x[i], ap[kk:kk+i], x[:i]) // Uses x[i] before it is scaled below.
+					}
+					if diag == blas.NonUnit {
+						x[i] *= ap[kk+i]
+					}
+					kk += i + 1
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						c128.AxpyInc(x[ix], ap[kk:kk+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= ap[kk+i]
+					}
+					ix += incX
+					kk += i + 1
+				}
+			}
+		}
+		return
+	}
+
+	// Form x = Aᴴ*x. Elements of ap are conjugated as they are read.
+	if uplo == blas.Upper {
+		// kk points to the current diagonal element in ap.
+		kk := n*(n+1)/2 - 1
+		if incX == 1 {
+			for i := n - 1; i >= 0; i-- {
+				xi := x[i]
+				if diag == blas.NonUnit {
+					x[i] *= cmplx.Conj(ap[kk])
+				}
+				k := kk + 1 // First off-diagonal element of row i.
+				for j := i + 1; j < n; j++ {
+					x[j] += xi * cmplx.Conj(ap[k])
+					k++
+				}
+				kk -= n - i + 1
+			}
+		} else {
+			ix := kx + (n-1)*incX
+			for i := n - 1; i >= 0; i-- {
+				xi := x[ix]
+				if diag == blas.NonUnit {
+					x[ix] *= cmplx.Conj(ap[kk])
+				}
+				jx := ix + incX
+				k := kk + 1
+				for j := i + 1; j < n; j++ {
+					x[jx] += xi * cmplx.Conj(ap[k])
+					jx += incX
+					k++
+				}
+				ix -= incX
+				kk -= n - i + 1
+			}
+		}
+	} else {
+		// kk points to the beginning of current row in ap.
+		kk := 0
+		if incX == 1 {
+			x = x[:n]
+			for i, xi := range x {
+				for j := 0; j < i; j++ {
+					x[j] += xi * cmplx.Conj(ap[kk+j])
+				}
+				if diag == blas.NonUnit {
+					x[i] *= cmplx.Conj(ap[kk+i])
+				}
+				kk += i + 1
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				xi := x[ix]
+				jx := kx // Restart at the first logical element of x for every row.
+				for j := 0; j < i; j++ {
+					x[jx] += xi * cmplx.Conj(ap[kk+j])
+					jx += incX
+				}
+				if diag == blas.NonUnit {
+					x[ix] *= cmplx.Conj(ap[kk+i])
+				}
+				ix += incX
+				kk += i + 1
+			}
+		}
+	}
+}
+
+// Ztpsv solves one of the systems of equations
+//
+//	A * x = b   if trans == blas.NoTrans
+//	Aᵀ * x = b  if trans == blas.Trans
+//	Aᴴ * x = b  if trans == blas.ConjTrans
+//
+// where b and x are n element vectors and A is an n×n triangular matrix in
+// packed form, supplied in ap.
+//
+// On entry, x contains the values of b, and the solution is
+// stored in-place into x. Ztpsv panics on invalid arguments.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func (Implementation) Ztpsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, ap []complex128, x []complex128, incX int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(ap) < n*(n+1)/2 { // A packed triangle of order n holds n*(n+1)/2 elements.
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X. It is non-zero only when incX is negative.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through ap.
+
+	if trans == blas.NoTrans {
+		// Form x = inv(A)*x, one row of A per iteration.
+		if uplo == blas.Upper {
+			kk := n*(n+1)/2 - 1 // Index of the last diagonal element in ap.
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					aii := ap[kk]
+					if n-i-1 > 0 {
+						x[i] -= c128.DotuUnitary(x[i+1:n], ap[kk+1:kk+n-i])
+					}
+					if diag == blas.NonUnit {
+						x[i] /= aii
+					}
+					kk -= n - i + 1 // Step back to the diagonal element of row i-1.
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					aii := ap[kk]
+					if n-i-1 > 0 {
+						x[ix] -= c128.DotuInc(x, ap[kk+1:kk+n-i], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0)
+					}
+					if diag == blas.NonUnit {
+						x[ix] /= aii
+					}
+					ix -= incX
+					kk -= n - i + 1
+				}
+			}
+		} else {
+			kk := 0 // Start of the current row of the packed lower triangle.
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						x[i] -= c128.DotuUnitary(x[:i], ap[kk:kk+i])
+					}
+					if diag == blas.NonUnit {
+						x[i] /= ap[kk+i]
+					}
+					kk += i + 1 // Row i of the packed lower triangle has i+1 elements.
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						x[ix] -= c128.DotuInc(x, ap[kk:kk+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0)
+					}
+					if diag == blas.NonUnit {
+						x[ix] /= ap[kk+i]
+					}
+					ix += incX
+					kk += i + 1
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.Trans {
+		// Form x = inv(Aᵀ)*x. Each solved x[j] is then eliminated from the remaining entries.
+		if uplo == blas.Upper {
+			kk := 0 // Index of the current diagonal element in ap.
+			if incX == 1 {
+				for j := 0; j < n; j++ {
+					if diag == blas.NonUnit {
+						x[j] /= ap[kk]
+					}
+					if n-j-1 > 0 {
+						c128.AxpyUnitary(-x[j], ap[kk+1:kk+n-j], x[j+1:n])
+					}
+					kk += n - j
+				}
+			} else {
+				jx := kx
+				for j := 0; j < n; j++ {
+					if diag == blas.NonUnit {
+						x[jx] /= ap[kk]
+					}
+					if n-j-1 > 0 {
+						c128.AxpyInc(-x[jx], ap[kk+1:kk+n-j], x, uintptr(n-j-1), 1, uintptr(incX), 0, uintptr(jx+incX))
+					}
+					jx += incX
+					kk += n - j
+				}
+			}
+		} else {
+			kk := n*(n+1)/2 - n // Start of the last row of the packed lower triangle.
+			if incX == 1 {
+				for j := n - 1; j >= 0; j-- {
+					if diag == blas.NonUnit {
+						x[j] /= ap[kk+j]
+					}
+					if j > 0 {
+						c128.AxpyUnitary(-x[j], ap[kk:kk+j], x[:j])
+					}
+					kk -= j
+				}
+			} else {
+				jx := kx + (n-1)*incX
+				for j := n - 1; j >= 0; j-- {
+					if diag == blas.NonUnit {
+						x[jx] /= ap[kk+j]
+					}
+					if j > 0 {
+						c128.AxpyInc(-x[jx], ap[kk:kk+j], x, uintptr(j), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					jx -= incX
+					kk -= j
+				}
+			}
+		}
+		return
+	}
+
+	// Form x = inv(Aᴴ)*x. Elements of ap are conjugated as they are read.
+	if uplo == blas.Upper {
+		kk := 0 // Index of the current diagonal element in ap.
+		if incX == 1 {
+			for j := 0; j < n; j++ {
+				if diag == blas.NonUnit {
+					x[j] /= cmplx.Conj(ap[kk])
+				}
+				xj := x[j]
+				k := kk + 1 // First off-diagonal element of row j.
+				for i := j + 1; i < n; i++ {
+					x[i] -= xj * cmplx.Conj(ap[k])
+					k++
+				}
+				kk += n - j
+			}
+		} else {
+			jx := kx
+			for j := 0; j < n; j++ {
+				if diag == blas.NonUnit {
+					x[jx] /= cmplx.Conj(ap[kk])
+				}
+				xj := x[jx]
+				ix := jx + incX
+				k := kk + 1
+				for i := j + 1; i < n; i++ {
+					x[ix] -= xj * cmplx.Conj(ap[k])
+					ix += incX
+					k++
+				}
+				jx += incX
+				kk += n - j
+			}
+		}
+	} else {
+		kk := n*(n+1)/2 - n // Start of the last row of the packed lower triangle.
+		if incX == 1 {
+			for j := n - 1; j >= 0; j-- {
+				if diag == blas.NonUnit {
+					x[j] /= cmplx.Conj(ap[kk+j])
+				}
+				xj := x[j]
+				for i := 0; i < j; i++ {
+					x[i] -= xj * cmplx.Conj(ap[kk+i])
+				}
+				kk -= j
+			}
+		} else {
+			jx := kx + (n-1)*incX
+			for j := n - 1; j >= 0; j-- {
+				if diag == blas.NonUnit {
+					x[jx] /= cmplx.Conj(ap[kk+j])
+				}
+				xj := x[jx]
+				ix := kx // Restart at the first logical element of x for every row.
+				for i := 0; i < j; i++ {
+					x[ix] -= xj * cmplx.Conj(ap[kk+i])
+					ix += incX
+				}
+				jx -= incX
+				kk -= j
+			}
+		}
+	}
+}
+
+// Ztrmv performs one of the matrix-vector operations
+//
+//	x = A * x   if trans = blas.NoTrans
+//	x = Aᵀ * x  if trans = blas.Trans
+//	x = Aᴴ * x  if trans = blas.ConjTrans
+//
+// where x is an n element vector, and A is an n×n triangular matrix stored in a with row stride lda.
+func (Implementation) Ztrmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, a []complex128, lda int, x []complex128, incX int) {
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+n { // The last row needs only its first n elements.
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X. It is non-zero only when incX is negative.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through A.
+
+	if trans == blas.NoTrans {
+		// Form x = A*x.
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+i]
+					}
+					if n-i-1 > 0 {
+						x[i] += c128.DotuUnitary(a[i*lda+i+1:i*lda+n], x[i+1:n])
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+i]
+					}
+					if n-i-1 > 0 {
+						x[ix] += c128.DotuInc(a[i*lda+i+1:i*lda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX))
+					}
+					ix += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+i]
+					}
+					if i > 0 {
+						x[i] += c128.DotuUnitary(a[i*lda:i*lda+i], x[:i])
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX // Index of the last logical element of x.
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+i]
+					}
+					if i > 0 {
+						x[ix] += c128.DotuInc(a[i*lda:i*lda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					ix -= incX
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.Trans {
+		// Form x = Aᵀ*x.
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					xi := x[i] // Unscaled x[i], still needed after the diagonal scaling.
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+i]
+					}
+					if n-i-1 > 0 {
+						c128.AxpyUnitary(xi, a[i*lda+i+1:i*lda+n], x[i+1:n])
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					xi := x[ix]
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+i]
+					}
+					if n-i-1 > 0 {
+						c128.AxpyInc(xi, a[i*lda+i+1:i*lda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX))
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						c128.AxpyUnitary(x[i], a[i*lda:i*lda+i], x[:i]) // Uses x[i] before it is scaled below.
+					}
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+i]
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						c128.AxpyInc(x[ix], a[i*lda:i*lda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+i]
+					}
+					ix += incX
+				}
+			}
+		}
+		return
+	}
+
+	// Form x = Aᴴ*x. Elements of a are conjugated as they are read.
+	if uplo == blas.Upper {
+		if incX == 1 {
+			for i := n - 1; i >= 0; i-- {
+				xi := x[i]
+				if diag == blas.NonUnit {
+					x[i] *= cmplx.Conj(a[i*lda+i])
+				}
+				for j := i + 1; j < n; j++ {
+					x[j] += xi * cmplx.Conj(a[i*lda+j])
+				}
+			}
+		} else {
+			ix := kx + (n-1)*incX
+			for i := n - 1; i >= 0; i-- {
+				xi := x[ix]
+				if diag == blas.NonUnit {
+					x[ix] *= cmplx.Conj(a[i*lda+i])
+				}
+				jx := ix + incX
+				for j := i + 1; j < n; j++ {
+					x[jx] += xi * cmplx.Conj(a[i*lda+j])
+					jx += incX
+				}
+				ix -= incX
+			}
+		}
+	} else {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				for j := 0; j < i; j++ {
+					x[j] += x[i] * cmplx.Conj(a[i*lda+j])
+				}
+				if diag == blas.NonUnit {
+					x[i] *= cmplx.Conj(a[i*lda+i])
+				}
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				jx := kx // Restart at the first logical element of x for every row.
+				for j := 0; j < i; j++ {
+					x[jx] += x[ix] * cmplx.Conj(a[i*lda+j])
+					jx += incX
+				}
+				if diag == blas.NonUnit {
+					x[ix] *= cmplx.Conj(a[i*lda+i])
+				}
+				ix += incX
+			}
+		}
+	}
+}
+
+// Ztrsv solves one of the systems of equations
+//
+//	A * x = b   if trans == blas.NoTrans
+//	Aᵀ * x = b  if trans == blas.Trans
+//	Aᴴ * x = b  if trans == blas.ConjTrans
+//
+// where b and x are n element vectors and A is an n×n triangular matrix stored in a with row stride lda.
+//
+// On entry, x contains the values of b, and the solution is
+// stored in-place into x. Ztrsv panics on invalid arguments.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func (Implementation) Ztrsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, a []complex128, lda int, x []complex128, incX int) {
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+n { // The last row needs only its first n elements.
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X. It is non-zero only when incX is negative.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through A.
+
+	if trans == blas.NoTrans {
+		// Form x = inv(A)*x, one row of A per iteration.
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					aii := a[i*lda+i]
+					if n-i-1 > 0 {
+						x[i] -= c128.DotuUnitary(x[i+1:n], a[i*lda+i+1:i*lda+n])
+					}
+					if diag == blas.NonUnit {
+						x[i] /= aii
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX // Index of the last logical element of x.
+				for i := n - 1; i >= 0; i-- {
+					aii := a[i*lda+i]
+					if n-i-1 > 0 {
+						x[ix] -= c128.DotuInc(x, a[i*lda+i+1:i*lda+n], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0)
+					}
+					if diag == blas.NonUnit {
+						x[ix] /= aii
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						x[i] -= c128.DotuUnitary(x[:i], a[i*lda:i*lda+i])
+					}
+					if diag == blas.NonUnit {
+						x[i] /= a[i*lda+i]
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						x[ix] -= c128.DotuInc(x, a[i*lda:i*lda+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0)
+					}
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda+i]
+					}
+					ix += incX
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.Trans {
+		// Form x = inv(Aᵀ)*x. Each solved x[j] is then eliminated from the remaining entries.
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for j := 0; j < n; j++ {
+					if diag == blas.NonUnit {
+						x[j] /= a[j*lda+j]
+					}
+					if n-j-1 > 0 {
+						c128.AxpyUnitary(-x[j], a[j*lda+j+1:j*lda+n], x[j+1:n])
+					}
+				}
+			} else {
+				jx := kx
+				for j := 0; j < n; j++ {
+					if diag == blas.NonUnit {
+						x[jx] /= a[j*lda+j]
+					}
+					if n-j-1 > 0 {
+						c128.AxpyInc(-x[jx], a[j*lda+j+1:j*lda+n], x, uintptr(n-j-1), 1, uintptr(incX), 0, uintptr(jx+incX))
+					}
+					jx += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for j := n - 1; j >= 0; j-- {
+					if diag == blas.NonUnit {
+						x[j] /= a[j*lda+j]
+					}
+					xj := x[j] // Solved value of x[j], after the division above.
+					if j > 0 {
+						c128.AxpyUnitary(-xj, a[j*lda:j*lda+j], x[:j])
+					}
+				}
+			} else {
+				jx := kx + (n-1)*incX
+				for j := n - 1; j >= 0; j-- {
+					if diag == blas.NonUnit {
+						x[jx] /= a[j*lda+j]
+					}
+					if j > 0 {
+						c128.AxpyInc(-x[jx], a[j*lda:j*lda+j], x, uintptr(j), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					jx -= incX
+				}
+			}
+		}
+		return
+	}
+
+	// Form x = inv(Aᴴ)*x. Elements of a are conjugated as they are read.
+	if uplo == blas.Upper {
+		if incX == 1 {
+			for j := 0; j < n; j++ {
+				if diag == blas.NonUnit {
+					x[j] /= cmplx.Conj(a[j*lda+j])
+				}
+				xj := x[j]
+				for i := j + 1; i < n; i++ {
+					x[i] -= xj * cmplx.Conj(a[j*lda+i])
+				}
+			}
+		} else {
+			jx := kx
+			for j := 0; j < n; j++ {
+				if diag == blas.NonUnit {
+					x[jx] /= cmplx.Conj(a[j*lda+j])
+				}
+				xj := x[jx]
+				ix := jx + incX
+				for i := j + 1; i < n; i++ {
+					x[ix] -= xj * cmplx.Conj(a[j*lda+i])
+					ix += incX
+				}
+				jx += incX
+			}
+		}
+	} else {
+		if incX == 1 {
+			for j := n - 1; j >= 0; j-- {
+				if diag == blas.NonUnit {
+					x[j] /= cmplx.Conj(a[j*lda+j])
+				}
+				xj := x[j]
+				for i := 0; i < j; i++ {
+					x[i] -= xj * cmplx.Conj(a[j*lda+i])
+				}
+			}
+		} else {
+			jx := kx + (n-1)*incX
+			for j := n - 1; j >= 0; j-- {
+				if diag == blas.NonUnit {
+					x[jx] /= cmplx.Conj(a[j*lda+j])
+				}
+				xj := x[jx]
+				ix := kx // Restart at the first logical element of x for every row.
+				for i := 0; i < j; i++ {
+					x[ix] -= xj * cmplx.Conj(a[j*lda+i])
+					ix += incX
+				}
+				jx -= incX
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go
new file mode 100644
index 00000000000..3ce67868cd7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go
@@ -0,0 +1,2976 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	cmplx "gonum.org/v1/gonum/internal/cmplx64"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c64"
+)
+
+var _ blas.Complex64Level2 = Implementation{} // Compile-time check that Implementation satisfies blas.Complex64Level2.
+
+// Cgbmv performs one of the matrix-vector operations
+//
+//	y = alpha * A * x + beta * y   if trans = blas.NoTrans
+//	y = alpha * Aᵀ * x + beta * y  if trans = blas.Trans
+//	y = alpha * Aᴴ * x + beta * y  if trans = blas.ConjTrans
+//
+// where alpha and beta are scalars, x and y are vectors, and A is an m×n band matrix
+// with kL sub-diagonals and kU super-diagonals.
+//
+// A is supplied in band storage: row i of the band occupies
+// a[i*lda : i*lda+kL+kU+1], with the diagonal element at offset kL.
+//
+// When alpha is zero only the scaling y = beta * y is performed and the
+// elements of a and x are not referenced.
+//
+// Cgbmv panics if an argument is invalid or if a, x or y is too short.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cgbmv(trans blas.Transpose, m, n, kL, kU int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) {
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if kL < 0 {
+		panic(kLLT0)
+	}
+	if kU < 0 {
+		panic(kULT0)
+	}
+	if lda < kL+kU+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(min(m, n+kL)-1)+kL+kU+1 {
+		panic(shortA)
+	}
+	// x has n elements and y has m for NoTrans; the roles swap otherwise.
+	var lenX, lenY int
+	if trans == blas.NoTrans {
+		lenX, lenY = n, m
+	} else {
+		lenX, lenY = m, n
+	}
+	if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Start indices in x and y. They are non-zero only for negative increments.
+	var kx int
+	if incX < 0 {
+		kx = (1 - lenX) * incX
+	}
+	var ky int
+	if incY < 0 {
+		ky = (1 - lenY) * incY
+	}
+
+	// Form y = beta*y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:lenY] {
+					y[i] = 0
+				}
+			} else {
+				c64.ScalUnitary(beta, y[:lenY])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < lenY; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				if incY > 0 {
+					c64.ScalInc(beta, y, uintptr(lenY), uintptr(incY))
+				} else {
+					c64.ScalInc(beta, y, uintptr(lenY), uintptr(-incY))
+				}
+			}
+		}
+	}
+
+	// With alpha == 0 the product term vanishes, so y = beta*y is the final
+	// result. Returning here matches Cgemv and Chbmv, skips a pass over the
+	// band, and keeps NaN or Inf values in a or x from reaching y.
+	if alpha == 0 {
+		return
+	}
+
+	// Only the first nRow rows of A hold stored band elements, and each row
+	// stores at most nCol of them.
+	nRow := min(m, n+kL)
+	nCol := kL + 1 + kU
+	switch trans {
+	case blas.NoTrans:
+		iy := ky
+		if incX == 1 {
+			for i := 0; i < nRow; i++ {
+				// [l, u) is the range of band columns of row i that lie
+				// inside the matrix; off is the matrix column of the first.
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL)
+				xtmp := x[off : off+u-l]
+				var sum complex64
+				for j, v := range aRow {
+					sum += xtmp[j] * v
+				}
+				y[iy] += alpha * sum
+				iy += incY
+			}
+		} else {
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incX
+				jx := kx
+				var sum complex64
+				for _, v := range aRow {
+					sum += x[off+jx] * v
+					jx += incX
+				}
+				y[iy] += alpha * sum
+				iy += incY
+			}
+		}
+	case blas.Trans:
+		// Row i of A contributes alpha*x[i]*A[i,j] to each y[j] it touches.
+		if incX == 1 {
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incY
+				alphaxi := alpha * x[i]
+				jy := ky
+				for _, v := range aRow {
+					y[off+jy] += alphaxi * v
+					jy += incY
+				}
+			}
+		} else {
+			ix := kx
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incY
+				alphaxi := alpha * x[ix]
+				jy := ky
+				for _, v := range aRow {
+					y[off+jy] += alphaxi * v
+					jy += incY
+				}
+				ix += incX
+			}
+		}
+	case blas.ConjTrans:
+		// Same traversal as blas.Trans with each element of A conjugated.
+		if incX == 1 {
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incY
+				alphaxi := alpha * x[i]
+				jy := ky
+				for _, v := range aRow {
+					y[off+jy] += alphaxi * cmplx.Conj(v)
+					jy += incY
+				}
+			}
+		} else {
+			ix := kx
+			for i := 0; i < nRow; i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				aRow := a[i*lda+l : i*lda+u]
+				off := max(0, i-kL) * incY
+				alphaxi := alpha * x[ix]
+				jy := ky
+				for _, v := range aRow {
+					y[off+jy] += alphaxi * cmplx.Conj(v)
+					jy += incY
+				}
+				ix += incX
+			}
+		}
+	}
+}
+
+// Cgemv computes one of the matrix-vector operations
+//
+//	y = alpha * A * x + beta * y   if trans = blas.NoTrans
+//	y = alpha * Aᵀ * x + beta * y  if trans = blas.Trans
+//	y = alpha * Aᴴ * x + beta * y  if trans = blas.ConjTrans
+//
+// for scalars alpha and beta, vectors x and y, and an m×n dense matrix A
+// stored in a with row stride lda.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cgemv(trans blas.Transpose, m, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) {
+	if trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	}
+
+	// An empty matrix leaves y untouched.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// x has n elements and y has m for NoTrans; the roles swap otherwise.
+	// The slice length checks below are only meaningful for a non-empty matrix.
+	lenX, lenY := m, n
+	if trans == blas.NoTrans {
+		lenX, lenY = n, m
+	}
+	switch {
+	case len(a) < lda*(m-1)+n:
+		panic(shortA)
+	case (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX):
+		panic(shortX)
+	case (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY):
+		panic(shortY)
+	}
+
+	// y = 0*A*x + 1*y is a no-op.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Start indices in x and y. They are non-zero only for negative increments.
+	var xStart, yStart int
+	if incX < 0 {
+		xStart = (1 - lenX) * incX
+	}
+	if incY < 0 {
+		yStart = (1 - lenY) * incY
+	}
+
+	// Form y = beta*y.
+	switch {
+	case beta == 1:
+		// y is already scaled.
+	case incY == 1 && beta == 0:
+		for i := range y[:lenY] {
+			y[i] = 0
+		}
+	case incY == 1:
+		c64.ScalUnitary(beta, y[:lenY])
+	case beta == 0:
+		for i, iy := 0, yStart; i < lenY; i, iy = i+1, iy+incY {
+			y[iy] = 0
+		}
+	case incY > 0:
+		c64.ScalInc(beta, y, uintptr(lenY), uintptr(incY))
+	default:
+		c64.ScalInc(beta, y, uintptr(lenY), uintptr(-incY))
+	}
+
+	if alpha == 0 {
+		return
+	}
+
+	switch trans {
+	case blas.Trans:
+		// Form y = alpha*Aᵀ*x + y, adding one scaled row of A per step.
+		for i, ix := 0, xStart; i < m; i, ix = i+1, ix+incX {
+			scaled := alpha * x[ix]
+			row := a[i*lda : i*lda+n]
+			if incY == 1 {
+				c64.AxpyUnitary(scaled, row, y[:n])
+			} else {
+				c64.AxpyInc(scaled, row, y, uintptr(n), 1, uintptr(incY), 0, uintptr(yStart))
+			}
+		}
+
+	case blas.ConjTrans:
+		// Form y = alpha*Aᴴ*x + y. With incY == 1 the start index is zero,
+		// so a single strided loop covers both the unit and non-unit cases.
+		for i, ix := 0, xStart; i < m; i, ix = i+1, ix+incX {
+			scaled := alpha * x[ix]
+			jy := yStart
+			for _, v := range a[i*lda : i*lda+n] {
+				y[jy] += scaled * cmplx.Conj(v)
+				jy += incY
+			}
+		}
+
+	default:
+		// Form y = alpha*A*x + y, one dot product per row of A. trans was
+		// validated above, so this branch is blas.NoTrans.
+		for i, iy := 0, yStart; i < m; i, iy = i+1, iy+incY {
+			row := a[i*lda : i*lda+n]
+			var dot complex64
+			if incX == 1 {
+				dot = c64.DotuUnitary(row, x[:n])
+			} else {
+				dot = c64.DotuInc(row, x, uintptr(n), 1, uintptr(incX), 0, uintptr(xStart))
+			}
+			y[iy] += alpha * dot
+		}
+	}
+}
+
+// Cgerc applies the conjugated rank-one update
+//
+//	A += alpha * x * yᴴ
+//
+// to the m×n dense matrix A, where alpha is a scalar, x is a vector of m
+// elements and y is a vector of n elements.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cgerc(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	}
+
+	// An empty matrix needs no update.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// The slice length checks are only meaningful for a non-empty matrix.
+	switch {
+	case (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX):
+		panic(shortX)
+	case (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY):
+		panic(shortY)
+	case len(a) < lda*(m-1)+n:
+		panic(shortA)
+	}
+
+	// A zero alpha leaves A unchanged.
+	if alpha == 0 {
+		return
+	}
+
+	// Start indices in x and y. They are non-zero only for negative increments.
+	var xStart, yStart int
+	if incX < 0 {
+		xStart = (1 - m) * incX
+	}
+	if incY < 0 {
+		yStart = (1 - n) * incY
+	}
+
+	// Update A one column at a time: column j starts at a[j] and its
+	// consecutive elements are lda apart.
+	for j, jy := 0, yStart; j < n; j, jy = j+1, jy+incY {
+		yj := y[jy]
+		if yj == 0 {
+			continue
+		}
+		c64.AxpyInc(alpha*cmplx.Conj(yj), x, a[j:], uintptr(m), uintptr(incX), uintptr(lda), uintptr(xStart), 0)
+	}
+}
+
+// Cgeru applies the unconjugated rank-one update
+//
+//	A += alpha * x * yᵀ
+//
+// to the m×n dense matrix A, where alpha is a scalar, x is a vector of m
+// elements and y is a vector of n elements.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cgeru(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	}
+
+	// An empty matrix needs no update.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// The slice length checks are only meaningful for a non-empty matrix.
+	switch {
+	case (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX):
+		panic(shortX)
+	case (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY):
+		panic(shortY)
+	case len(a) < lda*(m-1)+n:
+		panic(shortA)
+	}
+
+	// A zero alpha leaves A unchanged.
+	if alpha == 0 {
+		return
+	}
+
+	// Start indices in x and y. They are non-zero only for negative increments.
+	var xStart, yStart int
+	if incX < 0 {
+		xStart = (1 - m) * incX
+	}
+	if incY < 0 {
+		yStart = (1 - n) * incY
+	}
+
+	// Update A one row at a time, skipping rows whose x element is zero.
+	for i, ix := 0, xStart; i < m; i, ix = i+1, ix+incX {
+		xi := x[ix]
+		if xi == 0 {
+			continue
+		}
+		row := a[i*lda : i*lda+n]
+		if incY == 1 {
+			c64.AxpyUnitary(alpha*xi, y[:n], row)
+		} else {
+			c64.AxpyInc(alpha*xi, y, row, uintptr(n), uintptr(incY), 1, uintptr(yStart), 0)
+		}
+	}
+}
+
+// Chbmv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where alpha and beta are scalars, x and y are vectors, and A is an n×n
+// Hermitian band matrix with k super-diagonals. The imaginary parts of
+// the diagonal elements of A are ignored and assumed to be zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Chbmv(uplo blas.Uplo, n, k int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 { // each band row holds the diagonal plus k off-diagonal entries
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible: with n == 0 there is nothing to compute.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible: alpha == 0 and beta == 1 leave y unchanged.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Set up the start indices in X and Y. A negative increment starts at the highest index used.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	var ky int
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+
+	// Form y = beta*y. For beta == 0 zeros are stored instead of multiplying.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				for i, v := range y[:n] {
+					y[i] = beta * v
+				}
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					y[iy] = beta * y[iy]
+					iy += incY
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // y = beta*y is already the complete result
+		return
+	}
+
+	// The elements of A are accessed sequentially with one pass through a.
+	switch uplo {
+	case blas.Upper:
+		iy := ky
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				aRow := a[i*lda:] // band row i: aRow[0] is the diagonal, aRow[j] pairs with x[i+j]
+				alphaxi := alpha * x[i]
+				sum := alphaxi * complex(real(aRow[0]), 0)
+				u := min(k+1, n-i) // clip the band at the last column of the matrix
+				jy := incY
+				for j := 1; j < u; j++ {
+					v := aRow[j]
+					sum += alpha * x[i+j] * v
+					y[iy+jy] += alphaxi * cmplx.Conj(v) // mirrored (unstored) element uses the conjugate
+					jy += incY
+				}
+				y[iy] += sum
+				iy += incY
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				aRow := a[i*lda:]
+				alphaxi := alpha * x[ix]
+				sum := alphaxi * complex(real(aRow[0]), 0) // only the real part of the diagonal is used
+				u := min(k+1, n-i)
+				jx := incX
+				jy := incY
+				for j := 1; j < u; j++ {
+					v := aRow[j]
+					sum += alpha * x[ix+jx] * v
+					y[iy+jy] += alphaxi * cmplx.Conj(v)
+					jx += incX
+					jy += incY
+				}
+				y[iy] += sum
+				ix += incX
+				iy += incY
+			}
+		}
+	case blas.Lower:
+		iy := ky
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				l := max(0, k-i) // first band position of row i that is read
+				alphaxi := alpha * x[i]
+				jy := l * incY
+				aRow := a[i*lda:]
+				for j := l; j < k; j++ {
+					v := aRow[j] // band entry that pairs with x[i-k+j]
+					y[iy] += alpha * v * x[i-k+j]
+					y[iy-k*incY+jy] += alphaxi * cmplx.Conj(v) // mirrored (unstored) element uses the conjugate
+					jy += incY
+				}
+				y[iy] += alphaxi * complex(real(aRow[k]), 0) // diagonal sits at aRow[k]; only its real part is used
+				iy += incY
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				l := max(0, k-i)
+				alphaxi := alpha * x[ix]
+				jx := l * incX
+				jy := l * incY
+				aRow := a[i*lda:]
+				for j := l; j < k; j++ {
+					v := aRow[j]
+					y[iy] += alpha * v * x[ix-k*incX+jx]
+					y[iy-k*incY+jy] += alphaxi * cmplx.Conj(v)
+					jx += incX
+					jy += incY
+				}
+				y[iy] += alphaxi * complex(real(aRow[k]), 0)
+				ix += incX
+				iy += incY
+			}
+		}
+	}
+}
+
+// Chemv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where alpha and beta are scalars, x and y are vectors, and A is an n×n
+// Hermitian matrix. The imaginary parts of the diagonal elements of A are
+// ignored and assumed to be zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Chemv(uplo blas.Uplo, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible: with n == 0 there is nothing to compute.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible: alpha == 0 and beta == 1 leave y unchanged.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Set up the start indices in X and Y. A negative increment starts at the highest index used.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	var ky int
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+
+	// Form y = beta*y. For beta == 0 zeros are stored instead of multiplying.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				for i, v := range y[:n] {
+					y[i] = beta * v
+				}
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					y[iy] = beta * y[iy]
+					iy += incY
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // y = beta*y is already the complete result
+		return
+	}
+
+	// The elements of A are accessed sequentially with one pass through
+	// the triangular part of A.
+
+	if uplo == blas.Upper {
+		// Form y when A is stored in upper triangle.
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				tmp1 := alpha * x[i]
+				var tmp2 complex64 // accumulates the off-diagonal part of row i times x
+				for j := i + 1; j < n; j++ {
+					y[j] += tmp1 * cmplx.Conj(a[i*lda+j]) // mirrored (unstored) element uses the conjugate
+					tmp2 += a[i*lda+j] * x[j]
+				}
+				aii := complex(real(a[i*lda+i]), 0) // the imaginary part of the diagonal is dropped
+				y[i] += tmp1*aii + alpha*tmp2
+			}
+		} else {
+			ix := kx
+			iy := ky
+			for i := 0; i < n; i++ {
+				tmp1 := alpha * x[ix]
+				var tmp2 complex64
+				jx := ix // jx and jy start at the diagonal position and are advanced before use
+				jy := iy
+				for j := i + 1; j < n; j++ {
+					jx += incX
+					jy += incY
+					y[jy] += tmp1 * cmplx.Conj(a[i*lda+j])
+					tmp2 += a[i*lda+j] * x[jx]
+				}
+				aii := complex(real(a[i*lda+i]), 0)
+				y[iy] += tmp1*aii + alpha*tmp2
+				ix += incX
+				iy += incY
+			}
+		}
+		return
+	}
+
+	// Form y when A is stored in lower triangle.
+	if incX == 1 && incY == 1 {
+		for i := 0; i < n; i++ {
+			tmp1 := alpha * x[i]
+			var tmp2 complex64 // accumulates the off-diagonal part of row i times x
+			for j := 0; j < i; j++ {
+				y[j] += tmp1 * cmplx.Conj(a[i*lda+j]) // mirrored (unstored) element uses the conjugate
+				tmp2 += a[i*lda+j] * x[j]
+			}
+			aii := complex(real(a[i*lda+i]), 0) // the imaginary part of the diagonal is dropped
+			y[i] += tmp1*aii + alpha*tmp2
+		}
+	} else {
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			tmp1 := alpha * x[ix]
+			var tmp2 complex64
+			jx := kx // columns 0..i-1 are walked from the start positions of x and y
+			jy := ky
+			for j := 0; j < i; j++ {
+				y[jy] += tmp1 * cmplx.Conj(a[i*lda+j])
+				tmp2 += a[i*lda+j] * x[jx]
+				jx += incX
+				jy += incY
+			}
+			aii := complex(real(a[i*lda+i]), 0)
+			y[iy] += tmp1*aii + alpha*tmp2
+			ix += incX
+			iy += incY
+		}
+	}
+}
+
+// Cher performs the Hermitian rank-one operation
+//
+//	A += alpha * x * xᴴ
+//
+// where A is an n×n Hermitian matrix, alpha is a real scalar, and x is an n
+// element vector. On entry, the imaginary parts of the diagonal elements of A
+// are ignored and assumed to be zero, on return they will be set to zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cher(uplo blas.Uplo, n int, alpha float32, x []complex64, incX int, a []complex64, lda int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible: with n == 0 there is nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible: with alpha == 0 a is left untouched.
+	if alpha == 0 {
+		return
+	}
+
+	var kx int
+	if incX < 0 { // a negative increment starts at the highest index used
+		kx = (1 - n) * incX
+	}
+	if uplo == blas.Upper {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				if x[i] != 0 {
+					tmp := complex(alpha*real(x[i]), alpha*imag(x[i])) // alpha * x[i] with real alpha
+					aii := real(a[i*lda+i])
+					xtmp := real(tmp * cmplx.Conj(x[i]))
+					a[i*lda+i] = complex(aii+xtmp, 0) // the diagonal is written back with zero imaginary part
+					for j := i + 1; j < n; j++ {
+						a[i*lda+j] += tmp * cmplx.Conj(x[j])
+					}
+				} else { // x[i] == 0: only the imaginary part of the diagonal is cleared
+					aii := real(a[i*lda+i])
+					a[i*lda+i] = complex(aii, 0)
+				}
+			}
+			return
+		}
+
+		ix := kx
+		for i := 0; i < n; i++ {
+			if x[ix] != 0 {
+				tmp := complex(alpha*real(x[ix]), alpha*imag(x[ix]))
+				aii := real(a[i*lda+i])
+				xtmp := real(tmp * cmplx.Conj(x[ix]))
+				a[i*lda+i] = complex(aii+xtmp, 0)
+				jx := ix + incX // position in x of column i+1
+				for j := i + 1; j < n; j++ {
+					a[i*lda+j] += tmp * cmplx.Conj(x[jx])
+					jx += incX
+				}
+			} else {
+				aii := real(a[i*lda+i])
+				a[i*lda+i] = complex(aii, 0)
+			}
+			ix += incX
+		}
+		return
+	}
+
+	if incX == 1 { // lower triangle, unit stride
+		for i := 0; i < n; i++ {
+			if x[i] != 0 {
+				tmp := complex(alpha*real(x[i]), alpha*imag(x[i])) // alpha * x[i] with real alpha
+				for j := 0; j < i; j++ {
+					a[i*lda+j] += tmp * cmplx.Conj(x[j])
+				}
+				aii := real(a[i*lda+i])
+				xtmp := real(tmp * cmplx.Conj(x[i]))
+				a[i*lda+i] = complex(aii+xtmp, 0) // the diagonal is written back with zero imaginary part
+			} else {
+				aii := real(a[i*lda+i])
+				a[i*lda+i] = complex(aii, 0)
+			}
+		}
+		return
+	}
+
+	ix := kx
+	for i := 0; i < n; i++ {
+		if x[ix] != 0 {
+			tmp := complex(alpha*real(x[ix]), alpha*imag(x[ix]))
+			jx := kx // columns 0..i-1 are walked from the start position of x
+			for j := 0; j < i; j++ {
+				a[i*lda+j] += tmp * cmplx.Conj(x[jx])
+				jx += incX
+			}
+			aii := real(a[i*lda+i])
+			xtmp := real(tmp * cmplx.Conj(x[ix]))
+			a[i*lda+i] = complex(aii+xtmp, 0)
+
+		} else {
+			aii := real(a[i*lda+i])
+			a[i*lda+i] = complex(aii, 0)
+		}
+		ix += incX
+	}
+}
+
+// Cher2 performs the Hermitian rank-two operation
+//
+//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ
+//
+// where alpha is a scalar, x and y are n element vectors and A is an n×n
+// Hermitian matrix. On entry, the imaginary parts of the diagonal elements are
+// ignored and assumed to be zero. On return they will be set to zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cher2(uplo blas.Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible: with n == 0 there is nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible: with alpha == 0 a is left untouched.
+	if alpha == 0 {
+		return
+	}
+
+	var kx, ky int
+	var ix, iy int
+	if incX != 1 || incY != 1 { // start indices are only used by the strided paths
+		if incX < 0 {
+			kx = (1 - n) * incX
+		}
+		if incY < 0 {
+			ky = (1 - n) * incY
+		}
+		ix = kx
+		iy = ky
+	}
+	if uplo == blas.Upper {
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				if x[i] != 0 || y[i] != 0 {
+					tmp1 := alpha * x[i]
+					tmp2 := cmplx.Conj(alpha) * y[i]
+					aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i]))
+					a[i*lda+i] = complex(aii, 0) // the diagonal is written back with zero imaginary part
+					for j := i + 1; j < n; j++ {
+						a[i*lda+j] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j])
+					}
+				} else { // both x[i] and y[i] are zero: only the imaginary part of the diagonal is cleared
+					aii := real(a[i*lda+i])
+					a[i*lda+i] = complex(aii, 0)
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			if x[ix] != 0 || y[iy] != 0 {
+				tmp1 := alpha * x[ix]
+				tmp2 := cmplx.Conj(alpha) * y[iy]
+				aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix]))
+				a[i*lda+i] = complex(aii, 0)
+				jx := ix + incX // positions in x and y of column i+1
+				jy := iy + incY
+				for j := i + 1; j < n; j++ {
+					a[i*lda+j] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx])
+					jx += incX
+					jy += incY
+				}
+			} else {
+				aii := real(a[i*lda+i])
+				a[i*lda+i] = complex(aii, 0)
+			}
+			ix += incX
+			iy += incY
+		}
+		return
+	}
+
+	if incX == 1 && incY == 1 { // lower triangle, unit strides
+		for i := 0; i < n; i++ {
+			if x[i] != 0 || y[i] != 0 {
+				tmp1 := alpha * x[i]
+				tmp2 := cmplx.Conj(alpha) * y[i]
+				for j := 0; j < i; j++ {
+					a[i*lda+j] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j])
+				}
+				aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i]))
+				a[i*lda+i] = complex(aii, 0) // the diagonal is written back with zero imaginary part
+			} else {
+				aii := real(a[i*lda+i])
+				a[i*lda+i] = complex(aii, 0)
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		if x[ix] != 0 || y[iy] != 0 {
+			tmp1 := alpha * x[ix]
+			tmp2 := cmplx.Conj(alpha) * y[iy]
+			jx := kx // columns 0..i-1 are walked from the start positions of x and y
+			jy := ky
+			for j := 0; j < i; j++ {
+				a[i*lda+j] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx])
+				jx += incX
+				jy += incY
+			}
+			aii := real(a[i*lda+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix]))
+			a[i*lda+i] = complex(aii, 0)
+		} else {
+			aii := real(a[i*lda+i])
+			a[i*lda+i] = complex(aii, 0)
+		}
+		ix += incX
+		iy += incY
+	}
+}
+
+// Chpmv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where alpha and beta are scalars, x and y are vectors, and A is an n×n
+// Hermitian matrix in packed form. The imaginary parts of the diagonal
+// elements of A are ignored and assumed to be zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Chpmv(uplo blas.Uplo, n int, alpha complex64, ap []complex64, x []complex64, incX int, beta complex64, y []complex64, incY int) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible: with n == 0 there is nothing to compute.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(ap) < n*(n+1)/2 { // packed storage holds one triangle: n*(n+1)/2 elements
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible: alpha == 0 and beta == 1 leave y unchanged.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Set up the start indices in X and Y. A negative increment starts at the highest index used.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	var ky int
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+
+	// Form y = beta*y. For beta == 0 zeros are stored instead of multiplying.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				for i, v := range y[:n] {
+					y[i] = beta * v
+				}
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					y[iy] *= beta
+					iy += incY
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // y = beta*y is already the complete result
+		return
+	}
+
+	// The elements of A are accessed sequentially with one pass through ap.
+
+	var kk int
+	if uplo == blas.Upper {
+		// Form y when ap contains the upper triangle.
+		// Here, kk points to the current diagonal element in ap.
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				tmp1 := alpha * x[i]
+				y[i] += tmp1 * complex(real(ap[kk]), 0) // only the real part of the diagonal is used
+				var tmp2 complex64
+				k := kk + 1 // first off-diagonal element of row i
+				for j := i + 1; j < n; j++ {
+					y[j] += tmp1 * cmplx.Conj(ap[k]) // mirrored (unstored) element uses the conjugate
+					tmp2 += ap[k] * x[j]
+					k++
+				}
+				y[i] += alpha * tmp2
+				kk += n - i // row i of the packed upper triangle has n-i elements
+			}
+		} else {
+			ix := kx
+			iy := ky
+			for i := 0; i < n; i++ {
+				tmp1 := alpha * x[ix]
+				y[iy] += tmp1 * complex(real(ap[kk]), 0)
+				var tmp2 complex64
+				jx := ix // jx and jy start at the diagonal position and are advanced before use
+				jy := iy
+				for k := kk + 1; k < kk+n-i; k++ {
+					jx += incX
+					jy += incY
+					y[jy] += tmp1 * cmplx.Conj(ap[k])
+					tmp2 += ap[k] * x[jx]
+				}
+				y[iy] += alpha * tmp2
+				ix += incX
+				iy += incY
+				kk += n - i
+			}
+		}
+		return
+	}
+
+	// Form y when ap contains the lower triangle.
+	// Here, kk points to the beginning of current row in ap.
+	if incX == 1 && incY == 1 {
+		for i := 0; i < n; i++ {
+			tmp1 := alpha * x[i]
+			var tmp2 complex64
+			k := kk
+			for j := 0; j < i; j++ {
+				y[j] += tmp1 * cmplx.Conj(ap[k]) // mirrored (unstored) element uses the conjugate
+				tmp2 += ap[k] * x[j]
+				k++
+			}
+			aii := complex(real(ap[kk+i]), 0) // the diagonal is the last element of row i
+			y[i] += tmp1*aii + alpha*tmp2
+			kk += i + 1 // row i of the packed lower triangle has i+1 elements
+		}
+	} else {
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			tmp1 := alpha * x[ix]
+			var tmp2 complex64
+			jx := kx // columns 0..i-1 are walked from the start positions of x and y
+			jy := ky
+			for k := kk; k < kk+i; k++ {
+				y[jy] += tmp1 * cmplx.Conj(ap[k])
+				tmp2 += ap[k] * x[jx]
+				jx += incX
+				jy += incY
+			}
+			aii := complex(real(ap[kk+i]), 0)
+			y[iy] += tmp1*aii + alpha*tmp2
+			ix += incX
+			iy += incY
+			kk += i + 1
+		}
+	}
+}
+
+// Chpr performs the Hermitian rank-1 operation
+//
+//	A += alpha * x * xᴴ
+//
+// where alpha is a real scalar, x is a vector, and A is an n×n Hermitian matrix
+// in packed form. On entry, the imaginary parts of the diagonal elements are
+// assumed to be zero, and on return they are set to zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Chpr(uplo blas.Uplo, n int, alpha float32, x []complex64, incX int, ap []complex64) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible: with n == 0 there is nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if len(ap) < n*(n+1)/2 { // packed storage holds one triangle: n*(n+1)/2 elements
+		panic(shortAP)
+	}
+
+	// Quick return if possible: with alpha == 0 ap is left untouched.
+	if alpha == 0 {
+		return
+	}
+
+	// Set up start index in X. A negative increment starts at the highest index used.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through ap.
+
+	var kk int
+	if uplo == blas.Upper {
+		// Form A when upper triangle is stored in AP.
+		// Here, kk points to the current diagonal element in ap.
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				xi := x[i]
+				if xi != 0 {
+					aii := real(ap[kk]) + alpha*real(cmplx.Conj(xi)*xi)
+					ap[kk] = complex(aii, 0) // the diagonal is written back with zero imaginary part
+
+					tmp := complex(alpha, 0) * xi
+					a := ap[kk+1 : kk+n-i] // off-diagonal part of row i
+					x := x[i+1 : n]
+					for j, v := range x { // x here is the local sub-slice, not the parameter
+						a[j] += tmp * cmplx.Conj(v)
+					}
+				} else { // xi == 0: only the imaginary part of the diagonal is cleared
+					ap[kk] = complex(real(ap[kk]), 0)
+				}
+				kk += n - i // row i of the packed upper triangle has n-i elements
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				xi := x[ix]
+				if xi != 0 {
+					aii := real(ap[kk]) + alpha*real(cmplx.Conj(xi)*xi)
+					ap[kk] = complex(aii, 0)
+
+					tmp := complex(alpha, 0) * xi
+					jx := ix + incX // position in x of column i+1
+					a := ap[kk+1 : kk+n-i]
+					for k := range a {
+						a[k] += tmp * cmplx.Conj(x[jx])
+						jx += incX
+					}
+				} else {
+					ap[kk] = complex(real(ap[kk]), 0)
+				}
+				ix += incX
+				kk += n - i
+			}
+		}
+		return
+	}
+
+	// Form A when lower triangle is stored in AP.
+	// Here, kk points to the beginning of current row in ap.
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			xi := x[i]
+			if xi != 0 {
+				tmp := complex(alpha, 0) * xi
+				a := ap[kk : kk+i] // off-diagonal part of row i
+				for j, v := range x[:i] {
+					a[j] += tmp * cmplx.Conj(v)
+				}
+
+				aii := real(ap[kk+i]) + alpha*real(cmplx.Conj(xi)*xi)
+				ap[kk+i] = complex(aii, 0) // the diagonal is the last element of row i
+			} else {
+				ap[kk+i] = complex(real(ap[kk+i]), 0)
+			}
+			kk += i + 1 // row i of the packed lower triangle has i+1 elements
+		}
+	} else {
+		ix := kx
+		for i := 0; i < n; i++ {
+			xi := x[ix]
+			if xi != 0 {
+				tmp := complex(alpha, 0) * xi
+				a := ap[kk : kk+i]
+				jx := kx // columns 0..i-1 are walked from the start position of x
+				for k := range a {
+					a[k] += tmp * cmplx.Conj(x[jx])
+					jx += incX
+				}
+
+				aii := real(ap[kk+i]) + alpha*real(cmplx.Conj(xi)*xi)
+				ap[kk+i] = complex(aii, 0)
+			} else {
+				ap[kk+i] = complex(real(ap[kk+i]), 0)
+			}
+			ix += incX
+			kk += i + 1
+		}
+	}
+}
+
+// Chpr2 performs the Hermitian rank-2 operation
+//
+//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ
+//
+// where alpha is a complex scalar, x and y are n element vectors, and A is an
+// n×n Hermitian matrix, supplied in packed form. On entry, the imaginary parts
+// of the diagonal elements are assumed to be zero, and on return they are set to zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Chpr2(uplo blas.Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, ap []complex64) {
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible: with n == 0 there is nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(ap) < n*(n+1)/2 { // packed storage holds one triangle: n*(n+1)/2 elements
+		panic(shortAP)
+	}
+
+	// Quick return if possible: with alpha == 0 ap is left untouched.
+	if alpha == 0 {
+		return
+	}
+
+	// Set up start indices in X and Y. A negative increment starts at the highest index used.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	var ky int
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+
+	// The elements of A are accessed sequentially with one pass through ap.
+
+	var kk int
+	if uplo == blas.Upper {
+		// Form A when upper triangle is stored in AP.
+		// Here, kk points to the current diagonal element in ap.
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				if x[i] != 0 || y[i] != 0 {
+					tmp1 := alpha * x[i]
+					tmp2 := cmplx.Conj(alpha) * y[i]
+					aii := real(ap[kk]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i]))
+					ap[kk] = complex(aii, 0) // the diagonal is written back with zero imaginary part
+					k := kk + 1
+					for j := i + 1; j < n; j++ {
+						ap[k] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j])
+						k++
+					}
+				} else { // both x[i] and y[i] are zero: only the imaginary part of the diagonal is cleared
+					ap[kk] = complex(real(ap[kk]), 0)
+				}
+				kk += n - i // row i of the packed upper triangle has n-i elements
+			}
+		} else {
+			ix := kx
+			iy := ky
+			for i := 0; i < n; i++ {
+				if x[ix] != 0 || y[iy] != 0 {
+					tmp1 := alpha * x[ix]
+					tmp2 := cmplx.Conj(alpha) * y[iy]
+					aii := real(ap[kk]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix]))
+					ap[kk] = complex(aii, 0)
+					jx := ix + incX // positions in x and y of column i+1
+					jy := iy + incY
+					for k := kk + 1; k < kk+n-i; k++ {
+						ap[k] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx])
+						jx += incX
+						jy += incY
+					}
+				} else {
+					ap[kk] = complex(real(ap[kk]), 0)
+				}
+				ix += incX
+				iy += incY
+				kk += n - i
+			}
+		}
+		return
+	}
+
+	// Form A when lower triangle is stored in AP.
+	// Here, kk points to the beginning of current row in ap.
+	if incX == 1 && incY == 1 {
+		for i := 0; i < n; i++ {
+			if x[i] != 0 || y[i] != 0 {
+				tmp1 := alpha * x[i]
+				tmp2 := cmplx.Conj(alpha) * y[i]
+				k := kk
+				for j := 0; j < i; j++ {
+					ap[k] += tmp1*cmplx.Conj(y[j]) + tmp2*cmplx.Conj(x[j])
+					k++
+				}
+				aii := real(ap[kk+i]) + real(tmp1*cmplx.Conj(y[i])) + real(tmp2*cmplx.Conj(x[i]))
+				ap[kk+i] = complex(aii, 0) // the diagonal is the last element of row i
+			} else {
+				ap[kk+i] = complex(real(ap[kk+i]), 0)
+			}
+			kk += i + 1 // row i of the packed lower triangle has i+1 elements
+		}
+	} else {
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			if x[ix] != 0 || y[iy] != 0 {
+				tmp1 := alpha * x[ix]
+				tmp2 := cmplx.Conj(alpha) * y[iy]
+				jx := kx // columns 0..i-1 are walked from the start positions of x and y
+				jy := ky
+				for k := kk; k < kk+i; k++ {
+					ap[k] += tmp1*cmplx.Conj(y[jy]) + tmp2*cmplx.Conj(x[jx])
+					jx += incX
+					jy += incY
+				}
+				aii := real(ap[kk+i]) + real(tmp1*cmplx.Conj(y[iy])) + real(tmp2*cmplx.Conj(x[ix]))
+				ap[kk+i] = complex(aii, 0)
+			} else {
+				ap[kk+i] = complex(real(ap[kk+i]), 0)
+			}
+			ix += incX
+			iy += incY
+			kk += i + 1
+		}
+	}
+}
+
+// Ctbmv performs one of the matrix-vector operations
+//
+//	x = A * x   if trans = blas.NoTrans
+//	x = Aᵀ * x  if trans = blas.Trans
+//	x = Aᴴ * x  if trans = blas.ConjTrans
+//
+// where x is an n element vector and A is an n×n triangular band matrix, with
+// (k+1) diagonals.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ctbmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, k int, a []complex64, lda int, x []complex64, incX int) {
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 { // each band row holds the diagonal plus k off-diagonal entries
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible: with n == 0 there is nothing to compute.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X. A negative increment starts at the highest index used.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	switch trans {
+	case blas.NoTrans:
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ { // ascending i: entries after x[i] still hold input values
+					xi := x[i]
+					if diag == blas.NonUnit { // for blas.Unit the stored diagonal is not read
+						xi *= a[i*lda]
+					}
+					kk := min(k, n-i-1) // number of off-diagonal band entries used in row i
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						xi += x[i+j+1] * aij
+					}
+					x[i] = xi
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					xi := x[ix]
+					if diag == blas.NonUnit {
+						xi *= a[i*lda]
+					}
+					kk := min(k, n-i-1)
+					jx := ix + incX
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						xi += x[jx] * aij
+						jx += incX
+					}
+					x[ix] = xi
+					ix += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- { // descending i: entries before x[i] still hold input values
+					xi := x[i]
+					if diag == blas.NonUnit {
+						xi *= a[i*lda+k] // the diagonal of a lower band row sits at offset k
+					}
+					kk := min(k, i) // number of off-diagonal band entries used in row i
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						xi += x[i-kk+j] * aij
+					}
+					x[i] = xi
+				}
+			} else {
+				ix := kx + (n-1)*incX // position in x of the last logical element
+				for i := n - 1; i >= 0; i-- {
+					xi := x[ix]
+					if diag == blas.NonUnit {
+						xi *= a[i*lda+k]
+					}
+					kk := min(k, i)
+					jx := ix - kk*incX
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						xi += x[jx] * aij
+						jx += incX
+					}
+					x[ix] = xi
+					ix -= incX
+				}
+			}
+		}
+	case blas.Trans:
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- { // descending i: x[i] is still an input value when read
+					kk := min(k, n-i-1)
+					xi := x[i]
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[i+j+1] += xi * aij // row i of A is scattered into the later entries of x
+					}
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda]
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					kk := min(k, n-i-1)
+					jx := ix + incX
+					xi := x[ix]
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[jx] += xi * aij
+						jx += incX
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda]
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := 0; i < n; i++ { // ascending i: x[i] is still an input value when read
+					kk := min(k, i)
+					xi := x[i]
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[i-kk+j] += xi * aij // row i of A is scattered into the earlier entries of x
+					}
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+k]
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					kk := min(k, i)
+					jx := ix - kk*incX
+					xi := x[ix]
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[jx] += xi * aij
+						jx += incX
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+k]
+					}
+					ix += incX
+				}
+			}
+		}
+	case blas.ConjTrans: // same traversal as blas.Trans with every element of A conjugated
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					kk := min(k, n-i-1)
+					xi := x[i]
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[i+j+1] += xi * cmplx.Conj(aij)
+					}
+					if diag == blas.NonUnit {
+						x[i] *= cmplx.Conj(a[i*lda])
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					kk := min(k, n-i-1)
+					jx := ix + incX
+					xi := x[ix]
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[jx] += xi * cmplx.Conj(aij)
+						jx += incX
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= cmplx.Conj(a[i*lda])
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					kk := min(k, i)
+					xi := x[i]
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[i-kk+j] += xi * cmplx.Conj(aij)
+					}
+					if diag == blas.NonUnit {
+						x[i] *= cmplx.Conj(a[i*lda+k])
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					kk := min(k, i)
+					jx := ix - kk*incX
+					xi := x[ix]
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[jx] += xi * cmplx.Conj(aij)
+						jx += incX
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= cmplx.Conj(a[i*lda+k])
+					}
+					ix += incX
+				}
+			}
+		}
+	}
+}
+
+// Ctbsv solves one of the systems of equations
+//
+//	A * x = b   if trans == blas.NoTrans
+//	Aᵀ * x = b  if trans == blas.Trans
+//	Aᴴ * x = b  if trans == blas.ConjTrans
+//
+// where b and x are n element vectors and A is an n×n triangular band matrix
+// with (k+1) diagonals.
+//
+// On entry, x contains the values of b, and the solution is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ctbsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, k int, a []complex64, lda int, x []complex64, incX int) {
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 { // each band row holds the diagonal plus k off-diagonal entries
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible: with n == 0 there is nothing to solve.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X. A negative increment starts at the highest index used.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	switch trans {
+	case blas.NoTrans:
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- { // descending i: entries after x[i] are already solved
+					kk := min(k, n-i-1)
+					var sum complex64
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						sum += x[i+1+j] * aij
+					}
+					x[i] -= sum
+					if diag == blas.NonUnit { // for blas.Unit the stored diagonal is not read
+						x[i] /= a[i*lda]
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX // position in x of the last logical element
+				for i := n - 1; i >= 0; i-- {
+					kk := min(k, n-i-1)
+					var sum complex64
+					jx := ix + incX
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						sum += x[jx] * aij
+						jx += incX
+					}
+					x[ix] -= sum
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda]
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := 0; i < n; i++ { // ascending i: entries before x[i] are already solved
+					kk := min(k, i)
+					var sum complex64
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						sum += x[i-kk+j] * aij
+					}
+					x[i] -= sum
+					if diag == blas.NonUnit {
+						x[i] /= a[i*lda+k] // the diagonal of a lower band row sits at offset k
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					kk := min(k, i)
+					var sum complex64
+					jx := ix - kk*incX
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						sum += x[jx] * aij
+						jx += incX
+					}
+					x[ix] -= sum
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda+k]
+					}
+					ix += incX
+				}
+			}
+		}
+	case blas.Trans:
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ { // ascending i: x[i] is final once divided by the diagonal
+					if diag == blas.NonUnit {
+						x[i] /= a[i*lda]
+					}
+					kk := min(k, n-i-1)
+					xi := x[i]
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[i+1+j] -= xi * aij // remove the solved x[i] from the later entries
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda]
+					}
+					kk := min(k, n-i-1)
+					xi := x[ix]
+					jx := ix + incX
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[jx] -= xi * aij
+						jx += incX
+					}
+					ix += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- { // descending i: x[i] is final once divided by the diagonal
+					if diag == blas.NonUnit {
+						x[i] /= a[i*lda+k]
+					}
+					kk := min(k, i)
+					xi := x[i]
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[i-kk+j] -= xi * aij // remove the solved x[i] from the earlier entries
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda+k]
+					}
+					kk := min(k, i)
+					xi := x[ix]
+					jx := ix - kk*incX
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[jx] -= xi * aij
+						jx += incX
+					}
+					ix -= incX
+				}
+			}
+		}
+	case blas.ConjTrans: // same traversal as blas.Trans with every element of A conjugated
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[i] /= cmplx.Conj(a[i*lda])
+					}
+					kk := min(k, n-i-1)
+					xi := x[i]
+					for j, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[i+1+j] -= xi * cmplx.Conj(aij)
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[ix] /= cmplx.Conj(a[i*lda])
+					}
+					kk := min(k, n-i-1)
+					xi := x[ix]
+					jx := ix + incX
+					for _, aij := range a[i*lda+1 : i*lda+kk+1] {
+						x[jx] -= xi * cmplx.Conj(aij)
+						jx += incX
+					}
+					ix += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[i] /= cmplx.Conj(a[i*lda+k])
+					}
+					kk := min(k, i)
+					xi := x[i]
+					for j, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[i-kk+j] -= xi * cmplx.Conj(aij)
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[ix] /= cmplx.Conj(a[i*lda+k])
+					}
+					kk := min(k, i)
+					xi := x[ix]
+					jx := ix - kk*incX
+					for _, aij := range a[i*lda+k-kk : i*lda+k] {
+						x[jx] -= xi * cmplx.Conj(aij)
+						jx += incX
+					}
+					ix -= incX
+				}
+			}
+		}
+	}
+}
+
+// Ctpmv performs one of the matrix-vector operations
+//
+//	x = A * x   if trans = blas.NoTrans
+//	x = Aᵀ * x  if trans = blas.Trans
+//	x = Aᴴ * x  if trans = blas.ConjTrans
+//
+// where x is an n element vector and A is an n×n triangular matrix, supplied in
+// packed form.
+//
+// The packed slice ap is read one row at a time. For uplo == blas.Upper, row i
+// occupies n-i consecutive elements beginning with its diagonal element. For
+// uplo == blas.Lower, row i occupies i+1 consecutive elements ending with its
+// diagonal element. When diag is blas.Unit the diagonal elements of ap are
+// not read.
+//
+// Ctpmv panics if uplo, trans or diag is not a recognized value, if n is
+// negative, if incX is zero, or if ap or x is too short for the given n and
+// incX. If n is zero, Ctpmv returns before the slice lengths are checked.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ctpmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, ap []complex64, x []complex64, incX int) {
+	// Each switch accepts only the listed values; anything else reaches
+	// default and panics.
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X.
+	// With a negative stride the first logical element sits at (1-n)*incX so
+	// that stepping by incX ends at index 0.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through A.
+
+	if trans == blas.NoTrans {
+		// Form x = A*x.
+		if uplo == blas.Upper {
+			// kk points to the current diagonal element in ap.
+			kk := 0
+			if incX == 1 {
+				x = x[:n]
+				// Rows are visited first to last: x[i] only reads x[i+1:],
+				// which has not been overwritten yet.
+				for i := range x {
+					if diag == blas.NonUnit {
+						x[i] *= ap[kk]
+					}
+					if n-i-1 > 0 {
+						x[i] += c64.DotuUnitary(ap[kk+1:kk+n-i], x[i+1:])
+					}
+					kk += n - i
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[ix] *= ap[kk]
+					}
+					if n-i-1 > 0 {
+						// NOTE(review): for negative incX the uintptr conversions wrap;
+						// presumably DotuInc relies on modular index arithmetic — confirm.
+						x[ix] += c64.DotuInc(ap[kk+1:kk+n-i], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX))
+					}
+					ix += incX
+					kk += n - i
+				}
+			}
+		} else {
+			// kk points to the beginning of current row in ap.
+			kk := n*(n+1)/2 - n
+			if incX == 1 {
+				// Rows are visited last to first: x[i] only reads x[:i],
+				// which has not been overwritten yet.
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[i] *= ap[kk+i]
+					}
+					if i > 0 {
+						x[i] += c64.DotuUnitary(ap[kk:kk+i], x[:i])
+					}
+					kk -= i
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[ix] *= ap[kk+i]
+					}
+					if i > 0 {
+						x[ix] += c64.DotuInc(ap[kk:kk+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					ix -= incX
+					kk -= i
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.Trans {
+		// Form x = Aᵀ*x.
+		if uplo == blas.Upper {
+			// kk points to the current diagonal element in ap.
+			kk := n*(n+1)/2 - 1
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					// Keep the unscaled x[i]; it is what gets spread over x[i+1:n].
+					xi := x[i]
+					if diag == blas.NonUnit {
+						x[i] *= ap[kk]
+					}
+					if n-i-1 > 0 {
+						c64.AxpyUnitary(xi, ap[kk+1:kk+n-i], x[i+1:n])
+					}
+					// Step back to the diagonal of row i-1, which holds n-i+1 elements.
+					kk -= n - i + 1
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					xi := x[ix]
+					if diag == blas.NonUnit {
+						x[ix] *= ap[kk]
+					}
+					if n-i-1 > 0 {
+						c64.AxpyInc(xi, ap[kk+1:kk+n-i], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX))
+					}
+					ix -= incX
+					kk -= n - i + 1
+				}
+			}
+		} else {
+			// kk points to the beginning of current row in ap.
+			kk := 0
+			if incX == 1 {
+				x = x[:n]
+				for i := range x {
+					// x[i] is spread over x[:i] before it is scaled by the diagonal.
+					if i > 0 {
+						c64.AxpyUnitary(x[i], ap[kk:kk+i], x[:i])
+					}
+					if diag == blas.NonUnit {
+						x[i] *= ap[kk+i]
+					}
+					kk += i + 1
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						c64.AxpyInc(x[ix], ap[kk:kk+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= ap[kk+i]
+					}
+					ix += incX
+					kk += i + 1
+				}
+			}
+		}
+		return
+	}
+
+	// Form x = Aᴴ*x.
+	// Same traversal as the transposed case, with every element of ap
+	// conjugated and the updates written as explicit loops.
+	if uplo == blas.Upper {
+		// kk points to the current diagonal element in ap.
+		kk := n*(n+1)/2 - 1
+		if incX == 1 {
+			for i := n - 1; i >= 0; i-- {
+				xi := x[i]
+				if diag == blas.NonUnit {
+					x[i] *= cmplx.Conj(ap[kk])
+				}
+				// k walks the off-diagonal elements of row i.
+				k := kk + 1
+				for j := i + 1; j < n; j++ {
+					x[j] += xi * cmplx.Conj(ap[k])
+					k++
+				}
+				kk -= n - i + 1
+			}
+		} else {
+			ix := kx + (n-1)*incX
+			for i := n - 1; i >= 0; i-- {
+				xi := x[ix]
+				if diag == blas.NonUnit {
+					x[ix] *= cmplx.Conj(ap[kk])
+				}
+				jx := ix + incX
+				k := kk + 1
+				for j := i + 1; j < n; j++ {
+					x[jx] += xi * cmplx.Conj(ap[k])
+					jx += incX
+					k++
+				}
+				ix -= incX
+				kk -= n - i + 1
+			}
+		}
+	} else {
+		// kk points to the beginning of current row in ap.
+		kk := 0
+		if incX == 1 {
+			x = x[:n]
+			// xi is the value of x[i] taken when iteration i starts.
+			for i, xi := range x {
+				for j := 0; j < i; j++ {
+					x[j] += xi * cmplx.Conj(ap[kk+j])
+				}
+				if diag == blas.NonUnit {
+					x[i] *= cmplx.Conj(ap[kk+i])
+				}
+				kk += i + 1
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				xi := x[ix]
+				jx := kx
+				for j := 0; j < i; j++ {
+					x[jx] += xi * cmplx.Conj(ap[kk+j])
+					jx += incX
+				}
+				if diag == blas.NonUnit {
+					x[ix] *= cmplx.Conj(ap[kk+i])
+				}
+				ix += incX
+				kk += i + 1
+			}
+		}
+	}
+}
+
+// Ctpsv solves one of the systems of equations
+//
+//	A * x = b   if trans == blas.NoTrans
+//	Aᵀ * x = b  if trans == blas.Trans
+//	Aᴴ * x = b  if trans == blas.ConjTrans
+//
+// where b and x are n element vectors and A is an n×n triangular matrix in
+// packed form.
+//
+// On entry, x contains the values of b, and the solution is
+// stored in-place into x.
+//
+// The packed slice ap is read one row at a time. For uplo == blas.Upper, row i
+// occupies n-i consecutive elements beginning with its diagonal element. For
+// uplo == blas.Lower, row i occupies i+1 consecutive elements ending with its
+// diagonal element. When diag is blas.Unit no division by a diagonal element
+// is performed.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+//
+// Ctpsv panics if uplo, trans or diag is not a recognized value, if n is
+// negative, if incX is zero, or if ap or x is too short for the given n and
+// incX. If n is zero, Ctpsv returns before the slice lengths are checked.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ctpsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, ap []complex64, x []complex64, incX int) {
+	// Each switch accepts only the listed values; anything else reaches
+	// default and panics.
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X.
+	// With a negative stride the first logical element sits at (1-n)*incX so
+	// that stepping by incX ends at index 0.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through ap.
+
+	if trans == blas.NoTrans {
+		// Form x = inv(A)*x.
+		if uplo == blas.Upper {
+			// kk indexes the diagonal element of the current row, starting
+			// with the last row.
+			kk := n*(n+1)/2 - 1
+			if incX == 1 {
+				// Back substitution: x[i+1:n] already holds solved values
+				// when row i is processed.
+				for i := n - 1; i >= 0; i-- {
+					aii := ap[kk]
+					if n-i-1 > 0 {
+						x[i] -= c64.DotuUnitary(x[i+1:n], ap[kk+1:kk+n-i])
+					}
+					if diag == blas.NonUnit {
+						x[i] /= aii
+					}
+					// Step back to the diagonal of row i-1, which holds n-i+1 elements.
+					kk -= n - i + 1
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					aii := ap[kk]
+					if n-i-1 > 0 {
+						// NOTE(review): for negative incX the uintptr conversions wrap;
+						// presumably DotuInc relies on modular index arithmetic — confirm.
+						x[ix] -= c64.DotuInc(x, ap[kk+1:kk+n-i], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0)
+					}
+					if diag == blas.NonUnit {
+						x[ix] /= aii
+					}
+					ix -= incX
+					kk -= n - i + 1
+				}
+			}
+		} else {
+			// kk indexes the first element of the current row.
+			kk := 0
+			if incX == 1 {
+				// Forward substitution: x[:i] already holds solved values
+				// when row i is processed.
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						x[i] -= c64.DotuUnitary(x[:i], ap[kk:kk+i])
+					}
+					if diag == blas.NonUnit {
+						x[i] /= ap[kk+i]
+					}
+					kk += i + 1
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						x[ix] -= c64.DotuInc(x, ap[kk:kk+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0)
+					}
+					if diag == blas.NonUnit {
+						x[ix] /= ap[kk+i]
+					}
+					ix += incX
+					kk += i + 1
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.Trans {
+		// Form x = inv(Aᵀ)*x.
+		if uplo == blas.Upper {
+			// kk indexes the diagonal element of the current row.
+			kk := 0
+			if incX == 1 {
+				// x[j] is finalized first, then its contribution is removed
+				// from the entries that follow it.
+				for j := 0; j < n; j++ {
+					if diag == blas.NonUnit {
+						x[j] /= ap[kk]
+					}
+					if n-j-1 > 0 {
+						c64.AxpyUnitary(-x[j], ap[kk+1:kk+n-j], x[j+1:n])
+					}
+					kk += n - j
+				}
+			} else {
+				jx := kx
+				for j := 0; j < n; j++ {
+					if diag == blas.NonUnit {
+						x[jx] /= ap[kk]
+					}
+					if n-j-1 > 0 {
+						c64.AxpyInc(-x[jx], ap[kk+1:kk+n-j], x, uintptr(n-j-1), 1, uintptr(incX), 0, uintptr(jx+incX))
+					}
+					jx += incX
+					kk += n - j
+				}
+			}
+		} else {
+			// kk indexes the first element of the current row, starting with
+			// the last row.
+			kk := n*(n+1)/2 - n
+			if incX == 1 {
+				// x[j] is finalized first, then its contribution is removed
+				// from the entries that precede it.
+				for j := n - 1; j >= 0; j-- {
+					if diag == blas.NonUnit {
+						x[j] /= ap[kk+j]
+					}
+					if j > 0 {
+						c64.AxpyUnitary(-x[j], ap[kk:kk+j], x[:j])
+					}
+					kk -= j
+				}
+			} else {
+				jx := kx + (n-1)*incX
+				for j := n - 1; j >= 0; j-- {
+					if diag == blas.NonUnit {
+						x[jx] /= ap[kk+j]
+					}
+					if j > 0 {
+						c64.AxpyInc(-x[jx], ap[kk:kk+j], x, uintptr(j), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					jx -= incX
+					kk -= j
+				}
+			}
+		}
+		return
+	}
+
+	// Form x = inv(Aᴴ)*x.
+	// Same traversal as the transposed case, with every element of ap
+	// conjugated and the updates written as explicit loops.
+	if uplo == blas.Upper {
+		kk := 0
+		if incX == 1 {
+			for j := 0; j < n; j++ {
+				if diag == blas.NonUnit {
+					x[j] /= cmplx.Conj(ap[kk])
+				}
+				xj := x[j]
+				// k walks the off-diagonal elements of row j.
+				k := kk + 1
+				for i := j + 1; i < n; i++ {
+					x[i] -= xj * cmplx.Conj(ap[k])
+					k++
+				}
+				kk += n - j
+			}
+		} else {
+			jx := kx
+			for j := 0; j < n; j++ {
+				if diag == blas.NonUnit {
+					x[jx] /= cmplx.Conj(ap[kk])
+				}
+				xj := x[jx]
+				ix := jx + incX
+				k := kk + 1
+				for i := j + 1; i < n; i++ {
+					x[ix] -= xj * cmplx.Conj(ap[k])
+					ix += incX
+					k++
+				}
+				jx += incX
+				kk += n - j
+			}
+		}
+	} else {
+		kk := n*(n+1)/2 - n
+		if incX == 1 {
+			for j := n - 1; j >= 0; j-- {
+				if diag == blas.NonUnit {
+					x[j] /= cmplx.Conj(ap[kk+j])
+				}
+				xj := x[j]
+				for i := 0; i < j; i++ {
+					x[i] -= xj * cmplx.Conj(ap[kk+i])
+				}
+				kk -= j
+			}
+		} else {
+			jx := kx + (n-1)*incX
+			for j := n - 1; j >= 0; j-- {
+				if diag == blas.NonUnit {
+					x[jx] /= cmplx.Conj(ap[kk+j])
+				}
+				xj := x[jx]
+				ix := kx
+				for i := 0; i < j; i++ {
+					x[ix] -= xj * cmplx.Conj(ap[kk+i])
+					ix += incX
+				}
+				jx -= incX
+				kk -= j
+			}
+		}
+	}
+}
+
+// Ctrmv performs one of the matrix-vector operations
+//
+//	x = A * x   if trans = blas.NoTrans
+//	x = Aᵀ * x  if trans = blas.Trans
+//	x = Aᴴ * x  if trans = blas.ConjTrans
+//
+// where x is a vector, and A is an n×n triangular matrix.
+//
+// Element (i, j) of A is read from a[i*lda+j], and only the triangle selected
+// by uplo is accessed. When diag is blas.Unit the diagonal elements of a are
+// not read.
+//
+// Ctrmv panics if trans, uplo or diag is not a recognized value, if n is
+// negative, if lda is less than max(1, n), if incX is zero, or if a or x is
+// too short. If n is zero, Ctrmv returns before the slice lengths are checked.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ctrmv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, a []complex64, lda int, x []complex64, incX int) {
+	// Each switch accepts only the listed values; anything else reaches
+	// default and panics.
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X.
+	// With a negative stride the first logical element sits at (1-n)*incX so
+	// that stepping by incX ends at index 0.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through A.
+
+	if trans == blas.NoTrans {
+		// Form x = A*x.
+		if uplo == blas.Upper {
+			if incX == 1 {
+				// Rows are visited first to last: x[i] only reads x[i+1:n],
+				// which has not been overwritten yet.
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+i]
+					}
+					if n-i-1 > 0 {
+						x[i] += c64.DotuUnitary(a[i*lda+i+1:i*lda+n], x[i+1:n])
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+i]
+					}
+					if n-i-1 > 0 {
+						// NOTE(review): for negative incX the uintptr conversions wrap;
+						// presumably DotuInc relies on modular index arithmetic — confirm.
+						x[ix] += c64.DotuInc(a[i*lda+i+1:i*lda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX))
+					}
+					ix += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				// Rows are visited last to first: x[i] only reads x[:i],
+				// which has not been overwritten yet.
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+i]
+					}
+					if i > 0 {
+						x[i] += c64.DotuUnitary(a[i*lda:i*lda+i], x[:i])
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+i]
+					}
+					if i > 0 {
+						x[ix] += c64.DotuInc(a[i*lda:i*lda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					ix -= incX
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.Trans {
+		// Form x = Aᵀ*x.
+		if uplo == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					// Keep the unscaled x[i]; it is what gets spread over x[i+1:n].
+					xi := x[i]
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+i]
+					}
+					if n-i-1 > 0 {
+						c64.AxpyUnitary(xi, a[i*lda+i+1:i*lda+n], x[i+1:n])
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					xi := x[ix]
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+i]
+					}
+					if n-i-1 > 0 {
+						c64.AxpyInc(xi, a[i*lda+i+1:i*lda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(ix+incX))
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					// x[i] is spread over x[:i] before it is scaled by the diagonal.
+					if i > 0 {
+						c64.AxpyUnitary(x[i], a[i*lda:i*lda+i], x[:i])
+					}
+					if diag == blas.NonUnit {
+						x[i] *= a[i*lda+i]
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						c64.AxpyInc(x[ix], a[i*lda:i*lda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					if diag == blas.NonUnit {
+						x[ix] *= a[i*lda+i]
+					}
+					ix += incX
+				}
+			}
+		}
+		return
+	}
+
+	// Form x = Aᴴ*x.
+	// Same traversal as the transposed case, with every element of a
+	// conjugated and the updates written as explicit loops.
+	if uplo == blas.Upper {
+		if incX == 1 {
+			for i := n - 1; i >= 0; i-- {
+				xi := x[i]
+				if diag == blas.NonUnit {
+					x[i] *= cmplx.Conj(a[i*lda+i])
+				}
+				for j := i + 1; j < n; j++ {
+					x[j] += xi * cmplx.Conj(a[i*lda+j])
+				}
+			}
+		} else {
+			ix := kx + (n-1)*incX
+			for i := n - 1; i >= 0; i-- {
+				xi := x[ix]
+				if diag == blas.NonUnit {
+					x[ix] *= cmplx.Conj(a[i*lda+i])
+				}
+				jx := ix + incX
+				for j := i + 1; j < n; j++ {
+					x[jx] += xi * cmplx.Conj(a[i*lda+j])
+					jx += incX
+				}
+				ix -= incX
+			}
+		}
+	} else {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				// The inner loop stops before j == i, so x[i] is still
+				// unscaled every time it is read here.
+				for j := 0; j < i; j++ {
+					x[j] += x[i] * cmplx.Conj(a[i*lda+j])
+				}
+				if diag == blas.NonUnit {
+					x[i] *= cmplx.Conj(a[i*lda+i])
+				}
+			}
+		} else {
+			ix := kx
+			for i := 0; i < n; i++ {
+				jx := kx
+				for j := 0; j < i; j++ {
+					x[jx] += x[ix] * cmplx.Conj(a[i*lda+j])
+					jx += incX
+				}
+				if diag == blas.NonUnit {
+					x[ix] *= cmplx.Conj(a[i*lda+i])
+				}
+				ix += incX
+			}
+		}
+	}
+}
+
+// Ctrsv solves one of the systems of equations
+//
+//	A * x = b   if trans == blas.NoTrans
+//	Aᵀ * x = b  if trans == blas.Trans
+//	Aᴴ * x = b  if trans == blas.ConjTrans
+//
+// where b and x are n element vectors and A is an n×n triangular matrix.
+//
+// On entry, x contains the values of b, and the solution is
+// stored in-place into x.
+//
+// Element (i, j) of A is read from a[i*lda+j], and only the triangle selected
+// by uplo is accessed. When diag is blas.Unit no division by a diagonal
+// element is performed.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+//
+// Ctrsv panics if trans, uplo or diag is not a recognized value, if n is
+// negative, if lda is less than max(1, n), if incX is zero, or if a or x is
+// too short. If n is zero, Ctrsv returns before the slice lengths are checked.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ctrsv(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n int, a []complex64, lda int, x []complex64, incX int) {
+	// Each switch accepts only the listed values; anything else reaches
+	// default and panics.
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch uplo {
+	default:
+		panic(badUplo)
+	case blas.Upper, blas.Lower:
+	}
+	switch diag {
+	default:
+		panic(badDiag)
+	case blas.NonUnit, blas.Unit:
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	// Set up start index in X.
+	// With a negative stride the first logical element sits at (1-n)*incX so
+	// that stepping by incX ends at index 0.
+	var kx int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+
+	// The elements of A are accessed sequentially with one pass through A.
+
+	if trans == blas.NoTrans {
+		// Form x = inv(A)*x.
+		if uplo == blas.Upper {
+			if incX == 1 {
+				// Back substitution: x[i+1:n] already holds solved values
+				// when row i is processed.
+				for i := n - 1; i >= 0; i-- {
+					aii := a[i*lda+i]
+					if n-i-1 > 0 {
+						x[i] -= c64.DotuUnitary(x[i+1:n], a[i*lda+i+1:i*lda+n])
+					}
+					if diag == blas.NonUnit {
+						x[i] /= aii
+					}
+				}
+			} else {
+				ix := kx + (n-1)*incX
+				for i := n - 1; i >= 0; i-- {
+					aii := a[i*lda+i]
+					if n-i-1 > 0 {
+						// NOTE(review): for negative incX the uintptr conversions wrap;
+						// presumably DotuInc relies on modular index arithmetic — confirm.
+						x[ix] -= c64.DotuInc(x, a[i*lda+i+1:i*lda+n], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0)
+					}
+					if diag == blas.NonUnit {
+						x[ix] /= aii
+					}
+					ix -= incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				// Forward substitution: x[:i] already holds solved values
+				// when row i is processed.
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						x[i] -= c64.DotuUnitary(x[:i], a[i*lda:i*lda+i])
+					}
+					if diag == blas.NonUnit {
+						x[i] /= a[i*lda+i]
+					}
+				}
+			} else {
+				ix := kx
+				for i := 0; i < n; i++ {
+					if i > 0 {
+						x[ix] -= c64.DotuInc(x, a[i*lda:i*lda+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0)
+					}
+					if diag == blas.NonUnit {
+						x[ix] /= a[i*lda+i]
+					}
+					ix += incX
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.Trans {
+		// Form x = inv(Aᵀ)*x.
+		if uplo == blas.Upper {
+			if incX == 1 {
+				// x[j] is finalized first, then its contribution is removed
+				// from the entries that follow it.
+				for j := 0; j < n; j++ {
+					if diag == blas.NonUnit {
+						x[j] /= a[j*lda+j]
+					}
+					if n-j-1 > 0 {
+						c64.AxpyUnitary(-x[j], a[j*lda+j+1:j*lda+n], x[j+1:n])
+					}
+				}
+			} else {
+				jx := kx
+				for j := 0; j < n; j++ {
+					if diag == blas.NonUnit {
+						x[jx] /= a[j*lda+j]
+					}
+					if n-j-1 > 0 {
+						c64.AxpyInc(-x[jx], a[j*lda+j+1:j*lda+n], x, uintptr(n-j-1), 1, uintptr(incX), 0, uintptr(jx+incX))
+					}
+					jx += incX
+				}
+			}
+		} else {
+			if incX == 1 {
+				// x[j] is finalized first, then its contribution is removed
+				// from the entries that precede it.
+				for j := n - 1; j >= 0; j-- {
+					if diag == blas.NonUnit {
+						x[j] /= a[j*lda+j]
+					}
+					xj := x[j]
+					if j > 0 {
+						c64.AxpyUnitary(-xj, a[j*lda:j*lda+j], x[:j])
+					}
+				}
+			} else {
+				jx := kx + (n-1)*incX
+				for j := n - 1; j >= 0; j-- {
+					if diag == blas.NonUnit {
+						x[jx] /= a[j*lda+j]
+					}
+					if j > 0 {
+						c64.AxpyInc(-x[jx], a[j*lda:j*lda+j], x, uintptr(j), 1, uintptr(incX), 0, uintptr(kx))
+					}
+					jx -= incX
+				}
+			}
+		}
+		return
+	}
+
+	// Form x = inv(Aᴴ)*x.
+	// Same traversal as the transposed case, with every element of a
+	// conjugated and the updates written as explicit loops.
+	if uplo == blas.Upper {
+		if incX == 1 {
+			for j := 0; j < n; j++ {
+				if diag == blas.NonUnit {
+					x[j] /= cmplx.Conj(a[j*lda+j])
+				}
+				xj := x[j]
+				for i := j + 1; i < n; i++ {
+					x[i] -= xj * cmplx.Conj(a[j*lda+i])
+				}
+			}
+		} else {
+			jx := kx
+			for j := 0; j < n; j++ {
+				if diag == blas.NonUnit {
+					x[jx] /= cmplx.Conj(a[j*lda+j])
+				}
+				xj := x[jx]
+				ix := jx + incX
+				for i := j + 1; i < n; i++ {
+					x[ix] -= xj * cmplx.Conj(a[j*lda+i])
+					ix += incX
+				}
+				jx += incX
+			}
+		}
+	} else {
+		if incX == 1 {
+			for j := n - 1; j >= 0; j-- {
+				if diag == blas.NonUnit {
+					x[j] /= cmplx.Conj(a[j*lda+j])
+				}
+				xj := x[j]
+				for i := 0; i < j; i++ {
+					x[i] -= xj * cmplx.Conj(a[j*lda+i])
+				}
+			}
+		} else {
+			jx := kx + (n-1)*incX
+			for j := n - 1; j >= 0; j-- {
+				if diag == blas.NonUnit {
+					x[jx] /= cmplx.Conj(a[j*lda+j])
+				}
+				xj := x[jx]
+				ix := kx
+				for i := 0; i < j; i++ {
+					x[ix] -= xj * cmplx.Conj(a[j*lda+i])
+					ix += incX
+				}
+				jx -= incX
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go b/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go
new file mode 100644
index 00000000000..26e4959d7fb
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go
@@ -0,0 +1,2400 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+var _ blas.Float32Level2 = Implementation{}
+
+// Sger applies the rank-one update
+//
+//	A += alpha * x * yᵀ
+//
+// to the m×n dense matrix A, where x and y are vectors and alpha is a scalar.
+// x is read as m elements with stride incX and y as n elements with stride
+// incY.
+//
+// Sger panics if m or n is negative, if lda is less than max(1, n), if incX
+// or incY is zero, or if x, y or a is too short. It returns without modifying
+// a when m or n is zero, or when alpha is zero.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sger(m, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int) {
+	// Scalar argument checks; the first failing case decides the panic.
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	}
+
+	// An empty matrix needs no work, and the slice lengths are not inspected.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// Each slice must be long enough to hold its last strided element.
+	xShort := (incX > 0 && (m-1)*incX >= len(x)) || (incX < 0 && (1-m)*incX >= len(x))
+	if xShort {
+		panic(shortX)
+	}
+	yShort := (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y))
+	if yShort {
+		panic(shortY)
+	}
+	if lda*(m-1)+n > len(a) {
+		panic(shortA)
+	}
+
+	// A zero alpha leaves A unchanged, so the kernel is only called otherwise.
+	if alpha != 0 {
+		f32.Ger(uintptr(m), uintptr(n),
+			alpha,
+			x, uintptr(incX),
+			y, uintptr(incY),
+			a, uintptr(lda))
+	}
+}
+
+// Sgbmv performs one of the matrix-vector operations
+//
+//	y = alpha * A * x + beta * y   if tA == blas.NoTrans
+//	y = alpha * Aᵀ * x + beta * y  if tA == blas.Trans or blas.ConjTrans
+//
+// where A is an m×n band matrix with kL sub-diagonals and kU super-diagonals,
+// x and y are vectors, and alpha and beta are scalars.
+//
+// Row i of the band is read from a[i*lda : i*lda+kL+kU+1]. x has n elements
+// and y has m elements when tA is blas.NoTrans; otherwise x has m elements
+// and y has n.
+//
+// Sgbmv panics if tA is not a recognized value, if m, n, kL or kU is
+// negative, if lda is less than kL+kU+1, if incX or incY is zero, or if a, x
+// or y is too short. It returns without modifying y when m or n is zero, or
+// when alpha is zero and beta is one.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sgbmv(tA blas.Transpose, m, n, kL, kU int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) {
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if kL < 0 {
+		panic(kLLT0)
+	}
+	if kU < 0 {
+		panic(kULT0)
+	}
+	if lda < kL+kU+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(min(m, n+kL)-1)+kL+kU+1 {
+		panic(shortA)
+	}
+	// The vector lengths swap roles when A is used transposed.
+	lenX := m
+	lenY := n
+	if tA == blas.NoTrans {
+		lenX = n
+		lenY = m
+	}
+	if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// With a negative stride the first logical element of a vector sits at
+	// -(len-1)*inc, so that stepping by inc ends at index 0.
+	var kx, ky int
+	if incX < 0 {
+		kx = -(lenX - 1) * incX
+	}
+	if incY < 0 {
+		ky = -(lenY - 1) * incY
+	}
+
+	// Form y = beta * y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				// Assign zero rather than multiply, so existing NaN or Inf
+				// values in y do not survive.
+				for i := range y[:lenY] {
+					y[i] = 0
+				}
+			} else {
+				f32.ScalUnitary(beta, y[:lenY])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < lenY; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				// The stride magnitude is passed in both cases.
+				// NOTE(review): presumably ScalInc starts at index 0, which
+				// visits the same elements for either sign of incY — confirm.
+				if incY > 0 {
+					f32.ScalInc(beta, y, uintptr(lenY), uintptr(incY))
+				} else {
+					f32.ScalInc(beta, y, uintptr(lenY), uintptr(-incY))
+				}
+			}
+		}
+	}
+
+	if alpha == 0 {
+		return
+	}
+
+	// i and j are indices of the compacted banded matrix.
+	// off is the offset into the dense matrix (off + j = densej)
+	nCol := kU + 1 + kL
+	if tA == blas.NoTrans {
+		iy := ky
+		if incX == 1 {
+			for i := 0; i < min(m, n+kL); i++ {
+				// [l, u) is the range of band columns used in row i.
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				off := max(0, i-kL)
+				atmp := a[i*lda+l : i*lda+u]
+				xtmp := x[off : off+u-l]
+				var sum float32
+				for j, v := range atmp {
+					sum += xtmp[j] * v
+				}
+				y[iy] += sum * alpha
+				iy += incY
+			}
+			return
+		}
+		for i := 0; i < min(m, n+kL); i++ {
+			l := max(0, kL-i)
+			u := min(nCol, n+kL-i)
+			off := max(0, i-kL)
+			atmp := a[i*lda+l : i*lda+u]
+			jx := kx
+			var sum float32
+			for _, v := range atmp {
+				sum += x[off*incX+jx] * v
+				jx += incX
+			}
+			y[iy] += sum * alpha
+			iy += incY
+		}
+		return
+	}
+	// Cases where a is transposed: each stored row i scatters alpha*x[i]
+	// into the elements of y starting at logical position off.
+	if incX == 1 {
+		for i := 0; i < min(m, n+kL); i++ {
+			l := max(0, kL-i)
+			u := min(nCol, n+kL-i)
+			off := max(0, i-kL)
+			atmp := a[i*lda+l : i*lda+u]
+			tmp := alpha * x[i]
+			jy := ky
+			for _, v := range atmp {
+				y[jy+off*incY] += tmp * v
+				jy += incY
+			}
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < min(m, n+kL); i++ {
+		l := max(0, kL-i)
+		u := min(nCol, n+kL-i)
+		off := max(0, i-kL)
+		atmp := a[i*lda+l : i*lda+u]
+		tmp := alpha * x[ix]
+		jy := ky
+		for _, v := range atmp {
+			y[jy+off*incY] += tmp * v
+			jy += incY
+		}
+		ix += incX
+	}
+}
+
+// Sgemv computes
+//
+//	y = alpha * A * x + beta * y   if tA = blas.NoTrans
+//	y = alpha * Aᵀ * x + beta * y  if tA = blas.Trans or blas.ConjTrans
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are
+// scalars. x has n elements and y has m elements when tA is blas.NoTrans;
+// otherwise x has m elements and y has n.
+//
+// Sgemv panics if tA is not a recognized value, if m or n is negative, if lda
+// is less than max(1, n), if incX or incY is zero, or if x, y or a is too
+// short. It returns without modifying y when m or n is zero, or when alpha is
+// zero and beta is one.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sgemv(tA blas.Transpose, m, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) {
+	// Scalar argument checks; the first failing case decides the panic.
+	switch {
+	case tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans:
+		panic(badTranspose)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	}
+
+	// An empty matrix needs no work, and the slice lengths are not inspected.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// The vector lengths swap roles when A is used transposed.
+	lenX, lenY := n, m
+	if tA != blas.NoTrans {
+		lenX, lenY = m, n
+	}
+
+	// Each slice must be long enough to hold its last addressed element.
+	xShort := (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX)
+	if xShort {
+		panic(shortX)
+	}
+	yShort := (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY)
+	if yShort {
+		panic(shortY)
+	}
+	if lda*(m-1)+n > len(a) {
+		panic(shortA)
+	}
+
+	if alpha == 0 {
+		// Without a matrix term the operation reduces to y = beta * y,
+		// which is a no-op for beta == 1.
+		if beta == 1 {
+			return
+		}
+		// Sscal is given the magnitude of the stride.
+		stride := incY
+		if stride <= 0 {
+			stride = -stride
+		}
+		Implementation{}.Sscal(lenY, beta, y, stride)
+		return
+	}
+
+	// Hand the general case to the kernel matching the orientation of A.
+	switch tA {
+	case blas.NoTrans:
+		f32.GemvN(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY))
+	default:
+		f32.GemvT(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY))
+	}
+}
+
+// Strmv performs one of the matrix-vector operations
+//
+//	x = A * x   if tA == blas.NoTrans
+//	x = Aᵀ * x  if tA == blas.Trans or blas.ConjTrans
+//
+// where A is an n×n triangular matrix, and x is a vector.
+//
+// Element (i, j) of A is read from a[i*lda+j], and only the triangle selected
+// by ul is accessed. When d is blas.Unit the diagonal elements of a are not
+// read.
+//
+// Strmv panics if ul, tA or d is not a recognized value, if n is negative, if
+// lda is less than max(1, n), if incX is zero, or if a or x is too short. If
+// n is zero, Strmv returns before the slice lengths are checked.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Strmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, a []float32, lda int, x []float32, incX int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	nonUnit := d != blas.Unit
+	// A 1×1 matrix only scales the single element of x.
+	if n == 1 {
+		if nonUnit {
+			x[0] *= a[0]
+		}
+		return
+	}
+	// With a negative stride the first logical element sits at -(n-1)*incX
+	// so that stepping by incX ends at index 0. incX == 0 was rejected above.
+	var kx int
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			if incX == 1 {
+				// Rows are visited first to last: x[i] only reads x[i+1:n],
+				// which has not been overwritten yet.
+				for i := 0; i < n; i++ {
+					ilda := i * lda
+					var tmp float32
+					if nonUnit {
+						tmp = a[ilda+i] * x[i]
+					} else {
+						tmp = x[i]
+					}
+					x[i] = tmp + f32.DotUnitary(a[ilda+i+1:ilda+n], x[i+1:n])
+				}
+				return
+			}
+			ix := kx
+			for i := 0; i < n; i++ {
+				ilda := i * lda
+				var tmp float32
+				if nonUnit {
+					tmp = a[ilda+i] * x[ix]
+				} else {
+					tmp = x[ix]
+				}
+				// NOTE(review): for negative incX the uintptr conversions wrap;
+				// presumably DotInc relies on modular index arithmetic — confirm.
+				x[ix] = tmp + f32.DotInc(x, a[ilda+i+1:ilda+n], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0)
+				ix += incX
+			}
+			return
+		}
+		if incX == 1 {
+			// Rows are visited last to first: x[i] only reads x[:i], which
+			// has not been overwritten yet.
+			for i := n - 1; i >= 0; i-- {
+				ilda := i * lda
+				var tmp float32
+				if nonUnit {
+					tmp = a[ilda+i] * x[i]
+				} else {
+					tmp = x[i]
+				}
+				x[i] = tmp + f32.DotUnitary(a[ilda:ilda+i], x[:i])
+			}
+			return
+		}
+		ix := kx + (n-1)*incX
+		for i := n - 1; i >= 0; i-- {
+			ilda := i * lda
+			var tmp float32
+			if nonUnit {
+				tmp = a[ilda+i] * x[ix]
+			} else {
+				tmp = x[ix]
+			}
+			x[ix] = tmp + f32.DotInc(x, a[ilda:ilda+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0)
+			ix -= incX
+		}
+		return
+	}
+	// Cases where a is transposed.
+	if ul == blas.Upper {
+		if incX == 1 {
+			for i := n - 1; i >= 0; i-- {
+				ilda := i * lda
+				// The unscaled x[i] is spread over x[i+1:n] before the
+				// diagonal scaling is applied to x[i] itself.
+				xi := x[i]
+				f32.AxpyUnitary(xi, a[ilda+i+1:ilda+n], x[i+1:n])
+				if nonUnit {
+					x[i] *= a[ilda+i]
+				}
+			}
+			return
+		}
+		ix := kx + (n-1)*incX
+		for i := n - 1; i >= 0; i-- {
+			ilda := i * lda
+			xi := x[ix]
+			f32.AxpyInc(xi, a[ilda+i+1:ilda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(kx+(i+1)*incX))
+			if nonUnit {
+				x[ix] *= a[ilda+i]
+			}
+			ix -= incX
+		}
+		return
+	}
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			ilda := i * lda
+			// The unscaled x[i] is spread over x[:i] before the diagonal
+			// scaling is applied to x[i] itself.
+			xi := x[i]
+			f32.AxpyUnitary(xi, a[ilda:ilda+i], x[:i])
+			if nonUnit {
+				x[i] *= a[ilda+i]
+			}
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < n; i++ {
+		ilda := i * lda
+		xi := x[ix]
+		f32.AxpyInc(xi, a[ilda:ilda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+		if nonUnit {
+			x[ix] *= a[ilda+i]
+		}
+		ix += incX
+	}
+}
+
+// Strsv solves a triangular system of equations in place:
+//
+//	A * x = b   if tA == blas.NoTrans
+//	Aᵀ * x = b  if tA == blas.Trans or blas.ConjTrans
+//
+// A is an n×n triangular matrix held in a with row stride lda, and ul
+// selects the triangle that is read. On entry x holds b; on return x holds
+// the solution. When d == blas.Unit the diagonal of A is treated as all ones
+// and the stored diagonal is not read.
+//
+// Strsv does not test A for singularity or near-singularity. Callers must
+// perform such tests before calling this routine.
+//
+// Strsv panics if an argument is invalid or if a or x is too short.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Strsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, a []float32, lda int, x []float32, incX int) {
+	// The cases are evaluated in order, so the first failing check panics.
+	switch {
+	case ul != blas.Lower && ul != blas.Upper:
+		panic(badUplo)
+	case tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans:
+		panic(badTranspose)
+	case d != blas.NonUnit && d != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	}
+
+	// An empty system has nothing to solve.
+	if n == 0 {
+		return
+	}
+
+	// Slice lengths are only checked once n is known to be positive.
+	switch {
+	case len(a) < lda*(n-1)+n:
+		panic(shortA)
+	case (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX):
+		panic(shortX)
+	}
+
+	divide := d == blas.NonUnit
+	if n == 1 {
+		if divide {
+			x[0] /= a[0]
+		}
+		return
+	}
+
+	// With a negative stride the first logical element is at the far end of x.
+	start := 0
+	if incX < 0 {
+		start = -(n - 1) * incX
+	}
+
+	noTrans := tA == blas.NoTrans
+	upper := ul == blas.Upper
+	contiguous := incX == 1
+
+	switch {
+	case noTrans && upper && contiguous:
+		// Work from the last row upwards.
+		for r := n - 1; r >= 0; r-- {
+			base := r * lda
+			var acc float32
+			for j, v := range a[base+r+1 : base+n] {
+				acc += x[r+1+j] * v
+			}
+			x[r] -= acc
+			if divide {
+				x[r] /= a[base+r]
+			}
+		}
+
+	case noTrans && upper:
+		px := start + (n-1)*incX
+		for r := n - 1; r >= 0; r-- {
+			base := r * lda
+			var acc float32
+			qx := px + incX
+			for _, v := range a[base+r+1 : base+n] {
+				acc += x[qx] * v
+				qx += incX
+			}
+			x[px] -= acc
+			if divide {
+				x[px] /= a[base+r]
+			}
+			px -= incX
+		}
+
+	case noTrans && contiguous:
+		// Lower triangle: work from the first row downwards.
+		for r := 0; r < n; r++ {
+			base := r * lda
+			var acc float32
+			for j, v := range a[base : base+r] {
+				acc += x[j] * v
+			}
+			x[r] -= acc
+			if divide {
+				x[r] /= a[base+r]
+			}
+		}
+
+	case noTrans:
+		px := start
+		for r := 0; r < n; r++ {
+			base := r * lda
+			qx := start
+			var acc float32
+			for _, v := range a[base : base+r] {
+				acc += x[qx] * v
+				qx += incX
+			}
+			x[px] -= acc
+			if divide {
+				x[px] /= a[base+r]
+			}
+			px += incX
+		}
+
+	case upper && contiguous:
+		// Transposed upper triangle: finalize x[r], then update the later
+		// entries of x using row r of a.
+		for r := 0; r < n; r++ {
+			base := r * lda
+			if divide {
+				x[r] /= a[base+r]
+			}
+			pivot := x[r]
+			for j, v := range a[base+r+1 : base+n] {
+				x[j+r+1] -= v * pivot
+			}
+		}
+
+	case upper:
+		px := start
+		for r := 0; r < n; r++ {
+			base := r * lda
+			if divide {
+				x[px] /= a[base+r]
+			}
+			pivot := x[px]
+			qx := start + (r+1)*incX
+			for _, v := range a[base+r+1 : base+n] {
+				x[qx] -= v * pivot
+				qx += incX
+			}
+			px += incX
+		}
+
+	case contiguous:
+		// Transposed lower triangle: finalize x[r], then update the earlier
+		// entries of x using row r of a.
+		for r := n - 1; r >= 0; r-- {
+			base := r * lda
+			if divide {
+				x[r] /= a[base+r]
+			}
+			pivot := x[r]
+			for j, v := range a[base : base+r] {
+				x[j] -= v * pivot
+			}
+		}
+
+	default:
+		px := start + (n-1)*incX
+		for r := n - 1; r >= 0; r-- {
+			base := r * lda
+			if divide {
+				x[px] /= a[base+r]
+			}
+			pivot := x[px]
+			qx := start
+			for _, v := range a[base : base+r] {
+				x[qx] -= v * pivot
+				qx += incX
+			}
+			px -= incX
+		}
+	}
+}
+
+// Ssymv computes the symmetric matrix-vector product
+//
+//	y = alpha * A * x + beta * y
+//
+// in place in y. A is an n×n symmetric matrix held in a with row stride lda,
+// of which only the triangle selected by ul is read. x and y are vectors with
+// strides incX and incY, and alpha and beta are scalars.
+//
+// Ssymv panics if an argument is invalid or if a, x or y is too short.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssymv(ul blas.Uplo, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) {
+	// The cases are evaluated in order, so the first failing check panics.
+	switch {
+	case ul != blas.Lower && ul != blas.Upper:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	}
+
+	// Nothing to do for an empty matrix.
+	if n == 0 {
+		return
+	}
+
+	// Slice lengths are only checked once n is known to be positive.
+	switch {
+	case len(a) < lda*(n-1)+n:
+		panic(shortA)
+	case (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX):
+		panic(shortX)
+	case (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY):
+		panic(shortY)
+	}
+
+	// With these scalars y is left exactly as it is.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// With a negative stride the first logical element is at the far end.
+	startX, startY := 0, 0
+	if incX < 0 {
+		startX = -(n - 1) * incX
+	}
+	if incY < 0 {
+		startY = -(n - 1) * incY
+	}
+
+	// Scale y by beta first; beta == 0 stores zeros instead of multiplying.
+	if beta != 1 {
+		switch {
+		case incY == 1 && beta == 0:
+			head := y[:n]
+			for i := range head {
+				head[i] = 0
+			}
+		case incY == 1:
+			f32.ScalUnitary(beta, y[:n])
+		case beta == 0:
+			py := startY
+			for i := 0; i < n; i++ {
+				y[py] = 0
+				py += incY
+			}
+		case incY > 0:
+			f32.ScalInc(beta, y, uintptr(n), uintptr(incY))
+		default:
+			f32.ScalInc(beta, y, uintptr(n), uintptr(-incY))
+		}
+	}
+
+	if alpha == 0 {
+		return
+	}
+
+	if n == 1 {
+		y[0] += alpha * a[0] * x[0]
+		return
+	}
+
+	upper := ul == blas.Upper
+	contiguous := incX == 1
+
+	switch {
+	case upper && contiguous:
+		// Each stored off-diagonal entry contributes to two entries of y:
+		// once through its row and once through the mirrored column.
+		py := startY
+		for r := 0; r < n; r++ {
+			scaled := x[r] * alpha
+			acc := x[r] * a[r*lda+r]
+			qy := startY + (r+1)*incY
+			for j, v := range a[r*lda+r+1 : r*lda+n] {
+				col := j + r + 1
+				acc += x[col] * v
+				y[qy] += scaled * v
+				qy += incY
+			}
+			y[py] += alpha * acc
+			py += incY
+		}
+
+	case upper:
+		px, py := startX, startY
+		for r := 0; r < n; r++ {
+			scaled := x[px] * alpha
+			acc := x[px] * a[r*lda+r]
+			qx := startX + (r+1)*incX
+			qy := startY + (r+1)*incY
+			for _, v := range a[r*lda+r+1 : r*lda+n] {
+				acc += x[qx] * v
+				y[qy] += scaled * v
+				qx += incX
+				qy += incY
+			}
+			y[py] += alpha * acc
+			px += incX
+			py += incY
+		}
+
+	case contiguous:
+		// Lower triangle: the off-diagonal entries precede the diagonal.
+		py := startY
+		for r := 0; r < n; r++ {
+			qy := startY
+			scaled := alpha * x[r]
+			var acc float32
+			for j, v := range a[r*lda : r*lda+r] {
+				acc += x[j] * v
+				y[qy] += scaled * v
+				qy += incY
+			}
+			acc += x[r] * a[r*lda+r]
+			acc *= alpha
+			y[py] += acc
+			py += incY
+		}
+
+	default:
+		px, py := startX, startY
+		for r := 0; r < n; r++ {
+			qx, qy := startX, startY
+			scaled := alpha * x[px]
+			var acc float32
+			for _, v := range a[r*lda : r*lda+r] {
+				acc += x[qx] * v
+				y[qy] += scaled * v
+				qx += incX
+				qy += incY
+			}
+			acc += x[px] * a[r*lda+r]
+			acc *= alpha
+			y[py] += acc
+			px += incX
+			py += incY
+		}
+	}
+}
+
+// Stbmv computes a triangular band matrix-vector product in place:
+//
+//	x = A * x   if tA == blas.NoTrans
+//	x = Aᵀ * x  if tA == blas.Trans or blas.ConjTrans
+//
+// A is an n×n triangular band matrix with k+1 diagonals and x is a vector
+// with stride incX. Row i of the band occupies a[i*lda:]: for blas.Upper the
+// diagonal entry is first and the super-diagonals follow, and for blas.Lower
+// the diagonal entry is at offset k with the sub-diagonals before it. When
+// d == blas.Unit the stored diagonal is not read and is treated as ones.
+//
+// Stbmv panics if an argument is invalid or if a or x is too short.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Stbmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n, k int, a []float32, lda int, x []float32, incX int) {
+	// The cases are evaluated in order, so the first failing check panics.
+	switch {
+	case ul != blas.Lower && ul != blas.Upper:
+		panic(badUplo)
+	case tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans:
+		panic(badTranspose)
+	case d != blas.NonUnit && d != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < k+1:
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	}
+
+	// Nothing to do for an empty matrix.
+	if n == 0 {
+		return
+	}
+
+	// Slice lengths are only checked once n is known to be positive.
+	switch {
+	case len(a) < lda*(n-1)+k+1:
+		panic(shortA)
+	case (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX):
+		panic(shortX)
+	}
+
+	// With a negative stride the first logical element is at the far end of x.
+	start := 0
+	if incX < 0 {
+		start = -(n - 1) * incX
+	}
+
+	useDiag := d != blas.Unit
+	noTrans := tA == blas.NoTrans
+	upper := ul == blas.Upper
+	contiguous := incX == 1
+
+	switch {
+	case noTrans && upper && contiguous:
+		for r := 0; r < n; r++ {
+			width := min(1+k, n-r)
+			var acc float32
+			band := a[r*lda:]
+			tail := x[r:]
+			for j := 1; j < width; j++ {
+				acc += tail[j] * band[j]
+			}
+			if useDiag {
+				acc += tail[0] * band[0]
+			} else {
+				acc += tail[0]
+			}
+			x[r] = acc
+		}
+
+	case noTrans && upper:
+		px := start
+		for r := 0; r < n; r++ {
+			width := min(1+k, n-r)
+			var acc float32
+			band := a[r*lda:]
+			step := incX
+			for j := 1; j < width; j++ {
+				acc += x[px+step] * band[j]
+				step += incX
+			}
+			if useDiag {
+				acc += x[px] * band[0]
+			} else {
+				acc += x[px]
+			}
+			x[px] = acc
+			px += incX
+		}
+
+	case noTrans && contiguous:
+		// Rows are processed last to first so the x entries read by a row
+		// have not been overwritten yet.
+		for r := n - 1; r >= 0; r-- {
+			first := max(0, k-r)
+			band := a[r*lda:]
+			var acc float32
+			for j := first; j < k; j++ {
+				acc += x[r-k+j] * band[j]
+			}
+			if useDiag {
+				acc += x[r] * band[k]
+			} else {
+				acc += x[r]
+			}
+			x[r] = acc
+		}
+
+	case noTrans:
+		px := start + (n-1)*incX
+		for r := n - 1; r >= 0; r-- {
+			first := max(0, k-r)
+			band := a[r*lda:]
+			var acc float32
+			step := first * incX
+			for j := first; j < k; j++ {
+				acc += x[px-k*incX+step] * band[j]
+				step += incX
+			}
+			if useDiag {
+				acc += x[px] * band[k]
+			} else {
+				acc += x[px]
+			}
+			x[px] = acc
+			px -= incX
+		}
+
+	case upper && contiguous:
+		// Transposed upper band: entry r gathers from up to k earlier rows.
+		for r := n - 1; r >= 0; r-- {
+			width := min(k+1, r+1)
+			var acc float32
+			for j := 1; j < width; j++ {
+				acc += x[r-j] * a[(r-j)*lda+j]
+			}
+			if useDiag {
+				acc += x[r] * a[r*lda]
+			} else {
+				acc += x[r]
+			}
+			x[r] = acc
+		}
+
+	case upper:
+		px := start + (n-1)*incX
+		for r := n - 1; r >= 0; r-- {
+			width := min(k+1, r+1)
+			var acc float32
+			step := incX
+			for j := 1; j < width; j++ {
+				acc += x[px-step] * a[(r-j)*lda+j]
+				step += incX
+			}
+			if useDiag {
+				acc += x[px] * a[r*lda]
+			} else {
+				acc += x[px]
+			}
+			x[px] = acc
+			px -= incX
+		}
+
+	case contiguous:
+		// Transposed lower band: entry r gathers from up to k later rows.
+		for r := 0; r < n; r++ {
+			width := min(k, n-r-1)
+			var acc float32
+			for j := 0; j < width; j++ {
+				acc += x[r+j+1] * a[(r+j+1)*lda+k-j-1]
+			}
+			if useDiag {
+				acc += x[r] * a[r*lda+k]
+			} else {
+				acc += x[r]
+			}
+			x[r] = acc
+		}
+
+	default:
+		px := start
+		for r := 0; r < n; r++ {
+			width := min(k, n-r-1)
+			var acc float32
+			step := 0
+			for j := 0; j < width; j++ {
+				acc += x[px+step+incX] * a[(r+j+1)*lda+k-j-1]
+				step += incX
+			}
+			if useDiag {
+				acc += x[px] * a[r*lda+k]
+			} else {
+				acc += x[px]
+			}
+			x[px] = acc
+			px += incX
+		}
+	}
+}
+
+// Stpmv computes a packed triangular matrix-vector product in place:
+//
+//	x = A * x   if tA == blas.NoTrans
+//	x = Aᵀ * x  if tA == blas.Trans or blas.ConjTrans
+//
+// A is an n×n triangular matrix whose selected triangle (ul) is packed row
+// by row into ap, and x is a vector with stride incX. When d == blas.Unit
+// the stored diagonal is not read and is treated as ones.
+//
+// Stpmv panics if an argument is invalid or if ap or x is too short.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Stpmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, ap []float32, x []float32, incX int) {
+	// The cases are evaluated in order, so the first failing check panics.
+	switch {
+	case ul != blas.Lower && ul != blas.Upper:
+		panic(badUplo)
+	case tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans:
+		panic(badTranspose)
+	case d != blas.NonUnit && d != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case incX == 0:
+		panic(zeroIncX)
+	}
+
+	// Nothing to do for an empty matrix.
+	if n == 0 {
+		return
+	}
+
+	// Slice lengths are only checked once n is known to be positive.
+	switch {
+	case len(ap) < n*(n+1)/2:
+		panic(shortAP)
+	case (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX):
+		panic(shortX)
+	}
+
+	// With a negative stride the first logical element is at the far end of x.
+	start := 0
+	if incX < 0 {
+		start = -(n - 1) * incX
+	}
+
+	scale := d == blas.NonUnit
+	noTrans := tA == blas.NoTrans
+	upper := ul == blas.Upper
+	contiguous := incX == 1
+
+	// diag is the position in ap of the diagonal entry (r,r) of the row
+	// currently being processed.
+	var diag int
+	switch {
+	case noTrans && upper && contiguous:
+		for r := 0; r < n; r++ {
+			acc := x[r]
+			if scale {
+				acc *= ap[diag]
+			}
+			rest := x[r+1:]
+			for j, v := range ap[diag+1 : diag+n-r] {
+				acc += v * rest[j]
+			}
+			x[r] = acc
+			diag += n - r
+		}
+
+	case noTrans && upper:
+		px := start
+		for r := 0; r < n; r++ {
+			acc := x[px]
+			if scale {
+				acc *= ap[diag]
+			}
+			qx := start + (r+1)*incX
+			for _, v := range ap[diag+1 : diag+n-r] {
+				acc += v * x[qx]
+				qx += incX
+			}
+			x[px] = acc
+			diag += n - r
+			px += incX
+		}
+
+	case noTrans && contiguous:
+		// Lower triangle: start at the last packed entry and walk backwards.
+		diag = n*(n+1)/2 - 1
+		for r := n - 1; r >= 0; r-- {
+			acc := x[r]
+			if scale {
+				acc *= ap[diag]
+			}
+			for j, v := range ap[diag-r : diag] {
+				acc += v * x[j]
+			}
+			x[r] = acc
+			diag -= r + 1
+		}
+
+	case noTrans:
+		px := start + (n-1)*incX
+		diag = n*(n+1)/2 - 1
+		for r := n - 1; r >= 0; r-- {
+			acc := x[px]
+			if scale {
+				acc *= ap[diag]
+			}
+			qx := start
+			for _, v := range ap[diag-r : diag] {
+				acc += v * x[qx]
+				qx += incX
+			}
+			x[px] = acc
+			diag -= r + 1
+			px -= incX
+		}
+
+	case upper && contiguous:
+		// Transposed upper triangle: scatter x[r] into the later entries of
+		// x before x[r] itself is scaled by the diagonal.
+		diag = n*(n+1)/2 - 1
+		for r := n - 1; r >= 0; r-- {
+			pivot := x[r]
+			rest := x[r+1:]
+			for j, v := range ap[diag+1 : diag+n-r] {
+				rest[j] += v * pivot
+			}
+			if scale {
+				x[r] *= ap[diag]
+			}
+			diag -= n - r + 1
+		}
+
+	case upper:
+		px := start + (n-1)*incX
+		diag = n*(n+1)/2 - 1
+		for r := n - 1; r >= 0; r-- {
+			pivot := x[px]
+			qx := start + (r+1)*incX
+			for _, v := range ap[diag+1 : diag+n-r] {
+				x[qx] += v * pivot
+				qx += incX
+			}
+			if scale {
+				x[px] *= ap[diag]
+			}
+			diag -= n - r + 1
+			px -= incX
+		}
+
+	case contiguous:
+		// Transposed lower triangle: scatter x[r] into the earlier entries
+		// of x before x[r] itself is scaled by the diagonal.
+		for r := 0; r < n; r++ {
+			pivot := x[r]
+			for j, v := range ap[diag-r : diag] {
+				x[j] += v * pivot
+			}
+			if scale {
+				x[r] *= ap[diag]
+			}
+			diag += r + 2
+		}
+
+	default:
+		px := start
+		for r := 0; r < n; r++ {
+			pivot := x[px]
+			qx := start
+			for _, v := range ap[diag-r : diag] {
+				x[qx] += v * pivot
+				qx += incX
+			}
+			if scale {
+				x[px] *= ap[diag]
+			}
+			px += incX
+			diag += r + 2
+		}
+	}
+}
+
+// Stbsv solves a triangular band system of equations in place:
+//
+//	A * x = b   if tA == blas.NoTrans
+//	Aᵀ * x = b  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// A is an n×n triangular band matrix with k+1 diagonals, and x and b are
+// vectors. On entry x holds b; on return x holds the solution. Row i of the
+// band occupies a[i*lda:]: for blas.Upper the diagonal entry is first, and
+// for blas.Lower it is at offset k. When d == blas.Unit the stored diagonal
+// is not read and is treated as ones.
+//
+// Stbsv does not test A for singularity or near-singularity. Callers must
+// perform such tests before calling this routine.
+//
+// Stbsv panics if an argument is invalid or if a or x is too short.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Stbsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n, k int, a []float32, lda int, x []float32, incX int) {
+	// The cases are evaluated in order, so the first failing check panics.
+	switch {
+	case ul != blas.Lower && ul != blas.Upper:
+		panic(badUplo)
+	case tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans:
+		panic(badTranspose)
+	case d != blas.NonUnit && d != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < k+1:
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	}
+
+	// An empty system has nothing to solve.
+	if n == 0 {
+		return
+	}
+
+	// Slice lengths are only checked once n is known to be positive.
+	switch {
+	case len(a) < lda*(n-1)+k+1:
+		panic(shortA)
+	case (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX):
+		panic(shortX)
+	}
+
+	// With a negative stride the first logical element is at the far end of x.
+	start := 0
+	if incX < 0 {
+		start = -(n - 1) * incX
+	}
+
+	divide := d == blas.NonUnit
+	noTrans := tA == blas.NoTrans
+	upper := ul == blas.Upper
+	contiguous := incX == 1
+
+	// Form x = A^-1 x. The unit-stride cases take subslices of x; the strided
+	// cases index x directly because incX may be negative.
+	switch {
+	case noTrans && upper && contiguous:
+		for r := n - 1; r >= 0; r-- {
+			width := min(k, n-r-1)
+			band := a[r*lda+1:]
+			var acc float32
+			for j, v := range x[r+1 : r+width+1] {
+				acc += v * band[j]
+			}
+			x[r] -= acc
+			if divide {
+				x[r] /= a[r*lda]
+			}
+		}
+
+	case noTrans && upper:
+		px := start + (n-1)*incX
+		for r := n - 1; r >= 0; r-- {
+			limit := min(k+1, n-r)
+			band := a[r*lda:]
+			var (
+				step int
+				acc  float32
+			)
+			for j := 1; j < limit; j++ {
+				step += incX
+				acc += x[px+step] * band[j]
+			}
+			x[px] -= acc
+			if divide {
+				x[px] /= band[0]
+			}
+			px -= incX
+		}
+
+	case noTrans && contiguous:
+		for r := 0; r < n; r++ {
+			width := min(k, r)
+			band := a[r*lda+k-width:]
+			var acc float32
+			for j, v := range x[r-width : r] {
+				acc += v * band[j]
+			}
+			x[r] -= acc
+			if divide {
+				x[r] /= band[width]
+			}
+		}
+
+	case noTrans:
+		px := start
+		for r := 0; r < n; r++ {
+			width := min(k, r)
+			band := a[r*lda+k-width:]
+			var (
+				acc  float32
+				step int
+			)
+			for j := 0; j < width; j++ {
+				acc += x[px-width*incX+step] * band[j]
+				step += incX
+			}
+			x[px] -= acc
+			if divide {
+				x[px] /= band[width]
+			}
+			px += incX
+		}
+
+	case upper && contiguous:
+		// Transposed upper band: entry r depends on up to k earlier entries.
+		for r := 0; r < n; r++ {
+			width := min(k, r)
+			var acc float32
+			for j := 0; j < width; j++ {
+				acc += x[r-width+j] * a[(r-width+j)*lda+width-j]
+			}
+			x[r] -= acc
+			if divide {
+				x[r] /= a[r*lda]
+			}
+		}
+
+	case upper:
+		px := start
+		for r := 0; r < n; r++ {
+			width := min(k, r)
+			var (
+				acc  float32
+				step int
+			)
+			for j := 0; j < width; j++ {
+				acc += x[px-width*incX+step] * a[(r-width+j)*lda+width-j]
+				step += incX
+			}
+			x[px] -= acc
+			if divide {
+				x[px] /= a[r*lda]
+			}
+			px += incX
+		}
+
+	case contiguous:
+		// Transposed lower band: entry r depends on up to k later entries.
+		for r := n - 1; r >= 0; r-- {
+			width := min(k, n-r-1)
+			var acc float32
+			for j, v := range x[r+1 : r+1+width] {
+				acc += v * a[(r+j+1)*lda+k-j-1]
+			}
+			x[r] -= acc
+			if divide {
+				x[r] /= a[r*lda+k]
+			}
+		}
+
+	default:
+		px := start + (n-1)*incX
+		for r := n - 1; r >= 0; r-- {
+			width := min(k, n-r-1)
+			var (
+				acc  float32
+				step int
+			)
+			for j := 0; j < width; j++ {
+				acc += x[px+step+incX] * a[(r+j+1)*lda+k-j-1]
+				step += incX
+			}
+			x[px] -= acc
+			if divide {
+				x[px] /= a[r*lda+k]
+			}
+			px -= incX
+		}
+	}
+}
+
+// Ssbmv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where A is an n×n symmetric band matrix with k super-diagonals, x and y are
+// vectors, and alpha and beta are scalars.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssbmv(ul blas.Uplo, n, k int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Set up indexes
+	lenX := n
+	lenY := n
+	var kx, ky int
+	if incX < 0 {
+		kx = -(lenX - 1) * incX
+	}
+	if incY < 0 {
+		ky = -(lenY - 1) * incY
+	}
+
+	// Form y = beta * y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				f32.ScalUnitary(beta, y[:n])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				if incY > 0 {
+					f32.ScalInc(beta, y, uintptr(n), uintptr(incY))
+				} else {
+					f32.ScalInc(beta, y, uintptr(n), uintptr(-incY))
+				}
+			}
+		}
+	}
+
+	if alpha == 0 {
+		return
+	}
+
+	if ul == blas.Upper {
+		if incX == 1 {
+			iy := ky
+			for i := 0; i < n; i++ {
+				atmp := a[i*lda:]
+				tmp := alpha * x[i]
+				sum := tmp * atmp[0]
+				u := min(k, n-i-1)
+				jy := incY
+				for j := 1; j <= u; j++ {
+					v := atmp[j]
+					sum += alpha * x[i+j] * v
+					y[iy+jy] += tmp * v
+					jy += incY
+				}
+				y[iy] += sum
+				iy += incY
+			}
+			return
+		}
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			atmp := a[i*lda:]
+			tmp := alpha * x[ix]
+			sum := tmp * atmp[0]
+			u := min(k, n-i-1)
+			jx := incX
+			jy := incY
+			for j := 1; j <= u; j++ {
+				v := atmp[j]
+				sum += alpha * x[ix+jx] * v
+				y[iy+jy] += tmp * v
+				jx += incX
+				jy += incY
+			}
+			y[iy] += sum
+			ix += incX
+			iy += incY
+		}
+		return
+	}
+
+	// Cases where a has bands below the diagonal.
+	if incX == 1 {
+		iy := ky
+		for i := 0; i < n; i++ {
+			l := max(0, k-i)
+			tmp := alpha * x[i]
+			jy := l * incY
+			atmp := a[i*lda:]
+			for j := l; j < k; j++ {
+				v := atmp[j]
+				y[iy] += alpha * v * x[i-k+j]
+				y[iy-k*incY+jy] += tmp * v
+				jy += incY
+			}
+			y[iy] += tmp * atmp[k]
+			iy += incY
+		}
+		return
+	}
+	ix := kx
+	iy := ky
+	for i := 0; i < n; i++ {
+		l := max(0, k-i)
+		tmp := alpha * x[ix]
+		jx := l * incX
+		jy := l * incY
+		atmp := a[i*lda:]
+		for j := l; j < k; j++ {
+			v := atmp[j]
+			y[iy] += alpha * v * x[ix-k*incX+jx]
+			y[iy-k*incY+jy] += tmp * v
+			jx += incX
+			jy += incY
+		}
+		y[iy] += tmp * atmp[k]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Ssyr performs the symmetric rank-one update
+//
+//	A += alpha * x * xᵀ
+//
+// where A is an n×n symmetric matrix, and x is a vector.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssyr(ul blas.Uplo, n int, alpha float32, x []float32, incX int, a []float32, lda int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 {
+		return
+	}
+
+	lenX := n
+	var kx int
+	if incX < 0 {
+		kx = -(lenX - 1) * incX
+	}
+	if ul == blas.Upper {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				tmp := x[i] * alpha
+				if tmp != 0 {
+					atmp := a[i*lda+i : i*lda+n]
+					xtmp := x[i:n]
+					for j, v := range xtmp {
+						atmp[j] += v * tmp
+					}
+				}
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			tmp := x[ix] * alpha
+			if tmp != 0 {
+				jx := ix
+				atmp := a[i*lda:]
+				for j := i; j < n; j++ {
+					atmp[j] += x[jx] * tmp
+					jx += incX
+				}
+			}
+			ix += incX
+		}
+		return
+	}
+	// Cases where a is lower triangular.
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			tmp := x[i] * alpha
+			if tmp != 0 {
+				atmp := a[i*lda:]
+				xtmp := x[:i+1]
+				for j, v := range xtmp {
+					atmp[j] += tmp * v
+				}
+			}
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < n; i++ {
+		tmp := x[ix] * alpha
+		if tmp != 0 {
+			atmp := a[i*lda:]
+			jx := kx
+			for j := 0; j < i+1; j++ {
+				atmp[j] += tmp * x[jx]
+				jx += incX
+			}
+		}
+		ix += incX
+	}
+}
+
+// Ssyr2 performs the symmetric rank-two update
+//
+//	A += alpha * x * yᵀ + alpha * y * xᵀ
+//
+// where A is an n×n symmetric matrix, x and y are vectors, and alpha is a scalar.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssyr2(ul blas.Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 {
+		return
+	}
+
+	var ky, kx int
+	if incY < 0 {
+		ky = -(n - 1) * incY
+	}
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	if ul == blas.Upper {
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				xi := x[i]
+				yi := y[i]
+				atmp := a[i*lda:]
+				for j := i; j < n; j++ {
+					atmp[j] += alpha * (xi*y[j] + x[j]*yi)
+				}
+			}
+			return
+		}
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			jx := kx + i*incX
+			jy := ky + i*incY
+			xi := x[ix]
+			yi := y[iy]
+			atmp := a[i*lda:]
+			for j := i; j < n; j++ {
+				atmp[j] += alpha * (xi*y[jy] + x[jx]*yi)
+				jx += incX
+				jy += incY
+			}
+			ix += incX
+			iy += incY
+		}
+		return
+	}
+	if incX == 1 && incY == 1 {
+		for i := 0; i < n; i++ {
+			xi := x[i]
+			yi := y[i]
+			atmp := a[i*lda:]
+			for j := 0; j <= i; j++ {
+				atmp[j] += alpha * (xi*y[j] + x[j]*yi)
+			}
+		}
+		return
+	}
+	ix := kx
+	iy := ky
+	for i := 0; i < n; i++ {
+		jx := kx
+		jy := ky
+		xi := x[ix]
+		yi := y[iy]
+		atmp := a[i*lda:]
+		for j := 0; j <= i; j++ {
+			atmp[j] += alpha * (xi*y[jy] + x[jx]*yi)
+			jx += incX
+			jy += incY
+		}
+		ix += incX
+		iy += incY
+	}
+}
+
+// Stpsv solves a packed triangular system of equations in place:
+//
+//	A * x = b   if tA == blas.NoTrans
+//	Aᵀ * x = b  if tA == blas.Trans or blas.ConjTrans
+//
+// A is an n×n triangular matrix whose selected triangle (ul) is packed row
+// by row into ap, and x and b are vectors. On entry x holds b; on return x
+// holds the solution. When d == blas.Unit the stored diagonal is not read
+// and is treated as ones.
+//
+// Stpsv does not test A for singularity or near-singularity. Callers must
+// perform such tests before calling this routine.
+//
+// Stpsv panics if an argument is invalid or if ap or x is too short.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Stpsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, ap []float32, x []float32, incX int) {
+	// The cases are evaluated in order, so the first failing check panics.
+	switch {
+	case ul != blas.Lower && ul != blas.Upper:
+		panic(badUplo)
+	case tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans:
+		panic(badTranspose)
+	case d != blas.NonUnit && d != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case incX == 0:
+		panic(zeroIncX)
+	}
+
+	// An empty system has nothing to solve.
+	if n == 0 {
+		return
+	}
+
+	// Slice lengths are only checked once n is known to be positive.
+	switch {
+	case len(ap) < n*(n+1)/2:
+		panic(shortAP)
+	case (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX):
+		panic(shortX)
+	}
+
+	// With a negative stride the first logical element is at the far end of x.
+	start := 0
+	if incX < 0 {
+		start = -(n - 1) * incX
+	}
+
+	divide := d == blas.NonUnit
+	noTrans := tA == blas.NoTrans
+	upper := ul == blas.Upper
+	contiguous := incX == 1
+
+	// diag is the position in ap of the diagonal entry (r,r) of the row
+	// currently being processed.
+	var diag int
+	switch {
+	case noTrans && upper && contiguous:
+		diag = n*(n+1)/2 - 1
+		for r := n - 1; r >= 0; r-- {
+			rest := x[r+1:]
+			var acc float32
+			for j, v := range ap[diag+1 : diag+n-r] {
+				acc += v * rest[j]
+			}
+			x[r] -= acc
+			if divide {
+				x[r] /= ap[diag]
+			}
+			diag -= n - r + 1
+		}
+
+	case noTrans && upper:
+		diag = n*(n+1)/2 - 1
+		px := start + (n-1)*incX
+		for r := n - 1; r >= 0; r-- {
+			qx := start + (r+1)*incX
+			var acc float32
+			for _, v := range ap[diag+1 : diag+n-r] {
+				acc += v * x[qx]
+				qx += incX
+			}
+			x[px] -= acc
+			if divide {
+				x[px] /= ap[diag]
+			}
+			px -= incX
+			diag -= n - r + 1
+		}
+
+	case noTrans && contiguous:
+		// Lower triangle: work from the first row downwards.
+		for r := 0; r < n; r++ {
+			var acc float32
+			for j, v := range ap[diag-r : diag] {
+				acc += v * x[j]
+			}
+			x[r] -= acc
+			if divide {
+				x[r] /= ap[diag]
+			}
+			diag += r + 2
+		}
+
+	case noTrans:
+		px := start
+		for r := 0; r < n; r++ {
+			qx := start
+			var acc float32
+			for _, v := range ap[diag-r : diag] {
+				acc += v * x[qx]
+				qx += incX
+			}
+			x[px] -= acc
+			if divide {
+				x[px] /= ap[diag]
+			}
+			px += incX
+			diag += r + 2
+		}
+
+	case upper && contiguous:
+		// Transposed upper triangle: finalize x[r], then update the later
+		// entries of x using packed row r.
+		for r := 0; r < n; r++ {
+			if divide {
+				x[r] /= ap[diag]
+			}
+			pivot := x[r]
+			rest := x[r+1:]
+			for j, v := range ap[diag+1 : diag+n-r] {
+				rest[j] -= v * pivot
+			}
+			diag += n - r
+		}
+
+	case upper:
+		px := start
+		for r := 0; r < n; r++ {
+			if divide {
+				x[px] /= ap[diag]
+			}
+			pivot := x[px]
+			qx := start + (r+1)*incX
+			for _, v := range ap[diag+1 : diag+n-r] {
+				x[qx] -= v * pivot
+				qx += incX
+			}
+			px += incX
+			diag += n - r
+		}
+
+	case contiguous:
+		// Transposed lower triangle: finalize x[r], then update the earlier
+		// entries of x using packed row r.
+		diag = n*(n+1)/2 - 1
+		for r := n - 1; r >= 0; r-- {
+			if divide {
+				x[r] /= ap[diag]
+			}
+			pivot := x[r]
+			for j, v := range ap[diag-r : diag] {
+				x[j] -= v * pivot
+			}
+			diag -= r + 1
+		}
+
+	default:
+		px := start + (n-1)*incX
+		diag = n*(n+1)/2 - 1
+		for r := n - 1; r >= 0; r-- {
+			if divide {
+				x[px] /= ap[diag]
+			}
+			pivot := x[px]
+			qx := start
+			for _, v := range ap[diag-r : diag] {
+				x[qx] -= v * pivot
+				qx += incX
+			}
+			px -= incX
+			diag -= r + 1
+		}
+	}
+}
+
+// Sspmv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where A is an n×n symmetric matrix in packed format, x and y are vectors,
+// and alpha and beta are scalars.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sspmv(ul blas.Uplo, n int, alpha float32, ap []float32, x []float32, incX int, beta float32, y []float32, incY int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Set up start points
+	var kx, ky int
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	if incY < 0 {
+		ky = -(n - 1) * incY
+	}
+
+	// Form y = beta * y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				f32.ScalUnitary(beta, y[:n])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				if incY > 0 {
+					f32.ScalInc(beta, y, uintptr(n), uintptr(incY))
+				} else {
+					f32.ScalInc(beta, y, uintptr(n), uintptr(-incY))
+				}
+			}
+		}
+	}
+
+	if alpha == 0 {
+		return
+	}
+
+	if n == 1 {
+		y[0] += alpha * ap[0] * x[0]
+		return
+	}
+	var offset int // Offset is the index of (i,i).
+	if ul == blas.Upper {
+		if incX == 1 {
+			iy := ky
+			for i := 0; i < n; i++ {
+				xv := x[i] * alpha
+				sum := ap[offset] * x[i]
+				atmp := ap[offset+1 : offset+n-i]
+				xtmp := x[i+1:]
+				jy := ky + (i+1)*incY
+				for j, v := range atmp {
+					sum += v * xtmp[j]
+					y[jy] += v * xv
+					jy += incY
+				}
+				y[iy] += alpha * sum
+				iy += incY
+				offset += n - i
+			}
+			return
+		}
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			xv := x[ix] * alpha
+			sum := ap[offset] * x[ix]
+			atmp := ap[offset+1 : offset+n-i]
+			jx := kx + (i+1)*incX
+			jy := ky + (i+1)*incY
+			for _, v := range atmp {
+				sum += v * x[jx]
+				y[jy] += v * xv
+				jx += incX
+				jy += incY
+			}
+			y[iy] += alpha * sum
+			ix += incX
+			iy += incY
+			offset += n - i
+		}
+		return
+	}
+	if incX == 1 {
+		iy := ky
+		for i := 0; i < n; i++ {
+			xv := x[i] * alpha
+			atmp := ap[offset-i : offset]
+			jy := ky
+			var sum float32
+			for j, v := range atmp {
+				sum += v * x[j]
+				y[jy] += v * xv
+				jy += incY
+			}
+			sum += ap[offset] * x[i]
+			y[iy] += alpha * sum
+			iy += incY
+			offset += i + 2
+		}
+		return
+	}
+	ix := kx
+	iy := ky
+	for i := 0; i < n; i++ {
+		xv := x[ix] * alpha
+		atmp := ap[offset-i : offset]
+		jx := kx
+		jy := ky
+		var sum float32
+		for _, v := range atmp {
+			sum += v * x[jx]
+			y[jy] += v * xv
+			jx += incX
+			jy += incY
+		}
+
+		sum += ap[offset] * x[ix]
+		y[iy] += alpha * sum
+		ix += incX
+		iy += incY
+		offset += i + 2
+	}
+}
+
+// Sspr performs the symmetric rank-one operation
+//
+//	A += alpha * x * xᵀ
+//
+// where A is an n×n symmetric matrix in packed format, x is a vector, and
+// alpha is a scalar.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sspr(ul blas.Uplo, n int, alpha float32, x []float32, incX int, ap []float32) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 {
+		return
+	}
+
+	lenX := n
+	var kx int
+	if incX < 0 {
+		kx = -(lenX - 1) * incX
+	}
+	var offset int // Offset is the index of (i,i).
+	if ul == blas.Upper {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				atmp := ap[offset:]
+				xv := alpha * x[i]
+				xtmp := x[i:n]
+				for j, v := range xtmp {
+					atmp[j] += xv * v
+				}
+				offset += n - i
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			jx := kx + i*incX
+			atmp := ap[offset:]
+			xv := alpha * x[ix]
+			for j := 0; j < n-i; j++ {
+				atmp[j] += xv * x[jx]
+				jx += incX
+			}
+			ix += incX
+			offset += n - i
+		}
+		return
+	}
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			atmp := ap[offset-i:]
+			xv := alpha * x[i]
+			xtmp := x[:i+1]
+			for j, v := range xtmp {
+				atmp[j] += xv * v
+			}
+			offset += i + 2
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < n; i++ {
+		jx := kx
+		atmp := ap[offset-i:]
+		xv := alpha * x[ix]
+		for j := 0; j <= i; j++ {
+			atmp[j] += xv * x[jx]
+			jx += incX
+		}
+		ix += incX
+		offset += i + 2
+	}
+}
+
+// Sspr2 performs the symmetric rank-2 update
+//
+//	A += alpha * x * yᵀ + alpha * y * xᵀ
+//
+// where A is an n×n symmetric matrix in packed format, x and y are vectors,
+// and alpha is a scalar.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sspr2(ul blas.Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, ap []float32) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 {
+		return
+	}
+
+	var ky, kx int
+	if incY < 0 {
+		ky = -(n - 1) * incY
+	}
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	var offset int // Offset is the index of (i,i).
+	if ul == blas.Upper {
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				atmp := ap[offset:]
+				xi := x[i]
+				yi := y[i]
+				xtmp := x[i:n]
+				ytmp := y[i:n]
+				for j, v := range xtmp {
+					atmp[j] += alpha * (xi*ytmp[j] + v*yi)
+				}
+				offset += n - i
+			}
+			return
+		}
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			jx := kx + i*incX
+			jy := ky + i*incY
+			atmp := ap[offset:]
+			xi := x[ix]
+			yi := y[iy]
+			for j := 0; j < n-i; j++ {
+				atmp[j] += alpha * (xi*y[jy] + x[jx]*yi)
+				jx += incX
+				jy += incY
+			}
+			ix += incX
+			iy += incY
+			offset += n - i
+		}
+		return
+	}
+	if incX == 1 && incY == 1 {
+		for i := 0; i < n; i++ {
+			atmp := ap[offset-i:]
+			xi := x[i]
+			yi := y[i]
+			xtmp := x[:i+1]
+			for j, v := range xtmp {
+				atmp[j] += alpha * (xi*y[j] + v*yi)
+			}
+			offset += i + 2
+		}
+		return
+	}
+	ix := kx
+	iy := ky
+	for i := 0; i < n; i++ {
+		jx := kx
+		jy := ky
+		atmp := ap[offset-i:]
+		for j := 0; j <= i; j++ {
+			atmp[j] += alpha * (x[ix]*y[jy] + x[jx]*y[iy])
+			jx += incX
+			jy += incY
+		}
+		ix += incX
+		iy += incY
+		offset += i + 2
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go
new file mode 100644
index 00000000000..19b9c7e1c3f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go
@@ -0,0 +1,2366 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+var _ blas.Float64Level2 = Implementation{}
+
+// Dger performs the rank-one operation
+//
+//	A += alpha * x * yᵀ
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+func (Implementation) Dger(m, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int) {
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if (incX > 0 && len(x) <= (m-1)*incX) || (incX < 0 && len(x) <= (1-m)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(a) < lda*(m-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 {
+		return
+	}
+	f64.Ger(uintptr(m), uintptr(n),
+		alpha,
+		x, uintptr(incX),
+		y, uintptr(incY),
+		a, uintptr(lda))
+}
+
+// Dgbmv performs one of the matrix-vector operations
+//
+//	y = alpha * A * x + beta * y   if tA == blas.NoTrans
+//	y = alpha * Aᵀ * x + beta * y  if tA == blas.Trans or blas.ConjTrans
+//
+// where A is an m×n band matrix with kL sub-diagonals and kU super-diagonals,
+// x and y are vectors, and alpha and beta are scalars.
+func (Implementation) Dgbmv(tA blas.Transpose, m, n, kL, kU int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) {
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if kL < 0 {
+		panic(kLLT0)
+	}
+	if kU < 0 {
+		panic(kULT0)
+	}
+	if lda < kL+kU+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(min(m, n+kL)-1)+kL+kU+1 {
+		panic(shortA)
+	}
+	lenX := m
+	lenY := n
+	if tA == blas.NoTrans {
+		lenX = n
+		lenY = m
+	}
+	if (incX > 0 && len(x) <= (lenX-1)*incX) || (incX < 0 && len(x) <= (1-lenX)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (lenY-1)*incY) || (incY < 0 && len(y) <= (1-lenY)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	var kx, ky int
+	if incX < 0 {
+		kx = -(lenX - 1) * incX
+	}
+	if incY < 0 {
+		ky = -(lenY - 1) * incY
+	}
+
+	// Form y = beta * y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:lenY] {
+					y[i] = 0
+				}
+			} else {
+				f64.ScalUnitary(beta, y[:lenY])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < lenY; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				if incY > 0 {
+					f64.ScalInc(beta, y, uintptr(lenY), uintptr(incY))
+				} else {
+					f64.ScalInc(beta, y, uintptr(lenY), uintptr(-incY))
+				}
+			}
+		}
+	}
+
+	if alpha == 0 {
+		return
+	}
+
+	// i and j are indices of the compacted banded matrix.
+	// off is the offset into the dense matrix (off + j = densej)
+	nCol := kU + 1 + kL
+	if tA == blas.NoTrans {
+		iy := ky
+		if incX == 1 {
+			for i := 0; i < min(m, n+kL); i++ {
+				l := max(0, kL-i)
+				u := min(nCol, n+kL-i)
+				off := max(0, i-kL)
+				atmp := a[i*lda+l : i*lda+u]
+				xtmp := x[off : off+u-l]
+				var sum float64
+				for j, v := range atmp {
+					sum += xtmp[j] * v
+				}
+				y[iy] += sum * alpha
+				iy += incY
+			}
+			return
+		}
+		for i := 0; i < min(m, n+kL); i++ {
+			l := max(0, kL-i)
+			u := min(nCol, n+kL-i)
+			off := max(0, i-kL)
+			atmp := a[i*lda+l : i*lda+u]
+			jx := kx
+			var sum float64
+			for _, v := range atmp {
+				sum += x[off*incX+jx] * v
+				jx += incX
+			}
+			y[iy] += sum * alpha
+			iy += incY
+		}
+		return
+	}
+	if incX == 1 {
+		for i := 0; i < min(m, n+kL); i++ {
+			l := max(0, kL-i)
+			u := min(nCol, n+kL-i)
+			off := max(0, i-kL)
+			atmp := a[i*lda+l : i*lda+u]
+			tmp := alpha * x[i]
+			jy := ky
+			for _, v := range atmp {
+				y[jy+off*incY] += tmp * v
+				jy += incY
+			}
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < min(m, n+kL); i++ {
+		l := max(0, kL-i)
+		u := min(nCol, n+kL-i)
+		off := max(0, i-kL)
+		atmp := a[i*lda+l : i*lda+u]
+		tmp := alpha * x[ix]
+		jy := ky
+		for _, v := range atmp {
+			y[jy+off*incY] += tmp * v
+			jy += incY
+		}
+		ix += incX
+	}
+}
+
+// Dgemv computes
+//
+//	y = alpha * A * x + beta * y   if tA = blas.NoTrans
+//	y = alpha * Aᵀ * x + beta * y  if tA = blas.Trans or blas.ConjTrans
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars.
+func (Implementation) Dgemv(tA blas.Transpose, m, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) {
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	// Set up indexes
+	lenX := m
+	lenY := n
+	if tA == blas.NoTrans {
+		lenX = n
+		lenY = m
+	}
+
+	// Quick return if possible
+	if m == 0 || n == 0 {
+		return
+	}
+
+	if (incX > 0 && (lenX-1)*incX >= len(x)) || (incX < 0 && (1-lenX)*incX >= len(x)) {
+		panic(shortX)
+	}
+	if (incY > 0 && (lenY-1)*incY >= len(y)) || (incY < 0 && (1-lenY)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if len(a) < lda*(m-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		// First form y = beta * y
+		if incY > 0 {
+			Implementation{}.Dscal(lenY, beta, y, incY)
+		} else {
+			Implementation{}.Dscal(lenY, beta, y, -incY)
+		}
+		return
+	}
+
+	// Form y = alpha * A * x + y
+	if tA == blas.NoTrans {
+		f64.GemvN(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY))
+		return
+	}
+	// Cases where a is transposed.
+	f64.GemvT(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY))
+}
+
+// Dtrmv performs one of the matrix-vector operations
+//
+//	x = A * x   if tA == blas.NoTrans
+//	x = Aᵀ * x  if tA == blas.Trans or blas.ConjTrans
+//
+// where A is an n×n triangular matrix, and x is a vector.
+func (Implementation) Dtrmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, a []float64, lda int, x []float64, incX int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	nonUnit := d != blas.Unit
+	if n == 1 {
+		if nonUnit {
+			x[0] *= a[0]
+		}
+		return
+	}
+	var kx int
+	if incX <= 0 {
+		kx = -(n - 1) * incX
+	}
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					ilda := i * lda
+					var tmp float64
+					if nonUnit {
+						tmp = a[ilda+i] * x[i]
+					} else {
+						tmp = x[i]
+					}
+					x[i] = tmp + f64.DotUnitary(a[ilda+i+1:ilda+n], x[i+1:n])
+				}
+				return
+			}
+			ix := kx
+			for i := 0; i < n; i++ {
+				ilda := i * lda
+				var tmp float64
+				if nonUnit {
+					tmp = a[ilda+i] * x[ix]
+				} else {
+					tmp = x[ix]
+				}
+				x[ix] = tmp + f64.DotInc(x, a[ilda+i+1:ilda+n], uintptr(n-i-1), uintptr(incX), 1, uintptr(ix+incX), 0)
+				ix += incX
+			}
+			return
+		}
+		if incX == 1 {
+			for i := n - 1; i >= 0; i-- {
+				ilda := i * lda
+				var tmp float64
+				if nonUnit {
+					tmp += a[ilda+i] * x[i]
+				} else {
+					tmp = x[i]
+				}
+				x[i] = tmp + f64.DotUnitary(a[ilda:ilda+i], x[:i])
+			}
+			return
+		}
+		ix := kx + (n-1)*incX
+		for i := n - 1; i >= 0; i-- {
+			ilda := i * lda
+			var tmp float64
+			if nonUnit {
+				tmp = a[ilda+i] * x[ix]
+			} else {
+				tmp = x[ix]
+			}
+			x[ix] = tmp + f64.DotInc(x, a[ilda:ilda+i], uintptr(i), uintptr(incX), 1, uintptr(kx), 0)
+			ix -= incX
+		}
+		return
+	}
+	// Cases where a is transposed.
+	if ul == blas.Upper {
+		if incX == 1 {
+			for i := n - 1; i >= 0; i-- {
+				ilda := i * lda
+				xi := x[i]
+				f64.AxpyUnitary(xi, a[ilda+i+1:ilda+n], x[i+1:n])
+				if nonUnit {
+					x[i] *= a[ilda+i]
+				}
+			}
+			return
+		}
+		ix := kx + (n-1)*incX
+		for i := n - 1; i >= 0; i-- {
+			ilda := i * lda
+			xi := x[ix]
+			f64.AxpyInc(xi, a[ilda+i+1:ilda+n], x, uintptr(n-i-1), 1, uintptr(incX), 0, uintptr(kx+(i+1)*incX))
+			if nonUnit {
+				x[ix] *= a[ilda+i]
+			}
+			ix -= incX
+		}
+		return
+	}
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			ilda := i * lda
+			xi := x[i]
+			f64.AxpyUnitary(xi, a[ilda:ilda+i], x[:i])
+			if nonUnit {
+				x[i] *= a[i*lda+i]
+			}
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < n; i++ {
+		ilda := i * lda
+		xi := x[ix]
+		f64.AxpyInc(xi, a[ilda:ilda+i], x, uintptr(i), 1, uintptr(incX), 0, uintptr(kx))
+		if nonUnit {
+			x[ix] *= a[ilda+i]
+		}
+		ix += incX
+	}
+}
+
+// Dtrsv solves one of the systems of equations
+//
+//	A * x = b   if tA == blas.NoTrans
+//	Aᵀ * x = b  if tA == blas.Trans or blas.ConjTrans
+//
+// where A is an n×n triangular matrix, and x and b are vectors.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func (Implementation) Dtrsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, a []float64, lda int, x []float64, incX int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	if n == 1 {
+		if d == blas.NonUnit {
+			x[0] /= a[0]
+		}
+		return
+	}
+
+	var kx int
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	nonUnit := d == blas.NonUnit
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					var sum float64
+					atmp := a[i*lda+i+1 : i*lda+n]
+					for j, v := range atmp {
+						jv := i + j + 1
+						sum += x[jv] * v
+					}
+					x[i] -= sum
+					if nonUnit {
+						x[i] /= a[i*lda+i]
+					}
+				}
+				return
+			}
+			ix := kx + (n-1)*incX
+			for i := n - 1; i >= 0; i-- {
+				var sum float64
+				jx := ix + incX
+				atmp := a[i*lda+i+1 : i*lda+n]
+				for _, v := range atmp {
+					sum += x[jx] * v
+					jx += incX
+				}
+				x[ix] -= sum
+				if nonUnit {
+					x[ix] /= a[i*lda+i]
+				}
+				ix -= incX
+			}
+			return
+		}
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				var sum float64
+				atmp := a[i*lda : i*lda+i]
+				for j, v := range atmp {
+					sum += x[j] * v
+				}
+				x[i] -= sum
+				if nonUnit {
+					x[i] /= a[i*lda+i]
+				}
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			jx := kx
+			var sum float64
+			atmp := a[i*lda : i*lda+i]
+			for _, v := range atmp {
+				sum += x[jx] * v
+				jx += incX
+			}
+			x[ix] -= sum
+			if nonUnit {
+				x[ix] /= a[i*lda+i]
+			}
+			ix += incX
+		}
+		return
+	}
+	// Cases where a is transposed.
+	if ul == blas.Upper {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				if nonUnit {
+					x[i] /= a[i*lda+i]
+				}
+				xi := x[i]
+				atmp := a[i*lda+i+1 : i*lda+n]
+				for j, v := range atmp {
+					jv := j + i + 1
+					x[jv] -= v * xi
+				}
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			if nonUnit {
+				x[ix] /= a[i*lda+i]
+			}
+			xi := x[ix]
+			jx := kx + (i+1)*incX
+			atmp := a[i*lda+i+1 : i*lda+n]
+			for _, v := range atmp {
+				x[jx] -= v * xi
+				jx += incX
+			}
+			ix += incX
+		}
+		return
+	}
+	if incX == 1 {
+		for i := n - 1; i >= 0; i-- {
+			if nonUnit {
+				x[i] /= a[i*lda+i]
+			}
+			xi := x[i]
+			atmp := a[i*lda : i*lda+i]
+			for j, v := range atmp {
+				x[j] -= v * xi
+			}
+		}
+		return
+	}
+	ix := kx + (n-1)*incX
+	for i := n - 1; i >= 0; i-- {
+		if nonUnit {
+			x[ix] /= a[i*lda+i]
+		}
+		xi := x[ix]
+		jx := kx
+		atmp := a[i*lda : i*lda+i]
+		for _, v := range atmp {
+			x[jx] -= v * xi
+			jx += incX
+		}
+		ix -= incX
+	}
+}
+
+// Dsymv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where A is an n×n symmetric matrix, x and y are vectors, and alpha and
+// beta are scalars.
+func (Implementation) Dsymv(ul blas.Uplo, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Set up start points
+	var kx, ky int
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	if incY < 0 {
+		ky = -(n - 1) * incY
+	}
+
+	// Form y = beta * y
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				f64.ScalUnitary(beta, y[:n])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				if incY > 0 {
+					f64.ScalInc(beta, y, uintptr(n), uintptr(incY))
+				} else {
+					f64.ScalInc(beta, y, uintptr(n), uintptr(-incY))
+				}
+			}
+		}
+	}
+
+	if alpha == 0 {
+		return
+	}
+
+	if n == 1 {
+		y[0] += alpha * a[0] * x[0]
+		return
+	}
+
+	if ul == blas.Upper {
+		if incX == 1 {
+			iy := ky
+			for i := 0; i < n; i++ {
+				xv := x[i] * alpha
+				sum := x[i] * a[i*lda+i]
+				jy := ky + (i+1)*incY
+				atmp := a[i*lda+i+1 : i*lda+n]
+				for j, v := range atmp {
+					jp := j + i + 1
+					sum += x[jp] * v
+					y[jy] += xv * v
+					jy += incY
+				}
+				y[iy] += alpha * sum
+				iy += incY
+			}
+			return
+		}
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			xv := x[ix] * alpha
+			sum := x[ix] * a[i*lda+i]
+			jx := kx + (i+1)*incX
+			jy := ky + (i+1)*incY
+			atmp := a[i*lda+i+1 : i*lda+n]
+			for _, v := range atmp {
+				sum += x[jx] * v
+				y[jy] += xv * v
+				jx += incX
+				jy += incY
+			}
+			y[iy] += alpha * sum
+			ix += incX
+			iy += incY
+		}
+		return
+	}
+	// Cases where a is lower triangular.
+	if incX == 1 {
+		iy := ky
+		for i := 0; i < n; i++ {
+			jy := ky
+			xv := alpha * x[i]
+			atmp := a[i*lda : i*lda+i]
+			var sum float64
+			for j, v := range atmp {
+				sum += x[j] * v
+				y[jy] += xv * v
+				jy += incY
+			}
+			sum += x[i] * a[i*lda+i]
+			sum *= alpha
+			y[iy] += sum
+			iy += incY
+		}
+		return
+	}
+	ix := kx
+	iy := ky
+	for i := 0; i < n; i++ {
+		jx := kx
+		jy := ky
+		xv := alpha * x[ix]
+		atmp := a[i*lda : i*lda+i]
+		var sum float64
+		for _, v := range atmp {
+			sum += x[jx] * v
+			y[jy] += xv * v
+			jx += incX
+			jy += incY
+		}
+		sum += x[ix] * a[i*lda+i]
+		sum *= alpha
+		y[iy] += sum
+		ix += incX
+		iy += incY
+	}
+}
+
+// Dtbmv performs one of the matrix-vector operations
+//
+//	x = A * x   if tA == blas.NoTrans
+//	x = Aᵀ * x  if tA == blas.Trans or blas.ConjTrans
+//
+// where A is an n×n triangular band matrix with k+1 diagonals, and x is a vector.
+func (Implementation) Dtbmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n, k int, a []float64, lda int, x []float64, incX int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	var kx int
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+
+	nonunit := d != blas.Unit
+
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					u := min(1+k, n-i)
+					var sum float64
+					atmp := a[i*lda:]
+					xtmp := x[i:]
+					for j := 1; j < u; j++ {
+						sum += xtmp[j] * atmp[j]
+					}
+					if nonunit {
+						sum += xtmp[0] * atmp[0]
+					} else {
+						sum += xtmp[0]
+					}
+					x[i] = sum
+				}
+				return
+			}
+			ix := kx
+			for i := 0; i < n; i++ {
+				u := min(1+k, n-i)
+				var sum float64
+				atmp := a[i*lda:]
+				jx := incX
+				for j := 1; j < u; j++ {
+					sum += x[ix+jx] * atmp[j]
+					jx += incX
+				}
+				if nonunit {
+					sum += x[ix] * atmp[0]
+				} else {
+					sum += x[ix]
+				}
+				x[ix] = sum
+				ix += incX
+			}
+			return
+		}
+		if incX == 1 {
+			for i := n - 1; i >= 0; i-- {
+				l := max(0, k-i)
+				atmp := a[i*lda:]
+				var sum float64
+				for j := l; j < k; j++ {
+					sum += x[i-k+j] * atmp[j]
+				}
+				if nonunit {
+					sum += x[i] * atmp[k]
+				} else {
+					sum += x[i]
+				}
+				x[i] = sum
+			}
+			return
+		}
+		ix := kx + (n-1)*incX
+		for i := n - 1; i >= 0; i-- {
+			l := max(0, k-i)
+			atmp := a[i*lda:]
+			var sum float64
+			jx := l * incX
+			for j := l; j < k; j++ {
+				sum += x[ix-k*incX+jx] * atmp[j]
+				jx += incX
+			}
+			if nonunit {
+				sum += x[ix] * atmp[k]
+			} else {
+				sum += x[ix]
+			}
+			x[ix] = sum
+			ix -= incX
+		}
+		return
+	}
+	if ul == blas.Upper {
+		if incX == 1 {
+			for i := n - 1; i >= 0; i-- {
+				u := k + 1
+				if i < u {
+					u = i + 1
+				}
+				var sum float64
+				for j := 1; j < u; j++ {
+					sum += x[i-j] * a[(i-j)*lda+j]
+				}
+				if nonunit {
+					sum += x[i] * a[i*lda]
+				} else {
+					sum += x[i]
+				}
+				x[i] = sum
+			}
+			return
+		}
+		ix := kx + (n-1)*incX
+		for i := n - 1; i >= 0; i-- {
+			u := k + 1
+			if i < u {
+				u = i + 1
+			}
+			var sum float64
+			jx := incX
+			for j := 1; j < u; j++ {
+				sum += x[ix-jx] * a[(i-j)*lda+j]
+				jx += incX
+			}
+			if nonunit {
+				sum += x[ix] * a[i*lda]
+			} else {
+				sum += x[ix]
+			}
+			x[ix] = sum
+			ix -= incX
+		}
+		return
+	}
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			u := k
+			if i+k >= n {
+				u = n - i - 1
+			}
+			var sum float64
+			for j := 0; j < u; j++ {
+				sum += x[i+j+1] * a[(i+j+1)*lda+k-j-1]
+			}
+			if nonunit {
+				sum += x[i] * a[i*lda+k]
+			} else {
+				sum += x[i]
+			}
+			x[i] = sum
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < n; i++ {
+		u := k
+		if i+k >= n {
+			u = n - i - 1
+		}
+		var (
+			sum float64
+			jx  int
+		)
+		for j := 0; j < u; j++ {
+			sum += x[ix+jx+incX] * a[(i+j+1)*lda+k-j-1]
+			jx += incX
+		}
+		if nonunit {
+			sum += x[ix] * a[i*lda+k]
+		} else {
+			sum += x[ix]
+		}
+		x[ix] = sum
+		ix += incX
+	}
+}
+
+// Dtpmv performs one of the matrix-vector operations
+//
+//	x = A * x   if tA == blas.NoTrans
+//	x = Aᵀ * x  if tA == blas.Trans or blas.ConjTrans
+//
+// where A is an n×n triangular matrix in packed format, and x is a vector.
+func (Implementation) Dtpmv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, ap []float64, x []float64, incX int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	var kx int
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+
+	nonUnit := d == blas.NonUnit
+	var offset int // Offset is the index of (i,i)
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			if incX == 1 {
+				for i := 0; i < n; i++ {
+					xi := x[i]
+					if nonUnit {
+						xi *= ap[offset]
+					}
+					atmp := ap[offset+1 : offset+n-i]
+					xtmp := x[i+1:]
+					for j, v := range atmp {
+						xi += v * xtmp[j]
+					}
+					x[i] = xi
+					offset += n - i
+				}
+				return
+			}
+			ix := kx
+			for i := 0; i < n; i++ {
+				xix := x[ix]
+				if nonUnit {
+					xix *= ap[offset]
+				}
+				atmp := ap[offset+1 : offset+n-i]
+				jx := kx + (i+1)*incX
+				for _, v := range atmp {
+					xix += v * x[jx]
+					jx += incX
+				}
+				x[ix] = xix
+				offset += n - i
+				ix += incX
+			}
+			return
+		}
+		if incX == 1 {
+			offset = n*(n+1)/2 - 1
+			for i := n - 1; i >= 0; i-- {
+				xi := x[i]
+				if nonUnit {
+					xi *= ap[offset]
+				}
+				atmp := ap[offset-i : offset]
+				for j, v := range atmp {
+					xi += v * x[j]
+				}
+				x[i] = xi
+				offset -= i + 1
+			}
+			return
+		}
+		ix := kx + (n-1)*incX
+		offset = n*(n+1)/2 - 1
+		for i := n - 1; i >= 0; i-- {
+			xix := x[ix]
+			if nonUnit {
+				xix *= ap[offset]
+			}
+			atmp := ap[offset-i : offset]
+			jx := kx
+			for _, v := range atmp {
+				xix += v * x[jx]
+				jx += incX
+			}
+			x[ix] = xix
+			offset -= i + 1
+			ix -= incX
+		}
+		return
+	}
+	// Cases where ap is transposed.
+	if ul == blas.Upper {
+		if incX == 1 {
+			offset = n*(n+1)/2 - 1
+			for i := n - 1; i >= 0; i-- {
+				xi := x[i]
+				atmp := ap[offset+1 : offset+n-i]
+				xtmp := x[i+1:]
+				for j, v := range atmp {
+					xtmp[j] += v * xi
+				}
+				if nonUnit {
+					x[i] *= ap[offset]
+				}
+				offset -= n - i + 1
+			}
+			return
+		}
+		ix := kx + (n-1)*incX
+		offset = n*(n+1)/2 - 1
+		for i := n - 1; i >= 0; i-- {
+			xix := x[ix]
+			jx := kx + (i+1)*incX
+			atmp := ap[offset+1 : offset+n-i]
+			for _, v := range atmp {
+				x[jx] += v * xix
+				jx += incX
+			}
+			if nonUnit {
+				x[ix] *= ap[offset]
+			}
+			offset -= n - i + 1
+			ix -= incX
+		}
+		return
+	}
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			xi := x[i]
+			atmp := ap[offset-i : offset]
+			for j, v := range atmp {
+				x[j] += v * xi
+			}
+			if nonUnit {
+				x[i] *= ap[offset]
+			}
+			offset += i + 2
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < n; i++ {
+		xix := x[ix]
+		jx := kx
+		atmp := ap[offset-i : offset]
+		for _, v := range atmp {
+			x[jx] += v * xix
+			jx += incX
+		}
+		if nonUnit {
+			x[ix] *= ap[offset]
+		}
+		ix += incX
+		offset += i + 2
+	}
+}
+
+// Dtbsv solves one of the systems of equations
+//
+//	A * x = b   if tA == blas.NoTrans
+//	Aᵀ * x = b  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// where A is an n×n triangular band matrix with k+1 diagonals, stored in a
+// with row stride lda, and x and b are vectors of n elements with stride incX.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x. If d == blas.Unit the diagonal of A is not read.
+//
+// No test for singularity or near-singularity is included in this routine; such
+// tests must be performed by the caller. Dtbsv panics on invalid arguments.
+func (Implementation) Dtbsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n, k int, a []float64, lda int, x []float64, incX int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible: an empty system leaves x untouched.
+	if n == 0 {
+		return
+	}
+
+	// The length checks come after the n == 0 return, so empty inputs are never rejected as too short.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	var kx int // index in x of logical element 0; nonzero only when incX < 0
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	nonUnit := d == blas.NonUnit
+	// Form x = A^-1 x.
+	// Several cases below use subslices for speed improvement.
+	// The incX != 1 cases usually do not because incX may be negative.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper { // back substitution; row i stores its diagonal at a[i*lda], then the superdiagonals
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					bands := k // off-diagonal entries in row i, clipped at the right edge of the matrix
+					if i+bands >= n {
+						bands = n - i - 1
+					}
+					atmp := a[i*lda+1:]
+					xtmp := x[i+1 : i+bands+1]
+					var sum float64
+					for j, v := range xtmp {
+						sum += v * atmp[j]
+					}
+					x[i] -= sum
+					if nonUnit {
+						x[i] /= a[i*lda]
+					}
+				}
+				return
+			}
+			ix := kx + (n-1)*incX
+			for i := n - 1; i >= 0; i-- {
+				max := k + 1 // stored entries in row i including the diagonal, clipped at the right edge
+				if i+max > n {
+					max = n - i
+				}
+				atmp := a[i*lda:]
+				var (
+					jx  int
+					sum float64
+				)
+				for j := 1; j < max; j++ {
+					jx += incX
+					sum += x[ix+jx] * atmp[j]
+				}
+				x[ix] -= sum
+				if nonUnit {
+					x[ix] /= atmp[0]
+				}
+				ix -= incX
+			}
+			return
+		}
+		if incX == 1 { // lower, not transposed: forward substitution with the diagonal of row i at a[i*lda+k]
+			for i := 0; i < n; i++ {
+				bands := k // off-diagonal entries in row i, clipped at the left edge of the matrix
+				if i-k < 0 {
+					bands = i
+				}
+				atmp := a[i*lda+k-bands:]
+				xtmp := x[i-bands : i]
+				var sum float64
+				for j, v := range xtmp {
+					sum += v * atmp[j]
+				}
+				x[i] -= sum
+				if nonUnit {
+					x[i] /= atmp[bands]
+				}
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			bands := k
+			if i-k < 0 {
+				bands = i
+			}
+			atmp := a[i*lda+k-bands:]
+			var (
+				sum float64
+				jx  int
+			)
+			for j := 0; j < bands; j++ {
+				sum += x[ix-bands*incX+jx] * atmp[j]
+				jx += incX
+			}
+			x[ix] -= sum
+			if nonUnit {
+				x[ix] /= atmp[bands]
+			}
+			ix += incX
+		}
+		return
+	}
+	// Cases where a is transposed: the sums run down column i of the stored band instead of along row i.
+	if ul == blas.Upper {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				bands := k
+				if i-k < 0 {
+					bands = i
+				}
+				var sum float64
+				for j := 0; j < bands; j++ {
+					sum += x[i-bands+j] * a[(i-bands+j)*lda+bands-j]
+				}
+				x[i] -= sum
+				if nonUnit {
+					x[i] /= a[i*lda]
+				}
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			bands := k
+			if i-k < 0 {
+				bands = i
+			}
+			var (
+				sum float64
+				jx  int
+			)
+			for j := 0; j < bands; j++ {
+				sum += x[ix-bands*incX+jx] * a[(i-bands+j)*lda+bands-j]
+				jx += incX
+			}
+			x[ix] -= sum
+			if nonUnit {
+				x[ix] /= a[i*lda]
+			}
+			ix += incX
+		}
+		return
+	}
+	if incX == 1 { // lower, transposed: unknowns are solved from the last to the first
+		for i := n - 1; i >= 0; i-- {
+			bands := k
+			if i+bands >= n {
+				bands = n - i - 1
+			}
+			var sum float64
+			xtmp := x[i+1 : i+1+bands]
+			for j, v := range xtmp {
+				sum += v * a[(i+j+1)*lda+k-j-1]
+			}
+			x[i] -= sum
+			if nonUnit {
+				x[i] /= a[i*lda+k]
+			}
+		}
+		return
+	}
+	ix := kx + (n-1)*incX
+	for i := n - 1; i >= 0; i-- {
+		bands := k
+		if i+bands >= n {
+			bands = n - i - 1
+		}
+		var (
+			sum float64
+			jx  int
+		)
+		for j := 0; j < bands; j++ {
+			sum += x[ix+jx+incX] * a[(i+j+1)*lda+k-j-1]
+			jx += incX
+		}
+		x[ix] -= sum
+		if nonUnit {
+			x[ix] /= a[i*lda+k]
+		}
+		ix -= incX
+	}
+}
+
+// Dsbmv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where A is an n×n symmetric band matrix with k super-diagonals, of which only the band of the triangle
+// selected by ul is read from a (row stride lda), x and y are vectors, and alpha and beta are scalars.
+func (Implementation) Dsbmv(ul blas.Uplo, n, k int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	if lda < k+1 {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible: with no elements there is nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// The length checks come after the n == 0 return, so empty inputs are never rejected as too short.
+	if len(a) < lda*(n-1)+k+1 {
+		panic(shortA)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible: the operation reduces to y = y.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Set up the starting indexes into x and y; they are nonzero only for negative increments.
+	lenX := n
+	lenY := n
+	var kx, ky int
+	if incX < 0 {
+		kx = -(lenX - 1) * incX
+	}
+	if incY < 0 {
+		ky = -(lenY - 1) * incY
+	}
+
+	// Form y = beta * y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				f64.ScalUnitary(beta, y[:n])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				if incY > 0 {
+					f64.ScalInc(beta, y, uintptr(n), uintptr(incY))
+				} else {
+					f64.ScalInc(beta, y, uintptr(n), uintptr(-incY)) // ScalInc takes an unsigned stride, so the magnitude of incY is passed
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // nothing to add: y already holds beta * y
+		return
+	}
+
+	if ul == blas.Upper { // row i stores its diagonal at a[i*lda], followed by up to k superdiagonal elements
+		if incX == 1 {
+			iy := ky
+			for i := 0; i < n; i++ {
+				atmp := a[i*lda:]
+				tmp := alpha * x[i]
+				sum := tmp * atmp[0]
+				u := min(k, n-i-1) // superdiagonal entries of row i that lie inside the matrix
+				jy := incY
+				for j := 1; j <= u; j++ {
+					v := atmp[j]
+					sum += alpha * x[i+j] * v
+					y[iy+jy] += tmp * v // symmetric counterpart: element (i+j, i)
+					jy += incY
+				}
+				y[iy] += sum
+				iy += incY
+			}
+			return
+		}
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			atmp := a[i*lda:]
+			tmp := alpha * x[ix]
+			sum := tmp * atmp[0]
+			u := min(k, n-i-1)
+			jx := incX
+			jy := incY
+			for j := 1; j <= u; j++ {
+				v := atmp[j]
+				sum += alpha * x[ix+jx] * v
+				y[iy+jy] += tmp * v
+				jx += incX
+				jy += incY
+			}
+			y[iy] += sum
+			ix += incX
+			iy += incY
+		}
+		return
+	}
+
+	// Cases where a has bands below the diagonal: row i stores its diagonal at a[i*lda+k], preceded by the subdiagonals.
+	if incX == 1 {
+		iy := ky
+		for i := 0; i < n; i++ {
+			l := max(0, k-i) // first stored column of row i that lies inside the matrix
+			tmp := alpha * x[i]
+			jy := l * incY
+			atmp := a[i*lda:]
+			for j := l; j < k; j++ {
+				v := atmp[j]
+				y[iy] += alpha * v * x[i-k+j]
+				y[iy-k*incY+jy] += tmp * v // symmetric counterpart: element (i-k+j, i)
+				jy += incY
+			}
+			y[iy] += tmp * atmp[k]
+			iy += incY
+		}
+		return
+	}
+	ix := kx
+	iy := ky
+	for i := 0; i < n; i++ {
+		l := max(0, k-i)
+		tmp := alpha * x[ix]
+		jx := l * incX
+		jy := l * incY
+		atmp := a[i*lda:]
+		for j := l; j < k; j++ {
+			v := atmp[j]
+			y[iy] += alpha * v * x[ix-k*incX+jx]
+			y[iy-k*incY+jy] += tmp * v
+			jx += incX
+			jy += incY
+		}
+		y[iy] += tmp * atmp[k]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Dsyr performs the symmetric rank-one update
+//
+//	A += alpha * x * xᵀ
+//
+// where A is an n×n symmetric matrix of which only the triangle selected by ul is updated, x is a vector, and alpha is a scalar.
+func (Implementation) Dsyr(ul blas.Uplo, n int, alpha float64, x []float64, incX int, a []float64, lda int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible: an empty matrix has nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// The length checks come after the n == 0 return, so empty inputs are never rejected as too short.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible: the update term is scaled by zero.
+	if alpha == 0 {
+		return
+	}
+
+	lenX := n
+	var kx int // index in x of logical element 0; nonzero only when incX < 0
+	if incX < 0 {
+		kx = -(lenX - 1) * incX
+	}
+	if ul == blas.Upper { // update row i in columns i..n-1 only
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				tmp := x[i] * alpha
+				if tmp != 0 { // rows whose multiplier is zero are skipped
+					atmp := a[i*lda+i : i*lda+n]
+					xtmp := x[i:n]
+					for j, v := range xtmp {
+						atmp[j] += v * tmp
+					}
+				}
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			tmp := x[ix] * alpha
+			if tmp != 0 {
+				jx := ix // columns start at the diagonal, so x is walked from element i
+				atmp := a[i*lda:]
+				for j := i; j < n; j++ {
+					atmp[j] += x[jx] * tmp
+					jx += incX
+				}
+			}
+			ix += incX
+		}
+		return
+	}
+	// Cases where a is lower triangular: row i is updated in columns 0..i only.
+	if incX == 1 {
+		for i := 0; i < n; i++ {
+			tmp := x[i] * alpha
+			if tmp != 0 {
+				atmp := a[i*lda:]
+				xtmp := x[:i+1]
+				for j, v := range xtmp {
+					atmp[j] += tmp * v
+				}
+			}
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < n; i++ {
+		tmp := x[ix] * alpha
+		if tmp != 0 {
+			atmp := a[i*lda:]
+			jx := kx // columns start at 0, so x is walked from its first logical element
+			for j := 0; j < i+1; j++ {
+				atmp[j] += tmp * x[jx]
+				jx += incX
+			}
+		}
+		ix += incX
+	}
+}
+
+// Dsyr2 performs the symmetric rank-two update
+//
+//	A += alpha * x * yᵀ + alpha * y * xᵀ
+//
+// where A is an n×n symmetric matrix, x and y are vectors, and alpha is a scalar. Only the triangle of A selected by ul is updated.
+func (Implementation) Dsyr2(ul blas.Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible: an empty matrix has nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// The length checks come after the n == 0 return, so empty inputs are never rejected as too short.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(a) < lda*(n-1)+n {
+		panic(shortA)
+	}
+
+	// Quick return if possible: both update terms are scaled by zero.
+	if alpha == 0 {
+		return
+	}
+
+	var ky, kx int // starting indexes into y and x; nonzero only for negative increments
+	if incY < 0 {
+		ky = -(n - 1) * incY
+	}
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	if ul == blas.Upper { // update row i in columns i..n-1 only
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				xi := x[i]
+				yi := y[i]
+				atmp := a[i*lda:]
+				for j := i; j < n; j++ {
+					atmp[j] += alpha * (xi*y[j] + x[j]*yi)
+				}
+			}
+			return
+		}
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			jx := kx + i*incX // column indexes start at the diagonal element of row i
+			jy := ky + i*incY
+			xi := x[ix]
+			yi := y[iy]
+			atmp := a[i*lda:]
+			for j := i; j < n; j++ {
+				atmp[j] += alpha * (xi*y[jy] + x[jx]*yi)
+				jx += incX
+				jy += incY
+			}
+			ix += incX
+			iy += incY
+		}
+		return
+	}
+	if incX == 1 && incY == 1 { // lower triangle: update row i in columns 0..i only
+		for i := 0; i < n; i++ {
+			xi := x[i]
+			yi := y[i]
+			atmp := a[i*lda:]
+			for j := 0; j <= i; j++ {
+				atmp[j] += alpha * (xi*y[j] + x[j]*yi)
+			}
+		}
+		return
+	}
+	ix := kx
+	iy := ky
+	for i := 0; i < n; i++ {
+		jx := kx // column indexes start at the first logical element of x and y
+		jy := ky
+		xi := x[ix]
+		yi := y[iy]
+		atmp := a[i*lda:]
+		for j := 0; j <= i; j++ {
+			atmp[j] += alpha * (xi*y[jy] + x[jx]*yi)
+			jx += incX
+			jy += incY
+		}
+		ix += incX
+		iy += incY
+	}
+}
+
+// Dtpsv solves one of the systems of equations
+//
+//	A * x = b   if tA == blas.NoTrans
+//	Aᵀ * x = b  if tA == blas.Trans or blas.ConjTrans
+//
+// where A is an n×n triangular matrix in packed format (the rows of the ul triangle stored back to back in ap), and x and b are vectors.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x. If d == blas.Unit the diagonal of A is not read.
+//
+// No test for singularity or near-singularity is included in this routine; such
+// tests must be performed by the caller. Dtpsv panics on invalid arguments.
+func (Implementation) Dtpsv(ul blas.Uplo, tA blas.Transpose, d blas.Diag, n int, ap []float64, x []float64, incX int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible: an empty system leaves x untouched.
+	if n == 0 {
+		return
+	}
+
+	// The length checks come after the n == 0 return, so empty inputs are never rejected as too short.
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+
+	var kx int // index in x of logical element 0; nonzero only when incX < 0
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+
+	nonUnit := d == blas.NonUnit
+	var offset int // Offset is the index in ap of element (i,i).
+	if tA == blas.NoTrans {
+		if ul == blas.Upper { // back substitution, starting from the last packed element
+			offset = n*(n+1)/2 - 1
+			if incX == 1 {
+				for i := n - 1; i >= 0; i-- {
+					atmp := ap[offset+1 : offset+n-i] // the n-i-1 elements of row i to the right of the diagonal
+					xtmp := x[i+1:]
+					var sum float64
+					for j, v := range atmp {
+						sum += v * xtmp[j]
+					}
+					x[i] -= sum
+					if nonUnit {
+						x[i] /= ap[offset]
+					}
+					offset -= n - i + 1 // step back over row i-1, which holds n-i+1 elements
+				}
+				return
+			}
+			ix := kx + (n-1)*incX
+			for i := n - 1; i >= 0; i-- {
+				atmp := ap[offset+1 : offset+n-i]
+				jx := kx + (i+1)*incX
+				var sum float64
+				for _, v := range atmp {
+					sum += v * x[jx]
+					jx += incX
+				}
+				x[ix] -= sum
+				if nonUnit {
+					x[ix] /= ap[offset]
+				}
+				ix -= incX
+				offset -= n - i + 1
+			}
+			return
+		}
+		if incX == 1 { // lower, not transposed: forward substitution
+			for i := 0; i < n; i++ {
+				atmp := ap[offset-i : offset] // the i elements of row i to the left of the diagonal
+				var sum float64
+				for j, v := range atmp {
+					sum += v * x[j]
+				}
+				x[i] -= sum
+				if nonUnit {
+					x[i] /= ap[offset]
+				}
+				offset += i + 2 // row i+1 holds i+2 elements, the last being its diagonal
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			jx := kx
+			atmp := ap[offset-i : offset]
+			var sum float64
+			for _, v := range atmp {
+				sum += v * x[jx]
+				jx += incX
+			}
+			x[ix] -= sum
+			if nonUnit {
+				x[ix] /= ap[offset]
+			}
+			ix += incX
+			offset += i + 2
+		}
+		return
+	}
+	// Cases where ap is transposed: each solved x[i] is eliminated from the remaining unknowns using row i of ap.
+	if ul == blas.Upper {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				if nonUnit {
+					x[i] /= ap[offset]
+				}
+				xi := x[i]
+				atmp := ap[offset+1 : offset+n-i]
+				xtmp := x[i+1:]
+				for j, v := range atmp {
+					xtmp[j] -= v * xi
+				}
+				offset += n - i // row i holds n-i elements starting at its diagonal
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			if nonUnit {
+				x[ix] /= ap[offset]
+			}
+			xix := x[ix]
+			atmp := ap[offset+1 : offset+n-i]
+			jx := kx + (i+1)*incX
+			for _, v := range atmp {
+				x[jx] -= v * xix
+				jx += incX
+			}
+			ix += incX
+			offset += n - i
+		}
+		return
+	}
+	if incX == 1 { // lower, transposed: unknowns are solved from the last to the first
+		offset = n*(n+1)/2 - 1
+		for i := n - 1; i >= 0; i-- {
+			if nonUnit {
+				x[i] /= ap[offset]
+			}
+			xi := x[i]
+			atmp := ap[offset-i : offset]
+			for j, v := range atmp {
+				x[j] -= v * xi
+			}
+			offset -= i + 1 // row i holds i+1 elements ending at its diagonal
+		}
+		return
+	}
+	ix := kx + (n-1)*incX
+	offset = n*(n+1)/2 - 1
+	for i := n - 1; i >= 0; i-- {
+		if nonUnit {
+			x[ix] /= ap[offset]
+		}
+		xix := x[ix]
+		atmp := ap[offset-i : offset]
+		jx := kx
+		for _, v := range atmp {
+			x[jx] -= v * xix
+			jx += incX
+		}
+		ix -= incX
+		offset -= i + 1
+	}
+}
+
+// Dspmv performs the matrix-vector operation
+//
+//	y = alpha * A * x + beta * y
+//
+// where A is an n×n symmetric matrix in packed format (the rows of the ul triangle stored back to back in ap),
+// x and y are vectors, and alpha and beta are scalars.
+func (Implementation) Dspmv(ul blas.Uplo, n int, alpha float64, ap []float64, x []float64, incX int, beta float64, y []float64, incY int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible: with no elements there is nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// The length checks come after the n == 0 return, so empty inputs are never rejected as too short.
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	// Quick return if possible: the operation reduces to y = y.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Set up the starting indexes into x and y; they are nonzero only for negative increments.
+	var kx, ky int
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	if incY < 0 {
+		ky = -(n - 1) * incY
+	}
+
+	// Form y = beta * y.
+	if beta != 1 {
+		if incY == 1 {
+			if beta == 0 {
+				for i := range y[:n] {
+					y[i] = 0
+				}
+			} else {
+				f64.ScalUnitary(beta, y[:n])
+			}
+		} else {
+			iy := ky
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					y[iy] = 0
+					iy += incY
+				}
+			} else {
+				if incY > 0 {
+					f64.ScalInc(beta, y, uintptr(n), uintptr(incY))
+				} else {
+					f64.ScalInc(beta, y, uintptr(n), uintptr(-incY)) // ScalInc takes an unsigned stride, so the magnitude of incY is passed
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // nothing to add: y already holds beta * y
+		return
+	}
+
+	if n == 1 { // 1×1 case: ap[0] is the only element, and kx and ky are both zero
+		y[0] += alpha * ap[0] * x[0]
+		return
+	}
+	var offset int // Offset is the index in ap of element (i,i).
+	if ul == blas.Upper {
+		if incX == 1 {
+			iy := ky
+			for i := 0; i < n; i++ {
+				xv := x[i] * alpha
+				sum := ap[offset] * x[i]
+				atmp := ap[offset+1 : offset+n-i] // the n-i-1 elements of row i to the right of the diagonal
+				xtmp := x[i+1:]
+				jy := ky + (i+1)*incY
+				for j, v := range atmp {
+					sum += v * xtmp[j]
+					y[jy] += v * xv // symmetric counterpart below the diagonal
+					jy += incY
+				}
+				y[iy] += alpha * sum
+				iy += incY
+				offset += n - i // row i holds n-i elements starting at its diagonal
+			}
+			return
+		}
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			xv := x[ix] * alpha
+			sum := ap[offset] * x[ix]
+			atmp := ap[offset+1 : offset+n-i]
+			jx := kx + (i+1)*incX
+			jy := ky + (i+1)*incY
+			for _, v := range atmp {
+				sum += v * x[jx]
+				y[jy] += v * xv
+				jx += incX
+				jy += incY
+			}
+			y[iy] += alpha * sum
+			ix += incX
+			iy += incY
+			offset += n - i
+		}
+		return
+	}
+	if incX == 1 { // lower triangle: row i holds i+1 elements ending at its diagonal
+		iy := ky
+		for i := 0; i < n; i++ {
+			xv := x[i] * alpha
+			atmp := ap[offset-i : offset] // the i elements of row i to the left of the diagonal
+			jy := ky
+			var sum float64
+			for j, v := range atmp {
+				sum += v * x[j]
+				y[jy] += v * xv // symmetric counterpart above the diagonal
+				jy += incY
+			}
+			sum += ap[offset] * x[i]
+			y[iy] += alpha * sum
+			iy += incY
+			offset += i + 2 // row i+1 holds i+2 elements, the last being its diagonal
+		}
+		return
+	}
+	ix := kx
+	iy := ky
+	for i := 0; i < n; i++ {
+		xv := x[ix] * alpha
+		atmp := ap[offset-i : offset]
+		jx := kx
+		jy := ky
+		var sum float64
+		for _, v := range atmp {
+			sum += v * x[jx]
+			y[jy] += v * xv
+			jx += incX
+			jy += incY
+		}
+
+		sum += ap[offset] * x[ix]
+		y[iy] += alpha * sum
+		ix += incX
+		iy += incY
+		offset += i + 2
+	}
+}
+
+// Dspr performs the symmetric rank-one operation
+//
+//	A += alpha * x * xᵀ
+//
+// where A is an n×n symmetric matrix in packed format (the rows of the ul triangle stored back to back in ap),
+// x is a vector, and alpha is a scalar.
+func (Implementation) Dspr(ul blas.Uplo, n int, alpha float64, x []float64, incX int, ap []float64) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+
+	// Quick return if possible: an empty matrix has nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// The length checks come after the n == 0 return, so empty inputs are never rejected as too short.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+
+	// Quick return if possible: the update term is scaled by zero.
+	if alpha == 0 {
+		return
+	}
+
+	lenX := n
+	var kx int // index in x of logical element 0; nonzero only when incX < 0
+	if incX < 0 {
+		kx = -(lenX - 1) * incX
+	}
+	var offset int // Offset is the index in ap of element (i,i).
+	if ul == blas.Upper {
+		if incX == 1 {
+			for i := 0; i < n; i++ {
+				atmp := ap[offset:]
+				xv := alpha * x[i]
+				xtmp := x[i:n]
+				for j, v := range xtmp {
+					atmp[j] += xv * v
+				}
+				offset += n - i // row i holds n-i elements starting at its diagonal
+			}
+			return
+		}
+		ix := kx
+		for i := 0; i < n; i++ {
+			jx := kx + i*incX // column indexes start at the diagonal element of row i
+			atmp := ap[offset:]
+			xv := alpha * x[ix]
+			for j := 0; j < n-i; j++ {
+				atmp[j] += xv * x[jx]
+				jx += incX
+			}
+			ix += incX
+			offset += n - i
+		}
+		return
+	}
+	if incX == 1 { // lower triangle: row i holds i+1 elements ending at its diagonal
+		for i := 0; i < n; i++ {
+			atmp := ap[offset-i:]
+			xv := alpha * x[i]
+			xtmp := x[:i+1]
+			for j, v := range xtmp {
+				atmp[j] += xv * v
+			}
+			offset += i + 2 // row i+1 holds i+2 elements, the last being its diagonal
+		}
+		return
+	}
+	ix := kx
+	for i := 0; i < n; i++ {
+		jx := kx // column indexes start at the first logical element of x
+		atmp := ap[offset-i:]
+		xv := alpha * x[ix]
+		for j := 0; j <= i; j++ {
+			atmp[j] += xv * x[jx]
+			jx += incX
+		}
+		ix += incX
+		offset += i + 2
+	}
+}
+
+// Dspr2 performs the symmetric rank-2 update
+//
+//	A += alpha * x * yᵀ + alpha * y * xᵀ
+//
+// where A is an n×n symmetric matrix in packed format (the rows of the ul triangle stored back to back in ap),
+// x and y are vectors, and alpha is a scalar.
+func (Implementation) Dspr2(ul blas.Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, ap []float64) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+
+	// Quick return if possible: an empty matrix has nothing to update.
+	if n == 0 {
+		return
+	}
+
+	// The length checks come after the n == 0 return, so empty inputs are never rejected as too short.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if len(ap) < n*(n+1)/2 {
+		panic(shortAP)
+	}
+
+	// Quick return if possible: both update terms are scaled by zero.
+	if alpha == 0 {
+		return
+	}
+
+	var ky, kx int // starting indexes into y and x; nonzero only for negative increments
+	if incY < 0 {
+		ky = -(n - 1) * incY
+	}
+	if incX < 0 {
+		kx = -(n - 1) * incX
+	}
+	var offset int // Offset is the index in ap of element (i,i).
+	if ul == blas.Upper {
+		if incX == 1 && incY == 1 {
+			for i := 0; i < n; i++ {
+				atmp := ap[offset:]
+				xi := x[i]
+				yi := y[i]
+				xtmp := x[i:n]
+				ytmp := y[i:n]
+				for j, v := range xtmp {
+					atmp[j] += alpha * (xi*ytmp[j] + v*yi)
+				}
+				offset += n - i // row i holds n-i elements starting at its diagonal
+			}
+			return
+		}
+		ix := kx
+		iy := ky
+		for i := 0; i < n; i++ {
+			jx := kx + i*incX // column indexes start at the diagonal element of row i
+			jy := ky + i*incY
+			atmp := ap[offset:]
+			xi := x[ix]
+			yi := y[iy]
+			for j := 0; j < n-i; j++ {
+				atmp[j] += alpha * (xi*y[jy] + x[jx]*yi)
+				jx += incX
+				jy += incY
+			}
+			ix += incX
+			iy += incY
+			offset += n - i
+		}
+		return
+	}
+	if incX == 1 && incY == 1 { // lower triangle: row i holds i+1 elements ending at its diagonal
+		for i := 0; i < n; i++ {
+			atmp := ap[offset-i:]
+			xi := x[i]
+			yi := y[i]
+			xtmp := x[:i+1]
+			for j, v := range xtmp {
+				atmp[j] += alpha * (xi*y[j] + v*yi)
+			}
+			offset += i + 2 // row i+1 holds i+2 elements, the last being its diagonal
+		}
+		return
+	}
+	ix := kx
+	iy := ky
+	for i := 0; i < n; i++ {
+		jx := kx // column indexes start at the first logical element of x and y
+		jy := ky
+		atmp := ap[offset-i:]
+		for j := 0; j <= i; j++ {
+			atmp[j] += alpha * (x[ix]*y[jy] + x[jx]*y[iy])
+			jx += incX
+			jy += incY
+		}
+		ix += incX
+		iy += incY
+		offset += i + 2
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go
new file mode 100644
index 00000000000..bfff8c55799
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go
@@ -0,0 +1,1751 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math/cmplx"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c128"
+)
+
+var _ blas.Complex128Level3 = Implementation{} // compile-time check that Implementation satisfies blas.Complex128Level3
+
+// Zgemm performs one of the matrix-matrix operations
+//
+//	C = alpha * op(A) * op(B) + beta * C
+//
+// where op(X) is one of
+//
+//	op(X) = X  or  op(X) = Xᵀ  or  op(X) = Xᴴ,
+//
+// alpha and beta are scalars, and A, B and C are matrices, with op(A) an m×k matrix,
+// op(B) a k×n matrix and C an m×n matrix. The row strides of a, b and c are lda, ldb and ldc.
+func (Implementation) Zgemm(tA, tB blas.Transpose, m, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) {
+	switch tA {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch tB {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	}
+	rowA, colA := m, k // stored dimensions of a; swapped below when a is used transposed
+	if tA != blas.NoTrans {
+		rowA, colA = k, m
+	}
+	if lda < max(1, colA) {
+		panic(badLdA)
+	}
+	rowB, colB := k, n // stored dimensions of b; swapped below when b is used transposed
+	if tB != blas.NoTrans {
+		rowB, colB = n, k
+	}
+	if ldb < max(1, colB) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible: C has no elements.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// The length checks come after the empty-C return, so empty inputs are never rejected as too short.
+	if len(a) < (rowA-1)*lda+colA {
+		panic(shortA)
+	}
+	if len(b) < (rowB-1)*ldb+colB {
+		panic(shortB)
+	}
+	if len(c) < (m-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: the product term vanishes and beta leaves C unchanged.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 { // only the beta * C term remains
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					c[i*ldc+j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					c[i*ldc+j] *= beta
+				}
+			}
+		}
+		return
+	}
+
+	switch tA {
+	case blas.NoTrans: // row i of C is scaled by beta first, then the products are accumulated into it
+		switch tB {
+		case blas.NoTrans:
+			// Form  C = alpha * A * B + beta * C.
+			for i := 0; i < m; i++ {
+				switch {
+				case beta == 0:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] = 0
+					}
+				case beta != 1:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] *= beta
+					}
+				}
+				for l := 0; l < k; l++ {
+					tmp := alpha * a[i*lda+l]
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] += tmp * b[l*ldb+j]
+					}
+				}
+			}
+		case blas.Trans:
+			// Form  C = alpha * A * Bᵀ + beta * C.
+			for i := 0; i < m; i++ {
+				switch {
+				case beta == 0:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] = 0
+					}
+				case beta != 1:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] *= beta
+					}
+				}
+				for l := 0; l < k; l++ {
+					tmp := alpha * a[i*lda+l]
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] += tmp * b[j*ldb+l]
+					}
+				}
+			}
+		case blas.ConjTrans:
+			// Form  C = alpha * A * Bᴴ + beta * C.
+			for i := 0; i < m; i++ {
+				switch {
+				case beta == 0:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] = 0
+					}
+				case beta != 1:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] *= beta
+					}
+				}
+				for l := 0; l < k; l++ {
+					tmp := alpha * a[i*lda+l]
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] += tmp * cmplx.Conj(b[j*ldb+l])
+					}
+				}
+			}
+		}
+	case blas.Trans: // each element of C is computed as a dot product over l and then combined with beta * C
+		switch tB {
+		case blas.NoTrans:
+			// Form  C = alpha * Aᵀ * B + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex128
+					for l := 0; l < k; l++ {
+						tmp += a[l*lda+i] * b[l*ldb+j]
+					}
+					if beta == 0 { // C is overwritten, so its previous contents are ignored
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		case blas.Trans:
+			// Form  C = alpha * Aᵀ * Bᵀ + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex128
+					for l := 0; l < k; l++ {
+						tmp += a[l*lda+i] * b[j*ldb+l]
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		case blas.ConjTrans:
+			// Form  C = alpha * Aᵀ * Bᴴ + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex128
+					for l := 0; l < k; l++ {
+						tmp += a[l*lda+i] * cmplx.Conj(b[j*ldb+l])
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		}
+	case blas.ConjTrans: // as for blas.Trans, with the elements of a conjugated as they are read
+		switch tB {
+		case blas.NoTrans:
+			// Form  C = alpha * Aᴴ * B + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex128
+					for l := 0; l < k; l++ {
+						tmp += cmplx.Conj(a[l*lda+i]) * b[l*ldb+j]
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		case blas.Trans:
+			// Form  C = alpha * Aᴴ * Bᵀ + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex128
+					for l := 0; l < k; l++ {
+						tmp += cmplx.Conj(a[l*lda+i]) * b[j*ldb+l]
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		case blas.ConjTrans:
+			// Form  C = alpha * Aᴴ * Bᴴ + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex128
+					for l := 0; l < k; l++ {
+						tmp += cmplx.Conj(a[l*lda+i]) * cmplx.Conj(b[j*ldb+l])
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		}
+	}
+}
+
+// Zhemm performs one of the matrix-matrix operations
+//
+//	C = alpha*A*B + beta*C  if side == blas.Left
+//	C = alpha*B*A + beta*C  if side == blas.Right
+//
+// where alpha and beta are scalars, A is an m×m or n×n hermitian matrix and B
+// and C are m×n matrices. Only the triangle of A selected by uplo is read, and the
+// imaginary parts of its diagonal elements are assumed to be zero.
+func (Implementation) Zhemm(side blas.Side, uplo blas.Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) {
+	na := m // order of A: m when A multiplies from the left, n when it multiplies from the right
+	if side == blas.Right {
+		na = n
+	}
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, na):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible: C has no elements.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// The length checks come after the empty-C return, so empty inputs are never rejected as too short.
+	if len(a) < lda*(na-1)+na {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+	if len(c) < ldc*(m-1)+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: the operation reduces to C = C.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	if alpha == 0 { // only the beta * C term remains
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				for j := range ci {
+					ci[j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				c128.ScalUnitary(beta, ci)
+			}
+		}
+		return
+	}
+
+	if side == blas.Left {
+		// Form  C = alpha*A*B + beta*C.
+		for i := 0; i < m; i++ {
+			atmp := alpha * complex(real(a[i*lda+i]), 0) // the imaginary part of the diagonal element is ignored
+			bi := b[i*ldb : i*ldb+n]
+			ci := c[i*ldc : i*ldc+n]
+			if beta == 0 {
+				for j, bij := range bi {
+					ci[j] = atmp * bij
+				}
+			} else {
+				for j, bij := range bi {
+					ci[j] = atmp*bij + beta*ci[j]
+				}
+			}
+			if uplo == blas.Upper { // elements left of the diagonal are the conjugates of stored column i
+				for k := 0; k < i; k++ {
+					atmp = alpha * cmplx.Conj(a[k*lda+i])
+					c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+				for k := i + 1; k < m; k++ {
+					atmp = alpha * a[i*lda+k]
+					c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+			} else { // elements right of the diagonal are the conjugates of stored column i
+				for k := 0; k < i; k++ {
+					atmp = alpha * a[i*lda+k]
+					c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+				for k := i + 1; k < m; k++ {
+					atmp = alpha * cmplx.Conj(a[k*lda+i])
+					c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*B*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < m; i++ {
+				for j := n - 1; j >= 0; j-- { // right to left: c[i,j] is scaled by beta before later iterations accumulate into it
+					abij := alpha * b[i*ldb+j]
+					aj := a[j*lda+j+1 : j*lda+n]
+					bi := b[i*ldb+j+1 : i*ldb+n]
+					ci := c[i*ldc+j+1 : i*ldc+n]
+					var tmp complex128
+					for k, ajk := range aj {
+						ci[k] += abij * ajk
+						tmp += bi[k] * cmplx.Conj(ajk)
+					}
+					ajj := complex(real(a[j*lda+j]), 0) // the imaginary part of the diagonal element is ignored
+					if beta == 0 {
+						c[i*ldc+j] = abij*ajj + alpha*tmp
+					} else {
+						c[i*ldc+j] = abij*ajj + alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ { // left to right: c[i,j] is scaled by beta before later iterations accumulate into it
+					abij := alpha * b[i*ldb+j]
+					aj := a[j*lda : j*lda+j]
+					bi := b[i*ldb : i*ldb+j]
+					ci := c[i*ldc : i*ldc+j]
+					var tmp complex128
+					for k, ajk := range aj {
+						ci[k] += abij * ajk
+						tmp += bi[k] * cmplx.Conj(ajk)
+					}
+					ajj := complex(real(a[j*lda+j]), 0) // the imaginary part of the diagonal element is ignored
+					if beta == 0 {
+						c[i*ldc+j] = abij*ajj + alpha*tmp
+					} else {
+						c[i*ldc+j] = abij*ajj + alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		}
+	}
+}
+
+// Zherk performs one of the hermitian rank-k operations
+//
+//	C = alpha*A*Aᴴ + beta*C  if trans == blas.NoTrans
+//	C = alpha*Aᴴ*A + beta*C  if trans == blas.ConjTrans
+//
+// where alpha and beta are real scalars, C is an n×n hermitian matrix and A is
+// an n×k matrix in the first case and a k×n matrix in the second case.
+//
+// The imaginary parts of the diagonal elements of C are assumed to be zero, and
+// on return they will be set to zero. Only the triangle of C selected by uplo is read and written.
+func (Implementation) Zherk(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha float64, a []complex128, lda int, beta float64, c []complex128, ldc int) {
+	var rowA, colA int // stored dimensions of A: n×k for NoTrans, k×n for ConjTrans
+	switch trans {
+	default: // any value other than NoTrans or ConjTrans (blas.Trans included) is rejected
+		panic(badTranspose)
+	case blas.NoTrans:
+		rowA, colA = n, k
+	case blas.ConjTrans:
+		rowA, colA = k, n
+	}
+	switch {
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < max(1, colA):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible: C is empty, so the slice lengths are not checked.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (rowA-1)*lda+colA {
+		panic(shortA)
+	}
+	if len(c) < (n-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: the update term vanishes and beta == 1, so C is left untouched.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 { // only beta*C remains: zero or scale the selected triangle without reading A
+		if uplo == blas.Upper {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					ci[0] = complex(beta*real(ci[0]), 0) // diagonal: scale the real part, force the imaginary part to zero
+					if i != n-1 { // the last row has no off-diagonal elements in the upper triangle
+						c128.DscalUnitary(beta, ci[1:])
+					}
+				}
+			}
+		} else {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					if i != 0 { // the first row has no off-diagonal elements in the lower triangle
+						c128.DscalUnitary(beta, ci[:i])
+					}
+					ci[i] = complex(beta*real(ci[i]), 0) // diagonal: scale the real part, force the imaginary part to zero
+				}
+			}
+		}
+		return
+	}
+
+	calpha := complex(alpha, 0) // alpha as a complex value for the complex products below
+	if trans == blas.NoTrans {
+		// Form  C = alpha*A*Aᴴ + beta*C.
+		cbeta := complex(beta, 0)
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n] // row i of C from the diagonal to the last column
+				ai := a[i*lda : i*lda+k]
+				switch {
+				case beta == 0:
+					// Handle the i-th diagonal element of C.
+					ci[0] = complex(alpha*real(c128.DotcUnitary(ai, ai)), 0)
+					// Handle the remaining elements on the i-th row of C.
+					for jc := range ci[1:] {
+						j := i + 1 + jc // column index in C, and row index in A
+						ci[jc+1] = calpha * c128.DotcUnitary(a[j*lda:j*lda+k], ai)
+					}
+				case beta != 1: // general beta, neither 0 nor 1
+					cii := calpha*c128.DotcUnitary(ai, ai) + cbeta*ci[0]
+					ci[0] = complex(real(cii), 0)
+					for jc, cij := range ci[1:] {
+						j := i + 1 + jc
+						ci[jc+1] = calpha*c128.DotcUnitary(a[j*lda:j*lda+k], ai) + cbeta*cij
+					}
+				default: // beta == 1: accumulate into C without scaling it
+					cii := calpha*c128.DotcUnitary(ai, ai) + ci[0]
+					ci[0] = complex(real(cii), 0)
+					for jc, cij := range ci[1:] {
+						j := i + 1 + jc
+						ci[jc+1] = calpha*c128.DotcUnitary(a[j*lda:j*lda+k], ai) + cij
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1] // row i of C from the first column to the diagonal
+				ai := a[i*lda : i*lda+k]
+				switch {
+				case beta == 0:
+					// Handle the first i-1 elements on the i-th row of C.
+					for j := range ci[:i] {
+						ci[j] = calpha * c128.DotcUnitary(a[j*lda:j*lda+k], ai)
+					}
+					// Handle the i-th diagonal element of C.
+					ci[i] = complex(alpha*real(c128.DotcUnitary(ai, ai)), 0)
+				case beta != 1: // general beta, neither 0 nor 1
+					for j, cij := range ci[:i] {
+						ci[j] = calpha*c128.DotcUnitary(a[j*lda:j*lda+k], ai) + cbeta*cij
+					}
+					cii := calpha*c128.DotcUnitary(ai, ai) + cbeta*ci[i]
+					ci[i] = complex(real(cii), 0)
+				default: // beta == 1: accumulate into C without scaling it
+					for j, cij := range ci[:i] {
+						ci[j] = calpha*c128.DotcUnitary(a[j*lda:j*lda+k], ai) + cij
+					}
+					cii := calpha*c128.DotcUnitary(ai, ai) + ci[i]
+					ci[i] = complex(real(cii), 0)
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*Aᴴ*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				switch { // apply beta to row i first; the rank-k update is accumulated afterwards
+				case beta == 0:
+					for jc := range ci {
+						ci[jc] = 0
+					}
+				case beta != 1:
+					c128.DscalUnitary(beta, ci)
+					ci[0] = complex(real(ci[0]), 0)
+				default:
+					ci[0] = complex(real(ci[0]), 0)
+				}
+				for j := 0; j < k; j++ {
+					aji := cmplx.Conj(a[j*lda+i])
+					if aji != 0 { // skip the row update when the multiplier is exactly zero
+						c128.AxpyUnitary(calpha*aji, a[j*lda+i:j*lda+n], ci)
+					}
+				}
+				c[i*ldc+i] = complex(real(c[i*ldc+i]), 0) // discard any imaginary part accumulated on the diagonal
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				switch { // apply beta to row i first; the rank-k update is accumulated afterwards
+				case beta == 0:
+					for j := range ci {
+						ci[j] = 0
+					}
+				case beta != 1:
+					c128.DscalUnitary(beta, ci)
+					ci[i] = complex(real(ci[i]), 0)
+				default:
+					ci[i] = complex(real(ci[i]), 0)
+				}
+				for j := 0; j < k; j++ {
+					aji := cmplx.Conj(a[j*lda+i])
+					if aji != 0 { // skip the row update when the multiplier is exactly zero
+						c128.AxpyUnitary(calpha*aji, a[j*lda:j*lda+i+1], ci)
+					}
+				}
+				c[i*ldc+i] = complex(real(c[i*ldc+i]), 0) // discard any imaginary part accumulated on the diagonal
+			}
+		}
+	}
+}
+
+// Zher2k performs one of the hermitian rank-2k operations
+//
+//	C = alpha*A*Bᴴ + conj(alpha)*B*Aᴴ + beta*C  if trans == blas.NoTrans
+//	C = alpha*Aᴴ*B + conj(alpha)*Bᴴ*A + beta*C  if trans == blas.ConjTrans
+//
+// where alpha and beta are scalars with beta real, C is an n×n hermitian matrix
+// and A and B are n×k matrices in the first case and k×n matrices in the second case.
+//
+// The imaginary parts of the diagonal elements of C are assumed to be zero, and
+// on return they will be set to zero. Only the triangle of C selected by uplo is read and written.
+func (Implementation) Zher2k(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta float64, c []complex128, ldc int) {
+	var row, col int // stored dimensions shared by A and B: n×k for NoTrans, k×n for ConjTrans
+	switch trans {
+	default: // any value other than NoTrans or ConjTrans (blas.Trans included) is rejected
+		panic(badTranspose)
+	case blas.NoTrans:
+		row, col = n, k
+	case blas.ConjTrans:
+		row, col = k, n
+	}
+	switch {
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < max(1, col):
+		panic(badLdA)
+	case ldb < max(1, col):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible: C is empty, so the slice lengths are not checked.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (row-1)*lda+col {
+		panic(shortA)
+	}
+	if len(b) < (row-1)*ldb+col {
+		panic(shortB)
+	}
+	if len(c) < (n-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: the update term vanishes and beta == 1, so C is left untouched.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 { // only beta*C remains: zero or scale the selected triangle without reading A or B
+		if uplo == blas.Upper {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					ci[0] = complex(beta*real(ci[0]), 0) // diagonal: scale the real part, force the imaginary part to zero
+					if i != n-1 { // the last row has no off-diagonal elements in the upper triangle
+						c128.DscalUnitary(beta, ci[1:])
+					}
+				}
+			}
+		} else {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					if i != 0 { // the first row has no off-diagonal elements in the lower triangle
+						c128.DscalUnitary(beta, ci[:i])
+					}
+					ci[i] = complex(beta*real(ci[i]), 0) // diagonal: scale the real part, force the imaginary part to zero
+				}
+			}
+		}
+		return
+	}
+
+	conjalpha := cmplx.Conj(alpha)
+	cbeta := complex(beta, 0) // beta as a complex value for the complex products below
+	if trans == blas.NoTrans {
+		// Form  C = alpha*A*Bᴴ + conj(alpha)*B*Aᴴ + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i+1 : i*ldc+n] // strictly upper part of row i; the diagonal is handled via c[i*ldc+i]
+				ai := a[i*lda : i*lda+k]
+				bi := b[i*ldb : i*ldb+k]
+				if beta == 0 {
+					cii := alpha*c128.DotcUnitary(bi, ai) + conjalpha*c128.DotcUnitary(ai, bi)
+					c[i*ldc+i] = complex(real(cii), 0) // keep only the real part on the diagonal
+					for jc := range ci {
+						j := i + 1 + jc // column index in C, and row index in A and B
+						ci[jc] = alpha*c128.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c128.DotcUnitary(a[j*lda:j*lda+k], bi)
+					}
+				} else {
+					cii := alpha*c128.DotcUnitary(bi, ai) + conjalpha*c128.DotcUnitary(ai, bi) + cbeta*c[i*ldc+i]
+					c[i*ldc+i] = complex(real(cii), 0) // keep only the real part on the diagonal
+					for jc, cij := range ci {
+						j := i + 1 + jc
+						ci[jc] = alpha*c128.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c128.DotcUnitary(a[j*lda:j*lda+k], bi) + cbeta*cij
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i] // strictly lower part of row i; the diagonal is handled via c[i*ldc+i]
+				ai := a[i*lda : i*lda+k]
+				bi := b[i*ldb : i*ldb+k]
+				if beta == 0 {
+					for j := range ci {
+						ci[j] = alpha*c128.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c128.DotcUnitary(a[j*lda:j*lda+k], bi)
+					}
+					cii := alpha*c128.DotcUnitary(bi, ai) + conjalpha*c128.DotcUnitary(ai, bi)
+					c[i*ldc+i] = complex(real(cii), 0) // keep only the real part on the diagonal
+				} else {
+					for j, cij := range ci {
+						ci[j] = alpha*c128.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c128.DotcUnitary(a[j*lda:j*lda+k], bi) + cbeta*cij
+					}
+					cii := alpha*c128.DotcUnitary(bi, ai) + conjalpha*c128.DotcUnitary(ai, bi) + cbeta*c[i*ldc+i]
+					c[i*ldc+i] = complex(real(cii), 0) // keep only the real part on the diagonal
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*Aᴴ*B + conj(alpha)*Bᴴ*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				switch { // apply beta to row i first; the rank-2k update is accumulated afterwards
+				case beta == 0:
+					for jc := range ci {
+						ci[jc] = 0
+					}
+				case beta != 1:
+					c128.DscalUnitary(beta, ci)
+					ci[0] = complex(real(ci[0]), 0)
+				default:
+					ci[0] = complex(real(ci[0]), 0)
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					bji := b[j*ldb+i]
+					if aji != 0 { // skip the row update when the multiplier is exactly zero
+						c128.AxpyUnitary(alpha*cmplx.Conj(aji), b[j*ldb+i:j*ldb+n], ci)
+					}
+					if bji != 0 {
+						c128.AxpyUnitary(conjalpha*cmplx.Conj(bji), a[j*lda+i:j*lda+n], ci)
+					}
+				}
+				ci[0] = complex(real(ci[0]), 0) // discard any imaginary part accumulated on the diagonal
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				switch { // apply beta to row i first; the rank-2k update is accumulated afterwards
+				case beta == 0:
+					for j := range ci {
+						ci[j] = 0
+					}
+				case beta != 1:
+					c128.DscalUnitary(beta, ci)
+					ci[i] = complex(real(ci[i]), 0)
+				default:
+					ci[i] = complex(real(ci[i]), 0)
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					bji := b[j*ldb+i]
+					if aji != 0 { // skip the row update when the multiplier is exactly zero
+						c128.AxpyUnitary(alpha*cmplx.Conj(aji), b[j*ldb:j*ldb+i+1], ci)
+					}
+					if bji != 0 {
+						c128.AxpyUnitary(conjalpha*cmplx.Conj(bji), a[j*lda:j*lda+i+1], ci)
+					}
+				}
+				ci[i] = complex(real(ci[i]), 0) // discard any imaginary part accumulated on the diagonal
+			}
+		}
+	}
+}
+
+// Zsymm performs one of the matrix-matrix operations
+//
+//	C = alpha*A*B + beta*C  if side == blas.Left
+//	C = alpha*B*A + beta*C  if side == blas.Right
+//
+// where alpha and beta are scalars, A is an m×m or n×n symmetric matrix and B
+// and C are m×n matrices. Only the triangle of A selected by uplo is read.
+func (Implementation) Zsymm(side blas.Side, uplo blas.Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) {
+	na := m // order of A: m when side == blas.Left, n when side == blas.Right
+	if side == blas.Right {
+		na = n
+	}
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, na):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible: B and C are empty, so the slice lengths are not checked.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(na-1)+na {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+	if len(c) < ldc*(m-1)+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: the product term vanishes and beta == 1, so C is left untouched.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	if alpha == 0 { // only beta*C remains: zero or scale C without reading A or B
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				for j := range ci {
+					ci[j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				c128.ScalUnitary(beta, ci)
+			}
+		}
+		return
+	}
+
+	if side == blas.Left {
+		// Form  C = alpha*A*B + beta*C.
+		for i := 0; i < m; i++ {
+			atmp := alpha * a[i*lda+i] // start row i of C with the diagonal contribution of A
+			bi := b[i*ldb : i*ldb+n]
+			ci := c[i*ldc : i*ldc+n]
+			if beta == 0 {
+				for j, bij := range bi {
+					ci[j] = atmp * bij
+				}
+			} else {
+				for j, bij := range bi {
+					ci[j] = atmp*bij + beta*ci[j]
+				}
+			}
+			if uplo == blas.Upper {
+				for k := 0; k < i; k++ {
+					atmp = alpha * a[k*lda+i] // element (i,k) with k < i is read from its mirror (k,i) in the upper triangle
+					c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+				for k := i + 1; k < m; k++ {
+					atmp = alpha * a[i*lda+k]
+					c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+			} else {
+				for k := 0; k < i; k++ {
+					atmp = alpha * a[i*lda+k]
+					c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+				for k := i + 1; k < m; k++ {
+					atmp = alpha * a[k*lda+i] // element (i,k) with k > i is read from its mirror (k,i) in the lower triangle
+					c128.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*B*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < m; i++ {
+				for j := n - 1; j >= 0; j-- { // descending j: columns j+1.. of row i already had beta applied
+					abij := alpha * b[i*ldb+j]
+					aj := a[j*lda+j+1 : j*lda+n]
+					bi := b[i*ldb+j+1 : i*ldb+n]
+					ci := c[i*ldc+j+1 : i*ldc+n]
+					var tmp complex128 // sum of B[i][j+1+k]*A[j][j+1+k], the part of C[i][j] from columns right of j
+					for k, ajk := range aj {
+						ci[k] += abij * ajk
+						tmp += bi[k] * ajk
+					}
+					if beta == 0 {
+						c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp
+					} else {
+						c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ { // ascending j: columns ..j-1 of row i already had beta applied
+					abij := alpha * b[i*ldb+j]
+					aj := a[j*lda : j*lda+j]
+					bi := b[i*ldb : i*ldb+j]
+					ci := c[i*ldc : i*ldc+j]
+					var tmp complex128 // sum of B[i][k]*A[j][k] for k < j, the part of C[i][j] from columns left of j
+					for k, ajk := range aj {
+						ci[k] += abij * ajk
+						tmp += bi[k] * ajk
+					}
+					if beta == 0 {
+						c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp
+					} else {
+						c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		}
+	}
+}
+
+// Zsyrk performs one of the symmetric rank-k operations
+//
+//	C = alpha*A*Aᵀ + beta*C  if trans == blas.NoTrans
+//	C = alpha*Aᵀ*A + beta*C  if trans == blas.Trans
+//
+// where alpha and beta are scalars, C is an n×n symmetric matrix and A is
+// an n×k matrix in the first case and a k×n matrix in the second case. Only the triangle of C selected by uplo is read and written.
+func (Implementation) Zsyrk(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex128, a []complex128, lda int, beta complex128, c []complex128, ldc int) {
+	var rowA, colA int // stored dimensions of A: n×k for NoTrans, k×n for Trans
+	switch trans {
+	default: // any value other than NoTrans or Trans (blas.ConjTrans included) is rejected
+		panic(badTranspose)
+	case blas.NoTrans:
+		rowA, colA = n, k
+	case blas.Trans:
+		rowA, colA = k, n
+	}
+	switch {
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < max(1, colA):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible: C is empty, so the slice lengths are not checked.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (rowA-1)*lda+colA {
+		panic(shortA)
+	}
+	if len(c) < (n-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: the update term vanishes and beta == 1, so C is left untouched.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 { // only beta*C remains: zero or scale the selected triangle without reading A
+		if uplo == blas.Upper {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					c128.ScalUnitary(beta, ci)
+				}
+			}
+		} else {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					c128.ScalUnitary(beta, ci)
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.NoTrans {
+		// Form  C = alpha*A*Aᵀ + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n] // row i of C from the diagonal to the last column
+				ai := a[i*lda : i*lda+k]
+				if beta == 0 {
+					for jc := range ci {
+						j := i + jc // column index in C, and row index in A
+						ci[jc] = alpha * c128.DotuUnitary(ai, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, cij := range ci {
+						j := i + jc
+						ci[jc] = beta*cij + alpha*c128.DotuUnitary(ai, a[j*lda:j*lda+k])
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1] // row i of C from the first column to the diagonal
+				ai := a[i*lda : i*lda+k]
+				if beta == 0 {
+					for j := range ci {
+						ci[j] = alpha * c128.DotuUnitary(ai, a[j*lda:j*lda+k])
+					}
+				} else {
+					for j, cij := range ci {
+						ci[j] = beta*cij + alpha*c128.DotuUnitary(ai, a[j*lda:j*lda+k])
+					}
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*Aᵀ*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				switch { // apply beta to row i first (nothing to do when beta == 1); the update is accumulated afterwards
+				case beta == 0:
+					for jc := range ci {
+						ci[jc] = 0
+					}
+				case beta != 1:
+					for jc := range ci {
+						ci[jc] *= beta
+					}
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					if aji != 0 { // skip the row update when the multiplier is exactly zero
+						c128.AxpyUnitary(alpha*aji, a[j*lda+i:j*lda+n], ci)
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				switch { // apply beta to row i first (nothing to do when beta == 1); the update is accumulated afterwards
+				case beta == 0:
+					for j := range ci {
+						ci[j] = 0
+					}
+				case beta != 1:
+					for j := range ci {
+						ci[j] *= beta
+					}
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					if aji != 0 { // skip the row update when the multiplier is exactly zero
+						c128.AxpyUnitary(alpha*aji, a[j*lda:j*lda+i+1], ci)
+					}
+				}
+			}
+		}
+	}
+}
+
+// Zsyr2k performs one of the symmetric rank-2k operations
+//
+//	C = alpha*A*Bᵀ + alpha*B*Aᵀ + beta*C  if trans == blas.NoTrans
+//	C = alpha*Aᵀ*B + alpha*Bᵀ*A + beta*C  if trans == blas.Trans
+//
+// where alpha and beta are scalars, C is an n×n symmetric matrix and A and B
+// are n×k matrices in the first case and k×n matrices in the second case. Only the triangle of C selected by uplo is read and written.
+func (Implementation) Zsyr2k(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int) {
+	var row, col int // stored dimensions shared by A and B: n×k for NoTrans, k×n for Trans
+	switch trans {
+	default: // any value other than NoTrans or Trans (blas.ConjTrans included) is rejected
+		panic(badTranspose)
+	case blas.NoTrans:
+		row, col = n, k
+	case blas.Trans:
+		row, col = k, n
+	}
+	switch {
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < max(1, col):
+		panic(badLdA)
+	case ldb < max(1, col):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible: C is empty, so the slice lengths are not checked.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (row-1)*lda+col {
+		panic(shortA)
+	}
+	if len(b) < (row-1)*ldb+col {
+		panic(shortB)
+	}
+	if len(c) < (n-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: the update term vanishes and beta == 1, so C is left untouched.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 { // only beta*C remains: zero or scale the selected triangle without reading A or B
+		if uplo == blas.Upper {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					c128.ScalUnitary(beta, ci)
+				}
+			}
+		} else {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					c128.ScalUnitary(beta, ci)
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.NoTrans {
+		// Form  C = alpha*A*Bᵀ + alpha*B*Aᵀ + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n] // row i of C from the diagonal to the last column
+				ai := a[i*lda : i*lda+k]
+				bi := b[i*ldb : i*ldb+k]
+				if beta == 0 {
+					for jc := range ci {
+						j := i + jc // column index in C, and row index in A and B
+						ci[jc] = alpha*c128.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c128.DotuUnitary(bi, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, cij := range ci {
+						j := i + jc
+						ci[jc] = alpha*c128.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c128.DotuUnitary(bi, a[j*lda:j*lda+k]) + beta*cij
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1] // row i of C from the first column to the diagonal
+				ai := a[i*lda : i*lda+k]
+				bi := b[i*ldb : i*ldb+k]
+				if beta == 0 {
+					for j := range ci {
+						ci[j] = alpha*c128.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c128.DotuUnitary(bi, a[j*lda:j*lda+k])
+					}
+				} else {
+					for j, cij := range ci {
+						ci[j] = alpha*c128.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c128.DotuUnitary(bi, a[j*lda:j*lda+k]) + beta*cij
+					}
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*Aᵀ*B + alpha*Bᵀ*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				switch { // apply beta to row i first (nothing to do when beta == 1); the update is accumulated afterwards
+				case beta == 0:
+					for jc := range ci {
+						ci[jc] = 0
+					}
+				case beta != 1:
+					for jc := range ci {
+						ci[jc] *= beta
+					}
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					bji := b[j*ldb+i]
+					if aji != 0 { // skip the row update when the multiplier is exactly zero
+						c128.AxpyUnitary(alpha*aji, b[j*ldb+i:j*ldb+n], ci)
+					}
+					if bji != 0 {
+						c128.AxpyUnitary(alpha*bji, a[j*lda+i:j*lda+n], ci)
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				switch { // apply beta to row i first (nothing to do when beta == 1); the update is accumulated afterwards
+				case beta == 0:
+					for j := range ci {
+						ci[j] = 0
+					}
+				case beta != 1:
+					for j := range ci {
+						ci[j] *= beta
+					}
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					bji := b[j*ldb+i]
+					if aji != 0 { // skip the row update when the multiplier is exactly zero
+						c128.AxpyUnitary(alpha*aji, b[j*ldb:j*ldb+i+1], ci)
+					}
+					if bji != 0 {
+						c128.AxpyUnitary(alpha*bji, a[j*lda:j*lda+i+1], ci)
+					}
+				}
+			}
+		}
+	}
+}
+
+// Ztrmm performs one of the matrix-matrix operations
+//
+//	B = alpha * op(A) * B  if side == blas.Left,
+//	B = alpha * B * op(A)  if side == blas.Right,
+//
+// where alpha is a scalar, B is an m×n matrix, A is a unit, or non-unit,
+// upper or lower triangular matrix and op(A) is one of
+//
+//	op(A) = A   if trans == blas.NoTrans,
+//	op(A) = Aᵀ  if trans == blas.Trans,
+//	op(A) = Aᴴ  if trans == blas.ConjTrans.
+func (Implementation) Ztrmm(side blas.Side, uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int) {
+	na := m // order of A: m when side == blas.Left, n when side == blas.Right
+	if side == blas.Right {
+		na = n
+	}
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans:
+		panic(badTranspose)
+	case diag != blas.Unit && diag != blas.NonUnit:
+		panic(badDiag)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, na):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	}
+
+	// Quick return if possible: B is empty, so the slice lengths are not checked.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (na-1)*lda+na {
+		panic(shortA)
+	}
+	if len(b) < (m-1)*ldb+n {
+		panic(shortB)
+	}
+
+	// Quick return if possible: with alpha == 0 B is set to zero without reading the elements of A.
+	if alpha == 0 {
+		for i := 0; i < m; i++ {
+			bi := b[i*ldb : i*ldb+n]
+			for j := range bi {
+				bi[j] = 0
+			}
+		}
+		return
+	}
+
+	noConj := trans != blas.ConjTrans
+	noUnit := diag == blas.NonUnit // when false the diagonal of A is never read and is treated as one
+	if side == blas.Left {
+		if trans == blas.NoTrans {
+			// Form B = alpha*A*B.
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ { // ascending i: rows j > i of B read below still hold their input values
+					aii := alpha
+					if noUnit {
+						aii *= a[i*lda+i]
+					}
+					bi := b[i*ldb : i*ldb+n]
+					for j := range bi {
+						bi[j] *= aii
+					}
+					for ja, aij := range a[i*lda+i+1 : i*lda+m] {
+						j := ja + i + 1 // column index in A, and row index in B
+						if aij != 0 {
+							c128.AxpyUnitary(alpha*aij, b[j*ldb:j*ldb+n], bi)
+						}
+					}
+				}
+			} else {
+				for i := m - 1; i >= 0; i-- { // descending i: rows j < i of B read below still hold their input values
+					aii := alpha
+					if noUnit {
+						aii *= a[i*lda+i]
+					}
+					bi := b[i*ldb : i*ldb+n]
+					for j := range bi {
+						bi[j] *= aii
+					}
+					for j, aij := range a[i*lda : i*lda+i] {
+						if aij != 0 {
+							c128.AxpyUnitary(alpha*aij, b[j*ldb:j*ldb+n], bi)
+						}
+					}
+				}
+			}
+		} else {
+			// Form B = alpha*Aᵀ*B  or  B = alpha*Aᴴ*B.
+			if uplo == blas.Upper {
+				for k := m - 1; k >= 0; k-- { // row k is added into rows j > k before row k itself is scaled
+					bk := b[k*ldb : k*ldb+n]
+					for ja, ajk := range a[k*lda+k+1 : k*lda+m] {
+						if ajk == 0 {
+							continue
+						}
+						j := k + 1 + ja
+						if noConj {
+							c128.AxpyUnitary(alpha*ajk, bk, b[j*ldb:j*ldb+n])
+						} else {
+							c128.AxpyUnitary(alpha*cmplx.Conj(ajk), bk, b[j*ldb:j*ldb+n])
+						}
+					}
+					akk := alpha
+					if noUnit {
+						if noConj {
+							akk *= a[k*lda+k]
+						} else {
+							akk *= cmplx.Conj(a[k*lda+k])
+						}
+					}
+					if akk != 1 { // scaling by one would be a no-op
+						c128.ScalUnitary(akk, bk)
+					}
+				}
+			} else {
+				for k := 0; k < m; k++ { // row k is added into rows j < k before row k itself is scaled
+					bk := b[k*ldb : k*ldb+n]
+					for j, ajk := range a[k*lda : k*lda+k] {
+						if ajk == 0 {
+							continue
+						}
+						if noConj {
+							c128.AxpyUnitary(alpha*ajk, bk, b[j*ldb:j*ldb+n])
+						} else {
+							c128.AxpyUnitary(alpha*cmplx.Conj(ajk), bk, b[j*ldb:j*ldb+n])
+						}
+					}
+					akk := alpha
+					if noUnit {
+						if noConj {
+							akk *= a[k*lda+k]
+						} else {
+							akk *= cmplx.Conj(a[k*lda+k])
+						}
+					}
+					if akk != 1 { // scaling by one would be a no-op
+						c128.ScalUnitary(akk, bk)
+					}
+				}
+			}
+		}
+	} else {
+		if trans == blas.NoTrans {
+			// Form B = alpha*B*A.
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for k := n - 1; k >= 0; k-- { // descending k: bi[k] is still an input value when it is read
+						abik := alpha * bi[k]
+						if abik == 0 {
+							continue
+						}
+						bi[k] = abik
+						if noUnit {
+							bi[k] *= a[k*lda+k]
+						}
+						c128.AxpyUnitary(abik, a[k*lda+k+1:k*lda+n], bi[k+1:])
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for k := 0; k < n; k++ { // ascending k: bi[k] is still an input value when it is read
+						abik := alpha * bi[k]
+						if abik == 0 {
+							continue
+						}
+						bi[k] = abik
+						if noUnit {
+							bi[k] *= a[k*lda+k]
+						}
+						c128.AxpyUnitary(abik, a[k*lda:k*lda+k], bi[:k])
+					}
+				}
+			}
+		} else {
+			// Form B = alpha*B*Aᵀ  or  B = alpha*B*Aᴴ.
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for j, bij := range bi { // ascending j: bi[j+1:] read below still holds input values
+						if noConj {
+							if noUnit {
+								bij *= a[j*lda+j]
+							}
+							bij += c128.DotuUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n])
+						} else {
+							if noUnit {
+								bij *= cmplx.Conj(a[j*lda+j])
+							}
+							bij += c128.DotcUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n])
+						}
+						bi[j] = alpha * bij
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for j := n - 1; j >= 0; j-- { // descending j: bi[:j] read below still holds input values
+						bij := bi[j]
+						if noConj {
+							if noUnit {
+								bij *= a[j*lda+j]
+							}
+							bij += c128.DotuUnitary(a[j*lda:j*lda+j], bi[:j])
+						} else {
+							if noUnit {
+								bij *= cmplx.Conj(a[j*lda+j])
+							}
+							bij += c128.DotcUnitary(a[j*lda:j*lda+j], bi[:j])
+						}
+						bi[j] = alpha * bij
+					}
+				}
+			}
+		}
+	}
+}
+
+// Ztrsm solves one of the matrix equations
+//
+//	op(A) * X = alpha * B  if side == blas.Left,
+//	X * op(A) = alpha * B  if side == blas.Right,
+//
+// where alpha is a scalar, X and B are m×n matrices, A is a unit or
+// non-unit, upper or lower triangular matrix and op(A) is one of
+//
+//	op(A) = A   if transA == blas.NoTrans,
+//	op(A) = Aᵀ  if transA == blas.Trans,
+//	op(A) = Aᴴ  if transA == blas.ConjTrans.
+//
+// On return the matrix X is overwritten on B. The diagonal of A is not checked for zeros.
+func (Implementation) Ztrsm(side blas.Side, uplo blas.Uplo, transA blas.Transpose, diag blas.Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int) {
+	na := m // order of A: m when side == blas.Left, n when side == blas.Right
+	if side == blas.Right {
+		na = n
+	}
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case transA != blas.NoTrans && transA != blas.Trans && transA != blas.ConjTrans:
+		panic(badTranspose)
+	case diag != blas.Unit && diag != blas.NonUnit:
+		panic(badDiag)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, na):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	}
+
+	// Quick return if possible: B is empty, so the slice lengths are not checked.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (na-1)*lda+na {
+		panic(shortA)
+	}
+	if len(b) < (m-1)*ldb+n {
+		panic(shortB)
+	}
+
+	if alpha == 0 { // the right-hand side is zero: B is set to zero without reading the elements of A
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				b[i*ldb+j] = 0
+			}
+		}
+		return
+	}
+
+	noConj := transA != blas.ConjTrans
+	noUnit := diag == blas.NonUnit // when false the diagonal of A is never read and is treated as one
+	if side == blas.Left {
+		if transA == blas.NoTrans {
+			// Form  B = alpha*inv(A)*B.
+			if uplo == blas.Upper {
+				for i := m - 1; i >= 0; i-- { // descending i: rows k > i of B already hold their final values
+					bi := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						c128.ScalUnitary(alpha, bi)
+					}
+					for ka, aik := range a[i*lda+i+1 : i*lda+m] {
+						k := i + 1 + ka // column index in A, and row index in B
+						if aik != 0 {
+							c128.AxpyUnitary(-aik, b[k*ldb:k*ldb+n], bi)
+						}
+					}
+					if noUnit {
+						c128.ScalUnitary(1/a[i*lda+i], bi)
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ { // ascending i: rows j < i of B already hold their final values
+					bi := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						c128.ScalUnitary(alpha, bi)
+					}
+					for j, aij := range a[i*lda : i*lda+i] {
+						if aij != 0 {
+							c128.AxpyUnitary(-aij, b[j*ldb:j*ldb+n], bi)
+						}
+					}
+					if noUnit {
+						c128.ScalUnitary(1/a[i*lda+i], bi)
+					}
+				}
+			}
+		} else {
+			// Form  B = alpha*inv(Aᵀ)*B  or  B = alpha*inv(Aᴴ)*B.
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					if noUnit {
+						if noConj {
+							c128.ScalUnitary(1/a[i*lda+i], bi)
+						} else {
+							c128.ScalUnitary(1/cmplx.Conj(a[i*lda+i]), bi)
+						}
+					}
+					for ja, aij := range a[i*lda+i+1 : i*lda+m] { // subtract row i's contribution from rows j > i
+						if aij == 0 {
+							continue
+						}
+						j := i + 1 + ja
+						if noConj {
+							c128.AxpyUnitary(-aij, bi, b[j*ldb:j*ldb+n])
+						} else {
+							c128.AxpyUnitary(-cmplx.Conj(aij), bi, b[j*ldb:j*ldb+n])
+						}
+					}
+					if alpha != 1 { // alpha is applied to row i only after the row has been used to update the others
+						c128.ScalUnitary(alpha, bi)
+					}
+				}
+			} else {
+				for i := m - 1; i >= 0; i-- {
+					bi := b[i*ldb : i*ldb+n]
+					if noUnit {
+						if noConj {
+							c128.ScalUnitary(1/a[i*lda+i], bi)
+						} else {
+							c128.ScalUnitary(1/cmplx.Conj(a[i*lda+i]), bi)
+						}
+					}
+					for j, aij := range a[i*lda : i*lda+i] { // subtract row i's contribution from rows j < i
+						if aij == 0 {
+							continue
+						}
+						if noConj {
+							c128.AxpyUnitary(-aij, bi, b[j*ldb:j*ldb+n])
+						} else {
+							c128.AxpyUnitary(-cmplx.Conj(aij), bi, b[j*ldb:j*ldb+n])
+						}
+					}
+					if alpha != 1 { // alpha is applied to row i only after the row has been used to update the others
+						c128.ScalUnitary(alpha, bi)
+					}
+				}
+			}
+		}
+	} else {
+		if transA == blas.NoTrans {
+			// Form  B = alpha*B*inv(A).
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						c128.ScalUnitary(alpha, bi)
+					}
+					for j, bij := range bi {
+						if bij == 0 { // a zero entry stays zero and contributes nothing to later columns
+							continue
+						}
+						if noUnit {
+							bi[j] /= a[j*lda+j]
+						}
+						c128.AxpyUnitary(-bi[j], a[j*lda+j+1:j*lda+n], bi[j+1:n])
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						c128.ScalUnitary(alpha, bi)
+					}
+					for j := n - 1; j >= 0; j-- {
+						if bi[j] == 0 { // a zero entry stays zero and contributes nothing to earlier columns
+							continue
+						}
+						if noUnit {
+							bi[j] /= a[j*lda+j]
+						}
+						c128.AxpyUnitary(-bi[j], a[j*lda:j*lda+j], bi[:j])
+					}
+				}
+			}
+		} else {
+			// Form  B = alpha*B*inv(Aᵀ)  or   B = alpha*B*inv(Aᴴ).
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for j := n - 1; j >= 0; j-- { // descending j: bi[j+1:] already holds final values
+						bij := alpha * bi[j]
+						if noConj {
+							bij -= c128.DotuUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n])
+							if noUnit {
+								bij /= a[j*lda+j]
+							}
+						} else {
+							bij -= c128.DotcUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n])
+							if noUnit {
+								bij /= cmplx.Conj(a[j*lda+j])
+							}
+						}
+						bi[j] = bij
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for j, bij := range bi { // ascending j: bi[:j] already holds final values
+						bij *= alpha
+						if noConj {
+							bij -= c128.DotuUnitary(a[j*lda:j*lda+j], bi[:j])
+							if noUnit {
+								bij /= a[j*lda+j]
+							}
+						} else {
+							bij -= c128.DotcUnitary(a[j*lda:j*lda+j], bi[:j])
+							if noUnit {
+								bij /= cmplx.Conj(a[j*lda+j])
+							}
+						}
+						bi[j] = bij
+					}
+				}
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go
new file mode 100644
index 00000000000..b7fb5a2c4ed
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go
@@ -0,0 +1,1771 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.
+
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	cmplx "gonum.org/v1/gonum/internal/cmplx64"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c64"
+)
+
+var _ blas.Complex64Level3 = Implementation{}
+
+// Cgemm performs one of the matrix-matrix operations
+//
+//	C = alpha * op(A) * op(B) + beta * C
+//
+// where op(X) is one of
+//
+//	op(X) = X  or  op(X) = Xᵀ  or  op(X) = Xᴴ,
+//
+// alpha and beta are scalars, and A, B and C are matrices, with op(A) an m×k matrix,
+// op(B) a k×n matrix and C an m×n matrix.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cgemm(tA, tB blas.Transpose, m, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) {
+	switch tA {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch tB {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	}
+	rowA, colA := m, k
+	if tA != blas.NoTrans {
+		rowA, colA = k, m
+	}
+	if lda < max(1, colA) {
+		panic(badLdA)
+	}
+	rowB, colB := k, n
+	if tB != blas.NoTrans {
+		rowB, colB = n, k
+	}
+	if ldb < max(1, colB) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (rowA-1)*lda+colA {
+		panic(shortA)
+	}
+	if len(b) < (rowB-1)*ldb+colB {
+		panic(shortB)
+	}
+	if len(c) < (m-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					c[i*ldc+j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					c[i*ldc+j] *= beta
+				}
+			}
+		}
+		return
+	}
+
+	switch tA {
+	case blas.NoTrans:
+		switch tB {
+		case blas.NoTrans:
+			// Form  C = alpha * A * B + beta * C.
+			for i := 0; i < m; i++ {
+				switch {
+				case beta == 0:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] = 0
+					}
+				case beta != 1:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] *= beta
+					}
+				}
+				for l := 0; l < k; l++ {
+					tmp := alpha * a[i*lda+l]
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] += tmp * b[l*ldb+j]
+					}
+				}
+			}
+		case blas.Trans:
+			// Form  C = alpha * A * Bᵀ + beta * C.
+			for i := 0; i < m; i++ {
+				switch {
+				case beta == 0:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] = 0
+					}
+				case beta != 1:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] *= beta
+					}
+				}
+				for l := 0; l < k; l++ {
+					tmp := alpha * a[i*lda+l]
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] += tmp * b[j*ldb+l]
+					}
+				}
+			}
+		case blas.ConjTrans:
+			// Form  C = alpha * A * Bᴴ + beta * C.
+			for i := 0; i < m; i++ {
+				switch {
+				case beta == 0:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] = 0
+					}
+				case beta != 1:
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] *= beta
+					}
+				}
+				for l := 0; l < k; l++ {
+					tmp := alpha * a[i*lda+l]
+					for j := 0; j < n; j++ {
+						c[i*ldc+j] += tmp * cmplx.Conj(b[j*ldb+l])
+					}
+				}
+			}
+		}
+	case blas.Trans:
+		switch tB {
+		case blas.NoTrans:
+			// Form  C = alpha * Aᵀ * B + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex64
+					for l := 0; l < k; l++ {
+						tmp += a[l*lda+i] * b[l*ldb+j]
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		case blas.Trans:
+			// Form  C = alpha * Aᵀ * Bᵀ + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex64
+					for l := 0; l < k; l++ {
+						tmp += a[l*lda+i] * b[j*ldb+l]
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		case blas.ConjTrans:
+			// Form  C = alpha * Aᵀ * Bᴴ + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex64
+					for l := 0; l < k; l++ {
+						tmp += a[l*lda+i] * cmplx.Conj(b[j*ldb+l])
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		}
+	case blas.ConjTrans:
+		switch tB {
+		case blas.NoTrans:
+			// Form  C = alpha * Aᴴ * B + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex64
+					for l := 0; l < k; l++ {
+						tmp += cmplx.Conj(a[l*lda+i]) * b[l*ldb+j]
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		case blas.Trans:
+			// Form  C = alpha * Aᴴ * Bᵀ + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex64
+					for l := 0; l < k; l++ {
+						tmp += cmplx.Conj(a[l*lda+i]) * b[j*ldb+l]
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		case blas.ConjTrans:
+			// Form  C = alpha * Aᴴ * Bᴴ + beta * C.
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					var tmp complex64
+					for l := 0; l < k; l++ {
+						tmp += cmplx.Conj(a[l*lda+i]) * cmplx.Conj(b[j*ldb+l])
+					}
+					if beta == 0 {
+						c[i*ldc+j] = alpha * tmp
+					} else {
+						c[i*ldc+j] = alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		}
+	}
+}
+
+// Chemm performs one of the matrix-matrix operations
+//
+//	C = alpha*A*B + beta*C  if side == blas.Left
+//	C = alpha*B*A + beta*C  if side == blas.Right
+//
+// where alpha and beta are scalars, A is an m×m or n×n hermitian matrix and B
+// and C are m×n matrices. The imaginary parts of the diagonal elements of A are
+// assumed to be zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Chemm(side blas.Side, uplo blas.Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) {
+	na := m
+	if side == blas.Right {
+		na = n
+	}
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, na):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(na-1)+na {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+	if len(c) < ldc*(m-1)+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				for j := range ci {
+					ci[j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				c64.ScalUnitary(beta, ci)
+			}
+		}
+		return
+	}
+
+	if side == blas.Left {
+		// Form  C = alpha*A*B + beta*C.
+		for i := 0; i < m; i++ {
+			atmp := alpha * complex(real(a[i*lda+i]), 0)
+			bi := b[i*ldb : i*ldb+n]
+			ci := c[i*ldc : i*ldc+n]
+			if beta == 0 {
+				for j, bij := range bi {
+					ci[j] = atmp * bij
+				}
+			} else {
+				for j, bij := range bi {
+					ci[j] = atmp*bij + beta*ci[j]
+				}
+			}
+			if uplo == blas.Upper {
+				for k := 0; k < i; k++ {
+					atmp = alpha * cmplx.Conj(a[k*lda+i])
+					c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+				for k := i + 1; k < m; k++ {
+					atmp = alpha * a[i*lda+k]
+					c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+			} else {
+				for k := 0; k < i; k++ {
+					atmp = alpha * a[i*lda+k]
+					c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+				for k := i + 1; k < m; k++ {
+					atmp = alpha * cmplx.Conj(a[k*lda+i])
+					c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*B*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < m; i++ {
+				for j := n - 1; j >= 0; j-- {
+					abij := alpha * b[i*ldb+j]
+					aj := a[j*lda+j+1 : j*lda+n]
+					bi := b[i*ldb+j+1 : i*ldb+n]
+					ci := c[i*ldc+j+1 : i*ldc+n]
+					var tmp complex64
+					for k, ajk := range aj {
+						ci[k] += abij * ajk
+						tmp += bi[k] * cmplx.Conj(ajk)
+					}
+					ajj := complex(real(a[j*lda+j]), 0)
+					if beta == 0 {
+						c[i*ldc+j] = abij*ajj + alpha*tmp
+					} else {
+						c[i*ldc+j] = abij*ajj + alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					abij := alpha * b[i*ldb+j]
+					aj := a[j*lda : j*lda+j]
+					bi := b[i*ldb : i*ldb+j]
+					ci := c[i*ldc : i*ldc+j]
+					var tmp complex64
+					for k, ajk := range aj {
+						ci[k] += abij * ajk
+						tmp += bi[k] * cmplx.Conj(ajk)
+					}
+					ajj := complex(real(a[j*lda+j]), 0)
+					if beta == 0 {
+						c[i*ldc+j] = abij*ajj + alpha*tmp
+					} else {
+						c[i*ldc+j] = abij*ajj + alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		}
+	}
+}
+
+// Cherk performs one of the hermitian rank-k operations
+//
+//	C = alpha*A*Aᴴ + beta*C  if trans == blas.NoTrans
+//	C = alpha*Aᴴ*A + beta*C  if trans == blas.ConjTrans
+//
+// where alpha and beta are real scalars, C is an n×n hermitian matrix and A is
+// an n×k matrix in the first case and a k×n matrix in the second case.
+//
+// The imaginary parts of the diagonal elements of C are assumed to be zero, and
+// on return they will be set to zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cherk(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha float32, a []complex64, lda int, beta float32, c []complex64, ldc int) {
+	var rowA, colA int
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans:
+		rowA, colA = n, k
+	case blas.ConjTrans:
+		rowA, colA = k, n
+	}
+	switch {
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < max(1, colA):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (rowA-1)*lda+colA {
+		panic(shortA)
+	}
+	if len(c) < (n-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		if uplo == blas.Upper {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					ci[0] = complex(beta*real(ci[0]), 0)
+					if i != n-1 {
+						c64.SscalUnitary(beta, ci[1:])
+					}
+				}
+			}
+		} else {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					if i != 0 {
+						c64.SscalUnitary(beta, ci[:i])
+					}
+					ci[i] = complex(beta*real(ci[i]), 0)
+				}
+			}
+		}
+		return
+	}
+
+	calpha := complex(alpha, 0)
+	if trans == blas.NoTrans {
+		// Form  C = alpha*A*Aᴴ + beta*C.
+		cbeta := complex(beta, 0)
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				ai := a[i*lda : i*lda+k]
+				switch {
+				case beta == 0:
+					// Handle the i-th diagonal element of C.
+					ci[0] = complex(alpha*real(c64.DotcUnitary(ai, ai)), 0)
+					// Handle the remaining elements on the i-th row of C.
+					for jc := range ci[1:] {
+						j := i + 1 + jc
+						ci[jc+1] = calpha * c64.DotcUnitary(a[j*lda:j*lda+k], ai)
+					}
+				case beta != 1:
+					cii := calpha*c64.DotcUnitary(ai, ai) + cbeta*ci[0]
+					ci[0] = complex(real(cii), 0)
+					for jc, cij := range ci[1:] {
+						j := i + 1 + jc
+						ci[jc+1] = calpha*c64.DotcUnitary(a[j*lda:j*lda+k], ai) + cbeta*cij
+					}
+				default:
+					cii := calpha*c64.DotcUnitary(ai, ai) + ci[0]
+					ci[0] = complex(real(cii), 0)
+					for jc, cij := range ci[1:] {
+						j := i + 1 + jc
+						ci[jc+1] = calpha*c64.DotcUnitary(a[j*lda:j*lda+k], ai) + cij
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				ai := a[i*lda : i*lda+k]
+				switch {
+				case beta == 0:
+					// Handle the first i-1 elements on the i-th row of C.
+					for j := range ci[:i] {
+						ci[j] = calpha * c64.DotcUnitary(a[j*lda:j*lda+k], ai)
+					}
+					// Handle the i-th diagonal element of C.
+					ci[i] = complex(alpha*real(c64.DotcUnitary(ai, ai)), 0)
+				case beta != 1:
+					for j, cij := range ci[:i] {
+						ci[j] = calpha*c64.DotcUnitary(a[j*lda:j*lda+k], ai) + cbeta*cij
+					}
+					cii := calpha*c64.DotcUnitary(ai, ai) + cbeta*ci[i]
+					ci[i] = complex(real(cii), 0)
+				default:
+					for j, cij := range ci[:i] {
+						ci[j] = calpha*c64.DotcUnitary(a[j*lda:j*lda+k], ai) + cij
+					}
+					cii := calpha*c64.DotcUnitary(ai, ai) + ci[i]
+					ci[i] = complex(real(cii), 0)
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*Aᴴ*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				switch {
+				case beta == 0:
+					for jc := range ci {
+						ci[jc] = 0
+					}
+				case beta != 1:
+					c64.SscalUnitary(beta, ci)
+					ci[0] = complex(real(ci[0]), 0)
+				default:
+					ci[0] = complex(real(ci[0]), 0)
+				}
+				for j := 0; j < k; j++ {
+					aji := cmplx.Conj(a[j*lda+i])
+					if aji != 0 {
+						c64.AxpyUnitary(calpha*aji, a[j*lda+i:j*lda+n], ci)
+					}
+				}
+				c[i*ldc+i] = complex(real(c[i*ldc+i]), 0)
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				switch {
+				case beta == 0:
+					for j := range ci {
+						ci[j] = 0
+					}
+				case beta != 1:
+					c64.SscalUnitary(beta, ci)
+					ci[i] = complex(real(ci[i]), 0)
+				default:
+					ci[i] = complex(real(ci[i]), 0)
+				}
+				for j := 0; j < k; j++ {
+					aji := cmplx.Conj(a[j*lda+i])
+					if aji != 0 {
+						c64.AxpyUnitary(calpha*aji, a[j*lda:j*lda+i+1], ci)
+					}
+				}
+				c[i*ldc+i] = complex(real(c[i*ldc+i]), 0)
+			}
+		}
+	}
+}
+
+// Cher2k performs one of the hermitian rank-2k operations
+//
+//	C = alpha*A*Bᴴ + conj(alpha)*B*Aᴴ + beta*C  if trans == blas.NoTrans
+//	C = alpha*Aᴴ*B + conj(alpha)*Bᴴ*A + beta*C  if trans == blas.ConjTrans
+//
+// where alpha and beta are scalars with beta real, C is an n×n hermitian matrix
+// and A and B are n×k matrices in the first case and k×n matrices in the second case.
+//
+// The imaginary parts of the diagonal elements of C are assumed to be zero, and
+// on return they will be set to zero.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cher2k(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta float32, c []complex64, ldc int) {
+	var row, col int
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans:
+		row, col = n, k
+	case blas.ConjTrans:
+		row, col = k, n
+	}
+	switch {
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < max(1, col):
+		panic(badLdA)
+	case ldb < max(1, col):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (row-1)*lda+col {
+		panic(shortA)
+	}
+	if len(b) < (row-1)*ldb+col {
+		panic(shortB)
+	}
+	if len(c) < (n-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		if uplo == blas.Upper {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					ci[0] = complex(beta*real(ci[0]), 0)
+					if i != n-1 {
+						c64.SscalUnitary(beta, ci[1:])
+					}
+				}
+			}
+		} else {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					if i != 0 {
+						c64.SscalUnitary(beta, ci[:i])
+					}
+					ci[i] = complex(beta*real(ci[i]), 0)
+				}
+			}
+		}
+		return
+	}
+
+	conjalpha := cmplx.Conj(alpha)
+	cbeta := complex(beta, 0)
+	if trans == blas.NoTrans {
+		// Form  C = alpha*A*Bᴴ + conj(alpha)*B*Aᴴ + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i+1 : i*ldc+n]
+				ai := a[i*lda : i*lda+k]
+				bi := b[i*ldb : i*ldb+k]
+				if beta == 0 {
+					cii := alpha*c64.DotcUnitary(bi, ai) + conjalpha*c64.DotcUnitary(ai, bi)
+					c[i*ldc+i] = complex(real(cii), 0)
+					for jc := range ci {
+						j := i + 1 + jc
+						ci[jc] = alpha*c64.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c64.DotcUnitary(a[j*lda:j*lda+k], bi)
+					}
+				} else {
+					cii := alpha*c64.DotcUnitary(bi, ai) + conjalpha*c64.DotcUnitary(ai, bi) + cbeta*c[i*ldc+i]
+					c[i*ldc+i] = complex(real(cii), 0)
+					for jc, cij := range ci {
+						j := i + 1 + jc
+						ci[jc] = alpha*c64.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c64.DotcUnitary(a[j*lda:j*lda+k], bi) + cbeta*cij
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i]
+				ai := a[i*lda : i*lda+k]
+				bi := b[i*ldb : i*ldb+k]
+				if beta == 0 {
+					for j := range ci {
+						ci[j] = alpha*c64.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c64.DotcUnitary(a[j*lda:j*lda+k], bi)
+					}
+					cii := alpha*c64.DotcUnitary(bi, ai) + conjalpha*c64.DotcUnitary(ai, bi)
+					c[i*ldc+i] = complex(real(cii), 0)
+				} else {
+					for j, cij := range ci {
+						ci[j] = alpha*c64.DotcUnitary(b[j*ldb:j*ldb+k], ai) + conjalpha*c64.DotcUnitary(a[j*lda:j*lda+k], bi) + cbeta*cij
+					}
+					cii := alpha*c64.DotcUnitary(bi, ai) + conjalpha*c64.DotcUnitary(ai, bi) + cbeta*c[i*ldc+i]
+					c[i*ldc+i] = complex(real(cii), 0)
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*Aᴴ*B + conj(alpha)*Bᴴ*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				switch {
+				case beta == 0:
+					for jc := range ci {
+						ci[jc] = 0
+					}
+				case beta != 1:
+					c64.SscalUnitary(beta, ci)
+					ci[0] = complex(real(ci[0]), 0)
+				default:
+					ci[0] = complex(real(ci[0]), 0)
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					bji := b[j*ldb+i]
+					if aji != 0 {
+						c64.AxpyUnitary(alpha*cmplx.Conj(aji), b[j*ldb+i:j*ldb+n], ci)
+					}
+					if bji != 0 {
+						c64.AxpyUnitary(conjalpha*cmplx.Conj(bji), a[j*lda+i:j*lda+n], ci)
+					}
+				}
+				ci[0] = complex(real(ci[0]), 0)
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				switch {
+				case beta == 0:
+					for j := range ci {
+						ci[j] = 0
+					}
+				case beta != 1:
+					c64.SscalUnitary(beta, ci)
+					ci[i] = complex(real(ci[i]), 0)
+				default:
+					ci[i] = complex(real(ci[i]), 0)
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					bji := b[j*ldb+i]
+					if aji != 0 {
+						c64.AxpyUnitary(alpha*cmplx.Conj(aji), b[j*ldb:j*ldb+i+1], ci)
+					}
+					if bji != 0 {
+						c64.AxpyUnitary(conjalpha*cmplx.Conj(bji), a[j*lda:j*lda+i+1], ci)
+					}
+				}
+				ci[i] = complex(real(ci[i]), 0)
+			}
+		}
+	}
+}
+
+// Csymm performs one of the matrix-matrix operations
+//
+//	C = alpha*A*B + beta*C  if side == blas.Left
+//	C = alpha*B*A + beta*C  if side == blas.Right
+//
+// where alpha and beta are scalars, A is an m×m or n×n symmetric matrix and B
+// and C are m×n matrices.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Csymm(side blas.Side, uplo blas.Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) {
+	na := m
+	if side == blas.Right {
+		na = n
+	}
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, na):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(na-1)+na {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+	if len(c) < ldc*(m-1)+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				for j := range ci {
+					ci[j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				c64.ScalUnitary(beta, ci)
+			}
+		}
+		return
+	}
+
+	if side == blas.Left {
+		// Form  C = alpha*A*B + beta*C.
+		for i := 0; i < m; i++ {
+			atmp := alpha * a[i*lda+i]
+			bi := b[i*ldb : i*ldb+n]
+			ci := c[i*ldc : i*ldc+n]
+			if beta == 0 {
+				for j, bij := range bi {
+					ci[j] = atmp * bij
+				}
+			} else {
+				for j, bij := range bi {
+					ci[j] = atmp*bij + beta*ci[j]
+				}
+			}
+			if uplo == blas.Upper {
+				for k := 0; k < i; k++ {
+					atmp = alpha * a[k*lda+i]
+					c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+				for k := i + 1; k < m; k++ {
+					atmp = alpha * a[i*lda+k]
+					c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+			} else {
+				for k := 0; k < i; k++ {
+					atmp = alpha * a[i*lda+k]
+					c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+				for k := i + 1; k < m; k++ {
+					atmp = alpha * a[k*lda+i]
+					c64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ci)
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*B*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < m; i++ {
+				for j := n - 1; j >= 0; j-- {
+					abij := alpha * b[i*ldb+j]
+					aj := a[j*lda+j+1 : j*lda+n]
+					bi := b[i*ldb+j+1 : i*ldb+n]
+					ci := c[i*ldc+j+1 : i*ldc+n]
+					var tmp complex64
+					for k, ajk := range aj {
+						ci[k] += abij * ajk
+						tmp += bi[k] * ajk
+					}
+					if beta == 0 {
+						c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp
+					} else {
+						c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					abij := alpha * b[i*ldb+j]
+					aj := a[j*lda : j*lda+j]
+					bi := b[i*ldb : i*ldb+j]
+					ci := c[i*ldc : i*ldc+j]
+					var tmp complex64
+					for k, ajk := range aj {
+						ci[k] += abij * ajk
+						tmp += bi[k] * ajk
+					}
+					if beta == 0 {
+						c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp
+					} else {
+						c[i*ldc+j] = abij*a[j*lda+j] + alpha*tmp + beta*c[i*ldc+j]
+					}
+				}
+			}
+		}
+	}
+}
+
+// Csyrk performs one of the symmetric rank-k operations
+//
+//	C = alpha*A*Aᵀ + beta*C  if trans == blas.NoTrans
+//	C = alpha*Aᵀ*A + beta*C  if trans == blas.Trans
+//
+// where alpha and beta are scalars, C is an n×n symmetric matrix and A is
+// an n×k matrix in the first case and a k×n matrix in the second case.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Csyrk(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex64, a []complex64, lda int, beta complex64, c []complex64, ldc int) {
+	var rowA, colA int
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans:
+		rowA, colA = n, k
+	case blas.Trans:
+		rowA, colA = k, n
+	}
+	switch {
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < max(1, colA):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (rowA-1)*lda+colA {
+		panic(shortA)
+	}
+	if len(c) < (n-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		if uplo == blas.Upper {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					c64.ScalUnitary(beta, ci)
+				}
+			}
+		} else {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					c64.ScalUnitary(beta, ci)
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.NoTrans {
+		// Form  C = alpha*A*Aᵀ + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				ai := a[i*lda : i*lda+k]
+				if beta == 0 {
+					for jc := range ci {
+						j := i + jc
+						ci[jc] = alpha * c64.DotuUnitary(ai, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, cij := range ci {
+						j := i + jc
+						ci[jc] = beta*cij + alpha*c64.DotuUnitary(ai, a[j*lda:j*lda+k])
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				ai := a[i*lda : i*lda+k]
+				if beta == 0 {
+					for j := range ci {
+						ci[j] = alpha * c64.DotuUnitary(ai, a[j*lda:j*lda+k])
+					}
+				} else {
+					for j, cij := range ci {
+						ci[j] = beta*cij + alpha*c64.DotuUnitary(ai, a[j*lda:j*lda+k])
+					}
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*Aᵀ*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				switch {
+				case beta == 0:
+					for jc := range ci {
+						ci[jc] = 0
+					}
+				case beta != 1:
+					for jc := range ci {
+						ci[jc] *= beta
+					}
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					if aji != 0 {
+						c64.AxpyUnitary(alpha*aji, a[j*lda+i:j*lda+n], ci)
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				switch {
+				case beta == 0:
+					for j := range ci {
+						ci[j] = 0
+					}
+				case beta != 1:
+					for j := range ci {
+						ci[j] *= beta
+					}
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					if aji != 0 {
+						c64.AxpyUnitary(alpha*aji, a[j*lda:j*lda+i+1], ci)
+					}
+				}
+			}
+		}
+	}
+}
+
+// Csyr2k performs one of the symmetric rank-2k operations
+//
+//	C = alpha*A*Bᵀ + alpha*B*Aᵀ + beta*C  if trans == blas.NoTrans
+//	C = alpha*Aᵀ*B + alpha*Bᵀ*A + beta*C  if trans == blas.Trans
+//
+// where alpha and beta are scalars, C is an n×n symmetric matrix and A and B
+// are n×k matrices in the first case and k×n matrices in the second case.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Csyr2k(uplo blas.Uplo, trans blas.Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int) {
+	var row, col int
+	switch trans {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans:
+		row, col = n, k
+	case blas.Trans:
+		row, col = k, n
+	}
+	switch {
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case lda < max(1, col):
+		panic(badLdA)
+	case ldb < max(1, col):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (row-1)*lda+col {
+		panic(shortA)
+	}
+	if len(b) < (row-1)*ldb+col {
+		panic(shortB)
+	}
+	if len(c) < (n-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		if uplo == blas.Upper {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc+i : i*ldc+n]
+					c64.ScalUnitary(beta, ci)
+				}
+			}
+		} else {
+			if beta == 0 {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					for j := range ci {
+						ci[j] = 0
+					}
+				}
+			} else {
+				for i := 0; i < n; i++ {
+					ci := c[i*ldc : i*ldc+i+1]
+					c64.ScalUnitary(beta, ci)
+				}
+			}
+		}
+		return
+	}
+
+	if trans == blas.NoTrans {
+		// Form  C = alpha*A*Bᵀ + alpha*B*Aᵀ + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				ai := a[i*lda : i*lda+k]
+				bi := b[i*ldb : i*ldb+k]
+				if beta == 0 {
+					for jc := range ci {
+						j := i + jc
+						ci[jc] = alpha*c64.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c64.DotuUnitary(bi, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, cij := range ci {
+						j := i + jc
+						ci[jc] = alpha*c64.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c64.DotuUnitary(bi, a[j*lda:j*lda+k]) + beta*cij
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				ai := a[i*lda : i*lda+k]
+				bi := b[i*ldb : i*ldb+k]
+				if beta == 0 {
+					for j := range ci {
+						ci[j] = alpha*c64.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c64.DotuUnitary(bi, a[j*lda:j*lda+k])
+					}
+				} else {
+					for j, cij := range ci {
+						ci[j] = alpha*c64.DotuUnitary(ai, b[j*ldb:j*ldb+k]) + alpha*c64.DotuUnitary(bi, a[j*lda:j*lda+k]) + beta*cij
+					}
+				}
+			}
+		}
+	} else {
+		// Form  C = alpha*Aᵀ*B + alpha*Bᵀ*A + beta*C.
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc+i : i*ldc+n]
+				switch {
+				case beta == 0:
+					for jc := range ci {
+						ci[jc] = 0
+					}
+				case beta != 1:
+					for jc := range ci {
+						ci[jc] *= beta
+					}
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					bji := b[j*ldb+i]
+					if aji != 0 {
+						c64.AxpyUnitary(alpha*aji, b[j*ldb+i:j*ldb+n], ci)
+					}
+					if bji != 0 {
+						c64.AxpyUnitary(alpha*bji, a[j*lda+i:j*lda+n], ci)
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				ci := c[i*ldc : i*ldc+i+1]
+				switch {
+				case beta == 0:
+					for j := range ci {
+						ci[j] = 0
+					}
+				case beta != 1:
+					for j := range ci {
+						ci[j] *= beta
+					}
+				}
+				for j := 0; j < k; j++ {
+					aji := a[j*lda+i]
+					bji := b[j*ldb+i]
+					if aji != 0 {
+						c64.AxpyUnitary(alpha*aji, b[j*ldb:j*ldb+i+1], ci)
+					}
+					if bji != 0 {
+						c64.AxpyUnitary(alpha*bji, a[j*lda:j*lda+i+1], ci)
+					}
+				}
+			}
+		}
+	}
+}
+
+// Ctrmm performs one of the matrix-matrix operations
+//
+//	B = alpha * op(A) * B  if side == blas.Left,
+//	B = alpha * B * op(A)  if side == blas.Right,
+//
+// where alpha is a scalar, B is an m×n matrix that is overwritten in place, A is a unit or non-unit,
+// upper or lower triangular matrix of order m (blas.Left) or n (blas.Right), and op(A) is one of
+//
+//	op(A) = A   if trans == blas.NoTrans,
+//	op(A) = Aᵀ  if trans == blas.Trans,
+//	op(A) = Aᴴ  if trans == blas.ConjTrans.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ctrmm(side blas.Side, uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int) {
+	na := m // Order of A: m when A multiplies from the left, n when it multiplies from the right.
+	if side == blas.Right {
+		na = n
+	}
+	switch { // Argument validation: the first failing check panics.
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans:
+		panic(badTranspose)
+	case diag != blas.Unit && diag != blas.NonUnit:
+		panic(badDiag)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, na):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	}
+
+	// Quick return if possible: an empty B leaves nothing to compute.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (na-1)*lda+na { // Element (i, j) of A is a[i*lda+j], so the last one used is at (na-1)*lda+na-1.
+		panic(shortA)
+	}
+	if len(b) < (m-1)*ldb+n { // Element (i, j) of B is b[i*ldb+j].
+		panic(shortB)
+	}
+
+	// Quick return if possible: alpha == 0 zeroes B without reading A.
+	if alpha == 0 {
+		for i := 0; i < m; i++ {
+			bi := b[i*ldb : i*ldb+n]
+			for j := range bi {
+				bi[j] = 0
+			}
+		}
+		return
+	}
+
+	noConj := trans != blas.ConjTrans // Elements of A are used as stored (no conjugation).
+	noUnit := diag == blas.NonUnit    // The diagonal of A is read; otherwise it is never accessed.
+	if side == blas.Left {
+		if trans == blas.NoTrans {
+			// Form B = alpha*A*B.
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ { // Top-down: rows j > i of B are still unmodified when row i reads them.
+					aii := alpha
+					if noUnit {
+						aii *= a[i*lda+i]
+					}
+					bi := b[i*ldb : i*ldb+n]
+					for j := range bi {
+						bi[j] *= aii
+					}
+					for ja, aij := range a[i*lda+i+1 : i*lda+m] {
+						j := ja + i + 1 // Column index of aij within A, and the row of B it multiplies.
+						if aij != 0 {
+							c64.AxpyUnitary(alpha*aij, b[j*ldb:j*ldb+n], bi)
+						}
+					}
+				}
+			} else {
+				for i := m - 1; i >= 0; i-- { // Bottom-up: rows j < i of B are still unmodified when row i reads them.
+					aii := alpha
+					if noUnit {
+						aii *= a[i*lda+i]
+					}
+					bi := b[i*ldb : i*ldb+n]
+					for j := range bi {
+						bi[j] *= aii
+					}
+					for j, aij := range a[i*lda : i*lda+i] {
+						if aij != 0 {
+							c64.AxpyUnitary(alpha*aij, b[j*ldb:j*ldb+n], bi)
+						}
+					}
+				}
+			}
+		} else {
+			// Form B = alpha*Aᵀ*B  or  B = alpha*Aᴴ*B.
+			if uplo == blas.Upper {
+				for k := m - 1; k >= 0; k-- { // Row k of B is added into rows j > k first, then scaled by its diagonal term.
+					bk := b[k*ldb : k*ldb+n]
+					for ja, ajk := range a[k*lda+k+1 : k*lda+m] {
+						if ajk == 0 {
+							continue
+						}
+						j := k + 1 + ja // Column of A holding ajk; it selects the destination row of B.
+						if noConj {
+							c64.AxpyUnitary(alpha*ajk, bk, b[j*ldb:j*ldb+n])
+						} else {
+							c64.AxpyUnitary(alpha*cmplx.Conj(ajk), bk, b[j*ldb:j*ldb+n])
+						}
+					}
+					akk := alpha
+					if noUnit {
+						if noConj {
+							akk *= a[k*lda+k]
+						} else {
+							akk *= cmplx.Conj(a[k*lda+k])
+						}
+					}
+					if akk != 1 { // Scaling by exactly 1 is skipped.
+						c64.ScalUnitary(akk, bk)
+					}
+				}
+			} else {
+				for k := 0; k < m; k++ { // Row k of B is added into rows j < k first, then scaled by its diagonal term.
+					bk := b[k*ldb : k*ldb+n]
+					for j, ajk := range a[k*lda : k*lda+k] {
+						if ajk == 0 {
+							continue
+						}
+						if noConj {
+							c64.AxpyUnitary(alpha*ajk, bk, b[j*ldb:j*ldb+n])
+						} else {
+							c64.AxpyUnitary(alpha*cmplx.Conj(ajk), bk, b[j*ldb:j*ldb+n])
+						}
+					}
+					akk := alpha
+					if noUnit {
+						if noConj {
+							akk *= a[k*lda+k]
+						} else {
+							akk *= cmplx.Conj(a[k*lda+k])
+						}
+					}
+					if akk != 1 { // Scaling by exactly 1 is skipped.
+						c64.ScalUnitary(akk, bk)
+					}
+				}
+			}
+		}
+	} else {
+		if trans == blas.NoTrans {
+			// Form B = alpha*B*A.
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for k := n - 1; k >= 0; k-- { // Right to left: entries bi[k+1:] are already final apart from this contribution.
+						abik := alpha * bi[k]
+						if abik == 0 { // A zero product contributes nothing; bi[k] is left as it is.
+							continue
+						}
+						bi[k] = abik
+						if noUnit {
+							bi[k] *= a[k*lda+k]
+						}
+						c64.AxpyUnitary(abik, a[k*lda+k+1:k*lda+n], bi[k+1:])
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for k := 0; k < n; k++ { // Left to right: entries bi[:k] are already final apart from this contribution.
+						abik := alpha * bi[k]
+						if abik == 0 { // A zero product contributes nothing; bi[k] is left as it is.
+							continue
+						}
+						bi[k] = abik
+						if noUnit {
+							bi[k] *= a[k*lda+k]
+						}
+						c64.AxpyUnitary(abik, a[k*lda:k*lda+k], bi[:k])
+					}
+				}
+			}
+		} else {
+			// Form B = alpha*B*Aᵀ  or  B = alpha*B*Aᴴ.
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for j, bij := range bi { // Left to right: the dot product reads bi[j+1:], which is not yet overwritten.
+						if noConj {
+							if noUnit {
+								bij *= a[j*lda+j]
+							}
+							bij += c64.DotuUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n])
+						} else {
+							if noUnit {
+								bij *= cmplx.Conj(a[j*lda+j])
+							}
+							bij += c64.DotcUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n])
+						}
+						bi[j] = alpha * bij
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for j := n - 1; j >= 0; j-- { // Right to left: the dot product reads bi[:j], which is not yet overwritten.
+						bij := bi[j]
+						if noConj {
+							if noUnit {
+								bij *= a[j*lda+j]
+							}
+							bij += c64.DotuUnitary(a[j*lda:j*lda+j], bi[:j])
+						} else {
+							if noUnit {
+								bij *= cmplx.Conj(a[j*lda+j])
+							}
+							bij += c64.DotcUnitary(a[j*lda:j*lda+j], bi[:j])
+						}
+						bi[j] = alpha * bij
+					}
+				}
+			}
+		}
+	}
+}
+
+// Ctrsm solves one of the matrix equations
+//
+//	op(A) * X = alpha * B  if side == blas.Left,
+//	X * op(A) = alpha * B  if side == blas.Right,
+//
+// where alpha is a scalar, X and B are m×n matrices, A is a unit or non-unit, upper or lower
+// triangular matrix of order m (blas.Left) or n (blas.Right), and op(A) is one of
+//
+//	op(A) = A   if transA == blas.NoTrans,
+//	op(A) = Aᵀ  if transA == blas.Trans,
+//	op(A) = Aᴴ  if transA == blas.ConjTrans.
+//
+// On return the matrix X is overwritten on B. No check is made that A is invertible.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ctrsm(side blas.Side, uplo blas.Uplo, transA blas.Transpose, diag blas.Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int) {
+	na := m // Order of A: m when A is on the left, n when it is on the right.
+	if side == blas.Right {
+		na = n
+	}
+	switch { // Argument validation: the first failing check panics.
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case uplo != blas.Lower && uplo != blas.Upper:
+		panic(badUplo)
+	case transA != blas.NoTrans && transA != blas.Trans && transA != blas.ConjTrans:
+		panic(badTranspose)
+	case diag != blas.Unit && diag != blas.NonUnit:
+		panic(badDiag)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, na):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	}
+
+	// Quick return if possible: an empty B leaves nothing to solve.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < (na-1)*lda+na { // Element (i, j) of A is a[i*lda+j].
+		panic(shortA)
+	}
+	if len(b) < (m-1)*ldb+n { // Element (i, j) of B is b[i*ldb+j].
+		panic(shortB)
+	}
+
+	if alpha == 0 { // The right-hand side is zero, so B is set to zero without reading A.
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				b[i*ldb+j] = 0
+			}
+		}
+		return
+	}
+
+	noConj := transA != blas.ConjTrans // Elements of A are used as stored (no conjugation).
+	noUnit := diag == blas.NonUnit     // The diagonal of A is read and divided by; otherwise it is never accessed.
+	if side == blas.Left {
+		if transA == blas.NoTrans {
+			// Form  B = alpha*inv(A)*B.
+			if uplo == blas.Upper {
+				for i := m - 1; i >= 0; i-- { // Back substitution: rows k > i already hold solved values.
+					bi := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						c64.ScalUnitary(alpha, bi)
+					}
+					for ka, aik := range a[i*lda+i+1 : i*lda+m] {
+						k := i + 1 + ka // Column of A holding aik, and the solved row of B it multiplies.
+						if aik != 0 {
+							c64.AxpyUnitary(-aik, b[k*ldb:k*ldb+n], bi)
+						}
+					}
+					if noUnit {
+						c64.ScalUnitary(1/a[i*lda+i], bi)
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ { // Forward substitution: rows j < i already hold solved values.
+					bi := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						c64.ScalUnitary(alpha, bi)
+					}
+					for j, aij := range a[i*lda : i*lda+i] {
+						if aij != 0 {
+							c64.AxpyUnitary(-aij, b[j*ldb:j*ldb+n], bi)
+						}
+					}
+					if noUnit {
+						c64.ScalUnitary(1/a[i*lda+i], bi)
+					}
+				}
+			}
+		} else {
+			// Form  B = alpha*inv(Aᵀ)*B  or  B = alpha*inv(Aᴴ)*B.
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ { // Row i is divided by its diagonal, eliminated from rows j > i, then scaled by alpha.
+					bi := b[i*ldb : i*ldb+n]
+					if noUnit {
+						if noConj {
+							c64.ScalUnitary(1/a[i*lda+i], bi)
+						} else {
+							c64.ScalUnitary(1/cmplx.Conj(a[i*lda+i]), bi)
+						}
+					}
+					for ja, aij := range a[i*lda+i+1 : i*lda+m] {
+						if aij == 0 {
+							continue
+						}
+						j := i + 1 + ja // Column of A holding aij; it selects the row of B being updated.
+						if noConj {
+							c64.AxpyUnitary(-aij, bi, b[j*ldb:j*ldb+n])
+						} else {
+							c64.AxpyUnitary(-cmplx.Conj(aij), bi, b[j*ldb:j*ldb+n])
+						}
+					}
+					if alpha != 1 { // alpha is applied last, after row i has been used for elimination.
+						c64.ScalUnitary(alpha, bi)
+					}
+				}
+			} else {
+				for i := m - 1; i >= 0; i-- { // Row i is divided by its diagonal, eliminated from rows j < i, then scaled by alpha.
+					bi := b[i*ldb : i*ldb+n]
+					if noUnit {
+						if noConj {
+							c64.ScalUnitary(1/a[i*lda+i], bi)
+						} else {
+							c64.ScalUnitary(1/cmplx.Conj(a[i*lda+i]), bi)
+						}
+					}
+					for j, aij := range a[i*lda : i*lda+i] {
+						if aij == 0 {
+							continue
+						}
+						if noConj {
+							c64.AxpyUnitary(-aij, bi, b[j*ldb:j*ldb+n])
+						} else {
+							c64.AxpyUnitary(-cmplx.Conj(aij), bi, b[j*ldb:j*ldb+n])
+						}
+					}
+					if alpha != 1 { // alpha is applied last, after row i has been used for elimination.
+						c64.ScalUnitary(alpha, bi)
+					}
+				}
+			}
+		}
+	} else {
+		if transA == blas.NoTrans {
+			// Form  B = alpha*B*inv(A).
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						c64.ScalUnitary(alpha, bi)
+					}
+					for j, bij := range bi { // Left to right: bi[j] is solved, then eliminated from bi[j+1:].
+						if bij == 0 { // A zero entry needs no division and contributes nothing.
+							continue
+						}
+						if noUnit {
+							bi[j] /= a[j*lda+j]
+						}
+						c64.AxpyUnitary(-bi[j], a[j*lda+j+1:j*lda+n], bi[j+1:n])
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						c64.ScalUnitary(alpha, bi)
+					}
+					for j := n - 1; j >= 0; j-- { // Right to left: bi[j] is solved, then eliminated from bi[:j].
+						if bi[j] == 0 { // A zero entry needs no division and contributes nothing.
+							continue
+						}
+						if noUnit {
+							bi[j] /= a[j*lda+j]
+						}
+						c64.AxpyUnitary(-bi[j], a[j*lda:j*lda+j], bi[:j])
+					}
+				}
+			}
+		} else {
+			// Form  B = alpha*B*inv(Aᵀ)  or   B = alpha*B*inv(Aᴴ).
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for j := n - 1; j >= 0; j-- { // Right to left: bi[j+1:] already holds solved values.
+						bij := alpha * bi[j]
+						if noConj {
+							bij -= c64.DotuUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n])
+							if noUnit {
+								bij /= a[j*lda+j]
+							}
+						} else {
+							bij -= c64.DotcUnitary(a[j*lda+j+1:j*lda+n], bi[j+1:n])
+							if noUnit {
+								bij /= cmplx.Conj(a[j*lda+j])
+							}
+						}
+						bi[j] = bij
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					bi := b[i*ldb : i*ldb+n]
+					for j, bij := range bi { // Left to right: bi[:j] already holds solved values.
+						bij *= alpha
+						if noConj {
+							bij -= c64.DotuUnitary(a[j*lda:j*lda+j], bi[:j])
+							if noUnit {
+								bij /= a[j*lda+j]
+							}
+						} else {
+							bij -= c64.DotcUnitary(a[j*lda:j*lda+j], bi[:j])
+							if noUnit {
+								bij /= cmplx.Conj(a[j*lda+j])
+							}
+						}
+						bi[j] = bij
+					}
+				}
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go b/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go
new file mode 100644
index 00000000000..4b813fbc050
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go
@@ -0,0 +1,925 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+var _ blas.Float32Level3 = Implementation{}
+
+// Strsm solves one of the matrix equations
+//
+//	A * X = alpha * B   if tA == blas.NoTrans and side == blas.Left
+//	Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//	X * A = alpha * B   if tA == blas.NoTrans and side == blas.Right
+//	X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+//
+// where A is an n×n (side == blas.Right) or m×m (side == blas.Left) triangular matrix,
+// X and B are m×n matrices, and alpha is a scalar.
+//
+// At entry to the function, X contains the values of B, and the result is
+// stored in-place into X.
+//
+// No check is made that A is invertible. When d == blas.Unit the diagonal of A is not read.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Strsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	k := n // Order of A: n when A is on the right, m when it is on the left.
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible: an empty B leaves nothing to solve.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k { // Element (i, j) of A is a[i*lda+j].
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n { // Element (i, j) of B is b[i*ldb+j].
+		panic(shortB)
+	}
+
+	if alpha == 0 { // The right-hand side is zero, so B is set to zero without reading A.
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+	nonUnit := d == blas.NonUnit // The diagonal of A is divided by only when this is set.
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				for i := m - 1; i >= 0; i-- { // Back substitution: rows k > i already hold solved values.
+					btmp := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						f32.ScalUnitary(alpha, btmp)
+					}
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						if va != 0 {
+							k := ka + i + 1 // Column of A holding va, and the solved row of B it multiplies.
+							f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+					if nonUnit {
+						tmp := 1 / a[i*lda+i]
+						f32.ScalUnitary(tmp, btmp)
+					}
+				}
+				return
+			}
+			for i := 0; i < m; i++ { // Lower: forward substitution, rows k < i already hold solved values.
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f32.ScalUnitary(alpha, btmp)
+				}
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+				if nonUnit {
+					tmp := 1 / a[i*lda+i]
+					f32.ScalUnitary(tmp, btmp)
+				}
+			}
+			return
+		}
+		// Cases where a is transposed: each row is divided by its diagonal, eliminated from the remaining rows, then scaled by alpha.
+		if ul == blas.Upper {
+			for k := 0; k < m; k++ {
+				btmpk := b[k*ldb : k*ldb+n]
+				if nonUnit {
+					tmp := 1 / a[k*lda+k]
+					f32.ScalUnitary(tmp, btmpk)
+				}
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					if va != 0 {
+						i := ia + k + 1 // Column of A holding va; it selects the row of B being updated.
+						f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+					}
+				}
+				if alpha != 1 { // alpha is applied last, after row k has been used for elimination.
+					f32.ScalUnitary(alpha, btmpk)
+				}
+			}
+			return
+		}
+		for k := m - 1; k >= 0; k-- { // Lower, transposed: same scheme, walking the rows bottom-up.
+			btmpk := b[k*ldb : k*ldb+n]
+			if nonUnit {
+				tmp := 1 / a[k*lda+k]
+				f32.ScalUnitary(tmp, btmpk)
+			}
+			for i, va := range a[k*lda : k*lda+k] {
+				if va != 0 {
+					f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+				}
+			}
+			if alpha != 1 { // alpha is applied last, after row k has been used for elimination.
+				f32.ScalUnitary(alpha, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is to the right of X. Each row of B is solved independently.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f32.ScalUnitary(alpha, btmp)
+				}
+				for k, vb := range btmp { // Left to right: btmp[k] is solved, then eliminated from btmp[k+1:].
+					if vb == 0 { // A zero entry needs no division and contributes nothing.
+						continue
+					}
+					if nonUnit {
+						btmp[k] /= a[k*lda+k]
+					}
+					f32.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			if alpha != 1 {
+				f32.ScalUnitary(alpha, btmp)
+			}
+			for k := n - 1; k >= 0; k-- { // Right to left: btmp[k] is solved, then eliminated from btmp[:k].
+				if btmp[k] == 0 { // A zero entry needs no division and contributes nothing.
+					continue
+				}
+				if nonUnit {
+					btmp[k] /= a[k*lda+k]
+				}
+				f32.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed. Each entry is alpha*b minus a dot product with already-solved entries of the same row.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := n - 1; j >= 0; j-- { // Right to left: btmp[j+1:] already holds solved values.
+				tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
+				if nonUnit {
+					tmp /= a[j*lda+j]
+				}
+				btmp[j] = tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		for j := 0; j < n; j++ { // Left to right: btmp[:j] already holds solved values.
+			tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			if nonUnit {
+				tmp /= a[j*lda+j]
+			}
+			btmp[j] = tmp
+		}
+	}
+}
+
+// Ssymm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C  if side == blas.Left
+//	C = alpha * B * A + beta * C  if side == blas.Right
+//
+// where A is an n×n (side == blas.Right) or m×m (side == blas.Left) symmetric matrix of which only
+// the triangle selected by ul is read, B and C are m×n matrices, and alpha and beta are scalars.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssymm(s blas.Side, ul blas.Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
+	if s != blas.Right && s != blas.Left {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	k := n // Order of A: n when A is on the right, m when it is on the left.
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible: an empty C leaves nothing to compute.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k { // Element (i, j) of A is a[i*lda+j].
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+	if len(c) < ldc*(m-1)+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: with alpha == 0 and beta == 1 the update leaves C unchanged.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	if beta == 0 { // Clear C explicitly; the later "*= beta" steps then multiply zeros.
+		for i := 0; i < m; i++ {
+			ctmp := c[i*ldc : i*ldc+n]
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		}
+	}
+
+	if alpha == 0 { // Only the beta*C term remains; A and B are not read.
+		if beta != 0 {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := 0; j < n; j++ {
+					ctmp[j] *= beta
+				}
+			}
+		}
+		return
+	}
+
+	isUpper := ul == blas.Upper
+	if s == blas.Left {
+		for i := 0; i < m; i++ {
+			atmp := alpha * a[i*lda+i] // Diagonal contribution of A to row i.
+			btmp := b[i*ldb : i*ldb+n]
+			ctmp := c[i*ldc : i*ldc+n]
+			for j, v := range btmp {
+				ctmp[j] *= beta
+				ctmp[j] += atmp * v
+			}
+
+			for k := 0; k < i; k++ { // Entries left of the diagonal: stored at (k, i) for Upper, (i, k) for Lower.
+				var atmp float32
+				if isUpper {
+					atmp = a[k*lda+i]
+				} else {
+					atmp = a[i*lda+k]
+				}
+				atmp *= alpha
+				f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+			for k := i + 1; k < m; k++ { // Entries right of the diagonal: stored at (i, k) for Upper, (k, i) for Lower.
+				var atmp float32
+				if isUpper {
+					atmp = a[i*lda+k]
+				} else {
+					atmp = a[k*lda+i]
+				}
+				atmp *= alpha
+				f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+		}
+		return
+	}
+	if isUpper {
+		for i := 0; i < m; i++ {
+			for j := n - 1; j >= 0; j-- { // Right to left, so the entries of C right of j updated below were already scaled by beta.
+				tmp := alpha * b[i*ldb+j]
+				var tmp2 float32 // Accumulates the dot product of b[i, j+1:] with the stored row segment a[j, j+1:].
+				atmp := a[j*lda+j+1 : j*lda+n]
+				btmp := b[i*ldb+j+1 : i*ldb+n]
+				ctmp := c[i*ldc+j+1 : i*ldc+n]
+				for k, v := range atmp {
+					ctmp[k] += tmp * v
+					tmp2 += btmp[k] * v
+				}
+				c[i*ldc+j] *= beta
+				c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		for j := 0; j < n; j++ { // Left to right, so the entries of C left of j updated below were already scaled by beta.
+			tmp := alpha * b[i*ldb+j]
+			var tmp2 float32 // Accumulates the dot product of b[i, :j] with the stored row segment a[j, :j].
+			atmp := a[j*lda : j*lda+j]
+			btmp := b[i*ldb : i*ldb+j]
+			ctmp := c[i*ldc : i*ldc+j]
+			for k, v := range atmp {
+				ctmp[k] += tmp * v
+				tmp2 += btmp[k] * v
+			}
+			c[i*ldc+j] *= beta
+			c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+		}
+	}
+}
+
+// Ssyrk performs one of the symmetric rank-k operations
+//
+//	C = alpha * A * Aᵀ + beta * C  if tA == blas.NoTrans
+//	C = alpha * Aᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// where A is an n×k or k×n matrix, C is an n×n symmetric matrix, and alpha and
+// beta are scalars. Only the triangle of C selected by ul is read or written, and
+// when beta == 0 that triangle is overwritten without using its previous contents.
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	row, col := k, n // Stored dimensions of A: k×n when transposed, n×k otherwise.
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible: an empty C leaves nothing to compute.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(row-1)+col { // Element (i, j) of A is a[i*lda+j].
+		panic(shortA)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	if alpha == 0 { // Only the beta*C term remains; A is not read.
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans { // C[i][j] is the dot product of rows i and j of A.
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				atmp := a[i*lda : i*lda+k]
+				if beta == 0 { // Assign rather than accumulate so the old contents of C are never read.
+					for jc := range ctmp {
+						j := jc + i
+						ctmp[jc] = alpha * f32.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, vc := range ctmp {
+						j := jc + i
+						ctmp[jc] = vc*beta + alpha*f32.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			atmp := a[i*lda : i*lda+k]
+			if beta == 0 { // Assign rather than accumulate so the old contents of C are never read.
+				for j := range ctmp {
+					ctmp[j] = alpha * f32.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			} else {
+				for j, vc := range ctmp {
+					ctmp[j] = vc*beta + alpha*f32.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: row i of C is scaled by beta, then each row l of A adds alpha*a[l][i] times a segment of itself.
+	if ul == blas.Upper {
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			if beta == 0 { // Overwrite instead of multiplying by zero so NaN/Inf in C cannot propagate.
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			} else if beta != 1 {
+				f32.ScalUnitary(beta, ctmp)
+			}
+			for l := 0; l < k; l++ {
+				tmp := alpha * a[l*lda+i]
+				if tmp != 0 {
+					f32.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp)
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		if beta == 0 { // Overwrite instead of multiplying by zero so NaN/Inf in C cannot propagate (matches the Upper branch).
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		} else if beta != 1 {
+			f32.ScalUnitary(beta, ctmp)
+		}
+		for l := 0; l < k; l++ {
+			tmp := alpha * a[l*lda+i]
+			if tmp != 0 {
+				f32.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp)
+			}
+		}
+	}
+}
+
+// Ssyr2k performs one of the symmetric rank 2k operations
+//
+//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if tA == blas.NoTrans
+//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// where A and B are n×k or k×n matrices, C is an n×n symmetric matrix, and
+// alpha and beta are scalars. Only the triangle of C selected by ul is read or written.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	row, col := k, n // Stored dimensions of A and B: k×n when transposed, n×k otherwise.
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldb < max(1, col) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible: an empty C leaves nothing to compute.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(row-1)+col { // Element (i, j) of A is a[i*lda+j].
+		panic(shortA)
+	}
+	if len(b) < ldb*(row-1)+col {
+		panic(shortB)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	if alpha == 0 { // Only the beta*C term remains; A and B are not read.
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans { // C[i][j] combines two row dot products: A[j]·B[i] and A[i]·B[j].
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				atmp := a[i*lda : i*lda+k]
+				btmp := b[i*ldb : i*ldb+k]
+				ctmp := c[i*ldc+i : i*ldc+n]
+				if beta == 0 { // Assign rather than accumulate so the old contents of C are never read.
+					for jc := range ctmp {
+						j := i + jc // Column of C corresponding to offset jc within the upper-triangle row.
+						var tmp1, tmp2 float32
+						binner := b[j*ldb : j*ldb+k]
+						for l, v := range a[j*lda : j*lda+k] {
+							tmp1 += v * btmp[l]
+							tmp2 += atmp[l] * binner[l]
+						}
+						ctmp[jc] = alpha * (tmp1 + tmp2)
+					}
+				} else {
+					for jc := range ctmp {
+						j := i + jc // Column of C corresponding to offset jc within the upper-triangle row.
+						var tmp1, tmp2 float32
+						binner := b[j*ldb : j*ldb+k]
+						for l, v := range a[j*lda : j*lda+k] {
+							tmp1 += v * btmp[l]
+							tmp2 += atmp[l] * binner[l]
+						}
+						ctmp[jc] *= beta
+						ctmp[jc] += alpha * (tmp1 + tmp2)
+					}
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			atmp := a[i*lda : i*lda+k]
+			btmp := b[i*ldb : i*ldb+k]
+			ctmp := c[i*ldc : i*ldc+i+1]
+			if beta == 0 { // Assign rather than accumulate so the old contents of C are never read.
+				for j := 0; j <= i; j++ {
+					var tmp1, tmp2 float32
+					binner := b[j*ldb : j*ldb+k]
+					for l, v := range a[j*lda : j*lda+k] {
+						tmp1 += v * btmp[l]
+						tmp2 += atmp[l] * binner[l]
+					}
+					ctmp[j] = alpha * (tmp1 + tmp2)
+				}
+			} else {
+				for j := 0; j <= i; j++ {
+					var tmp1, tmp2 float32
+					binner := b[j*ldb : j*ldb+k]
+					for l, v := range a[j*lda : j*lda+k] {
+						tmp1 += v * btmp[l]
+						tmp2 += atmp[l] * binner[l]
+					}
+					ctmp[j] *= beta
+					ctmp[j] += alpha * (tmp1 + tmp2)
+				}
+			}
+		}
+		return
+	}
+	if ul == blas.Upper { // Transposed cases: row i of C is scaled by beta, then updated once per row l of A and B.
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			switch beta {
+			case 0: // Overwrite so the old contents of C are never used.
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			case 1: // Nothing to scale.
+			default:
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp1 := alpha * b[l*ldb+i]
+				tmp2 := alpha * a[l*lda+i]
+				btmp := b[l*ldb+i : l*ldb+n]
+				if tmp1 != 0 || tmp2 != 0 { // Skip the row when both scaled factors are zero.
+					for j, v := range a[l*lda+i : l*lda+n] {
+						ctmp[j] += v*tmp1 + btmp[j]*tmp2
+					}
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		switch beta {
+		case 0: // Overwrite so the old contents of C are never used.
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		case 1: // Nothing to scale.
+		default:
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp1 := alpha * b[l*ldb+i]
+			tmp2 := alpha * a[l*lda+i]
+			btmp := b[l*ldb : l*ldb+i+1]
+			if tmp1 != 0 || tmp2 != 0 { // Skip the row when both scaled factors are zero.
+				for j, v := range a[l*lda : l*lda+i+1] {
+					ctmp[j] += v*tmp1 + btmp[j]*tmp2
+				}
+			}
+		}
+	}
+}
+
+// Strmm performs one of the matrix-matrix operations
+//
+//	B = alpha * A * B   if tA == blas.NoTrans and side == blas.Left
+//	B = alpha * Aᵀ * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//	B = alpha * B * A   if tA == blas.NoTrans and side == blas.Right
+//	B = alpha * B * Aᵀ  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+//
+// where A is an n×n (side == blas.Right) or m×m (side == blas.Left) triangular matrix, B is an m×n matrix that is overwritten in place, and alpha is a scalar.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Strmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	k := n // Order of A: n when A is on the right, m when it is on the left.
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible: an empty B leaves nothing to compute.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k { // Element (i, j) of A is a[i*lda+j].
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n { // Element (i, j) of B is b[i*ldb+j].
+		panic(shortB)
+	}
+
+	if alpha == 0 { // The product is zero, so B is cleared without reading A.
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+
+	nonUnit := d == blas.NonUnit // The diagonal of A is read only when this is set.
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				for i := 0; i < m; i++ { // Top-down: rows k > i of B are still unmodified when row i reads them.
+					tmp := alpha
+					if nonUnit {
+						tmp *= a[i*lda+i]
+					}
+					btmp := b[i*ldb : i*ldb+n]
+					f32.ScalUnitary(tmp, btmp)
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						k := ka + i + 1 // Column of A holding va, and the row of B it multiplies.
+						if va != 0 {
+							f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+				}
+				return
+			}
+			for i := m - 1; i >= 0; i-- { // Bottom-up: rows k < i of B are still unmodified when row i reads them.
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[i*lda+i]
+				}
+				btmp := b[i*ldb : i*ldb+n]
+				f32.ScalUnitary(tmp, btmp)
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+			}
+			return
+		}
+		// Cases where a is transposed: row k of B is added into the other rows first, then scaled by its diagonal term.
+		if ul == blas.Upper {
+			for k := m - 1; k >= 0; k-- {
+				btmpk := b[k*ldb : k*ldb+n]
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					i := ia + k + 1 // Column of A holding va; it selects the destination row of B.
+					btmp := b[i*ldb : i*ldb+n]
+					if va != 0 {
+						f32.AxpyUnitary(alpha*va, btmpk, btmp)
+					}
+				}
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[k*lda+k]
+				}
+				if tmp != 1 { // Scaling by exactly 1 is skipped.
+					f32.ScalUnitary(tmp, btmpk)
+				}
+			}
+			return
+		}
+		for k := 0; k < m; k++ {
+			btmpk := b[k*ldb : k*ldb+n]
+			for i, va := range a[k*lda : k*lda+k] {
+				btmp := b[i*ldb : i*ldb+n]
+				if va != 0 {
+					f32.AxpyUnitary(alpha*va, btmpk, btmp)
+				}
+			}
+			tmp := alpha
+			if nonUnit {
+				tmp *= a[k*lda+k]
+			}
+			if tmp != 1 { // Scaling by exactly 1 is skipped.
+				f32.ScalUnitary(tmp, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is on the right. Each row of B is transformed independently.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				for k := n - 1; k >= 0; k-- { // Right to left: entries btmp[k+1:] are already final apart from this contribution.
+					tmp := alpha * btmp[k]
+					if tmp == 0 { // A zero product contributes nothing; btmp[k] is left as it is.
+						continue
+					}
+					btmp[k] = tmp
+					if nonUnit {
+						btmp[k] *= a[k*lda+k]
+					}
+					f32.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for k := 0; k < n; k++ { // Left to right: entries btmp[:k] are already final apart from this contribution.
+				tmp := alpha * btmp[k]
+				if tmp == 0 { // A zero product contributes nothing; btmp[k] is left as it is.
+					continue
+				}
+				btmp[k] = tmp
+				if nonUnit {
+					btmp[k] *= a[k*lda+k]
+				}
+				f32.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed. Each entry is a dot product with not-yet-overwritten entries of the same row.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j, vb := range btmp { // Left to right: the dot product reads btmp[j+1:], which is not yet overwritten.
+				tmp := vb
+				if nonUnit {
+					tmp *= a[j*lda+j]
+				}
+				tmp += f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
+				btmp[j] = alpha * tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		for j := n - 1; j >= 0; j-- { // Right to left: the dot product reads btmp[:j], which is not yet overwritten.
+			tmp := btmp[j]
+			if nonUnit {
+				tmp *= a[j*lda+j]
+			}
+			tmp += f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			btmp[j] = alpha * tmp
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go b/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go
new file mode 100644
index 00000000000..0d203513c15
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go
@@ -0,0 +1,913 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+var _ blas.Float64Level3 = Implementation{} // compile-time check that Implementation satisfies blas.Float64Level3
+
+// Dtrsm solves one of the matrix equations
+//
+//	A * X = alpha * B   if tA == blas.NoTrans and side == blas.Left
+//	Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//	X * A = alpha * B   if tA == blas.NoTrans and side == blas.Right
+//	X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+//
+// where A is an m×m (side == blas.Left) or n×n (side == blas.Right) triangular
+// matrix, X and B are m×n matrices, and alpha is a scalar.
+//
+// At entry to the function, X contains the values of B, and the result is
+// stored in-place into X.
+//
+// No check is made that A is invertible. Dtrsm panics on invalid arguments or short slices.
+func (Implementation) Dtrsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	k := n // k is the order of A: n for blas.Right, m for blas.Left
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	if alpha == 0 { // X = 0 regardless of A; a is not read
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+	nonUnit := d == blas.NonUnit // when false the diagonal of a is never read and is taken as 1
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper { // back substitution, last row of b first
+				for i := m - 1; i >= 0; i-- {
+					btmp := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						f64.ScalUnitary(alpha, btmp)
+					}
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						if va != 0 {
+							k := ka + i + 1
+							f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+					if nonUnit {
+						tmp := 1 / a[i*lda+i]
+						f64.ScalUnitary(tmp, btmp)
+					}
+				}
+				return
+			}
+			for i := 0; i < m; i++ { // ul == blas.Lower: forward substitution, first row of b first
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f64.ScalUnitary(alpha, btmp)
+				}
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+				if nonUnit {
+					tmp := 1 / a[i*lda+i]
+					f64.ScalUnitary(tmp, btmp)
+				}
+			}
+			return
+		}
+		// Cases where a is transposed: Aᵀ * X = alpha * B.
+		if ul == blas.Upper {
+			for k := 0; k < m; k++ {
+				btmpk := b[k*ldb : k*ldb+n]
+				if nonUnit {
+					tmp := 1 / a[k*lda+k]
+					f64.ScalUnitary(tmp, btmpk)
+				}
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					if va != 0 {
+						i := ia + k + 1
+						f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+					}
+				}
+				if alpha != 1 {
+					f64.ScalUnitary(alpha, btmpk)
+				}
+			}
+			return
+		}
+		for k := m - 1; k >= 0; k-- {
+			btmpk := b[k*ldb : k*ldb+n]
+			if nonUnit {
+				tmp := 1 / a[k*lda+k]
+				f64.ScalUnitary(tmp, btmpk)
+			}
+			for i, va := range a[k*lda : k*lda+k] {
+				if va != 0 {
+					f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+				}
+			}
+			if alpha != 1 {
+				f64.ScalUnitary(alpha, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is to the right of X; each row of b is solved independently.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f64.ScalUnitary(alpha, btmp)
+				}
+				for k, vb := range btmp {
+					if vb == 0 {
+						continue
+					}
+					if nonUnit {
+						btmp[k] /= a[k*lda+k]
+					}
+					f64.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			if alpha != 1 {
+				f64.ScalUnitary(alpha, btmp)
+			}
+			for k := n - 1; k >= 0; k-- {
+				if btmp[k] == 0 {
+					continue
+				}
+				if nonUnit {
+					btmp[k] /= a[k*lda+k]
+				}
+				f64.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: X * Aᵀ = alpha * B.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := n - 1; j >= 0; j-- {
+				tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
+				if nonUnit {
+					tmp /= a[j*lda+j]
+				}
+				btmp[j] = tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		for j := 0; j < n; j++ {
+			tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			if nonUnit {
+				tmp /= a[j*lda+j]
+			}
+			btmp[j] = tmp
+		}
+	}
+}
+
+// Dsymm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C  if side == blas.Left
+//	C = alpha * B * A + beta * C  if side == blas.Right
+//
+// where A is an m×m (side == blas.Left) or n×n (side == blas.Right) symmetric
+// matrix, B and C are m×n matrices, and alpha and beta are scalars.
+func (Implementation) Dsymm(s blas.Side, ul blas.Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	if s != blas.Right && s != blas.Left {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	k := n // k is the order of A: n for blas.Right, m for blas.Left
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+	if len(c) < ldc*(m-1)+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: C is left unchanged.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	if beta == 0 { // zero c first, so its previous contents (even NaN/Inf) do not affect the result
+		for i := 0; i < m; i++ {
+			ctmp := c[i*ldc : i*ldc+n]
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		}
+	}
+
+	if alpha == 0 { // only the beta * C term remains
+		if beta != 0 {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := 0; j < n; j++ {
+					ctmp[j] *= beta
+				}
+			}
+		}
+		return
+	}
+
+	isUpper := ul == blas.Upper // selects which triangle of a is read
+	if s == blas.Left {
+		for i := 0; i < m; i++ {
+			atmp := alpha * a[i*lda+i]
+			btmp := b[i*ldb : i*ldb+n]
+			ctmp := c[i*ldc : i*ldc+n]
+			for j, v := range btmp {
+				ctmp[j] *= beta
+				ctmp[j] += atmp * v
+			}
+
+			for k := 0; k < i; k++ { // entries left of the diagonal in row i of A; read as (k, i) when upper
+				var atmp float64
+				if isUpper {
+					atmp = a[k*lda+i]
+				} else {
+					atmp = a[i*lda+k]
+				}
+				atmp *= alpha
+				f64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+			for k := i + 1; k < m; k++ { // entries right of the diagonal in row i of A; read as (k, i) when lower
+				var atmp float64
+				if isUpper {
+					atmp = a[i*lda+k]
+				} else {
+					atmp = a[k*lda+i]
+				}
+				atmp *= alpha
+				f64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+		}
+		return
+	}
+	if isUpper { // s == blas.Right: C = alpha * B * A + beta * C
+		for i := 0; i < m; i++ {
+			for j := n - 1; j >= 0; j-- {
+				tmp := alpha * b[i*ldb+j]
+				var tmp2 float64
+				atmp := a[j*lda+j+1 : j*lda+n]
+				btmp := b[i*ldb+j+1 : i*ldb+n]
+				ctmp := c[i*ldc+j+1 : i*ldc+n]
+				for k, v := range atmp {
+					ctmp[k] += tmp * v
+					tmp2 += btmp[k] * v
+				}
+				c[i*ldc+j] *= beta
+				c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		for j := 0; j < n; j++ {
+			tmp := alpha * b[i*ldb+j]
+			var tmp2 float64
+			atmp := a[j*lda : j*lda+j]
+			btmp := b[i*ldb : i*ldb+j]
+			ctmp := c[i*ldc : i*ldc+j]
+			for k, v := range atmp {
+				ctmp[k] += tmp * v
+				tmp2 += btmp[k] * v
+			}
+			c[i*ldc+j] *= beta
+			c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+		}
+	}
+}
+
+// Dsyrk performs one of the symmetric rank-k operations
+//
+//	C = alpha * A * Aᵀ + beta * C  if tA == blas.NoTrans
+//	C = alpha * Aᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// where A is an n×k (tA == blas.NoTrans) or k×n (otherwise) matrix, C is an
+// n×n symmetric matrix, and alpha and beta are scalars.
+func (Implementation) Dsyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	row, col := k, n // stored shape of a: k×n when transposed, n×k otherwise
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	if alpha == 0 { // only beta * C remains; just the ul triangle of c is written
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans { // C = alpha * A * Aᵀ + beta * C: entry (i, j) combines rows i and j of a
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				atmp := a[i*lda : i*lda+k]
+				if beta == 0 {
+					for jc := range ctmp {
+						j := jc + i
+						ctmp[jc] = alpha * f64.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, vc := range ctmp {
+						j := jc + i
+						ctmp[jc] = vc*beta + alpha*f64.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			atmp := a[i*lda : i*lda+k]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = alpha * f64.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			} else {
+				for j, vc := range ctmp {
+					ctmp[j] = vc*beta + alpha*f64.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: C = alpha * Aᵀ * A + beta * C.
+	if ul == blas.Upper {
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			} else if beta != 1 {
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp := alpha * a[l*lda+i]
+				if tmp != 0 {
+					f64.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp)
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		if beta != 1 { // NOTE(review): beta == 0 multiplies instead of zeroing (cf. the Upper branch), so NaN/Inf already in c persist — confirm intended
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp := alpha * a[l*lda+i]
+			if tmp != 0 {
+				f64.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp)
+			}
+		}
+	}
+}
+
+// Dsyr2k performs one of the symmetric rank 2k operations
+//
+//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if tA == blas.NoTrans
+//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// where A and B are n×k (tA == blas.NoTrans) or k×n (otherwise) matrices, C is
+// an n×n symmetric matrix, and alpha and beta are scalars.
+func (Implementation) Dsyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	row, col := k, n // stored shape of a and b: k×n when transposed, n×k otherwise
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldb < max(1, col) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(b) < ldb*(row-1)+col {
+		panic(shortB)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	if alpha == 0 { // only beta * C remains; just the ul triangle of c is written
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans { // C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				atmp := a[i*lda : i*lda+k]
+				btmp := b[i*ldb : i*ldb+k]
+				ctmp := c[i*ldc+i : i*ldc+n]
+				if beta == 0 {
+					for jc := range ctmp {
+						j := i + jc
+						var tmp1, tmp2 float64
+						binner := b[j*ldb : j*ldb+k]
+						for l, v := range a[j*lda : j*lda+k] {
+							tmp1 += v * btmp[l]
+							tmp2 += atmp[l] * binner[l]
+						}
+						ctmp[jc] = alpha * (tmp1 + tmp2)
+					}
+				} else {
+					for jc := range ctmp {
+						j := i + jc
+						var tmp1, tmp2 float64
+						binner := b[j*ldb : j*ldb+k]
+						for l, v := range a[j*lda : j*lda+k] {
+							tmp1 += v * btmp[l]
+							tmp2 += atmp[l] * binner[l]
+						}
+						ctmp[jc] *= beta
+						ctmp[jc] += alpha * (tmp1 + tmp2)
+					}
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			atmp := a[i*lda : i*lda+k]
+			btmp := b[i*ldb : i*ldb+k]
+			ctmp := c[i*ldc : i*ldc+i+1]
+			if beta == 0 {
+				for j := 0; j <= i; j++ {
+					var tmp1, tmp2 float64
+					binner := b[j*ldb : j*ldb+k]
+					for l, v := range a[j*lda : j*lda+k] {
+						tmp1 += v * btmp[l]
+						tmp2 += atmp[l] * binner[l]
+					}
+					ctmp[j] = alpha * (tmp1 + tmp2)
+				}
+			} else {
+				for j := 0; j <= i; j++ {
+					var tmp1, tmp2 float64
+					binner := b[j*ldb : j*ldb+k]
+					for l, v := range a[j*lda : j*lda+k] {
+						tmp1 += v * btmp[l]
+						tmp2 += atmp[l] * binner[l]
+					}
+					ctmp[j] *= beta
+					ctmp[j] += alpha * (tmp1 + tmp2)
+				}
+			}
+		}
+		return
+	}
+	if ul == blas.Upper { // a and b transposed: C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			switch beta {
+			case 0:
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			case 1: // beta == 1: c needs no scaling
+			default:
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp1 := alpha * b[l*ldb+i]
+				tmp2 := alpha * a[l*lda+i]
+				btmp := b[l*ldb+i : l*ldb+n]
+				if tmp1 != 0 || tmp2 != 0 {
+					for j, v := range a[l*lda+i : l*lda+n] {
+						ctmp[j] += v*tmp1 + btmp[j]*tmp2
+					}
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		switch beta {
+		case 0:
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		case 1: // beta == 1: c needs no scaling
+		default:
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp1 := alpha * b[l*ldb+i]
+			tmp2 := alpha * a[l*lda+i]
+			btmp := b[l*ldb : l*ldb+i+1]
+			if tmp1 != 0 || tmp2 != 0 {
+				for j, v := range a[l*lda : l*lda+i+1] {
+					ctmp[j] += v*tmp1 + btmp[j]*tmp2
+				}
+			}
+		}
+	}
+}
+
+// Dtrmm performs one of the matrix-matrix operations
+//
+//	B = alpha * A * B   if tA == blas.NoTrans and side == blas.Left
+//	B = alpha * Aᵀ * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//	B = alpha * B * A   if tA == blas.NoTrans and side == blas.Right
+//	B = alpha * B * Aᵀ  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+//
+// where A is an m×m (side == blas.Left) or n×n (side == blas.Right) triangular matrix, B is an m×n matrix, and alpha is a scalar.
+func (Implementation) Dtrmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	k := n // k is the order of A: n for blas.Right, m for blas.Left
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	if alpha == 0 { // B = 0 regardless of A; a is not read
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+
+	nonUnit := d == blas.NonUnit // when false the diagonal of a is never read and is taken as 1
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				for i := 0; i < m; i++ {
+					tmp := alpha
+					if nonUnit {
+						tmp *= a[i*lda+i]
+					}
+					btmp := b[i*ldb : i*ldb+n]
+					f64.ScalUnitary(tmp, btmp)
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						k := ka + i + 1
+						if va != 0 {
+							f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+				}
+				return
+			}
+			for i := m - 1; i >= 0; i-- {
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[i*lda+i]
+				}
+				btmp := b[i*ldb : i*ldb+n]
+				f64.ScalUnitary(tmp, btmp)
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+			}
+			return
+		}
+		// Cases where a is transposed: B = alpha * Aᵀ * B.
+		if ul == blas.Upper {
+			for k := m - 1; k >= 0; k-- {
+				btmpk := b[k*ldb : k*ldb+n]
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					i := ia + k + 1
+					btmp := b[i*ldb : i*ldb+n]
+					if va != 0 {
+						f64.AxpyUnitary(alpha*va, btmpk, btmp)
+					}
+				}
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[k*lda+k]
+				}
+				if tmp != 1 {
+					f64.ScalUnitary(tmp, btmpk)
+				}
+			}
+			return
+		}
+		for k := 0; k < m; k++ {
+			btmpk := b[k*ldb : k*ldb+n]
+			for i, va := range a[k*lda : k*lda+k] {
+				btmp := b[i*ldb : i*ldb+n]
+				if va != 0 {
+					f64.AxpyUnitary(alpha*va, btmpk, btmp)
+				}
+			}
+			tmp := alpha
+			if nonUnit {
+				tmp *= a[k*lda+k]
+			}
+			if tmp != 1 {
+				f64.ScalUnitary(tmp, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is on the right; each row of b is updated independently.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				for k := n - 1; k >= 0; k-- {
+					tmp := alpha * btmp[k]
+					if tmp == 0 {
+						continue
+					}
+					btmp[k] = tmp
+					if nonUnit {
+						btmp[k] *= a[k*lda+k]
+					}
+					f64.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for k := 0; k < n; k++ {
+				tmp := alpha * btmp[k]
+				if tmp == 0 {
+					continue
+				}
+				btmp[k] = tmp
+				if nonUnit {
+					btmp[k] *= a[k*lda+k]
+				}
+				f64.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: B = alpha * B * Aᵀ.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j, vb := range btmp {
+				tmp := vb
+				if nonUnit {
+					tmp *= a[j*lda+j]
+				}
+				tmp += f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
+				btmp[j] = alpha * tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		for j := n - 1; j >= 0; j-- {
+			tmp := btmp[j]
+			if nonUnit {
+				tmp *= a[j*lda+j]
+			}
+			tmp += f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			btmp[j] = alpha * tmp
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go b/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go
new file mode 100644
index 00000000000..7b03ce46a8e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go
@@ -0,0 +1,301 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.
+
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"runtime"
+	"sync"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Sgemm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C
+//	C = alpha * Aᵀ * B + beta * C
+//	C = alpha * A * Bᵀ + beta * C
+//	C = alpha * Aᵀ * Bᵀ + beta * C
+//
+// where A is an m×k or k×m dense matrix, B is an n×k or k×n dense matrix, C is
+// an m×n matrix, and alpha and beta are scalars. tA and tB specify whether A or
+// B are transposed; the k×m and n×k shapes are the transposed ones.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sgemm(tA, tB blas.Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
+	switch tA {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch tB {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	aTrans := tA == blas.Trans || tA == blas.ConjTrans // ConjTrans is handled exactly like Trans
+	if aTrans {
+		if lda < max(1, m) {
+			panic(badLdA)
+		}
+	} else {
+		if lda < max(1, k) {
+			panic(badLdA)
+		}
+	}
+	bTrans := tB == blas.Trans || tB == blas.ConjTrans
+	if bTrans {
+		if ldb < max(1, k) {
+			panic(badLdB)
+		}
+	} else {
+		if ldb < max(1, n) {
+			panic(badLdB)
+		}
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if aTrans {
+		if len(a) < (k-1)*lda+m {
+			panic(shortA)
+		}
+	} else {
+		if len(a) < (m-1)*lda+k {
+			panic(shortA)
+		}
+	}
+	if bTrans {
+		if len(b) < (n-1)*ldb+k {
+			panic(shortB)
+		}
+	} else {
+		if len(b) < (k-1)*ldb+n {
+			panic(shortB)
+		}
+	}
+	if len(c) < (m-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible: the product term vanishes and beta == 1 leaves c unchanged.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	// Scale c by beta up front; the multiplication below only accumulates into c.
+	if beta != 1 {
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+		}
+	}
+
+	sgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+}
+
+func sgemmParallel(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// sgemmParallel computes a parallel matrix multiplication by partitioning
+	// a and b into sub-blocks, and updating c with the multiplication of the sub-blocks.
+	// In all cases,
+	// A = [ 	A_11	A_12 ... 	A_1j
+	//			A_21	A_22 ...	A_2j
+	//				...
+	//			A_i1	A_i2 ...	A_ij]
+	//
+	// and same for B. All of the submatrix sizes are blockSize×blockSize except
+	// at the edges.
+	//
+	// In all cases, there is one dimension for each matrix along which
+	// C must be updated sequentially.
+	// Cij = \sum_k Aik Bkj,	(A * B)
+	// Cij = \sum_k Aki Bkj,	(Aᵀ * B)
+	// Cij = \sum_k Aik Bjk,	(A * Bᵀ)
+	// Cij = \sum_k Aki Bjk,	(Aᵀ * Bᵀ)
+	//
+	// This code computes one {i, j} block sequentially along the k dimension,
+	// and computes all of the {i, j} blocks concurrently. This
+	// partitioning allows Cij to be updated in-place without race-conditions.
+	// One goroutine is launched per {i, j} block, but a buffered channel is
+	// used as a semaphore so that at most GOMAXPROCS of them run at any one
+	// time.
+	//
+	// http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix
+	// multiplies, though this code does not copy matrices to attempt to eliminate
+	// cache misses.
+
+	maxKLen := k // keep the full k; the worker closure below shadows k with the block offset
+	parBlocks := blocks(m, blockSize) * blocks(n, blockSize)
+	if parBlocks < minParBlock {
+		// The matrix multiplication is small in the dimensions where it can be
+		// computed concurrently. Just do it in serial.
+		sgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+
+	// workerLimit bounds the number of concurrently running workers;
+	// its capacity is runtime.GOMAXPROCS(0).
+	workerLimit := make(chan struct{}, runtime.GOMAXPROCS(0))
+
+	// wg is used to wait for all of the block computations to finish.
+	var wg sync.WaitGroup
+	wg.Add(parBlocks)
+	defer wg.Wait()
+
+	for i := 0; i < m; i += blockSize {
+		for j := 0; j < n; j += blockSize {
+			workerLimit <- struct{}{} // blocks here while the channel is full, i.e. GOMAXPROCS workers are running
+			go func(i, j int) {
+				defer func() {
+					wg.Done()
+					<-workerLimit
+				}()
+
+				leni := blockSize
+				if i+leni > m {
+					leni = m - i
+				}
+				lenj := blockSize
+				if j+lenj > n {
+					lenj = n - j
+				}
+
+				cSub := sliceView32(c, ldc, i, j, leni, lenj)
+
+				// Compute A_ik B_kj for all k
+				for k := 0; k < maxKLen; k += blockSize {
+					lenk := blockSize
+					if k+lenk > maxKLen {
+						lenk = maxKLen - k
+					}
+					var aSub, bSub []float32
+					if aTrans {
+						aSub = sliceView32(a, lda, k, i, lenk, leni)
+					} else {
+						aSub = sliceView32(a, lda, i, k, leni, lenk)
+					}
+					if bTrans {
+						bSub = sliceView32(b, ldb, j, k, lenj, lenk)
+					} else {
+						bSub = sliceView32(b, ldb, k, j, lenk, lenj)
+					}
+					sgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha)
+				}
+			}(i, j)
+		}
+	}
+}
+
+// sgemmSerial is the serial matrix multiply; it dispatches on aTrans and bTrans to one of the four kernels below.
+func sgemmSerial(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	switch {
+	case !aTrans && !bTrans:
+		sgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	case aTrans && !bTrans:
+		sgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	case !aTrans && bTrans:
+		sgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	case aTrans && bTrans:
+		sgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	default: // all four combinations of the two bools are handled above
+		panic("unreachable")
+	}
+}
+
+// sgemmSerialNotNot is sgemmSerial for the case where neither a nor b is transposed.
+func sgemmSerialNotNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Rows are sliced up front rather than indexing with the literal
+	// [i*stride+j] because it was approximately 5 times faster as of go 1.3.
+	for i := 0; i < m; i++ {
+		ctmp := c[i*ldc : i*ldc+n]
+		for l, v := range a[i*lda : i*lda+k] {
+			tmp := alpha * v
+			if tmp != 0 {
+				f32.AxpyUnitary(tmp, b[l*ldb:l*ldb+n], ctmp)
+			}
+		}
+	}
+}
+
+// sgemmSerialTransNot is sgemmSerial for the case where a is transposed and b is not.
+func sgemmSerialTransNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Rows are sliced up front rather than indexing with the literal
+	// [i*stride+j] because it was approximately 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		btmp := b[l*ldb : l*ldb+n]
+		for i, v := range a[l*lda : l*lda+m] {
+			tmp := alpha * v
+			if tmp != 0 {
+				ctmp := c[i*ldc : i*ldc+n]
+				f32.AxpyUnitary(tmp, btmp, ctmp)
+			}
+		}
+	}
+}
+
+// sgemmSerialNotTrans is sgemmSerial for the case where a is not transposed and b is.
+func sgemmSerialNotTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Rows are sliced up front rather than indexing with the literal
+	// [i*stride+j] because it was approximately 5 times faster as of go 1.3.
+	for i := 0; i < m; i++ {
+		atmp := a[i*lda : i*lda+k]
+		ctmp := c[i*ldc : i*ldc+n]
+		for j := 0; j < n; j++ {
+			ctmp[j] += alpha * f32.DotUnitary(atmp, b[j*ldb:j*ldb+k])
+		}
+	}
+}
+
+// sgemmSerialTransTrans is sgemmSerial for the case where both a and b are transposed.
+func sgemmSerialTransTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Rows are sliced up front rather than indexing with the literal
+	// [i*stride+j] because it was approximately 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		for i, v := range a[l*lda : l*lda+m] {
+			tmp := alpha * v
+			if tmp != 0 {
+				ctmp := c[i*ldc : i*ldc+n]
+				f32.AxpyInc(tmp, b[l:], ctmp, uintptr(n), uintptr(ldb), 1, 0, 0) // presumably walks b with stride ldb (column l) and ctmp with stride 1 — see f32.AxpyInc
+			}
+		}
+	}
+}
+
+func sliceView32(a []float32, lda, i, j, r, c int) []float32 {
+	return a[i*lda+j : (i+r-1)*lda+j+c] // r×c window of a with top-left element (i, j); a sub-slice of a, still addressed with stride lda
+}
diff --git a/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash b/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash
new file mode 100644
index 00000000000..a107fce4923
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash
@@ -0,0 +1,224 @@
+#!/usr/bin/env bash
+
+# Copyright ©2015 The Gonum Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+WARNINGF32='//\
+// Float32 implementations are autogenerated and not directly tested.\
+' # spliced by the sed passes below in front of each renamed float32 method
+WARNINGC64='//\
+// Complex64 implementations are autogenerated and not directly tested.\
+' # spliced by the sed passes below in front of each renamed complex64 method
+
+# Level1 routines.
+# level1float32.go: level1float64.go with float64, f64.* and D*/Id* names rewritten to float32, f32.* and S*/Is*.
+echo Generating level1float32.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32.go
+cat level1float64.go \
+| gofmt -r 'blas.Float64Level1 -> blas.Float32Level1' \
+\
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'blas.DrotmParams -> blas.SrotmParams' \
+\
+| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+| gofmt -r 'f64.L2NormInc -> f32.L2NormInc' \
+| gofmt -r 'f64.L2NormUnitary -> f32.L2NormUnitary' \
+| gofmt -r 'f64.ScalInc -> f32.ScalInc' \
+| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e "s_^\(func (Implementation) \)Id\(.*\)\$_$WARNINGF32\1Is\2_" \
+      -e 's_^// Id_// Is_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+      -e 's_safmin = 0x1p-1022_safmin = 0x1p-126_' \
+>> level1float32.go
+# level1cmplx64.go: level1cmplx128.go with complex128/float64, c128.* and Z* names rewritten to complex64/float32, c64.* and C*.
+echo Generating level1cmplx64.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1cmplx64.go
+cat level1cmplx128.go \
+| gofmt -r 'blas.Complex128Level1 -> blas.Complex64Level1' \
+\
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'complex128 -> complex64' \
+\
+| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \
+| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
+| gofmt -r 'c128.DotcInc -> c64.DotcInc' \
+| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \
+| gofmt -r 'c128.DotuInc -> c64.DotuInc' \
+| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
+| gofmt -r 'c128.ScalInc -> c64.ScalInc' \
+| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
+| gofmt -r 'dcabs1 -> scabs1' \
+\
+| sed -e "s_^\(func (Implementation) \)Zdot\(.*\)\$_$WARNINGC64\1Cdot\2_" \
+      -e 's_^// Zdot_// Cdot_' \
+      -e "s_^\(func (Implementation) \)Zdscal\(.*\)\$_$WARNINGC64\1Csscal\2_" \
+      -e 's_^// Zdscal_// Csscal_' \
+      -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
+      -e 's_^// Z_// C_' \
+      -e "s_^\(func (Implementation) \)Iz\(.*\)\$_$WARNINGC64\1Ic\2_" \
+      -e 's_^// Iz_// Ic_' \
+      -e "s_^\(func (Implementation) \)Dz\(.*\)\$_$WARNINGC64\1Sc\2_" \
+      -e 's_^// Dz_// Sc_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+>> level1cmplx64.go
+# level1float32_sdot.go: level1float64_ddot.go with float64, f64.Dot* and D* names rewritten to float32, f32.Dot* and S*.
+echo Generating level1float32_sdot.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdot.go
+cat level1float64_ddot.go \
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.DotInc -> f32.DotInc' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level1float32_sdot.go
+# level1float32_dsdot.go: only []float64 slice types become []float32; the dot kernels become f32.Ddot* and D* names become Ds*.
+echo Generating level1float32_dsdot.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_dsdot.go
+cat level1float64_ddot.go \
+| gofmt -r '[]float64 -> []float32' \
+\
+| gofmt -r 'f64.DotInc -> f32.DdotInc' \
+| gofmt -r 'f64.DotUnitary -> f32.DdotUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Ds\2_" \
+      -e 's_^// D_// Ds_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level1float32_dsdot.go
+
+echo Generating level1float32_sdsdot.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdsdot.go
+cat level1float64_ddot.go \
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.DotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)) -> alpha + float32(f32.DdotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)))' \
+| gofmt -r 'f64.DotUnitary(a, b) -> alpha + float32(f32.DdotUnitary(a, b))' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Sds\2_" \
+      -e 's_^// D\(.*\)$_// Sds\1 plus a constant_' \
+      -e 's_\\sum_alpha + \\sum_' \
+      -e 's/n int/n int, alpha float32/' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level1float32_sdsdot.go
+
+
+# Level2 routines.
+# level2float32.go: level2float64.go with float64, f64.* and D* names rewritten to float32, f32.* and S*.
+echo Generating level2float32.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2float32.go
+cat level2float64.go \
+| gofmt -r 'blas.Float64Level2 -> blas.Float32Level2' \
+\
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
+| gofmt -r 'f64.AxpyIncTo -> f32.AxpyIncTo' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \
+| gofmt -r 'f64.DotInc -> f32.DotInc' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+| gofmt -r 'f64.ScalInc -> f32.ScalInc' \
+| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
+| gofmt -r 'f64.Ger -> f32.Ger' \
+| gofmt -r 'f64.GemvN -> f32.GemvN' \
+| gofmt -r 'f64.GemvT -> f32.GemvT' \
+| gofmt -r 'Implementation{}.Dscal -> Implementation{}.Sscal' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level2float32.go
+# level2cmplx64.go: level2cmplx128.go with complex128/float64, c128.* and Z* names rewritten to complex64/float32, c64.* and C*.
+echo Generating level2cmplx64.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2cmplx64.go
+cat level2cmplx128.go \
+| gofmt -r 'blas.Complex128Level2 -> blas.Complex64Level2' \
+\
+| gofmt -r 'complex128 -> complex64' \
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \
+| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
+| gofmt -r 'c128.DotuInc -> c64.DotuInc' \
+| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
+| gofmt -r 'c128.ScalInc -> c64.ScalInc' \
+| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
+      -e 's_^// Z_// C_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
+      -e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \
+>> level2cmplx64.go
+
+# Level3 routines.
+# level3float32.go: level3float64.go with float64, f64.* and D* names rewritten to float32, f32.* and S*.
+echo Generating level3float32.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3float32.go
+cat level3float64.go \
+| gofmt -r 'blas.Float64Level3 -> blas.Float32Level3' \
+\
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level3float32.go
+# sgemm.go: dgemm.go with float64, f64.* and dgemm*/D* names rewritten to float32, f32.* and sgemm*/S*.
+echo Generating sgemm.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > sgemm.go
+cat dgemm.go \
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'sliceView64 -> sliceView32' \
+\
+| gofmt -r 'dgemmParallel -> sgemmParallel' \
+| gofmt -r 'computeNumBlocks64 -> computeNumBlocks32' \
+| gofmt -r 'dgemmSerial -> sgemmSerial' \
+| gofmt -r 'dgemmSerialNotNot -> sgemmSerialNotNot' \
+| gofmt -r 'dgemmSerialTransNot -> sgemmSerialTransNot' \
+| gofmt -r 'dgemmSerialNotTrans -> sgemmSerialNotTrans' \
+| gofmt -r 'dgemmSerialTransTrans -> sgemmSerialTransTrans' \
+\
+| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_^// d_// s_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> sgemm.go
+# level3cmplx64.go: level3cmplx128.go with complex128/float64, c128.* and Z* names rewritten to complex64/float32, c64.* and C*.
+echo Generating level3cmplx64.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3cmplx64.go
+cat level3cmplx128.go \
+| gofmt -r 'blas.Complex128Level3 -> blas.Complex64Level3' \
+\
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'complex128 -> complex64' \
+\
+| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
+| gofmt -r 'c128.DscalUnitary -> c64.SscalUnitary' \
+| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \
+| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
+| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
+      -e 's_^// Z_// C_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
+      -e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \
+>> level3cmplx64.go
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyinc_amd64.s
new file mode 100644
index 00000000000..d9b71a0d6b2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyinc_amd64.s
@@ -0,0 +1,134 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVDDUP X2, X3
+#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
+// MOVDDUP X4, X5
+#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
+// MOVDDUP X6, X7
+#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
+// MOVDDUP X8, X9
+#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
+
+// ADDSUBPD X2, X3
+#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+// ADDSUBPD X4, X5
+#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+// ADDSUBPD X6, X7
+#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+// ADDSUBPD X8, X9
+#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyInc(SB), NOSPLIT, $0
+	MOVQ   x_base+16(FP), SI // SI = &x
+	MOVQ   y_base+40(FP), DI // DI = &y
+	MOVQ   n+64(FP), CX      // CX = n
+	CMPQ   CX, $0            // if n==0 { return }
+	JE     axpyi_end
+	MOVQ   ix+88(FP), R8     // R8 = ix  // Load the first index
+	SHLQ   $4, R8            // R8 *= sizeof(complex128)
+	MOVQ   iy+96(FP), R9     // R9 = iy
+	SHLQ   $4, R9            // R9 *= sizeof(complex128)
+	LEAQ   (SI)(R8*1), SI    // SI = &(x[ix])
+	LEAQ   (DI)(R9*1), DI    // DI = &(y[iy])
+	MOVQ   DI, DX            // DX = DI      // Separate Read/Write pointers
+	MOVQ   incX+72(FP), R8   // R8 = incX
+	SHLQ   $4, R8            // R8 *= sizeof(complex128)
+	MOVQ   incY+80(FP), R9   // R9 = incY
+	SHLQ   $4, R9            // R9 *= sizeof(complex128)
+	MOVUPS alpha+0(FP), X0   // X0 = { imag(a), real(a) }
+	MOVAPS X0, X1
+	SHUFPD $0x1, X1, X1      // X1 = { real(a), imag(a) }
+	MOVAPS X0, X10           // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $3, CX            // CX = n % 4
+	SHRQ   $2, BX            // BX = floor( n / 4 )
+	JZ     axpyi_tail        // if BX == 0 { goto axpyi_tail }
+
+axpyi_loop: // do {
+	MOVUPS (SI), X2       // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS (SI)(R8*1), X4
+	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
+	MOVUPS (SI), X6
+	MOVUPS (SI)(R8*1), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+	MULPD X1, X2
+	MULPD X0, X3
+	MULPD X11, X4
+	MULPD X10, X5
+	MULPD X1, X6
+	MULPD X0, X7
+	MULPD X11, X8
+	MULPD X10, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX), X3
+	ADDPD  (DX)(R9*1), X5
+	LEAQ   (DX)(R9*2), DX // DX = &(DX[incY*2])
+	ADDPD  (DX), X7
+	ADDPD  (DX)(R9*1), X9
+	MOVUPS X3, (DI)       // y[i] = X_(i+1)
+	MOVUPS X5, (DI)(R9*1)
+	LEAQ   (DI)(R9*2), DI // DI = &(DI[incY*2])
+	MOVUPS X7, (DI)
+	MOVUPS X9, (DI)(R9*1)
+	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
+	LEAQ   (DX)(R9*2), DX // DX = &(DX[incY*2])
+	LEAQ   (DI)(R9*2), DI // DI = &(DI[incY*2])
+	DECQ   BX
+	JNZ    axpyi_loop     // } while --BX > 0
+	CMPQ   CX, $0         // if CX == 0 { return }
+	JE     axpyi_end
+
+axpyi_tail: // do {
+	MOVUPS (SI), X2     // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  X1, X2       // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	MULPD  X0, X3       // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DI), X3
+	MOVUPS X3, (DI)   // y[i] = X_(i+1)
+	ADDQ   R8, SI     // SI = &(SI[incX])
+	ADDQ   R9, DI     // DI = &(DI[incY])
+	LOOP   axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyincto_amd64.s
new file mode 100644
index 00000000000..d35e95d982c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyincto_amd64.s
@@ -0,0 +1,141 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVDDUP X2, X3
+#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
+// MOVDDUP X4, X5
+#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
+// MOVDDUP X6, X7
+#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
+// MOVDDUP X8, X9
+#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
+
+// ADDSUBPD X2, X3
+#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+// ADDSUBPD X4, X5
+#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+// ADDSUBPD X6, X7
+#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+// ADDSUBPD X8, X9
+#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyIncTo(SB), NOSPLIT, $0
+	MOVQ   dst_base+0(FP), DI // DI = &dst
+	MOVQ   x_base+56(FP), SI  // SI = &x
+	MOVQ   y_base+80(FP), DX  // DX = &y
+	MOVQ   n+104(FP), CX      // CX = n
+	CMPQ   CX, $0             // if n==0 { return }
+	JE     axpyi_end
+	MOVQ   ix+128(FP), R8     // R8 = ix  // Load the first index
+	SHLQ   $4, R8             // R8 *= sizeof(complex128)
+	MOVQ   iy+136(FP), R9     // R9 = iy
+	SHLQ   $4, R9             // R9 *= sizeof(complex128)
+	MOVQ   idst+32(FP), R10   // R10 = idst
+	SHLQ   $4, R10            // R10 *= sizeof(complex128)
+	LEAQ   (SI)(R8*1), SI     // SI = &(x[ix])
+	LEAQ   (DX)(R9*1), DX     // DX = &(y[iy])
+	LEAQ   (DI)(R10*1), DI    // DI = &(dst[idst])
+	MOVQ   incX+112(FP), R8   // R8 = incX
+	SHLQ   $4, R8             // R8 *= sizeof(complex128)
+	MOVQ   incY+120(FP), R9   // R9 = incY
+	SHLQ   $4, R9             // R9 *= sizeof(complex128)
+	MOVQ   incDst+24(FP), R10 // R10 = incDst
+	SHLQ   $4, R10            // R10 *= sizeof(complex128)
+	MOVUPS alpha+40(FP), X0   // X0 = { imag(a), real(a) }
+	MOVAPS X0, X1
+	SHUFPD $0x1, X1, X1       // X1 = { real(a), imag(a) }
+	MOVAPS X0, X10            // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $3, CX             // CX = n % 4
+	SHRQ   $2, BX             // BX = floor( n / 4 )
+	JZ     axpyi_tail         // if BX == 0 { goto axpyi_tail }
+
+axpyi_loop: // do {
+	MOVUPS (SI), X2       // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS (SI)(R8*1), X4
+	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
+
+	MOVUPS (SI), X6
+	MOVUPS (SI)(R8*1), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+	MULPD X1, X2
+	MULPD X0, X3
+	MULPD X11, X4
+	MULPD X10, X5
+	MULPD X1, X6
+	MULPD X0, X7
+	MULPD X11, X8
+	MULPD X10, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX), X3
+	ADDPD  (DX)(R9*1), X5
+	LEAQ   (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	ADDPD  (DX), X7
+	ADDPD  (DX)(R9*1), X9
+	MOVUPS X3, (DI)        // dst[i] = X_(i+1)
+	MOVUPS X5, (DI)(R10*1)
+	LEAQ   (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	MOVUPS X7, (DI)
+	MOVUPS X9, (DI)(R10*1)
+	LEAQ   (SI)(R8*2), SI  // SI = &(SI[incX*2])
+	LEAQ   (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	LEAQ   (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	DECQ   BX
+	JNZ    axpyi_loop      // } while --BX > 0
+	CMPQ   CX, $0          // if CX == 0 { return }
+	JE     axpyi_end
+
+axpyi_tail: // do {
+	MOVUPS (SI), X2     // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  X1, X2       // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	MULPD  X0, X3       // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX), X3
+	MOVUPS X3, (DI)   // dst[i] = X_(i+1)
+	ADDQ   R8, SI     // SI += incX
+	ADDQ   R9, DX     // DX += incY
+	ADDQ   R10, DI    // DI += incDst
+	LOOP   axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitary_amd64.s
new file mode 100644
index 00000000000..a6783255fd7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitary_amd64.s
@@ -0,0 +1,122 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVDDUP X2, X3
+#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
+// MOVDDUP X4, X5
+#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
+// MOVDDUP X6, X7
+#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
+// MOVDDUP X8, X9
+#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
+
+// ADDSUBPD X2, X3
+#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+// ADDSUBPD X4, X5
+#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+// ADDSUBPD X6, X7
+#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+// ADDSUBPD X8, X9
+#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyUnitary(alpha complex128, x, y []complex128)
+TEXT ·AxpyUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+16(FP), SI // SI = &x
+	MOVQ    y_base+40(FP), DI // DI = &y
+	MOVQ    x_len+24(FP), CX  // CX = min( len(x), len(y) )
+	CMPQ    y_len+48(FP), CX
+	CMOVQLE y_len+48(FP), CX
+	CMPQ    CX, $0            // if CX == 0 { return }
+	JE      caxy_end
+	PXOR    X0, X0            // Clear work registers and cache-align loop
+	PXOR    X1, X1
+	MOVUPS  alpha+0(FP), X0   // X0 = { imag(a), real(a) }
+	MOVAPS  X0, X1
+	SHUFPD  $0x1, X1, X1      // X1 = { real(a), imag(a) }
+	XORQ    AX, AX            // i = 0
+	MOVAPS  X0, X10           // Copy X0 and X1 for pipelining
+	MOVAPS  X1, X11
+	MOVQ    CX, BX
+	ANDQ    $3, CX            // CX = n % 4
+	SHRQ    $2, BX            // BX = floor( n / 4 )
+	JZ      caxy_tail         // if BX == 0 { goto caxy_tail }
+
+caxy_loop: // do {
+	MOVUPS (SI)(AX*8), X2   // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS 16(SI)(AX*8), X4
+	MOVUPS 32(SI)(AX*8), X6
+	MOVUPS 48(SI)(AX*8), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+	MULPD X1, X2
+	MULPD X0, X3
+	MULPD X11, X4
+	MULPD X10, X5
+	MULPD X1, X6
+	MULPD X0, X7
+	MULPD X11, X8
+	MULPD X10, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DI)(AX*8), X3
+	ADDPD  16(DI)(AX*8), X5
+	ADDPD  32(DI)(AX*8), X7
+	ADDPD  48(DI)(AX*8), X9
+	MOVUPS X3, (DI)(AX*8)   // y[i] = X_(i+1)
+	MOVUPS X5, 16(DI)(AX*8)
+	MOVUPS X7, 32(DI)(AX*8)
+	MOVUPS X9, 48(DI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	DECQ   BX
+	JNZ    caxy_loop        // } while --BX > 0
+	CMPQ   CX, $0           // if CX == 0 { return }
+	JE     caxy_end
+
+caxy_tail: // do {
+	MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3         // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2   // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  X1, X2         // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	MULPD  X0, X3         // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DI)(AX*8), X3
+	MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
+	ADDQ   $2, AX         // i += 2
+	LOOP   caxy_tail      // }  while --CX > 0
+
+caxy_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitaryto_amd64.s
new file mode 100644
index 00000000000..64add6886c7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitaryto_amd64.s
@@ -0,0 +1,123 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVDDUP X2, X3
+#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
+// MOVDDUP X4, X5
+#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
+// MOVDDUP X6, X7
+#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
+// MOVDDUP X8, X9
+#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
+
+// ADDSUBPD X2, X3
+#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+// ADDSUBPD X4, X5
+#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+// ADDSUBPD X6, X7
+#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+// ADDSUBPD X8, X9
+#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128)
+TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
+	MOVQ    dst_base+0(FP), DI // DI = &dst
+	MOVQ    x_base+40(FP), SI  // SI = &x
+	MOVQ    y_base+64(FP), DX  // DX = &y
+	MOVQ    x_len+48(FP), CX   // CX = min( len(x), len(y), len(dst) )
+	CMPQ    y_len+72(FP), CX
+	CMOVQLE y_len+72(FP), CX
+	CMPQ    dst_len+8(FP), CX
+	CMOVQLE dst_len+8(FP), CX
+	CMPQ    CX, $0             // if CX == 0 { return }
+	JE      caxy_end
+	MOVUPS  alpha+24(FP), X0   // X0 = { imag(a), real(a) }
+	MOVAPS  X0, X1
+	SHUFPD  $0x1, X1, X1       // X1 = { real(a), imag(a) }
+	XORQ    AX, AX             // i = 0
+	MOVAPS  X0, X10            // Copy X0 and X1 for pipelining
+	MOVAPS  X1, X11
+	MOVQ    CX, BX
+	ANDQ    $3, CX             // CX = n % 4
+	SHRQ    $2, BX             // BX = floor( n / 4 )
+	JZ      caxy_tail          // if BX == 0 { goto caxy_tail }
+
+caxy_loop: // do {
+	MOVUPS (SI)(AX*8), X2   // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS 16(SI)(AX*8), X4
+	MOVUPS 32(SI)(AX*8), X6
+	MOVUPS 48(SI)(AX*8), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3 // Duplicate real elements (xr, xr)
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2 // duplicate imag elements (xi, xi)
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+	MULPD X1, X2
+	MULPD X0, X3
+	MULPD X11, X4
+	MULPD X10, X5
+	MULPD X1, X6
+	MULPD X0, X7
+	MULPD X11, X8
+	MULPD X10, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX)(AX*8), X3
+	ADDPD  16(DX)(AX*8), X5
+	ADDPD  32(DX)(AX*8), X7
+	ADDPD  48(DX)(AX*8), X9
+	MOVUPS X3, (DI)(AX*8)   // dst[i] = X_(i+1)
+	MOVUPS X5, 16(DI)(AX*8)
+	MOVUPS X7, 32(DI)(AX*8)
+	MOVUPS X9, 48(DI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	DECQ   BX
+	JNZ    caxy_loop        // } while --BX > 0
+	CMPQ   CX, $0           // if CX == 0 { return }
+	JE     caxy_end
+
+caxy_tail: // Same calculation, but read in values to avoid trampling memory
+	MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3         // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2   // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  X1, X2         // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	MULPD  X0, X3         // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX)(AX*8), X3
+	MOVUPS X3, (DI)(AX*8) // dst[i] = X_(i+1)
+	ADDQ   $2, AX         // i += 2
+	LOOP   caxy_tail      // }  while --CX > 0
+
+caxy_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go
new file mode 100644
index 00000000000..8802ff138a3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package c128 provides complex128 vector primitives.
+package c128 // import "gonum.org/v1/gonum/internal/asm/c128"
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcinc_amd64.s
new file mode 100644
index 00000000000..235f67e7a2d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcinc_amd64.s
@@ -0,0 +1,153 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR__X3    LONG $0x1E120FF2 // MOVDDUP (SI), X3
+#define MOVDDUP_XPTR_INCX__X5    LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5
+#define MOVDDUP_XPTR_INCX_2__X7    LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7
+#define MOVDDUP_XPTR_INCx3X__X9    LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9
+
+#define MOVDDUP_8_XPTR__X2    LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2
+#define MOVDDUP_8_XPTR_INCX__X4    LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4
+#define MOVDDUP_8_XPTR_INCX_2__X6    LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6
+#define MOVDDUP_8_XPTR_INCx3X__X8    LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define INC_X R8
+#define INCx3_X R9
+#define INC_Y R10
+#define INCx3_Y R11
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
+TEXT ·DotcInc(SB), NOSPLIT, $0 // sum of conj(x[ix+i*incX]) * y[iy+i*incY] for i in [0, n)
+	MOVQ   x_base+0(FP), X_PTR       // X_PTR = &x
+	MOVQ   y_base+24(FP), Y_PTR      // Y_PTR = &y
+	MOVQ   n+48(FP), LEN             // LEN = n
+	PXOR   SUM, SUM                  // SUM = 0
+	CMPQ   LEN, $0                   // if LEN == 0 { return }
+	JE     dot_end
+	PXOR   P_SUM, P_SUM              // P_SUM = 0
+	MOVQ   ix+72(FP), INC_X          // INC_X = ix * sizeof(complex128)
+	SHLQ   $4, INC_X
+	MOVQ   iy+80(FP), INC_Y          // INC_Y = iy * sizeof(complex128)
+	SHLQ   $4, INC_Y
+	LEAQ   (X_PTR)(INC_X*1), X_PTR   // X_PTR = &(X_PTR[ix])
+	LEAQ   (Y_PTR)(INC_Y*1), Y_PTR   // Y_PTR = &(Y_PTR[iy])
+	MOVQ   incX+56(FP), INC_X        // INC_X = incX
+	SHLQ   $4, INC_X                 // INC_X *=  sizeof(complex128)
+	MOVQ   incY+64(FP), INC_Y        // INC_Y = incY
+	SHLQ   $4, INC_Y                 // INC_Y *=  sizeof(complex128)
+	MOVSD  $(-1.0), NEG1
+	SHUFPD $0, NEG1, NEG1            // NEG1 = { -1, -1 }, used to negate imag(x[i]) (conjugation)
+	MOVQ   LEN, TAIL
+	ANDQ   $3, TAIL                  // TAIL = n % 4
+	SHRQ   $2, LEN                   // LEN = floor( n / 4 )
+	JZ     dot_tail                  // if n < 4 { goto dot_tail }
+	MOVAPS NEG1, P_NEG1              // Copy NEG1 to P_NEG1 for pipelining
+	LEAQ   (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128)
+	LEAQ   (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128)
+
+dot_loop: // do { (four elements per iteration)
+	MOVDDUP_XPTR__X3        // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_XPTR_INCX__X5
+	MOVDDUP_XPTR_INCX_2__X7
+	MOVDDUP_XPTR_INCx3X__X9
+
+	MOVDDUP_8_XPTR__X2        // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_8_XPTR_INCX__X4
+	MOVDDUP_8_XPTR_INCX_2__X6
+	MOVDDUP_8_XPTR_INCx3X__X8
+
+	// X_i = { -imag(x[i]), -imag(x[i]) }
+	MULPD NEG1, X2
+	MULPD P_NEG1, X4
+	MULPD NEG1, X6
+	MULPD P_NEG1, X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR), X10
+	MOVUPS (Y_PTR)(INC_Y*1), X11
+	MOVUPS (Y_PTR)(INC_Y*2), X12
+	MOVUPS (Y_PTR)(INCx3_Y*1), X13
+
+	// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i])  }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j     = { real(y[i]), imag(y[i]) }
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i     = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i])  }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// sum / psum += result[i] (two accumulators, alternated)
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
+
+	DECQ  LEN
+	JNZ   dot_loop   // } while --LEN > 0
+	ADDPD P_SUM, SUM // sum += psum
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do { (one element per iteration)
+	MOVDDUP_XPTR__X3      // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_8_XPTR__X2    // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  NEG1, X2       // X_i     = { -imag(x[i])             , -imag(x[i])              }
+	MOVUPS (Y_PTR), X10   // X_j     = {  imag(y[i])             ,  real(y[i])              }
+	MULPD  X10, X3        // X_(i+1) = {  imag(y[i]) * real(x[i]),  real(y[i]) * real(x[i]) }
+	SHUFPD $0x1, X10, X10 // X_j     = {  real(y[i])             ,  imag(y[i])              }
+	MULPD  X10, X2        // X_i     = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDPD X3, SUM      // sum += result[i]
+	ADDQ  INC_X, X_PTR // X_PTR += incX
+	ADDQ  INC_Y, Y_PTR // Y_PTR += incY
+	DECQ  TAIL
+	JNZ   dot_tail     // }  while --TAIL > 0
+
+dot_end:
+	MOVUPS SUM, sum+88(FP)
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcunitary_amd64.s
new file mode 100644
index 00000000000..0ffd0f12897
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcunitary_amd64.s
@@ -0,0 +1,143 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR_IDX_8__X3    LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
+#define MOVDDUP_16_XPTR_IDX_8__X5    LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
+#define MOVDDUP_32_XPTR_IDX_8__X7    LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
+#define MOVDDUP_48_XPTR_IDX_8__X9    LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
+
+#define MOVDDUP_XPTR_IIDX_8__X2    LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
+#define MOVDDUP_16_XPTR_IIDX_8__X4    LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
+#define MOVDDUP_32_XPTR_IIDX_8__X6    LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
+#define MOVDDUP_48_XPTR_IIDX_8__X8    LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define IDX AX
+#define I_IDX DX
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotcUnitary(x, y []complex128) (sum complex128)
+TEXT ·DotcUnitary(SB), NOSPLIT, $0 // sum of conj(x[i]) * y[i] for i in [0, min(len(x), len(y)))
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	PXOR    SUM, SUM             // sum = 0
+	CMPQ    LEN, $0              // if LEN == 0 { return }
+	JE      dot_end
+	XORPS   P_SUM, P_SUM         // psum = 0
+	MOVSD   $(-1.0), NEG1
+	SHUFPD  $0, NEG1, NEG1       // NEG1 = { -1, -1 }, used to negate imag(x[i]) (conjugation)
+	XORQ    IDX, IDX             // i := 0 (IDX counts float64 words: real part of x[0])
+	MOVQ    $1, I_IDX            // j := 1 (I_IDX = IDX + 1: imag part of x[0])
+	MOVQ    LEN, TAIL
+	ANDQ    $3, TAIL             // TAIL = LEN % 4
+	SHRQ    $2, LEN              // LEN = floor( LEN / 4 )
+	JZ      dot_tail             // if LEN == 0 { goto dot_tail }
+
+	MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining
+
+dot_loop: // do { (four elements per iteration)
+	MOVDDUP_XPTR_IDX_8__X3    // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_16_XPTR_IDX_8__X5
+	MOVDDUP_32_XPTR_IDX_8__X7
+	MOVDDUP_48_XPTR_IDX_8__X9
+
+	MOVDDUP_XPTR_IIDX_8__X2    // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_16_XPTR_IIDX_8__X4
+	MOVDDUP_32_XPTR_IIDX_8__X6
+	MOVDDUP_48_XPTR_IIDX_8__X8
+
+	// X_i = { -imag(x[i]), -imag(x[i]) }
+	MULPD NEG1, X2
+	MULPD P_NEG1, X4
+	MULPD NEG1, X6
+	MULPD P_NEG1, X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i])  }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) }
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i     = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i])  }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// sum / psum += result[i] (two accumulators, alternated)
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	ADDQ  $8, IDX    // IDX += 8 (four complex128 = eight float64)
+	ADDQ  $8, I_IDX  // I_IDX += 8
+	DECQ  LEN
+	JNZ   dot_loop   // } while --LEN > 0
+	ADDPD P_SUM, SUM // sum += psum
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do { (one element per iteration)
+	MOVDDUP_XPTR_IDX_8__X3     // X_(i+1) = {  real(x[i])             ,  real(x[i])              }
+	MOVDDUP_XPTR_IIDX_8__X2    // X_i     = {  imag(x[i])             ,  imag(x[i])              }
+	MULPD  NEG1, X2            // X_i     = { -imag(x[i])             , -imag(x[i])              }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j     = {  imag(y[i])             ,  real(y[i])              }
+	MULPD  X10, X3             // X_(i+1) = {  imag(y[i]) * real(x[i]),  real(y[i]) * real(x[i]) }
+	SHUFPD $0x1, X10, X10      // X_j     = {  real(y[i])             ,  imag(y[i])              }
+	MULPD  X10, X2             // X_i     = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDPD X3, SUM   // SUM += result[i]
+	ADDQ  $2, IDX   // IDX += 2
+	ADDQ  $2, I_IDX // I_IDX += 2
+	DECQ  TAIL
+	JNZ   dot_tail  // }  while --TAIL > 0
+
+dot_end:
+	MOVUPS SUM, sum+48(FP)
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuinc_amd64.s
new file mode 100644
index 00000000000..74fe5c3ba56
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuinc_amd64.s
@@ -0,0 +1,141 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR__X3    LONG $0x1E120FF2 // MOVDDUP (SI), X3
+#define MOVDDUP_XPTR_INCX__X5    LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5
+#define MOVDDUP_XPTR_INCX_2__X7    LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7
+#define MOVDDUP_XPTR_INCx3X__X9    LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9
+
+#define MOVDDUP_8_XPTR__X2    LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2
+#define MOVDDUP_8_XPTR_INCX__X4    LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4
+#define MOVDDUP_8_XPTR_INCX_2__X6    LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6
+#define MOVDDUP_8_XPTR_INCx3X__X8    LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define INC_X R8
+#define INCx3_X R9
+#define INC_Y R10
+#define INCx3_Y R11
+
+// func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
+TEXT ·DotuInc(SB), NOSPLIT, $0 // sum of x[ix+i*incX] * y[iy+i*incY] for i in [0, n), no conjugation
+	MOVQ x_base+0(FP), X_PTR       // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR      // Y_PTR = &y
+	MOVQ n+48(FP), LEN             // LEN = n
+	PXOR SUM, SUM                  // sum = 0
+	CMPQ LEN, $0                   // if LEN == 0 { return }
+	JE   dot_end
+	MOVQ ix+72(FP), INC_X          // INC_X = ix * sizeof(complex128)
+	SHLQ $4, INC_X
+	MOVQ iy+80(FP), INC_Y          // INC_Y = iy * sizeof(complex128)
+	SHLQ $4, INC_Y
+	LEAQ (X_PTR)(INC_X*1), X_PTR   // X_PTR = &(X_PTR[ix])
+	LEAQ (Y_PTR)(INC_Y*1), Y_PTR   // Y_PTR = &(Y_PTR[iy])
+	MOVQ incX+56(FP), INC_X        // INC_X = incX
+	SHLQ $4, INC_X                 // INC_X *=  sizeof(complex128)
+	MOVQ incY+64(FP), INC_Y        // INC_Y = incY
+	SHLQ $4, INC_Y                 // INC_Y *=  sizeof(complex128)
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL                  // TAIL = n % 4
+	SHRQ $2, LEN                   // LEN = floor( n / 4 )
+	JZ   dot_tail                  // if n < 4 { goto dot_tail }
+	PXOR P_SUM, P_SUM              // psum = 0
+	LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128)
+	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128)
+
+dot_loop: // do { (four elements per iteration)
+	MOVDDUP_XPTR__X3        // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_XPTR_INCX__X5
+	MOVDDUP_XPTR_INCX_2__X7
+	MOVDDUP_XPTR_INCx3X__X9
+
+	MOVDDUP_8_XPTR__X2        // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_8_XPTR_INCX__X4
+	MOVDDUP_8_XPTR_INCX_2__X6
+	MOVDDUP_8_XPTR_INCx3X__X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR), X10
+	MOVUPS (Y_PTR)(INC_Y*1), X11
+	MOVUPS (Y_PTR)(INC_Y*2), X12
+	MOVUPS (Y_PTR)(INCx3_Y*1), X13
+
+	// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i])  }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j     = { real(y[i]), imag(y[i]) }
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i     = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i])  }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// sum / psum += result[i] (two accumulators, alternated)
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
+
+	DECQ  LEN
+	JNZ   dot_loop   // } while --LEN > 0
+	ADDPD P_SUM, SUM // sum += psum
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do { (one element per iteration)
+	MOVDDUP_XPTR__X3      // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_8_XPTR__X2    // X_i = { imag(x[i]), imag(x[i]) }
+	MOVUPS (Y_PTR), X10   // X_j     = {  imag(y[i])             ,  real(y[i])              }
+	MULPD  X10, X3        // X_(i+1) = {  imag(y[i]) * real(x[i]),  real(y[i]) * real(x[i]) }
+	SHUFPD $0x1, X10, X10 // X_j     = {  real(y[i])             ,  imag(y[i])              }
+	MULPD  X10, X2        // X_i     = {  real(y[i]) * imag(x[i]),  imag(y[i]) * imag(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDPD X3, SUM      // sum += result[i]
+	ADDQ  INC_X, X_PTR // X_PTR += incX
+	ADDQ  INC_Y, Y_PTR // Y_PTR += incY
+	DECQ  TAIL         // --TAIL
+	JNZ   dot_tail     // }  while TAIL > 0
+
+dot_end:
+	MOVUPS SUM, sum+88(FP)
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuunitary_amd64.s
new file mode 100644
index 00000000000..8df019881bb
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuunitary_amd64.s
@@ -0,0 +1,130 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR_IDX_8__X3    LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
+#define MOVDDUP_16_XPTR_IDX_8__X5    LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
+#define MOVDDUP_32_XPTR_IDX_8__X7    LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
+#define MOVDDUP_48_XPTR_IDX_8__X9    LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
+
+#define MOVDDUP_XPTR_IIDX_8__X2    LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
+#define MOVDDUP_16_XPTR_IIDX_8__X4    LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
+#define MOVDDUP_32_XPTR_IIDX_8__X6    LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
+#define MOVDDUP_48_XPTR_IIDX_8__X8    LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define IDX AX
+#define I_IDX DX
+
+// func DotuUnitary(x, y []complex128) (sum complex128)
+TEXT ·DotuUnitary(SB), NOSPLIT, $0 // sum of x[i] * y[i] for i in [0, min(len(x), len(y))), no conjugation
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	PXOR    SUM, SUM             // SUM = 0
+	CMPQ    LEN, $0              // if LEN == 0 { return }
+	JE      dot_end
+	PXOR    P_SUM, P_SUM         // P_SUM = 0
+	XORQ    IDX, IDX             // IDX = 0 (counts float64 words: real part of x[0])
+	MOVQ    $1, DX               // I_IDX = 1 (DX is I_IDX: imag part of x[0])
+	MOVQ    LEN, TAIL
+	ANDQ    $3, TAIL             // TAIL = LEN % 4
+	SHRQ    $2, LEN              // LEN = floor( LEN / 4 )
+	JZ      dot_tail             // if LEN == 0 { goto dot_tail }
+
+dot_loop: // do { (four elements per iteration)
+	MOVDDUP_XPTR_IDX_8__X3    // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_16_XPTR_IDX_8__X5
+	MOVDDUP_32_XPTR_IDX_8__X7
+	MOVDDUP_48_XPTR_IDX_8__X9
+
+	MOVDDUP_XPTR_IIDX_8__X2    // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_16_XPTR_IIDX_8__X4
+	MOVDDUP_32_XPTR_IIDX_8__X6
+	MOVDDUP_48_XPTR_IIDX_8__X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i])  }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) }
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i     = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i])  }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// sum / psum += result[i] (two accumulators, alternated)
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	ADDQ  $8, IDX    // IDX += 8 (four complex128 = eight float64)
+	ADDQ  $8, I_IDX  // I_IDX += 8
+	DECQ  LEN
+	JNZ   dot_loop   // } while --LEN > 0
+	ADDPD P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do { (one element per iteration)
+	MOVDDUP_XPTR_IDX_8__X3     // X_(i+1) = {  real(x[i])             ,  real(x[i])              }
+	MOVDDUP_XPTR_IIDX_8__X2    // X_i     = {  imag(x[i])             ,  imag(x[i])              }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j     = {  imag(y[i])             ,  real(y[i])              }
+	MULPD  X10, X3             // X_(i+1) = {  imag(y[i]) * real(x[i]),  real(y[i]) * real(x[i]) }
+	SHUFPD $0x1, X10, X10      // X_j     = {  real(y[i])             ,  imag(y[i])              }
+	MULPD  X10, X2             // X_i     = {  real(y[i]) * imag(x[i]),  imag(y[i]) * imag(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDPD X3, SUM   // SUM += result[i]
+	ADDQ  $2, IDX   // IDX += 2
+	ADDQ  $2, I_IDX // I_IDX += 2
+	DECQ  TAIL      // --TAIL
+	JNZ   dot_tail  // }  while TAIL > 0
+
+dot_end:
+	MOVUPS SUM, sum+48(FP)
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalinc_amd64.s
new file mode 100644
index 00000000000..77a28ccead7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalinc_amd64.s
@@ -0,0 +1,69 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define TAIL BX
+#define INC R9
+#define INC3 R10
+#define ALPHA X0
+#define ALPHA_2 X1
+
+#define MOVDDUP_ALPHA    LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
+
+// func DscalInc(alpha float64, x []complex128, n, inc uintptr)
+TEXT ·DscalInc(SB), NOSPLIT, $0 // x[i*inc] *= alpha (real scalar, both lanes) for i in [0, n), in place
+	MOVQ x_base+8(FP), SRC // SRC = &x
+	MOVQ n+32(FP), LEN     // LEN = n
+	CMPQ LEN, $0           // if LEN == 0 { return }
+	JE   dscal_end
+
+	MOVDDUP_ALPHA             // ALPHA = { alpha, alpha } (macro loads 8(SP))
+	MOVQ   inc+40(FP), INC    // INC = inc
+	SHLQ   $4, INC            // INC = INC * sizeof(complex128)
+	LEAQ   (INC)(INC*2), INC3 // INC3 = 3 * INC
+	MOVUPS ALPHA, ALPHA_2     // Copy ALPHA to ALPHA_2 for pipelining
+	MOVQ   LEN, TAIL          // TAIL = LEN
+	SHRQ   $2, LEN            // LEN = floor( n / 4 )
+	JZ     dscal_tail         // if LEN == 0 { goto dscal_tail }
+
+dscal_loop: // do { (four elements per iteration)
+	MOVUPS (SRC), X2         // X_i = x[i]
+	MOVUPS (SRC)(INC*1), X3
+	MOVUPS (SRC)(INC*2), X4
+	MOVUPS (SRC)(INC3*1), X5
+
+	MULPD ALPHA, X2   // X_i *= ALPHA
+	MULPD ALPHA_2, X3
+	MULPD ALPHA, X4
+	MULPD ALPHA_2, X5
+
+	MOVUPS X2, (DST)         // x[i] = X_i (DST and SRC are the same register)
+	MOVUPS X3, (DST)(INC*1)
+	MOVUPS X4, (DST)(INC*2)
+	MOVUPS X5, (DST)(INC3*1)
+
+	LEAQ (SRC)(INC*4), SRC // SRC += INC*4
+	DECQ LEN
+	JNZ  dscal_loop        // } while --LEN > 0
+
+dscal_tail:
+	ANDQ $3, TAIL  // TAIL = n % 4
+	JE   dscal_end // if TAIL == 0 { return }
+
+dscal_tail_loop: // do { (one element per iteration)
+	MOVUPS (SRC), X2       // X_i = x[i]
+	MULPD  ALPHA, X2       // X_i *= ALPHA
+	MOVUPS X2, (DST)       // x[i] = X_i
+	ADDQ   INC, SRC        // SRC += INC
+	DECQ   TAIL
+	JNZ    dscal_tail_loop // } while --TAIL > 0
+
+dscal_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalunitary_amd64.s
new file mode 100644
index 00000000000..9fa91e46241
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalunitary_amd64.s
@@ -0,0 +1,66 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define IDX AX
+#define TAIL BX
+#define ALPHA X0
+#define ALPHA_2 X1
+
+#define MOVDDUP_ALPHA    LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
+
+// func DscalUnitary(alpha float64, x []complex128)
+TEXT ·DscalUnitary(SB), NOSPLIT, $0 // x[i] *= alpha (real scalar, both lanes) for every i, in place
+	MOVQ x_base+8(FP), SRC // SRC = &x
+	MOVQ x_len+16(FP), LEN // LEN = len(x)
+	CMPQ LEN, $0           // if LEN == 0 { return }
+	JE   dscal_end
+
+	MOVDDUP_ALPHA         // ALPHA = { alpha, alpha } (macro loads 8(SP))
+	XORQ   IDX, IDX       // IDX = 0 (counts float64 words, two per element)
+	MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2 for pipelining
+	MOVQ   LEN, TAIL      // TAIL = LEN
+	SHRQ   $2, LEN        // LEN = floor( len(x) / 4 )
+	JZ     dscal_tail     // if LEN == 0 { goto dscal_tail }
+
+dscal_loop: // do { (four elements per iteration)
+	MOVUPS (SRC)(IDX*8), X2   // X_i = x[i]
+	MOVUPS 16(SRC)(IDX*8), X3
+	MOVUPS 32(SRC)(IDX*8), X4
+	MOVUPS 48(SRC)(IDX*8), X5
+
+	MULPD ALPHA, X2   // X_i *= ALPHA
+	MULPD ALPHA_2, X3
+	MULPD ALPHA, X4
+	MULPD ALPHA_2, X5
+
+	MOVUPS X2, (DST)(IDX*8)   // x[i] = X_i (DST and SRC are the same register)
+	MOVUPS X3, 16(DST)(IDX*8)
+	MOVUPS X4, 32(DST)(IDX*8)
+	MOVUPS X5, 48(DST)(IDX*8)
+
+	ADDQ $8, IDX    // IDX += 8 (four complex128 = eight float64)
+	DECQ LEN
+	JNZ  dscal_loop // } while --LEN > 0
+
+dscal_tail:
+	ANDQ $3, TAIL  // TAIL = len(x) % 4
+	JZ   dscal_end // if TAIL == 0 { return }
+
+dscal_tail_loop: // do { (one element per iteration)
+	MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
+	MULPD  ALPHA, X2        // X_i *= ALPHA
+	MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
+	ADDQ   $2, IDX          // IDX += 2
+	DECQ   TAIL
+	JNZ    dscal_tail_loop  // } while --TAIL > 0
+
+dscal_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go
new file mode 100644
index 00000000000..27c35817523
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go
@@ -0,0 +1,33 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c128
+
+// ScalUnitaryTo scales each element of x by alpha and writes the result to dst:
+//
+//	for i, v := range x {
+//		dst[i] = alpha * v
+//	}
+func ScalUnitaryTo(dst []complex128, alpha complex128, x []complex128) {
+	for i, v := range x {
+		dst[i] = alpha * v // dst is indexed by x's index; x itself is not modified here
+	}
+}
+
+// ScalIncTo writes alpha * x[i*incX] to dst[i*incDst] for i in [0, n):
+//
+//	var idst, ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha * x[ix]
+//		ix += incX
+//		idst += incDst
+//	}
+func ScalIncTo(dst []complex128, incDst uintptr, alpha complex128, x []complex128, n, incX uintptr) {
+	var idst, ix uintptr // running element indices into dst and x
+	for i := 0; i < int(n); i++ {
+		dst[idst] = alpha * x[ix]
+		ix += incX // strides are applied after each store, so both indices start at 0
+		idst += incDst
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/scalUnitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalUnitary_amd64.s
new file mode 100644
index 00000000000..b76037fdd02
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalUnitary_amd64.s
@@ -0,0 +1,116 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define IDX AX
+#define TAIL BX
+#define ALPHA X0
+#define ALPHA_C X1
+#define ALPHA2 X10
+#define ALPHA_C2 X11
+
+#define MOVDDUP_X2_X3    LONG $0xDA120FF2 // MOVDDUP X2, X3
+#define MOVDDUP_X4_X5    LONG $0xEC120FF2 // MOVDDUP X4, X5
+#define MOVDDUP_X6_X7    LONG $0xFE120FF2 // MOVDDUP X6, X7
+#define MOVDDUP_X8_X9    LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+// func ScalUnitary(alpha complex128, x []complex128)
+TEXT ·ScalUnitary(SB), NOSPLIT, $0 // x[i] *= alpha (complex multiply) for every i, in place
+	MOVQ x_base+16(FP), SRC // SRC = &x
+	MOVQ x_len+24(FP), LEN  // LEN = len(x)
+	CMPQ LEN, $0            // if LEN == 0 { return }
+	JE   scal_end
+
+	MOVUPS alpha+0(FP), ALPHA     // ALPHA = { imag(alpha), real(alpha) }
+	MOVAPS ALPHA, ALPHA_C
+	SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
+
+	XORQ   IDX, IDX          // IDX = 0 (counts float64 words, two per element)
+	MOVAPS ALPHA, ALPHA2     // Copy ALPHA and ALPHA_C for pipelining
+	MOVAPS ALPHA_C, ALPHA_C2
+	MOVQ   LEN, TAIL
+	SHRQ   $2, LEN           // LEN = floor( len(x) / 4 )
+	JZ     scal_tail         // if LEN == 0 { goto scal_tail }
+
+scal_loop: // do { (four elements per iteration)
+	MOVUPS (SRC)(IDX*8), X2   // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS 16(SRC)(IDX*8), X4
+	MOVUPS 32(SRC)(IDX*8), X6
+	MOVUPS 48(SRC)(IDX*8), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i])  }
+	// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i])  }
+	MULPD ALPHA_C, X2
+	MULPD ALPHA, X3
+	MULPD ALPHA_C2, X4
+	MULPD ALPHA2, X5
+	MULPD ALPHA_C, X6
+	MULPD ALPHA, X7
+	MULPD ALPHA_C2, X8
+	MULPD ALPHA2, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]):  real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	MOVUPS X3, (DST)(IDX*8)   // x[i] = X_(i+1) (DST and SRC are the same register)
+	MOVUPS X5, 16(DST)(IDX*8)
+	MOVUPS X7, 32(DST)(IDX*8)
+	MOVUPS X9, 48(DST)(IDX*8)
+	ADDQ   $8, IDX            // IDX += 8 (four complex128 = eight float64)
+	DECQ   LEN
+	JNZ    scal_loop          // } while --LEN > 0
+
+scal_tail:
+	ANDQ $3, TAIL // TAIL = len(x) % 4
+	JZ   scal_end // if TAIL == 0 { return }
+
+scal_tail_loop: // do { (one element per iteration)
+	MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3           // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2     // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  ALPHA_C, X2      // X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i])  }
+	MULPD  ALPHA, X3        // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]):  real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
+	ADDQ   $2, IDX          // IDX += 2
+	DECQ   TAIL
+	JNZ    scal_tail_loop   // }  while --TAIL > 0
+
+scal_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/scalinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalinc_amd64.s
new file mode 100644
index 00000000000..6e0e51b6581
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalinc_amd64.s
@@ -0,0 +1,121 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define TAIL BX
+#define INC R9
+#define INC3 R10
+#define ALPHA X0
+#define ALPHA_C X1
+#define ALPHA2 X10
+#define ALPHA_C2 X11
+
+#define MOVDDUP_X2_X3    LONG $0xDA120FF2 // MOVDDUP X2, X3
+#define MOVDDUP_X4_X5    LONG $0xEC120FF2 // MOVDDUP X4, X5
+#define MOVDDUP_X6_X7    LONG $0xFE120FF2 // MOVDDUP X6, X7
+#define MOVDDUP_X8_X9    LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+// func ScalInc(alpha complex128, x []complex128, n, inc uintptr): multiplies n elements of x by alpha in place, stepping inc elements between them.
+TEXT ·ScalInc(SB), NOSPLIT, $0
+	MOVQ x_base+16(FP), SRC // SRC = &x[0]; DST names the same register, so results overwrite the inputs
+	MOVQ n+40(FP), LEN      // LEN = n
+	CMPQ LEN, $0
+	JE   scal_end           // if LEN == 0 { return }
+
+	MOVQ inc+48(FP), INC    // INC = inc
+	SHLQ $4, INC            // INC = INC * sizeof(complex128)
+	LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC
+
+	MOVUPS alpha+0(FP), ALPHA     // ALPHA = { imag(alpha), real(alpha) }
+	MOVAPS ALPHA, ALPHA_C
+	SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
+
+	MOVAPS ALPHA, ALPHA2     // Copy ALPHA and ALPHA_C for pipelining
+	MOVAPS ALPHA_C, ALPHA_C2
+	MOVQ   LEN, TAIL
+	SHRQ   $2, LEN           // LEN = floor( n / 4 )
+	JZ     scal_tail         // if LEN == 0 { goto scal_tail }
+
+scal_loop: // do { (four strided elements per iteration)
+	MOVUPS (SRC), X2         // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS (SRC)(INC*1), X4
+	MOVUPS (SRC)(INC*2), X6
+	MOVUPS (SRC)(INC3*1), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i])  }
+	// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i])  }
+	MULPD ALPHA_C, X2
+	MULPD ALPHA, X3
+	MULPD ALPHA_C2, X4
+	MULPD ALPHA2, X5
+	MULPD ALPHA_C, X6
+	MULPD ALPHA, X7
+	MULPD ALPHA_C2, X8
+	MULPD ALPHA2, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]):  real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	MOVUPS X3, (DST)         // x[i] = X_(i+1)
+	MOVUPS X5, (DST)(INC*1)
+	MOVUPS X7, (DST)(INC*2)
+	MOVUPS X9, (DST)(INC3*1)
+
+	LEAQ (SRC)(INC*4), SRC // SRC = &(SRC[inc*4])
+	DECQ LEN
+	JNZ  scal_loop         // } while --LEN > 0
+
+scal_tail:
+	ANDQ $3, TAIL // TAIL = TAIL % 4
+	JE   scal_end // if TAIL == 0 { return }
+
+scal_tail_loop: // do { (one element per iteration)
+	MOVUPS (SRC), X2    // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  ALPHA_C, X2  // X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i])  }
+	MULPD  ALPHA, X3    // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]):  real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	MOVUPS X3, (DST)      // x[i] = X_(i+1)
+	ADDQ   INC, SRC       // SRC = &(SRC[inc])
+	DECQ   TAIL
+	JNZ    scal_tail_loop // } while --TAIL > 0
+
+scal_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go
new file mode 100644
index 00000000000..9c3a8fb83dc
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go
@@ -0,0 +1,180 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c128
+
+import (
+	"math"
+	"math/cmplx"
+)
+
+// Add accumulates s into dst element by element: for every index i of s,
+// dst[i] becomes dst[i] + s[i]. Indexing panics if dst is shorter than s.
+func Add(dst, s []complex128) {
+	for i := 0; i < len(s); i++ {
+		dst[i] = dst[i] + s[i]
+	}
+}
+
+// AddConst adds the scalar alpha to every element of x, modifying x in place.
+func AddConst(alpha complex128, x []complex128) {
+	for i := 0; i < len(x); i++ {
+		x[i] = x[i] + alpha
+	}
+}
+
+// CumSum stores the running sums of s in dst and returns dst: dst[0] is s[0]
+// and each later dst[i] is dst[i-1] + s[i]. An empty s leaves dst untouched.
+func CumSum(dst, s []complex128) []complex128 {
+	if len(s) != 0 {
+		dst[0] = s[0]
+		for i := 1; i < len(s); i++ {
+			dst[i] = dst[i-1] + s[i]
+		}
+	}
+	return dst
+}
+
+// CumProd stores the running products of s in dst and returns dst: dst[0] is
+// s[0] and each later dst[i] is dst[i-1] * s[i]. An empty s leaves dst
+// untouched.
+func CumProd(dst, s []complex128) []complex128 {
+	if len(s) != 0 {
+		dst[0] = s[0]
+		for i := 1; i < len(s); i++ {
+			dst[i] = dst[i-1] * s[i]
+		}
+	}
+	return dst
+}
+
+// Div divides dst by s element by element, in place: for every index i of s,
+// dst[i] becomes dst[i] / s[i].
+func Div(dst, s []complex128) {
+	for i := range s {
+		dst[i] = dst[i] / s[i]
+	}
+}
+
+// DivTo writes the element-wise quotient s[i] / t[i] into dst[i] for every
+// index i of s and returns dst.
+func DivTo(dst, s, t []complex128) []complex128 {
+	for i := 0; i < len(s); i++ {
+		dst[i] = s[i] / t[i]
+	}
+	return dst
+}
+
+// DotUnitary returns the conjugated dot product of x and y, that is the sum
+// over every index i of x of cmplx.Conj(x[i]) * y[i].
+func DotUnitary(x, y []complex128) (sum complex128) {
+	for i := range x {
+		sum += cmplx.Conj(x[i]) * y[i]
+	}
+	return sum
+}
+
+// L2DistanceUnitary returns the L2-norm of the element-wise difference x-y.
+// The sum of squares is kept relative to the largest magnitude seen so far.
+// A NaN magnitude returns NaN immediately; otherwise an infinite largest
+// magnitude returns +Inf.
+func L2DistanceUnitary(x, y []complex128) (norm float64) {
+	var (
+		scale      float64
+		sumSquares = 1.0
+	)
+	for i := range x {
+		diff := x[i] - y[i]
+		if diff == 0 {
+			continue
+		}
+		mag := cmplx.Abs(diff)
+		if math.IsNaN(mag) {
+			return math.NaN()
+		}
+		if scale < mag {
+			// New largest magnitude: rescale the accumulated sum to it.
+			ratio := scale / mag
+			sumSquares = 1 + sumSquares*ratio*ratio
+			scale = mag
+			continue
+		}
+		ratio := mag / scale
+		sumSquares += ratio * ratio
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(sumSquares)
+}
+
+// L2NormUnitary returns the L2-norm of x. The sum of squares is kept relative
+// to the largest magnitude seen so far. A NaN magnitude returns NaN
+// immediately; otherwise an infinite largest magnitude returns +Inf.
+func L2NormUnitary(x []complex128) (norm float64) {
+	var (
+		scale      float64
+		sumSquares = 1.0
+	)
+	for i := range x {
+		if x[i] == 0 {
+			continue
+		}
+		mag := cmplx.Abs(x[i])
+		if math.IsNaN(mag) {
+			return math.NaN()
+		}
+		if scale < mag {
+			// New largest magnitude: rescale the accumulated sum to it.
+			ratio := scale / mag
+			sumSquares = 1 + sumSquares*ratio*ratio
+			scale = mag
+			continue
+		}
+		ratio := mag / scale
+		sumSquares += ratio * ratio
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(sumSquares)
+}
+
+// Sum returns the sum of all elements of x, accumulated from the first
+// element to the last. An empty x yields zero.
+func Sum(x []complex128) complex128 {
+	var total complex128
+	for i := range x {
+		total += x[i]
+	}
+	return total
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_amd64.go
new file mode 100644
index 00000000000..c0e26a2f1e9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_amd64.go
@@ -0,0 +1,109 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package c128
+
+// AxpyUnitary adds alpha*x[i] to y[i] for each element of x; its reference semantics are
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha complex128, x, y []complex128)
+
+// AxpyUnitaryTo stores alpha*x[i] + y[i] in dst[i] for each element of x; its reference semantics are
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128)
+
+// AxpyInc is the strided form of AxpyUnitary, starting at x[ix] and y[iy]; its reference semantics are
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
+
+// AxpyIncTo is the strided form of AxpyUnitaryTo, with separate strides for dst, x and y; its reference semantics are
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
+
+// DscalUnitary scales every element of x in place by the real factor alpha; its reference semantics are
+//
+//	for i, v := range x {
+//		x[i] = complex(real(v)*alpha, imag(v)*alpha)
+//	}
+func DscalUnitary(alpha float64, x []complex128)
+
+// DscalInc scales n elements of x, inc apart starting at x[0], by the real factor alpha; its reference semantics are
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+//		ix += inc
+//	}
+func DscalInc(alpha float64, x []complex128, n, inc uintptr)
+
+// ScalInc multiplies n elements of x, inc apart starting at x[0], by alpha in place; its reference semantics are
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += inc
+//	}
+func ScalInc(alpha complex128, x []complex128, n, inc uintptr)
+
+// ScalUnitary multiplies every element of x by alpha in place; its reference semantics are
+//
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha complex128, x []complex128)
+
+// DotcUnitary returns the dot product of y with the complex conjugate of x; its reference semantics are
+//
+//	for i, v := range x {
+//		sum += y[i] * cmplx.Conj(v)
+//	}
+//	return sum
+func DotcUnitary(x, y []complex128) (sum complex128)
+
+// DotcInc is the strided form of DotcUnitary, starting at x[ix] and y[iy]; its reference semantics are
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * cmplx.Conj(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
+
+// DotuUnitary returns the unconjugated dot product of x and y; its reference semantics are
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotuUnitary(x, y []complex128) (sum complex128)
+
+// DotuInc is the strided form of DotuUnitary, starting at x[ix] and y[iy]; its reference semantics are
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_noasm.go
new file mode 100644
index 00000000000..21dfc4a8e12
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_noasm.go
@@ -0,0 +1,176 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package c128
+
+import "math/cmplx"
+
+// AxpyUnitary adds alpha*x[i] to y[i] for every index i of x, updating y in
+// place.
+func AxpyUnitary(alpha complex128, x, y []complex128) {
+	for i := 0; i < len(x); i++ {
+		y[i] += alpha * x[i]
+	}
+}
+
+// AxpyUnitaryTo stores alpha*x[i] + y[i] in dst[i] for every index i of x.
+func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) {
+	for i := 0; i < len(x); i++ {
+		dst[i] = alpha*x[i] + y[i]
+	}
+}
+
+// AxpyInc performs n strided updates y[iy] += alpha * x[ix], starting at the
+// given ix and iy and advancing them by incX and incY after each update.
+func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) {
+	for remaining := int(n); remaining > 0; remaining-- {
+		y[iy] += alpha * x[ix]
+		ix, iy = ix+incX, iy+incY
+	}
+}
+
+// AxpyIncTo performs n strided stores dst[idst] = alpha*x[ix] + y[iy],
+// starting at the given idst, ix and iy and advancing them by incDst, incX
+// and incY after each store.
+func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) {
+	for remaining := int(n); remaining > 0; remaining-- {
+		dst[idst] = alpha*x[ix] + y[iy]
+		ix, iy, idst = ix+incX, iy+incY, idst+incDst
+	}
+}
+
+// DscalUnitary scales every element of x in place by the real factor alpha,
+// multiplying the real and imaginary parts separately.
+func DscalUnitary(alpha float64, x []complex128) {
+	for i := range x {
+		re, im := real(x[i]), imag(x[i])
+		x[i] = complex(re*alpha, im*alpha)
+	}
+}
+
+// DscalInc is
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+//		ix += inc
+//	}
+func DscalInc(alpha float64, x []complex128, n, inc uintptr) {
+	var ix uintptr
+	for i := 0; i < int(n); i++ {
+		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+		ix += inc
+	}
+}
+
+// ScalInc is
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += incX
+//	}
+func ScalInc(alpha complex128, x []complex128, n, inc uintptr) {
+	var ix uintptr
+	for i := 0; i < int(n); i++ {
+		x[ix] *= alpha
+		ix += inc
+	}
+}
+
+// ScalUnitary multiplies every element of x by alpha in place.
+func ScalUnitary(alpha complex128, x []complex128) {
+	for i := 0; i < len(x); i++ {
+		x[i] *= alpha
+	}
+}
+
+// DotcUnitary returns the dot product of y with the complex conjugate of x,
+// that is the sum over every index i of x of y[i] * cmplx.Conj(x[i]).
+func DotcUnitary(x, y []complex128) (sum complex128) {
+	for i := range x {
+		sum += y[i] * cmplx.Conj(x[i])
+	}
+	return sum
+}
+
+// DotcInc returns the sum of n strided products y[iy] * cmplx.Conj(x[ix]),
+// starting at the given ix and iy and advancing them by incX and incY after
+// each term.
+func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) {
+	for remaining := int(n); remaining > 0; remaining-- {
+		sum += y[iy] * cmplx.Conj(x[ix])
+		ix, iy = ix+incX, iy+incY
+	}
+	return sum
+}
+
+// DotuUnitary returns the unconjugated dot product of x and y, that is the
+// sum over every index i of x of y[i] * x[i].
+func DotuUnitary(x, y []complex128) (sum complex128) {
+	for i := range x {
+		sum += y[i] * x[i]
+	}
+	return sum
+}
+
+// DotuInc returns the sum of n strided products y[iy] * x[ix], starting at
+// the given ix and iy and advancing them by incX and incY after each term.
+func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) {
+	for remaining := int(n); remaining > 0; remaining-- {
+		sum += y[iy] * x[ix]
+		ix, iy = ix+incX, iy+incY
+	}
+	return sum
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyinc_amd64.s
new file mode 100644
index 00000000000..4d2c5e9ad50
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyinc_amd64.s
@@ -0,0 +1,151 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVSHDUP X3, X2
+#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
+// MOVSLDUP X3, X3
+#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
+// ADDSUBPS X2, X3
+#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+
+// MOVSHDUP X5, X4
+#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
+// MOVSLDUP X5, X5
+#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
+// ADDSUBPS X4, X5
+#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+
+// MOVSHDUP X7, X6
+#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
+// MOVSLDUP X7, X7
+#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
+// ADDSUBPS X6, X7
+#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+
+// MOVSHDUP X9, X8
+#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
+// MOVSLDUP X9, X9
+#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
+// ADDSUBPS X8, X9
+#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr): n strided updates y[iy] += alpha * x[ix].
+TEXT ·AxpyInc(SB), NOSPLIT, $0
+	MOVQ   x_base+8(FP), SI  // SI = &x
+	MOVQ   y_base+32(FP), DI // DI = &y
+	MOVQ   n+56(FP), CX      // CX = n
+	CMPQ   CX, $0            // if n==0 { return }
+	JE     axpyi_end
+	MOVQ   ix+80(FP), R8     // R8 = ix
+	MOVQ   iy+88(FP), R9     // R9 = iy
+	LEAQ   (SI)(R8*8), SI    // SI = &(x[ix])
+	LEAQ   (DI)(R9*8), DI    // DI = &(y[iy])
+	MOVQ   DI, DX            // DX = DI    // DX reads y, DI writes y
+	MOVQ   incX+64(FP), R8   // R8 = incX
+	SHLQ   $3, R8            // R8 *= sizeof(complex64)
+	MOVQ   incY+72(FP), R9   // R9 = incY
+	SHLQ   $3, R9            // R9 *= sizeof(complex64)
+	MOVSD  alpha+0(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
+	MOVAPS X0, X1
+	SHUFPS $0x11, X1, X1     // X1 = { 0, 0, real(a), imag(a) }
+	MOVAPS X0, X10           // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $3, CX            // CX = n % 4
+	SHRQ   $2, BX            // BX = floor( n / 4 )
+	JZ     axpyi_tail        // if BX == 0 { goto axpyi_tail }
+
+axpyi_loop: // do {
+	MOVSD (SI), X3       // X_i = { imag(x[i]), real(x[i]) }
+	MOVSD (SI)(R8*1), X5
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
+	MOVSD (SI), X7
+	MOVSD (SI)(R8*1), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]) }
+	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i])  }
+	MULPS X1, X2
+	MULPS X0, X3
+	MULPS X11, X4
+	MULPS X10, X5
+	MULPS X1, X6
+	MULPS X0, X7
+	MULPS X11, X8
+	MULPS X10, X9
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	MOVSD (DX), X2
+	MOVSD (DX)(R9*1), X4
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	MOVSD (DX), X6
+	MOVSD (DX)(R9*1), X8
+	ADDPS X2, X3
+	ADDPS X4, X5
+	ADDPS X6, X7
+	ADDPS X8, X9
+
+	MOVSD X3, (DI)       // y[i] = X_i
+	MOVSD X5, (DI)(R9*1)
+	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
+	MOVSD X7, (DI)
+	MOVSD X9, (DI)(R9*1)
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
+	DECQ  BX
+	JNZ   axpyi_loop     // }  while --BX > 0
+	CMPQ  CX, $0         // if CX == 0 { return }
+	JE    axpyi_end
+
+axpyi_tail: // do {
+	MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
+
+	// X_i     = { imag(a) * real(x[i]),  real(a) * real(x[i]) }
+	// X_(i-1) = { real(a) * imag(x[i]),  imag(a) * imag(x[i]) }
+	MULPS X1, X2
+	MULPS X0, X3
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//  }
+	ADDSUBPS_X2_X3 // (ai*x1r+ar*x1i, ar*x1r-ai*x1i)
+
+	// X_i = { imag(result[i]) + imag(y[i]),  real(result[i]) + real(y[i])  }
+	MOVSD (DI), X4
+	ADDPS X4, X3
+	MOVSD X3, (DI)   // y[i] = X_i
+	ADDQ  R8, SI     // SI += incX
+	ADDQ  R9, DI     // DI += incY
+	LOOP  axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyincto_amd64.s
new file mode 100644
index 00000000000..1519f2d9b38
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyincto_amd64.s
@@ -0,0 +1,156 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVSHDUP X3, X2
+#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
+// MOVSLDUP X3, X3
+#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
+// ADDSUBPS X2, X3
+#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+
+// MOVSHDUP X5, X4
+#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
+// MOVSLDUP X5, X5
+#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
+// ADDSUBPS X4, X5
+#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+
+// MOVSHDUP X7, X6
+#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
+// MOVSLDUP X7, X7
+#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
+// ADDSUBPS X6, X7
+#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+
+// MOVSHDUP X9, X8
+#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
+// MOVSLDUP X9, X9
+#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
+// ADDSUBPS X8, X9
+#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr): n strided stores dst[idst] = alpha*x[ix] + y[iy].
+TEXT ·AxpyIncTo(SB), NOSPLIT, $0
+	MOVQ   dst_base+0(FP), DI // DI = &dst
+	MOVQ   x_base+48(FP), SI  // SI = &x
+	MOVQ   y_base+72(FP), DX  // DX = &y
+	MOVQ   n+96(FP), CX       // CX = n
+	CMPQ   CX, $0             // if n==0 { return }
+	JE     axpyi_end
+	MOVQ   ix+120(FP), R8     // Load the first index
+	MOVQ   iy+128(FP), R9
+	MOVQ   idst+32(FP), R10
+	LEAQ   (SI)(R8*8), SI     // SI = &(x[ix])
+	LEAQ   (DX)(R9*8), DX     // DX = &(y[iy])
+	LEAQ   (DI)(R10*8), DI    // DI = &(dst[idst])
+	MOVQ   incX+104(FP), R8   // Incrementors*8 for easy iteration (ADDQ)
+	SHLQ   $3, R8
+	MOVQ   incY+112(FP), R9
+	SHLQ   $3, R9
+	MOVQ   incDst+24(FP), R10
+	SHLQ   $3, R10
+	MOVSD  alpha+40(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
+	MOVAPS X0, X1
+	SHUFPS $0x11, X1, X1      // X1 = { 0, 0, real(a), imag(a) }
+	MOVAPS X0, X10            // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $3, CX             // CX = n % 4
+	SHRQ   $2, BX             // BX = floor( n / 4 )
+	JZ     axpyi_tail         // if BX == 0 { goto axpyi_tail }
+
+axpyi_loop: // do {
+	MOVSD (SI), X3       // X_i = { imag(x[i]), real(x[i]) }
+	MOVSD (SI)(R8*1), X5
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
+	MOVSD (SI), X7
+	MOVSD (SI)(R8*1), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]) }
+	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i])  }
+	MULPS X1, X2
+	MULPS X0, X3
+	MULPS X11, X4
+	MULPS X10, X5
+	MULPS X1, X6
+	MULPS X0, X7
+	MULPS X11, X8
+	MULPS X10, X9
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	MOVSD (DX), X2
+	MOVSD (DX)(R9*1), X4
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	MOVSD (DX), X6
+	MOVSD (DX)(R9*1), X8
+	ADDPS X2, X3
+	ADDPS X4, X5
+	ADDPS X6, X7
+	ADDPS X8, X9
+
+	MOVSD X3, (DI)        // dst[i] = X_i
+	MOVSD X5, (DI)(R10*1)
+	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	MOVSD X7, (DI)
+	MOVSD X9, (DI)(R10*1)
+	LEAQ  (SI)(R8*2), SI  // SI = &(SI[incX*2])
+	LEAQ  (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	DECQ  BX
+	JNZ   axpyi_loop      // } while --BX > 0
+	CMPQ  CX, $0          // if CX == 0 { return }
+	JE    axpyi_end
+
+axpyi_tail:
+	MOVSD (SI), X3 // X_i     = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3 // X_i     = { real(x[i]), real(x[i]) }
+
+	// X_i     = { imag(a) * real(x[i]),  real(a) * real(x[i]) }
+	// X_(i-1) = { real(a) * imag(x[i]),  imag(a) * imag(x[i]) }
+	MULPS X1, X2
+	MULPS X0, X3
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//  }
+	ADDSUBPS_X2_X3
+
+	// X_i = { imag(result[i]) + imag(y[i]),  real(result[i]) + real(y[i])  }
+	MOVSD (DX), X4
+	ADDPS X4, X3
+	MOVSD X3, (DI)   // dst[i] = X_i
+	ADDQ  R8, SI     // SI += incX
+	ADDQ  R9, DX     // DX += incY
+	ADDQ  R10, DI    // DI += incDst
+	LOOP  axpyi_tail // } while --CX > 0 (one element per pass)
+
+axpyi_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitary_amd64.s
new file mode 100644
index 00000000000..71274c92cca
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitary_amd64.s
@@ -0,0 +1,160 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVSHDUP X3, X2
+#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
+// MOVSLDUP X3, X3
+#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
+// ADDSUBPS X2, X3
+#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+
+// MOVSHDUP X5, X4
+#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
+// MOVSLDUP X5, X5
+#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
+// ADDSUBPS X4, X5
+#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+
+// MOVSHDUP X7, X6
+#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
+// MOVSLDUP X7, X7
+#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
+// ADDSUBPS X6, X7
+#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+
+// MOVSHDUP X9, X8
+#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
+// MOVSLDUP X9, X9
+#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
+// ADDSUBPS X8, X9
+#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyUnitary(alpha complex64, x, y []complex64): y[i] += alpha * x[i] for i in [0, min(len(x), len(y))).
+TEXT ·AxpyUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+8(FP), SI  // SI = &x
+	MOVQ    y_base+32(FP), DI // DI = &y
+	MOVQ    x_len+16(FP), CX  // CX = min( len(x), len(y) )
+	CMPQ    y_len+40(FP), CX
+	CMOVQLE y_len+40(FP), CX
+	CMPQ    CX, $0            // if CX == 0 { return }
+	JE      caxy_end
+	PXOR    X0, X0            // Clear work registers and cache-align loop
+	PXOR    X1, X1
+	MOVSD   alpha+0(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
+	SHUFPD  $0, X0, X0        // X0  = { imag(a), real(a), imag(a), real(a) }
+	MOVAPS  X0, X1
+	SHUFPS  $0x11, X1, X1     // X1 = { real(a), imag(a), real(a), imag(a) }
+	XORQ    AX, AX            // i = 0
+	MOVQ    DI, BX            // Align on 16-byte boundary for ADDPS
+	ANDQ    $15, BX           // BX = &y & 15
+	JZ      caxy_no_trim      // if BX == 0 { goto caxy_no_trim }
+
+	// Trim first value in unaligned buffer. NOTE(review): one 8-byte element reaches the 16-byte alignment ADDPS needs only if &y is 8-byte aligned - confirm callers guarantee that.
+	XORPS X2, X2         // Clear work registers and cache-align loop
+	XORPS X3, X3
+	XORPS X4, X4
+	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
+	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+
+	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) }
+	ADDSUBPS_X2_X3
+	MOVSD (DI)(AX*8), X4 // X3 += y[i]
+	ADDPS X4, X3
+	MOVSD X3, (DI)(AX*8) // y[i]  = X3
+	INCQ  AX             // i++
+	DECQ  CX             // --CX
+	JZ    caxy_end       // if CX == 0 { return }
+
+caxy_no_trim:
+	MOVAPS X0, X10   // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $7, CX    // CX = n % 8
+	SHRQ   $3, BX    // BX = floor( n / 8 )
+	JZ     caxy_tail // if BX == 0 { goto caxy_tail }
+
+caxy_loop: // do {
+	// X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) }
+	MOVUPS (SI)(AX*8), X3
+	MOVUPS 16(SI)(AX*8), X5
+	MOVUPS 32(SI)(AX*8), X7
+	MOVUPS 48(SI)(AX*8), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i]),
+	// 		imag(a) * real(x[i+1]), real(a) * real(x[i+1])  }
+	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]),
+	//		real(a) * imag(x[i+1]), imag(a) * imag(x[i+1])  }
+	MULPS X1, X2
+	MULPS X0, X3
+	MULPS X11, X4
+	MULPS X10, X5
+	MULPS X1, X6
+	MULPS X0, X7
+	MULPS X11, X8
+	MULPS X10, X9
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//	imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]),
+	//	real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// X_i = { imag(result[i])   + imag(y[i]),   real(result[i])   + real(y[i]),
+	//	   imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1])  }
+	ADDPS  (DI)(AX*8), X3
+	ADDPS  16(DI)(AX*8), X5
+	ADDPS  32(DI)(AX*8), X7
+	ADDPS  48(DI)(AX*8), X9
+	MOVUPS X3, (DI)(AX*8)   // y[i], y[i+1] = X_i
+	MOVUPS X5, 16(DI)(AX*8)
+	MOVUPS X7, 32(DI)(AX*8)
+	MOVUPS X9, 48(DI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	DECQ   BX               // --BX
+	JNZ    caxy_loop        // }  while BX > 0
+	CMPQ   CX, $0           // if CX == 0  { return }
+	JE     caxy_end
+
+caxy_tail: // do {
+	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
+	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+
+	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	  real(a)*real(x[i]) - imag(a)*imag(x[i])   }
+	ADDSUBPS_X2_X3
+	MOVSD (DI)(AX*8), X4 // X3 += y[i]
+	ADDPS X4, X3
+	MOVSD X3, (DI)(AX*8) // y[i]  = X3
+	INCQ  AX             // ++i
+	LOOP  caxy_tail      // } while --CX > 0
+
+caxy_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitaryto_amd64.s
new file mode 100644
index 00000000000..2e80d8ca94b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitaryto_amd64.s
@@ -0,0 +1,157 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVSHDUP X3, X2
+#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
+// MOVSLDUP X3, X3
+#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
+// ADDSUBPS X2, X3
+#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+
+// MOVSHDUP X5, X4
+#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
+// MOVSLDUP X5, X5
+#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
+// ADDSUBPS X4, X5
+#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+
+// MOVSHDUP X7, X6
+#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
+// MOVSLDUP X7, X7
+#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
+// ADDSUBPS X6, X7
+#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+
+// MOVSHDUP X9, X8
+#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
+// MOVSLDUP X9, X9
+#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
+// ADDSUBPS X8, X9
+#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64)
+TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 // dst[i] = alpha*x[i] + y[i] for i < min(len(dst), len(x), len(y))
+	MOVQ    dst_base+0(FP), DI // DI = &dst
+	MOVQ    x_base+32(FP), SI  // SI = &x
+	MOVQ    y_base+56(FP), DX  // DX = &y
+	MOVQ    x_len+40(FP), CX
+	CMPQ    y_len+64(FP), CX   // CX = min( len(x), len(y), len(dst) )
+	CMOVQLE y_len+64(FP), CX
+	CMPQ    dst_len+8(FP), CX
+	CMOVQLE dst_len+8(FP), CX
+	CMPQ    CX, $0             // if CX == 0 { return }
+	JE      caxy_end
+	MOVSD   alpha+24(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
+	SHUFPD  $0, X0, X0         // X0  = { imag(a), real(a), imag(a), real(a) }
+	MOVAPS  X0, X1
+	SHUFPS  $0x11, X1, X1      // X1 = { real(a), imag(a), real(a), imag(a) }
+	XORQ    AX, AX             // i = 0
+	MOVQ    DX, BX             // Align y on 16-byte boundary for the ADDPS memory operand
+	ANDQ    $15, BX            // BX = &y & 15
+	JZ      caxy_no_trim       // if BX == 0 { goto caxy_no_trim }
+
+	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
+	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+
+	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) }
+	ADDSUBPS_X2_X3
+	MOVSD (DX)(AX*8), X4 // X3 += y[i]
+	ADDPS X4, X3
+	MOVSD X3, (DI)(AX*8) // dst[i]  = X3
+	INCQ  AX             // i++
+	DECQ  CX             // --CX
+	JZ    caxy_end       // if CX == 0 { return }; caxy_tail is a do-while and must not be entered with CX == 0
+
+caxy_no_trim:
+	MOVAPS X0, X10   // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $7, CX    // CX = n % 8
+	SHRQ   $3, BX    // BX = floor( n / 8 )
+	JZ     caxy_tail // if BX == 0 { goto caxy_tail }
+
+caxy_loop:
+	// X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) }
+	MOVUPS (SI)(AX*8), X3
+	MOVUPS 16(SI)(AX*8), X5
+	MOVUPS 32(SI)(AX*8), X7
+	MOVUPS 48(SI)(AX*8), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i]),
+	// 		imag(a) * real(x[i+1]), real(a) * real(x[i+1])  }
+	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]),
+	//		real(a) * imag(x[i+1]), imag(a) * imag(x[i+1])  }
+	MULPS X1, X2
+	MULPS X0, X3
+	MULPS X11, X4
+	MULPS X10, X5
+	MULPS X1, X6
+	MULPS X0, X7
+	MULPS X11, X8
+	MULPS X10, X9
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//	imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]),
+	//	real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// X_i = { imag(result[i])   + imag(y[i]),   real(result[i])   + real(y[i]),
+	//	   imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1])  }
+	ADDPS  (DX)(AX*8), X3
+	ADDPS  16(DX)(AX*8), X5
+	ADDPS  32(DX)(AX*8), X7
+	ADDPS  48(DX)(AX*8), X9
+	MOVUPS X3, (DI)(AX*8)   // dst[i:i+1] = X_i
+	MOVUPS X5, 16(DI)(AX*8)
+	MOVUPS X7, 32(DI)(AX*8)
+	MOVUPS X9, 48(DI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	DECQ   BX               // --BX
+	JNZ    caxy_loop        // }  while BX > 0
+	CMPQ   CX, $0           // if CX == 0  { return }
+	JE     caxy_end
+
+caxy_tail: // do {
+	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
+	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+
+	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	  real(a)*real(x[i]) - imag(a)*imag(x[i])  }
+	ADDSUBPS_X2_X3
+	MOVSD (DX)(AX*8), X4 // X3 += y[i]
+	ADDPS X4, X3
+	MOVSD X3, (DI)(AX*8) // dst[i]  = X3
+	INCQ  AX             // ++i
+	LOOP  caxy_tail      // } while --CX > 0
+
+caxy_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go
new file mode 100644
index 00000000000..910e1e5c732
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go
@@ -0,0 +1,7 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c64
+
+func conj(c complex64) complex64 { return complex(real(c), -imag(c)) } // conj returns the complex conjugate of c.
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go
new file mode 100644
index 00000000000..35f1b2a26b3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package c64 provides complex64 vector primitives.
+package c64 // import "gonum.org/v1/gonum/internal/asm/c64"
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcinc_amd64.s
new file mode 100644
index 00000000000..8efda0bb778
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcinc_amd64.s
@@ -0,0 +1,160 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVSHDUP_X3_X2    LONG $0xD3160FF3 // MOVSHDUP X3, X2
+#define MOVSHDUP_X5_X4    LONG $0xE5160FF3 // MOVSHDUP X5, X4
+#define MOVSHDUP_X7_X6    LONG $0xF7160FF3 // MOVSHDUP X7, X6
+#define MOVSHDUP_X9_X8    LONG $0x160F45F3; BYTE $0xC1 // MOVSHDUP X9, X8
+
+#define MOVSLDUP_X3_X3    LONG $0xDB120FF3 // MOVSLDUP X3, X3
+#define MOVSLDUP_X5_X5    LONG $0xED120FF3 // MOVSLDUP X5, X5
+#define MOVSLDUP_X7_X7    LONG $0xFF120FF3 // MOVSLDUP X7, X7
+#define MOVSLDUP_X9_X9    LONG $0x120F45F3; BYTE $0xC9 // MOVSLDUP X9, X9
+
+#define ADDSUBPS_X2_X3    LONG $0xDAD00FF2 // ADDSUBPS X2, X3
+#define ADDSUBPS_X4_X5    LONG $0xECD00FF2 // ADDSUBPS X4, X5
+#define ADDSUBPS_X6_X7    LONG $0xFED00FF2 // ADDSUBPS X6, X7
+#define ADDSUBPS_X8_X9    LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define INC_X R8
+#define INCx3_X R9
+#define INC_Y R10
+#define INCx3_Y R11
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
+TEXT ·DotcInc(SB), NOSPLIT, $0 // sum = sum over i < n of conj(x[ix+i*incX]) * y[iy+i*incY]
+	MOVQ   x_base+0(FP), X_PTR     // X_PTR = &x
+	MOVQ   y_base+24(FP), Y_PTR    // Y_PTR = &y
+	PXOR   SUM, SUM                // SUM = 0
+	PXOR   P_SUM, P_SUM            // P_SUM = 0
+	MOVQ   n+48(FP), LEN           // LEN = n
+	CMPQ   LEN, $0                 // if LEN == 0 { return }
+	JE     dotc_end
+	MOVQ   ix+72(FP), INC_X
+	MOVQ   iy+80(FP), INC_Y
+	LEAQ   (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[ix])
+	LEAQ   (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(Y_PTR[iy])
+	MOVQ   incX+56(FP), INC_X      // INC_X = incX * sizeof(complex64)
+	SHLQ   $3, INC_X
+	MOVQ   incY+64(FP), INC_Y      // INC_Y = incY * sizeof(complex64)
+	SHLQ   $3, INC_Y
+	MOVSS  $(-1.0), NEG1
+	SHUFPS $0, NEG1, NEG1          // { -1, -1, -1, -1 }
+
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL  // TAIL = LEN % 4
+	SHRQ $2, LEN   // LEN = floor( LEN / 4 )
+	JZ   dotc_tail // if LEN == 0 { goto dotc_tail }
+
+	MOVUPS NEG1, P_NEG1              // Copy NEG1 for pipelining
+	LEAQ   (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
+	LEAQ   (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
+
+dotc_loop: // do {
+	MOVSD (X_PTR), X3            // X_i = { imag(x[i]), real(x[i]) }
+	MOVSD (X_PTR)(INC_X*1), X5
+	MOVSD (X_PTR)(INC_X*2), X7
+	MOVSD (X_PTR)(INCx3_X*1), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_(i-1) = { -imag(x[i]), -imag(x[i]) }
+	MULPS NEG1, X2
+	MULPS P_NEG1, X4
+	MULPS NEG1, X6
+	MULPS P_NEG1, X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVSD (Y_PTR), X10
+	MOVSD (Y_PTR)(INC_Y*1), X11
+	MOVSD (Y_PTR)(INC_Y*2), X12
+	MOVSD (Y_PTR)(INCx3_Y*1), X13
+
+	// X_i     = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	MULPS X10, X3
+	MULPS X11, X5
+	MULPS X12, X7
+	MULPS X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) }
+	SHUFPS $0xB1, X10, X10
+	SHUFPS $0xB1, X11, X11
+	SHUFPS $0xB1, X12, X12
+	SHUFPS $0xB1, X13, X13
+
+	// X_(i-1) = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
+	MULPS X10, X2
+	MULPS X11, X4
+	MULPS X12, X6
+	MULPS X13, X8
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i]) * real(x[i]) - real(y[i]) * imag(x[i]),
+	//	real(result[i]):  real(y[i]) * real(x[i]) + imag(y[i]) * imag(x[i])  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// SUM += X_i
+	ADDPS X3, SUM
+	ADDPS X5, P_SUM
+	ADDPS X7, SUM
+	ADDPS X9, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y*4])
+
+	DECQ LEN
+	JNZ  dotc_loop // } while --LEN > 0
+
+	ADDPS P_SUM, SUM // SUM = { P_SUM + SUM }
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dotc_end
+
+dotc_tail: // do {
+	MOVSD  (X_PTR), X3    // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2        // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3        // X_i = { real(x[i]), real(x[i]) }
+	MULPS  NEG1, X2       // X_(i-1) = { -imag(x[i]), -imag(x[i]) }
+	MOVSD  (Y_PTR), X10   // X_j = { imag(y[i]), real(y[i]) }; 8-byte load, never reads past y[i]
+	MULPS  X10, X3        // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2        // X_(i-1) = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+	ADDPS X3, SUM      // SUM += X_i
+	ADDQ  INC_X, X_PTR // X_PTR += INC_X
+	ADDQ  INC_Y, Y_PTR // Y_PTR += INC_Y
+	DECQ  TAIL
+	JNZ   dotc_tail    // } while --TAIL > 0
+
+dotc_end:
+	MOVSD SUM, sum+88(FP) // return SUM (low 64 bits only)
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcunitary_amd64.s
new file mode 100644
index 00000000000..78f43eee06a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcunitary_amd64.s
@@ -0,0 +1,208 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVSLDUP_XPTR_IDX_8__X3    LONG $0x1C120FF3; BYTE $0xC6 // MOVSLDUP (SI)(AX*8), X3
+#define MOVSLDUP_16_XPTR_IDX_8__X5    LONG $0x6C120FF3; WORD $0x10C6 // MOVSLDUP 16(SI)(AX*8), X5
+#define MOVSLDUP_32_XPTR_IDX_8__X7    LONG $0x7C120FF3; WORD $0x20C6 // MOVSLDUP 32(SI)(AX*8), X7
+#define MOVSLDUP_48_XPTR_IDX_8__X9    LONG $0x120F44F3; WORD $0xC64C; BYTE $0x30 // MOVSLDUP 48(SI)(AX*8), X9
+
+#define MOVSHDUP_XPTR_IDX_8__X2    LONG $0x14160FF3; BYTE $0xC6 // MOVSHDUP (SI)(AX*8), X2
+#define MOVSHDUP_16_XPTR_IDX_8__X4    LONG $0x64160FF3; WORD $0x10C6 // MOVSHDUP 16(SI)(AX*8), X4
+#define MOVSHDUP_32_XPTR_IDX_8__X6    LONG $0x74160FF3; WORD $0x20C6 // MOVSHDUP 32(SI)(AX*8), X6
+#define MOVSHDUP_48_XPTR_IDX_8__X8    LONG $0x160F44F3; WORD $0xC644; BYTE $0x30 // MOVSHDUP 48(SI)(AX*8), X8
+
+#define MOVSHDUP_X3_X2    LONG $0xD3160FF3 // MOVSHDUP X3, X2
+#define MOVSLDUP_X3_X3    LONG $0xDB120FF3 // MOVSLDUP X3, X3
+
+#define ADDSUBPS_X2_X3    LONG $0xDAD00FF2 // ADDSUBPS X2, X3
+#define ADDSUBPS_X4_X5    LONG $0xECD00FF2 // ADDSUBPS X4, X5
+#define ADDSUBPS_X6_X7    LONG $0xFED00FF2 // ADDSUBPS X6, X7
+#define ADDSUBPS_X8_X9    LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define IDX AX
+#define I_IDX DX
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotcUnitary(x, y []complex64) (sum complex64)
+TEXT ·DotcUnitary(SB), NOSPLIT, $0 // sum = sum over i < min(len(x), len(y)) of conj(x[i]) * y[i]
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	PXOR    SUM, SUM             // SUM = 0
+	PXOR    P_SUM, P_SUM         // P_SUM = 0
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	CMPQ    LEN, $0              // if LEN == 0 { return }
+	JE      dotc_end
+	XORQ    IDX, IDX             // i = 0
+	MOVSS   $(-1.0), NEG1
+	SHUFPS  $0, NEG1, NEG1       // { -1, -1, -1, -1 }
+
+	MOVQ X_PTR, DX
+	ANDQ $15, DX      // DX = &x & 15
+	JZ   dotc_aligned // if DX == 0 { goto dotc_aligned }
+
+	MOVSD  (X_PTR)(IDX*8), X3  // X_i     = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2             // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3             // X_i     = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR)(IDX*8), X10 // X_j     = { imag(y[i]), real(y[i]) }
+	MULPS  NEG1, X2            // X_(i-1) = { -imag(x[i]), -imag(x[i]) }
+	MULPS  X10, X3             // X_i     = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0xB1, X10, X10     // X_j     = { real(y[i]), imag(y[i]) }; $0xB1 keeps the upper lanes zero, they are folded into the result later
+	MULPS  X10, X2             // X_(i-1) = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	MOVAPS X3, SUM  // SUM = X_i
+	INCQ   IDX      // IDX++
+	DECQ   LEN      // LEN--
+	JZ     dotc_ret // if LEN == 0 { goto dotc_ret }
+
+dotc_aligned:
+	MOVQ   LEN, TAIL
+	ANDQ   $7, TAIL     // TAIL = LEN % 8
+	SHRQ   $3, LEN      // LEN = floor( LEN / 8 )
+	JZ     dotc_tail    // if LEN == 0 { goto dotc_tail }
+	MOVUPS NEG1, P_NEG1 // Copy NEG1 for pipelining
+
+dotc_loop: // do {
+	MOVSLDUP_XPTR_IDX_8__X3    // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSLDUP_16_XPTR_IDX_8__X5
+	MOVSLDUP_32_XPTR_IDX_8__X7
+	MOVSLDUP_48_XPTR_IDX_8__X9
+
+	MOVSHDUP_XPTR_IDX_8__X2    // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVSHDUP_16_XPTR_IDX_8__X4
+	MOVSHDUP_32_XPTR_IDX_8__X6
+	MOVSHDUP_48_XPTR_IDX_8__X8
+
+	// X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_(i-1) = { -imag(x[i]), -imag(x[i]), -imag(x[i+1]), -imag(x[i+1]) }
+	MULPS NEG1, X2
+	MULPS P_NEG1, X4
+	MULPS NEG1, X6
+	MULPS P_NEG1, X8
+
+	// X_i     = {  imag(y[i])   * real(x[i]),   real(y[i])   * real(x[i]),
+	// 		imag(y[i+1]) * real(x[i+1]), real(y[i+1]) * real(x[i+1])  }
+	MULPS X10, X3
+	MULPS X11, X5
+	MULPS X12, X7
+	MULPS X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]), real(y[i+1]), imag(y[i+1]) }
+	SHUFPS $0xB1, X10, X10
+	SHUFPS $0xB1, X11, X11
+	SHUFPS $0xB1, X12, X12
+	SHUFPS $0xB1, X13, X13
+
+	// X_(i-1) = {  real(y[i])   * -imag(x[i]),   imag(y[i])   * -imag(x[i]),
+	//		real(y[i+1]) * -imag(x[i+1]), imag(y[i+1]) * -imag(x[i+1])  }
+	MULPS X10, X2
+	MULPS X11, X4
+	MULPS X12, X6
+	MULPS X13, X8
+
+	// X_i = {
+	//	imag(result[i]):   imag(y[i])   * real(x[i])   - real(y[i])   * imag(x[i]),
+	//	real(result[i]):   real(y[i])   * real(x[i])   + imag(y[i])   * imag(x[i]),
+	//	imag(result[i+1]): imag(y[i+1]) * real(x[i+1]) - real(y[i+1]) * imag(x[i+1]),
+	//	real(result[i+1]): real(y[i+1]) * real(x[i+1]) + imag(y[i+1]) * imag(x[i+1]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// SUM += X_i
+	ADDPS X3, SUM
+	ADDPS X5, P_SUM
+	ADDPS X7, SUM
+	ADDPS X9, P_SUM
+
+	ADDQ $8, IDX   // IDX += 8
+	DECQ LEN
+	JNZ  dotc_loop // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	CMPQ TAIL, $0 // if TAIL == 0 { return }
+	JE   dotc_end
+
+dotc_tail:
+	MOVQ TAIL, LEN
+	SHRQ $1, LEN       // LEN = floor( LEN / 2 )
+	JZ   dotc_tail_one // if LEN == 0 { goto dotc_tail_one }
+
+dotc_tail_two: // do {
+	MOVSLDUP_XPTR_IDX_8__X3    // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSHDUP_XPTR_IDX_8__X2    // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) }
+	MULPS  NEG1, X2            // X_(i-1) = { -imag(x[i]), -imag(x[i]), -imag(x[i+1]), -imag(x[i+1]) }
+	MULPS  X10, X3             // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0xB1, X10, X10     // X_j = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2             // X_(i-1) = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	ADDPS X3, SUM // SUM += X_i
+
+	ADDQ $2, IDX       // IDX += 2
+	DECQ LEN
+	JNZ  dotc_tail_two // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	ANDQ $1, TAIL
+	JZ   dotc_end
+
+dotc_tail_one:
+	MOVSD  (X_PTR)(IDX*8), X3  // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2             // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3             // X_i = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
+	MULPS  NEG1, X2            // X_(i-1) = { -imag(x[i]), -imag(x[i]) }
+	MULPS  X10, X3             // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0x1, X10, X10      // X_j = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2             // X_(i-1) = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	ADDPS X3, SUM // SUM += X_i
+
+dotc_end:
+	ADDPS   P_SUM, SUM   // SUM = { P_SUM[0] + SUM[0] }
+	MOVHLPS P_SUM, P_SUM // P_SUM = { P_SUM[1], P_SUM[1] }
+	ADDPS   P_SUM, SUM   // SUM = { P_SUM[1] + SUM[0] }
+
+dotc_ret:
+	MOVSD SUM, sum+48(FP) // return SUM
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuinc_amd64.s
new file mode 100644
index 00000000000..3dc2e144a88
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuinc_amd64.s
@@ -0,0 +1,148 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVSHDUP_X3_X2    LONG $0xD3160FF3 // MOVSHDUP X3, X2
+#define MOVSHDUP_X5_X4    LONG $0xE5160FF3 // MOVSHDUP X5, X4
+#define MOVSHDUP_X7_X6    LONG $0xF7160FF3 // MOVSHDUP X7, X6
+#define MOVSHDUP_X9_X8    LONG $0x160F45F3; BYTE $0xC1 // MOVSHDUP X9, X8
+
+#define MOVSLDUP_X3_X3    LONG $0xDB120FF3 // MOVSLDUP X3, X3
+#define MOVSLDUP_X5_X5    LONG $0xED120FF3 // MOVSLDUP X5, X5
+#define MOVSLDUP_X7_X7    LONG $0xFF120FF3 // MOVSLDUP X7, X7
+#define MOVSLDUP_X9_X9    LONG $0x120F45F3; BYTE $0xC9 // MOVSLDUP X9, X9
+
+#define ADDSUBPS_X2_X3    LONG $0xDAD00FF2 // ADDSUBPS X2, X3
+#define ADDSUBPS_X4_X5    LONG $0xECD00FF2 // ADDSUBPS X4, X5
+#define ADDSUBPS_X6_X7    LONG $0xFED00FF2 // ADDSUBPS X6, X7
+#define ADDSUBPS_X8_X9    LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define INC_X R8
+#define INCx3_X R9
+#define INC_Y R10
+#define INCx3_Y R11
+
+// func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
+TEXT ·DotuInc(SB), NOSPLIT, $0 // sum = sum over i < n of x[ix+i*incX] * y[iy+i*incY]
+	MOVQ x_base+0(FP), X_PTR     // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR    // Y_PTR = &y
+	PXOR SUM, SUM                // SUM = 0
+	PXOR P_SUM, P_SUM            // P_SUM = 0
+	MOVQ n+48(FP), LEN           // LEN = n
+	CMPQ LEN, $0                 // if LEN == 0 { return }
+	JE   dotu_end
+	MOVQ ix+72(FP), INC_X
+	MOVQ iy+80(FP), INC_Y
+	LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[ix])
+	LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(Y_PTR[iy])
+	MOVQ incX+56(FP), INC_X      // INC_X = incX * sizeof(complex64)
+	SHLQ $3, INC_X
+	MOVQ incY+64(FP), INC_Y      // INC_Y = incY * sizeof(complex64)
+	SHLQ $3, INC_Y
+
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL  // TAIL = LEN % 4
+	SHRQ $2, LEN   // LEN = floor( LEN / 4 )
+	JZ   dotu_tail // if LEN == 0 { goto dotu_tail }
+
+	LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
+	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
+
+dotu_loop: // do {
+	MOVSD (X_PTR), X3            // X_i = { imag(x[i]), real(x[i]) }
+	MOVSD (X_PTR)(INC_X*1), X5
+	MOVSD (X_PTR)(INC_X*2), X7
+	MOVSD (X_PTR)(INCx3_X*1), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVSD (Y_PTR), X10
+	MOVSD (Y_PTR)(INC_Y*1), X11
+	MOVSD (Y_PTR)(INC_Y*2), X12
+	MOVSD (Y_PTR)(INCx3_Y*1), X13
+
+	// X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	MULPS X10, X3
+	MULPS X11, X5
+	MULPS X12, X7
+	MULPS X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) }
+	SHUFPS $0xB1, X10, X10
+	SHUFPS $0xB1, X11, X11
+	SHUFPS $0xB1, X12, X12
+	SHUFPS $0xB1, X13, X13
+
+	// X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+	MULPS X10, X2
+	MULPS X11, X4
+	MULPS X12, X6
+	MULPS X13, X8
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i]) * real(x[i]) + real(y[i]) * imag(x[i]),
+	//	real(result[i]):  real(y[i]) * real(x[i]) - imag(y[i]) * imag(x[i])  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// SUM += X_i
+	ADDPS X3, SUM
+	ADDPS X5, P_SUM
+	ADDPS X7, SUM
+	ADDPS X9, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y*4])
+
+	DECQ LEN
+	JNZ  dotu_loop // } while --LEN > 0
+
+	ADDPS P_SUM, SUM // SUM = { P_SUM + SUM }
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dotu_end
+
+dotu_tail: // do {
+	MOVSD  (X_PTR), X3    // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2        // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3        // X_i = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR), X10   // X_j = { imag(y[i]), real(y[i]) }; 8-byte load, never reads past y[i]
+	MULPS  X10, X3        // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2        // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])  }
+	ADDSUBPS_X2_X3
+	ADDPS X3, SUM      // SUM += X_i
+	ADDQ  INC_X, X_PTR // X_PTR += INC_X
+	ADDQ  INC_Y, Y_PTR // Y_PTR += INC_Y
+	DECQ  TAIL
+	JNZ   dotu_tail    // } while --TAIL > 0
+
+dotu_end:
+	MOVSD SUM, sum+88(FP) // return SUM (low 64 bits only)
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuunitary_amd64.s
new file mode 100644
index 00000000000..f11c6de78f2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuunitary_amd64.s
@@ -0,0 +1,197 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVSLDUP_XPTR_IDX_8__X3    LONG $0x1C120FF3; BYTE $0xC6 // MOVSLDUP (SI)(AX*8), X3
+#define MOVSLDUP_16_XPTR_IDX_8__X5    LONG $0x6C120FF3; WORD $0x10C6 // MOVSLDUP 16(SI)(AX*8), X5
+#define MOVSLDUP_32_XPTR_IDX_8__X7    LONG $0x7C120FF3; WORD $0x20C6 // MOVSLDUP 32(SI)(AX*8), X7
+#define MOVSLDUP_48_XPTR_IDX_8__X9    LONG $0x120F44F3; WORD $0xC64C; BYTE $0x30 // MOVSLDUP 48(SI)(AX*8), X9
+
+#define MOVSHDUP_XPTR_IDX_8__X2    LONG $0x14160FF3; BYTE $0xC6 // MOVSHDUP (SI)(AX*8), X2
+#define MOVSHDUP_16_XPTR_IDX_8__X4    LONG $0x64160FF3; WORD $0x10C6 // MOVSHDUP 16(SI)(AX*8), X4
+#define MOVSHDUP_32_XPTR_IDX_8__X6    LONG $0x74160FF3; WORD $0x20C6 // MOVSHDUP 32(SI)(AX*8), X6
+#define MOVSHDUP_48_XPTR_IDX_8__X8    LONG $0x160F44F3; WORD $0xC644; BYTE $0x30 // MOVSHDUP 48(SI)(AX*8), X8
+
+#define MOVSHDUP_X3_X2    LONG $0xD3160FF3 // MOVSHDUP X3, X2
+#define MOVSLDUP_X3_X3    LONG $0xDB120FF3 // MOVSLDUP X3, X3
+
+#define ADDSUBPS_X2_X3    LONG $0xDAD00FF2 // ADDSUBPS X2, X3
+#define ADDSUBPS_X4_X5    LONG $0xECD00FF2 // ADDSUBPS X4, X5
+#define ADDSUBPS_X6_X7    LONG $0xFED00FF2 // ADDSUBPS X6, X7
+#define ADDSUBPS_X8_X9    LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define IDX AX
+#define I_IDX DX
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotuUnitary(x, y []complex64) (sum complex64)
+TEXT ·DotuUnitary(SB), NOSPLIT, $0 // sum = sum over i < min(len(x), len(y)) of x[i] * y[i]
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	PXOR    SUM, SUM             // SUM = 0
+	PXOR    P_SUM, P_SUM         // P_SUM = 0
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	CMPQ    LEN, $0              // if LEN == 0 { return }
+	JE      dotu_end
+	XORQ    IDX, IDX             // IDX = 0
+
+	MOVQ X_PTR, DX
+	ANDQ $15, DX      // DX = &x & 15
+	JZ   dotu_aligned // if DX == 0 { goto dotu_aligned }
+
+	MOVSD  (X_PTR)(IDX*8), X3  // X_i     = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2             // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3             // X_i     = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR)(IDX*8), X10 // X_j     = { imag(y[i]), real(y[i]) }
+	MULPS  X10, X3             // X_i     = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0xB1, X10, X10     // X_j     = { real(y[i]), imag(y[i]) }; $0xB1 keeps the upper lanes zero, they are folded into the result later
+	MULPS  X10, X2             // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	MOVAPS X3, SUM  // SUM = X_i
+	INCQ   IDX      // IDX++
+	DECQ   LEN      // LEN--
+	JZ     dotu_end // if LEN == 0 { goto dotu_end }
+
+dotu_aligned:
+	MOVQ LEN, TAIL
+	ANDQ $7, TAIL     // TAIL = LEN % 8
+	SHRQ $3, LEN      // LEN = floor( LEN / 8 )
+	JZ   dotu_tail    // if LEN == 0 { goto dotu_tail }
+	PXOR P_SUM, P_SUM
+
+dotu_loop: // do {
+	MOVSLDUP_XPTR_IDX_8__X3    // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSLDUP_16_XPTR_IDX_8__X5
+	MOVSLDUP_32_XPTR_IDX_8__X7
+	MOVSLDUP_48_XPTR_IDX_8__X9
+
+	MOVSHDUP_XPTR_IDX_8__X2    // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVSHDUP_16_XPTR_IDX_8__X4
+	MOVSHDUP_32_XPTR_IDX_8__X6
+	MOVSHDUP_48_XPTR_IDX_8__X8
+
+	// X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_i     = {  imag(y[i])   * real(x[i]),   real(y[i])   * real(x[i]),
+	// 		imag(y[i+1]) * real(x[i+1]), real(y[i+1]) * real(x[i+1])  }
+	MULPS X10, X3
+	MULPS X11, X5
+	MULPS X12, X7
+	MULPS X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]), real(y[i+1]), imag(y[i+1]) }
+	SHUFPS $0xB1, X10, X10
+	SHUFPS $0xB1, X11, X11
+	SHUFPS $0xB1, X12, X12
+	SHUFPS $0xB1, X13, X13
+
+	// X_(i-1) = {  real(y[i])   * imag(x[i]),   imag(y[i])   * imag(x[i]),
+	//		real(y[i+1]) * imag(x[i+1]), imag(y[i+1]) * imag(x[i+1])  }
+	MULPS X10, X2
+	MULPS X11, X4
+	MULPS X12, X6
+	MULPS X13, X8
+
+	// X_i = {
+	//	imag(result[i]):   imag(y[i])   * real(x[i])   + real(y[i])   * imag(x[i]),
+	//	real(result[i]):   real(y[i])   * real(x[i])   - imag(y[i])   * imag(x[i]),
+	//	imag(result[i+1]): imag(y[i+1]) * real(x[i+1]) + real(y[i+1]) * imag(x[i+1]),
+	//	real(result[i+1]): real(y[i+1]) * real(x[i+1]) - imag(y[i+1]) * imag(x[i+1]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// SUM += X_i
+	ADDPS X3, SUM
+	ADDPS X5, P_SUM
+	ADDPS X7, SUM
+	ADDPS X9, P_SUM
+
+	ADDQ $8, IDX   // IDX += 8
+	DECQ LEN
+	JNZ  dotu_loop // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	CMPQ TAIL, $0 // if TAIL == 0 { return }
+	JE   dotu_end
+
+dotu_tail:
+	MOVQ TAIL, LEN
+	SHRQ $1, LEN       // LEN = floor( LEN / 2 )
+	JZ   dotu_tail_one // if LEN == 0 { goto dotu_tail_one }
+
+dotu_tail_two: // do {
+	MOVSLDUP_XPTR_IDX_8__X3    // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSHDUP_XPTR_IDX_8__X2    // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) }
+	MULPS  X10, X3             // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0xB1, X10, X10     // X_j = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2             // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	ADDPS X3, SUM // SUM += X_i
+
+	ADDQ $2, IDX       // IDX += 2
+	DECQ LEN
+	JNZ  dotu_tail_two // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	ANDQ $1, TAIL
+	JZ   dotu_end
+
+dotu_tail_one:
+	MOVSD  (X_PTR)(IDX*8), X3  // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2             // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3             // X_i = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
+	MULPS  X10, X3             // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0x1, X10, X10      // X_j = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2             // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	ADDPS X3, SUM // SUM += X_i
+
+dotu_end:
+	ADDPS   P_SUM, SUM   // SUM = { P_SUM[0] + SUM[0] }
+	MOVHLPS P_SUM, P_SUM // P_SUM = { P_SUM[1], P_SUM[1] }
+	ADDPS   P_SUM, SUM   // SUM = { P_SUM[1] + SUM[0] }
+
+dotu_ret:
+	MOVSD SUM, sum+48(FP) // return SUM
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go
new file mode 100644
index 00000000000..6db0aa36f3f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go
@@ -0,0 +1,85 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c64
+
+// ScalUnitary is
+//
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha complex64, x []complex64) {
+	for i := range x {
+		x[i] *= alpha
+	}
+}
+
+// ScalUnitaryTo is
+//
+//	for i, v := range x {
+//		dst[i] = alpha * v
+//	}
+func ScalUnitaryTo(dst []complex64, alpha complex64, x []complex64) {
+	for i, v := range x {
+		dst[i] = alpha * v
+	}
+}
+
+// ScalInc is
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += incX
+//	}
+func ScalInc(alpha complex64, x []complex64, n, incX uintptr) {
+	var ix uintptr
+	for i := 0; i < int(n); i++ {
+		x[ix] *= alpha
+		ix += incX
+	}
+}
+
+// ScalIncTo is
+//
+//	var idst, ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha * x[ix]
+//		ix += incX
+//		idst += incDst
+//	}
+func ScalIncTo(dst []complex64, incDst uintptr, alpha complex64, x []complex64, n, incX uintptr) {
+	var idst, ix uintptr
+	for i := 0; i < int(n); i++ {
+		dst[idst] = alpha * x[ix]
+		ix += incX
+		idst += incDst
+	}
+}
+
+// SscalUnitary is
+//
+//	for i, v := range x {
+//		x[i] = complex(real(v)*alpha, imag(v)*alpha)
+//	}
+func SscalUnitary(alpha float32, x []complex64) {
+	for i, v := range x {
+		x[i] = complex(real(v)*alpha, imag(v)*alpha)
+	}
+}
+
+// SscalInc is
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+//		ix += inc
+//	}
+func SscalInc(alpha float32, x []complex64, n, inc uintptr) {
+	var ix uintptr
+	for i := 0; i < int(n); i++ {
+		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+		ix += inc
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go
new file mode 100644
index 00000000000..0aa626e141a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go
@@ -0,0 +1,180 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c64
+
+import (
+	"gonum.org/v1/gonum/internal/cmplx64"
+	"gonum.org/v1/gonum/internal/math32"
+)
+
+// Add is
+//
+//	for i, v := range s {
+//		dst[i] += v
+//	}
+func Add(dst, s []complex64) {
+	for i, v := range s {
+		dst[i] += v
+	}
+}
+
+// AddConst is
+//
+//	for i := range x {
+//		x[i] += alpha
+//	}
+func AddConst(alpha complex64, x []complex64) {
+	for i := range x {
+		x[i] += alpha
+	}
+}
+
+// CumSum is
+//
+//	if len(s) == 0 {
+//		return dst
+//	}
+//	dst[0] = s[0]
+//	for i, v := range s[1:] {
+//		dst[i+1] = dst[i] + v
+//	}
+//	return dst
+func CumSum(dst, s []complex64) []complex64 {
+	if len(s) == 0 {
+		return dst
+	}
+	dst[0] = s[0]
+	for i, v := range s[1:] {
+		dst[i+1] = dst[i] + v
+	}
+	return dst
+}
+
+// CumProd is
+//
+//	if len(s) == 0 {
+//		return dst
+//	}
+//	dst[0] = s[0]
+//	for i, v := range s[1:] {
+//		dst[i+1] = dst[i] * v
+//	}
+//	return dst
+func CumProd(dst, s []complex64) []complex64 {
+	if len(s) == 0 {
+		return dst
+	}
+	dst[0] = s[0]
+	for i, v := range s[1:] {
+		dst[i+1] = dst[i] * v
+	}
+	return dst
+}
+
+// Div is
+//
+//	for i, v := range s {
+//		dst[i] /= v
+//	}
+func Div(dst, s []complex64) {
+	for i, v := range s {
+		dst[i] /= v
+	}
+}
+
+// DivTo is
+//
+//	for i, v := range s {
+//		dst[i] = v / t[i]
+//	}
+//	return dst
+func DivTo(dst, s, t []complex64) []complex64 {
+	for i, v := range s {
+		dst[i] = v / t[i]
+	}
+	return dst
+}
+
+// DotUnitary is
+//
+//	for i, v := range x {
+//		sum += conj(v) * y[i]
+//	}
+//	return sum
+func DotUnitary(x, y []complex64) (sum complex64) {
+	for i, v := range x {
+		sum += cmplx64.Conj(v) * y[i]
+	}
+	return sum
+}
+
+// L2DistanceUnitary returns the L2-norm of x-y.
+func L2DistanceUnitary(x, y []complex64) (norm float32) {
+	var scale float32
+	sumSquares := float32(1.0)
+	for i, v := range x {
+		v -= y[i]
+		if v == 0 {
+			continue
+		}
+		absxi := cmplx64.Abs(v)
+		if math32.IsNaN(absxi) {
+			return math32.NaN()
+		}
+		if scale < absxi {
+			s := scale / absxi
+			sumSquares = 1 + sumSquares*s*s
+			scale = absxi
+		} else {
+			s := absxi / scale
+			sumSquares += s * s
+		}
+	}
+	if math32.IsInf(scale, 1) {
+		return math32.Inf(1)
+	}
+	return scale * math32.Sqrt(sumSquares)
+}
+
+// L2NormUnitary returns the L2-norm of x.
+func L2NormUnitary(x []complex64) (norm float32) {
+	var scale float32
+	sumSquares := float32(1.0)
+	for _, v := range x {
+		if v == 0 {
+			continue
+		}
+		absxi := cmplx64.Abs(v)
+		if math32.IsNaN(absxi) {
+			return math32.NaN()
+		}
+		if scale < absxi {
+			s := scale / absxi
+			sumSquares = 1 + sumSquares*s*s
+			scale = absxi
+		} else {
+			s := absxi / scale
+			sumSquares += s * s
+		}
+	}
+	if math32.IsInf(scale, 1) {
+		return math32.Inf(1)
+	}
+	return scale * math32.Sqrt(sumSquares)
+}
+
+// Sum is
+//
+//	var sum complex64
+//	for i := range x {
+//	    sum += x[i]
+//	}
+func Sum(x []complex64) complex64 {
+	var sum complex64
+	for _, v := range x {
+		sum += v
+	}
+	return sum
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_amd64.go
new file mode 100644
index 00000000000..71367b016fa
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_amd64.go
@@ -0,0 +1,77 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package c64
+
+// AxpyUnitary is
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha complex64, x, y []complex64)
+
+// AxpyUnitaryTo is
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64)
+
+// AxpyInc is
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
+
+// AxpyIncTo is
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
+
+// DotcUnitary is
+//
+//	for i, v := range x {
+//		sum += y[i] * conj(v)
+//	}
+//	return sum
+func DotcUnitary(x, y []complex64) (sum complex64)
+
+// DotcInc is
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * conj(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
+
+// DotuUnitary is
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotuUnitary(x, y []complex64) (sum complex64)
+
+// DotuInc is
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_noasm.go
new file mode 100644
index 00000000000..0d79b24fc8c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_noasm.go
@@ -0,0 +1,122 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package c64
+
+// AxpyUnitary is
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha complex64, x, y []complex64) {
+	for i, v := range x {
+		y[i] += alpha * v
+	}
+}
+
+// AxpyUnitaryTo is
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) {
+	for i, v := range x {
+		dst[i] = alpha*v + y[i]
+	}
+}
+
+// AxpyInc is
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) {
+	for i := 0; i < int(n); i++ {
+		y[iy] += alpha * x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// AxpyIncTo is
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) {
+	for i := 0; i < int(n); i++ {
+		dst[idst] = alpha*x[ix] + y[iy]
+		ix += incX
+		iy += incY
+		idst += incDst
+	}
+}
+
+// DotcUnitary is
+//
+//	for i, v := range x {
+//		sum += y[i] * conj(v)
+//	}
+//	return sum
+func DotcUnitary(x, y []complex64) (sum complex64) {
+	for i, v := range x {
+		sum += y[i] * conj(v)
+	}
+	return sum
+}
+
+// DotcInc is
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * conj(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) {
+	for i := 0; i < int(n); i++ {
+		sum += y[iy] * conj(x[ix])
+		ix += incX
+		iy += incY
+	}
+	return sum
+}
+
+// DotuUnitary is
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotuUnitary(x, y []complex64) (sum complex64) {
+	for i, v := range x {
+		sum += y[i] * v
+	}
+	return sum
+}
+
+// DotuInc is
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) {
+	for i := 0; i < int(n); i++ {
+		sum += y[iy] * x[ix]
+		ix += incX
+		iy += incY
+	}
+	return sum
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s
new file mode 100644
index 00000000000..c0b84cd81e4
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s
@@ -0,0 +1,73 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyInc(SB), NOSPLIT, $0
+	MOVQ  n+56(FP), CX      // CX = n
+	CMPQ  CX, $0            // if n==0 { return }
+	JLE   axpyi_end
+	MOVQ  x_base+8(FP), SI  // SI = &x
+	MOVQ  y_base+32(FP), DI // DI = &y
+	MOVQ  ix+80(FP), R8     // R8 = ix
+	MOVQ  iy+88(FP), R9     // R9 = iy
+	LEAQ  (SI)(R8*4), SI    // SI = &(x[ix])
+	LEAQ  (DI)(R9*4), DI    // DI = &(y[iy])
+	MOVQ  DI, DX            // DX = DI   Read Pointer for y
+	MOVQ  incX+64(FP), R8   // R8 = incX
+	SHLQ  $2, R8            // R8 *= sizeof(float32)
+	MOVQ  incY+72(FP), R9   // R9 = incY
+	SHLQ  $2, R9            // R9 *= sizeof(float32)
+	MOVSS alpha+0(FP), X0   // X0 = alpha
+	MOVSS X0, X1            // X1 = X0  // for pipelining
+	MOVQ  CX, BX
+	ANDQ  $3, BX            // BX = n % 4
+	SHRQ  $2, CX            // CX = floor( n / 4 )
+	JZ    axpyi_tail_start  // if CX == 0 { goto axpyi_tail_start }
+
+axpyi_loop: // Loop unrolled 4x   do {
+	MOVSS (SI), X2       // X_i = x[i]
+	MOVSS (SI)(R8*1), X3
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
+	MOVSS (SI), X4
+	MOVSS (SI)(R8*1), X5
+	MULSS X1, X2         // X_i *= a
+	MULSS X0, X3
+	MULSS X1, X4
+	MULSS X0, X5
+	ADDSS (DX), X2       // X_i += y[i]
+	ADDSS (DX)(R9*1), X3
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	ADDSS (DX), X4
+	ADDSS (DX)(R9*1), X5
+	MOVSS X2, (DI)       // y[i] = X_i
+	MOVSS X3, (DI)(R9*1)
+	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
+	MOVSS X4, (DI)
+	MOVSS X5, (DI)(R9*1)
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])  // Increment addresses
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
+	LOOP  axpyi_loop     // } while --CX > 0
+	CMPQ  BX, $0         // if BX == 0 { return }
+	JE    axpyi_end
+
+axpyi_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX
+
+axpyi_tail: // do {
+	MOVSS (SI), X2   // X2 = x[i]
+	MULSS X1, X2     // X2 *= a
+	ADDSS (DI), X2   // X2 += y[i]
+	MOVSS X2, (DI)   // y[i] = X2
+	ADDQ  R8, SI     // SI = &(SI[incX])
+	ADDQ  R9, DI     // DI = &(DI[incY])
+	LOOP  axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
+
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s
new file mode 100644
index 00000000000..3f1d2b93300
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s
@@ -0,0 +1,78 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyIncTo(SB), NOSPLIT, $0
+	MOVQ  n+96(FP), CX       // CX = n
+	CMPQ  CX, $0             // if n==0 { return }
+	JLE   axpyi_end
+	MOVQ  dst_base+0(FP), DI // DI = &dst
+	MOVQ  x_base+48(FP), SI  // SI = &x
+	MOVQ  y_base+72(FP), DX  // DX = &y
+	MOVQ  ix+120(FP), R8     // R8 = ix  // Load the first index
+	MOVQ  iy+128(FP), R9     // R9 = iy
+	MOVQ  idst+32(FP), R10   // R10 = idst
+	LEAQ  (SI)(R8*4), SI     // SI = &(x[ix])
+	LEAQ  (DX)(R9*4), DX     // DX = &(y[iy])
+	LEAQ  (DI)(R10*4), DI    // DI = &(dst[idst])
+	MOVQ  incX+104(FP), R8   // R8 = incX
+	SHLQ  $2, R8             // R8 *= sizeof(float32)
+	MOVQ  incY+112(FP), R9   // R9 = incY
+	SHLQ  $2, R9             // R9 *= sizeof(float32)
+	MOVQ  incDst+24(FP), R10 // R10 = incDst
+	SHLQ  $2, R10            // R10 *= sizeof(float32)
+	MOVSS alpha+40(FP), X0   // X0 = alpha
+	MOVSS X0, X1             // X1 = X0  // for pipelining
+	MOVQ  CX, BX
+	ANDQ  $3, BX             // BX = n % 4
+	SHRQ  $2, CX             // CX = floor( n / 4 )
+	JZ    axpyi_tail_start   // if CX == 0 { goto axpyi_tail_start }
+
+axpyi_loop: // Loop unrolled 4x   do {
+	MOVSS (SI), X2        // X_i = x[i]
+	MOVSS (SI)(R8*1), X3
+	LEAQ  (SI)(R8*2), SI  // SI = &(SI[incX*2])
+	MOVSS (SI), X4
+	MOVSS (SI)(R8*1), X5
+	MULSS X1, X2          // X_i *= a
+	MULSS X0, X3
+	MULSS X1, X4
+	MULSS X0, X5
+	ADDSS (DX), X2        // X_i += y[i]
+	ADDSS (DX)(R9*1), X3
+	LEAQ  (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	ADDSS (DX), X4
+	ADDSS (DX)(R9*1), X5
+	MOVSS X2, (DI)        // dst[i] = X_i
+	MOVSS X3, (DI)(R10*1)
+	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	MOVSS X4, (DI)
+	MOVSS X5, (DI)(R10*1)
+	LEAQ  (SI)(R8*2), SI  // SI = &(SI[incX*2])  // Increment addresses
+	LEAQ  (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	LOOP  axpyi_loop      // } while --CX > 0
+	CMPQ  BX, $0          // if BX == 0 { return }
+	JE    axpyi_end
+
+axpyi_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX
+
+axpyi_tail: // do {
+	MOVSS (SI), X2   // X2 = x[i]
+	MULSS X1, X2     // X2 *= a
+	ADDSS (DX), X2   // X2 += y[i]
+	MOVSS X2, (DI)   // dst[i] = X2
+	ADDQ  R8, SI     // SI = &(SI[incX])
+	ADDQ  R9, DX     // DX = &(DX[incY])
+	ADDQ  R10, DI    // DI = &(DI[incDst])
+	LOOP  axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
+
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s
new file mode 100644
index 00000000000..8e24be8100a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s
@@ -0,0 +1,97 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AxpyUnitary(alpha float32, x, y []float32)
+TEXT ·AxpyUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+8(FP), SI  // SI = &x
+	MOVQ    y_base+32(FP), DI // DI = &y
+	MOVQ    x_len+16(FP), BX  // BX = min( len(x), len(y) )
+	CMPQ    y_len+40(FP), BX
+	CMOVQLE y_len+40(FP), BX
+	CMPQ    BX, $0            // if BX == 0 { return }
+	JE      axpy_end
+	MOVSS   alpha+0(FP), X0
+	SHUFPS  $0, X0, X0        // X0 = { a, a, a, a }
+	XORQ    AX, AX            // i = 0
+	PXOR    X2, X2            // 2 NOP instructions (PXOR) to align
+	PXOR    X3, X3            // loop to cache line
+	MOVQ    DI, CX
+	ANDQ    $0xF, CX          // Align on 16-byte boundary for ADDPS
+	JZ      axpy_no_trim      // if CX == 0 { goto axpy_no_trim }
+
+	XORQ $0xF, CX // CX = floor( (16 - DI % 16) / 4 )  (XORQ, INCQ, SHRQ)
+	INCQ CX
+	SHRQ $2, CX
+
+axpy_align: // Trim first value(s) in unaligned buffer  do {
+	MOVSS (SI)(AX*4), X2 // X2 = x[i]
+	MULSS X0, X2         // X2 *= a
+	ADDSS (DI)(AX*4), X2 // X2 += y[i]
+	MOVSS X2, (DI)(AX*4) // y[i] = X2
+	INCQ  AX             // i++
+	DECQ  BX
+	JZ    axpy_end       // if --BX == 0 { return }
+	LOOP  axpy_align     // } while --CX > 0
+
+axpy_no_trim:
+	MOVUPS X0, X1           // Copy X0 to X1 for pipelining
+	MOVQ   BX, CX
+	ANDQ   $0xF, BX         // BX = len % 16
+	SHRQ   $4, CX           // CX = int( len / 16 )
+	JZ     axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
+
+axpy_loop: // Loop unrolled 16x   do {
+	MOVUPS (SI)(AX*4), X2   // X2 = x[i:i+4]
+	MOVUPS 16(SI)(AX*4), X3
+	MOVUPS 32(SI)(AX*4), X4
+	MOVUPS 48(SI)(AX*4), X5
+	MULPS  X0, X2           // X2 *= a
+	MULPS  X1, X3
+	MULPS  X0, X4
+	MULPS  X1, X5
+	ADDPS  (DI)(AX*4), X2   // X2 += y[i:i+4]
+	ADDPS  16(DI)(AX*4), X3
+	ADDPS  32(DI)(AX*4), X4
+	ADDPS  48(DI)(AX*4), X5
+	MOVUPS X2, (DI)(AX*4)   // y[i:i+4] = X2
+	MOVUPS X3, 16(DI)(AX*4)
+	MOVUPS X4, 32(DI)(AX*4)
+	MOVUPS X5, 48(DI)(AX*4)
+	ADDQ   $16, AX          // i += 16
+	LOOP   axpy_loop        // while (--CX) > 0
+	CMPQ   BX, $0           // if BX == 0 { return }
+	JE     axpy_end
+
+axpy_tail4_start: // Reset loop counter for 4-wide tail loop
+	MOVQ BX, CX          // CX = floor( BX / 4 )
+	SHRQ $2, CX
+	JZ   axpy_tail_start // if CX == 0 { goto axpy_tail_start }
+
+axpy_tail4: // Loop unrolled 4x   do {
+	MOVUPS (SI)(AX*4), X2 // X2 = x[i]
+	MULPS  X0, X2         // X2 *= a
+	ADDPS  (DI)(AX*4), X2 // X2 += y[i]
+	MOVUPS X2, (DI)(AX*4) // y[i] = X2
+	ADDQ   $4, AX         // i += 4
+	LOOP   axpy_tail4     // } while --CX > 0
+
+axpy_tail_start: // Reset loop counter for 1-wide tail loop
+	MOVQ BX, CX   // CX = BX % 4
+	ANDQ $3, CX
+	JZ   axpy_end // if CX == 0 { return }
+
+axpy_tail:
+	MOVSS (SI)(AX*4), X1 // X1 = x[i]
+	MULSS X0, X1         // X1 *= a
+	ADDSS (DI)(AX*4), X1 // X1 += y[i]
+	MOVSS X1, (DI)(AX*4) // y[i] = X1
+	INCQ  AX             // i++
+	LOOP  axpy_tail      // } while --CX > 0
+
+axpy_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s
new file mode 100644
index 00000000000..9a68f0f491d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s
@@ -0,0 +1,98 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
+TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
+	MOVQ    dst_base+0(FP), DI // DI = &dst
+	MOVQ    x_base+32(FP), SI  // SI = &x
+	MOVQ    y_base+56(FP), DX  // DX = &y
+	MOVQ    x_len+40(FP), BX   // BX = min( len(x), len(y), len(dst) )
+	CMPQ    y_len+64(FP), BX
+	CMOVQLE y_len+64(FP), BX
+	CMPQ    dst_len+8(FP), BX
+	CMOVQLE dst_len+8(FP), BX
+	CMPQ    BX, $0             // if BX == 0 { return }
+	JE      axpy_end
+	MOVSS   alpha+24(FP), X0
+	SHUFPS  $0, X0, X0         // X0 = { a, a, a, a, }
+	XORQ    AX, AX             // i = 0
+	MOVQ    DX, CX
+	ANDQ    $0xF, CX           // Align on 16-byte boundary for ADDPS
+	JZ      axpy_no_trim       // if CX == 0 { goto axpy_no_trim }
+
+	XORQ $0xF, CX // CX = floor( (16 - DX % 16) / 4 )  (XORQ, INCQ, SHRQ)
+	INCQ CX
+	SHRQ $2, CX
+
+axpy_align: // Trim first value(s) in unaligned buffer  do {
+	MOVSS (SI)(AX*4), X2 // X2 = x[i]
+	MULSS X0, X2         // X2 *= a
+	ADDSS (DX)(AX*4), X2 // X2 += y[i]
+	MOVSS X2, (DI)(AX*4) // dst[i] = X2
+	INCQ  AX             // i++
+	DECQ  BX
+	JZ    axpy_end       // if --BX == 0 { return }
+	LOOP  axpy_align     // } while --CX > 0
+
+axpy_no_trim:
+	MOVUPS X0, X1           // Copy X0 to X1 for pipelining
+	MOVQ   BX, CX
+	ANDQ   $0xF, BX         // BX = len % 16
+	SHRQ   $4, CX           // CX = floor( len / 16 )
+	JZ     axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
+
+axpy_loop: // Loop unrolled 16x  do {
+	MOVUPS (SI)(AX*4), X2   // X2 = x[i:i+4]
+	MOVUPS 16(SI)(AX*4), X3
+	MOVUPS 32(SI)(AX*4), X4
+	MOVUPS 48(SI)(AX*4), X5
+	MULPS  X0, X2           // X2 *= a
+	MULPS  X1, X3
+	MULPS  X0, X4
+	MULPS  X1, X5
+	ADDPS  (DX)(AX*4), X2   // X2 += y[i:i+4]
+	ADDPS  16(DX)(AX*4), X3
+	ADDPS  32(DX)(AX*4), X4
+	ADDPS  48(DX)(AX*4), X5
+	MOVUPS X2, (DI)(AX*4)   // dst[i:i+4] = X2
+	MOVUPS X3, 16(DI)(AX*4)
+	MOVUPS X4, 32(DI)(AX*4)
+	MOVUPS X5, 48(DI)(AX*4)
+	ADDQ   $16, AX          // i += 16
+	LOOP   axpy_loop        // while (--CX) > 0
+	CMPQ   BX, $0           // if BX == 0 { return }
+	JE     axpy_end
+
+axpy_tail4_start: // Reset loop counter for 4-wide tail loop
+	MOVQ BX, CX          // CX = floor( BX / 4 )
+	SHRQ $2, CX
+	JZ   axpy_tail_start // if CX == 0 { goto axpy_tail_start }
+
+axpy_tail4: // Loop unrolled 4x  do {
+	MOVUPS (SI)(AX*4), X2 // X2 = x[i]
+	MULPS  X0, X2         // X2 *= a
+	ADDPS  (DX)(AX*4), X2 // X2 += y[i]
+	MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
+	ADDQ   $4, AX         // i += 4
+	LOOP   axpy_tail4     // } while --CX > 0
+
+axpy_tail_start: // Reset loop counter for 1-wide tail loop
+	MOVQ BX, CX   // CX = BX % 4
+	ANDQ $3, CX
+	JZ   axpy_end // if CX == 0 { return }
+
+axpy_tail:
+	MOVSS (SI)(AX*4), X1 // X1 = x[i]
+	MULSS X0, X1         // X1 *= a
+	ADDSS (DX)(AX*4), X1 // X1 += y[i]
+	MOVSS X1, (DI)(AX*4) // dst[i] = X1
+	INCQ  AX             // i++
+	LOOP  axpy_tail      // } while --CX > 0
+
+axpy_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s
new file mode 100644
index 00000000000..85fcd89eede
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s
@@ -0,0 +1,91 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define INC_X R8
+#define INCx3_X R10
+#define INC_Y R9
+#define INCx3_Y R11
+#define SUM X0
+#define P_SUM X1
+
+// func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
+TEXT ·DdotInc(SB), NOSPLIT, $0
+	MOVQ x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ n+48(FP), LEN        // LEN = n
+	PXOR SUM, SUM             // SUM = 0
+	CMPQ LEN, $0
+	JE   dot_end
+
+	MOVQ ix+72(FP), INC_X        // INC_X = ix
+	MOVQ iy+80(FP), INC_Y        // INC_Y = iy
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
+
+	MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(float32)
+	SHLQ $2, INC_X
+	MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(float32)
+	SHLQ $2, INC_Y
+
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL  // TAIL = LEN % 4
+	SHRQ $2, LEN   // LEN = floor( LEN / 4 )
+	JZ   dot_tail  // if LEN == 0 { goto dot_tail }
+
+	PXOR P_SUM, P_SUM              // P_SUM = 0  for pipelining
+	LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
+	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
+
+dot_loop: // Loop unrolled 4x  do {
+	CVTSS2SD (X_PTR), X2            // X_i = x[i:i+1]
+	CVTSS2SD (X_PTR)(INC_X*1), X3
+	CVTSS2SD (X_PTR)(INC_X*2), X4
+	CVTSS2SD (X_PTR)(INCx3_X*1), X5
+
+	CVTSS2SD (Y_PTR), X6            // X_j = y[i:i+1]
+	CVTSS2SD (Y_PTR)(INC_Y*1), X7
+	CVTSS2SD (Y_PTR)(INC_Y*2), X8
+	CVTSS2SD (Y_PTR)(INCx3_Y*1), X9
+
+	MULSD X6, X2 // X_i *= X_j
+	MULSD X7, X3
+	MULSD X8, X4
+	MULSD X9, X5
+
+	ADDSD X2, SUM   // SUM += X_i
+	ADDSD X3, P_SUM
+	ADDSD X4, SUM
+	ADDSD X5, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
+
+	DECQ LEN
+	JNZ  dot_loop // } while --LEN > 0
+
+	ADDSD P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do {
+	CVTSS2SD (X_PTR), X2  // X2 = x[i]
+	CVTSS2SD (Y_PTR), X3  // X3 = y[i]
+	MULSD    X3, X2       // X2 *= X3
+	ADDSD    X2, SUM      // SUM += X2
+	ADDQ     INC_X, X_PTR // X_PTR += INC_X
+	ADDQ     INC_Y, Y_PTR // Y_PTR += INC_Y
+	DECQ     TAIL
+	JNZ      dot_tail     // } while --TAIL > 0
+
+dot_end:
+	MOVSD SUM, sum+88(FP) // return SUM
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s
new file mode 100644
index 00000000000..87ef09fa39a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s
@@ -0,0 +1,110 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define HADDPD_SUM_SUM    LONG $0xC07C0F66 // @ HADDPD X0, X0
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define IDX AX
+#define SUM X0
+#define P_SUM X1
+
+// func DdotUnitary(x, y []float32) (sum float64)
+TEXT ·DdotUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	PXOR    SUM, SUM             // SUM = 0
+	CMPQ    LEN, $0
+	JE      dot_end
+
+	XORQ IDX, IDX
+	MOVQ Y_PTR, DX
+	ANDQ $0xF, DX    // Align on 16-byte boundary for ADDPS
+	JZ   dot_no_trim // if DX == 0 { goto dot_no_trim }
+
+	SUBQ $16, DX
+
+dot_align: // Trim first value(s) in unaligned buffer  do {
+	CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
+	CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
+	MULSD    X3, X2
+	ADDSD    X2, SUM            // SUM += X2
+	INCQ     IDX                // IDX++
+	DECQ     LEN
+	JZ       dot_end            // if --LEN == 0 { return }
+	ADDQ     $4, DX
+	JNZ      dot_align          // } while (DX += 4) != 0
+
+dot_no_trim:
+	PXOR P_SUM, P_SUM   // P_SUM = 0  for pipelining
+	MOVQ LEN, TAIL
+	ANDQ $0x7, TAIL     // TAIL = LEN % 8
+	SHRQ $3, LEN        // LEN = floor( LEN / 8 )
+	JZ   dot_tail_start // if LEN == 0 { goto dot_tail_start }
+
+dot_loop: // Loop unrolled 8x  do {
+	CVTPS2PD (X_PTR)(IDX*4), X2   // X_i = x[i:i+1]
+	CVTPS2PD 8(X_PTR)(IDX*4), X3
+	CVTPS2PD 16(X_PTR)(IDX*4), X4
+	CVTPS2PD 24(X_PTR)(IDX*4), X5
+
+	CVTPS2PD (Y_PTR)(IDX*4), X6   // X_j = y[i:i+1]
+	CVTPS2PD 8(Y_PTR)(IDX*4), X7
+	CVTPS2PD 16(Y_PTR)(IDX*4), X8
+	CVTPS2PD 24(Y_PTR)(IDX*4), X9
+
+	MULPD X6, X2 // X_i *= X_j
+	MULPD X7, X3
+	MULPD X8, X4
+	MULPD X9, X5
+
+	ADDPD X2, SUM   // SUM += X_i
+	ADDPD X3, P_SUM
+	ADDPD X4, SUM
+	ADDPD X5, P_SUM
+
+	ADDQ $8, IDX  // IDX += 8
+	DECQ LEN
+	JNZ  dot_loop // } while --LEN > 0
+
+	ADDPD P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail_start:
+	MOVQ TAIL, LEN
+	SHRQ $1, LEN
+	JZ   dot_tail_one
+
+dot_tail_two:
+	CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = x[i:i+1]
+	CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = y[i:i+1]
+	MULPD    X6, X2             // X_i *= X_j
+	ADDPD    X2, SUM            // SUM += X_i
+	ADDQ     $2, IDX            // IDX += 2
+	DECQ     LEN
+	JNZ      dot_tail_two       // } while --LEN > 0
+
+	ANDQ $1, TAIL
+	JZ   dot_end
+
+dot_tail_one:
+	CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
+	CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
+	MULSD    X3, X2             // X2 *= X3
+	ADDSD    X2, SUM            // SUM += X2
+
+dot_end:
+	HADDPD_SUM_SUM        // SUM = \sum{ SUM[i] }
+	MOVSD SUM, sum+48(FP) // return SUM
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go
new file mode 100644
index 00000000000..408847a698e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package f32 provides float32 vector primitives.
+package f32 // import "gonum.org/v1/gonum/internal/asm/f32"
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s
new file mode 100644
index 00000000000..9ac8063691b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s
@@ -0,0 +1,85 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define INC_X R8
+#define INCx3_X R10
+#define INC_Y R9
+#define INCx3_Y R11
+#define SUM X0
+#define P_SUM X1
+
+// func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
+TEXT ·DotInc(SB), NOSPLIT, $0
+	MOVQ x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
+	PXOR SUM, SUM             // SUM = 0
+	MOVQ n+48(FP), LEN        // LEN = n
+	CMPQ LEN, $0
+	JE   dot_end
+
+	MOVQ ix+72(FP), INC_X        // INC_X = ix
+	MOVQ iy+80(FP), INC_Y        // INC_Y = iy
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
+
+	MOVQ incX+56(FP), INC_X // INC_X := incX * sizeof(float32)
+	SHLQ $2, INC_X
+	MOVQ incY+64(FP), INC_Y // INC_Y := incY * sizeof(float32)
+	SHLQ $2, INC_Y
+
+	MOVQ LEN, TAIL
+	ANDQ $0x3, TAIL // TAIL = LEN % 4
+	SHRQ $2, LEN    // LEN = floor( LEN / 4 )
+	JZ   dot_tail   // if LEN == 0 { goto dot_tail }
+
+	PXOR P_SUM, P_SUM              // P_SUM = 0  for pipelining
+	LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
+	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
+
+dot_loop: // Loop unrolled 4x  do {
+	MOVSS (X_PTR), X2            // X_i = x[i:i+1]
+	MOVSS (X_PTR)(INC_X*1), X3
+	MOVSS (X_PTR)(INC_X*2), X4
+	MOVSS (X_PTR)(INCx3_X*1), X5
+
+	MULSS (Y_PTR), X2            // X_i *= y[i:i+1]
+	MULSS (Y_PTR)(INC_Y*1), X3
+	MULSS (Y_PTR)(INC_Y*2), X4
+	MULSS (Y_PTR)(INCx3_Y*1), X5
+
+	ADDSS X2, SUM   // SUM += X_i
+	ADDSS X3, P_SUM
+	ADDSS X4, SUM
+	ADDSS X5, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
+
+	DECQ LEN
+	JNZ  dot_loop // } while --LEN > 0
+
+	ADDSS P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do {
+	MOVSS (X_PTR), X2  // X2 = x[i]
+	MULSS (Y_PTR), X2  // X2 *= y[i]
+	ADDSS X2, SUM      // SUM += X2
+	ADDQ  INC_X, X_PTR // X_PTR += INC_X
+	ADDQ  INC_Y, Y_PTR // Y_PTR += INC_Y
+	DECQ  TAIL
+	JNZ   dot_tail     // } while --TAIL > 0
+
+dot_end:
+	MOVSS SUM, sum+88(FP) // return SUM
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s
new file mode 100644
index 00000000000..0023a6e244e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s
@@ -0,0 +1,106 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define HADDPS_SUM_SUM    LONG $0xC07C0FF2 // @ HADDPS X0, X0
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define IDX AX
+#define SUM X0
+#define P_SUM X1
+
+// func DotUnitary(x, y []float32) (sum float32)
+TEXT ·DotUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	PXOR    SUM, SUM             // SUM = 0
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	CMPQ    LEN, $0              // if LEN == 0 { return 0 }
+	JE      dot_end
+
+	XORQ IDX, IDX    // IDX = 0
+	MOVQ Y_PTR, DX
+	ANDQ $0xF, DX    // Align on 16-byte boundary for MULPS
+	JZ   dot_no_trim // if DX == 0 { goto dot_no_trim }
+	SUBQ $16, DX     // DX = ( &y % 16 ) - 16, negative byte distance to the boundary
+
+dot_align: // Trim first value(s) in unaligned buffer  do {
+	MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
+	MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
+	ADDSS X2, SUM            // SUM += X2
+	INCQ  IDX                // IDX++
+	DECQ  LEN
+	JZ    dot_end            // if --LEN == 0 { return }
+	ADDQ  $4, DX
+	JNZ   dot_align          // } while ( DX += 4 ) != 0
+
+dot_no_trim:
+	PXOR P_SUM, P_SUM    // P_SUM = 0  for pipelining
+	MOVQ LEN, TAIL
+	ANDQ $0xF, TAIL      // TAIL = LEN % 16
+	SHRQ $4, LEN         // LEN = floor( LEN / 16 )
+	JZ   dot_tail4_start // if LEN == 0 { goto dot_tail4_start }
+
+dot_loop: // Loop unrolled 16x  do {
+	MOVUPS (X_PTR)(IDX*4), X2   // X_i = x[i:i+4]
+	MOVUPS 16(X_PTR)(IDX*4), X3
+	MOVUPS 32(X_PTR)(IDX*4), X4
+	MOVUPS 48(X_PTR)(IDX*4), X5
+
+	MULPS (Y_PTR)(IDX*4), X2   // X_i *= y[i:i+4]
+	MULPS 16(Y_PTR)(IDX*4), X3
+	MULPS 32(Y_PTR)(IDX*4), X4
+	MULPS 48(Y_PTR)(IDX*4), X5
+
+	ADDPS X2, SUM   // SUM += X_i
+	ADDPS X3, P_SUM
+	ADDPS X4, SUM
+	ADDPS X5, P_SUM
+
+	ADDQ $16, IDX // IDX += 16
+	DECQ LEN
+	JNZ  dot_loop // } while --LEN > 0
+
+	ADDPS P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail4_start: // Reset loop counter for 4-wide tail loop
+	MOVQ TAIL, LEN      // LEN = floor( TAIL / 4 )
+	SHRQ $2, LEN
+	JZ   dot_tail_start // if LEN == 0 { goto dot_tail_start }
+
+dot_tail4_loop: // Loop unrolled 4x  do {
+	MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+4]
+	MULPS  (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+4]
+	ADDPS  X2, SUM            // SUM += X_i
+	ADDQ   $4, IDX            // i += 4
+	DECQ   LEN
+	JNZ    dot_tail4_loop     // } while --LEN > 0
+
+dot_tail_start: // Reset loop counter for 1-wide tail loop
+	ANDQ $3, TAIL // TAIL = TAIL % 4
+	JZ   dot_end  // if TAIL == 0 { return }
+
+dot_tail: // do {
+	MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
+	MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
+	ADDSS X2, SUM            // SUM += X2 ( lane 0 only )
+	INCQ  IDX                // IDX++
+	DECQ  TAIL
+	JNZ   dot_tail           // } while --TAIL > 0
+
+dot_end:
+	HADDPS_SUM_SUM        // SUM = \sum{ SUM[i] }
+	HADDPS_SUM_SUM
+	MOVSS SUM, sum+48(FP) // return SUM
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go
new file mode 100644
index 00000000000..72acba2077d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go
@@ -0,0 +1,18 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package f32
+
+// Ger performs the rank-one operation
+//
+//	A += alpha * x * yᵀ
+//
+// where A is an m×n dense matrix, x and y are vectors (strides incX, incY), and alpha is a scalar.
+func Ger(m, n uintptr, alpha float32,
+	x []float32, incX uintptr,
+	y []float32, incY uintptr,
+	a []float32, lda uintptr)
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s
new file mode 100644
index 00000000000..f8fd3df8627
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s
@@ -0,0 +1,757 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SIZE 4
+#define BITSIZE 2
+#define KERNELSIZE 3
+
+#define M_DIM m+0(FP)
+#define M CX
+#define N_DIM n+8(FP)
+#define N BX
+
+#define TMP1 R14
+#define TMP2 R15
+
+#define X_PTR SI
+#define Y y_base+56(FP)
+#define Y_PTR DX
+#define A_ROW AX
+#define A_PTR DI
+
+#define INC_X R8
+#define INC3_X R9
+
+#define INC_Y R10
+#define INC3_Y R11
+
+#define LDA R12
+#define LDA3 R13
+
+#define ALPHA X0
+#define ALPHA_SPILL al-16(SP)
+
+#define LOAD_ALPHA \
+	MOVSS  alpha+16(FP), ALPHA \
+	SHUFPS $0, ALPHA, ALPHA
+
+#define LOAD_SCALED4 \
+	PREFETCHNTA 16*SIZE(X_PTR)    \
+	MOVDDUP     (X_PTR), X1       \
+	MOVDDUP     2*SIZE(X_PTR), X3 \
+	MOVSHDUP    X1, X2            \
+	MOVSHDUP    X3, X4            \
+	MOVSLDUP    X1, X1            \
+	MOVSLDUP    X3, X3            \
+	MULPS       ALPHA, X1         \
+	MULPS       ALPHA, X2         \
+	MULPS       ALPHA, X3         \
+	MULPS       ALPHA, X4
+
+#define LOAD_SCALED2 \
+	MOVDDUP  (X_PTR), X1 \
+	MOVSHDUP X1, X2      \
+	MOVSLDUP X1, X1      \
+	MULPS    ALPHA, X1   \
+	MULPS    ALPHA, X2
+
+#define LOAD_SCALED1 \
+	MOVSS  (X_PTR), X1 \
+	SHUFPS $0, X1, X1  \
+	MULPS  ALPHA, X1
+
+#define LOAD_SCALED4_INC \
+	PREFETCHNTA (X_PTR)(INC_X*8)      \
+	MOVSS       (X_PTR), X1           \
+	MOVSS       (X_PTR)(INC_X*1), X2  \
+	MOVSS       (X_PTR)(INC_X*2), X3  \
+	MOVSS       (X_PTR)(INC3_X*1), X4 \
+	SHUFPS      $0, X1, X1            \
+	SHUFPS      $0, X2, X2            \
+	SHUFPS      $0, X3, X3            \
+	SHUFPS      $0, X4, X4            \
+	MULPS       ALPHA, X1             \
+	MULPS       ALPHA, X2             \
+	MULPS       ALPHA, X3             \
+	MULPS       ALPHA, X4
+
+#define LOAD_SCALED2_INC \
+	MOVSS  (X_PTR), X1          \
+	MOVSS  (X_PTR)(INC_X*1), X2 \
+	SHUFPS $0, X1, X1           \
+	SHUFPS $0, X2, X2           \
+	MULPS  ALPHA, X1            \
+	MULPS  ALPHA, X2
+
+#define KERNEL_LOAD8 \
+	MOVUPS (Y_PTR), X5       \
+	MOVUPS 4*SIZE(Y_PTR), X6
+
+#define KERNEL_LOAD8_INC \
+	MOVSS    (Y_PTR), X5             \
+	MOVSS    (Y_PTR)(INC_Y*1), X6    \
+	MOVSS    (Y_PTR)(INC_Y*2), X7    \
+	MOVSS    (Y_PTR)(INC3_Y*1), X8   \
+	UNPCKLPS X6, X5                  \
+	UNPCKLPS X8, X7                  \
+	MOVLHPS  X7, X5                  \
+	LEAQ     (Y_PTR)(INC_Y*4), Y_PTR \
+	MOVSS    (Y_PTR), X6             \
+	MOVSS    (Y_PTR)(INC_Y*1), X7    \
+	MOVSS    (Y_PTR)(INC_Y*2), X8    \
+	MOVSS    (Y_PTR)(INC3_Y*1), X9   \
+	UNPCKLPS X7, X6                  \
+	UNPCKLPS X9, X8                  \
+	MOVLHPS  X8, X6
+
+#define KERNEL_LOAD4 \
+	MOVUPS (Y_PTR), X5
+
+#define KERNEL_LOAD4_INC \
+	MOVSS    (Y_PTR), X5           \
+	MOVSS    (Y_PTR)(INC_Y*1), X6  \
+	MOVSS    (Y_PTR)(INC_Y*2), X7  \
+	MOVSS    (Y_PTR)(INC3_Y*1), X8 \
+	UNPCKLPS X6, X5                \
+	UNPCKLPS X8, X7                \
+	MOVLHPS  X7, X5
+
+#define KERNEL_LOAD2 \
+	MOVSD (Y_PTR), X5
+
+#define KERNEL_LOAD2_INC \
+	MOVSS    (Y_PTR), X5          \
+	MOVSS    (Y_PTR)(INC_Y*1), X6 \
+	UNPCKLPS X6, X5
+
+#define KERNEL_4x8 \
+	MOVUPS X5, X7  \
+	MOVUPS X6, X8  \
+	MOVUPS X5, X9  \
+	MOVUPS X6, X10 \
+	MOVUPS X5, X11 \
+	MOVUPS X6, X12 \
+	MULPS  X1, X5  \
+	MULPS  X1, X6  \
+	MULPS  X2, X7  \
+	MULPS  X2, X8  \
+	MULPS  X3, X9  \
+	MULPS  X3, X10 \
+	MULPS  X4, X11 \
+	MULPS  X4, X12
+
+#define STORE_4x8 \
+	MOVUPS ALPHA, ALPHA_SPILL         \
+	MOVUPS (A_PTR), X13               \
+	ADDPS  X13, X5                    \
+	MOVUPS 4*SIZE(A_PTR), X14         \
+	ADDPS  X14, X6                    \
+	MOVUPS (A_PTR)(LDA*1), X15        \
+	ADDPS  X15, X7                    \
+	MOVUPS 4*SIZE(A_PTR)(LDA*1), X0   \
+	ADDPS  X0, X8                     \
+	MOVUPS (A_PTR)(LDA*2), X13        \
+	ADDPS  X13, X9                    \
+	MOVUPS 4*SIZE(A_PTR)(LDA*2), X14  \
+	ADDPS  X14, X10                   \
+	MOVUPS (A_PTR)(LDA3*1), X15       \
+	ADDPS  X15, X11                   \
+	MOVUPS 4*SIZE(A_PTR)(LDA3*1), X0  \
+	ADDPS  X0, X12                    \
+	MOVUPS X5, (A_PTR)                \
+	MOVUPS X6, 4*SIZE(A_PTR)          \
+	MOVUPS X7, (A_PTR)(LDA*1)         \
+	MOVUPS X8, 4*SIZE(A_PTR)(LDA*1)   \
+	MOVUPS X9, (A_PTR)(LDA*2)         \
+	MOVUPS X10, 4*SIZE(A_PTR)(LDA*2)  \
+	MOVUPS X11, (A_PTR)(LDA3*1)       \
+	MOVUPS X12, 4*SIZE(A_PTR)(LDA3*1) \
+	MOVUPS ALPHA_SPILL, ALPHA         \
+	ADDQ   $8*SIZE, A_PTR
+
+#define KERNEL_4x4 \
+	MOVUPS X5, X6 \
+	MOVUPS X5, X7 \
+	MOVUPS X5, X8 \
+	MULPS  X1, X5 \
+	MULPS  X2, X6 \
+	MULPS  X3, X7 \
+	MULPS  X4, X8
+
+#define STORE_4x4 \
+	MOVUPS (A_PTR), X13         \
+	ADDPS  X13, X5              \
+	MOVUPS (A_PTR)(LDA*1), X14  \
+	ADDPS  X14, X6              \
+	MOVUPS (A_PTR)(LDA*2), X15  \
+	ADDPS  X15, X7              \
+	MOVUPS (A_PTR)(LDA3*1), X13 \
+	ADDPS  X13, X8              \
+	MOVUPS X5, (A_PTR)          \
+	MOVUPS X6, (A_PTR)(LDA*1)   \
+	MOVUPS X7, (A_PTR)(LDA*2)   \
+	MOVUPS X8, (A_PTR)(LDA3*1)  \
+	ADDQ   $4*SIZE, A_PTR
+
+#define KERNEL_4x2 \
+	MOVUPS X5, X6 \
+	MOVUPS X5, X7 \
+	MOVUPS X5, X8 \
+	MULPS  X1, X5 \
+	MULPS  X2, X6 \
+	MULPS  X3, X7 \
+	MULPS  X4, X8
+
+#define STORE_4x2 \
+	MOVSD (A_PTR), X9          \
+	ADDPS X9, X5               \
+	MOVSD (A_PTR)(LDA*1), X10  \
+	ADDPS X10, X6              \
+	MOVSD (A_PTR)(LDA*2), X11  \
+	ADDPS X11, X7              \
+	MOVSD (A_PTR)(LDA3*1), X12 \
+	ADDPS X12, X8              \
+	MOVSD X5, (A_PTR)          \
+	MOVSD X6, (A_PTR)(LDA*1)   \
+	MOVSD X7, (A_PTR)(LDA*2)   \
+	MOVSD X8, (A_PTR)(LDA3*1)  \
+	ADDQ  $2*SIZE, A_PTR
+
+#define KERNEL_4x1 \
+	MOVSS (Y_PTR), X5 \
+	MOVSS X5, X6      \
+	MOVSS X5, X7      \
+	MOVSS X5, X8      \
+	MULSS X1, X5      \
+	MULSS X2, X6      \
+	MULSS X3, X7      \
+	MULSS X4, X8
+
+#define STORE_4x1 \
+	ADDSS (A_PTR), X5         \
+	ADDSS (A_PTR)(LDA*1), X6  \
+	ADDSS (A_PTR)(LDA*2), X7  \
+	ADDSS (A_PTR)(LDA3*1), X8 \
+	MOVSS X5, (A_PTR)         \
+	MOVSS X6, (A_PTR)(LDA*1)  \
+	MOVSS X7, (A_PTR)(LDA*2)  \
+	MOVSS X8, (A_PTR)(LDA3*1) \
+	ADDQ  $SIZE, A_PTR
+
+#define KERNEL_2x8 \
+	MOVUPS X5, X7 \
+	MOVUPS X6, X8 \
+	MULPS  X1, X5 \
+	MULPS  X1, X6 \
+	MULPS  X2, X7 \
+	MULPS  X2, X8
+
+#define STORE_2x8 \
+	MOVUPS (A_PTR), X9               \
+	ADDPS  X9, X5                    \
+	MOVUPS 4*SIZE(A_PTR), X10        \
+	ADDPS  X10, X6                   \
+	MOVUPS (A_PTR)(LDA*1), X11       \
+	ADDPS  X11, X7                   \
+	MOVUPS 4*SIZE(A_PTR)(LDA*1), X12 \
+	ADDPS  X12, X8                   \
+	MOVUPS X5, (A_PTR)               \
+	MOVUPS X6, 4*SIZE(A_PTR)         \
+	MOVUPS X7, (A_PTR)(LDA*1)        \
+	MOVUPS X8, 4*SIZE(A_PTR)(LDA*1)  \
+	ADDQ   $8*SIZE, A_PTR
+
+#define KERNEL_2x4 \
+	MOVUPS X5, X6 \
+	MULPS  X1, X5 \
+	MULPS  X2, X6
+
+#define STORE_2x4 \
+	MOVUPS (A_PTR), X9         \
+	ADDPS  X9, X5              \
+	MOVUPS (A_PTR)(LDA*1), X11 \
+	ADDPS  X11, X6             \
+	MOVUPS X5, (A_PTR)         \
+	MOVUPS X6, (A_PTR)(LDA*1)  \
+	ADDQ   $4*SIZE, A_PTR
+
+#define KERNEL_2x2 \
+	MOVSD X5, X6 \
+	MULPS X1, X5 \
+	MULPS X2, X6
+
+#define STORE_2x2 \
+	MOVSD (A_PTR), X7        \
+	ADDPS X7, X5             \
+	MOVSD (A_PTR)(LDA*1), X8 \
+	ADDPS X8, X6             \
+	MOVSD X5, (A_PTR)        \
+	MOVSD X6, (A_PTR)(LDA*1) \
+	ADDQ  $2*SIZE, A_PTR
+
+#define KERNEL_2x1 \
+	MOVSS (Y_PTR), X5 \
+	MOVSS X5, X6      \
+	MULSS X1, X5      \
+	MULSS X2, X6
+
+#define STORE_2x1 \
+	ADDSS (A_PTR), X5        \
+	ADDSS (A_PTR)(LDA*1), X6 \
+	MOVSS X5, (A_PTR)        \
+	MOVSS X6, (A_PTR)(LDA*1) \
+	ADDQ  $SIZE, A_PTR
+
+#define KERNEL_1x8 \
+	MULPS X1, X5 \
+	MULPS X1, X6
+
+#define STORE_1x8 \
+	MOVUPS (A_PTR), X7       \
+	ADDPS  X7, X5            \
+	MOVUPS 4*SIZE(A_PTR), X8 \
+	ADDPS  X8, X6            \
+	MOVUPS X5, (A_PTR)       \
+	MOVUPS X6, 4*SIZE(A_PTR) \
+	ADDQ   $8*SIZE, A_PTR
+
+#define KERNEL_1x4 \
+	MULPS X1, X5 \
+	MULPS X1, X6
+
+#define STORE_1x4 \
+	MOVUPS (A_PTR), X7    \
+	ADDPS  X7, X5         \
+	MOVUPS X5, (A_PTR)    \
+	ADDQ   $4*SIZE, A_PTR
+
+#define KERNEL_1x2 \
+	MULPS X1, X5
+
+#define STORE_1x2 \
+	MOVSD (A_PTR), X6    \
+	ADDPS X6, X5         \
+	MOVSD X5, (A_PTR)    \
+	ADDQ  $2*SIZE, A_PTR
+
+#define KERNEL_1x1 \
+	MOVSS (Y_PTR), X5 \
+	MULSS X1, X5
+
+#define STORE_1x1 \
+	ADDSS (A_PTR), X5  \
+	MOVSS X5, (A_PTR)  \
+	ADDQ  $SIZE, A_PTR
+
+// func Ger(m, n uintptr, alpha float32,
+//	x []float32, incX uintptr,
+//	y []float32, incY uintptr,
+//	a []float32, lda uintptr)
+TEXT ·Ger(SB), 0, $16-120
+	MOVQ M_DIM, M
+	MOVQ N_DIM, N
+	CMPQ M, $0 // if m == 0 { return }
+	JE   end
+	CMPQ N, $0 // if n == 0 { return }
+	JE   end
+
+	LOAD_ALPHA
+
+	MOVQ x_base+24(FP), X_PTR
+	MOVQ y_base+56(FP), Y_PTR
+	MOVQ a_base+88(FP), A_ROW
+	MOVQ A_ROW, A_PTR
+	MOVQ lda+112(FP), LDA     // LDA = LDA * sizeof(float32)
+	SHLQ $BITSIZE, LDA
+	LEAQ (LDA)(LDA*2), LDA3   // LDA3 = LDA * 3
+
+	CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path)
+	JNE  inc
+	CMPQ incX+48(FP), $1 // Check for dense vector X (fast-path)
+	JNE  inc
+
+	SHRQ $2, M // M = floor( m / 4 )
+	JZ   r2
+
+r4:
+
+	// LOAD 4
+	LOAD_SCALED4
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N // N = floor( n / 8 )
+	JZ   r4c4
+
+r4c8:
+	// 4x8 KERNEL
+	KERNEL_LOAD8
+	KERNEL_4x8
+	STORE_4x8
+
+	ADDQ $8*SIZE, Y_PTR
+
+	DECQ N
+	JNZ  r4c8
+
+r4c4:
+	TESTQ $4, N_DIM
+	JZ    r4c2
+
+	// 4x4 KERNEL
+	KERNEL_LOAD4
+	KERNEL_4x4
+	STORE_4x4
+
+	ADDQ $4*SIZE, Y_PTR
+
+r4c2:
+	TESTQ $2, N_DIM
+	JZ    r4c1
+
+	// 4x2 KERNEL
+	KERNEL_LOAD2
+	KERNEL_4x2
+	STORE_4x2
+
+	ADDQ $2*SIZE, Y_PTR
+
+r4c1:
+	TESTQ $1, N_DIM
+	JZ    r4end
+
+	// 4x1 KERNEL
+	KERNEL_4x1
+	STORE_4x1
+
+	ADDQ $SIZE, Y_PTR
+
+r4end:
+	ADDQ $4*SIZE, X_PTR
+	MOVQ Y, Y_PTR // Y_PTR = &y, rewind for the next rows
+	LEAQ (A_ROW)(LDA*4), A_ROW
+	MOVQ A_ROW, A_PTR
+
+	DECQ M
+	JNZ  r4
+
+r2:
+	TESTQ $2, M_DIM
+	JZ    r1
+
+	// LOAD 2
+	LOAD_SCALED2
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N // N = floor( n / 8 )
+	JZ   r2c4
+
+r2c8:
+	// 2x8 KERNEL
+	KERNEL_LOAD8
+	KERNEL_2x8
+	STORE_2x8
+
+	ADDQ $8*SIZE, Y_PTR
+
+	DECQ N
+	JNZ  r2c8
+
+r2c4:
+	TESTQ $4, N_DIM
+	JZ    r2c2
+
+	// 2x4 KERNEL
+	KERNEL_LOAD4
+	KERNEL_2x4
+	STORE_2x4
+
+	ADDQ $4*SIZE, Y_PTR
+
+r2c2:
+	TESTQ $2, N_DIM
+	JZ    r2c1
+
+	// 2x2 KERNEL
+	KERNEL_LOAD2
+	KERNEL_2x2
+	STORE_2x2
+
+	ADDQ $2*SIZE, Y_PTR
+
+r2c1:
+	TESTQ $1, N_DIM
+	JZ    r2end
+
+	// 2x1 KERNEL
+	KERNEL_2x1
+	STORE_2x1
+
+	ADDQ $SIZE, Y_PTR
+
+r2end:
+	ADDQ $2*SIZE, X_PTR
+	MOVQ Y, Y_PTR // Y_PTR = &y, rewind for the next row
+	LEAQ (A_ROW)(LDA*2), A_ROW
+	MOVQ A_ROW, A_PTR
+
+r1:
+	TESTQ $1, M_DIM
+	JZ    end
+
+	// LOAD 1
+	LOAD_SCALED1
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N // N = floor( n / 8 )
+	JZ   r1c4
+
+r1c8:
+	// 1x8 KERNEL
+	KERNEL_LOAD8
+	KERNEL_1x8
+	STORE_1x8
+
+	ADDQ $8*SIZE, Y_PTR
+
+	DECQ N
+	JNZ  r1c8
+
+r1c4:
+	TESTQ $4, N_DIM
+	JZ    r1c2
+
+	// 1x4 KERNEL
+	KERNEL_LOAD4
+	KERNEL_1x4
+	STORE_1x4
+
+	ADDQ $4*SIZE, Y_PTR
+
+r1c2:
+	TESTQ $2, N_DIM
+	JZ    r1c1
+
+	// 1x2 KERNEL
+	KERNEL_LOAD2
+	KERNEL_1x2
+	STORE_1x2
+
+	ADDQ $2*SIZE, Y_PTR
+
+r1c1:
+	TESTQ $1, N_DIM
+	JZ    end
+
+	// 1x1 KERNEL
+	KERNEL_1x1
+	STORE_1x1
+
+end:
+	RET
+
+inc:  // Algorithm for incX != 1 or incY != 1 ( split loads in kernel )
+
+	MOVQ incX+48(FP), INC_X       // INC_X = incX * sizeof(float32)
+	SHLQ $BITSIZE, INC_X
+	MOVQ incY+80(FP), INC_Y       // INC_Y = incY * sizeof(float32)
+	SHLQ $BITSIZE, INC_Y
+	LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3
+	LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3
+
+	XORQ    TMP2, TMP2                // TMP2 = 0
+	MOVQ    M, TMP1                   // TMP1 = -( m - 1 ) * INC_X
+	SUBQ    $1, TMP1
+	IMULQ   INC_X, TMP1
+	NEGQ    TMP1
+	CMPQ    INC_X, $0                 // if INC_X < 0 { TMP2 = TMP1 }
+	CMOVQLT TMP1, TMP2
+	LEAQ    (X_PTR)(TMP2*SIZE), X_PTR // NOTE(review): INC_X is already in bytes, so *SIZE scales twice; confirm for incX < 0
+
+	XORQ    TMP2, TMP2                // TMP2 = 0
+	MOVQ    N, TMP1                   // TMP1 = -( n - 1 ) * INC_Y
+	SUBQ    $1, TMP1
+	IMULQ   INC_Y, TMP1
+	NEGQ    TMP1
+	CMPQ    INC_Y, $0                 // if INC_Y < 0 { TMP2 = TMP1 }
+	CMOVQLT TMP1, TMP2
+	LEAQ    (Y_PTR)(TMP2*SIZE), Y_PTR // NOTE(review): same double scaling; "MOVQ Y, Y_PTR" below also drops this offset
+
+	SHRQ $2, M // M = floor( m / 4 )
+	JZ   inc_r2
+
+inc_r4:
+	// LOAD 4
+	LOAD_SCALED4_INC
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N // N = floor( n / 8 )
+	JZ   inc_r4c4
+
+inc_r4c8:
+	// 4x8 KERNEL
+	KERNEL_LOAD8_INC
+	KERNEL_4x8
+	STORE_4x8
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // KERNEL_LOAD8_INC already advanced 4 elements; skip the other 4
+	DECQ N
+	JNZ  inc_r4c8
+
+inc_r4c4:
+	TESTQ $4, N_DIM
+	JZ    inc_r4c2
+
+	// 4x4 KERNEL
+	KERNEL_LOAD4_INC
+	KERNEL_4x4
+	STORE_4x4
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
+
+inc_r4c2:
+	TESTQ $2, N_DIM
+	JZ    inc_r4c1
+
+	// 4x2 KERNEL
+	KERNEL_LOAD2_INC
+	KERNEL_4x2
+	STORE_4x2
+
+	LEAQ (Y_PTR)(INC_Y*2), Y_PTR
+
+inc_r4c1:
+	TESTQ $1, N_DIM
+	JZ    inc_r4end
+
+	// 4x1 KERNEL
+	KERNEL_4x1
+	STORE_4x1
+
+	ADDQ INC_Y, Y_PTR
+
+inc_r4end:
+	LEAQ (X_PTR)(INC_X*4), X_PTR
+	MOVQ Y, Y_PTR // Y_PTR = &y, rewind for the next rows
+	LEAQ (A_ROW)(LDA*4), A_ROW
+	MOVQ A_ROW, A_PTR
+
+	DECQ M
+	JNZ  inc_r4
+
+inc_r2:
+	TESTQ $2, M_DIM
+	JZ    inc_r1
+
+	// LOAD 2
+	LOAD_SCALED2_INC
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N // N = floor( n / 8 )
+	JZ   inc_r2c4
+
+inc_r2c8:
+	// 2x8 KERNEL
+	KERNEL_LOAD8_INC
+	KERNEL_2x8
+	STORE_2x8
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // KERNEL_LOAD8_INC already advanced 4 elements; skip the other 4
+	DECQ N
+	JNZ  inc_r2c8
+
+inc_r2c4:
+	TESTQ $4, N_DIM
+	JZ    inc_r2c2
+
+	// 2x4 KERNEL
+	KERNEL_LOAD4_INC
+	KERNEL_2x4
+	STORE_2x4
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
+
+inc_r2c2:
+	TESTQ $2, N_DIM
+	JZ    inc_r2c1
+
+	// 2x2 KERNEL
+	KERNEL_LOAD2_INC
+	KERNEL_2x2
+	STORE_2x2
+
+	LEAQ (Y_PTR)(INC_Y*2), Y_PTR
+
+inc_r2c1:
+	TESTQ $1, N_DIM
+	JZ    inc_r2end
+
+	// 2x1 KERNEL
+	KERNEL_2x1
+	STORE_2x1
+
+	ADDQ INC_Y, Y_PTR
+
+inc_r2end:
+	LEAQ (X_PTR)(INC_X*2), X_PTR
+	MOVQ Y, Y_PTR // Y_PTR = &y, rewind for the next row
+	LEAQ (A_ROW)(LDA*2), A_ROW
+	MOVQ A_ROW, A_PTR
+
+inc_r1:
+	TESTQ $1, M_DIM
+	JZ    end
+
+	// LOAD 1
+	LOAD_SCALED1
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N // N = floor( n / 8 )
+	JZ   inc_r1c4
+
+inc_r1c8:
+	// 1x8 KERNEL
+	KERNEL_LOAD8_INC
+	KERNEL_1x8
+	STORE_1x8
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // KERNEL_LOAD8_INC already advanced 4 elements; skip the other 4
+	DECQ N
+	JNZ  inc_r1c8
+
+inc_r1c4:
+	TESTQ $4, N_DIM
+	JZ    inc_r1c2
+
+	// 1x4 KERNEL
+	KERNEL_LOAD4_INC
+	KERNEL_1x4
+	STORE_1x4
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
+
+inc_r1c2:
+	TESTQ $2, N_DIM
+	JZ    inc_r1c1
+
+	// 1x2 KERNEL
+	KERNEL_LOAD2_INC
+	KERNEL_1x2
+	STORE_1x2
+
+	LEAQ (Y_PTR)(INC_Y*2), Y_PTR
+
+inc_r1c1:
+	TESTQ $1, N_DIM
+	JZ    inc_end
+
+	// 1x1 KERNEL
+	KERNEL_1x1
+	STORE_1x1
+
+inc_end:
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go
new file mode 100644
index 00000000000..61ee6f1802a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go
@@ -0,0 +1,39 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package f32
+
+// Ger applies the rank-one update
+//
+//	A += alpha * x * yᵀ
+//
+// to the m×n matrix held in a with row stride lda; x and y are strided vectors.
+func Ger(m, n uintptr, alpha float32, x []float32, incX uintptr, y []float32, incY uintptr, a []float32, lda uintptr) {
+	// Unit strides: every row of A receives one contiguous axpy.
+	if incX == 1 && incY == 1 {
+		xs, ys := x[:m], y[:n]
+		for r, xv := range xs {
+			off := uintptr(r) * lda
+			AxpyUnitary(alpha*xv, ys, a[off:off+n])
+		}
+		return
+	}
+
+	// A negative stride walks its vector backwards, so start at the far end.
+	var xStart, yStart uintptr
+	if int(incX) < 0 {
+		xStart = uintptr(-int(m-1) * int(incX))
+	}
+	if int(incY) < 0 {
+		yStart = uintptr(-int(n-1) * int(incY))
+	}
+
+	for r, xi := 0, xStart; r < int(m); r, xi = r+1, xi+incX {
+		off := uintptr(r) * lda
+		AxpyInc(alpha*x[xi], y, a[off:off+n], n, incY, 1, yStart, 0)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go
new file mode 100644
index 00000000000..a6000504a7c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go
@@ -0,0 +1,92 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package f32
+
+// GemvN evaluates the matrix-vector update
+//
+//	y = alpha * A * x + beta * y
+//
+// for the m×n matrix A stored in a with row stride lda; x and y are strided vectors.
+func GemvN(m, n uintptr, alpha float32, a []float32, lda uintptr, x []float32, incX uintptr, beta float32, y []float32, incY uintptr) {
+	if incX == 1 && incY == 1 {
+		// Contiguous vectors: one dense dot product per row of A.
+		for r := uintptr(0); r < m; r++ {
+			if beta == 0 {
+				// y is overwritten without being read.
+				y[r] = alpha * DotUnitary(a[lda*r:lda*r+n], x)
+				continue
+			}
+			y[r] = y[r]*beta + alpha*DotUnitary(a[lda*r:lda*r+n], x)
+		}
+		return
+	}
+
+	// Strided vectors: a negative stride starts at the far end of its vector.
+	var xStart, yi uintptr
+	if int(incX) < 0 {
+		xStart = uintptr(-int(n-1) * int(incX))
+	}
+	if int(incY) < 0 {
+		yi = uintptr(-int(m-1) * int(incY))
+	}
+
+	// One strided dot product per row of A.
+	for r := uintptr(0); r < m; r++ {
+		if beta == 0 {
+			// y is overwritten without being read.
+			y[yi] = alpha * DotInc(x, a[lda*r:lda*r+n], n, incX, 1, xStart, 0)
+		} else {
+			y[yi] = y[yi]*beta + alpha*DotInc(x, a[lda*r:lda*r+n], n, incX, 1, xStart, 0)
+		}
+		yi += incY
+	}
+}
+
+// GemvT evaluates the transposed matrix-vector update
+//
+//	y = alpha * Aᵀ * x + beta * y
+//
+// for the m×n matrix A stored in a with row stride lda; x and y are strided vectors.
+func GemvT(m, n uintptr, alpha float32, a []float32, lda uintptr, x []float32, incX uintptr, beta float32, y []float32, incY uintptr) {
+	var xStart, yStart uintptr
+	if int(incX) < 0 {
+		xStart = uintptr(-int(m-1) * int(incX))
+	}
+	if int(incY) < 0 {
+		yStart = uintptr(-int(n-1) * int(incY))
+	}
+
+	// First form beta * y in place.
+	if beta != 0 {
+		switch {
+		case incY == 1:
+			ScalUnitary(beta, y[:n])
+		case int(incY) < 0:
+			ScalInc(beta, y, n, uintptr(int(-incY)))
+		default:
+			ScalInc(beta, y, n, incY)
+		}
+	} else if incY == 1 {
+		// NOTE(review): clears all of y, not only y[:n] — confirm len(y) == n here.
+		for k := range y {
+			y[k] = 0
+		}
+	} else {
+		for k, yi := 0, yStart; k < int(n); k, yi = k+1, yi+incY {
+			y[yi] = 0
+		}
+	}
+
+	// Then accumulate alpha * x[i] * (row i of A) into y.
+	if incX == 1 && incY == 1 {
+		for r := uintptr(0); r < m; r++ {
+			AxpyUnitaryTo(y, alpha*x[r], a[lda*r:lda*r+n], y)
+		}
+		return
+	}
+	for r, xi := uintptr(0), xStart; r < m; r, xi = r+1, xi+incX {
+		AxpyInc(alpha*x[xi], a[lda*r:lda*r+n], y, n, 1, incY, 0, yStart)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go
new file mode 100644
index 00000000000..0f2a77405c9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go
@@ -0,0 +1,90 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package f32
+
+import "gonum.org/v1/gonum/internal/math32"
+
+// L2NormUnitary returns the Euclidean norm of x, accumulated as a scaled sum of squares.
+func L2NormUnitary(x []float32) (sum float32) {
+	var largest float32 // largest magnitude seen so far
+	ssq := float32(1)   // sum of squares relative to largest
+	for _, xi := range x {
+		mag := math32.Abs(xi)
+		switch {
+		case xi == 0:
+			// Zeros contribute nothing.
+		case math32.IsNaN(mag):
+			return math32.NaN()
+		case largest < mag:
+			// Re-express the running sum relative to the new largest magnitude.
+			ratio := largest / mag
+			ssq = 1 + ssq*ratio*ratio
+			largest = mag
+		default:
+			ratio := mag / largest
+			ssq += ratio * ratio
+		}
+	}
+	if math32.IsInf(largest, 1) {
+		return math32.Inf(1)
+	}
+	return largest * math32.Sqrt(ssq)
+}
+
+// L2NormInc returns the Euclidean norm of the n elements of x taken with stride incX.
+func L2NormInc(x []float32, n, incX uintptr) (sum float32) {
+	var largest float32 // largest magnitude seen so far
+	ssq := float32(1)   // sum of squares relative to largest
+	for ix := uintptr(0); ix < n*incX; ix += incX {
+		xi := x[ix]
+		mag := math32.Abs(xi)
+		switch {
+		case xi == 0:
+			// Zeros contribute nothing.
+		case math32.IsNaN(mag):
+			return math32.NaN()
+		case largest < mag:
+			// Re-express the running sum relative to the new largest magnitude.
+			ratio := largest / mag
+			ssq = 1 + ssq*ratio*ratio
+			largest = mag
+		default:
+			ratio := mag / largest
+			ssq += ratio * ratio
+		}
+	}
+	if math32.IsInf(largest, 1) {
+		return math32.Inf(1)
+	}
+	return largest * math32.Sqrt(ssq)
+}
+
+// L2DistanceUnitary returns the Euclidean norm of the element-wise difference x-y.
+func L2DistanceUnitary(x, y []float32) (sum float32) {
+	var largest float32 // largest magnitude seen so far
+	ssq := float32(1)   // sum of squares relative to largest
+	for i, xi := range x {
+		diff := xi - y[i]
+		mag := math32.Abs(diff)
+		switch {
+		case diff == 0:
+			// Equal elements contribute nothing.
+		case math32.IsNaN(mag):
+			return math32.NaN()
+		case largest < mag:
+			// Re-express the running sum relative to the new largest magnitude.
+			ratio := largest / mag
+			ssq = 1 + ssq*ratio*ratio
+			largest = mag
+		default:
+			ratio := mag / largest
+			ssq += ratio * ratio
+		}
+	}
+	if math32.IsInf(largest, 1) {
+		return math32.Inf(1)
+	}
+	return largest * math32.Sqrt(ssq)
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go
new file mode 100644
index 00000000000..ad2adee6523
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go
@@ -0,0 +1,59 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package f32
+
+// ScalUnitary scales every element of x by alpha in place:
+//
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha float32, x []float32) {
+	for i, v := range x {
+		x[i] = v * alpha
+	}
+}
+
+// ScalUnitaryTo stores alpha times each element of x into dst:
+//
+//	for i, v := range x {
+//		dst[i] = alpha * v
+//	}
+func ScalUnitaryTo(dst []float32, alpha float32, x []float32) {
+	for i := range x {
+		dst[i] = alpha * x[i]
+	}
+}
+
+// ScalInc is
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += incX
+//	}
+func ScalInc(alpha float32, x []float32, n, incX uintptr) {
+	var ix uintptr
+	for i := 0; i < int(n); i++ {
+		x[ix] *= alpha
+		ix += incX
+	}
+}
+
+// ScalIncTo stores alpha times n strided elements of x into strided dst:
+//
+//	var idst, ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha * x[ix]
+//		ix += incX
+//		idst += incDst
+//	}
+func ScalIncTo(dst []float32, incDst uintptr, alpha float32, x []float32, n, incX uintptr) {
+	var di, xi uintptr
+	for rem := int(n); rem > 0; rem-- {
+		dst[di] = alpha * x[xi]
+		xi += incX
+		di += incDst
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go
new file mode 100644
index 00000000000..2ea05197430
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go
@@ -0,0 +1,86 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package f32
+
+// AxpyUnitary adds alpha*x to y element-wise, in place:
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha float32, x, y []float32)
+
+// AxpyUnitaryTo stores alpha*x + y element-wise into dst:
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
+
+// AxpyInc adds alpha times n strided elements of x to strided elements of y:
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
+
+// AxpyIncTo stores alpha*x + y for n strided elements into strided dst:
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
+
+// DdotUnitary returns the dot product of x and y accumulated in float64:
+//
+//	for i, v := range x {
+//		sum += float64(y[i]) * float64(v)
+//	}
+//	return
+func DdotUnitary(x, y []float32) (sum float64)
+
+// DdotInc returns the dot product of n strided elements of x and y accumulated in float64:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += float64(y[iy]) * float64(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return
+func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
+
+// DotUnitary returns the dot product of x and y (the assembly uses min(len(x), len(y)) elements):
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotUnitary(x, y []float32) (sum float32)
+
+// DotInc returns the dot product of n strided elements of x and y:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
+
+// Sum returns the sum of the elements of x:
+//
+//	var sum float32
+//	for _, v := range x {
+//		sum += v
+//	}
+//	return sum
+func Sum(x []float32) float32
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go
new file mode 100644
index 00000000000..07b36ff34be
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go
@@ -0,0 +1,137 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package f32
+
+// AxpyUnitary adds alpha*x to y element-wise, in place:
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha float32, x, y []float32) {
+	for i := range x {
+		y[i] += alpha * x[i]
+	}
+}
+
+// AxpyUnitaryTo stores alpha*x + y element-wise into dst:
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) {
+	for i := range x {
+		dst[i] = alpha*x[i] + y[i]
+	}
+}
+
+// AxpyInc adds alpha times n strided elements of x to strided elements of y:
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
+	for rem := int(n); rem > 0; rem-- {
+		y[iy] += alpha * x[ix]
+		iy += incY
+		ix += incX
+	}
+}
+
+// AxpyIncTo stores alpha*x + y for n strided elements into strided dst:
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
+	for rem := int(n); rem > 0; rem-- {
+		dst[idst] = alpha*x[ix] + y[iy]
+		idst += incDst
+		iy += incY
+		ix += incX
+	}
+}
+
+// DotUnitary returns the dot product of x and y over every index of x:
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotUnitary(x, y []float32) (sum float32) {
+	for i := range x {
+		sum += y[i] * x[i]
+	}
+	return sum
+}
+
+// DotInc returns the dot product of n strided elements of x and y:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) {
+	for rem := int(n); rem > 0; rem-- {
+		sum += y[iy] * x[ix]
+		iy += incY
+		ix += incX
+	}
+	return sum
+}
+
+// DdotUnitary returns the dot product of x and y accumulated in float64:
+//
+//	for i, v := range x {
+//		sum += float64(y[i]) * float64(v)
+//	}
+//	return
+func DdotUnitary(x, y []float32) (sum float64) {
+	for i := range x {
+		sum += float64(y[i]) * float64(x[i])
+	}
+	return sum
+}
+
+// DdotInc returns the dot product of n strided elements of x and y accumulated in float64:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += float64(y[iy]) * float64(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return
+func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) {
+	for rem := int(n); rem > 0; rem-- {
+		sum += float64(y[iy]) * float64(x[ix])
+		iy += incY
+		ix += incX
+	}
+	return sum
+}
+
+// Sum is
+//
+//	var sum float32
+//	for _, v := range x {
+//		sum += v
+//	}
+//	return sum
+func Sum(x []float32) float32 {
+	var sum float32
+	for _, v := range x {
+		sum += v
+	}
+	return sum
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s
new file mode 100644
index 00000000000..42e96361e48
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s
@@ -0,0 +1,100 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define X_PTR SI
+#define IDX AX
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define SUM_1 X1
+#define SUM_2 X2
+#define SUM_3 X3
+
+// func Sum(x []float32) float32
+TEXT ·Sum(SB), NOSPLIT, $0
+	MOVQ x_base+0(FP), X_PTR // X_PTR = &x[0]
+	MOVQ x_len+8(FP), LEN    // LEN = len(x)
+	XORQ IDX, IDX            // i = 0
+	PXOR SUM, SUM            // SUM = 0 in all four lanes
+	CMPQ LEN, $0             // if LEN == 0 { return 0 }
+	JE   sum_end
+
+	PXOR SUM_1, SUM_1 // zero the three extra accumulators used by the 16-wide loop
+	PXOR SUM_2, SUM_2
+	PXOR SUM_3, SUM_3
+
+	MOVQ X_PTR, TAIL // Check memory alignment
+	ANDQ $15, TAIL   // TAIL = &x[0] % 16
+	JZ   no_trim     // if TAIL == 0 { goto no_trim }
+	SUBQ $16, TAIL   // TAIL -= 16; TAIL is now minus the byte distance to the next 16-byte boundary
+
+sum_align: // Add single elements until X_PTR+4*i is 16-byte aligned: do {
+	ADDSS (X_PTR)(IDX*4), SUM // SUM[0] += x[i]
+	INCQ  IDX                 // i++
+	DECQ  LEN                 // LEN--
+	JZ    sum_end             // if LEN == 0 { return }
+	ADDQ  $4, TAIL            // TAIL += 4 (one float32 consumed)
+	JNZ   sum_align           // } while TAIL != 0
+
+no_trim:
+	MOVQ LEN, TAIL // TAIL = remaining element count; bits 3..0 select the tail steps below
+	SHRQ $4, LEN   // LEN = floor( n / 16 )
+	JZ   sum_tail8 // if LEN == 0 { goto sum_tail8 }
+
+
+sum_loop: // sum 16x wide do {
+	ADDPS (X_PTR)(IDX*4), SUM     // SUM   += x[i:i+4]
+	ADDPS 16(X_PTR)(IDX*4), SUM_1 // SUM_1 += x[i+4:i+8]
+	ADDPS 32(X_PTR)(IDX*4), SUM_2 // SUM_2 += x[i+8:i+12]
+	ADDPS 48(X_PTR)(IDX*4), SUM_3 // SUM_3 += x[i+12:i+16]
+
+	ADDQ  $16, IDX                // i += 16
+	DECQ  LEN
+	JNZ   sum_loop                // } while --LEN > 0
+
+sum_tail8:
+	ADDPS SUM_3, SUM   // SUM   += SUM_3
+	ADDPS SUM_2, SUM_1 // SUM_1 += SUM_2
+
+	TESTQ $8, TAIL  // if TAIL&8 == 0 { goto sum_tail4 }
+	JZ    sum_tail4
+
+	ADDPS (X_PTR)(IDX*4), SUM     // SUM   += x[i:i+4]
+	ADDPS 16(X_PTR)(IDX*4), SUM_1 // SUM_1 += x[i+4:i+8]
+	ADDQ  $8, IDX                 // i += 8
+
+sum_tail4:
+	ADDPS SUM_1, SUM // SUM += SUM_1; all partial sums now live in SUM
+
+	TESTQ $4, TAIL  // if TAIL&4 == 0 { goto sum_tail2 }
+	JZ    sum_tail2
+
+	ADDPS (X_PTR)(IDX*4), SUM // SUM += x[i:i+4]
+	ADDQ  $4, IDX             // i += 4
+
+sum_tail2:
+	HADDPS SUM, SUM            // SUM = {s0+s1, s2+s3, s0+s1, s2+s3}
+
+	TESTQ $2, TAIL  // if TAIL&2 == 0 { goto sum_tail1 }
+	JZ    sum_tail1
+
+	MOVSD (X_PTR)(IDX*4), SUM_1 // reuse SUM_1: its low 8 bytes = x[i:i+2]
+	ADDPS SUM_1, SUM            // SUM[0:2] += x[i:i+2]
+	ADDQ  $2, IDX               // i += 2
+
+sum_tail1:
+	HADDPS SUM, SUM // SUM[0] = SUM[0] + SUM[1]
+
+	TESTQ $1, TAIL // if TAIL&1 == 0 { goto sum_end }
+	JZ    sum_end
+
+	ADDSS (X_PTR)(IDX*4), SUM // SUM[0] += x[i]
+
+sum_end: // return sum
+	MOVSS SUM, ret+24(FP) // ret = SUM[0]
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/abs.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/abs.go
new file mode 100644
index 00000000000..ac6eb81c0e2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/abs.go
@@ -0,0 +1,14 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cmplx64
+
+import math "gonum.org/v1/gonum/internal/math32"
+
+// Abs returns the absolute value (also called the modulus) of x, computed as the float32 Hypot of real(x) and imag(x).
+func Abs(x complex64) float32 { return math.Hypot(real(x), imag(x)) }
diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/conj.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/conj.go
new file mode 100644
index 00000000000..705262f2f93
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/conj.go
@@ -0,0 +1,12 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cmplx64
+
+// Conj returns the complex conjugate of x: the real part is kept and the imaginary part is negated.
+func Conj(x complex64) complex64 { return complex(real(x), -imag(x)) }
diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/doc.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/doc.go
new file mode 100644
index 00000000000..5424ea099c2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/doc.go
@@ -0,0 +1,7 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package cmplx64 provides complex64 versions of standard library math/cmplx
+// package routines used by gonum/blas.
+package cmplx64 // import "gonum.org/v1/gonum/internal/cmplx64"
diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/isinf.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/isinf.go
new file mode 100644
index 00000000000..21d3d180e1e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/isinf.go
@@ -0,0 +1,25 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cmplx64
+
+import math "gonum.org/v1/gonum/internal/math32"
+
+// IsInf reports whether real(x) or imag(x) is an infinity of either sign.
+func IsInf(x complex64) bool {
+	if !math.IsInf(real(x), 0) && !math.IsInf(imag(x), 0) {
+		return false
+	}
+	return true
+}
+
+// Inf returns the complex infinity whose real and imaginary parts are both +Inf.
+func Inf() complex64 {
+	posInf := math.Inf(1)
+	return complex(posInf, posInf)
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/isnan.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/isnan.go
new file mode 100644
index 00000000000..d6d43dbd1f3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/isnan.go
@@ -0,0 +1,29 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cmplx64
+
+import math "gonum.org/v1/gonum/internal/math32"
+
+// IsNaN reports whether x counts as a complex NaN: at least one of real(x)
+// and imag(x) is NaN while neither of them is an infinity.
+func IsNaN(x complex64) bool {
+	re, im := real(x), imag(x)
+	if math.IsInf(re, 0) || math.IsInf(im, 0) {
+		// An infinite part overrides a NaN in the other part.
+		return false
+	}
+	// Neither part is infinite here, so any NaN part decides the result.
+	return math.IsNaN(re) || math.IsNaN(im)
+}
+
+// NaN returns a complex “not-a-number” value with NaN in both parts.
+func NaN() complex64 {
+	v := math.NaN()
+	return complex(v, v)
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/cmplx64/sqrt.go b/vendor/gonum.org/v1/gonum/internal/cmplx64/sqrt.go
new file mode 100644
index 00000000000..439987b4baa
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/cmplx64/sqrt.go
@@ -0,0 +1,108 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cmplx64
+
+import math "gonum.org/v1/gonum/internal/math32"
+
+// The original C code, the long comment, and the constants
+// below are from http://netlib.sandia.gov/cephes/c9x-complex/clog.c.
+// The go code is a simplified version of the original C.
+//
+// Cephes Math Library Release 2.8:  June, 2000
+// Copyright 1984, 1987, 1989, 1992, 2000 by Stephen L. Moshier
+//
+// The readme file at http://netlib.sandia.gov/cephes/ says:
+//    Some software in this archive may be from the book _Methods and
+// Programs for Mathematical Functions_ (Prentice-Hall or Simon & Schuster
+// International, 1989) or from the Cephes Mathematical Library, a
+// commercial product. In either event, it is copyrighted by the author.
+// What you see here may be used freely but it comes with no support or
+// guarantee.
+//
+//   The two known misprints in the book are repaired here in the
+// source listings for the gamma function and the incomplete beta
+// integral.
+//
+//   Stephen L. Moshier
+//   moshier@na-net.ornl.gov
+
+// Complex square root
+//
+// DESCRIPTION:
+//
+// If z = x + iy,  r = |z|, then
+//
+//                       1/2
+// Re w  =  [ (r + x)/2 ]   ,
+//
+//                       1/2
+// Im w  =  [ (r - x)/2 ]   .
+//
+// Cancelation error in r-x or r+x is avoided by using the
+// identity  2 Re w Im w  =  y.
+//
+// Note that -w is also a square root of z. The root chosen
+// is always in the right half plane and Im w has the same sign as y.
+//
+// ACCURACY:
+//
+//                      Relative error:
+// arithmetic   domain     # trials      peak         rms
+//    DEC       -10,+10     25000       3.2e-17     9.6e-18
+//    IEEE      -10,+10   1,000,000     2.9e-16     6.1e-17
+
+// Sqrt returns the square root of x.
+// The result r is chosen so that real(r) ≥ 0 and imag(r) has the same sign as imag(x).
+func Sqrt(x complex64) complex64 {
+	a, b := real(x), imag(x)
+
+	// Inputs on the real or imaginary axis are handled directly.
+	switch {
+	case b == 0 && a == 0:
+		return complex(0, 0)
+	case b == 0 && a < 0:
+		return complex(0, math.Sqrt(-a))
+	case b == 0:
+		return complex(math.Sqrt(a), 0)
+	case a == 0 && b < 0:
+		half := math.Sqrt(-0.5 * b)
+		return complex(half, -half)
+	case a == 0:
+		half := math.Sqrt(0.5 * b)
+		return complex(half, half)
+	}
+
+	// Rescale to avoid internal overflow or underflow. unscale is applied
+	// to the root afterwards to undo the factor applied to the input.
+	factor := float32(1.8014398509481984e16)    // 2**54
+	unscale := float32(7.450580596923828125e-9) // 2**-27
+	if math.Abs(a) > 4 || math.Abs(b) > 4 {
+		factor, unscale = 0.25, 2
+	}
+	a *= factor
+	b *= factor
+
+	// h is the modulus of the rescaled input.
+	h := math.Hypot(a, b)
+	var re, im float32
+	if a > 0 {
+		re = math.Sqrt(0.5*h + 0.5*a)
+		im = unscale * math.Abs((0.5*b)/re)
+		re *= unscale
+	} else {
+		im = math.Sqrt(0.5*h - 0.5*a)
+		re = unscale * math.Abs((0.5*b)/im)
+		im *= unscale
+	}
+	// im is non-negative so far; give it the sign of the imaginary input.
+	if b < 0 {
+		im = -im
+	}
+	return complex(re, im)
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/math32/doc.go b/vendor/gonum.org/v1/gonum/internal/math32/doc.go
new file mode 100644
index 00000000000..68917c64e64
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/math32/doc.go
@@ -0,0 +1,7 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package math32 provides float32 versions of standard library math package
+// routines used by gonum/blas/native.
+package math32 // import "gonum.org/v1/gonum/internal/math32"
diff --git a/vendor/gonum.org/v1/gonum/internal/math32/math.go b/vendor/gonum.org/v1/gonum/internal/math32/math.go
new file mode 100644
index 00000000000..5e92f3d02ee
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/math32/math.go
@@ -0,0 +1,166 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package math32
+
+import (
+	"math"
+)
+
+const (
+	unan    = 0x7fc00000 // bit pattern returned by NaN
+	uinf    = 0x7f800000 // bit pattern returned by Inf for sign >= 0
+	uneginf = 0xff800000 // bit pattern returned by Inf for sign < 0
+	mask    = 0x7f8 >> 3 // 0xff; only referenced by the commented-out alternative in IsNaN
+	shift   = 32 - 8 - 1 // 23; only referenced by the commented-out alternative in IsNaN
+	bias    = 127        // presumably the float32 exponent bias; not referenced in the visible code
+)
+
+// Abs returns x with its sign dropped.
+//
+// Special cases are:
+//
+//	Abs(±Inf) = +Inf
+//	Abs(NaN) = NaN
+func Abs(x float32) float32 {
+	if x < 0 {
+		return -x
+	}
+	if x == 0 {
+		return 0 // maps -0 to +0
+	}
+	return x
+}
+
+// Copysign returns a value with the magnitude of x and the sign of y.
+func Copysign(x, y float32) float32 {
+	const signBit = 1 << 31
+	magnitude, sign := math.Float32bits(x)&^signBit, math.Float32bits(y)&signBit
+	return math.Float32frombits(magnitude | sign)
+}
+
+// Hypot returns Sqrt(p*p + q*q). The larger magnitude is factored out
+// before squaring to avoid unnecessary overflow and underflow.
+//
+// Special cases are (infinities are tested before NaNs):
+//
+//	Hypot(±Inf, q) = +Inf
+//	Hypot(p, ±Inf) = +Inf
+//	Hypot(NaN, q) = NaN
+//	Hypot(p, NaN) = NaN
+func Hypot(p, q float32) float32 {
+	if IsInf(p, 0) || IsInf(q, 0) {
+		return Inf(1)
+	}
+	if IsNaN(p) || IsNaN(q) {
+		return NaN()
+	}
+	hi, lo := p, q
+	if hi < 0 {
+		hi = -hi
+	}
+	if lo < 0 {
+		lo = -lo
+	}
+	if hi < lo {
+		hi, lo = lo, hi
+	}
+	if hi == 0 {
+		return 0
+	}
+	ratio := lo / hi
+	return hi * Sqrt(1+ratio*ratio)
+}
+
+// Inf returns positive infinity if sign >= 0, negative infinity if sign < 0.
+func Inf(sign int) float32 {
+	var v uint32
+	if sign >= 0 {
+		v = uinf
+	} else {
+		v = uneginf
+	}
+	return math.Float32frombits(v)
+}
+
+// IsInf reports whether f is an infinity of the kind selected by sign.
+// If sign > 0, only positive infinity qualifies.
+// If sign < 0, only negative infinity qualifies.
+// If sign == 0, either infinity qualifies.
+func IsInf(f float32, sign int) bool {
+	// Only an infinity lies beyond the largest finite float32. The same
+	// test could be done on math.Float32bits(f) against uinf and uneginf.
+	pos := sign >= 0 && f > math.MaxFloat32
+	neg := sign <= 0 && f < -math.MaxFloat32
+	return pos || neg
+}
+
+// IsNaN reports whether f is an IEEE 754 “not-a-number” value.
+func IsNaN(f float32) (is bool) {
+	// IEEE 754 says that only NaNs satisfy f != f, so the self-comparison is the whole test.
+	// To avoid the floating-point hardware, the bit pattern could be inspected instead:
+	//	x := math.Float32bits(f);
+	//	return uint32(x>>shift)&mask == mask && x != uinf && x != uneginf
+	return f != f
+}
+
+// Max returns the larger of x and y; +Inf is tested for before NaN.
+//
+// Special cases are:
+//
+//	Max(x, +Inf) = Max(+Inf, x) = +Inf
+//	Max(x, NaN) = Max(NaN, x) = NaN
+//	Max(+0, ±0) = Max(±0, +0) = +0
+//	Max(-0, -0) = -0
+func Max(x, y float32) float32 {
+	if IsInf(x, 1) || IsInf(y, 1) {
+		return Inf(1)
+	}
+	if IsNaN(x) || IsNaN(y) {
+		return NaN()
+	}
+	if x == 0 && y == 0 {
+		if Signbit(x) {
+			return y
+		}
+		return x
+	}
+	if x > y {
+		return x
+	}
+	return y
+}
+
+// Min returns the smaller of x and y; -Inf is tested for before NaN.
+//
+// Special cases are:
+//
+//	Min(x, -Inf) = Min(-Inf, x) = -Inf
+//	Min(x, NaN) = Min(NaN, x) = NaN
+//	Min(-0, ±0) = Min(±0, -0) = -0
+func Min(x, y float32) float32 {
+	if IsInf(x, -1) || IsInf(y, -1) {
+		return Inf(-1)
+	}
+	if IsNaN(x) || IsNaN(y) {
+		return NaN()
+	}
+	if x == 0 && y == 0 {
+		if Signbit(x) {
+			return x
+		}
+		return y
+	}
+	if x < y {
+		return x
+	}
+	return y
+}
+
+// NaN returns an IEEE 754 “not-a-number” value, built from the unan bit pattern.
+func NaN() float32 { return math.Float32frombits(unan) }
diff --git a/vendor/gonum.org/v1/gonum/internal/math32/signbit.go b/vendor/gonum.org/v1/gonum/internal/math32/signbit.go
new file mode 100644
index 00000000000..3e9f0bb41dc
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/math32/signbit.go
@@ -0,0 +1,16 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package math32
+
+import "math"
+
+// Signbit reports whether the sign bit of x is set, i.e. x is negative or negative zero.
+func Signbit(x float32) bool {
+	return math.Float32bits(x)>>31 == 1
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt.go b/vendor/gonum.org/v1/gonum/internal/math32/sqrt.go
new file mode 100644
index 00000000000..41f4a134df9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt.go
@@ -0,0 +1,26 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (!amd64 && !arm64) || noasm || gccgo || safe
+// +build !amd64,!arm64 noasm gccgo safe
+
+package math32
+
+import (
+	"math"
+)
+
+// Sqrt returns the square root of x, computed via the float64 math.Sqrt.
+//
+// Special cases are:
+//
+//	Sqrt(+Inf) = +Inf
+//	Sqrt(±0) = ±0
+//	Sqrt(x < 0) = NaN
+//	Sqrt(NaN) = NaN
+func Sqrt(x float32) float32 {
+	// FIXME(kortschak): Direct translation of the math package asm code for 386 fails to build.
+	wide := math.Sqrt(float64(x))
+	return float32(wide)
+}
diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.go b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.go
new file mode 100644
index 00000000000..eca83f8700c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.go
@@ -0,0 +1,22 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package math32
+
+// Sqrt returns the square root of x. It has no Go body: ·Sqrt in sqrt_amd64.s implements it with SQRTSS.
+//
+// Special cases are:
+//
+//	Sqrt(+Inf) = +Inf
+//	Sqrt(±0) = ±0
+//	Sqrt(x < 0) = NaN
+//	Sqrt(NaN) = NaN
+func Sqrt(x float32) float32
diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.s b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.s
new file mode 100644
index 00000000000..1c1432a3cae
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_amd64.s
@@ -0,0 +1,17 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func Sqrt(x float32) float32
+TEXT ·Sqrt(SB),NOSPLIT,$0
+	SQRTSS x+0(FP), X0 // X0 = sqrt(x), single precision
+	MOVSS X0, ret+8(FP) // ret = X0
+	RET
diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.go b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.go
new file mode 100644
index 00000000000..eca83f8700c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.go
@@ -0,0 +1,22 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package math32
+
+// Sqrt returns the square root of x. It has no Go body: ·Sqrt in sqrt_arm64.s implements it with FSQRTS.
+//
+// Special cases are:
+//
+//	Sqrt(+Inf) = +Inf
+//	Sqrt(±0) = ±0
+//	Sqrt(x < 0) = NaN
+//	Sqrt(NaN) = NaN
+func Sqrt(x float32) float32
diff --git a/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.s b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.s
new file mode 100644
index 00000000000..f18b5521d45
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/internal/math32/sqrt_arm64.s
@@ -0,0 +1,18 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func Sqrt(x float32) float32
+TEXT ·Sqrt(SB),NOSPLIT,$0
+	FMOVS	x+0(FP), F0 // F0 = x
+	FSQRTS	F0, F0 // F0 = sqrt(F0), single precision
+	FMOVS	F0, ret+8(FP) // ret = F0
+	RET
diff --git a/vendor/gonum.org/v1/gonum/lapack/.gitignore b/vendor/gonum.org/v1/gonum/lapack/.gitignore
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/vendor/gonum.org/v1/gonum/lapack/README.md b/vendor/gonum.org/v1/gonum/lapack/README.md
new file mode 100644
index 00000000000..ee23148c973
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/README.md
@@ -0,0 +1,29 @@
+Gonum LAPACK
+======
+[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/lapack)](https://pkg.go.dev/gonum.org/v1/gonum/lapack)
+[![GoDoc](https://godocs.io/gonum.org/v1/gonum/lapack?status.svg)](https://godocs.io/gonum.org/v1/gonum/lapack)
+
+A collection of packages to provide LAPACK functionality for the Go programming
+language (http://golang.org). This provides a partial implementation in native Go
+and a wrapper using cgo to a C-based implementation.
+
+## Installation
+
+```
+  go get gonum.org/v1/gonum/lapack/...
+```
+
+## Packages
+
+### lapack
+
+Defines the LAPACK API based on http://www.netlib.org/lapack/lapacke.html
+
+### lapack/gonum
+
+Go implementation of the LAPACK API (incomplete, implements the `float64` API).
+
+### lapack/lapack64
+
+Wrappers for an implementation of the double (i.e., `float64`) precision real parts of
+the LAPACK API.
diff --git a/vendor/gonum.org/v1/gonum/lapack/doc.go b/vendor/gonum.org/v1/gonum/lapack/doc.go
new file mode 100644
index 00000000000..2475cb4aa08
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2018 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package lapack provides interfaces for the LAPACK linear algebra standard.
+package lapack // import "gonum.org/v1/gonum/lapack"
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dbdsqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dbdsqr.go
new file mode 100644
index 00000000000..fd421d7ef51
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dbdsqr.go
@@ -0,0 +1,506 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dbdsqr performs a singular value decomposition of a real n×n bidiagonal matrix.
+//
+// The SVD of the bidiagonal matrix B is
+//
+//	B = Q * S * Pᵀ
+//
+// where S is a diagonal matrix of singular values, Q is an orthogonal matrix of
+// left singular vectors, and P is an orthogonal matrix of right singular vectors.
+//
+// Q and P are only computed if requested. If left singular vectors are requested,
+// this routine returns U * Q instead of Q, and if right singular vectors are
+// requested Pᵀ * VT is returned instead of Pᵀ.
+//
+// Frequently Dbdsqr is used in conjunction with Dgebrd which reduces a general
+// matrix A into bidiagonal form. In this case, the SVD of A is
+//
+//	A = (U * Q) * S * (Pᵀ * VT)
+//
+// This routine may also compute Qᵀ * C.
+//
+// d and e contain the elements of the bidiagonal matrix b. d must have length at
+// least n, and e must have length at least n-1. Dbdsqr will panic if there is
+// insufficient length. On exit, D contains the singular values of B in decreasing
+// order.
+//
+// VT is a matrix of size n×ncvt whose elements are stored in vt. The elements
+// of vt are modified to contain Pᵀ * VT on exit. VT is not used if ncvt == 0.
+//
+// U is a matrix of size nru×n whose elements are stored in u. The elements
+// of u are modified to contain U * Q on exit. U is not used if nru == 0.
+//
+// C is a matrix of size n×ncc whose elements are stored in c. The elements
+// of c are modified to contain Qᵀ * C on exit. C is not used if ncc == 0.
+//
+// work contains temporary storage and must have length at least 4*(n-1). Dbdsqr
+// will panic if there is insufficient working memory.
+//
+// Dbdsqr returns whether the decomposition was successful.
+//
+// Dbdsqr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dbdsqr(uplo blas.Uplo, n, ncvt, nru, ncc int, d, e, vt []float64, ldvt int, u []float64, ldu int, c []float64, ldc int, work []float64) (ok bool) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case ncvt < 0:
+		panic(ncvtLT0)
+	case nru < 0:
+		panic(nruLT0)
+	case ncc < 0:
+		panic(nccLT0)
+	case ldvt < max(1, ncvt):
+		panic(badLdVT)
+	case (ldu < max(1, n) && nru > 0) || (ldu < 1 && nru == 0):
+		panic(badLdU)
+	case ldc < max(1, ncc):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return true
+	}
+
+	if len(vt) < (n-1)*ldvt+ncvt && ncvt != 0 {
+		panic(shortVT)
+	}
+	if len(u) < (nru-1)*ldu+n && nru != 0 {
+		panic(shortU)
+	}
+	if len(c) < (n-1)*ldc+ncc && ncc != 0 {
+		panic(shortC)
+	}
+	if len(d) < n {
+		panic(shortD)
+	}
+	if len(e) < n-1 {
+		panic(shortE)
+	}
+	if len(work) < 4*(n-1) {
+		panic(shortWork)
+	}
+
+	var info int
+	bi := blas64.Implementation()
+	const maxIter = 6
+
+	if n != 1 {
+		// If the singular vectors do not need to be computed, use qd algorithm.
+		if !(ncvt > 0 || nru > 0 || ncc > 0) {
+			info = impl.Dlasq1(n, d, e, work)
+			// If info is 2 dqds didn't finish, and so try to.
+			if info != 2 {
+				return info == 0
+			}
+		}
+		nm1 := n - 1
+		nm12 := nm1 + nm1
+		nm13 := nm12 + nm1
+		idir := 0
+
+		eps := dlamchE
+		unfl := dlamchS
+		lower := uplo == blas.Lower
+		var cs, sn, r float64
+		if lower {
+			for i := 0; i < n-1; i++ {
+				cs, sn, r = impl.Dlartg(d[i], e[i])
+				d[i] = r
+				e[i] = sn * d[i+1]
+				d[i+1] *= cs
+				work[i] = cs
+				work[nm1+i] = sn
+			}
+			if nru > 0 {
+				impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, n, work, work[n-1:], u, ldu)
+			}
+			if ncc > 0 {
+				impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, n, ncc, work, work[n-1:], c, ldc)
+			}
+		}
+		// Compute singular values to a relative accuracy of tol. If tol is negative
+		// the values will be computed to an absolute accuracy of math.Abs(tol) * norm(b)
+		tolmul := math.Max(10, math.Min(100, math.Pow(eps, -1.0/8)))
+		tol := tolmul * eps
+		var smax float64
+		for i := 0; i < n; i++ {
+			smax = math.Max(smax, math.Abs(d[i]))
+		}
+		for i := 0; i < n-1; i++ {
+			smax = math.Max(smax, math.Abs(e[i]))
+		}
+
+		var smin float64
+		var thresh float64
+		if tol >= 0 {
+			sminoa := math.Abs(d[0])
+			if sminoa != 0 {
+				mu := sminoa
+				for i := 1; i < n; i++ {
+					mu = math.Abs(d[i]) * (mu / (mu + math.Abs(e[i-1])))
+					sminoa = math.Min(sminoa, mu)
+					if sminoa == 0 {
+						break
+					}
+				}
+			}
+			sminoa = sminoa / math.Sqrt(float64(n))
+			thresh = math.Max(tol*sminoa, float64(maxIter*n*n)*unfl)
+		} else {
+			thresh = math.Max(math.Abs(tol)*smax, float64(maxIter*n*n)*unfl)
+		}
+		// Prepare for the main iteration loop for the singular values.
+		maxIt := maxIter * n * n
+		iter := 0
+		oldl2 := -1
+		oldm := -1
+		// m points to the last element of unconverged part of matrix.
+		m := n
+
+	Outer:
+		for m > 1 {
+			if iter > maxIt {
+				info = 0
+				for i := 0; i < n-1; i++ {
+					if e[i] != 0 {
+						info++
+					}
+				}
+				return info == 0
+			}
+			// Find diagonal block of matrix to work on.
+			if tol < 0 && math.Abs(d[m-1]) <= thresh {
+				d[m-1] = 0
+			}
+			smax = math.Abs(d[m-1])
+			var l2 int
+			var broke bool
+			for l3 := 0; l3 < m-1; l3++ {
+				l2 = m - l3 - 2
+				abss := math.Abs(d[l2])
+				abse := math.Abs(e[l2])
+				if tol < 0 && abss <= thresh {
+					d[l2] = 0
+				}
+				if abse <= thresh {
+					broke = true
+					break
+				}
+				smax = math.Max(math.Max(smax, abss), abse)
+			}
+			if broke {
+				e[l2] = 0
+				if l2 == m-2 {
+					// Convergence of bottom singular value, return to top.
+					m--
+					continue
+				}
+				l2++
+			} else {
+				l2 = 0
+			}
+			// e[ll] through e[m-2] are nonzero, e[ll-1] is zero
+			if l2 == m-2 {
+				// Handle 2×2 block separately.
+				var sinr, cosr, sinl, cosl float64
+				d[m-1], d[m-2], sinr, cosr, sinl, cosl = impl.Dlasv2(d[m-2], e[m-2], d[m-1])
+				e[m-2] = 0
+				if ncvt > 0 {
+					bi.Drot(ncvt, vt[(m-2)*ldvt:], 1, vt[(m-1)*ldvt:], 1, cosr, sinr)
+				}
+				if nru > 0 {
+					bi.Drot(nru, u[m-2:], ldu, u[m-1:], ldu, cosl, sinl)
+				}
+				if ncc > 0 {
+					bi.Drot(ncc, c[(m-2)*ldc:], 1, c[(m-1)*ldc:], 1, cosl, sinl)
+				}
+				m -= 2
+				continue
+			}
+			// If working on a new submatrix, choose shift direction from larger end
+			// diagonal element toward smaller.
+			if l2 > oldm-1 || m-1 < oldl2 {
+				if math.Abs(d[l2]) >= math.Abs(d[m-1]) {
+					idir = 1
+				} else {
+					idir = 2
+				}
+			}
+			// Apply convergence tests.
+			// TODO(btracey): There is a lot of similar looking code here. See
+			// if there is a better way to de-duplicate.
+			if idir == 1 {
+				// Run convergence test in forward direction.
+				// First apply standard test to bottom of matrix.
+				if math.Abs(e[m-2]) <= math.Abs(tol)*math.Abs(d[m-1]) || (tol < 0 && math.Abs(e[m-2]) <= thresh) {
+					e[m-2] = 0
+					continue
+				}
+				if tol >= 0 {
+					// If relative accuracy desired, apply convergence criterion forward.
+					mu := math.Abs(d[l2])
+					smin = mu
+					for l3 := l2; l3 < m-1; l3++ {
+						if math.Abs(e[l3]) <= tol*mu {
+							e[l3] = 0
+							continue Outer
+						}
+						mu = math.Abs(d[l3+1]) * (mu / (mu + math.Abs(e[l3])))
+						smin = math.Min(smin, mu)
+					}
+				}
+			} else {
+				// Run convergence test in backward direction.
+				// First apply standard test to top of matrix.
+				if math.Abs(e[l2]) <= math.Abs(tol)*math.Abs(d[l2]) || (tol < 0 && math.Abs(e[l2]) <= thresh) {
+					e[l2] = 0
+					continue
+				}
+				if tol >= 0 {
+					// If relative accuracy desired, apply convergence criterion backward.
+					mu := math.Abs(d[m-1])
+					smin = mu
+					for l3 := m - 2; l3 >= l2; l3-- {
+						if math.Abs(e[l3]) <= tol*mu {
+							e[l3] = 0
+							continue Outer
+						}
+						mu = math.Abs(d[l3]) * (mu / (mu + math.Abs(e[l3])))
+						smin = math.Min(smin, mu)
+					}
+				}
+			}
+			oldl2 = l2
+			oldm = m
+			// Compute shift. First, test if shifting would ruin relative accuracy,
+			// and if so set the shift to zero.
+			var shift float64
+			if tol >= 0 && float64(n)*tol*(smin/smax) <= math.Max(eps, (1.0/100)*tol) {
+				shift = 0
+			} else {
+				var sl2 float64
+				if idir == 1 {
+					sl2 = math.Abs(d[l2])
+					shift, _ = impl.Dlas2(d[m-2], e[m-2], d[m-1])
+				} else {
+					sl2 = math.Abs(d[m-1])
+					shift, _ = impl.Dlas2(d[l2], e[l2], d[l2+1])
+				}
+				// Test if shift is negligible
+				if sl2 > 0 {
+					if (shift/sl2)*(shift/sl2) < eps {
+						shift = 0
+					}
+				}
+			}
+			iter += m - l2 + 1
+			// If no shift, do simplified QR iteration.
+			if shift == 0 {
+				if idir == 1 {
+					cs := 1.0
+					oldcs := 1.0
+					var sn, r, oldsn float64
+					for i := l2; i < m-1; i++ {
+						cs, sn, r = impl.Dlartg(d[i]*cs, e[i])
+						if i > l2 {
+							e[i-1] = oldsn * r
+						}
+						oldcs, oldsn, d[i] = impl.Dlartg(oldcs*r, d[i+1]*sn)
+						work[i-l2] = cs
+						work[i-l2+nm1] = sn
+						work[i-l2+nm12] = oldcs
+						work[i-l2+nm13] = oldsn
+					}
+					h := d[m-1] * cs
+					d[m-1] = h * oldcs
+					e[m-2] = h * oldsn
+					if ncvt > 0 {
+						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncvt, work, work[n-1:], vt[l2*ldvt:], ldvt)
+					}
+					if nru > 0 {
+						impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, m-l2, work[nm12:], work[nm13:], u[l2:], ldu)
+					}
+					if ncc > 0 {
+						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncc, work[nm12:], work[nm13:], c[l2*ldc:], ldc)
+					}
+					if math.Abs(e[m-2]) < thresh {
+						e[m-2] = 0
+					}
+				} else {
+					cs := 1.0
+					oldcs := 1.0
+					var sn, r, oldsn float64
+					for i := m - 1; i >= l2+1; i-- {
+						cs, sn, r = impl.Dlartg(d[i]*cs, e[i-1])
+						if i < m-1 {
+							e[i] = oldsn * r
+						}
+						oldcs, oldsn, d[i] = impl.Dlartg(oldcs*r, d[i-1]*sn)
+						work[i-l2-1] = cs
+						work[i-l2+nm1-1] = -sn
+						work[i-l2+nm12-1] = oldcs
+						work[i-l2+nm13-1] = -oldsn
+					}
+					h := d[l2] * cs
+					d[l2] = h * oldcs
+					e[l2] = h * oldsn
+					if ncvt > 0 {
+						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncvt, work[nm12:], work[nm13:], vt[l2*ldvt:], ldvt)
+					}
+					if nru > 0 {
+						impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, nru, m-l2, work, work[n-1:], u[l2:], ldu)
+					}
+					if ncc > 0 {
+						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncc, work, work[n-1:], c[l2*ldc:], ldc)
+					}
+					if math.Abs(e[l2]) <= thresh {
+						e[l2] = 0
+					}
+				}
+			} else {
+				// Use nonzero shift.
+				if idir == 1 {
+					// Chase bulge from top to bottom. Save cosines and sines for
+					// later singular vector updates.
+					f := (math.Abs(d[l2]) - shift) * (math.Copysign(1, d[l2]) + shift/d[l2])
+					g := e[l2]
+					var cosl, sinl float64
+					for i := l2; i < m-1; i++ {
+						cosr, sinr, r := impl.Dlartg(f, g)
+						if i > l2 {
+							e[i-1] = r
+						}
+						f = cosr*d[i] + sinr*e[i]
+						e[i] = cosr*e[i] - sinr*d[i]
+						g = sinr * d[i+1]
+						d[i+1] *= cosr
+						cosl, sinl, r = impl.Dlartg(f, g)
+						d[i] = r
+						f = cosl*e[i] + sinl*d[i+1]
+						d[i+1] = cosl*d[i+1] - sinl*e[i]
+						if i < m-2 {
+							g = sinl * e[i+1]
+							e[i+1] = cosl * e[i+1]
+						}
+						work[i-l2] = cosr
+						work[i-l2+nm1] = sinr
+						work[i-l2+nm12] = cosl
+						work[i-l2+nm13] = sinl
+					}
+					e[m-2] = f
+					if ncvt > 0 {
+						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncvt, work, work[n-1:], vt[l2*ldvt:], ldvt)
+					}
+					if nru > 0 {
+						impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, m-l2, work[nm12:], work[nm13:], u[l2:], ldu)
+					}
+					if ncc > 0 {
+						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncc, work[nm12:], work[nm13:], c[l2*ldc:], ldc)
+					}
+					if math.Abs(e[m-2]) <= thresh {
+						e[m-2] = 0
+					}
+				} else {
+					// Chase bulge from bottom to top. Save cosines and sines for
+					// later singular vector updates.
+					f := (math.Abs(d[m-1]) - shift) * (math.Copysign(1, d[m-1]) + shift/d[m-1])
+					g := e[m-2]
+					for i := m - 1; i > l2; i-- {
+						cosr, sinr, r := impl.Dlartg(f, g)
+						if i < m-1 {
+							e[i] = r
+						}
+						f = cosr*d[i] + sinr*e[i-1]
+						e[i-1] = cosr*e[i-1] - sinr*d[i]
+						g = sinr * d[i-1]
+						d[i-1] *= cosr
+						cosl, sinl, r := impl.Dlartg(f, g)
+						d[i] = r
+						f = cosl*e[i-1] + sinl*d[i-1]
+						d[i-1] = cosl*d[i-1] - sinl*e[i-1]
+						if i > l2+1 {
+							g = sinl * e[i-2]
+							e[i-2] *= cosl
+						}
+						work[i-l2-1] = cosr
+						work[i-l2+nm1-1] = -sinr
+						work[i-l2+nm12-1] = cosl
+						work[i-l2+nm13-1] = -sinl
+					}
+					e[l2] = f
+					if math.Abs(e[l2]) <= thresh {
+						e[l2] = 0
+					}
+					if ncvt > 0 {
+						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncvt, work[nm12:], work[nm13:], vt[l2*ldvt:], ldvt)
+					}
+					if nru > 0 {
+						impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, nru, m-l2, work, work[n-1:], u[l2:], ldu)
+					}
+					if ncc > 0 {
+						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncc, work, work[n-1:], c[l2*ldc:], ldc)
+					}
+				}
+			}
+		}
+	}
+
+	// All singular values converged, make them positive.
+	for i := 0; i < n; i++ {
+		if d[i] < 0 {
+			d[i] *= -1
+			if ncvt > 0 {
+				bi.Dscal(ncvt, -1, vt[i*ldvt:], 1)
+			}
+		}
+	}
+
+	// Sort the singular values in decreasing order.
+	for i := 0; i < n-1; i++ {
+		isub := 0
+		smin := d[0]
+		for j := 1; j < n-i; j++ {
+			if d[j] <= smin {
+				isub = j
+				smin = d[j]
+			}
+		}
+		if isub != n-i {
+			// Swap singular values and vectors.
+			d[isub] = d[n-i-1]
+			d[n-i-1] = smin
+			if ncvt > 0 {
+				bi.Dswap(ncvt, vt[isub*ldvt:], 1, vt[(n-i-1)*ldvt:], 1)
+			}
+			if nru > 0 {
+				bi.Dswap(nru, u[isub:], ldu, u[n-i-1:], ldu)
+			}
+			if ncc > 0 {
+				bi.Dswap(ncc, c[isub*ldc:], 1, c[(n-i-1)*ldc:], 1)
+			}
+		}
+	}
+	info = 0
+	for i := 0; i < n-1; i++ {
+		if e[i] != 0 {
+			info++
+		}
+	}
+	return info == 0
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgebak.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebak.go
new file mode 100644
index 00000000000..b6af972e6a3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebak.go
@@ -0,0 +1,91 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgebak updates an n×m matrix V as
+//
+//	V = P D V       if side == lapack.EVRight,
+//	V = P D^{-1} V  if side == lapack.EVLeft,
+//
+// where P and D are n×n permutation and scaling matrices, respectively,
+// implicitly represented by job, scale, ilo and ihi as returned by Dgebal.
+//
+// Typically, columns of the matrix V contain the right or left (determined by
+// side) eigenvectors of the balanced matrix output by Dgebal, and Dgebak forms
+// the eigenvectors of the original matrix.
+//
+// Dgebak is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgebak(job lapack.BalanceJob, side lapack.EVSide, n, ilo, ihi int, scale []float64, m int, v []float64, ldv int) {
+	switch {
+	case job != lapack.BalanceNone && job != lapack.Permute && job != lapack.Scale && job != lapack.PermuteScale:
+		panic(badBalanceJob)
+	case side != lapack.EVLeft && side != lapack.EVRight:
+		panic(badEVSide)
+	case n < 0:
+		panic(nLT0)
+	case ilo < 0 || max(0, n-1) < ilo:
+		panic(badIlo)
+	case ihi < min(ilo, n-1) || n <= ihi:
+		panic(badIhi)
+	case m < 0:
+		panic(mLT0)
+	case ldv < max(1, m):
+		panic(badLdV)
+	}
+
+	// Quick return if V has no rows or no columns.
+	if n == 0 || m == 0 {
+		return
+	}
+
+	if len(scale) < n {
+		panic(shortScale)
+	}
+	if len(v) < (n-1)*ldv+m {
+		panic(shortV)
+	}
+
+	// Quick return if no balancing was requested; V is left unchanged.
+	if job == lapack.BalanceNone {
+		return
+	}
+
+	bi := blas64.Implementation()
+	if ilo != ihi && job != lapack.Permute {
+		// Backward balance: rows ilo..ihi of V are multiplied by scale[i] (right) or 1/scale[i] (left).
+		if side == lapack.EVRight {
+			for i := ilo; i <= ihi; i++ {
+				bi.Dscal(m, scale[i], v[i*ldv:], 1)
+			}
+		} else {
+			for i := ilo; i <= ihi; i++ {
+				bi.Dscal(m, 1/scale[i], v[i*ldv:], 1)
+			}
+		}
+	}
+	if job == lapack.Scale {
+		return // Only scaling was requested, so the permutation step is skipped.
+	}
+	// Backward permutation: outside ilo..ihi, scale[i] holds the index of the row swapped with row i.
+	for i := ilo - 1; i >= 0; i-- {
+		k := int(scale[i])
+		if k == i {
+			continue
+		}
+		bi.Dswap(m, v[i*ldv:], 1, v[k*ldv:], 1)
+	}
+	for i := ihi + 1; i < n; i++ {
+		k := int(scale[i])
+		if k == i {
+			continue
+		}
+		bi.Dswap(m, v[i*ldv:], 1, v[k*ldv:], 1)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgebal.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebal.go
new file mode 100644
index 00000000000..7623e2faeef
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebal.go
@@ -0,0 +1,248 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgebal balances an n×n matrix A. Balancing consists of two stages, permuting
+// and scaling. Both steps are optional and depend on the value of job.
+//
+// Permuting consists of applying a permutation matrix P such that the matrix
+// that results from Pᵀ*A*P takes the upper block triangular form
+//
+//	         [ T1  X  Y  ]
+//	Pᵀ A P = [  0  B  Z  ],
+//	         [  0  0  T2 ]
+//
+// where T1 and T2 are upper triangular matrices and B contains at least one
+// nonzero off-diagonal element in each row and column. The indices ilo and ihi
+// mark the starting and ending columns of the submatrix B. The eigenvalues of A
+// isolated in the first 0 to ilo-1 and last ihi+1 to n-1 elements on the
+// diagonal can be read off without any roundoff error.
+//
+// Scaling consists of applying a diagonal similarity transformation D such that
+// D^{-1}*B*D has the 1-norm of each row and its corresponding column nearly
+// equal. The output matrix is
+//
+//	[ T1     X*D          Y    ]
+//	[  0  inv(D)*B*D  inv(D)*Z ].
+//	[  0      0           T2   ]
+//
+// Scaling may reduce the 1-norm of the matrix, and improve the accuracy of
+// the computed eigenvalues and/or eigenvectors.
+//
+// job specifies the operations that will be performed on A.
+// If job is lapack.BalanceNone, Dgebal sets scale[i] = 1 for all i and returns ilo=0, ihi=n-1.
+// If job is lapack.Permute, only permuting will be done.
+// If job is lapack.Scale, only scaling will be done.
+// If job is lapack.PermuteScale, both permuting and scaling will be done.
+//
+// On return, if job is lapack.Permute or lapack.PermuteScale, it will hold that
+//
+//	A[i,j] == 0,   for i > j and j ∈ {0, ..., ilo-1, ihi+1, ..., n-1}.
+//
+// If job is lapack.BalanceNone or lapack.Scale, or if n == 0, it will hold that
+//
+//	ilo == 0 and ihi == n-1.
+//
+// On return, scale will contain information about the permutations and scaling
+// factors applied to A. If π(j) denotes the index of the column interchanged
+// with column j, and D[j,j] denotes the scaling factor applied to column j,
+// then
+//
+//	scale[j] == π(j),     for j ∈ {0, ..., ilo-1, ihi+1, ..., n-1},
+//	         == D[j,j],   for j ∈ {ilo, ..., ihi}.
+//
+// scale must have length equal to n, otherwise Dgebal will panic.
+//
+// Dgebal is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgebal(job lapack.BalanceJob, n int, a []float64, lda int, scale []float64) (ilo, ihi int) {
+	switch {
+	case job != lapack.BalanceNone && job != lapack.Permute && job != lapack.Scale && job != lapack.PermuteScale:
+		panic(badBalanceJob)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	ilo = 0
+	ihi = n - 1
+
+	if n == 0 {
+		return ilo, ihi
+	}
+
+	if len(scale) != n {
+		panic(shortScale)
+	}
+
+	if job == lapack.BalanceNone {
+		for i := range scale {
+			scale[i] = 1
+		}
+		return ilo, ihi
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+
+	bi := blas64.Implementation()
+	swapped := true
+
+	if job == lapack.Scale {
+		goto scaling
+	}
+
+	// Permutation to isolate eigenvalues if possible.
+	//
+	// Search for rows isolating an eigenvalue and push them down.
+	for swapped {
+		swapped = false
+	rows:
+		for i := ihi; i >= 0; i-- {
+			for j := 0; j <= ihi; j++ {
+				if i == j {
+					continue
+				}
+				if a[i*lda+j] != 0 {
+					continue rows
+				}
+			}
+			// Row i has only zero off-diagonal elements in columns 0..ihi
+			// (ilo is still 0 here); record the interchange in scale[ihi].
+			scale[ihi] = float64(i)
+			if i != ihi {
+				bi.Dswap(ihi+1, a[i:], lda, a[ihi:], lda)
+				bi.Dswap(n, a[i*lda:], 1, a[ihi*lda:], 1)
+			}
+			if ihi == 0 { // The active block has shrunk to a single element.
+				scale[0] = 1
+				return ilo, ihi
+			}
+			ihi--
+			swapped = true
+			break
+		}
+	}
+	// Search for columns isolating an eigenvalue and push them left.
+	swapped = true
+	for swapped {
+		swapped = false
+	columns:
+		for j := ilo; j <= ihi; j++ {
+			for i := ilo; i <= ihi; i++ {
+				if i == j {
+					continue
+				}
+				if a[i*lda+j] != 0 {
+					continue columns
+				}
+			}
+			// Column j has only zero off-diagonal elements in the
+			// block A[ilo:ihi+1,ilo:ihi+1]; record the interchange in scale[ilo].
+			scale[ilo] = float64(j)
+			if j != ilo {
+				bi.Dswap(ihi+1, a[j:], lda, a[ilo:], lda)
+				bi.Dswap(n-ilo, a[j*lda+ilo:], 1, a[ilo*lda+ilo:], 1)
+			}
+			swapped = true
+			ilo++
+			break
+		}
+	}
+
+scaling:
+	for i := ilo; i <= ihi; i++ {
+		scale[i] = 1 // Start from identity scaling on rows/columns ilo..ihi.
+	}
+
+	if job == lapack.Permute {
+		return ilo, ihi
+	}
+
+	// Balance the submatrix in rows ilo to ihi.
+
+	const (
+		// sclfac should be a power of 2 to avoid roundoff errors.
+		// Elements of scale are restricted to powers of sclfac,
+		// therefore the matrix will be only nearly balanced.
+		sclfac = 2
+		// factor determines the minimum reduction of the row and column
+		// norms that is considered non-negligible. It must be less than 1.
+		factor = 0.95
+	)
+	sfmin1 := dlamchS / dlamchP
+	sfmax1 := 1 / sfmin1
+	sfmin2 := sfmin1 * sclfac
+	sfmax2 := 1 / sfmin2
+
+	// Repeat sweeps over ilo..ihi until one completes without rescaling.
+	var conv bool
+	for !conv {
+		conv = true
+		for i := ilo; i <= ihi; i++ {
+			c := bi.Dnrm2(ihi-ilo+1, a[ilo*lda+i:], lda) // Column i, rows ilo..ihi.
+			r := bi.Dnrm2(ihi-ilo+1, a[i*lda+ilo:], 1) // Row i, columns ilo..ihi.
+			ica := bi.Idamax(ihi+1, a[i:], lda) // Column i, rows 0..ihi.
+			ca := math.Abs(a[ica*lda+i])
+			ira := bi.Idamax(n-ilo, a[i*lda+ilo:], 1) // Row i, columns ilo..n-1.
+			ra := math.Abs(a[i*lda+ilo+ira])
+
+			// Guard against zero c or r due to underflow.
+			if c == 0 || r == 0 {
+				continue
+			}
+			g := r / sclfac
+			f := 1.0 // Candidate factor; only ever multiplied or divided by sclfac.
+			s := c + r
+			for c < g && math.Max(f, math.Max(c, ca)) < sfmax2 && math.Min(r, math.Min(g, ra)) > sfmin2 {
+				if math.IsNaN(c + f + ca + r + g + ra) {
+					// Panic if NaN to avoid infinite loop.
+					panic("lapack: NaN")
+				}
+				f *= sclfac
+				c *= sclfac
+				ca *= sclfac
+				g /= sclfac
+				r /= sclfac
+				ra /= sclfac
+			}
+			g = c / sclfac // Now test the opposite direction, r versus c/sclfac.
+			for r <= g && math.Max(r, ra) < sfmax2 && math.Min(math.Min(f, c), math.Min(g, ca)) > sfmin2 {
+				f /= sclfac
+				c /= sclfac
+				ca /= sclfac
+				g /= sclfac
+				r *= sclfac
+				ra *= sclfac
+			}
+
+			if c+r >= factor*s {
+				// Reduction would be negligible.
+				continue
+			}
+			if f < 1 && scale[i] < 1 && f*scale[i] <= sfmin1 {
+				continue // f*scale[i] would fall to sfmin1 or below.
+			}
+			if f > 1 && scale[i] > 1 && scale[i] >= sfmax1/f {
+				continue // f*scale[i] would reach sfmax1 or above.
+			}
+
+			// Now balance: record f in scale[i], scale row i by 1/f and column i by f.
+			scale[i] *= f
+			bi.Dscal(n-ilo, 1/f, a[i*lda+ilo:], 1)
+			bi.Dscal(ihi+1, f, a[i:], lda)
+			conv = false
+		}
+	}
+	return ilo, ihi
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgebd2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebd2.go
new file mode 100644
index 00000000000..4f323ec500a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebd2.go
@@ -0,0 +1,88 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dgebd2 reduces an m×n matrix A to upper or lower bidiagonal form by an orthogonal
+// transformation.
+//
+//	Qᵀ * A * P = B
+//
+// if m >= n, B is upper bidiagonal, otherwise B is lower bidiagonal.
+// d is the diagonal, len = min(m,n)
+// e is the off-diagonal len = min(m,n)-1
+//
+// Dgebd2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgebd2(m, n int, a []float64, lda int, d, e, tauQ, tauP, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if A is empty.
+	minmn := min(m, n)
+	if minmn == 0 {
+		return
+	}
+
+	switch {
+	case len(d) < minmn:
+		panic(shortD)
+	case len(e) < minmn-1:
+		panic(shortE)
+	case len(tauQ) < minmn:
+		panic(shortTauQ)
+	case len(tauP) < minmn:
+		panic(shortTauP)
+	case len(work) < max(m, n):
+		panic(shortWork)
+	}
+
+	if m >= n {
+		for i := 0; i < n; i++ {
+			a[i*lda+i], tauQ[i] = impl.Dlarfg(m-i, a[i*lda+i], a[min(i+1, m-1)*lda+i:], lda)
+			d[i] = a[i*lda+i] // Diagonal element of B.
+			a[i*lda+i] = 1
+			// Apply H_i to A[i:m, i+1:n] from the left.
+			if i < n-1 {
+				impl.Dlarf(blas.Left, m-i, n-i-1, a[i*lda+i:], lda, tauQ[i], a[i*lda+i+1:], lda, work)
+			}
+			a[i*lda+i] = d[i] // Restore the diagonal after applying the reflector.
+			if i < n-1 {
+				a[i*lda+i+1], tauP[i] = impl.Dlarfg(n-i-1, a[i*lda+i+1], a[i*lda+min(i+2, n-1):], 1)
+				e[i] = a[i*lda+i+1] // Superdiagonal element of B.
+				a[i*lda+i+1] = 1
+				impl.Dlarf(blas.Right, m-i-1, n-i-1, a[i*lda+i+1:], 1, tauP[i], a[(i+1)*lda+i+1:], lda, work)
+				a[i*lda+i+1] = e[i]
+			} else {
+				tauP[i] = 0
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		a[i*lda+i], tauP[i] = impl.Dlarfg(n-i, a[i*lda+i], a[i*lda+min(i+1, n-1):], 1)
+		d[i] = a[i*lda+i] // Diagonal element of B.
+		a[i*lda+i] = 1
+		if i < m-1 {
+			impl.Dlarf(blas.Right, m-i-1, n-i, a[i*lda+i:], 1, tauP[i], a[(i+1)*lda+i:], lda, work)
+		}
+		a[i*lda+i] = d[i] // Restore the diagonal after applying the reflector.
+		if i < m-1 {
+			a[(i+1)*lda+i], tauQ[i] = impl.Dlarfg(m-i-1, a[(i+1)*lda+i], a[min(i+2, m-1)*lda+i:], lda)
+			e[i] = a[(i+1)*lda+i] // Subdiagonal element of B.
+			a[(i+1)*lda+i] = 1
+			impl.Dlarf(blas.Left, m-i-1, n-i-1, a[(i+1)*lda+i:], lda, tauQ[i], a[(i+1)*lda+i+1:], lda, work)
+			a[(i+1)*lda+i] = e[i]
+		} else {
+			tauQ[i] = 0
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgebrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebrd.go
new file mode 100644
index 00000000000..6b6654ba6b8
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgebrd.go
@@ -0,0 +1,169 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgebrd reduces a general m×n matrix A to upper or lower bidiagonal form B by
+// an orthogonal transformation:
+//
+//	Qᵀ * A * P = B.
+//
+// The diagonal elements of B are stored in d and the off-diagonal elements are stored
+// in e. These are additionally stored along the diagonal of A and the off-diagonal
+// of A. If m >= n B is an upper-bidiagonal matrix, and if m < n B is a
+// lower-bidiagonal matrix.
+//
+// The remaining elements of A store the data needed to construct Q and P.
+// The matrices Q and P are products of elementary reflectors
+//
+//	if m >= n, Q = H_0 * H_1 * ... * H_{n-1},
+//	           P = G_0 * G_1 * ... * G_{n-2},
+//	if m < n,  Q = H_0 * H_1 * ... * H_{m-2},
+//	           P = G_0 * G_1 * ... * G_{m-1},
+//
+// where
+//
+//	H_i = I - tauQ[i] * v_i * v_iᵀ,
+//	G_i = I - tauP[i] * u_i * u_iᵀ.
+//
+// As an example, on exit the entries of A when m = 6, and n = 5
+//
+//	[ d   e  u1  u1  u1]
+//	[v1   d   e  u2  u2]
+//	[v1  v2   d   e  u3]
+//	[v1  v2  v3   d   e]
+//	[v1  v2  v3  v4   d]
+//	[v1  v2  v3  v4  v5]
+//
+// and when m = 5, n = 6
+//
+//	[ d  u1  u1  u1  u1  u1]
+//	[ e   d  u2  u2  u2  u2]
+//	[v1   e   d  u3  u3  u3]
+//	[v1  v2   e   d  u4  u4]
+//	[v1  v2  v3   e   d  u5]
+//
+// d, tauQ, and tauP must all have length at least min(m,n), and e must have
+// length at least min(m,n) - 1, unless lwork is -1 when there is no check except for
+// work which must have a length of at least one.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// lwork must be at least max(1,m,n) or be -1, otherwise Dgebrd will panic.
+// Dgebrd is a blocked decomposition, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Dgebrd,
+// the optimal work length will be stored into work[0].
+//
+// Dgebrd is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgebrd(m, n int, a []float64, lda int, d, e, tauQ, tauP, work []float64, lwork int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, max(m, n)) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if A is empty, storing a workspace size of 1 in work[0].
+	minmn := min(m, n)
+	if minmn == 0 {
+		work[0] = 1
+		return
+	}
+
+	nb := impl.Ilaenv(1, "DGEBRD", " ", m, n, -1, -1)
+	lwkopt := (m + n) * nb
+	if lwork == -1 { // Workspace query only.
+		work[0] = float64(lwkopt)
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(d) < minmn:
+		panic(shortD)
+	case len(e) < minmn-1:
+		panic(shortE)
+	case len(tauQ) < minmn:
+		panic(shortTauQ)
+	case len(tauP) < minmn:
+		panic(shortTauP)
+	}
+
+	nx := minmn
+	ws := max(m, n)
+	if 1 < nb && nb < minmn {
+		// At least one blocked operation can be done.
+		// Get the crossover point nx.
+		nx = max(nb, impl.Ilaenv(3, "DGEBRD", " ", m, n, -1, -1))
+		// Determine when to switch from blocked to unblocked code.
+		if nx < minmn {
+			// At least one blocked operation will be done.
+			ws = (m + n) * nb
+			if lwork < ws {
+				// Not enough work space for the optimal nb,
+				// consider using a smaller block size.
+				nbmin := impl.Ilaenv(2, "DGEBRD", " ", m, n, -1, -1)
+				if lwork >= (m+n)*nbmin {
+					// Enough work space for minimum block size.
+					nb = lwork / (m + n)
+				} else {
+					nb = minmn
+					nx = minmn // With nx == minmn the blocked loop below does not run.
+				}
+			}
+		}
+	}
+	bi := blas64.Implementation()
+	ldworkx := nb
+	ldworky := nb
+	var i int
+	for i = 0; i < minmn-nx; i += nb {
+		// Reduce rows and columns i:i+nb to bidiagonal form and return
+		// the matrices X and Y which are needed to update the unreduced
+		// part of the matrix.
+		// X is stored in the first m rows of work, y in the next rows.
+		x := work[:m*ldworkx]
+		y := work[m*ldworkx:]
+		impl.Dlabrd(m-i, n-i, nb, a[i*lda+i:], lda,
+			d[i:], e[i:], tauQ[i:], tauP[i:],
+			x, ldworkx, y, ldworky)
+
+		// Update the trailing submatrix A[i+nb:m,i+nb:n], using an update
+		// of the form  A := A - V*Y**T - X*U**T
+		bi.Dgemm(blas.NoTrans, blas.Trans, m-i-nb, n-i-nb, nb,
+			-1, a[(i+nb)*lda+i:], lda, y[nb*ldworky:], ldworky,
+			1, a[(i+nb)*lda+i+nb:], lda)
+
+		bi.Dgemm(blas.NoTrans, blas.NoTrans, m-i-nb, n-i-nb, nb,
+			-1, x[nb*ldworkx:], ldworkx, a[i*lda+i+nb:], lda,
+			1, a[(i+nb)*lda+i+nb:], lda)
+
+		// Copy diagonal and off-diagonal elements of B back into A.
+		if m >= n {
+			for j := i; j < i+nb; j++ {
+				a[j*lda+j] = d[j]
+				a[j*lda+j+1] = e[j]
+			}
+		} else {
+			for j := i; j < i+nb; j++ {
+				a[j*lda+j] = d[j]
+				a[(j+1)*lda+j] = e[j]
+			}
+		}
+	}
+	// Use unblocked code to reduce the remainder of the matrix, starting at row and column i.
+	impl.Dgebd2(m-i, n-i, a[i*lda+i:], lda, d[i:], e[i:], tauQ[i:], tauP[i:], work)
+	work[0] = float64(ws) // Report the workspace size ws through work[0].
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgecon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgecon.go
new file mode 100644
index 00000000000..1d04644142e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgecon.go
@@ -0,0 +1,106 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgecon estimates and returns the reciprocal of the condition number of the
+// n×n matrix A, in either the 1-norm or the ∞-norm, using the LU factorization
+// computed by Dgetrf.
+//
+// An estimate is obtained for norm(A⁻¹), and the reciprocal of the condition
+// number rcond is computed as
+//
+//	rcond = 1 / ( norm(A) * norm(A⁻¹) ).
+//
+// If n is zero, rcond is always 1.
+//
+// anorm is the 1-norm or the ∞-norm of the original matrix A. anorm must be
+// non-negative, otherwise Dgecon will panic. If anorm is 0 or infinity, Dgecon
+// returns 0. If anorm is NaN, Dgecon returns NaN.
+//
+// work must have length at least 4*n and iwork must have length at least n,
+// otherwise Dgecon will panic.
+func (impl Implementation) Dgecon(norm lapack.MatrixNorm, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 {
+	switch {
+	case norm != lapack.MaxColumnSum && norm != lapack.MaxRowSum:
+		panic(badNorm)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case anorm < 0:
+		panic(negANorm)
+	}
+
+	// Quick return if A is empty.
+	if n == 0 {
+		return 1
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(work) < 4*n:
+		panic(shortWork)
+	case len(iwork) < n:
+		panic(shortIWork)
+	}
+
+	// Quick return for zero, NaN or infinite anorm.
+	switch {
+	case anorm == 0:
+		return 0
+	case math.IsNaN(anorm):
+		// Propagate NaN.
+		return anorm
+	case math.IsInf(anorm, 1):
+		return 0
+	}
+
+	bi := blas64.Implementation()
+	var rcond, ainvnm float64
+	var kase int
+	var normin bool
+	isave := new([3]int)
+	onenrm := norm == lapack.MaxColumnSum
+	smlnum := dlamchS
+	kase1 := 2
+	if onenrm {
+		kase1 = 1
+	}
+	for { // Repeat until Dlacn2 reports kase == 0.
+		ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave)
+		if kase == 0 {
+			if ainvnm != 0 {
+				rcond = (1 / ainvnm) / anorm
+			}
+			return rcond
+		}
+		var sl, su float64
+		if kase == kase1 { // Untransposed case: lower (unit) factor first, then upper.
+			sl = impl.Dlatrs(blas.Lower, blas.NoTrans, blas.Unit, normin, n, a, lda, work, work[2*n:])
+			su = impl.Dlatrs(blas.Upper, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[3*n:])
+		} else { // Transposed case: upper factor first, then lower (unit).
+			su = impl.Dlatrs(blas.Upper, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[3*n:])
+			sl = impl.Dlatrs(blas.Lower, blas.Trans, blas.Unit, normin, n, a, lda, work, work[2*n:])
+		}
+		scale := sl * su
+		normin = true
+		if scale != 1 {
+			ix := bi.Idamax(n, work, 1)
+			if scale == 0 || scale < math.Abs(work[ix])*smlnum {
+				return rcond // Give up on the estimate; rcond is still its zero value here.
+			}
+			impl.Drscl(n, scale, work, 1)
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeev.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeev.go
new file mode 100644
index 00000000000..b49b66fc658
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeev.go
@@ -0,0 +1,287 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgeev computes the eigenvalues and, optionally, the left and/or right
+// eigenvectors for an n×n real nonsymmetric matrix A.
+//
+// The right eigenvector v_j of A corresponding to an eigenvalue λ_j
+// is defined by
+//
+//	A v_j = λ_j v_j,
+//
+// and the left eigenvector u_j corresponding to an eigenvalue λ_j is defined by
+//
+//	u_jᴴ A = λ_j u_jᴴ,
+//
+// where u_jᴴ is the conjugate transpose of u_j.
+//
+// On return, A will be overwritten and the left and right eigenvectors will be
+// stored, respectively, in the columns of the n×n matrices VL and VR in the
+// same order as their eigenvalues. If the j-th eigenvalue is real, then
+//
+//	u_j = VL[:,j],
+//	v_j = VR[:,j],
+//
+// and if it is not real, then j and j+1 form a complex conjugate pair and the
+// eigenvectors can be recovered as
+//
+//	u_j     = VL[:,j] + i*VL[:,j+1],
+//	u_{j+1} = VL[:,j] - i*VL[:,j+1],
+//	v_j     = VR[:,j] + i*VR[:,j+1],
+//	v_{j+1} = VR[:,j] - i*VR[:,j+1],
+//
+// where i is the imaginary unit. The computed eigenvectors are normalized to
+// have Euclidean norm equal to 1 and largest component real.
+//
+// Left eigenvectors will be computed only if jobvl == lapack.LeftEVCompute,
+// otherwise jobvl must be lapack.LeftEVNone.
+// Right eigenvectors will be computed only if jobvr == lapack.RightEVCompute,
+// otherwise jobvr must be lapack.RightEVNone.
+// For other values of jobvl and jobvr Dgeev will panic.
+//
+// wr and wi contain the real and imaginary parts, respectively, of the computed
+// eigenvalues. Complex conjugate pairs of eigenvalues appear consecutively with
+// the eigenvalue having the positive imaginary part first.
+// wr and wi must have length n, and Dgeev will panic otherwise.
+//
+// work must have length at least lwork and lwork must be at least max(1,4*n) if
+// the left or right eigenvectors are computed, and at least max(1,3*n) if no
+// eigenvectors are computed. For good performance, lwork must generally be
+// larger.  On return, optimal value of lwork will be stored in work[0].
+//
+// If lwork == -1, instead of performing Dgeev, the function only calculates the
+// optimal value of lwork and stores it into work[0].
+//
+// On return, first is the index of the first valid eigenvalue. If first == 0,
+// all eigenvalues and eigenvectors have been computed. If first is positive,
+// Dgeev failed to compute all the eigenvalues, no eigenvectors have been
+// computed and wr[first:] and wi[first:] contain those eigenvalues which have
+// converged.
+func (impl Implementation) Dgeev(jobvl lapack.LeftEVJob, jobvr lapack.RightEVJob, n int, a []float64, lda int, wr, wi []float64, vl []float64, ldvl int, vr []float64, ldvr int, work []float64, lwork int) (first int) {
+	wantvl := jobvl == lapack.LeftEVCompute
+	wantvr := jobvr == lapack.RightEVCompute
+	var minwrk int
+	if wantvl || wantvr {
+		minwrk = max(1, 4*n)
+	} else {
+		minwrk = max(1, 3*n)
+	}
+	switch {
+	case jobvl != lapack.LeftEVCompute && jobvl != lapack.LeftEVNone:
+		panic(badLeftEVJob)
+	case jobvr != lapack.RightEVCompute && jobvr != lapack.RightEVNone:
+		panic(badRightEVJob)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldvl < 1 || (ldvl < n && wantvl):
+		panic(badLdVL)
+	case ldvr < 1 || (ldvr < n && wantvr):
+		panic(badLdVR)
+	case lwork < minwrk && lwork != -1:
+		panic(badLWork)
+	case len(work) < lwork:
+		panic(shortWork)
+	}
+
+	// Quick return if A is empty.
+	if n == 0 {
+		work[0] = 1
+		return 0
+	}
+
+	maxwrk := 2*n + n*impl.Ilaenv(1, "DGEHRD", " ", n, 1, n, 0)
+	if wantvl || wantvr {
+		maxwrk = max(maxwrk, 2*n+(n-1)*impl.Ilaenv(1, "DORGHR", " ", n, 1, n, -1))
+		impl.Dhseqr(lapack.EigenvaluesAndSchur, lapack.SchurOrig, n, 0, n-1,
+			a, lda, wr, wi, nil, n, work, -1)
+		maxwrk = max(maxwrk, max(n+1, n+int(work[0])))
+		side := lapack.EVLeft
+		if wantvr {
+			side = lapack.EVRight
+		}
+		impl.Dtrevc3(side, lapack.EVAllMulQ, nil, n, a, lda, vl, ldvl, vr, ldvr,
+			n, work, -1)
+		maxwrk = max(maxwrk, n+int(work[0]))
+		maxwrk = max(maxwrk, 4*n)
+	} else {
+		impl.Dhseqr(lapack.EigenvaluesOnly, lapack.SchurNone, n, 0, n-1,
+			a, lda, wr, wi, vr, ldvr, work, -1)
+		maxwrk = max(maxwrk, max(n+1, n+int(work[0])))
+	}
+	maxwrk = max(maxwrk, minwrk)
+
+	if lwork == -1 { // Workspace query only.
+		work[0] = float64(maxwrk)
+		return 0
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(wr) != n:
+		panic(badLenWr)
+	case len(wi) != n:
+		panic(badLenWi)
+	case len(vl) < (n-1)*ldvl+n && wantvl:
+		panic(shortVL)
+	case len(vr) < (n-1)*ldvr+n && wantvr:
+		panic(shortVR)
+	}
+
+	// Get machine constants.
+	smlnum := math.Sqrt(dlamchS) / dlamchP
+	bignum := 1 / smlnum
+
+	// Scale A if max element outside range [smlnum,bignum].
+	anrm := impl.Dlange(lapack.MaxAbs, n, n, a, lda, nil)
+	var scalea bool
+	var cscale float64
+	if 0 < anrm && anrm < smlnum {
+		scalea = true
+		cscale = smlnum
+	} else if anrm > bignum {
+		scalea = true
+		cscale = bignum
+	}
+	if scalea {
+		impl.Dlascl(lapack.General, 0, 0, anrm, cscale, n, n, a, lda)
+	}
+
+	// Balance the matrix.
+	workbal := work[:n] // Balancing data from Dgebal, passed to Dgebak below.
+	ilo, ihi := impl.Dgebal(lapack.PermuteScale, n, a, lda, workbal)
+
+	// Reduce to upper Hessenberg form.
+	iwrk := 2 * n
+	tau := work[n : iwrk-1] // Length n-1; shared by the Dgehrd and Dorghr calls below.
+	impl.Dgehrd(n, ilo, ihi, a, lda, tau, work[iwrk:], lwork-iwrk)
+
+	var side lapack.EVSide
+	if wantvl {
+		side = lapack.EVLeft
+		// Copy Householder vectors to VL.
+		impl.Dlacpy(blas.Lower, n, n, a, lda, vl, ldvl)
+		// Generate orthogonal matrix in VL.
+		impl.Dorghr(n, ilo, ihi, vl, ldvl, tau, work[iwrk:], lwork-iwrk)
+		// Perform QR iteration, accumulating Schur vectors in VL.
+		iwrk = n
+		first = impl.Dhseqr(lapack.EigenvaluesAndSchur, lapack.SchurOrig, n, ilo, ihi,
+			a, lda, wr, wi, vl, ldvl, work[iwrk:], lwork-iwrk)
+		if wantvr {
+			// Want left and right eigenvectors.
+			// Copy Schur vectors to VR.
+			side = lapack.EVBoth
+			impl.Dlacpy(blas.All, n, n, vl, ldvl, vr, ldvr)
+		}
+	} else if wantvr {
+		side = lapack.EVRight
+		// Copy Householder vectors to VR.
+		impl.Dlacpy(blas.Lower, n, n, a, lda, vr, ldvr)
+		// Generate orthogonal matrix in VR.
+		impl.Dorghr(n, ilo, ihi, vr, ldvr, tau, work[iwrk:], lwork-iwrk)
+		// Perform QR iteration, accumulating Schur vectors in VR.
+		iwrk = n
+		first = impl.Dhseqr(lapack.EigenvaluesAndSchur, lapack.SchurOrig, n, ilo, ihi,
+			a, lda, wr, wi, vr, ldvr, work[iwrk:], lwork-iwrk)
+	} else {
+		// Compute eigenvalues only.
+		iwrk = n
+		first = impl.Dhseqr(lapack.EigenvaluesOnly, lapack.SchurNone, n, ilo, ihi,
+			a, lda, wr, wi, nil, 1, work[iwrk:], lwork-iwrk)
+	}
+
+	if first > 0 { // Not all eigenvalues were computed; no eigenvectors are formed.
+		if scalea {
+			// Undo scaling of wr[first:], wi[first:] and of the first ilo entries.
+			impl.Dlascl(lapack.General, 0, 0, cscale, anrm, n-first, 1, wr[first:], 1)
+			impl.Dlascl(lapack.General, 0, 0, cscale, anrm, n-first, 1, wi[first:], 1)
+			impl.Dlascl(lapack.General, 0, 0, cscale, anrm, ilo, 1, wr, 1)
+			impl.Dlascl(lapack.General, 0, 0, cscale, anrm, ilo, 1, wi, 1)
+		}
+		work[0] = float64(maxwrk)
+		return first
+	}
+
+	if wantvl || wantvr {
+		// Compute left and/or right eigenvectors.
+		impl.Dtrevc3(side, lapack.EVAllMulQ, nil, n,
+			a, lda, vl, ldvl, vr, ldvr, n, work[iwrk:], lwork-iwrk)
+	}
+	bi := blas64.Implementation()
+	if wantvl {
+		// Undo balancing of left eigenvectors.
+		impl.Dgebak(lapack.PermuteScale, lapack.EVLeft, n, ilo, ihi, workbal, n, vl, ldvl)
+		// Normalize left eigenvectors and make largest component real.
+		for i, wii := range wi {
+			if wii < 0 {
+				continue // Second member of a conjugate pair; handled with the first.
+			}
+			if wii == 0 { // Real eigenvalue: normalize column i alone.
+				scl := 1 / bi.Dnrm2(n, vl[i:], ldvl)
+				bi.Dscal(n, scl, vl[i:], ldvl)
+				continue
+			}
+			scl := 1 / impl.Dlapy2(bi.Dnrm2(n, vl[i:], ldvl), bi.Dnrm2(n, vl[i+1:], ldvl)) // Columns i and i+1 are normalized together.
+			bi.Dscal(n, scl, vl[i:], ldvl)
+			bi.Dscal(n, scl, vl[i+1:], ldvl)
+			for k := 0; k < n; k++ {
+				vi := vl[k*ldvl+i]
+				vi1 := vl[k*ldvl+i+1]
+				work[iwrk+k] = vi*vi + vi1*vi1
+			}
+			k := bi.Idamax(n, work[iwrk:iwrk+n], 1)
+			cs, sn, _ := impl.Dlartg(vl[k*ldvl+i], vl[k*ldvl+i+1])
+			bi.Drot(n, vl[i:], ldvl, vl[i+1:], ldvl, cs, sn)
+			vl[k*ldvl+i+1] = 0 // Set the column i+1 entry of row k to exactly zero.
+		}
+	}
+	if wantvr {
+		// Undo balancing of right eigenvectors.
+		impl.Dgebak(lapack.PermuteScale, lapack.EVRight, n, ilo, ihi, workbal, n, vr, ldvr)
+		// Normalize right eigenvectors and make largest component real.
+		for i, wii := range wi {
+			if wii < 0 {
+				continue // Second member of a conjugate pair; handled with the first.
+			}
+			if wii == 0 { // Real eigenvalue: normalize column i alone.
+				scl := 1 / bi.Dnrm2(n, vr[i:], ldvr)
+				bi.Dscal(n, scl, vr[i:], ldvr)
+				continue
+			}
+			scl := 1 / impl.Dlapy2(bi.Dnrm2(n, vr[i:], ldvr), bi.Dnrm2(n, vr[i+1:], ldvr)) // Columns i and i+1 are normalized together.
+			bi.Dscal(n, scl, vr[i:], ldvr)
+			bi.Dscal(n, scl, vr[i+1:], ldvr)
+			for k := 0; k < n; k++ {
+				vi := vr[k*ldvr+i]
+				vi1 := vr[k*ldvr+i+1]
+				work[iwrk+k] = vi*vi + vi1*vi1
+			}
+			k := bi.Idamax(n, work[iwrk:iwrk+n], 1)
+			cs, sn, _ := impl.Dlartg(vr[k*ldvr+i], vr[k*ldvr+i+1])
+			bi.Drot(n, vr[i:], ldvr, vr[i+1:], ldvr, cs, sn)
+			vr[k*ldvr+i+1] = 0 // Set the column i+1 entry of row k to exactly zero.
+		}
+	}
+
+	if scalea {
+		// Undo scaling of the computed eigenvalues.
+		impl.Dlascl(lapack.General, 0, 0, cscale, anrm, n-first, 1, wr[first:], 1)
+		impl.Dlascl(lapack.General, 0, 0, cscale, anrm, n-first, 1, wi[first:], 1)
+	}
+
+	work[0] = float64(maxwrk)
+	return first
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgehd2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgehd2.go
new file mode 100644
index 00000000000..64b0cb40283
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgehd2.go
@@ -0,0 +1,105 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dgehd2 reduces a block of a general n×n matrix A to upper Hessenberg form H
+// by an orthogonal similarity transformation Qᵀ * A * Q = H.
+//
+// The matrix Q is represented as a product of (ihi-ilo) elementary
+// reflectors
+//
+//	Q = H_{ilo} H_{ilo+1} ... H_{ihi-1}.
+//
+// Each H_i has the form
+//
+//	H_i = I - tau[i] * v * vᵀ
+//
+// where v is a real vector with v[0:i+1] = 0, v[i+1] = 1 and v[ihi+1:n] = 0.
+// v[i+2:ihi+1] is stored on exit in A[i+2:ihi+1,i].
+//
+// On entry, a contains the n×n general matrix to be reduced. On return, the
+// upper triangle and the first subdiagonal of A are overwritten with the upper
+// Hessenberg matrix H, and the elements below the first subdiagonal, with the
+// slice tau, represent the orthogonal matrix Q as a product of elementary
+// reflectors.
+//
+// The contents of A are illustrated by the following example, with n = 7, ilo =
+// 1 and ihi = 5.
+// On entry,
+//
+//	[ a   a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[                         a ]
+//
+// on return,
+//
+//	[ a   a   h   h   h   h   a ]
+//	[     a   h   h   h   h   a ]
+//	[     h   h   h   h   h   h ]
+//	[     v1  h   h   h   h   h ]
+//	[     v1  v2  h   h   h   h ]
+//	[     v1  v2  v3  h   h   h ]
+//	[                         a ]
+//
+// where a denotes an element of the original matrix A, h denotes a
+// modified element of the upper Hessenberg matrix H, and vi denotes an
+// element of the vector defining H_i.
+//
+// ilo and ihi determine the block of A that will be reduced to upper Hessenberg
+// form. It must hold that 0 <= ilo <= ihi < n if n > 0, and ilo == 0 and ihi ==
+// -1 if n == 0, otherwise Dgehd2 will panic.
+//
+// On return, tau will contain the scalar factors of the elementary reflectors.
+// It must have length equal to n-1, otherwise Dgehd2 will panic.
+//
+// work must have length at least n, otherwise Dgehd2 will panic.
+//
+// Dgehd2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgehd2(n, ilo, ihi int, a []float64, lda int, tau, work []float64) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case ilo < 0 || max(0, n-1) < ilo:
+		panic(badIlo)
+	case ihi < min(ilo, n-1) || n <= ihi: // For n == 0 this admits only ihi == -1.
+		panic(badIhi)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	switch { // Slice lengths are validated only once n > 0 is known.
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(tau) != n-1:
+		panic(badLenTau)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	for i := ilo; i < ihi; i++ {
+		// Compute elementary reflector H_i to annihilate A[i+2:ihi+1,i]; min keeps the slice start in range when i+2 == n.
+		var aii float64
+		aii, tau[i] = impl.Dlarfg(ihi-i, a[(i+1)*lda+i], a[min(i+2, n-1)*lda+i:], lda)
+		a[(i+1)*lda+i] = 1 // Temporarily store the unit leading entry of v in a; aii is restored below.
+
+		// Apply H_i to A[0:ihi+1,i+1:ihi+1] from the right.
+		impl.Dlarf(blas.Right, ihi+1, ihi-i, a[(i+1)*lda+i:], lda, tau[i], a[i+1:], lda, work)
+
+		// Apply H_i to A[i+1:ihi+1,i+1:n] from the left.
+		impl.Dlarf(blas.Left, ihi-i, n-i-1, a[(i+1)*lda+i:], lda, tau[i], a[(i+1)*lda+i+1:], lda, work)
+		a[(i+1)*lda+i] = aii
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgehrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgehrd.go
new file mode 100644
index 00000000000..ae1533029d4
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgehrd.go
@@ -0,0 +1,202 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgehrd reduces a block of a real n×n general matrix A to upper Hessenberg
+// form H by an orthogonal similarity transformation Qᵀ * A * Q = H.
+//
+// The matrix Q is represented as a product of (ihi-ilo) elementary
+// reflectors
+//
+//	Q = H_{ilo} H_{ilo+1} ... H_{ihi-1}.
+//
+// Each H_i has the form
+//
+//	H_i = I - tau[i] * v * vᵀ
+//
+// where v is a real vector with v[0:i+1] = 0, v[i+1] = 1 and v[ihi+1:n] = 0.
+// v[i+2:ihi+1] is stored on exit in A[i+2:ihi+1,i].
+//
+// On entry, a contains the n×n general matrix to be reduced. On return, the
+// upper triangle and the first subdiagonal of A will be overwritten with the
+// upper Hessenberg matrix H, and the elements below the first subdiagonal, with
+// the slice tau, represent the orthogonal matrix Q as a product of elementary
+// reflectors.
+//
+// The contents of a are illustrated by the following example, with n = 7, ilo =
+// 1 and ihi = 5.
+// On entry,
+//
+//	[ a   a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[     a   a   a   a   a   a ]
+//	[                         a ]
+//
+// on return,
+//
+//	[ a   a   h   h   h   h   a ]
+//	[     a   h   h   h   h   a ]
+//	[     h   h   h   h   h   h ]
+//	[     v1  h   h   h   h   h ]
+//	[     v1  v2  h   h   h   h ]
+//	[     v1  v2  v3  h   h   h ]
+//	[                         a ]
+//
+// where a denotes an element of the original matrix A, h denotes a
+// modified element of the upper Hessenberg matrix H, and vi denotes an
+// element of the vector defining H_i.
+//
+// ilo and ihi determine the block of A that will be reduced to upper Hessenberg
+// form. It must hold that 0 <= ilo <= ihi < n if n > 0, and ilo == 0 and ihi ==
+// -1 if n == 0, otherwise Dgehrd will panic.
+//
+// On return, tau will contain the scalar factors of the elementary reflectors.
+// Elements tau[:ilo] and tau[ihi:] will be set to zero. tau must have length
+// equal to n-1 if n > 0, otherwise Dgehrd will panic.
+//
+// work must have length at least max(1,lwork) and lwork must be -1 or at least
+// max(1,n), otherwise Dgehrd will panic. On return, work[0] contains the
+// optimal value of lwork.
+//
+// If lwork == -1, instead of performing Dgehrd, only the optimal value of lwork
+// will be stored in work[0].
+//
+// Dgehrd is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgehrd(n, ilo, ihi int, a []float64, lda int, tau, work []float64, lwork int) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case ilo < 0 || max(0, n-1) < ilo:
+		panic(badIlo)
+	case ihi < min(ilo, n-1) || n <= ihi:
+		panic(badIhi)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, n) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork): // work[0] is always written, including for the lwork == -1 query.
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		work[0] = 1
+		return
+	}
+
+	const (
+		nbmax = 64          // Upper bound on the block size nb.
+		ldt   = nbmax + 1   // Leading dimension of the matrix T stored in work.
+		tsize = ldt * nbmax // Number of elements of work reserved for T.
+	)
+	// Compute the workspace requirements.
+	nb := min(nbmax, impl.Ilaenv(1, "DGEHRD", " ", n, ilo, ihi, -1))
+	lwkopt := n*nb + tsize
+	if lwork == -1 {
+		work[0] = float64(lwkopt)
+		return
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+	if len(tau) != n-1 {
+		panic(badLenTau)
+	}
+
+	// Set tau[:ilo] and tau[ihi:] to zero.
+	for i := 0; i < ilo; i++ {
+		tau[i] = 0
+	}
+	for i := ihi; i < n-1; i++ {
+		tau[i] = 0
+	}
+
+	// Quick return if possible.
+	nh := ihi - ilo + 1 // Order of the block that is reduced.
+	if nh <= 1 {
+		work[0] = 1
+		return
+	}
+
+	// Determine the block size.
+	nbmin := 2
+	var nx int
+	if 1 < nb && nb < nh {
+		// Determine when to cross over from blocked to unblocked code
+		// (last block is always handled by unblocked code).
+		nx = max(nb, impl.Ilaenv(3, "DGEHRD", " ", n, ilo, ihi, -1))
+		if nx < nh {
+			// Determine if workspace is large enough for blocked code.
+			if lwork < n*nb+tsize {
+				// Not enough workspace to use optimal nb:
+				// determine the minimum value of nb, and reduce
+				// nb or force use of unblocked code.
+				nbmin = max(2, impl.Ilaenv(2, "DGEHRD", " ", n, ilo, ihi, -1))
+				if lwork >= n*nbmin+tsize {
+					nb = (lwork - tsize) / n
+				} else {
+					nb = 1
+				}
+			}
+		}
+	}
+	ldwork := nb // work is used as an n×nb matrix.
+
+	var i int
+	if nb < nbmin || nh <= nb {
+		// Use unblocked code below.
+		i = ilo
+	} else {
+		// Use blocked code.
+		bi := blas64.Implementation()
+		iwt := n * nb // Size of the matrix Y and index where the matrix T starts in work.
+		for i = ilo; i < ihi-nx; i += nb {
+			ib := min(nb, ihi-i)
+
+			// Reduce columns [i:i+ib] to Hessenberg form, returning the
+			// matrices V and T of the block reflector H = I - V*T*Vᵀ
+			// which performs the reduction, and also the matrix Y = A*V*T.
+			impl.Dlahr2(ihi+1, i+1, ib, a[i:], lda, tau[i:], work[iwt:], ldt, work, ldwork)
+
+			// Apply the block reflector H to A[:ihi+1,i+ib:ihi+1] from the
+			// right, computing  A := A - Y * Vᵀ. V[i+ib,i+ib-1] must be set
+			// to 1.
+			ei := a[(i+ib)*lda+i+ib-1]
+			a[(i+ib)*lda+i+ib-1] = 1
+			bi.Dgemm(blas.NoTrans, blas.Trans, ihi+1, ihi-i-ib+1, ib,
+				-1, work, ldwork,
+				a[(i+ib)*lda+i:], lda,
+				1, a[i+ib:], lda)
+			a[(i+ib)*lda+i+ib-1] = ei
+
+			// Apply the block reflector H to A[0:i+1,i+1:i+ib-1] from the
+			// right.
+			bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, i+1, ib-1,
+				1, a[(i+1)*lda+i:], lda, work, ldwork)
+			for j := 0; j <= ib-2; j++ {
+				bi.Daxpy(i+1, -1, work[j:], ldwork, a[i+j+1:], lda)
+			}
+
+			// Apply the block reflector H to A[i+1:ihi+1,i+ib:n] from the
+			// left.
+			impl.Dlarfb(blas.Left, blas.Trans, lapack.Forward, lapack.ColumnWise,
+				ihi-i, n-i-ib, ib,
+				a[(i+1)*lda+i:], lda, work[iwt:], ldt, a[(i+1)*lda+i+ib:], lda, work, ldwork)
+		}
+	}
+	// Use unblocked code to reduce the rest of the matrix.
+	impl.Dgehd2(n, i, ihi, a, lda, tau, work) // i is ilo, or the first column the blocked loop did not reduce.
+	work[0] = float64(lwkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgelq2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgelq2.go
new file mode 100644
index 00000000000..abc96f7d2a9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgelq2.go
@@ -0,0 +1,65 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dgelq2 computes the LQ factorization of the m×n matrix A.
+//
+// In an LQ factorization, L is a lower triangular m×n matrix, and Q is an n×n
+// orthonormal matrix.
+//
+// a is modified to contain the information to construct L and Q.
+// The lower triangle of a contains the matrix L. The upper triangular elements
+// (not including the diagonal) contain the elementary reflectors. tau is modified
+// to contain the reflector scales. tau must have length of at least k = min(m,n)
+// and this function will panic otherwise.
+//
+// See Dgeqr2 for a description of the elementary reflectors and orthonormal
+// matrix Q. Q is constructed as a product of these elementary reflectors,
+// Q = H_{k-1} * ... * H_1 * H_0.
+//
+// work is temporary storage of length at least m and this function will panic otherwise.
+//
+// Dgelq2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgelq2(m, n int, a []float64, lda int, tau, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	k := min(m, n) // Number of elementary reflectors that are generated.
+	if k == 0 {
+		return
+	}
+
+	switch { // Slice lengths are validated only for a non-empty problem.
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	case len(work) < m:
+		panic(shortWork)
+	}
+
+	for i := 0; i < k; i++ {
+		a[i*lda+i], tau[i] = impl.Dlarfg(n-i, a[i*lda+i], a[i*lda+min(i+1, n-1):], 1) // min keeps the slice start in range when i+1 == n.
+		if i < m-1 {
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1 // Temporarily store the unit leading entry of v in a; aii is restored below.
+			impl.Dlarf(blas.Right, m-i-1, n-i,
+				a[i*lda+i:], 1,
+				tau[i],
+				a[(i+1)*lda+i:], lda, // The update starts at row i+1 and covers m-i-1 rows.
+				work)
+			a[i*lda+i] = aii
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgelqf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgelqf.go
new file mode 100644
index 00000000000..f1fd13a0196
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgelqf.go
@@ -0,0 +1,97 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgelqf computes the LQ factorization of the m×n matrix A using a blocked
+// algorithm. See the documentation for Dgelq2 for a description of the
+// parameters at entry and exit.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= m, and this function will panic otherwise.
+// Dgelqf is a blocked LQ factorization, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Dgelqf,
+// the optimal work length will be stored into work[0].
+//
+// tau must have length at least min(m,n), and this function will panic otherwise.
+func (impl Implementation) Dgelqf(m, n int, a []float64, lda int, tau, work []float64, lwork int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, m) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork): // work[0] is always written, including for the lwork == -1 query.
+		panic(shortWork)
+	}
+
+	k := min(m, n)
+	if k == 0 { // Quick return: nothing to factorize.
+		work[0] = 1
+		return
+	}
+
+	nb := impl.Ilaenv(1, "DGELQF", " ", m, n, -1, -1)
+	if lwork == -1 { // Workspace query only.
+		work[0] = float64(m * nb)
+		return
+	}
+
+	if len(a) < (m-1)*lda+n {
+		panic(shortA)
+	}
+	if len(tau) < k {
+		panic(shortTau)
+	}
+
+	// Find the optimal blocking size based on the size of available memory
+	// and optimal machine parameters.
+	nbmin := 2
+	var nx int
+	iws := m // Workspace size reported in work[0] on return.
+	if 1 < nb && nb < k {
+		nx = max(0, impl.Ilaenv(3, "DGELQF", " ", m, n, -1, -1))
+		if nx < k {
+			iws = m * nb
+			if lwork < iws {
+				nb = lwork / m // Shrink the block size to what the supplied workspace allows.
+				nbmin = max(2, impl.Ilaenv(2, "DGELQF", " ", m, n, -1, -1))
+			}
+		}
+	}
+	ldwork := nb
+	// Compute blocked LQ factorization.
+	var i int
+	if nbmin <= nb && nb < k && nx < k {
+		for i = 0; i < k-nx; i += nb {
+			ib := min(k-i, nb)
+			impl.Dgelq2(ib, n-i, a[i*lda+i:], lda, tau[i:], work)
+			if i+ib < m { // Rows below the current block remain to be updated.
+				impl.Dlarft(lapack.Forward, lapack.RowWise, n-i, ib,
+					a[i*lda+i:], lda,
+					tau[i:],
+					work, ldwork)
+				impl.Dlarfb(blas.Right, blas.NoTrans, lapack.Forward, lapack.RowWise,
+					m-i-ib, n-i, ib,
+					a[i*lda+i:], lda,
+					work, ldwork,
+					a[(i+ib)*lda+i:], lda,
+					work[ib*ldwork:], ldwork) // Scratch space placed after the first ib rows of work.
+			}
+		}
+	}
+	// Perform unblocked LQ factorization on the remainder.
+	if i < k {
+		impl.Dgelq2(m-i, n-i, a[i*lda+i:], lda, tau[i:], work)
+	}
+	work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgels.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgels.go
new file mode 100644
index 00000000000..3018973a9e7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgels.go
@@ -0,0 +1,220 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgels solves an overdetermined or underdetermined linear system involving the
+// m×n matrix A or its transpose using a QR or LQ factorization of A. Dgels
+// returns false if the triangular solve with the computed factor fails (A is
+// singular), and true otherwise.
+//
+// The problem solved depends on the input parameters.
+//
+//  1. If m >= n and trans == blas.NoTrans, Dgels finds X such that || A*X - B||_2
+//     is minimized.
+//  2. If m < n and trans == blas.NoTrans, Dgels finds the minimum norm solution of
+//     A * X = B.
+//  3. If m >= n and trans == blas.Trans, Dgels finds the minimum norm solution of
+//     Aᵀ * X = B.
+//  4. If m < n and trans == blas.Trans, Dgels finds X such that || Aᵀ*X - B||_2
+//     is minimized.
+//
+// The least-squares solutions (cases 1 and 4) perform the minimization per
+// column of B. This is not the same as finding the minimum-norm matrix.
+//
+// A is modified during this call. b must hold a max(m,n)×nrhs matrix. On entry
+// it contains B, of size m×nrhs if trans == blas.NoTrans and n×nrhs otherwise.
+// On exit its leading submatrix contains the solution X, of size n×nrhs if
+// trans == blas.NoTrans and m×nrhs otherwise.
+//
+// work is temporary storage of length at least max(1,lwork). lwork must be -1
+// or at least max(1, mn+max(mn,nrhs)) with mn = min(m,n), otherwise Dgels will
+// panic. A longer work enables blocked algorithms. If lwork == -1, only the
+// optimal value of lwork is stored in work[0].
+func (impl Implementation) Dgels(trans blas.Transpose, m, n, nrhs int, a []float64, lda int, b []float64, ldb int, work []float64, lwork int) bool {
+	mn := min(m, n)
+	minwrk := mn + max(mn, nrhs)
+	switch {
+	case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	case lwork < max(1, minwrk) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if mn == 0 || nrhs == 0 {
+		impl.Dlaset(blas.All, max(m, n), nrhs, 0, 0, b, ldb)
+		work[0] = 1
+		return true
+	}
+
+	// Find optimal block size.
+	var nb int
+	if m >= n {
+		nb = impl.Ilaenv(1, "DGEQRF", " ", m, n, -1, -1)
+		if trans != blas.NoTrans {
+			nb = max(nb, impl.Ilaenv(1, "DORMQR", "LN", m, nrhs, n, -1))
+		} else {
+			nb = max(nb, impl.Ilaenv(1, "DORMQR", "LT", m, nrhs, n, -1))
+		}
+	} else {
+		nb = impl.Ilaenv(1, "DGELQF", " ", m, n, -1, -1)
+		if trans != blas.NoTrans {
+			nb = max(nb, impl.Ilaenv(1, "DORMLQ", "LT", n, nrhs, m, -1))
+		} else {
+			nb = max(nb, impl.Ilaenv(1, "DORMLQ", "LN", n, nrhs, m, -1))
+		}
+	}
+	wsize := max(1, mn+max(mn, nrhs)*nb)
+	work[0] = float64(wsize)
+
+	if lwork == -1 {
+		return true
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(b) < (max(m, n)-1)*ldb+nrhs:
+		panic(shortB)
+	}
+
+	// Scale the input matrices if they contain extreme values.
+	smlnum := dlamchS / dlamchP
+	bignum := 1 / smlnum
+	anrm := impl.Dlange(lapack.MaxAbs, m, n, a, lda, nil)
+	var iascl int // 0: A not scaled, 1: scaled up to smlnum, 2: scaled down to bignum.
+	if anrm > 0 && anrm < smlnum {
+		impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, m, n, a, lda)
+		iascl = 1
+	} else if anrm > bignum {
+		impl.Dlascl(lapack.General, 0, 0, anrm, bignum, m, n, a, lda)
+		iascl = 2 // Record the scaling so that it is undone on the solution below.
+	} else if anrm == 0 {
+		// Matrix is all zeros.
+		impl.Dlaset(blas.All, max(m, n), nrhs, 0, 0, b, ldb)
+		return true
+	}
+	brow := m // Number of rows of the right-hand side B on entry.
+	if trans != blas.NoTrans {
+		brow = n
+	}
+	bnrm := impl.Dlange(lapack.MaxAbs, brow, nrhs, b, ldb, nil)
+	ibscl := 0
+	if bnrm > 0 && bnrm < smlnum {
+		impl.Dlascl(lapack.General, 0, 0, bnrm, smlnum, brow, nrhs, b, ldb)
+		ibscl = 1
+	} else if bnrm > bignum {
+		impl.Dlascl(lapack.General, 0, 0, bnrm, bignum, brow, nrhs, b, ldb)
+		ibscl = 2
+	}
+
+	// Solve the minimization problem using a QR or an LQ decomposition.
+	var scllen int // Number of leading rows of b that hold the solution.
+	if m >= n {
+		impl.Dgeqrf(m, n, a, lda, work[:n], work[mn:], lwork-mn)
+		if trans == blas.NoTrans {
+			impl.Dormqr(blas.Left, blas.Trans, m, nrhs, n,
+				a, lda,
+				work[:n],
+				b, ldb,
+				work[mn:], lwork-mn)
+			ok := impl.Dtrtrs(blas.Upper, blas.NoTrans, blas.NonUnit, n, nrhs,
+				a, lda,
+				b, ldb)
+			if !ok {
+				return false
+			}
+			scllen = n
+		} else {
+			ok := impl.Dtrtrs(blas.Upper, blas.Trans, blas.NonUnit, n, nrhs,
+				a, lda,
+				b, ldb)
+			if !ok {
+				return false
+			}
+			for i := n; i < m; i++ {
+				for j := 0; j < nrhs; j++ {
+					b[i*ldb+j] = 0
+				}
+			}
+			impl.Dormqr(blas.Left, blas.NoTrans, m, nrhs, n,
+				a, lda,
+				work[:n],
+				b, ldb,
+				work[mn:], lwork-mn)
+			scllen = m
+		}
+	} else {
+		impl.Dgelqf(m, n, a, lda, work, work[mn:], lwork-mn)
+		if trans == blas.NoTrans {
+			ok := impl.Dtrtrs(blas.Lower, blas.NoTrans, blas.NonUnit,
+				m, nrhs,
+				a, lda,
+				b, ldb)
+			if !ok {
+				return false
+			}
+			for i := m; i < n; i++ {
+				for j := 0; j < nrhs; j++ {
+					b[i*ldb+j] = 0
+				}
+			}
+			impl.Dormlq(blas.Left, blas.Trans, n, nrhs, m,
+				a, lda,
+				work,
+				b, ldb,
+				work[mn:], lwork-mn)
+			scllen = n
+		} else {
+			impl.Dormlq(blas.Left, blas.NoTrans, n, nrhs, m,
+				a, lda,
+				work,
+				b, ldb,
+				work[mn:], lwork-mn)
+			ok := impl.Dtrtrs(blas.Lower, blas.Trans, blas.NonUnit,
+				m, nrhs,
+				a, lda,
+				b, ldb)
+			if !ok {
+				return false
+			}
+			scllen = m // The solution occupies the leading m rows of b.
+		}
+	}
+
+	// Adjust answer vector based on scaling.
+	if iascl == 1 {
+		impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, scllen, nrhs, b, ldb)
+	}
+	if iascl == 2 {
+		impl.Dlascl(lapack.General, 0, 0, anrm, bignum, scllen, nrhs, b, ldb)
+	}
+	if ibscl == 1 {
+		impl.Dlascl(lapack.General, 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb)
+	}
+	if ibscl == 2 {
+		impl.Dlascl(lapack.General, 0, 0, bignum, bnrm, scllen, nrhs, b, ldb)
+	}
+
+	work[0] = float64(wsize) // The leading entries of work were reused as tau above, so store the size again.
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeql2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeql2.go
new file mode 100644
index 00000000000..d18989d2743
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeql2.go
@@ -0,0 +1,67 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dgeql2 computes the QL factorization of the m×n matrix A. That is, Dgeql2
+// computes Q and L such that
+//
+//	A = Q * L
+//
+// where Q is an m×m orthonormal matrix and L is a lower trapezoidal matrix.
+//
+// Q is represented as a product of elementary reflectors,
+//
+//	Q = H_{k-1} * ... * H_1 * H_0
+//
+// where k = min(m,n) and each H_i has the form
+//
+//	H_i = I - tau[i] * v_i * v_iᵀ
+//
+// Vector v_i has v[m-k+i+1:m] = 0, v[m-k+i] = 1, and v[:m-k+i] is stored on
+// exit in A[0:m-k+i, n-k+i].
+//
+// tau must have length at least min(m,n), and Dgeql2 will panic otherwise.
+//
+// work is temporary memory storage and must have length at least n.
+//
+// Dgeql2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgeql2(m, n int, a []float64, lda int, tau, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	k := min(m, n) // Number of elementary reflectors that are generated.
+	if k == 0 {
+		return
+	}
+
+	switch { // Slice lengths are validated only for a non-empty problem.
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	var aii float64
+	for i := k - 1; i >= 0; i-- { // Reflectors are generated from the last column towards the first.
+		// Generate elementary reflector H_i to annihilate A[0:m-k+i, n-k+i].
+		aii, tau[i] = impl.Dlarfg(m-k+i+1, a[(m-k+i)*lda+n-k+i], a[n-k+i:], lda)
+
+		// Apply H_i to A[0:m-k+i+1, 0:n-k+i] from the left.
+		a[(m-k+i)*lda+n-k+i] = 1 // Temporarily store the unit entry of v in a; aii is restored below.
+		impl.Dlarf(blas.Left, m-k+i+1, n-k+i, a[n-k+i:], lda, tau[i], a, lda, work)
+		a[(m-k+i)*lda+n-k+i] = aii
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqp3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqp3.go
new file mode 100644
index 00000000000..da8cd4fa761
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqp3.go
@@ -0,0 +1,195 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgeqp3 computes a QR factorization with column pivoting of the m×n matrix A:
+//
+//	A*P = Q*R
+//
+// where P is a permutation matrix, Q is an orthogonal matrix and R is a
+// min(m,n)×n upper trapezoidal matrix.
+//
+// On return, the upper triangle of A contains the matrix R. The elements below
+// the diagonal together with tau represent the matrix Q as a product of
+// elementary reflectors
+//
+//	Q = H_0 * H_1 * ... * H_{k-1}, where k = min(m,n).
+//
+// Each H_i has the form
+//
+//	H_i = I - tau * v * vᵀ
+//
+// where tau is a scalar and v is a vector with v[0:i] = 0 and v[i] = 1;
+// v[i+1:m] is stored on exit in A[i+1:m,i], and tau in tau[i].
+//
+// jpvt specifies a column pivot to be applied to A. On entry, if jpvt[j] is at
+// least zero, the jth column of A is permuted to the front of A*P (a leading
+// column), if jpvt[j] is -1 the jth column of A is a free column. If jpvt[j] <
+// -1 or jpvt[j] >= n, Dgeqp3 will panic. On return, jpvt holds the permutation
+// that was applied; the jth column of A*P was the jpvt[j] column of A. jpvt
+// must have length n or Dgeqp3 will panic.
+//
+// tau holds the scalar factors of the elementary reflectors. It must have
+// length at least min(m,n), otherwise Dgeqp3 will panic.
+//
+// work must have length at least max(1,lwork), and lwork must be at least
+// 3*n+1, otherwise Dgeqp3 will panic. For optimal performance lwork must be at
+// least 2*n+(n+1)*nb, where nb is the optimal blocksize. On return, work[0]
+// will contain the optimal value of lwork.
+//
+// If lwork == -1, instead of performing Dgeqp3, only the optimal value of lwork
+// will be stored in work[0].
+//
+// Dgeqp3 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgeqp3(m, n int, a []float64, lda int, jpvt []int, tau, work []float64, lwork int) {
+	const (
+		inb    = 1 // Ilaenv selector used below to query the block size.
+		inbmin = 2 // Ilaenv selector used below to query the minimum block size.
+		ixover = 3 // Ilaenv selector used below to query the crossover point.
+	)
+
+	minmn := min(m, n)
+	iws := 3*n + 1 // Minimum accepted lwork for a non-empty problem.
+	if minmn == 0 {
+		iws = 1
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < iws && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if minmn == 0 {
+		work[0] = 1
+		return
+	}
+
+	nb := impl.Ilaenv(inb, "DGEQRF", " ", m, n, -1, -1)
+	if lwork == -1 { // Workspace query only.
+		work[0] = float64(2*n + (n+1)*nb)
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(jpvt) != n:
+		panic(badLenJpvt)
+	case len(tau) < minmn:
+		panic(shortTau)
+	}
+
+	for _, v := range jpvt { // Every entry must be -1 or a valid column index.
+		if v < -1 || n <= v {
+			panic(badJpvt)
+		}
+	}
+
+	bi := blas64.Implementation()
+
+	// Move initial columns up front.
+	var nfxd int // Number of leading (fixed) columns found so far.
+	for j := 0; j < n; j++ {
+		if jpvt[j] == -1 {
+			jpvt[j] = j
+			continue
+		}
+		if j != nfxd {
+			bi.Dswap(m, a[j:], lda, a[nfxd:], lda)
+			jpvt[j], jpvt[nfxd] = jpvt[nfxd], j
+		} else {
+			jpvt[j] = j
+		}
+		nfxd++
+	}
+
+	// Factorize nfxd columns.
+	//
+	// Compute the QR factorization of nfxd columns and update remaining columns.
+	if nfxd > 0 {
+		na := min(m, nfxd)
+		impl.Dgeqrf(m, na, a, lda, tau[:na], work, lwork)
+		iws = max(iws, int(work[0]))
+		if na < n {
+			impl.Dormqr(blas.Left, blas.Trans, m, n-na, na, a, lda, tau[:na], a[na:], lda,
+				work, lwork)
+			iws = max(iws, int(work[0]))
+		}
+	}
+
+	if nfxd >= minmn { // The fixed columns already account for all min(m,n) reflectors.
+		work[0] = float64(iws)
+		return
+	}
+
+	// Factorize free columns.
+	sm := m - nfxd
+	sn := n - nfxd
+	sminmn := minmn - nfxd
+
+	// Determine the block size.
+	nb = impl.Ilaenv(inb, "DGEQRF", " ", sm, sn, -1, -1)
+	nbmin := 2
+	nx := 0
+
+	if 1 < nb && nb < sminmn {
+		// Determine when to cross over from blocked to unblocked code.
+		nx = max(0, impl.Ilaenv(ixover, "DGEQRF", " ", sm, sn, -1, -1))
+
+		if nx < sminmn {
+			// Determine if workspace is large enough for blocked code.
+			minws := 2*sn + (sn+1)*nb
+			iws = max(iws, minws)
+			if lwork < minws {
+				// Not enough workspace to use optimal nb. Reduce
+				// nb and determine the minimum value of nb.
+				nb = (lwork - 2*sn) / (sn + 1)
+				nbmin = max(2, impl.Ilaenv(inbmin, "DGEQRF", " ", sm, sn, -1, -1))
+			}
+		}
+	}
+
+	// Initialize partial column norms.
+	// The first n elements of work store the exact column norms.
+	for j := nfxd; j < n; j++ {
+		work[j] = bi.Dnrm2(sm, a[nfxd*lda+j:], lda)
+		work[n+j] = work[j] // A second copy of each norm is kept in work[n:2*n].
+	}
+	j := nfxd // First column that has not been factorized yet.
+	if nbmin <= nb && nb < sminmn && nx < sminmn {
+		// Use blocked code initially.
+
+		// Compute factorization.
+		var fjb int
+		for topbmn := minmn - nx; j < topbmn; j += fjb {
+			jb := min(nb, topbmn-j)
+
+			// Factorize jb columns among columns j:n.
+			fjb = impl.Dlaqps(m, n-j, j, jb, a[j:], lda, jpvt[j:], tau[j:],
+				work[j:n], work[j+n:2*n], work[2*n:2*n+jb], work[2*n+jb:], jb)
+		}
+	}
+
+	// Use unblocked code to factor the last or only block.
+	if j < minmn {
+		impl.Dlaqp2(m, n-j, j, a[j:], lda, jpvt[j:], tau[j:],
+			work[j:n], work[j+n:2*n], work[2*n:])
+	}
+
+	work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqr2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqr2.go
new file mode 100644
index 00000000000..4d1a4b3b0ca
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqr2.go
@@ -0,0 +1,78 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dgeqr2 computes a QR factorization of the m×n matrix A.
+//
+// In a QR factorization, Q is an m×m orthonormal matrix, and R is an
+// upper triangular m×n matrix.
+//
+// A is modified to contain the information to construct Q and R.
+// The upper triangle of a contains the matrix R. The lower triangular elements
+// (not including the diagonal) contain the elementary reflectors. tau is modified
+// to contain the reflector scales. tau must have length min(m,n), and
+// this function will panic otherwise.
+//
+// The ith elementary reflector can be explicitly constructed by first extracting
+// the
+//
+//	v[j] = 0           j < i
+//	v[j] = 1           j == i
+//	v[j] = a[j*lda+i]  j > i
+//
+// and computing H_i = I - tau[i] * v * vᵀ.
+//
+// The orthonormal matrix Q can be constructed from a product of these elementary
+// reflectors, Q = H_0 * H_1 * ... * H_{k-1}, where k = min(m,n).
+//
+// work is temporary storage of length at least n and this function will panic otherwise.
+//
+// Dgeqr2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgeqr2(m, n int, a []float64, lda int, tau, work []float64) {
+	// TODO(btracey): This is oriented such that columns of a are eliminated.
+	// This likely could be re-arranged to take better advantage of row-major
+	// storage.
+
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case len(work) < n: // Checked before the quick return, unlike the lengths of a and tau.
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	k := min(m, n) // Number of elementary reflectors that are generated.
+	if k == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) != k:
+		panic(badLenTau)
+	}
+
+	for i := 0; i < k; i++ {
+		// Generate elementary reflector H_i; min keeps the slice start in range when i+1 == m.
+		a[i*lda+i], tau[i] = impl.Dlarfg(m-i, a[i*lda+i], a[min((i+1), m-1)*lda+i:], lda)
+		if i < n-1 { // Columns to the right of i remain to be updated.
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1 // Temporarily store the unit leading entry of v in a; aii is restored below.
+			impl.Dlarf(blas.Left, m-i, n-i-1,
+				a[i*lda+i:], lda,
+				tau[i],
+				a[i*lda+i+1:], lda,
+				work)
+			a[i*lda+i] = aii
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqrf.go
new file mode 100644
index 00000000000..2bcbde586ce
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgeqrf.go
@@ -0,0 +1,108 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgeqrf computes the QR factorization of the m×n matrix A using a blocked
+// algorithm. See the documentation for Dgeqr2 for a description of the
+// parameters at entry and exit.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// The length of work must be at least max(1, lwork) and lwork must be -1
+// or at least n, otherwise this function will panic.
+// Dgeqrf is a blocked QR factorization, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Dgeqrf,
+// the optimal work length will be stored into work[0].
+//
+// tau must have length min(m,n), and this function will panic otherwise.
+func (impl Implementation) Dgeqrf(m, n int, a []float64, lda int, tau, work []float64, lwork int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, n) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	k := min(m, n)
+	if k == 0 {
+		work[0] = 1
+		return
+	}
+
+	// nb is the optimal blocksize, i.e. the number of columns transformed at a time.
+	nb := impl.Ilaenv(1, "DGEQRF", " ", m, n, -1, -1)
+	if lwork == -1 {
+		work[0] = float64(n * nb)
+		return
+	}
+
+	if len(a) < (m-1)*lda+n {
+		panic(shortA)
+	}
+	if len(tau) != k {
+		panic(badLenTau)
+	}
+
+	nbmin := 2 // Minimal block size.
+	var nx int // Use unblocked (unless changed in the next for loop)
+	iws := n
+	// Only consider blocked if the suggested block size is > 1 and the
+	// number of rows or columns is sufficiently large.
+	if 1 < nb && nb < k {
+		// nx is the block size at which the code switches from blocked
+		// to unblocked.
+		nx = max(0, impl.Ilaenv(3, "DGEQRF", " ", m, n, -1, -1))
+		if k > nx {
+			iws = n * nb
+			if lwork < iws {
+				// Not enough workspace to use the optimal block
+				// size. Get the minimum block size instead.
+				nb = lwork / n
+				nbmin = max(2, impl.Ilaenv(2, "DGEQRF", " ", m, n, -1, -1))
+			}
+		}
+	}
+
+	// Compute QR using a blocked algorithm.
+	var i int
+	if nbmin <= nb && nb < k && nx < k {
+		ldwork := nb
+		for i = 0; i < k-nx; i += nb {
+			ib := min(k-i, nb)
+			// Compute the QR factorization of the current block.
+			impl.Dgeqr2(m-i, ib, a[i*lda+i:], lda, tau[i:i+ib], work)
+			if i+ib < n {
+				// Form the triangular factor of the block reflector and apply Hᵀ
+				// In Dlarft, work becomes the T matrix.
+				impl.Dlarft(lapack.Forward, lapack.ColumnWise, m-i, ib,
+					a[i*lda+i:], lda,
+					tau[i:],
+					work, ldwork)
+				impl.Dlarfb(blas.Left, blas.Trans, lapack.Forward, lapack.ColumnWise,
+					m-i, n-i-ib, ib,
+					a[i*lda+i:], lda,
+					work, ldwork,
+					a[i*lda+i+ib:], lda,
+					work[ib*ldwork:], ldwork)
+			}
+		}
+	}
+	// Call unblocked code on the remaining columns.
+	if i < k {
+		impl.Dgeqr2(m-i, n-i, a[i*lda+i:], lda, tau[i:], work)
+	}
+	work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgerq2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgerq2.go
new file mode 100644
index 00000000000..44ca1bc1a09
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgerq2.go
@@ -0,0 +1,74 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dgerq2 computes an RQ factorization of the m×n matrix A,
+//
+//	A = R * Q.
+//
+// On exit, if m <= n, the upper triangle of the subarray
+// A[0:m, n-m:n] contains the m×m upper triangular matrix R.
+// If m >= n, the elements on and above the (m-n)-th subdiagonal
+// contain the m×n upper trapezoidal matrix R.
+// The remaining elements, with tau, represent the
+// orthogonal matrix Q as a product of min(m,n) elementary
+// reflectors.
+//
+// The matrix Q is represented as a product of elementary reflectors
+//
+//	Q = H_0 H_1 . . . H_{min(m,n)-1}.
+//
+// Each H(i) has the form
+//
+//	H_i = I - tau_i * v * vᵀ
+//
+// where v is a vector with v[0:n-k+i-1] stored in A[m-k+i, 0:n-k+i-1],
+// v[n-k+i:n] = 0 and v[n-k+i] = 1.
+//
+// tau must have length min(m,n) and work must have length m, otherwise
+// Dgerq2 will panic.
+//
+// Dgerq2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgerq2(m, n int, a []float64, lda int, tau, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case len(work) < m:
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	k := min(m, n)
+	if k == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	}
+
+	for i := k - 1; i >= 0; i-- {
+		// Generate elementary reflector H[i] to annihilate
+		// A[m-k+i, 0:n-k+i-1].
+		mki := m - k + i
+		nki := n - k + i
+		var aii float64
+		aii, tau[i] = impl.Dlarfg(nki+1, a[mki*lda+nki], a[mki*lda:], 1)
+
+		// Apply H[i] to A[0:m-k+i-1, 0:n-k+i] from the right.
+		a[mki*lda+nki] = 1
+		impl.Dlarf(blas.Right, mki, nki+1, a[mki*lda:], 1, tau[i], a, lda, work)
+		a[mki*lda+nki] = aii
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgerqf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgerqf.go
new file mode 100644
index 00000000000..fe010b4792c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgerqf.go
@@ -0,0 +1,135 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgerqf computes an RQ factorization of the m×n matrix A,
+//
+//	A = R * Q.
+//
+// On exit, if m <= n, the upper triangle of the subarray
+// A[0:m, n-m:n] contains the m×m upper triangular matrix R.
+// If m >= n, the elements on and above the (m-n)-th subdiagonal
+// contain the m×n upper trapezoidal matrix R.
+// The remaining elements, with tau, represent the
+// orthogonal matrix Q as a product of k = min(m,n) elementary
+// reflectors.
+//
+// The matrix Q is represented as a product of elementary reflectors
+//
+//	Q = H_0 H_1 . . . H_{k-1}.
+//
+// Each H_i has the form
+//
+//	H_i = I - tau_i * v * vᵀ
+//
+// where v is a vector with v[0:n-k+i] stored in A[m-k+i, 0:n-k+i],
+// v[n-k+i] = 1 and v[n-k+i+1:n] = 0.
+//
+// tau must have length min(m,n), work must have length at least max(1, lwork),
+// and lwork must be -1 or at least max(1, m), otherwise Dgerqf will panic.
+// On exit, work[0] contains the optimal length for work; if lwork == -1, no factorization is performed.
+//
+// Dgerqf is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgerqf(m, n int, a []float64, lda int, tau, work []float64, lwork int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, m) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible: an empty matrix only needs work[0] set.
+	k := min(m, n)
+	if k == 0 {
+		work[0] = 1
+		return
+	}
+
+	nb := impl.Ilaenv(1, "DGERQF", " ", m, n, -1, -1) // Block size suggested by Ilaenv.
+	if lwork == -1 {
+		work[0] = float64(m * nb) // Workspace query: report the length and stop.
+		return
+	}
+
+	if len(a) < (m-1)*lda+n {
+		panic(shortA)
+	}
+	if len(tau) != k {
+		panic(badLenTau)
+	}
+
+	nbmin := 2 // Smallest block size for which blocked code is used.
+	nx := 1    // Crossover point between blocked and unblocked code.
+	iws := m   // Workspace length reported in work[0] on exit.
+	var ldwork int
+	if 1 < nb && nb < k {
+		// Determine when to cross over from blocked to unblocked code.
+		nx = max(0, impl.Ilaenv(3, "DGERQF", " ", m, n, -1, -1))
+		if nx < k {
+			// Determine whether workspace is large enough for blocked code.
+			iws = m * nb
+			if lwork < iws {
+				// Not enough workspace to use optimal nb. Reduce
+				// nb and determine the minimum value of nb.
+				nb = lwork / m
+				nbmin = max(2, impl.Ilaenv(2, "DGERQF", " ", m, n, -1, -1))
+			}
+			ldwork = nb
+		}
+	}
+
+	var mu, nu int
+	if nbmin <= nb && nb < k && nx < k {
+		// Use blocked code initially.
+		// The last kk rows are handled by the block method.
+		ki := ((k - nx - 1) / nb) * nb
+		kk := min(k, ki+nb)
+
+		var i int // Declared outside the loop because its final value is used below.
+		for i = k - kk + ki; i >= k-kk; i -= nb {
+			ib := min(k-i, nb)
+
+			// Compute the RQ factorization of the current block
+			// A[m-k+i:m-k+i+ib-1, 0:n-k+i+ib-1].
+			impl.Dgerq2(ib, n-k+i+ib, a[(m-k+i)*lda:], lda, tau[i:], work)
+			if m-k+i > 0 {
+				// Form the triangular factor of the block reflector
+				// H = H_{i+ib-1} . . . H_{i+1} H_i.
+				impl.Dlarft(lapack.Backward, lapack.RowWise,
+					n-k+i+ib, ib, a[(m-k+i)*lda:], lda, tau[i:],
+					work, ldwork)
+
+				// Apply H to A[0:m-k+i-1, 0:n-k+i+ib-1] from the right.
+				impl.Dlarfb(blas.Right, blas.NoTrans, lapack.Backward, lapack.RowWise,
+					m-k+i, n-k+i+ib, ib, a[(m-k+i)*lda:], lda,
+					work, ldwork,
+					a, lda,
+					work[ib*ldwork:], ldwork)
+			}
+		}
+		mu = m - k + i + nb // Number of leading rows the blocked code did not factor.
+		nu = n - k + i + nb // Number of leading columns belonging to those rows.
+	} else {
+		mu = m
+		nu = n
+	}
+
+	// Use unblocked code to factor the last or only block.
+	if mu > 0 && nu > 0 {
+		impl.Dgerq2(mu, nu, a, lda, tau, work)
+	}
+	work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgesc2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesc2.go
new file mode 100644
index 00000000000..b2201085c5d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesc2.go
@@ -0,0 +1,93 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgesc2 solves the linear system
+//
+//	A * x = scale * b
+//
+// for x, where the general n×n matrix A is given by its LU factorization with
+// complete pivoting
+//
+//	A = P * L * U * Q
+//
+// as computed by Dgetc2.
+//
+// rhs holds the right hand side vector b on entry and is overwritten with the
+// solution vector x on return. The returned scale factor satisfies
+// 0 <= scale <= 1 and is chosen to prevent overflow in the solution.
+// Dgesc2 panics if ipiv or jpiv does not have length n.
+//
+// Dgesc2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgesc2(n int, a []float64, lda int, rhs []float64, ipiv, jpiv []int) (scale float64) {
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+
+	// An empty system is reported with a zero scale.
+	if n == 0 {
+		return 0
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+	if len(rhs) < n {
+		panic(shortRHS)
+	}
+	if len(ipiv) != n {
+		panic(badLenIpiv)
+	}
+	if len(jpiv) != n {
+		panic(badLenJpiv)
+	}
+
+	const smlnum = dlamchS / dlamchP
+
+	// Apply the permutations in ipiv to rhs.
+	impl.Dlaswp(1, rhs, 1, 0, n-1, ipiv[:n], 1)
+
+	// Forward substitution with the unit lower triangular factor L.
+	for col := 0; col < n-1; col++ {
+		pivot := rhs[col]
+		for row := col + 1; row < n; row++ {
+			rhs[row] -= float64(a[row*lda+col] * pivot)
+		}
+	}
+
+	// Check whether rhs has to be scaled down before dividing by the diagonal of U.
+	scale = 1.0
+	bi := blas64.Implementation()
+	imax := bi.Idamax(n, rhs, 1)
+	if rmax := math.Abs(rhs[imax]); 2*smlnum*rmax > math.Abs(a[(n-1)*lda+(n-1)]) {
+		factor := 0.5 / rmax
+		bi.Dscal(n, factor, rhs, 1)
+		scale *= factor
+	}
+
+	// Back substitution with the upper triangular factor U.
+	for i := n - 1; i >= 0; i-- {
+		urow := a[i*lda : i*lda+n]
+		inv := 1.0 / urow[i]
+		rhs[i] *= inv
+		for j := i + 1; j < n; j++ {
+			rhs[i] -= float64(rhs[j] * (urow[j] * inv))
+		}
+	}
+
+	// Apply the permutations in jpiv to the solution (rhs).
+	impl.Dlaswp(1, rhs, 1, 0, n-1, jpiv[:n], -1)
+
+	return scale
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgesv.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesv.go
new file mode 100644
index 00000000000..0be4414ca1c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesv.go
@@ -0,0 +1,60 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dgesv computes the solution to a real system of linear equations
+//
+//	A * X = B
+//
+// where A is an n×n matrix and X and B are n×nrhs matrices.
+//
+// The LU decomposition with partial pivoting and row interchanges is used to
+// factor A as
+//
+//	A = P * L * U
+//
+// where P is a permutation matrix, L is unit lower triangular, and U is upper
+// triangular. On return, the factors L and U are stored in a; the unit diagonal
+// elements of L are not stored. The row pivot indices that define the
+// permutation matrix P are stored in ipiv, which must have length n.
+//
+// On entry, b contains the right hand side matrix B. ok is the result reported
+// by Dgetrf for the factorization; if ok is true, b is overwritten with the
+// solution matrix X, otherwise Dgetrs is not called and b is left untouched.
+func (impl Implementation) Dgesv(n, nrhs int, a []float64, lda int, ipiv []int, b []float64, ldb int) (ok bool) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	// Quick return if possible: there is nothing to factor or solve.
+	if n == 0 || nrhs == 0 {
+		return true
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(ipiv) != n:
+		panic(badLenIpiv)
+	case len(b) < (n-1)*ldb+nrhs:
+		panic(shortB)
+	}
+	// Factor A in place and, only if that succeeded, solve using the factors.
+	ok = impl.Dgetrf(n, n, a, lda, ipiv)
+	if ok {
+		impl.Dgetrs(blas.NoTrans, n, nrhs, a, lda, ipiv, b, ldb)
+	}
+
+	return ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgesvd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesvd.go
new file mode 100644
index 00000000000..97da749bfb2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgesvd.go
@@ -0,0 +1,1378 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+const noSVDO = "dgesvd: not coded for overwrite"
+
+// Dgesvd computes the singular value decomposition of the input matrix A.
+//
+// The singular value decomposition is
+//
+//	A = U * Sigma * Vᵀ
+//
+// where Sigma is an m×n diagonal matrix containing the singular values of A,
+// U is an m×m orthogonal matrix and V is an n×n orthogonal matrix. The first
+// min(m,n) columns of U and V are the left and right singular vectors of A
+// respectively.
+//
+// jobU and jobVT are options for computing the singular vectors. The behavior
+// is as follows
+//
+//	jobU == lapack.SVDAll       All m columns of U are returned in u
+//	jobU == lapack.SVDStore     The first min(m,n) columns are returned in u
+//	jobU == lapack.SVDOverwrite The first min(m,n) columns of U are written into a
+//	jobU == lapack.SVDNone      The columns of U are not computed.
+//
+// The behavior is the same for jobVT and the rows of Vᵀ. At most one of jobU
+// and jobVT can equal lapack.SVDOverwrite, and Dgesvd will panic otherwise.
+//
+// On entry, a contains the data for the m×n matrix A. During the call to Dgesvd
+// the data is overwritten. On exit, A contains the appropriate singular vectors
+// if either job is lapack.SVDOverwrite.
+//
+// s is a slice of length at least min(m,n) and on exit contains the singular
+// values in decreasing order.
+//
+// u contains the left singular vectors on exit, stored column-wise. If
+// jobU == lapack.SVDAll, u is of size m×m. If jobU == lapack.SVDStore u is
+// of size m×min(m,n). If jobU == lapack.SVDOverwrite or lapack.SVDNone, u is
+// not used.
+//
+// vt contains the right singular vectors on exit, stored row-wise. If
+// jobVT == lapack.SVDAll, vt is of size n×n. If jobVT == lapack.SVDStore vt is
+// of size min(m,n)×n. If jobVT == lapack.SVDOverwrite or lapack.SVDNone, vt is
+// not used.
+//
+// work is a slice for storing temporary memory, and lwork is the usable size of
+// the slice. lwork must be at least max(5*min(m,n), 3*min(m,n)+max(m,n)).
+// If lwork == -1, instead of performing Dgesvd, the optimal work length will be
+// stored into work[0]. Dgesvd will panic if the working memory has insufficient
+// storage.
+//
+// Dgesvd returns whether the decomposition successfully completed.
+func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, lwork int) (ok bool) {
+	if jobU == lapack.SVDOverwrite || jobVT == lapack.SVDOverwrite {
+		panic(noSVDO)
+	}
+
+	wantua := jobU == lapack.SVDAll
+	wantus := jobU == lapack.SVDStore
+	wantuas := wantua || wantus
+	wantuo := jobU == lapack.SVDOverwrite
+	wantun := jobU == lapack.SVDNone
+	if !(wantua || wantus || wantuo || wantun) {
+		panic(badSVDJob)
+	}
+
+	wantva := jobVT == lapack.SVDAll
+	wantvs := jobVT == lapack.SVDStore
+	wantvas := wantva || wantvs
+	wantvo := jobVT == lapack.SVDOverwrite
+	wantvn := jobVT == lapack.SVDNone
+	if !(wantva || wantvs || wantvo || wantvn) {
+		panic(badSVDJob)
+	}
+
+	if wantuo && wantvo {
+		panic(bothSVDOver)
+	}
+
+	minmn := min(m, n)
+	minwork := 1
+	if minmn > 0 {
+		minwork = max(3*minmn+max(m, n), 5*minmn)
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldu < 1, wantua && ldu < m, wantus && ldu < minmn:
+		panic(badLdU)
+	case ldvt < 1 || (wantvas && ldvt < n):
+		panic(badLdVT)
+	case lwork < minwork && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if minmn == 0 {
+		work[0] = 1
+		return true
+	}
+
+	// Compute optimal workspace size for subroutines.
+	opts := string(jobU) + string(jobVT)
+	mnthr := impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0)
+	maxwrk := 1
+	var wrkbl, bdspac int
+	if m >= n {
+		bdspac = 5 * n
+		impl.Dgeqrf(m, n, a, lda, nil, work, -1)
+		lwork_dgeqrf := int(work[0])
+
+		impl.Dorgqr(m, n, n, a, lda, nil, work, -1)
+		lwork_dorgqr_n := int(work[0])
+		impl.Dorgqr(m, m, n, a, lda, nil, work, -1)
+		lwork_dorgqr_m := int(work[0])
+
+		impl.Dgebrd(n, n, a, lda, s, nil, nil, nil, work, -1)
+		lwork_dgebrd := int(work[0])
+
+		impl.Dorgbr(lapack.GeneratePT, n, n, n, a, lda, nil, work, -1)
+		lwork_dorgbr_p := int(work[0])
+
+		impl.Dorgbr(lapack.GenerateQ, n, n, n, a, lda, nil, work, -1)
+		lwork_dorgbr_q := int(work[0])
+
+		if m >= mnthr {
+			if wantun {
+				// Path 1 (m much larger than n, jobU == None)
+				maxwrk = n + lwork_dgeqrf
+				maxwrk = max(maxwrk, 3*n+lwork_dgebrd)
+				if wantvo || wantvas {
+					maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p)
+				}
+				maxwrk = max(maxwrk, bdspac)
+			} else if wantuo && wantvn {
+				// Path 2 (m much larger than n, jobU == Overwrite, jobVT == None)
+				wrkbl = n + lwork_dgeqrf
+				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
+				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = max(n*n+wrkbl, n*n+m*n+n)
+			} else if wantuo && wantvas {
+				// Path 3 (m much larger than n, jobU == Overwrite, jobVT == Store or All)
+				wrkbl = n + lwork_dgeqrf
+				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
+				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = max(n*n+wrkbl, n*n+m*n+n)
+			} else if wantus && wantvn {
+				// Path 4 (m much larger than n, jobU == Store, jobVT == None)
+				wrkbl = n + lwork_dgeqrf
+				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
+				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = n*n + wrkbl
+			} else if wantus && wantvo {
+				// Path 5 (m much larger than n, jobU == Store, jobVT == Overwrite)
+				wrkbl = n + lwork_dgeqrf
+				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
+				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = 2*n*n + wrkbl
+			} else if wantus && wantvas {
+				// Path 6 (m much larger than n, jobU == Store, jobVT == Store or All)
+				wrkbl = n + lwork_dgeqrf
+				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
+				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = n*n + wrkbl
+			} else if wantua && wantvn {
+				// Path 7 (m much larger than n, jobU == All, jobVT == None)
+				wrkbl = n + lwork_dgeqrf
+				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
+				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = n*n + wrkbl
+			} else if wantua && wantvo {
+				// Path 8 (m much larger than n, jobU == All, jobVT == Overwrite)
+				wrkbl = n + lwork_dgeqrf
+				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
+				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = 2*n*n + wrkbl
+			} else if wantua && wantvas {
+				// Path 9 (m much larger than n, jobU == All, jobVT == Store or All)
+				wrkbl = n + lwork_dgeqrf
+				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
+				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = n*n + wrkbl
+			}
+		} else {
+			// Path 10 (m at least n, but not much larger)
+			impl.Dgebrd(m, n, a, lda, s, nil, nil, nil, work, -1)
+			lwork_dgebrd := int(work[0])
+			maxwrk = 3*n + lwork_dgebrd
+			if wantus || wantuo {
+				impl.Dorgbr(lapack.GenerateQ, m, n, n, a, lda, nil, work, -1)
+				lwork_dorgbr_q = int(work[0])
+				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q)
+			}
+			if wantua {
+				impl.Dorgbr(lapack.GenerateQ, m, m, n, a, lda, nil, work, -1)
+				lwork_dorgbr_q := int(work[0])
+				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q)
+			}
+			if !wantvn {
+				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p)
+			}
+			maxwrk = max(maxwrk, bdspac)
+		}
+	} else {
+		bdspac = 5 * m
+
+		impl.Dgelqf(m, n, a, lda, nil, work, -1)
+		lwork_dgelqf := int(work[0])
+
+		impl.Dorglq(n, n, m, nil, n, nil, work, -1)
+		lwork_dorglq_n := int(work[0])
+		impl.Dorglq(m, n, m, a, lda, nil, work, -1)
+		lwork_dorglq_m := int(work[0])
+
+		impl.Dgebrd(m, m, a, lda, s, nil, nil, nil, work, -1)
+		lwork_dgebrd := int(work[0])
+
+		impl.Dorgbr(lapack.GeneratePT, m, m, m, a, n, nil, work, -1)
+		lwork_dorgbr_p := int(work[0])
+
+		impl.Dorgbr(lapack.GenerateQ, m, m, m, a, n, nil, work, -1)
+		lwork_dorgbr_q := int(work[0])
+
+		if n >= mnthr {
+			if wantvn {
+				// Path 1t (n much larger than m, jobVT == None)
+				maxwrk = m + lwork_dgelqf
+				maxwrk = max(maxwrk, 3*m+lwork_dgebrd)
+				if wantuo || wantuas {
+					maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q)
+				}
+				maxwrk = max(maxwrk, bdspac)
+			} else if wantvo && wantun {
+				// Path 2t (n much larger than m, jobU == None, jobVT == Overwrite)
+				wrkbl = m + lwork_dgelqf
+				wrkbl = max(wrkbl, m+lwork_dorglq_m)
+				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = max(m*m+wrkbl, m*m+m*n+m)
+			} else if wantvo && wantuas {
+				// Path 3t (n much larger than m, jobU == Store or All, jobVT == Overwrite)
+				wrkbl = m + lwork_dgelqf
+				wrkbl = max(wrkbl, m+lwork_dorglq_m)
+				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = max(m*m+wrkbl, m*m+m*n+m)
+			} else if wantvs && wantun {
+				// Path 4t (n much larger than m, jobU == None, jobVT == Store)
+				wrkbl = m + lwork_dgelqf
+				wrkbl = max(wrkbl, m+lwork_dorglq_m)
+				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = m*m + wrkbl
+			} else if wantvs && wantuo {
+				// Path 5t (n much larger than m, jobU == Overwrite, jobVT == Store)
+				wrkbl = m + lwork_dgelqf
+				wrkbl = max(wrkbl, m+lwork_dorglq_m)
+				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = 2*m*m + wrkbl
+			} else if wantvs && wantuas {
+				// Path 6t (n much larger than m, jobU == Store or All, jobVT == Store)
+				wrkbl = m + lwork_dgelqf
+				wrkbl = max(wrkbl, m+lwork_dorglq_m)
+				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = m*m + wrkbl
+			} else if wantva && wantun {
+				// Path 7t (n much larger than m, jobU== None, jobVT == All)
+				wrkbl = m + lwork_dgelqf
+				wrkbl = max(wrkbl, m+lwork_dorglq_n)
+				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = m*m + wrkbl
+			} else if wantva && wantuo {
+				// Path 8t (n much larger than m, jobU == Overwrite, jobVT == All)
+				wrkbl = m + lwork_dgelqf
+				wrkbl = max(wrkbl, m+lwork_dorglq_n)
+				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = 2*m*m + wrkbl
+			} else if wantva && wantuas {
+				// Path 9t (n much larger than m, jobU == Store or All, jobVT == All)
+				wrkbl = m + lwork_dgelqf
+				wrkbl = max(wrkbl, m+lwork_dorglq_n)
+				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
+				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
+				wrkbl = max(wrkbl, bdspac)
+				maxwrk = m*m + wrkbl
+			}
+		} else {
+			// Path 10t (n greater than m, but not much larger)
+			impl.Dgebrd(m, n, a, lda, s, nil, nil, nil, work, -1)
+			lwork_dgebrd = int(work[0])
+			maxwrk = 3*m + lwork_dgebrd
+			if wantvs || wantvo {
+				impl.Dorgbr(lapack.GeneratePT, m, n, m, a, n, nil, work, -1)
+				lwork_dorgbr_p = int(work[0])
+				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p)
+			}
+			if wantva {
+				impl.Dorgbr(lapack.GeneratePT, n, n, m, a, n, nil, work, -1)
+				lwork_dorgbr_p = int(work[0])
+				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p)
+			}
+			if !wantun {
+				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q)
+			}
+			maxwrk = max(maxwrk, bdspac)
+		}
+	}
+
+	maxwrk = max(maxwrk, minwork)
+	if lwork == -1 {
+		work[0] = float64(maxwrk)
+		return true
+	}
+
+	if len(a) < (m-1)*lda+n {
+		panic(shortA)
+	}
+	if len(s) < minmn {
+		panic(shortS)
+	}
+	if (len(u) < (m-1)*ldu+m && wantua) || (len(u) < (m-1)*ldu+minmn && wantus) {
+		panic(shortU)
+	}
+	if (len(vt) < (n-1)*ldvt+n && wantva) || (len(vt) < (minmn-1)*ldvt+n && wantvs) {
+		panic(shortVT)
+	}
+
+	// Perform decomposition.
+	eps := dlamchE
+	smlnum := math.Sqrt(dlamchS) / eps
+	bignum := 1 / smlnum
+
+	// Scale A if max element outside range [smlnum, bignum].
+	anrm := impl.Dlange(lapack.MaxAbs, m, n, a, lda, nil)
+	var iscl bool
+	if anrm > 0 && anrm < smlnum {
+		iscl = true
+		impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, m, n, a, lda)
+	} else if anrm > bignum {
+		iscl = true
+		impl.Dlascl(lapack.General, 0, 0, anrm, bignum, m, n, a, lda)
+	}
+
+	bi := blas64.Implementation()
+	var ie int
+	if m >= n {
+		// If A has sufficiently more rows than columns, use the QR decomposition.
+		if m >= mnthr {
+			// m >> n
+			if wantun {
+				// Path 1.
+				itau := 0
+				iwork := itau + n
+
+				// Compute A = Q * R.
+				impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+
+				// Zero out below R.
+				impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda)
+				ie = 0
+				itauq := ie + n
+				itaup := itauq + n
+				iwork = itaup + n
+				// Bidiagonalize R in A.
+				impl.Dgebrd(n, n, a, lda, s, work[ie:], work[itauq:],
+					work[itaup:], work[iwork:], lwork-iwork)
+				ncvt := 0
+				if wantvo || wantvas {
+					impl.Dorgbr(lapack.GeneratePT, n, n, n, a, lda, work[itaup:],
+						work[iwork:], lwork-iwork)
+					ncvt = n
+				}
+				iwork = ie + n
+
+				// Perform bidiagonal QR iteration computing right singular vectors
+				// of A in A if desired.
+				ok = impl.Dbdsqr(blas.Upper, n, ncvt, 0, 0, s, work[ie:],
+					a, lda, work, 1, work, 1, work[iwork:])
+
+				// If right singular vectors desired in VT, copy them there.
+				if wantvas {
+					impl.Dlacpy(blas.All, n, n, a, lda, vt, ldvt)
+				}
+			} else if wantuo && wantvn {
+				// Path 2
+				panic(noSVDO)
+			} else if wantuo && wantvas {
+				// Path 3
+				panic(noSVDO)
+			} else if wantus {
+				if wantvn {
+					// Path 4
+					if lwork >= n*n+max(4*n, bdspac) {
+						// Sufficient workspace for a fast algorithm.
+						ir := 0
+						var ldworkr int
+						if lwork >= wrkbl+lda*n {
+							ldworkr = lda
+						} else {
+							ldworkr = n
+						}
+						itau := ir + ldworkr*n
+						iwork := itau + n
+						// Compute A = Q * R.
+						impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+
+						// Copy R to work[ir:], zeroing out below it.
+						impl.Dlacpy(blas.Upper, n, n, a, lda, work[ir:], ldworkr)
+						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[ir+ldworkr:], ldworkr)
+
+						// Generate Q in A.
+						impl.Dorgqr(m, n, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+						ie := itau
+						itauq := ie + n
+						itaup := itauq + n
+						iwork = itaup + n
+
+						// Bidiagonalize R in work[ir:].
+						impl.Dgebrd(n, n, work[ir:], ldworkr, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Generate left vectors bidiagonalizing R in work[ir:].
+						impl.Dorgbr(lapack.GenerateQ, n, n, n, work[ir:], ldworkr,
+							work[itauq:], work[iwork:], lwork-iwork)
+						iwork = ie + n
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of R in work[ir:].
+						ok = impl.Dbdsqr(blas.Upper, n, 0, n, 0, s, work[ie:], work, 1,
+							work[ir:], ldworkr, work, 1, work[iwork:])
+
+						// Multiply Q in A by left singular vectors of R in
+						// work[ir:], storing result in U.
+						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, a, lda,
+							work[ir:], ldworkr, 0, u, ldu)
+					} else {
+						// Insufficient workspace for a fast algorithm.
+						itau := 0
+						iwork := itau + n
+
+						// Compute A = Q*R, copying result to U.
+						impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
+
+						// Generate Q in U.
+						impl.Dorgqr(m, n, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork)
+						ie := itau
+						itauq := ie + n
+						itaup := itauq + n
+						iwork = itaup + n
+
+						// Zero out below R in A.
+						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda)
+
+						// Bidiagonalize R in A.
+						impl.Dgebrd(n, n, a, lda, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Multiply Q in U by left vectors bidiagonalizing R.
+						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n,
+							a, lda, work[itauq:], u, ldu, work[iwork:], lwork-iwork)
+						iwork = ie + n
+
+						// Perform bidiagonal QR iteration, computing left
+						// singular vectors of A in U.
+						ok = impl.Dbdsqr(blas.Upper, n, 0, m, 0, s, work[ie:], work, 1,
+							u, ldu, work, 1, work[iwork:])
+					}
+				} else if wantvo {
+					// Path 5
+					panic(noSVDO)
+				} else if wantvas {
+					// Path 6
+					if lwork >= n*n+max(4*n, bdspac) {
+						// Sufficient workspace for a fast algorithm.
+						iu := 0
+						var ldworku int
+						if lwork >= wrkbl+lda*n {
+							ldworku = lda
+						} else {
+							ldworku = n
+						}
+						itau := iu + ldworku*n
+						iwork := itau + n
+
+						// Compute A = Q * R.
+						impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+						// Copy R to work[iu:], zeroing out below it.
+						impl.Dlacpy(blas.Upper, n, n, a, lda, work[iu:], ldworku)
+						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[iu+ldworku:], ldworku)
+
+						// Generate Q in A.
+						impl.Dorgqr(m, n, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+
+						ie := itau
+						itauq := ie + n
+						itaup := itauq + n
+						iwork = itaup + n
+
+						// Bidiagonalize R in work[iu:], copying result to VT.
+						impl.Dgebrd(n, n, work[iu:], ldworku, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Upper, n, n, work[iu:], ldworku, vt, ldvt)
+
+						// Generate left bidiagonalizing vectors in work[iu:].
+						impl.Dorgbr(lapack.GenerateQ, n, n, n, work[iu:], ldworku,
+							work[itauq:], work[iwork:], lwork-iwork)
+
+						// Generate right bidiagonalizing vectors in VT.
+						impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt,
+							work[itaup:], work[iwork:], lwork-iwork)
+						iwork = ie + n
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of R in work[iu:], and computing right singular
+						// vectors of R in VT.
+						ok = impl.Dbdsqr(blas.Upper, n, n, n, 0, s, work[ie:],
+							vt, ldvt, work[iu:], ldworku, work, 1, work[iwork:])
+
+						// Multiply Q in A by left singular vectors of R in
+						// work[iu:], storing result in U.
+						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, a, lda,
+							work[iu:], ldworku, 0, u, ldu)
+					} else {
+						// Insufficient workspace for a fast algorithm.
+						itau := 0
+						iwork := itau + n
+
+						// Compute A = Q * R, copying result to U.
+						impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
+
+						// Generate Q in U.
+						impl.Dorgqr(m, n, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork)
+
+						// Copy R to VT, zeroing out below it.
+						impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
+						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt)
+
+						ie := itau
+						itauq := ie + n
+						itaup := itauq + n
+						iwork = itaup + n
+
+						// Bidiagonalize R in VT.
+						impl.Dgebrd(n, n, vt, ldvt, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Multiply Q in U by left bidiagonalizing vectors in VT.
+						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n,
+							vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork)
+
+						// Generate right bidiagonalizing vectors in VT.
+						impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt,
+							work[itaup:], work[iwork:], lwork-iwork)
+						iwork = ie + n
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of A in U and computing right singular vectors
+						// of A in VT.
+						ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:],
+							vt, ldvt, u, ldu, work, 1, work[iwork:])
+					}
+				}
+			} else if wantua {
+				if wantvn {
+					// Path 7
+					if lwork >= n*n+max(max(n+m, 4*n), bdspac) {
+						// Sufficient workspace for a fast algorithm.
+						ir := 0
+						var ldworkr int
+						if lwork >= wrkbl+lda*n {
+							ldworkr = lda
+						} else {
+							ldworkr = n
+						}
+						itau := ir + ldworkr*n
+						iwork := itau + n
+
+						// Compute A = Q*R, copying result to U.
+						impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
+
+						// Copy R to work[ir:], zeroing out below it.
+						impl.Dlacpy(blas.Upper, n, n, a, lda, work[ir:], ldworkr)
+						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[ir+ldworkr:], ldworkr)
+
+						// Generate Q in U.
+						impl.Dorgqr(m, m, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork)
+						ie := itau
+						itauq := ie + n
+						itaup := itauq + n
+						iwork = itaup + n
+
+						// Bidiagonalize R in work[ir:].
+						impl.Dgebrd(n, n, work[ir:], ldworkr, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Generate left bidiagonalizing vectors in work[ir:].
+						impl.Dorgbr(lapack.GenerateQ, n, n, n, work[ir:], ldworkr,
+							work[itauq:], work[iwork:], lwork-iwork)
+						iwork = ie + n
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of R in work[ir:].
+						ok = impl.Dbdsqr(blas.Upper, n, 0, n, 0, s, work[ie:], work, 1,
+							work[ir:], ldworkr, work, 1, work[iwork:])
+
+						// Multiply Q in U by left singular vectors of R in
+						// work[ir:], storing result in A.
+						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1, u, ldu,
+							work[ir:], ldworkr, 0, a, lda)
+
+						// Copy left singular vectors of A from A to U.
+						impl.Dlacpy(blas.All, m, n, a, lda, u, ldu)
+					} else {
+						// Insufficient workspace for a fast algorithm.
+						itau := 0
+						iwork := itau + n
+
+						// Compute A = Q*R, copying result to U.
+						impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
+
+						// Generate Q in U.
+						impl.Dorgqr(m, m, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork)
+						ie := itau
+						itauq := ie + n
+						itaup := itauq + n
+						iwork = itaup + n
+
+						// Zero out below R in A.
+						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda)
+
+						// Bidiagonalize R in A.
+						impl.Dgebrd(n, n, a, lda, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Multiply Q in U by left bidiagonalizing vectors in A.
+						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n,
+							a, lda, work[itauq:], u, ldu, work[iwork:], lwork-iwork)
+						iwork = ie + n
+
+						// Perform bidiagonal QR iteration, computing left
+						// singular vectors of A in U.
+						ok = impl.Dbdsqr(blas.Upper, n, 0, m, 0, s, work[ie:],
+							work, 1, u, ldu, work, 1, work[iwork:])
+					}
+				} else if wantvo {
+					// Path 8.
+					panic(noSVDO)
+				} else if wantvas {
+					// Path 9.
+					if lwork >= n*n+max(max(n+m, 4*n), bdspac) {
+						// Sufficient workspace for a fast algorithm.
+						iu := 0
+						var ldworku int
+						if lwork >= wrkbl+lda*n {
+							ldworku = lda
+						} else {
+							ldworku = n
+						}
+						itau := iu + ldworku*n
+						iwork := itau + n
+
+						// Compute A = Q * R, copying result to U.
+						impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
+
+						// Generate Q in U.
+						impl.Dorgqr(m, m, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork)
+
+						// Copy R to work[iu:], zeroing out below it.
+						impl.Dlacpy(blas.Upper, n, n, a, lda, work[iu:], ldworku)
+						impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, work[iu+ldworku:], ldworku)
+
+						ie = itau
+						itauq := ie + n
+						itaup := itauq + n
+						iwork = itaup + n
+
+						// Bidiagonalize R in work[iu:], copying result to VT.
+						impl.Dgebrd(n, n, work[iu:], ldworku, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Upper, n, n, work[iu:], ldworku, vt, ldvt)
+
+						// Generate left bidiagonalizing vectors in work[iu:].
+						impl.Dorgbr(lapack.GenerateQ, n, n, n, work[iu:], ldworku,
+							work[itauq:], work[iwork:], lwork-iwork)
+
+						// Generate right bidiagonalizing vectors in VT.
+						impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt,
+							work[itaup:], work[iwork:], lwork-iwork)
+						iwork = ie + n
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of R in work[iu:] and computing right
+						// singular vectors of R in VT.
+						ok = impl.Dbdsqr(blas.Upper, n, n, n, 0, s, work[ie:],
+							vt, ldvt, work[iu:], ldworku, work, 1, work[iwork:])
+
+						// Multiply Q in U by left singular vectors of R in
+						// work[iu:], storing result in A.
+						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, n, 1,
+							u, ldu, work[iu:], ldworku, 0, a, lda)
+
+						// Copy left singular vectors of A from A to U.
+						impl.Dlacpy(blas.All, m, n, a, lda, u, ldu)
+
+						/*
+							// Bidiagonalize R in VT.
+							impl.Dgebrd(n, n, vt, ldvt, s, work[ie:],
+								work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+							// Multiply Q in U by left bidiagonalizing vectors in VT.
+							impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans,
+								m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork)
+
+							// Generate right bidiagonalizing vectors in VT.
+							impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt,
+								work[itaup:], work[iwork:], lwork-iwork)
+							iwork = ie + n
+
+							// Perform bidiagonal QR iteration, computing left singular
+							// vectors of A in U and computing right singular vectors
+							// of A in VT.
+							ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:],
+								vt, ldvt, u, ldu, work, 1, work[iwork:])
+						*/
+					} else {
+						// Insufficient workspace for a fast algorithm.
+						itau := 0
+						iwork := itau + n
+
+						// Compute A = Q*R, copying result to U.
+						impl.Dgeqrf(m, n, a, lda, work[itau:itau+n], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
+
+						// Generate Q in U.
+						impl.Dorgqr(m, m, n, u, ldu, work[itau:itau+n], work[iwork:], lwork-iwork)
+
+						// Copy R from A to VT, zeroing out below it.
+						impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
+						if n > 1 {
+							impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt)
+						}
+
+						ie := itau
+						itauq := ie + n
+						itaup := itauq + n
+						iwork = itaup + n
+
+						// Bidiagonalize R in VT.
+						impl.Dgebrd(n, n, vt, ldvt, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Multiply Q in U by left bidiagonalizing vectors in VT.
+						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans,
+							m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork)
+
+						// Generate right bidiagonalizing vectors in VT.
+						impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt,
+							work[itaup:], work[iwork:], lwork-iwork)
+						iwork = ie + n
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of A in U and computing right singular vectors
+						// of A in VT.
+						ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:],
+							vt, ldvt, u, ldu, work, 1, work[iwork:])
+					}
+				}
+			}
+		} else {
+			// Path 10.
+			// M at least N, but not much larger.
+			ie = 0
+			itauq := ie + n
+			itaup := itauq + n
+			iwork := itaup + n
+
+			// Bidiagonalize A.
+			impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:],
+				work[itaup:], work[iwork:], lwork-iwork)
+			if wantuas {
+				// Left singular vectors are desired in U. Copy result to U and
+				// generate left bidiagonalizing vectors in U.
+				impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
+				var ncu int
+				if wantus {
+					ncu = n
+				}
+				if wantua {
+					ncu = m
+				}
+				impl.Dorgbr(lapack.GenerateQ, m, ncu, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+			}
+			if wantvas {
+				// Right singular vectors are desired in VT. Copy result to VT and
+				// generate right bidiagonalizing vectors in VT.
+				impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
+				impl.Dorgbr(lapack.GeneratePT, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
+			}
+			if wantuo {
+				panic(noSVDO)
+			}
+			if wantvo {
+				panic(noSVDO)
+			}
+			iwork = ie + n
+			var nru, ncvt int
+			if wantuas || wantuo {
+				nru = m
+			}
+			if wantun {
+				nru = 0
+			}
+			if wantvas || wantvo {
+				ncvt = n
+			}
+			if wantvn {
+				ncvt = 0
+			}
+			if !wantuo && !wantvo {
+				// Perform bidiagonal QR iteration, if desired, computing left
+				// singular vectors in U and right singular vectors in VT.
+				ok = impl.Dbdsqr(blas.Upper, n, ncvt, nru, 0, s, work[ie:],
+					vt, ldvt, u, ldu, work, 1, work[iwork:])
+			} else {
+				// There will be two branches when the implementation is complete.
+				panic(noSVDO)
+			}
+		}
+	} else {
+		// A has more columns than rows. If A has sufficiently more columns than
+		// rows, first reduce using the LQ decomposition.
+		if n >= mnthr {
+			// n >> m.
+			if wantvn {
+				// Path 1t.
+				itau := 0
+				iwork := itau + m
+
+				// Compute A = L*Q.
+				impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+
+				// Zero out above L.
+				impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda)
+				ie := 0
+				itauq := ie + m
+				itaup := itauq + m
+				iwork = itaup + m
+
+				// Bidiagonalize L in A.
+				impl.Dgebrd(m, m, a, lda, s, work[ie:itauq],
+					work[itauq:itaup], work[itaup:iwork], work[iwork:], lwork-iwork)
+				if wantuo || wantuas {
+					impl.Dorgbr(lapack.GenerateQ, m, m, m, a, lda,
+						work[itauq:], work[iwork:], lwork-iwork)
+				}
+				iwork = ie + m
+				nru := 0
+				if wantuo || wantuas {
+					nru = m
+				}
+
+				// Perform bidiagonal QR iteration, computing left singular vectors
+				// of A in A if desired.
+				ok = impl.Dbdsqr(blas.Upper, m, 0, nru, 0, s, work[ie:],
+					work, 1, a, lda, work, 1, work[iwork:])
+
+				// If left singular vectors desired in U, copy them there.
+				if wantuas {
+					impl.Dlacpy(blas.All, m, m, a, lda, u, ldu)
+				}
+			} else if wantvo && wantun {
+				// Path 2t.
+				panic(noSVDO)
+			} else if wantvo && wantuas {
+				// Path 3t.
+				panic(noSVDO)
+			} else if wantvs {
+				if wantun {
+					// Path 4t.
+					if lwork >= m*m+max(4*m, bdspac) {
+						// Sufficient workspace for a fast algorithm.
+						ir := 0
+						var ldworkr int
+						if lwork >= wrkbl+lda*m {
+							ldworkr = lda
+						} else {
+							ldworkr = m
+						}
+						itau := ir + ldworkr*m
+						iwork := itau + m
+
+						// Compute A = L*Q.
+						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+
+						// Copy L to work[ir:], zeroing out above it.
+						impl.Dlacpy(blas.Lower, m, m, a, lda, work[ir:], ldworkr)
+						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[ir+1:], ldworkr)
+
+						// Generate Q in A.
+						impl.Dorglq(m, n, m, a, lda, work[itau:], work[iwork:], lwork-iwork)
+						ie := itau
+						itauq := ie + m
+						itaup := itauq + m
+						iwork = itaup + m
+
+						// Bidiagonalize L in work[ir:].
+						impl.Dgebrd(m, m, work[ir:], ldworkr, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Generate right vectors bidiagonalizing L in work[ir:].
+						impl.Dorgbr(lapack.GeneratePT, m, m, m, work[ir:], ldworkr,
+							work[itaup:], work[iwork:], lwork-iwork)
+						iwork = ie + m
+
+						// Perform bidiagonal QR iteration, computing right singular
+						// vectors of L in work[ir:].
+						ok = impl.Dbdsqr(blas.Upper, m, m, 0, 0, s, work[ie:],
+							work[ir:], ldworkr, work, 1, work, 1, work[iwork:])
+
+						// Multiply right singular vectors of L in work[ir:] by
+						// Q in A, storing result in VT.
+						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
+							work[ir:], ldworkr, a, lda, 0, vt, ldvt)
+					} else {
+						// Insufficient workspace for a fast algorithm.
+						itau := 0
+						iwork := itau + m
+
+						// Compute A = L*Q.
+						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+
+						// Copy result to VT.
+						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+
+						// Generate Q in VT.
+						impl.Dorglq(m, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
+						ie := itau
+						itauq := ie + m
+						itaup := itauq + m
+						iwork = itaup + m
+
+						// Zero out above L in A.
+						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda)
+
+						// Bidiagonalize L in A.
+						impl.Dgebrd(m, m, a, lda, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Multiply right vectors bidiagonalizing L by Q in VT.
+						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
+							a, lda, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
+						iwork = ie + m
+
+						// Perform bidiagonal QR iteration, computing right
+						// singular vectors of A in VT.
+						ok = impl.Dbdsqr(blas.Upper, m, n, 0, 0, s, work[ie:],
+							vt, ldvt, work, 1, work, 1, work[iwork:])
+					}
+				} else if wantuo {
+					// Path 5t.
+					panic(noSVDO)
+				} else if wantuas {
+					// Path 6t.
+					if lwork >= m*m+max(4*m, bdspac) {
+						// Sufficient workspace for a fast algorithm.
+						iu := 0
+						var ldworku int
+						if lwork >= wrkbl+lda*m {
+							ldworku = lda
+						} else {
+							ldworku = m
+						}
+						itau := iu + ldworku*m
+						iwork := itau + m
+
+						// Compute A = L*Q.
+						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+
+						// Copy L to work[iu:], zeroing out above it.
+						impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku)
+						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+1:], ldworku)
+
+						// Generate Q in A.
+						impl.Dorglq(m, n, m, a, lda, work[itau:], work[iwork:], lwork-iwork)
+						ie := itau
+						itauq := ie + m
+						itaup := itauq + m
+						iwork = itaup + m
+
+						// Bidiagonalize L in work[iu:], copying result to U.
+						impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu)
+
+						// Generate right bidiagonalizing vectors in work[iu:].
+						impl.Dorgbr(lapack.GeneratePT, m, m, m, work[iu:], ldworku,
+							work[itaup:], work[iwork:], lwork-iwork)
+
+						// Generate left bidiagonalizing vectors in U.
+						impl.Dorgbr(lapack.GenerateQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+						iwork = ie + m
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of L in U and computing right singular vectors of
+						// L in work[iu:].
+						ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:],
+							work[iu:], ldworku, u, ldu, work, 1, work[iwork:])
+
+						// Multiply right singular vectors of L in work[iu:] by
+						// Q in A, storing result in VT.
+						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
+							work[iu:], ldworku, a, lda, 0, vt, ldvt)
+					} else {
+						// Insufficient workspace for a fast algorithm.
+						itau := 0
+						iwork := itau + m
+
+						// Compute A = L*Q, copying result to VT.
+						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+
+						// Generate Q in VT.
+						impl.Dorglq(m, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
+
+						// Copy L to U, zeroing out above it.
+						impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
+						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu)
+
+						ie := itau
+						itauq := ie + m
+						itaup := itauq + m
+						iwork = itaup + m
+
+						// Bidiagonalize L in U.
+						impl.Dgebrd(m, m, u, ldu, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Multiply right bidiagonalizing vectors in U by Q in VT.
+						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
+							u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
+
+						// Generate left bidiagonalizing vectors in U.
+						impl.Dorgbr(lapack.GenerateQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+						iwork = ie + m
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of A in U and computing right singular vectors
+						// of A in VT.
+						ok = impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:], vt, ldvt,
+							u, ldu, work, 1, work[iwork:])
+					}
+				}
+			} else if wantva {
+				if wantun {
+					// Path 7t.
+					if lwork >= m*m+max(max(n+m, 4*m), bdspac) {
+						// Sufficient workspace for a fast algorithm.
+						ir := 0
+						var ldworkr int
+						if lwork >= wrkbl+lda*m {
+							ldworkr = lda
+						} else {
+							ldworkr = m
+						}
+						itau := ir + ldworkr*m
+						iwork := itau + m
+
+						// Compute A = L*Q, copying result to VT.
+						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+
+						// Copy L to work[ir:], zeroing out above it.
+						impl.Dlacpy(blas.Lower, m, m, a, lda, work[ir:], ldworkr)
+						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[ir+1:], ldworkr)
+
+						// Generate Q in VT.
+						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
+
+						ie := itau
+						itauq := ie + m
+						itaup := itauq + m
+						iwork = itaup + m
+
+						// Bidiagonalize L in work[ir:].
+						impl.Dgebrd(m, m, work[ir:], ldworkr, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+
+						// Generate right bidiagonalizing vectors in work[ir:].
+						impl.Dorgbr(lapack.GeneratePT, m, m, m, work[ir:], ldworkr,
+							work[itaup:], work[iwork:], lwork-iwork)
+						iwork = ie + m
+
+						// Perform bidiagonal QR iteration, computing right
+						// singular vectors of L in work[ir:].
+						ok = impl.Dbdsqr(blas.Upper, m, m, 0, 0, s, work[ie:],
+							work[ir:], ldworkr, work, 1, work, 1, work[iwork:])
+
+						// Multiply right singular vectors of L in work[ir:] by
+						// Q in VT, storing result in A.
+						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
+							work[ir:], ldworkr, vt, ldvt, 0, a, lda)
+
+						// Copy right singular vectors of A from A to VT.
+						impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt)
+					} else {
+						// Insufficient workspace for a fast algorithm.
+						itau := 0
+						iwork := itau + m
+						// Compute A = L * Q, copying result to VT.
+						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+
+						// Generate Q in VT.
+						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
+
+						ie := itau
+						itauq := ie + m
+						itaup := itauq + m
+						iwork = itaup + m
+
+						// Zero out above L in A.
+						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, a[1:], lda)
+
+						// Bidiagonalize L in A.
+						impl.Dgebrd(m, m, a, lda, s, work[ie:], work[itauq:],
+							work[itaup:], work[iwork:], lwork-iwork)
+
+						// Multiply right bidiagonalizing vectors in A by Q in VT.
+						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
+							a, lda, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
+						iwork = ie + m
+
+						// Perform bidiagonal QR iteration, computing right singular
+						// vectors of A in VT.
+						ok = impl.Dbdsqr(blas.Upper, m, n, 0, 0, s, work[ie:],
+							vt, ldvt, work, 1, work, 1, work[iwork:])
+					}
+				} else if wantuo {
+					panic(noSVDO)
+				} else if wantuas {
+					// Path 9t.
+					if lwork >= m*m+max(max(m+n, 4*m), bdspac) {
+						// Sufficient workspace for a fast algorithm.
+						iu := 0
+
+						var ldworku int
+						if lwork >= wrkbl+lda*m {
+							ldworku = lda
+						} else {
+							ldworku = m
+						}
+						itau := iu + ldworku*m
+						iwork := itau + m
+
+						// Generate A = L * Q copying result to VT.
+						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+
+						// Generate Q in VT.
+						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
+
+						// Copy L to work[iu:], zeroing out above it.
+						impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku)
+						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+1:], ldworku)
+						ie = itau
+						itauq := ie + m
+						itaup := itauq + m
+						iwork = itaup + m
+
+						// Bidiagonalize L in work[iu:], copying result to U.
+						impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:],
+							work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu)
+
+						// Generate right bidiagonalizing vectors in work[iu:].
+						impl.Dorgbr(lapack.GeneratePT, m, m, m, work[iu:], ldworku,
+							work[itaup:], work[iwork:], lwork-iwork)
+
+						// Generate left bidiagonalizing vectors in U.
+						impl.Dorgbr(lapack.GenerateQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+						iwork = ie + m
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of L in U and computing right singular vectors
+						// of L in work[iu:].
+						ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:],
+							work[iu:], ldworku, u, ldu, work, 1, work[iwork:])
+
+						// Multiply right singular vectors of L in work[iu:] by
+						// Q in VT, storing result in A.
+						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1,
+							work[iu:], ldworku, vt, ldvt, 0, a, lda)
+
+						// Copy right singular vectors of A from A to VT.
+						impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt)
+					} else {
+						// Insufficient workspace for a fast algorithm.
+						itau := 0
+						iwork := itau + m
+
+						// Compute A = L * Q, copying result to VT.
+						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
+						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+
+						// Generate Q in VT.
+						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
+
+						// Copy L to U, zeroing out above it.
+						impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
+						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu)
+
+						ie = itau
+						itauq := ie + m
+						itaup := itauq + m
+						iwork = itaup + m
+
+						// Bidiagonalize L in U.
+						impl.Dgebrd(m, m, u, ldu, s, work[ie:], work[itauq:],
+							work[itaup:], work[iwork:], lwork-iwork)
+
+						// Multiply right bidiagonalizing vectors in U by Q in VT.
+						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m,
+							u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
+
+						// Generate left bidiagonalizing vectors in U.
+						impl.Dorgbr(lapack.GenerateQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+						iwork = ie + m
+
+						// Perform bidiagonal QR iteration, computing left singular
+						// vectors of A in U and computing right singular vectors
+						// of A in VT.
+						ok = impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:],
+							vt, ldvt, u, ldu, work, 1, work[iwork:])
+					}
+				}
+			}
+		} else {
+			// Path 10t.
+			// N at least M, but not much larger.
+			ie = 0
+			itauq := ie + m
+			itaup := itauq + m
+			iwork := itaup + m
+
+			// Bidiagonalize A.
+			impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
+			if wantuas {
+				// If left singular vectors desired in U, copy result to U and
+				// generate left bidiagonalizing vectors in U.
+				impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
+				impl.Dorgbr(lapack.GenerateQ, m, m, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
+			}
+			if wantvas {
+				// If right singular vectors desired in VT, copy result to VT
+				// and generate right bidiagonalizing vectors in VT.
+				impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
+				var nrvt int
+				if wantva {
+					nrvt = n
+				} else {
+					nrvt = m
+				}
+				impl.Dorgbr(lapack.GeneratePT, nrvt, n, m, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
+			}
+			if wantuo {
+				panic(noSVDO)
+			}
+			if wantvo {
+				panic(noSVDO)
+			}
+			iwork = ie + m
+			var nru, ncvt int
+			if wantuas || wantuo {
+				nru = m
+			}
+			if wantvas || wantvo {
+				ncvt = n
+			}
+			if !wantuo && !wantvo {
+				// Perform bidiagonal QR iteration, if desired, computing left
+				// singular vectors in U and computing right singular vectors in
+				// VT.
+				ok = impl.Dbdsqr(blas.Lower, m, ncvt, nru, 0, s, work[ie:],
+					vt, ldvt, u, ldu, work, 1, work[iwork:])
+			} else {
+				// There will be two branches when the implementation is complete.
+				panic(noSVDO)
+			}
+		}
+	}
+	if !ok {
+		if ie > 1 {
+			for i := 0; i < minmn-1; i++ {
+				work[i+1] = work[i+ie]
+			}
+		}
+		if ie < 1 {
+			for i := minmn - 2; i >= 0; i-- {
+				work[i+1] = work[i+ie]
+			}
+		}
+	}
+	// Undo scaling if necessary.
+	if iscl {
+		if anrm > bignum {
+			impl.Dlascl(lapack.General, 0, 0, bignum, anrm, 1, minmn, s, minmn)
+		}
+		if !ok && anrm > bignum {
+			impl.Dlascl(lapack.General, 0, 0, bignum, anrm, 1, minmn-1, work[1:], minmn)
+		}
+		if anrm < smlnum {
+			impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, 1, minmn, s, minmn)
+		}
+		if !ok && anrm < smlnum {
+			impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, 1, minmn-1, work[1:], minmn)
+		}
+	}
+	work[0] = float64(maxwrk)
+	return ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetc2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetc2.go
new file mode 100644
index 00000000000..41203e9fa2c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetc2.go
@@ -0,0 +1,125 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgetc2 computes an LU factorization with complete pivoting of the n×n matrix
+// A. The factorization has the form
+//
+//	A = P * L * U * Q,
+//
+// where P and Q are permutation matrices, L is lower triangular with unit
+// diagonal elements and U is upper triangular.
+//
+// On entry, a contains the matrix A to be factored. On return, a is overwritten
+// with the factors L and U. The unit diagonal elements of L are not stored.
+//
+// On return, row i has been interchanged with row ipiv[i] and column j with
+// column jpiv[j]. Dgetc2 panics if n < 0 or lda < max(1,n) and, for n > 0,
+// if len(a) < (n-1)*lda+n or if ipiv or jpiv does not have length n.
+//
+// If k is negative, U was not perturbed. Otherwise k is the last index at which
+// a too-small pivot U[k,k] was replaced to avoid overflow when solving A*x=b.
+//
+// Dgetc2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgetc2(n int, a []float64, lda int, ipiv, jpiv []int) (k int) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Negative k indicates U was not perturbed.
+	k = -1
+
+	// Quick return if possible; the slice arguments are not checked when n == 0.
+	if n == 0 {
+		return k
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(ipiv) != n:
+		panic(badLenIpiv)
+	case len(jpiv) != n:
+		panic(badLenJpvt)
+	}
+
+	const (
+		eps    = dlamchP
+		smlnum = dlamchS / eps
+	)
+
+	if n == 1 {
+		ipiv[0], jpiv[0] = 0, 0
+		if math.Abs(a[0]) < smlnum {
+			k = 0
+			a[0] = smlnum
+		}
+		return k
+	}
+
+	// Factorize A using complete pivoting. Pivots smaller in magnitude than smin,
+	// which is fixed in the first iteration from the largest entry, become smin.
+	var smin float64
+	var ipv, jpv int
+	bi := blas64.Implementation()
+	for i := 0; i < n-1; i++ {
+		var xmax float64 // Largest magnitude found in rows i..n-1, columns i..n-1.
+		for ip := i; ip < n; ip++ {
+			for jp := i; jp < n; jp++ {
+				if math.Abs(a[ip*lda+jp]) >= xmax {
+					xmax = math.Abs(a[ip*lda+jp])
+					ipv = ip
+					jpv = jp
+				}
+			}
+		}
+		if i == 0 {
+			smin = math.Max(eps*xmax, smlnum)
+		}
+
+		// Move the pivot row into row i and record the interchange.
+		if ipv != i {
+			bi.Dswap(n, a[ipv*lda:], 1, a[i*lda:], 1)
+		}
+		ipiv[i] = ipv
+
+		// Move the pivot column into column i and record the interchange.
+		if jpv != i {
+			bi.Dswap(n, a[jpv:], lda, a[i:], lda)
+		}
+		jpiv[i] = jpv
+
+		// Replace a too-small pivot by smin and remember its index in k.
+		if math.Abs(a[i*lda+i]) < smin {
+			k = i
+			a[i*lda+i] = smin
+		}
+
+		for j := i + 1; j < n; j++ {
+			a[j*lda+i] /= a[i*lda+i] // Entry (j,i) below the pivot divided by the pivot.
+		}
+		bi.Dger(n-i-1, n-i-1, -1, a[(i+1)*lda+i:], lda, a[i*lda+i+1:], 1, a[(i+1)*lda+i+1:], lda)
+	}
+
+	if math.Abs(a[(n-1)*lda+n-1]) < smin {
+		k = n - 1
+		a[(n-1)*lda+(n-1)] = smin
+	}
+
+	// The last row and column are not interchanged with any other.
+	ipiv[n-1] = n - 1
+	jpiv[n-1] = n - 1
+
+	return k
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetf2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetf2.go
new file mode 100644
index 00000000000..6a7003cf316
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetf2.go
@@ -0,0 +1,90 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgetf2 computes the LU decomposition of an m×n matrix A using partial
+// pivoting with row interchanges.
+//
+// The LU decomposition is a factorization of A into
+//
+//	A = P * L * U
+//
+// where P is a permutation matrix, L is a lower triangular with unit diagonal
+// elements (lower trapezoidal if m > n), and U is upper triangular (upper
+// trapezoidal if m < n).
+//
+// On entry, a contains the matrix A. On return, L and U are stored in place
+// into a, and P is represented by ipiv.
+//
+// ipiv contains a sequence of row interchanges. It indicates that row i of the
+// matrix was interchanged with ipiv[i]. ipiv must have length min(m,n), and
+// Dgetf2 will panic otherwise. ipiv is zero-indexed.
+//
+// Dgetf2 returns whether the matrix A is nonsingular. The LU decomposition will
+// be computed regardless of the singularity of A, but the result should not be
+// used to solve a system of equations.
+//
+// Dgetf2 is an internal routine. It is exported for testing purposes.
+func (Implementation) Dgetf2(m, n int, a []float64, lda int, ipiv []int) (ok bool) {
+	mn := min(m, n)
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if mn == 0 {
+		return true
+	}
+
+	switch { // Slice lengths are only validated once both dimensions are non-zero.
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(ipiv) != mn:
+		panic(badLenIpiv)
+	}
+
+	bi := blas64.Implementation()
+
+	sfmin := dlamchS // Below this magnitude the reciprocal 1/aj is not formed; entries are divided directly.
+	ok = true
+	for j := 0; j < mn; j++ {
+		// Find a pivot and test for singularity.
+		jp := j + bi.Idamax(m-j, a[j*lda+j:], lda)
+		ipiv[j] = jp
+		if a[jp*lda+j] == 0 {
+			ok = false // Exactly zero pivot: record it and continue without swapping or scaling.
+		} else {
+			// Swap the rows if necessary.
+			if jp != j {
+				bi.Dswap(n, a[j*lda:], 1, a[jp*lda:], 1)
+			}
+			if j < m-1 {
+				aj := a[j*lda+j]
+				if math.Abs(aj) >= sfmin {
+					bi.Dscal(m-j-1, 1/aj, a[(j+1)*lda+j:], lda)
+				} else {
+					for i := 0; i < m-j-1; i++ { // Scale each sub-diagonal entry of column j in turn.
+						a[(j+1+i)*lda+j] /= aj
+					}
+				}
+			}
+		}
+		if j < mn-1 { // Rank-one update of the trailing submatrix; skipped for the last column.
+			bi.Dger(m-j-1, n-j-1, -1, a[(j+1)*lda+j:], lda, a[j*lda+j+1:], 1, a[(j+1)*lda+j+1:], lda)
+		}
+	}
+	return ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrf.go
new file mode 100644
index 00000000000..38ae8efa143
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrf.go
@@ -0,0 +1,89 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgetrf computes the LU decomposition of an m×n matrix A using partial
+// pivoting with row interchanges.
+//
+// The LU decomposition is a factorization of A into
+//
+//	A = P * L * U
+//
+// where P is a permutation matrix, L is a lower triangular with unit diagonal
+// elements (lower trapezoidal if m > n), and U is upper triangular (upper
+// trapezoidal if m < n).
+//
+// On entry, a contains the matrix A. On return, L and U are stored in place
+// into a, and P is represented by ipiv.
+//
+// ipiv contains a sequence of row interchanges. It indicates that row i of the
+// matrix was interchanged with ipiv[i]. ipiv must have length min(m,n), and
+// Dgetrf will panic otherwise. ipiv is zero-indexed.
+//
+// Dgetrf returns whether the matrix A is nonsingular. The LU decomposition will
+// be computed regardless of the singularity of A, but the result should not be
+// used to solve a system of equations.
+func (impl Implementation) Dgetrf(m, n int, a []float64, lda int, ipiv []int) (ok bool) {
+	mn := min(m, n)
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if mn == 0 {
+		return true
+	}
+
+	switch { // Slice lengths are only validated once both dimensions are non-zero.
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(ipiv) != mn:
+		panic(badLenIpiv)
+	}
+
+	bi := blas64.Implementation()
+
+	nb := impl.Ilaenv(1, "DGETRF", " ", m, n, -1, -1) // Block size to use for the panel factorization.
+	if nb <= 1 || mn <= nb {
+		// Use the unblocked algorithm.
+		return impl.Dgetf2(m, n, a, lda, ipiv)
+	}
+	ok = true
+	for j := 0; j < mn; j += nb { // Process the matrix one panel of up to nb columns at a time.
+		jb := min(mn-j, nb)
+		blockOk := impl.Dgetf2(m-j, jb, a[j*lda+j:], lda, ipiv[j:j+jb]) // Unblocked factorization of the panel a[j:m, j:j+jb].
+		if !blockOk {
+			ok = false
+		}
+		for i := j; i <= min(m-1, j+jb-1); i++ { // Panel pivots are relative to row j; shift them to absolute row indices.
+			ipiv[i] = j + ipiv[i]
+		}
+		impl.Dlaswp(j, a, lda, j, j+jb-1, ipiv[:j+jb], 1) // Apply the panel's interchanges to the j columns left of it.
+		if j+jb < n {
+			impl.Dlaswp(n-j-jb, a[j+jb:], lda, j, j+jb-1, ipiv[:j+jb], 1) // Same interchanges for the columns right of the panel.
+			bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.Unit, // Solve with the panel's unit lower triangle to update the block row to its right.
+				jb, n-j-jb, 1,
+				a[j*lda+j:], lda,
+				a[j*lda+j+jb:], lda)
+			if j+jb < m {
+				bi.Dgemm(blas.NoTrans, blas.NoTrans, m-j-jb, n-j-jb, jb, -1, // Subtract the product of the sub-panel and the updated block row from the trailing block.
+					a[(j+jb)*lda+j:], lda,
+					a[j*lda+j+jb:], lda,
+					1, a[(j+jb)*lda+j+jb:], lda)
+			}
+		}
+	}
+	return ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetri.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetri.go
new file mode 100644
index 00000000000..b2f2ae46b92
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetri.go
@@ -0,0 +1,116 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgetri computes the inverse of the matrix A using the LU factorization computed
+// by Dgetrf. On entry, a contains the PLU decomposition of A as computed by
+// Dgetrf and on exit contains the inverse of the original matrix.
+//
+// Dgetri will not perform the inversion if the matrix is singular, and returns
+// a boolean indicating whether the inversion was successful.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= n and this function will panic otherwise.
+// Dgetri is a blocked inversion, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Dgetri,
+// the optimal work length will be stored into work[0].
+func (impl Implementation) Dgetri(n int, a []float64, lda int, ipiv []int, work []float64, lwork int) (ok bool) {
+	iws := max(1, n) // Minimum workspace; the final value is reported through work[0] on success.
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < iws && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	if n == 0 { // Nothing to invert.
+		work[0] = 1
+		return true
+	}
+
+	nb := impl.Ilaenv(1, "DGETRI", " ", n, -1, -1, -1)
+	if lwork == -1 { // Workspace query: report n*nb and return without touching a.
+		work[0] = float64(n * nb)
+		return true
+	}
+
+	switch { // a and ipiv are only validated when real work is requested.
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(ipiv) != n:
+		panic(badLenIpiv)
+	}
+
+	// Form inv(U).
+	ok = impl.Dtrtri(blas.Upper, blas.NonUnit, n, a, lda)
+	if !ok { // Dtrtri failed, so the inversion is abandoned.
+		return false
+	}
+
+	nbmin := 2
+	if 1 < nb && nb < n {
+		iws = max(n*nb, 1) // The blocked path uses an n×nb workspace.
+		if lwork < iws { // Not enough workspace for nb: use the largest block size that fits.
+			nb = lwork / n
+			nbmin = max(2, impl.Ilaenv(2, "DGETRI", " ", n, -1, -1, -1))
+		}
+	}
+	ldwork := nb // Row stride of work when it is indexed as an n×nb block below.
+
+	bi := blas64.Implementation()
+	// Solve the equation inv(A)*L = inv(U) for inv(A).
+	// TODO(btracey): Replace this with a more row-major oriented algorithm.
+	if nb < nbmin || n <= nb {
+		// Unblocked code.
+		for j := n - 1; j >= 0; j-- { // Columns are processed from last to first.
+			for i := j + 1; i < n; i++ {
+				// Copy current column of L to work and replace with zeros.
+				work[i] = a[i*lda+j]
+				a[i*lda+j] = 0
+			}
+			// Compute current column of inv(A).
+			if j < n-1 {
+				bi.Dgemv(blas.NoTrans, n, n-j-1, -1, a[(j+1):], lda, work[(j+1):], 1, 1, a[j:], lda)
+			}
+		}
+	} else {
+		// Blocked code.
+		nn := ((n - 1) / nb) * nb // First column of the last (possibly partial) block.
+		for j := nn; j >= 0; j -= nb { // Block columns are processed from last to first.
+			jb := min(nb, n-j)
+			// Copy current block column of L to work and replace
+			// with zeros.
+			for jj := j; jj < j+jb; jj++ {
+				for i := jj + 1; i < n; i++ {
+					work[i*ldwork+(jj-j)] = a[i*lda+jj]
+					a[i*lda+jj] = 0
+				}
+			}
+			// Compute current block column of inv(A).
+			if j+jb < n {
+				bi.Dgemm(blas.NoTrans, blas.NoTrans, n, jb, n-j-jb, -1, a[(j+jb):], lda, work[(j+jb)*ldwork:], ldwork, 1, a[j:], lda)
+			}
+			bi.Dtrsm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, n, jb, 1, work[j*ldwork:], ldwork, a[j:], lda)
+		}
+	}
+	// Apply column interchanges.
+	for j := n - 2; j >= 0; j-- { // Reverse order, starting at n-2; ipiv[n-1] is never read.
+		jp := ipiv[j]
+		if jp != j {
+			bi.Dswap(n, a[j:], lda, a[jp:], lda)
+		}
+	}
+	work[0] = float64(iws)
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrs.go
new file mode 100644
index 00000000000..35b33aa7d77
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgetrs.go
@@ -0,0 +1,74 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dgetrs solves a system of equations using an LU factorization.
+// The system of equations solved is
+//
+//	A * X = B  if trans == blas.NoTrans
+//	Aᵀ * X = B if trans == blas.Trans or blas.ConjTrans
+//
+// A is a general n×n matrix with stride lda. B is a general matrix of size n×nrhs.
+//
+// On entry b contains the elements of the matrix B. On exit, b contains the
+// elements of X, the solution to the system of equations.
+//
+// a and ipiv contain the LU factorization of A and the permutation indices as
+// computed by Dgetrf. ipiv is zero-indexed.
+func (impl Implementation) Dgetrs(trans blas.Transpose, n, nrhs int, a []float64, lda int, ipiv []int, b []float64, ldb int) {
+	switch {
+	case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans:
+		panic(badTrans)
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if n == 0 || nrhs == 0 {
+		return
+	}
+
+	switch { // Slice lengths are only validated when there is work to do.
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(b) < (n-1)*ldb+nrhs:
+		panic(shortB)
+	case len(ipiv) != n:
+		panic(badLenIpiv)
+	}
+
+	bi := blas64.Implementation()
+
+	if trans == blas.NoTrans {
+		// Solve A * X = B.
+		impl.Dlaswp(nrhs, b, ldb, 0, n-1, ipiv, 1) // Apply the interchanges in ipiv to the rows of b (increment 1).
+		// Solve L * X = B, updating b.
+		bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.Unit,
+			n, nrhs, 1, a, lda, b, ldb)
+		// Solve U * X = B, updating b.
+		bi.Dtrsm(blas.Left, blas.Upper, blas.NoTrans, blas.NonUnit,
+			n, nrhs, 1, a, lda, b, ldb)
+		return
+	}
+	// Solve Aᵀ * X = B (this path is taken for both blas.Trans and blas.ConjTrans).
+	// Solve Uᵀ * X = B, updating b.
+	bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit,
+		n, nrhs, 1, a, lda, b, ldb)
+	// Solve Lᵀ * X = B, updating b.
+	bi.Dtrsm(blas.Left, blas.Lower, blas.Trans, blas.Unit,
+		n, nrhs, 1, a, lda, b, ldb)
+	impl.Dlaswp(nrhs, b, ldb, 0, n-1, ipiv, -1) // Apply the interchanges to the rows of b with increment -1.
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgghrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgghrd.go
new file mode 100644
index 00000000000..c9d6b4d1b3c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgghrd.go
@@ -0,0 +1,125 @@
+// Copyright ©2023 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dgghrd reduces a pair of real matrices (A,B) to generalized upper Hessenberg
+// form using orthogonal transformations, where A is a general matrix and B is
+// upper triangular.
+//
+// This subroutine simultaneously reduces A to a Hessenberg matrix H
+//
+//	Qᵀ*A*Z = H,
+//
+// and transforms B to another upper triangular matrix T
+//
+//	Qᵀ*B*Z = T.
+//
+// The orthogonal matrices Q and Z are determined as products of Givens
+// rotations. They may either be formed explicitly (lapack.OrthoExplicit), or
+// they may be postmultiplied into input matrices Q1 and Z1
+// (lapack.OrthoPostmul), so that
+//
+//	Q1 * A * Z1ᵀ = (Q1*Q) * H * (Z1*Z)ᵀ,
+//	Q1 * B * Z1ᵀ = (Q1*Q) * T * (Z1*Z)ᵀ.
+//
+// ilo and ihi determine the block of A that will be reduced. It must hold that
+//
+//   - 0 <= ilo <= ihi < n      if n > 0,
+//   - ilo == 0 and ihi == -1   if n == 0,
+//
+// otherwise Dgghrd will panic.
+//
+// Dgghrd is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgghrd(compq, compz lapack.OrthoComp, n, ilo, ihi int, a []float64, lda int, b []float64, ldb int, q []float64, ldq int, z []float64, ldz int) {
+	switch {
+	case compq != lapack.OrthoNone && compq != lapack.OrthoExplicit && compq != lapack.OrthoPostmul:
+		panic(badOrthoComp)
+	case compz != lapack.OrthoNone && compz != lapack.OrthoExplicit && compz != lapack.OrthoPostmul:
+		panic(badOrthoComp)
+	case n < 0:
+		panic(nLT0)
+	case ilo < 0 || max(0, n-1) < ilo:
+		panic(badIlo)
+	case ihi < min(ilo, n-1) || n <= ihi:
+		panic(badIhi)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	case (compq != lapack.OrthoNone && ldq < n) || ldq < 1: // ldq must be at least 1 even when Q is not referenced.
+		panic(badLdQ)
+	case (compz != lapack.OrthoNone && ldz < n) || ldz < 1: // ldz must be at least 1 even when Z is not referenced.
+		panic(badLdZ)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	switch { // q and z lengths are only checked when the corresponding matrix is requested.
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(b) < (n-1)*ldb+n:
+		panic(shortB)
+	case compq != lapack.OrthoNone && len(q) < (n-1)*ldq+n:
+		panic(shortQ)
+	case compz != lapack.OrthoNone && len(z) < (n-1)*ldz+n:
+		panic(shortZ)
+	}
+
+	if compq == lapack.OrthoExplicit { // Q is initialized here; with OrthoPostmul the caller's q is used as is.
+		impl.Dlaset(blas.All, n, n, 0, 1, q, ldq)
+	}
+	if compz == lapack.OrthoExplicit { // Z is initialized here; with OrthoPostmul the caller's z is used as is.
+		impl.Dlaset(blas.All, n, n, 0, 1, z, ldz)
+	}
+
+	// Quick return if possible.
+	if n == 1 {
+		return
+	}
+
+	// Zero out the strictly lower triangle of B.
+	for i := 1; i < n; i++ {
+		for j := 0; j < i; j++ {
+			b[i*ldb+j] = 0
+		}
+	}
+	bi := blas64.Implementation()
+	// Reduce A and B.
+	for jcol := ilo; jcol <= ihi-2; jcol++ { // Columns are processed left to right.
+		for jrow := ihi; jrow >= jcol+2; jrow-- { // Within a column, rows are processed from ihi upwards.
+			// Step 1: rotate rows jrow-1, jrow to kill A[jrow,jcol].
+			var c, s float64
+			c, s, a[(jrow-1)*lda+jcol] = impl.Dlartg(a[(jrow-1)*lda+jcol], a[jrow*lda+jcol])
+			a[jrow*lda+jcol] = 0
+
+			bi.Drot(n-jcol-1, a[(jrow-1)*lda+jcol+1:], 1, a[jrow*lda+jcol+1:], 1, c, s)       // Rows jrow-1 and jrow of A, columns jcol+1 onwards.
+			bi.Drot(n+2-jrow-1, b[(jrow-1)*ldb+jrow-1:], 1, b[jrow*ldb+jrow-1:], 1, c, s) // Rows jrow-1 and jrow of B, columns jrow-1 onwards.
+
+			if compq != lapack.OrthoNone {
+				bi.Drot(n, q[jrow-1:], ldq, q[jrow:], ldq, c, s) // Columns jrow-1 and jrow of Q.
+			}
+
+			// Step 2: rotate columns jrow, jrow-1 to kill B[jrow,jrow-1].
+			c, s, b[jrow*ldb+jrow] = impl.Dlartg(b[jrow*ldb+jrow], b[jrow*ldb+jrow-1])
+			b[jrow*ldb+jrow-1] = 0
+
+			bi.Drot(ihi+1, a[jrow:], lda, a[jrow-1:], lda, c, s) // Columns jrow and jrow-1 of A, rows 0 through ihi.
+			bi.Drot(jrow, b[jrow:], ldb, b[jrow-1:], ldb, c, s)  // Columns jrow and jrow-1 of B, rows 0 through jrow-1.
+
+			if compz != lapack.OrthoNone {
+				bi.Drot(n, z[jrow:], ldz, z[jrow-1:], ldz, c, s) // Columns jrow and jrow-1 of Z.
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvd3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvd3.go
new file mode 100644
index 00000000000..cfe10efa9da
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvd3.go
@@ -0,0 +1,258 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dggsvd3 computes the generalized singular value decomposition (GSVD)
+// of an m×n matrix A and p×n matrix B:
+//
+//	Uᵀ*A*Q = D1*[ 0 R ]
+//
+//	Vᵀ*B*Q = D2*[ 0 R ]
+//
+// where U, V and Q are orthogonal matrices.
+//
+// Dggsvd3 returns k and l, the dimensions of the sub-blocks. k+l
+// is the effective numerical rank of the (m+p)×n matrix [ Aᵀ Bᵀ ]ᵀ.
+// R is a (k+l)×(k+l) nonsingular upper triangular matrix, D1 and
+// D2 are m×(k+l) and p×(k+l) diagonal matrices and of the following
+// structures, respectively:
+//
+// If m-k-l >= 0,
+//
+//	                  k  l
+//	     D1 =     k [ I  0 ]
+//	              l [ 0  C ]
+//	          m-k-l [ 0  0 ]
+//
+//	                k  l
+//	     D2 = l   [ 0  S ]
+//	          p-l [ 0  0 ]
+//
+//	             n-k-l  k    l
+//	[ 0 R ] = k [  0   R11  R12 ] k
+//	          l [  0    0   R22 ] l
+//
+// where
+//
+//	C = diag( alpha_k, ... , alpha_{k+l} ),
+//	S = diag( beta_k,  ... , beta_{k+l} ),
+//	C^2 + S^2 = I.
+//
+// R is stored in
+//
+//	A[0:k+l, n-k-l:n]
+//
+// on exit.
+//
+// If m-k-l < 0,
+//
+//	               k m-k k+l-m
+//	    D1 =   k [ I  0    0  ]
+//	         m-k [ 0  C    0  ]
+//
+//	                 k m-k k+l-m
+//	    D2 =   m-k [ 0  S    0  ]
+//	         k+l-m [ 0  0    I  ]
+//	           p-l [ 0  0    0  ]
+//
+//	               n-k-l  k   m-k  k+l-m
+//	[ 0 R ] =    k [ 0    R11  R12  R13 ]
+//	           m-k [ 0     0   R22  R23 ]
+//	         k+l-m [ 0     0    0   R33 ]
+//
+// where
+//
+//	C = diag( alpha_k, ... , alpha_m ),
+//	S = diag( beta_k,  ... , beta_m ),
+//	C^2 + S^2 = I.
+//
+//	R = [ R11 R12 R13 ] is stored in A[0:m, n-k-l:n]
+//	    [  0  R22 R23 ]
+//
+// and R33 is stored in
+//
+//	B[m-k:l, n+m-k-l:n] on exit.
+//
+// Dggsvd3 computes C, S, R, and optionally the orthogonal transformation
+// matrices U, V and Q.
+//
+// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior
+// is as follows
+//
+//	jobU == lapack.GSVDU        Compute orthogonal matrix U
+//	jobU == lapack.GSVDNone     Do not compute orthogonal matrix.
+//
+// The behavior is the same for jobV and jobQ with the exception that instead of
+// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively.
+// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the
+// relevant job parameter is lapack.GSVDNone.
+//
+// alpha and beta must have length n or Dggsvd3 will panic. On exit, alpha and
+// beta contain the generalized singular value pairs of A and B
+//
+//	alpha[0:k] = 1,
+//	beta[0:k]  = 0,
+//
+// if m-k-l >= 0,
+//
+//	alpha[k:k+l] = diag(C),
+//	beta[k:k+l]  = diag(S),
+//
+// if m-k-l < 0,
+//
+//	alpha[k:m]= C, alpha[m:k+l]= 0
+//	beta[k:m] = S, beta[m:k+l] = 1.
+//
+// if k+l < n,
+//
+//	alpha[k+l:n] = 0 and
+//	beta[k+l:n]  = 0.
+//
+// On exit, iwork contains the permutation required to sort alpha descending.
+//
+// iwork must have length n, work must have length at least max(1, lwork), and
+// lwork must be -1 or greater than n, otherwise Dggsvd3 will panic. If
+// lwork is -1, work[0] holds the optimal lwork on return, but Dggsvd3 does
+// not perform the GSVD.
+func (impl Implementation) Dggsvd3(jobU, jobV, jobQ lapack.GSVDJob, m, n, p int, a []float64, lda int, b []float64, ldb int, alpha, beta, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, work []float64, lwork int, iwork []int) (k, l int, ok bool) {
+	wantu := jobU == lapack.GSVDU
+	wantv := jobV == lapack.GSVDV
+	wantq := jobQ == lapack.GSVDQ
+	switch {
+	case !wantu && jobU != lapack.GSVDNone:
+		panic(badGSVDJob + "U")
+	case !wantv && jobV != lapack.GSVDNone:
+		panic(badGSVDJob + "V")
+	case !wantq && jobQ != lapack.GSVDNone:
+		panic(badGSVDJob + "Q")
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case p < 0:
+		panic(pLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	case ldu < 1, wantu && ldu < m:
+		panic(badLdU)
+	case ldv < 1, wantv && ldv < p:
+		panic(badLdV)
+	case ldq < 1, wantq && ldq < n:
+		panic(badLdQ)
+	case len(iwork) < n: // Dggsvp3, called below with iwork as is, additionally panics unless len(iwork) == n.
+		panic(shortWork)
+	case lwork < 1 && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Determine optimal work length.
+	impl.Dggsvp3(jobU, jobV, jobQ,
+		m, p, n,
+		a, lda,
+		b, ldb,
+		0, 0,
+		u, ldu,
+		v, ldv,
+		q, ldq,
+		iwork,
+		work, work, -1) // Workspace query (lwork == -1); work is passed for both tau and work.
+	lwkopt := n + int(work[0]) // n extra elements because work[:n] is handed to Dggsvp3 as tau below.
+	lwkopt = max(lwkopt, 2*n)
+	lwkopt = max(lwkopt, 1)
+	work[0] = float64(lwkopt)
+	if lwork == -1 { // Query only: the sizes of a, b, u, v, q, alpha and beta are not checked.
+		return 0, 0, true
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(b) < (p-1)*ldb+n:
+		panic(shortB)
+	case wantu && len(u) < (m-1)*ldu+m:
+		panic(shortU)
+	case wantv && len(v) < (p-1)*ldv+p:
+		panic(shortV)
+	case wantq && len(q) < (n-1)*ldq+n:
+		panic(shortQ)
+	case len(alpha) != n:
+		panic(badLenAlpha)
+	case len(beta) != n:
+		panic(badLenBeta)
+	}
+
+	// Compute the Frobenius norm of matrices A and B.
+	anorm := impl.Dlange(lapack.Frobenius, m, n, a, lda, nil)
+	bnorm := impl.Dlange(lapack.Frobenius, p, n, b, ldb, nil)
+
+	// Get machine precision and set up threshold for determining
+	// the effective numerical rank of the matrices A and B.
+	tola := float64(max(m, n)) * math.Max(anorm, dlamchS) * dlamchP
+	tolb := float64(max(p, n)) * math.Max(bnorm, dlamchS) * dlamchP
+
+	// Preprocessing.
+	k, l = impl.Dggsvp3(jobU, jobV, jobQ,
+		m, p, n,
+		a, lda,
+		b, ldb,
+		tola, tolb,
+		u, ldu,
+		v, ldv,
+		q, ldq,
+		iwork,
+		work[:n], work[n:], lwork-n) // The first n elements of work serve as tau; the remainder is the workspace.
+
+	// Compute the GSVD of two upper "triangular" matrices.
+	_, ok = impl.Dtgsja(jobU, jobV, jobQ, // The first result of Dtgsja is discarded.
+		m, p, n,
+		k, l,
+		a, lda,
+		b, ldb,
+		tola, tolb,
+		alpha, beta,
+		u, ldu,
+		v, ldv,
+		q, ldq,
+		work)
+
+	// Sort the singular values and store the pivot indices in iwork
+	// Copy alpha to work, then sort alpha in work.
+	bi := blas64.Implementation()
+	bi.Dcopy(n, alpha, 1, work[:n], 1)
+	ibnd := min(l, m-k) // Number of values, starting at index k, that take part in the sort.
+	for i := 0; i < ibnd; i++ { // Selection sort on the copy in work.
+		// Scan for largest alpha_{k+i}.
+		isub := i
+		smax := work[k+i]
+		for j := i + 1; j < ibnd; j++ {
+			if v := work[k+j]; v > smax { // This v is a loop-local value that shadows the parameter v.
+				isub = j
+				smax = v
+			}
+		}
+		if isub != i { // Swap within work only; alpha itself is left in its original order.
+			work[k+isub] = work[k+i]
+			work[k+i] = smax
+			iwork[k+i] = k + isub
+		} else {
+			iwork[k+i] = k + i
+		}
+	}
+
+	work[0] = float64(lwkopt) // Overwrites the first element of the scratch copy with the optimal lwork.
+
+	return k, l, ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvp3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvp3.go
new file mode 100644
index 00000000000..f7f04c764f5
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dggsvp3.go
@@ -0,0 +1,286 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dggsvp3 computes orthogonal matrices U, V and Q such that
+//
+//	                n-k-l  k    l
+//	Uᵀ*A*Q =     k [ 0    A12  A13 ] if m-k-l >= 0;
+//	             l [ 0     0   A23 ]
+//	         m-k-l [ 0     0    0  ]
+//
+//	                n-k-l  k    l
+//	Uᵀ*A*Q =     k [ 0    A12  A13 ] if m-k-l < 0;
+//	           m-k [ 0     0   A23 ]
+//
+//	                n-k-l  k    l
+//	Vᵀ*B*Q =     l [ 0     0   B13 ]
+//	           p-l [ 0     0    0  ]
+//
+// where the k×k matrix A12 and l×l matrix B13 are non-singular
+// upper triangular. A23 is l×l upper triangular if m-k-l >= 0,
+// otherwise A23 is (m-k)×l upper trapezoidal.
+//
+// Dggsvp3 returns k and l, the dimensions of the sub-blocks. k+l
+// is the effective numerical rank of the (m+p)×n matrix [ Aᵀ Bᵀ ]ᵀ.
+//
+// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior
+// is as follows
+//
+//	jobU == lapack.GSVDU        Compute orthogonal matrix U
+//	jobU == lapack.GSVDNone     Do not compute orthogonal matrix.
+//
+// The behavior is the same for jobV and jobQ with the exception that instead of
+// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively.
+// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the
+// relevant job parameter is lapack.GSVDNone.
+//
+// tola and tolb are the convergence criteria for the Jacobi-Kogbetliantz
+// iteration procedure. Generally, they are the same as used in the preprocessing
+// step, for example,
+//
+//	tola = max(m, n)*norm(A)*eps,
+//	tolb = max(p, n)*norm(B)*eps.
+//
+// Where eps is the machine epsilon.
+//
+// iwork must have length n, work must have length at least max(1, lwork), and
+// lwork must be -1 or greater than zero, otherwise Dggsvp3 will panic.
+//
+// Dggsvp3 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dggsvp3(jobU, jobV, jobQ lapack.GSVDJob, m, p, n int, a []float64, lda int, b []float64, ldb int, tola, tolb float64, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, iwork []int, tau, work []float64, lwork int) (k, l int) {
+	wantu := jobU == lapack.GSVDU
+	wantv := jobV == lapack.GSVDV
+	wantq := jobQ == lapack.GSVDQ
+	switch {
+	case !wantu && jobU != lapack.GSVDNone:
+		panic(badGSVDJob + "U")
+	case !wantv && jobV != lapack.GSVDNone:
+		panic(badGSVDJob + "V")
+	case !wantq && jobQ != lapack.GSVDNone:
+		panic(badGSVDJob + "Q")
+	case m < 0:
+		panic(mLT0)
+	case p < 0:
+		panic(pLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	case ldu < 1, wantu && ldu < m:
+		panic(badLdU)
+	case ldv < 1, wantv && ldv < p:
+		panic(badLdV)
+	case ldq < 1, wantq && ldq < n:
+		panic(badLdQ)
+	case len(iwork) != n:
+		panic(shortWork)
+	case lwork < 1 && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	var lwkopt int
+	impl.Dgeqp3(p, n, b, ldb, iwork, tau, work, -1) // Workspace query for the pivoted QR of B.
+	lwkopt = int(work[0])
+	if wantv {
+		lwkopt = max(lwkopt, p)
+	}
+	lwkopt = max(lwkopt, min(n, p))
+	lwkopt = max(lwkopt, m)
+	if wantq {
+		lwkopt = max(lwkopt, n)
+	}
+	impl.Dgeqp3(m, n, a, lda, iwork, tau, work, -1) // Workspace query for the pivoted QR of A.
+	lwkopt = max(lwkopt, int(work[0]))
+	lwkopt = max(1, lwkopt)
+	if lwork == -1 { // Query only: report the optimal size and skip all remaining checks and work.
+		work[0] = float64(lwkopt)
+		return 0, 0
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(b) < (p-1)*ldb+n:
+		panic(shortB)
+	case wantu && len(u) < (m-1)*ldu+m:
+		panic(shortU)
+	case wantv && len(v) < (p-1)*ldv+p:
+		panic(shortV)
+	case wantq && len(q) < (n-1)*ldq+n:
+		panic(shortQ)
+	case len(tau) < n:
+		// tau check must come after lwkopt query since
+		// the Dggsvd3 call for lwkopt query may have
+		// lwork == -1, and tau is provided by work.
+		panic(shortTau)
+	}
+
+	const forward = true // Direction flag passed to every Dlapmt call below.
+
+	// QR with column pivoting of B: B*P = V*[ S11 S12 ].
+	//                                       [  0   0  ]
+	for i := range iwork[:n] { // Reset the pivot array before the factorization.
+		iwork[i] = 0
+	}
+	impl.Dgeqp3(p, n, b, ldb, iwork, tau, work, lwork)
+
+	// Update A := A*P.
+	impl.Dlapmt(forward, m, n, a, lda, iwork)
+
+	// Determine the effective rank of matrix B.
+	for i := 0; i < min(p, n); i++ { // l counts the diagonal entries whose magnitude exceeds tolb.
+		if math.Abs(b[i*ldb+i]) > tolb {
+			l++
+		}
+	}
+
+	if wantv {
+		// Copy the details of V, and form V.
+		impl.Dlaset(blas.All, p, p, 0, 0, v, ldv)
+		if p > 1 {
+			impl.Dlacpy(blas.Lower, p-1, min(p, n), b[ldb:], ldb, v[ldv:], ldv)
+		}
+		impl.Dorg2r(p, p, min(p, n), v, ldv, tau[:min(p, n)], work)
+	}
+
+	// Clean up B.
+	for i := 1; i < l; i++ { // Zero the strictly lower triangle of the leading l rows.
+		r := b[i*ldb : i*ldb+i]
+		for j := range r {
+			r[j] = 0
+		}
+	}
+	if p > l { // Zero every row below the first l.
+		impl.Dlaset(blas.All, p-l, n, 0, 0, b[l*ldb:], ldb)
+	}
+
+	if wantq {
+		// Set Q = I and update Q := Q*P.
+		impl.Dlaset(blas.All, n, n, 0, 1, q, ldq)
+		impl.Dlapmt(forward, n, n, q, ldq, iwork)
+	}
+
+	if p >= l && n != l {
+		// RQ factorization of [ S11 S12 ]: [ S11 S12 ] = [ 0 S12 ]*Z.
+		impl.Dgerq2(l, n, b, ldb, tau, work)
+
+		// Update A := A*Zᵀ.
+		impl.Dormr2(blas.Right, blas.Trans, m, n, l, b, ldb, tau, a, lda, work)
+
+		if wantq {
+			// Update Q := Q*Zᵀ.
+			impl.Dormr2(blas.Right, blas.Trans, n, n, l, b, ldb, tau, q, ldq, work)
+		}
+
+		// Clean up B.
+		impl.Dlaset(blas.All, l, n-l, 0, 0, b, ldb)
+		for i := 1; i < l; i++ { // Zero the entries left of the diagonal of the trailing l×l block.
+			r := b[i*ldb+n-l : i*ldb+i+n-l]
+			for j := range r {
+				r[j] = 0
+			}
+		}
+	}
+
+	// Let              N-L     L
+	//            A = [ A11    A12 ] M,
+	//
+	// then the following does the complete QR decomposition of A11:
+	//
+	//          A11 = U*[  0  T12 ]*P1ᵀ.
+	//                  [  0   0  ]
+	for i := range iwork[:n-l] { // Reset the pivot entries used by the factorization of A11.
+		iwork[i] = 0
+	}
+	impl.Dgeqp3(m, n-l, a, lda, iwork[:n-l], tau, work, lwork)
+
+	// Determine the effective rank of A11.
+	for i := 0; i < min(m, n-l); i++ { // k counts the diagonal entries whose magnitude exceeds tola.
+		if math.Abs(a[i*lda+i]) > tola {
+			k++
+		}
+	}
+
+	// Update A12 := Uᵀ*A12, where A12 = A[0:m, n-l:n].
+	impl.Dorm2r(blas.Left, blas.Trans, m, l, min(m, n-l), a, lda, tau[:min(m, n-l)], a[n-l:], lda, work)
+
+	if wantu {
+		// Copy the details of U, and form U.
+		impl.Dlaset(blas.All, m, m, 0, 0, u, ldu)
+		if m > 1 {
+			impl.Dlacpy(blas.Lower, m-1, min(m, n-l), a[lda:], lda, u[ldu:], ldu)
+		}
+		nrefl := min(m, n-l) // Reflector count; named so it does not shadow the result k.
+		impl.Dorg2r(m, m, nrefl, u, ldu, tau[:nrefl], work)
+	}
+
+	if wantq {
+		// Update Q[0:n, 0:n-l] := Q[0:n, 0:n-l]*P1.
+		impl.Dlapmt(forward, n, n-l, q, ldq, iwork[:n-l])
+	}
+
+	// Clean up A: set the strictly lower triangular part of
+	// A[0:k, 0:k] = 0, and A[k:m, 0:n-l] = 0.
+	for i := 1; i < k; i++ {
+		r := a[i*lda : i*lda+i]
+		for j := range r {
+			r[j] = 0
+		}
+	}
+	if m > k {
+		impl.Dlaset(blas.All, m-k, n-l, 0, 0, a[k*lda:], lda)
+	}
+
+	if n-l > k {
+		// RQ factorization of [ T11 T12 ] = [ 0 T12 ]*Z1.
+		impl.Dgerq2(k, n-l, a, lda, tau, work)
+
+		if wantq {
+			// Update Q[0:n, 0:n-l] := Q[0:n, 0:n-l]*Z1ᵀ. NOTE(review): Z1 comes from Dgerq2 (RQ) but is applied with Dorm2r, not Dormr2 as for Z above — confirm.
+			impl.Dorm2r(blas.Right, blas.Trans, n, n-l, k, a, lda, tau[:k], q, ldq, work)
+		}
+
+		// Clean up A.
+		impl.Dlaset(blas.All, k, n-l-k, 0, 0, a, lda)
+		for i := 1; i < k; i++ { // Zero the entries left of the diagonal of the k×k block starting at column n-k-l.
+			r := a[i*lda+n-k-l : i*lda+i+n-k-l]
+			for j := range r {
+				r[j] = 0 // Write through the row slice r, not a, so the entries of row i are the ones cleared.
+			}
+		}
+	}
+
+	if m > k {
+		// QR factorization of A[k:m, n-l:n].
+		impl.Dgeqr2(m-k, l, a[k*lda+n-l:], lda, tau[:min(m-k, l)], work)
+		if wantu {
+			// Update U[:, k:m) := U[:, k:m]*U1.
+			impl.Dorm2r(blas.Right, blas.NoTrans, m, m-k, min(m-k, l), a[k*lda+n-l:], lda, tau[:min(m-k, l)], u[k:], ldu, work)
+		}
+
+		// Clean up A.
+		for i := k + 1; i < m; i++ { // Zero the entries below the diagonal of A[k:m, n-l:n].
+			r := a[i*lda+n-l : i*lda+min(n-l+i-k, n)]
+			for j := range r {
+				r[j] = 0
+			}
+		}
+	}
+
+	work[0] = float64(lwkopt)
+	return k, l
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dgtsv.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dgtsv.go
new file mode 100644
index 00000000000..944af1a6070
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dgtsv.go
@@ -0,0 +1,101 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dgtsv solves the equation
+//
+//	A * X = B
+//
+// where A is an n×n tridiagonal matrix. It uses Gaussian elimination with
+// partial pivoting. The equation Aᵀ * X = B may be solved by swapping the
+// arguments for du and dl.
+//
+// On entry, dl, d and du contain the sub-diagonal, the diagonal and the
+// super-diagonal, respectively, of A. On return, the first n-2 elements of dl,
+// the first n-1 elements of du and the first n elements of d may be
+// overwritten.
+//
+// On entry, b contains the n×nrhs right-hand side matrix B. On return, b will
+// be overwritten. If ok is true, it will be overwritten by the solution matrix X.
+//
+// Dgtsv returns whether the solution X has been successfully computed.
+func (impl Implementation) Dgtsv(n, nrhs int, dl, d, du []float64, b []float64, ldb int) (ok bool) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	if n == 0 || nrhs == 0 { // Nothing to solve; slice lengths are not checked in this case.
+		return true
+	}
+
+	switch {
+	case len(dl) < n-1:
+		panic(shortDL)
+	case len(d) < n:
+		panic(shortD)
+	case len(du) < n-1:
+		panic(shortDU)
+	case len(b) < (n-1)*ldb+nrhs:
+		panic(shortB)
+	}
+
+	dl = dl[:n-1] // Trim the three diagonals to the lengths actually used.
+	d = d[:n]
+	du = du[:n-1]
+
+	for i := 0; i < n-1; i++ { // Forward elimination over rows 0 through n-2.
+		if math.Abs(d[i]) >= math.Abs(dl[i]) {
+			// No row interchange required.
+			if d[i] == 0 { // Zero pivot, and by the test above dl[i] is zero too: give up.
+				return false
+			}
+			fact := dl[i] / d[i]
+			d[i+1] -= fact * du[i]
+			for j := 0; j < nrhs; j++ {
+				b[(i+1)*ldb+j] -= fact * b[i*ldb+j]
+			}
+			dl[i] = 0 // dl[i] is read in the back solve as the coefficient of row i+2; none arises here.
+		} else {
+			// Interchange rows i and i+1.
+			fact := d[i] / dl[i]
+			d[i] = dl[i]
+			tmp := d[i+1]
+			d[i+1] = du[i] - fact*tmp
+			du[i] = tmp
+			if i+1 < n-1 { // Row i+1 has a superdiagonal entry only when it is not the last row.
+				dl[i] = du[i+1] // dl[i] is reused for the entry two columns right of the diagonal in row i.
+				du[i+1] = -fact * dl[i]
+			}
+			for j := 0; j < nrhs; j++ {
+				tmp = b[i*ldb+j]
+				b[i*ldb+j] = b[(i+1)*ldb+j]
+				b[(i+1)*ldb+j] = tmp - fact*b[(i+1)*ldb+j]
+			}
+		}
+	}
+	if d[n-1] == 0 { // The loop stops at n-2, so the last pivot is checked here.
+		return false
+	}
+
+	// Back solve with the matrix U from the factorization.
+	for j := 0; j < nrhs; j++ { // One right-hand-side column at a time.
+		b[(n-1)*ldb+j] /= d[n-1]
+		if n > 1 { // Row n-2 has only a diagonal and one superdiagonal entry.
+			b[(n-2)*ldb+j] = (b[(n-2)*ldb+j] - du[n-2]*b[(n-1)*ldb+j]) / d[n-2]
+		}
+		for i := n - 3; i >= 0; i-- { // Remaining rows use du[i] for row i+1 and dl[i] for row i+2.
+			b[i*ldb+j] = (b[i*ldb+j] - du[i]*b[(i+1)*ldb+j] - dl[i]*b[(i+2)*ldb+j]) / d[i]
+		}
+	}
+
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dhseqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dhseqr.go
new file mode 100644
index 00000000000..beccf132b75
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dhseqr.go
@@ -0,0 +1,272 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dhseqr computes the eigenvalues of an n×n Hessenberg matrix H and,
+// optionally, the matrices T and Z from the Schur decomposition
+//
+//	H = Z T Zᵀ,
+//
+// where T is an n×n upper quasi-triangular matrix (the Schur form), and Z is
+// the n×n orthogonal matrix of Schur vectors.
+//
+// Optionally Z may be postmultiplied into an input orthogonal matrix Q so that
+// this routine can give the Schur factorization of a matrix A which has been
+// reduced to the Hessenberg form H by the orthogonal matrix Q:
+//
+//	A = Q H Qᵀ = (QZ) T (QZ)ᵀ.
+//
+// If job == lapack.EigenvaluesOnly, only the eigenvalues will be computed.
+// If job == lapack.EigenvaluesAndSchur, the eigenvalues and the Schur form T will
+// be computed.
+// For other values of job Dhseqr will panic.
+//
+// If compz == lapack.SchurNone, no Schur vectors will be computed and Z will not be
+// referenced.
+// If compz == lapack.SchurHess, on return Z will contain the matrix of Schur
+// vectors of H.
+// If compz == lapack.SchurOrig, on entry z is assumed to contain the orthogonal
+// matrix Q that is the identity except for the submatrix
+// Q[ilo:ihi+1,ilo:ihi+1]. On return z will be updated to the product Q*Z.
+//
+// ilo and ihi determine the block of H on which Dhseqr operates. It is assumed
+// that H is already upper triangular in rows and columns [0:ilo] and [ihi+1:n],
+// although it will be only checked that the block is isolated, that is,
+//
+//	ilo == 0   or H[ilo,ilo-1] == 0,
+//	ihi == n-1 or H[ihi+1,ihi] == 0,
+//
+// and Dhseqr will panic otherwise. ilo and ihi are typically set by a previous
+// call to Dgebal, otherwise they should be set to 0 and n-1, respectively. It
+// must hold that
+//
+//	0 <= ilo <= ihi < n     if n > 0,
+//	ilo == 0 and ihi == -1  if n == 0.
+//
+// wr and wi must have length at least n.
+//
+// work must have length at least lwork and lwork must be at least max(1,n)
+// otherwise Dhseqr will panic. The minimum lwork delivers very good and
+// sometimes optimal performance, although lwork as large as 11*n may be
+// required. On return, work[0] will contain the optimal value of lwork.
+//
+// If lwork is -1, instead of performing Dhseqr, the function only estimates the
+// optimal workspace size and stores it into work[0]. Neither h nor z are
+// accessed.
+//
+// unconverged indicates whether Dhseqr computed all the eigenvalues.
+//
+// If unconverged == 0, all the eigenvalues have been computed and their real
+// and imaginary parts will be stored on return in wr and wi, respectively. If
+// two eigenvalues are computed as a complex conjugate pair, they are stored in
+// consecutive elements of wr and wi, say the i-th and (i+1)th, with wi[i] > 0
+// and wi[i+1] < 0.
+//
+// If unconverged == 0 and job == lapack.EigenvaluesAndSchur, on return H will
+// contain the upper quasi-triangular matrix T from the Schur decomposition (the
+// Schur form). 2×2 diagonal blocks (corresponding to complex conjugate pairs of
+// eigenvalues) will be returned in standard form, with
+//
+//	H[i,i] == H[i+1,i+1],
+//
+// and
+//
+//	H[i+1,i]*H[i,i+1] < 0.
+//
+// The eigenvalues will be stored in wr and wi in the same order as on the
+// diagonal of the Schur form returned in H, with
+//
+//	wr[i] = H[i,i],
+//
+// and, if H[i:i+2,i:i+2] is a 2×2 diagonal block,
+//
+//	wi[i]   = sqrt(-H[i+1,i]*H[i,i+1]),
+//	wi[i+1] = -wi[i].
+//
+// If unconverged == 0 and job == lapack.EigenvaluesOnly, the contents of h
+// on return are unspecified.
+//
+// If unconverged > 0, some eigenvalues have not converged, and the blocks
+// [0:ilo] and [unconverged:n] of wr and wi will contain those eigenvalues which
+// have been successfully computed. Failures are rare.
+//
+// If unconverged > 0 and job == lapack.EigenvaluesOnly, on return the
+// remaining unconverged eigenvalues are the eigenvalues of the upper Hessenberg
+// matrix H[ilo:unconverged,ilo:unconverged].
+//
+// If unconverged > 0 and job == lapack.EigenvaluesAndSchur, then on
+// return
+//
+//	(initial H) U = U (final H),   (*)
+//
+// where U is an orthogonal matrix. The final H is upper Hessenberg and
+// H[unconverged:ihi+1,unconverged:ihi+1] is upper quasi-triangular.
+//
+// If unconverged > 0 and compz == lapack.SchurOrig, then on return
+//
+//	(final Z) = (initial Z) U,
+//
+// where U is the orthogonal matrix in (*) regardless of the value of job.
+//
+// If unconverged > 0 and compz == lapack.SchurHess, then on return
+//
+//	(final Z) = U,
+//
+// where U is the orthogonal matrix in (*) regardless of the value of job.
+//
+// References:
+//
+//	[1] R. Byers. LAPACK 3.1 xHSEQR: Tuning and Implementation Notes on the
+//	    Small Bulge Multi-Shift QR Algorithm with Aggressive Early Deflation.
+//	    LAPACK Working Note 187 (2007)
+//	    URL: http://www.netlib.org/lapack/lawnspdf/lawn187.pdf
+//	[2] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part I:
+//	    Maintaining Well-Focused Shifts and Level 3 Performance. SIAM J. Matrix
+//	    Anal. Appl. 23(4) (2002), pp. 929—947
+//	    URL: http://dx.doi.org/10.1137/S0895479801384573
+//	[3] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part II:
+//	    Aggressive Early Deflation. SIAM J. Matrix Anal. Appl. 23(4) (2002), pp. 948—973
+//	    URL: http://dx.doi.org/10.1137/S0895479801384585
+//
+// Dhseqr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dhseqr(job lapack.SchurJob, compz lapack.SchurComp, n, ilo, ihi int, h []float64, ldh int, wr, wi []float64, z []float64, ldz int, work []float64, lwork int) (unconverged int) {
+	wantt := job == lapack.EigenvaluesAndSchur
+	wantz := compz == lapack.SchurHess || compz == lapack.SchurOrig
+
+	switch {
+	case job != lapack.EigenvaluesOnly && job != lapack.EigenvaluesAndSchur:
+		panic(badSchurJob)
+	case compz != lapack.SchurNone && compz != lapack.SchurHess && compz != lapack.SchurOrig:
+		panic(badSchurComp)
+	case n < 0:
+		panic(nLT0)
+	case ilo < 0 || max(0, n-1) < ilo:
+		panic(badIlo)
+	case ihi < min(ilo, n-1) || n <= ihi:
+		panic(badIhi)
+	case ldh < max(1, n):
+		panic(badLdH)
+	case ldz < 1, wantz && ldz < n:
+		panic(badLdZ)
+	case lwork < max(1, n) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return for an empty matrix.
+	if n == 0 {
+		work[0] = 1
+		return 0
+	}
+
+	// Quick return in case of a workspace query.
+	if lwork == -1 {
+		impl.Dlaqr04(wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo, ihi, z, ldz, work, -1, 1)
+		work[0] = math.Max(float64(n), work[0])
+		return 0
+	}
+
+	switch {
+	case len(h) < (n-1)*ldh+n:
+		panic(shortH)
+	case wantz && len(z) < (n-1)*ldz+n:
+		panic(shortZ)
+	case len(wr) < n:
+		panic(shortWr)
+	case len(wi) < n:
+		panic(shortWi)
+	}
+
+	const (
+		// Matrices of order ntiny or smaller must be processed by
+		// Dlahqr because of insufficient subdiagonal scratch space.
+		// This is a hard limit.
+		ntiny = 15
+
+		// nl is the size of a local workspace to help small matrices
+		// through a rare Dlahqr failure. nl > ntiny is required and
+		// nl <= nmin = Ilaenv(ispec=12,...) is recommended (the default
+		// value of nmin is 75). Using nl = 49 allows up to six
+		// simultaneous shifts and a 16×16 deflation window.
+		nl = 49
+	)
+
+	// Copy eigenvalues isolated by Dgebal.
+	for i := 0; i < ilo; i++ {
+		wr[i] = h[i*ldh+i]
+		wi[i] = 0
+	}
+	for i := ihi + 1; i < n; i++ {
+		wr[i] = h[i*ldh+i]
+		wi[i] = 0
+	}
+
+	// Initialize Z to identity matrix if requested.
+	if compz == lapack.SchurHess {
+		impl.Dlaset(blas.All, n, n, 0, 1, z, ldz)
+	}
+
+	// Quick return if the active block is 1×1.
+	if ilo == ihi {
+		wr[ilo] = h[ilo*ldh+ilo]
+		wi[ilo] = 0
+		return 0
+	}
+
+	// Dlahqr/Dlaqr04 crossover point.
+	nmin := impl.Ilaenv(12, "DHSEQR", string(job)+string(compz), n, ilo, ihi, lwork)
+	nmin = max(ntiny, nmin)
+
+	if n > nmin {
+		// Dlaqr04 for big matrices.
+		unconverged = impl.Dlaqr04(wantt, wantz, n, ilo, ihi, h, ldh, wr[:ihi+1], wi[:ihi+1],
+			ilo, ihi, z, ldz, work, lwork, 1)
+	} else {
+		// Dlahqr for small matrices.
+		unconverged = impl.Dlahqr(wantt, wantz, n, ilo, ihi, h, ldh, wr[:ihi+1], wi[:ihi+1],
+			ilo, ihi, z, ldz)
+		if unconverged > 0 {
+			// A rare Dlahqr failure! Dlaqr04 sometimes succeeds
+			// when Dlahqr fails.
+			kbot := unconverged
+			if n >= nl {
+				// Larger matrices have enough subdiagonal
+				// scratch space to call Dlaqr04 directly.
+				unconverged = impl.Dlaqr04(wantt, wantz, n, ilo, kbot, h, ldh,
+					wr[:ihi+1], wi[:ihi+1], ilo, ihi, z, ldz, work, lwork, 1)
+			} else {
+				// Tiny matrices don't have enough subdiagonal
+				// scratch space to benefit from Dlaqr04. Hence,
+				// tiny matrices must be copied into a larger
+				// array before calling Dlaqr04.
+				var hl [nl * nl]float64
+				impl.Dlacpy(blas.All, n, n, h, ldh, hl[:], nl)
+				impl.Dlaset(blas.All, nl, nl-n, 0, 0, hl[n:], nl)
+				var workl [nl]float64
+				unconverged = impl.Dlaqr04(wantt, wantz, nl, ilo, kbot, hl[:], nl,
+					wr[:ihi+1], wi[:ihi+1], ilo, ihi, z, ldz, workl[:], nl, 1)
+				work[0] = workl[0]
+				if wantt || unconverged > 0 {
+					impl.Dlacpy(blas.All, n, n, hl[:], nl, h, ldh)
+				}
+			}
+		}
+	}
+	// Zero out under the first subdiagonal, if necessary.
+	if (wantt || unconverged > 0) && n > 2 {
+		impl.Dlaset(blas.Lower, n-2, n-2, 0, 0, h[2*ldh:], ldh)
+	}
+
+	work[0] = math.Max(float64(n), work[0])
+	return unconverged
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlabrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlabrd.go
new file mode 100644
index 00000000000..396242cc2d4
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlabrd.go
@@ -0,0 +1,183 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlabrd reduces the first nb rows and columns of a real general m×n matrix
+// A to upper or lower bidiagonal form by an orthogonal transformation
+//
+//	Qᵀ * A * P
+//
+// If m >= n, A is reduced to upper bidiagonal form and upon exit the elements
+// on and below the diagonal in the first nb columns represent the elementary
+// reflectors, and the elements above the diagonal in the first nb rows represent
+// the matrix P. If m < n, A is reduced to lower bidiagonal form and the matrix
+// P is instead stored above the diagonal.
+//
+// The reduction to bidiagonal form is stored in d and e, where d are the diagonal
+// elements, and e are the off-diagonal elements.
+//
+// The matrices Q and P are products of elementary reflectors
+//
+//	Q = H_0 * H_1 * ... * H_{nb-1}
+//	P = G_0 * G_1 * ... * G_{nb-1}
+//
+// where
+//
+//	H_i = I - tauQ[i] * v_i * v_iᵀ
+//	G_i = I - tauP[i] * u_i * u_iᵀ
+//
+// As an example, on exit the entries of A when m = 6, n = 5, and nb = 2
+//
+//	[ 1   1  u1  u1  u1]
+//	[v1   1   1  u2  u2]
+//	[v1  v2   a   a   a]
+//	[v1  v2   a   a   a]
+//	[v1  v2   a   a   a]
+//	[v1  v2   a   a   a]
+//
+// and when m = 5, n = 6, and nb = 2
+//
+//	[ 1  u1  u1  u1  u1  u1]
+//	[ 1   1  u2  u2  u2  u2]
+//	[v1   1   a   a   a   a]
+//	[v1  v2   a   a   a   a]
+//	[v1  v2   a   a   a   a]
+//
+// Dlabrd also returns the matrices X and Y which are used with U and V to
+// apply the transformation to the unreduced part of the matrix
+//
+//	A := A - V*Yᵀ - X*Uᵀ
+//
+// X and Y are stored in x and y with leading dimensions ldx and ldy,
+// respectively.
+//
+// X is an m×nb matrix, Y is an n×nb matrix. d, e, tauQ, and tauP must all have
+// length at least nb. Dlabrd will panic if these size constraints are violated.
+//
+// Dlabrd is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlabrd(m, n, nb int, a []float64, lda int, d, e, tauQ, tauP, x []float64, ldx int, y []float64, ldy int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case nb < 0:
+		panic(nbLT0)
+	case nb > n:
+		panic(nbGTN)
+	case nb > m:
+		panic(nbGTM)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldx < max(1, nb):
+		panic(badLdX)
+	case ldy < max(1, nb):
+		panic(badLdY)
+	}
+
+	if m == 0 || n == 0 || nb == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(d) < nb:
+		panic(shortD)
+	case len(e) < nb:
+		panic(shortE)
+	case len(tauQ) < nb:
+		panic(shortTauQ)
+	case len(tauP) < nb:
+		panic(shortTauP)
+	case len(x) < (m-1)*ldx+nb:
+		panic(shortX)
+	case len(y) < (n-1)*ldy+nb:
+		panic(shortY)
+	}
+
+	bi := blas64.Implementation()
+
+	if m >= n {
+		// Reduce to upper bidiagonal form.
+		for i := 0; i < nb; i++ {
+			bi.Dgemv(blas.NoTrans, m-i, i, -1, a[i*lda:], lda, y[i*ldy:], 1, 1, a[i*lda+i:], lda) // Update A[i:m, i].
+			bi.Dgemv(blas.NoTrans, m-i, i, -1, x[i*ldx:], ldx, a[i:], lda, 1, a[i*lda+i:], lda)
+
+			a[i*lda+i], tauQ[i] = impl.Dlarfg(m-i, a[i*lda+i], a[min(i+1, m-1)*lda+i:], lda) // Generate reflection Q[i] to annihilate A[i+1:m, i].
+			d[i] = a[i*lda+i]
+			if i < n-1 {
+				// Compute Y[i+1:n, i].
+				a[i*lda+i] = 1
+				bi.Dgemv(blas.Trans, m-i, n-i-1, 1, a[i*lda+i+1:], lda, a[i*lda+i:], lda, 0, y[(i+1)*ldy+i:], ldy)
+				bi.Dgemv(blas.Trans, m-i, i, 1, a[i*lda:], lda, a[i*lda+i:], lda, 0, y[i:], ldy)
+				bi.Dgemv(blas.NoTrans, n-i-1, i, -1, y[(i+1)*ldy:], ldy, y[i:], ldy, 1, y[(i+1)*ldy+i:], ldy)
+				bi.Dgemv(blas.Trans, m-i, i, 1, x[i*ldx:], ldx, a[i*lda+i:], lda, 0, y[i:], ldy)
+				bi.Dgemv(blas.Trans, i, n-i-1, -1, a[i+1:], lda, y[i:], ldy, 1, y[(i+1)*ldy+i:], ldy)
+				bi.Dscal(n-i-1, tauQ[i], y[(i+1)*ldy+i:], ldy)
+
+				// Update A[i, i+1:n].
+				bi.Dgemv(blas.NoTrans, n-i-1, i+1, -1, y[(i+1)*ldy:], ldy, a[i*lda:], 1, 1, a[i*lda+i+1:], 1)
+				bi.Dgemv(blas.Trans, i, n-i-1, -1, a[i+1:], lda, x[i*ldx:], 1, 1, a[i*lda+i+1:], 1)
+
+				// Generate reflection P[i] to annihilate A[i, i+2:n].
+				a[i*lda+i+1], tauP[i] = impl.Dlarfg(n-i-1, a[i*lda+i+1], a[i*lda+min(i+2, n-1):], 1)
+				e[i] = a[i*lda+i+1]
+				a[i*lda+i+1] = 1
+
+				// Compute X[i+1:m, i].
+				bi.Dgemv(blas.NoTrans, m-i-1, n-i-1, 1, a[(i+1)*lda+i+1:], lda, a[i*lda+i+1:], 1, 0, x[(i+1)*ldx+i:], ldx)
+				bi.Dgemv(blas.Trans, n-i-1, i+1, 1, y[(i+1)*ldy:], ldy, a[i*lda+i+1:], 1, 0, x[i:], ldx)
+				bi.Dgemv(blas.NoTrans, m-i-1, i+1, -1, a[(i+1)*lda:], lda, x[i:], ldx, 1, x[(i+1)*ldx+i:], ldx)
+				bi.Dgemv(blas.NoTrans, i, n-i-1, 1, a[i+1:], lda, a[i*lda+i+1:], 1, 0, x[i:], ldx)
+				bi.Dgemv(blas.NoTrans, m-i-1, i, -1, x[(i+1)*ldx:], ldx, x[i:], ldx, 1, x[(i+1)*ldx+i:], ldx)
+				bi.Dscal(m-i-1, tauP[i], x[(i+1)*ldx+i:], ldx)
+			}
+		}
+		return
+	}
+	// Reduce to lower bidiagonal form.
+	for i := 0; i < nb; i++ {
+		// Update A[i, i:n].
+		bi.Dgemv(blas.NoTrans, n-i, i, -1, y[i*ldy:], ldy, a[i*lda:], 1, 1, a[i*lda+i:], 1)
+		bi.Dgemv(blas.Trans, i, n-i, -1, a[i:], lda, x[i*ldx:], 1, 1, a[i*lda+i:], 1)
+
+		// Generate reflection P[i] to annihilate A[i, i+1:n].
+		a[i*lda+i], tauP[i] = impl.Dlarfg(n-i, a[i*lda+i], a[i*lda+min(i+1, n-1):], 1)
+		d[i] = a[i*lda+i]
+		if i < m-1 {
+			a[i*lda+i] = 1
+			// Compute X[i+1:m, i].
+			bi.Dgemv(blas.NoTrans, m-i-1, n-i, 1, a[(i+1)*lda+i:], lda, a[i*lda+i:], 1, 0, x[(i+1)*ldx+i:], ldx)
+			bi.Dgemv(blas.Trans, n-i, i, 1, y[i*ldy:], ldy, a[i*lda+i:], 1, 0, x[i:], ldx)
+			bi.Dgemv(blas.NoTrans, m-i-1, i, -1, a[(i+1)*lda:], lda, x[i:], ldx, 1, x[(i+1)*ldx+i:], ldx)
+			bi.Dgemv(blas.NoTrans, i, n-i, 1, a[i:], lda, a[i*lda+i:], 1, 0, x[i:], ldx)
+			bi.Dgemv(blas.NoTrans, m-i-1, i, -1, x[(i+1)*ldx:], ldx, x[i:], ldx, 1, x[(i+1)*ldx+i:], ldx)
+			bi.Dscal(m-i-1, tauP[i], x[(i+1)*ldx+i:], ldx)
+
+			// Update A[i+1:m, i].
+			bi.Dgemv(blas.NoTrans, m-i-1, i, -1, a[(i+1)*lda:], lda, y[i*ldy:], 1, 1, a[(i+1)*lda+i:], lda)
+			bi.Dgemv(blas.NoTrans, m-i-1, i+1, -1, x[(i+1)*ldx:], ldx, a[i:], lda, 1, a[(i+1)*lda+i:], lda)
+
+			// Generate reflection Q[i] to annihilate A[i+2:m, i].
+			a[(i+1)*lda+i], tauQ[i] = impl.Dlarfg(m-i-1, a[(i+1)*lda+i], a[min(i+2, m-1)*lda+i:], lda)
+			e[i] = a[(i+1)*lda+i]
+			a[(i+1)*lda+i] = 1
+
+			// Compute Y[i+1:n, i].
+			bi.Dgemv(blas.Trans, m-i-1, n-i-1, 1, a[(i+1)*lda+i+1:], lda, a[(i+1)*lda+i:], lda, 0, y[(i+1)*ldy+i:], ldy)
+			bi.Dgemv(blas.Trans, m-i-1, i, 1, a[(i+1)*lda:], lda, a[(i+1)*lda+i:], lda, 0, y[i:], ldy)
+			bi.Dgemv(blas.NoTrans, n-i-1, i, -1, y[(i+1)*ldy:], ldy, y[i:], ldy, 1, y[(i+1)*ldy+i:], ldy)
+			bi.Dgemv(blas.Trans, m-i-1, i+1, 1, x[(i+1)*ldx:], ldx, a[(i+1)*lda+i:], lda, 0, y[i:], ldy)
+			bi.Dgemv(blas.Trans, i+1, n-i-1, -1, a[i+1:], lda, y[i:], ldy, 1, y[(i+1)*ldy+i:], ldy)
+			bi.Dscal(n-i-1, tauQ[i], y[(i+1)*ldy+i:], ldy)
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlacn2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlacn2.go
new file mode 100644
index 00000000000..cd6cf719d59
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlacn2.go
@@ -0,0 +1,136 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlacn2 estimates the 1-norm of an n×n matrix A using sequential updates with
+// matrix-vector products provided externally.
+//
+// Dlacn2 is called sequentially and it returns the value of est and kase to be
+// used on the next call.
+// On the initial call, kase must be 0.
+// In between calls, x must be overwritten by
+//
+//	A * X    if kase was returned as 1,
+//	Aᵀ * X   if kase was returned as 2,
+//
+// and all other parameters must not be changed.
+// On the final return, kase is returned as 0, v contains A*W where W is a
+// vector, and est = norm(V)/norm(W) is a lower bound for 1-norm of A.
+//
+// v, x, and isgn must all have length at least n and n must be at least 1,
+// otherwise Dlacn2 will panic. isave holds the state carried between calls.
+//
+// Dlacn2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlacn2(n int, v, x []float64, isgn []int, est float64, kase int, isave *[3]int) (float64, int) {
+	switch {
+	case n < 1:
+		panic(nLT1)
+	case len(v) < n:
+		panic(shortV)
+	case len(x) < n:
+		panic(shortX)
+	case len(isgn) < n:
+		panic(shortIsgn)
+	case isave[0] < 0 || 5 < isave[0]:
+		panic(badIsave)
+	case isave[0] == 0 && kase != 0:
+		panic(badIsave)
+	}
+
+	const itmax = 5
+	bi := blas64.Implementation()
+
+	if kase == 0 { // Initial call: start from the vector with all elements 1/n.
+		for i := 0; i < n; i++ {
+			x[i] = 1 / float64(n)
+		}
+		kase = 1
+		isave[0] = 1
+		return est, kase
+	}
+	switch isave[0] {
+	case 1: // Resumed after the initial call; kase was returned as 1.
+		if n == 1 {
+			v[0] = x[0]
+			est = math.Abs(v[0])
+			kase = 0
+			return est, kase
+		}
+		est = bi.Dasum(n, x, 1)
+		for i := 0; i < n; i++ {
+			x[i] = math.Copysign(1, x[i])
+			isgn[i] = int(x[i])
+		}
+		kase = 2
+		isave[0] = 2
+		return est, kase
+	case 2: // Resumed after kase was returned as 2; restart from a unit vector.
+		isave[1] = bi.Idamax(n, x, 1)
+		isave[2] = 2
+		for i := 0; i < n; i++ {
+			x[i] = 0
+		}
+		x[isave[1]] = 1
+		kase = 1
+		isave[0] = 3
+		return est, kase
+	case 3: // Resumed after kase was returned as 1; update v and est.
+		bi.Dcopy(n, x, 1, v, 1)
+		estold := est
+		est = bi.Dasum(n, v, 1)
+		sameSigns := true
+		for i := 0; i < n; i++ {
+			if int(math.Copysign(1, x[i])) != isgn[i] {
+				sameSigns = false
+				break
+			}
+		}
+		if !sameSigns && est > estold {
+			for i := 0; i < n; i++ {
+				x[i] = math.Copysign(1, x[i])
+				isgn[i] = int(x[i])
+			}
+			kase = 2
+			isave[0] = 4
+			return est, kase
+		}
+	case 4: // Resumed after kase was returned as 2; isave[2] counts the iterations.
+		jlast := isave[1]
+		isave[1] = bi.Idamax(n, x, 1)
+		if x[jlast] != math.Abs(x[isave[1]]) && isave[2] < itmax {
+			isave[2] += 1
+			for i := 0; i < n; i++ {
+				x[i] = 0
+			}
+			x[isave[1]] = 1
+			kase = 1
+			isave[0] = 3
+			return est, kase
+		}
+	case 5: // Resumed after the final stage; keep the larger of the two estimates.
+		tmp := 2 * (bi.Dasum(n, x, 1)) / float64(3*n)
+		if tmp > est {
+			bi.Dcopy(n, x, 1, v, 1)
+			est = tmp
+		}
+		kase = 0
+		return est, kase
+	}
+	// Iteration complete. Final stage: try a vector with alternating signs.
+	altsgn := 1.0
+	for i := 0; i < n; i++ {
+		x[i] = altsgn * (1 + float64(i)/float64(n-1))
+		altsgn *= -1
+	}
+	kase = 1
+	isave[0] = 5
+	return est, kase
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlacpy.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlacpy.go
new file mode 100644
index 00000000000..793bb8c7caf
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlacpy.go
@@ -0,0 +1,59 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dlacpy copies the elements of the m×n matrix A specified by uplo into B. uplo
+// can specify a triangular portion with blas.Upper or blas.Lower, or all of the
+// elements with blas.All. For any other value of uplo Dlacpy will panic.
+//
+// Dlacpy is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlacpy(uplo blas.Uplo, m, n int, a []float64, lda int, b []float64, ldb int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower && uplo != blas.All:
+		panic(badUplo)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	}
+
+	if m == 0 || n == 0 { // Nothing to copy.
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(b) < (m-1)*ldb+n:
+		panic(shortB)
+	}
+
+	switch uplo {
+	case blas.Upper:
+		for i := 0; i < m; i++ {
+			for j := i; j < n; j++ { // Elements on and above the diagonal.
+				b[i*ldb+j] = a[i*lda+j]
+			}
+		}
+	case blas.Lower:
+		for i := 0; i < m; i++ {
+			for j := 0; j < min(i+1, n); j++ { // Elements on and below the diagonal.
+				b[i*ldb+j] = a[i*lda+j]
+			}
+		}
+	case blas.All:
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				b[i*ldb+j] = a[i*lda+j]
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlae2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlae2.go
new file mode 100644
index 00000000000..2eda3a18fec
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlae2.go
@@ -0,0 +1,51 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlae2 computes the eigenvalues of a 2×2 symmetric matrix
+//
+//	[a b]
+//	[b c]
+//
+// and returns the eigenvalue with the larger absolute value as rt1 and the
+// smaller as rt2.
+//
+// Dlae2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlae2(a, b, c float64) (rt1, rt2 float64) {
+	sm := a + c // Sum of the diagonal elements.
+	df := a - c
+	adf := math.Abs(df)
+	tb := b + b
+	ab := math.Abs(tb)
+	acmx := c
+	acmn := a
+	if math.Abs(a) > math.Abs(c) { // acmx is the larger of a and c in absolute value, acmn the other.
+		acmx = a
+		acmn = c
+	}
+	var rt float64 // rt = sqrt(adf² + ab²), evaluated by factoring out the larger of adf and ab.
+	if adf > ab {
+		rt = adf * math.Sqrt(1+(ab/adf)*(ab/adf))
+	} else if adf < ab {
+		rt = ab * math.Sqrt(1+(adf/ab)*(adf/ab))
+	} else {
+		rt = ab * math.Sqrt2
+	}
+	if sm < 0 {
+		rt1 = 0.5 * (sm - rt)
+		rt2 = (acmx/rt1)*acmn - (b/rt1)*b // Equals (a*c - b*b) / rt1 in exact arithmetic.
+		return rt1, rt2
+	}
+	if sm > 0 {
+		rt1 = 0.5 * (sm + rt)
+		rt2 = (acmx/rt1)*acmn - (b/rt1)*b
+		return rt1, rt2
+	}
+	rt1 = 0.5 * rt // sm is neither negative nor positive: the eigenvalues are ±rt/2.
+	rt2 = -0.5 * rt
+	return rt1, rt2
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaev2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaev2.go
new file mode 100644
index 00000000000..56923f51d3e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaev2.go
@@ -0,0 +1,85 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlaev2 computes the eigendecomposition of a symmetric 2×2 matrix.
+// The matrix is given by
+//
+//	[a b]
+//	[b c]
+//
+// Dlaev2 returns rt1 and rt2, the eigenvalues of the matrix where |rt1| >= |rt2|,
+// and [cs1, sn1] which is the unit right eigenvector for rt1.
+//
+//	[ cs1 sn1] [a b] [cs1 -sn1] = [rt1   0]
+//	[-sn1 cs1] [b c] [sn1  cs1]   [  0 rt2]
+//
+// Dlaev2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaev2(a, b, c float64) (rt1, rt2, cs1, sn1 float64) {
+	sm := a + c // Sum of the diagonal elements.
+	df := a - c
+	adf := math.Abs(df)
+	tb := b + b
+	ab := math.Abs(tb)
+	acmx := c
+	acmn := a
+	if math.Abs(a) > math.Abs(c) { // acmx is the larger of a and c in absolute value, acmn the other.
+		acmx = a
+		acmn = c
+	}
+	var rt float64 // rt = sqrt(adf² + ab²), evaluated by factoring out the larger of adf and ab.
+	if adf > ab {
+		rt = adf * math.Sqrt(1+(ab/adf)*(ab/adf))
+	} else if adf < ab {
+		rt = ab * math.Sqrt(1+(adf/ab)*(adf/ab))
+	} else {
+		rt = ab * math.Sqrt(2)
+	}
+	var sgn1 float64 // Sign applied to rt when forming rt1.
+	if sm < 0 {
+		rt1 = 0.5 * (sm - rt)
+		sgn1 = -1
+		rt2 = (acmx/rt1)*acmn - (b/rt1)*b // Equals (a*c - b*b) / rt1 in exact arithmetic.
+	} else if sm > 0 {
+		rt1 = 0.5 * (sm + rt)
+		sgn1 = 1
+		rt2 = (acmx/rt1)*acmn - (b/rt1)*b
+	} else {
+		rt1 = 0.5 * rt
+		rt2 = -0.5 * rt
+		sgn1 = 1
+	}
+	var cs, sgn2 float64 // cs = df ± rt with the sign of df, recorded in sgn2.
+	if df >= 0 {
+		cs = df + rt
+		sgn2 = 1
+	} else {
+		cs = df - rt
+		sgn2 = -1
+	}
+	acs := math.Abs(cs)
+	if acs > ab {
+		ct := -tb / cs
+		sn1 = 1 / math.Sqrt(1+ct*ct)
+		cs1 = ct * sn1
+	} else {
+		if ab == 0 { // Here acs <= ab, so cs is zero too; return the first unit vector.
+			cs1 = 1
+			sn1 = 0
+		} else {
+			tn := -cs / tb
+			cs1 = 1 / math.Sqrt(1+tn*tn)
+			sn1 = tn * cs1
+		}
+	}
+	if sgn1 == sgn2 { // Replace (cs1, sn1) by (-sn1, cs1).
+		tn := cs1
+		cs1 = -sn1
+		sn1 = tn
+	}
+	return rt1, rt2, cs1, sn1
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaexc.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaexc.go
new file mode 100644
index 00000000000..2b79bd8ae7f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaexc.go
@@ -0,0 +1,269 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlaexc swaps two adjacent diagonal blocks of order 1 or 2 in an n×n upper
+// quasi-triangular matrix T by an orthogonal similarity transformation.
+//
+// T must be in Schur canonical form, that is, block upper triangular with 1×1
+// and 2×2 diagonal blocks; each 2×2 diagonal block has its diagonal elements
+// equal and its off-diagonal elements of opposite sign. On return, T will
+// contain the updated matrix again in Schur canonical form.
+//
+// If wantq is true, the transformation is accumulated in the n×n matrix Q with
+// ldq >= max(1,n), otherwise Q is not referenced and ldq is not checked.
+//
+// j1 is the index of the first row of the first block. n1 and n2 are the order
+// of the first and second block, respectively.
+//
+// work must have length at least n, otherwise Dlaexc will panic.
+//
+// If ok is false, the transformed matrix T would be too far from Schur form.
+// The blocks are not swapped, and T and Q are not modified.
+//
+// If n1 and n2 are both equal to 1, Dlaexc will always return true.
+//
+// Dlaexc is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaexc(wantq bool, n int, t []float64, ldt int, q []float64, ldq int, j1, n1, n2 int, work []float64) (ok bool) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case ldt < max(1, n):
+		panic(badLdT)
+	case wantq && ldq < max(1, n):
+		panic(badLdQ)
+	case j1 < 0 || n <= j1:
+		panic(badJ1)
+	case len(work) < n:
+		panic(shortWork)
+	case n1 < 0 || 2 < n1:
+		panic(badN1)
+	case n2 < 0 || 2 < n2:
+		panic(badN2)
+	}
+
+	if n == 0 || n1 == 0 || n2 == 0 {
+		return true
+	}
+
+	switch {
+	case len(t) < (n-1)*ldt+n:
+		panic(shortT)
+	case wantq && len(q) < (n-1)*ldq+n:
+		panic(shortQ)
+	}
+
+	if j1+n1 >= n {
+		// TODO(vladimir-ch): Reference LAPACK does this check whether
+		// the start of the second block is in the matrix T. It returns
+		// true if it is not and moreover it does not check whether the
+		// whole second block fits into T. This does not feel
+		// satisfactory. The only caller of Dlaexc is Dtrexc, so if the
+		// caller makes sure that this does not happen, we could be
+		// stricter here.
+		return true
+	}
+
+	j2 := j1 + 1
+	j3 := j1 + 2
+
+	bi := blas64.Implementation()
+
+	if n1 == 1 && n2 == 1 {
+		// Swap two 1×1 blocks.
+		t11 := t[j1*ldt+j1]
+		t22 := t[j2*ldt+j2]
+
+		// Determine the transformation to perform the interchange.
+		cs, sn, _ := impl.Dlartg(t[j1*ldt+j2], t22-t11)
+
+		// Apply transformation to the matrix T.
+		if n-j3 > 0 {
+			bi.Drot(n-j3, t[j1*ldt+j3:], 1, t[j2*ldt+j3:], 1, cs, sn)
+		}
+		if j1 > 0 {
+			bi.Drot(j1, t[j1:], ldt, t[j2:], ldt, cs, sn)
+		}
+
+		t[j1*ldt+j1] = t22
+		t[j2*ldt+j2] = t11
+
+		if wantq {
+			// Accumulate transformation in the matrix Q.
+			bi.Drot(n, q[j1:], ldq, q[j2:], ldq, cs, sn)
+		}
+
+		return true
+	}
+
+	// Swapping involves at least one 2×2 block.
+	//
+	// Copy the diagonal block of order n1+n2 to the local array d and
+	// compute its norm.
+	nd := n1 + n2
+	var d [16]float64
+	const ldd = 4
+	impl.Dlacpy(blas.All, nd, nd, t[j1*ldt+j1:], ldt, d[:], ldd)
+	dnorm := impl.Dlange(lapack.MaxAbs, nd, nd, d[:], ldd, work)
+
+	// Compute machine-dependent threshold for test for accepting swap.
+	eps := dlamchP
+	thresh := math.Max(10*eps*dnorm, dlamchS/eps)
+
+	// Solve T11*X - X*T22 = scale*T12 for X.
+	var x [4]float64
+	const ldx = 2
+	scale, _, _ := impl.Dlasy2(false, false, -1, n1, n2, d[:], ldd, d[n1*ldd+n1:], ldd, d[n1:], ldd, x[:], ldx)
+
+	// Swap the adjacent diagonal blocks.
+	switch {
+	case n1 == 1 && n2 == 2:
+		// Generate elementary reflector H so that
+		//  ( scale, X11, X12 ) H = ( 0, 0, * )
+		u := [3]float64{scale, x[0], 1}
+		_, tau := impl.Dlarfg(3, x[1], u[:2], 1)
+		t11 := t[j1*ldt+j1]
+
+		// Perform swap provisionally on diagonal block in d.
+		impl.Dlarfx(blas.Left, 3, 3, u[:], tau, d[:], ldd, work)
+		impl.Dlarfx(blas.Right, 3, 3, u[:], tau, d[:], ldd, work)
+
+		// Test whether to reject swap.
+		if math.Max(math.Abs(d[2*ldd]), math.Max(math.Abs(d[2*ldd+1]), math.Abs(d[2*ldd+2]-t11))) > thresh {
+			return false
+		}
+
+		// Accept swap: apply transformation to the entire matrix T.
+		impl.Dlarfx(blas.Left, 3, n-j1, u[:], tau, t[j1*ldt+j1:], ldt, work)
+		impl.Dlarfx(blas.Right, j2+1, 3, u[:], tau, t[j1:], ldt, work)
+
+		t[j3*ldt+j1] = 0
+		t[j3*ldt+j2] = 0
+		t[j3*ldt+j3] = t11
+
+		if wantq {
+			// Accumulate transformation in the matrix Q.
+			impl.Dlarfx(blas.Right, n, 3, u[:], tau, q[j1:], ldq, work)
+		}
+
+	case n1 == 2 && n2 == 1:
+		//  Generate elementary reflector H so that:
+		//   H (  -X11 ) = ( * )
+		//     (  -X21 ) = ( 0 )
+		//     ( scale ) = ( 0 )
+		u := [3]float64{1, -x[ldx], scale}
+		_, tau := impl.Dlarfg(3, -x[0], u[1:], 1)
+		t33 := t[j3*ldt+j3]
+
+		// Perform swap provisionally on diagonal block in D.
+		impl.Dlarfx(blas.Left, 3, 3, u[:], tau, d[:], ldd, work)
+		impl.Dlarfx(blas.Right, 3, 3, u[:], tau, d[:], ldd, work)
+
+		// Test whether to reject swap.
+		if math.Max(math.Abs(d[ldd]), math.Max(math.Abs(d[2*ldd]), math.Abs(d[0]-t33))) > thresh {
+			return false
+		}
+
+		// Accept swap: apply transformation to the entire matrix T.
+		impl.Dlarfx(blas.Right, j3+1, 3, u[:], tau, t[j1:], ldt, work)
+		impl.Dlarfx(blas.Left, 3, n-j1-1, u[:], tau, t[j1*ldt+j2:], ldt, work)
+
+		t[j1*ldt+j1] = t33
+		t[j2*ldt+j1] = 0
+		t[j3*ldt+j1] = 0
+
+		if wantq {
+			// Accumulate transformation in the matrix Q.
+			impl.Dlarfx(blas.Right, n, 3, u[:], tau, q[j1:], ldq, work)
+		}
+
+	default: // n1 == 2 && n2 == 2
+		// Generate elementary reflectors H_1 and H_2 so that:
+		//  H_2 H_1 (  -X11  -X12 ) = (  *  * )
+		//          (  -X21  -X22 )   (  0  * )
+		//          ( scale    0  )   (  0  0 )
+		//          (    0  scale )   (  0  0 )
+		u1 := [3]float64{1, -x[ldx], scale}
+		_, tau1 := impl.Dlarfg(3, -x[0], u1[1:], 1)
+
+		temp := -tau1 * (x[1] + u1[1]*x[ldx+1])
+		u2 := [3]float64{1, -temp * u1[2], scale}
+		_, tau2 := impl.Dlarfg(3, -temp*u1[1]-x[ldx+1], u2[1:], 1)
+
+		// Perform swap provisionally on diagonal block in D.
+		impl.Dlarfx(blas.Left, 3, 4, u1[:], tau1, d[:], ldd, work)
+		impl.Dlarfx(blas.Right, 4, 3, u1[:], tau1, d[:], ldd, work)
+		impl.Dlarfx(blas.Left, 3, 4, u2[:], tau2, d[ldd:], ldd, work)
+		impl.Dlarfx(blas.Right, 4, 3, u2[:], tau2, d[1:], ldd, work)
+
+		// Test whether to reject swap.
+		m1 := math.Max(math.Abs(d[2*ldd]), math.Abs(d[2*ldd+1]))
+		m2 := math.Max(math.Abs(d[3*ldd]), math.Abs(d[3*ldd+1]))
+		if math.Max(m1, m2) > thresh {
+			return false
+		}
+
+		// Accept swap: apply transformation to the entire matrix T.
+		j4 := j1 + 3
+		impl.Dlarfx(blas.Left, 3, n-j1, u1[:], tau1, t[j1*ldt+j1:], ldt, work)
+		impl.Dlarfx(blas.Right, j4+1, 3, u1[:], tau1, t[j1:], ldt, work)
+		impl.Dlarfx(blas.Left, 3, n-j1, u2[:], tau2, t[j2*ldt+j1:], ldt, work)
+		impl.Dlarfx(blas.Right, j4+1, 3, u2[:], tau2, t[j2:], ldt, work)
+
+		t[j3*ldt+j1] = 0
+		t[j3*ldt+j2] = 0
+		t[j4*ldt+j1] = 0
+		t[j4*ldt+j2] = 0
+
+		if wantq {
+			// Accumulate transformation in the matrix Q.
+			impl.Dlarfx(blas.Right, n, 3, u1[:], tau1, q[j1:], ldq, work)
+			impl.Dlarfx(blas.Right, n, 3, u2[:], tau2, q[j2:], ldq, work)
+		}
+	}
+
+	if n2 == 2 {
+		// Standardize new 2×2 block T11.
+		a, b := t[j1*ldt+j1], t[j1*ldt+j2]
+		c, d := t[j2*ldt+j1], t[j2*ldt+j2]
+		var cs, sn float64
+		t[j1*ldt+j1], t[j1*ldt+j2], t[j2*ldt+j1], t[j2*ldt+j2], _, _, _, _, cs, sn = impl.Dlanv2(a, b, c, d)
+		if n-j1-2 > 0 {
+			bi.Drot(n-j1-2, t[j1*ldt+j1+2:], 1, t[j2*ldt+j1+2:], 1, cs, sn)
+		}
+		if j1 > 0 {
+			bi.Drot(j1, t[j1:], ldt, t[j2:], ldt, cs, sn)
+		}
+		if wantq {
+			bi.Drot(n, q[j1:], ldq, q[j2:], ldq, cs, sn)
+		}
+	}
+	if n1 == 2 {
+		// Standardize new 2×2 block T22.
+		j3 := j1 + n2
+		j4 := j3 + 1
+		a, b := t[j3*ldt+j3], t[j3*ldt+j4]
+		c, d := t[j4*ldt+j3], t[j4*ldt+j4]
+		var cs, sn float64
+		t[j3*ldt+j3], t[j3*ldt+j4], t[j4*ldt+j3], t[j4*ldt+j4], _, _, _, _, cs, sn = impl.Dlanv2(a, b, c, d)
+		if n-j3-2 > 0 {
+			bi.Drot(n-j3-2, t[j3*ldt+j3+2:], 1, t[j4*ldt+j3+2:], 1, cs, sn)
+		}
+		bi.Drot(j3, t[j3:], ldt, t[j4:], ldt, cs, sn)
+		if wantq {
+			bi.Drot(n, q[j3:], ldq, q[j4:], ldq, cs, sn)
+		}
+	}
+
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlag2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlag2.go
new file mode 100644
index 00000000000..cd644b65bb5
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlag2.go
@@ -0,0 +1,237 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlag2 computes the eigenvalues of a 2×2 generalized eigenvalue problem
+//
+//	A - w*B
+//
+// where A and B are 2×2 row-major matrices with strides lda and ldb, and B is upper triangular (b[ldb] is not read).
+//
+// Dlag2 uses scaling as necessary to avoid over-/underflow. Scaling results in
+// a modified eigenvalue problem
+//
+//	s*A - w*B
+//
+// where s is a non-negative scaling factor chosen so that w, w*B, and s*A do
+// not overflow and, if possible, do not underflow, either.
+//
+// scale1 and scale2 are used to avoid over-/underflow in the eigenvalue
+// equation which defines the first and second eigenvalue respectively. Note
+// that scale1 and scale2 may be zero or less than the underflow threshold if
+// the corresponding exact eigenvalue is sufficiently large.
+//
+// If the eigenvalues are real, then:
+//   - wi is zero,
+//   - the eigenvalues are wr1/scale1 and wr2/scale2.
+//
+// If the eigenvalues are complex, then:
+//   - wi is non-negative,
+//   - the eigenvalues are (wr1 ± wi*i)/scale1,
+//   - wr1 = wr2,
+//   - scale1 = scale2.
+//
+// Dlag2 assumes that the one-norm of A and B is less than 1/dlamchS. Entries of
+// A less than sqrt(dlamchS)*norm(A) are subject to being treated as zero. The
+// diagonals of B should be at least sqrt(dlamchS) times the largest element of
+// B (in absolute value); if a diagonal is smaller than that, then
+// ±sqrt(dlamchS) will be used instead of that diagonal.
+//
+// Dlag2 is an internal routine. It is exported for testing purposes. It panics if lda < 2, ldb < 2, len(a) < lda+2 or len(b) < ldb+2.
+func (Implementation) Dlag2(a []float64, lda int, b []float64, ldb int) (scale1, scale2, wr1, wr2, wi float64) {
+	switch {
+	case lda < 2:
+		panic(badLdA)
+	case ldb < 2:
+		panic(badLdB)
+	case len(a) < lda+2:
+		panic(shortA)
+	case len(b) < ldb+2:
+		panic(shortB)
+	}
+
+	const (
+		safmin = dlamchS
+		safmax = 1 / safmin
+		fuzzy1 = 1 + 1e-5
+	)
+	rtmin := math.Sqrt(safmin)
+	rtmax := 1 / rtmin
+
+	// Scale A by the reciprocal of its one-norm (the norm is bounded below by safmin).
+	anorm := math.Max(math.Abs(a[0])+math.Abs(a[lda]),
+		math.Abs(a[1])+math.Abs(a[lda+1]))
+	anorm = math.Max(anorm, safmin)
+	ascale := 1 / anorm
+	a11 := ascale * a[0]
+	a21 := ascale * a[lda]
+	a12 := ascale * a[1]
+	a22 := ascale * a[lda+1]
+
+	// Perturb B if necessary to ensure non-singularity.
+	b11 := b[0]
+	b12 := b[1]
+	b22 := b[ldb+1]
+	bmin := rtmin * math.Max(math.Max(math.Abs(b11), math.Abs(b12)),
+		math.Max(math.Abs(b22), rtmin))
+	if math.Abs(b11) < bmin {
+		b11 = math.Copysign(bmin, b11)
+	}
+	if math.Abs(b22) < bmin {
+		b22 = math.Copysign(bmin, b22)
+	}
+
+	// Scale B so that its larger diagonal entry has unit magnitude; bnorm keeps the unscaled norm for c2 below.
+	bnorm := math.Max(math.Max(math.Abs(b11), math.Abs(b12)+math.Abs(b22)), safmin)
+	bsize := math.Max(math.Abs(b11), math.Abs(b22))
+	bscale := 1 / bsize
+	b11 *= bscale
+	b12 *= bscale
+	b22 *= bscale
+
+	// Compute larger eigenvalue by method described by C. van Loan.
+	var (
+		as12, abi22   float64
+		pp, qq, shift float64
+	)
+	binv11 := 1 / b11
+	binv22 := 1 / b22
+	s1 := a11 * binv11
+	s2 := a22 * binv22
+	// AS is A shifted by -shift*B, where shift is whichever of s1 and s2 is smaller in magnitude.
+	if math.Abs(s1) <= math.Abs(s2) {
+		shift = s1
+		as12 = a12 - shift*b12
+		as22 := a22 - shift*b22
+		ss := a21 * (binv11 * binv22)
+		abi22 = as22*binv22 - ss*b12
+		pp = 0.5 * abi22
+		qq = ss * as12
+	} else {
+		shift = s2
+		as12 = a12 - shift*b12
+		as11 := a11 - shift*b11
+		ss := a21 * (binv11 * binv22)
+		abi22 = -ss * b12
+		pp = 0.5 * (as11*binv11 + abi22)
+		qq = ss * as12
+	}
+	var discr, r float64 // discr is pp*pp + qq, possibly rescaled by safmin or safmax; r is sqrt(|pp*pp + qq|) with that rescaling undone.
+	if math.Abs(pp*rtmin) >= 1 {
+		tmp := rtmin * pp
+		discr = tmp*tmp + qq*safmin
+		r = math.Sqrt(math.Abs(discr)) * rtmax
+	} else {
+		pp2 := pp * pp
+		if pp2+math.Abs(qq) <= safmin {
+			tmp := rtmax * pp
+			discr = tmp*tmp + qq*safmax
+			r = math.Sqrt(math.Abs(discr)) * rtmin
+		} else {
+			discr = pp2 + qq
+			r = math.Sqrt(math.Abs(discr))
+		}
+	}
+
+	// TODO(vladimir-ch): Is the following comment from the reference needed in
+	// a Go implementation?
+	//
+	// Note: the test of r in the following `if` is to cover the case when discr
+	// is small and negative and is flushed to zero during the calculation of r.
+	// On machines which have a consistent flush-to-zero threshold and handle
+	// numbers above that threshold correctly, it would not be necessary.
+	if discr >= 0 || r == 0 {
+		sum := pp + math.Copysign(r, pp)
+		diff := pp - math.Copysign(r, pp)
+		wbig := shift + sum
+
+		// Compute smaller eigenvalue; when it is much smaller than wbig, recompute it as wdet/wbig instead of using shift+diff.
+		wsmall := shift + diff
+		if 0.5*math.Abs(wbig) > math.Max(math.Abs(wsmall), safmin) {
+			wdet := (a11*a22 - a12*a21) * (binv11 * binv22)
+			wsmall = wdet / wbig
+		}
+		// Choose (real) eigenvalue closest to 2,2 element of A*B^{-1} for wr1.
+		if pp > abi22 {
+			wr1 = math.Min(wbig, wsmall)
+			wr2 = math.Max(wbig, wsmall)
+		} else {
+			wr1 = math.Max(wbig, wsmall)
+			wr2 = math.Min(wbig, wsmall)
+		}
+	} else {
+		// Complex eigenvalues: both share the real part shift+pp, and wi is set to r, which is non-negative.
+		wr1 = shift + pp
+		wr2 = wr1
+		wi = r
+	}
+
+	// Further scaling to avoid underflow and overflow in computing
+	// scale1 and overflow in computing w*B.
+	//
+	// This scale factor (wscale) is bounded from above using c1 and c2,
+	// and from below using c3 and c4:
+	//  - c1 implements the condition s*A must never overflow.
+	//  - c2 implements the condition w*B must never overflow.
+	//  - c3, with c2, implement the condition that s*A - w*B must never overflow.
+	//  - c4 implements the condition s should not underflow.
+	//  - c5 implements the condition max(s,|w|) should be at least 2.
+	c1 := bsize * (safmin * math.Max(1, ascale))
+	c2 := safmin * math.Max(1, bnorm)
+	c3 := bsize * safmin
+	c4 := 1.0
+	c5 := 1.0
+	if ascale <= 1 || bsize <= 1 {
+		c5 = math.Min(1, ascale*bsize)
+		if ascale <= 1 && bsize <= 1 {
+			c4 = math.Min(1, (ascale/safmin)*bsize)
+		}
+	}
+
+	// Scale first eigenvalue (and, for a complex pair, the second one along with it).
+	wabs := math.Abs(wr1) + math.Abs(wi)
+	wsize := math.Max(math.Max(safmin, c1), math.Max(fuzzy1*(wabs*c2+c3),
+		math.Min(c4, 0.5*math.Max(wabs, c5))))
+	maxABsize := math.Max(ascale, bsize)
+	minABsize := math.Min(ascale, bsize)
+	if wsize != 1 {
+		wscale := 1 / wsize
+		if wsize > 1 {
+			scale1 = (maxABsize * wscale) * minABsize
+		} else {
+			scale1 = (minABsize * wscale) * maxABsize
+		}
+		wr1 *= wscale
+		if wi != 0 {
+			wi *= wscale
+			wr2 = wr1
+			scale2 = scale1
+		}
+	} else {
+		scale1 = ascale * bsize
+		scale2 = scale1
+	}
+
+	// Scale second eigenvalue if real; this overwrites any scale2 assigned above.
+	if wi == 0 {
+		wsize = math.Max(math.Max(safmin, c1), math.Max(fuzzy1*(math.Abs(wr2)*c2+c3),
+			math.Min(c4, 0.5*math.Max(math.Abs(wr2), c5))))
+		if wsize != 1 {
+			wscale := 1 / wsize
+			if wsize > 1 {
+				scale2 = (maxABsize * wscale) * minABsize
+			} else {
+				scale2 = (minABsize * wscale) * maxABsize
+			}
+			wr2 *= wscale
+		} else {
+			scale2 = ascale * bsize
+		}
+	}
+
+	return scale1, scale2, wr1, wr2, wi
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlags2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlags2.go
new file mode 100644
index 00000000000..7bd4f219707
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlags2.go
@@ -0,0 +1,186 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlags2 computes 2-by-2 orthogonal matrices U, V and Q with the
+// triangles of A and B specified by upper.
+//
+// If upper is true
+//
+//	Uᵀ*A*Q = Uᵀ*[ a1 a2 ]*Q = [ x  0 ]
+//	            [ 0  a3 ]     [ x  x ]
+//
+// and
+//
+//	Vᵀ*B*Q = Vᵀ*[ b1 b2 ]*Q = [ x  0 ]
+//	            [ 0  b3 ]     [ x  x ]
+//
+// otherwise
+//
+//	Uᵀ*A*Q = Uᵀ*[ a1 0  ]*Q = [ x  x ]
+//	            [ a2 a3 ]     [ 0  x ]
+//
+// and
+//
+//	Vᵀ*B*Q = Vᵀ*[ b1 0  ]*Q = [ x  x ]
+//	            [ b2 b3 ]     [ 0  x ].
+//
+// The rows of the transformed A and B are parallel, where
+//
+//	U = [  csu  snu ], V = [  csv snv ], Q = [  csq   snq ]
+//	    [ -snu  csu ]      [ -snv csv ]      [ -snq   csq ]
+//
+// Dlags2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlags2(upper bool, a1, a2, a3, b1, b2, b3 float64) (csu, snu, csv, snv, csq, snq float64) {
+	if upper {
+		// Input matrices A and B are upper triangular matrices.
+		//
+		// Form matrix C = A*adj(B) = [ a b ]
+		//                            [ 0 d ]
+		a := a1 * b3
+		d := a3 * b1
+		b := a2*b1 - a1*b2
+
+		// The SVD of real 2-by-2 triangular C.
+		//
+		//  [ csl -snl ]*[ a b ]*[  csr  snr ] = [ r 0 ]
+		//  [ snl  csl ] [ 0 d ] [ -snr  csr ]   [ 0 t ]
+		_, _, snr, csr, snl, csl := impl.Dlasv2(a, b, d)
+
+		if math.Abs(csl) >= math.Abs(snl) || math.Abs(csr) >= math.Abs(snr) {
+			// Compute the [0, 0] and [0, 1] elements of Uᵀ*A and Vᵀ*B,
+			// and [0, 1] element of |U|ᵀ*|A| and |V|ᵀ*|B|.
+
+			ua11r := csl * a1
+			ua12 := csl*a2 + snl*a3
+
+			vb11r := csr * b1
+			vb12 := csr*b2 + snr*b3
+
+			aua12 := math.Abs(csl)*math.Abs(a2) + math.Abs(snl)*math.Abs(a3)
+			avb12 := math.Abs(csr)*math.Abs(b2) + math.Abs(snr)*math.Abs(b3)
+
+			// Zero [0, 1] elements of Uᵀ*A and Vᵀ*B.
+			if math.Abs(ua11r)+math.Abs(ua12) != 0 {
+				if aua12/(math.Abs(ua11r)+math.Abs(ua12)) <= avb12/(math.Abs(vb11r)+math.Abs(vb12)) {
+					csq, snq, _ = impl.Dlartg(-ua11r, ua12)
+				} else {
+					csq, snq, _ = impl.Dlartg(-vb11r, vb12)
+				}
+			} else { // Both computed elements of Uᵀ*A are zero, so build Q from Vᵀ*B.
+				csq, snq, _ = impl.Dlartg(-vb11r, vb12)
+			}
+
+			csu = csl
+			snu = -snl
+			csv = csr
+			snv = -snr
+		} else {
+			// Compute the [1, 0] and [1, 1] elements of Uᵀ*A and Vᵀ*B,
+			// and [1, 1] element of |U|ᵀ*|A| and |V|ᵀ*|B|.
+
+			ua21 := -snl * a1
+			ua22 := -snl*a2 + csl*a3
+
+			vb21 := -snr * b1
+			vb22 := -snr*b2 + csr*b3
+
+			aua22 := math.Abs(snl)*math.Abs(a2) + math.Abs(csl)*math.Abs(a3)
+			avb22 := math.Abs(snr)*math.Abs(b2) + math.Abs(csr)*math.Abs(b3)
+
+			// Zero [1, 1] elements of Uᵀ*A and Vᵀ*B, and then swap.
+			if math.Abs(ua21)+math.Abs(ua22) != 0 {
+				if aua22/(math.Abs(ua21)+math.Abs(ua22)) <= avb22/(math.Abs(vb21)+math.Abs(vb22)) {
+					csq, snq, _ = impl.Dlartg(-ua21, ua22)
+				} else {
+					csq, snq, _ = impl.Dlartg(-vb21, vb22)
+				}
+			} else { // Both computed elements of Uᵀ*A are zero, so build Q from Vᵀ*B.
+				csq, snq, _ = impl.Dlartg(-vb21, vb22)
+			}
+
+			csu = snl
+			snu = csl
+			csv = snr
+			snv = csr
+		}
+	} else {
+		// Input matrices A and B are lower triangular matrices.
+		//
+		// Form matrix C = A*adj(B) = [ a 0 ]
+		//                            [ c d ]
+		a := a1 * b3
+		d := a3 * b1
+		c := a2*b3 - a3*b2
+
+		// The SVD of real 2-by-2 triangular C.
+		//
+		// [ csl -snl ]*[ a 0 ]*[  csr  snr ] = [ r 0 ]
+		// [ snl  csl ] [ c d ] [ -snr  csr ]   [ 0 t ]
+		_, _, snr, csr, snl, csl := impl.Dlasv2(a, c, d)
+
+		if math.Abs(csr) >= math.Abs(snr) || math.Abs(csl) >= math.Abs(snl) {
+			// Compute the [1, 0] and [1, 1] elements of Uᵀ*A and Vᵀ*B,
+			// and [1, 0] element of |U|ᵀ*|A| and |V|ᵀ*|B|.
+
+			ua21 := -snr*a1 + csr*a2
+			ua22r := csr * a3
+
+			vb21 := -snl*b1 + csl*b2
+			vb22r := csl * b3
+
+			aua21 := math.Abs(snr)*math.Abs(a1) + math.Abs(csr)*math.Abs(a2)
+			avb21 := math.Abs(snl)*math.Abs(b1) + math.Abs(csl)*math.Abs(b2)
+
+			// Zero [1, 0] elements of Uᵀ*A and Vᵀ*B.
+			if (math.Abs(ua21) + math.Abs(ua22r)) != 0 {
+				if aua21/(math.Abs(ua21)+math.Abs(ua22r)) <= avb21/(math.Abs(vb21)+math.Abs(vb22r)) {
+					csq, snq, _ = impl.Dlartg(ua22r, ua21)
+				} else {
+					csq, snq, _ = impl.Dlartg(vb22r, vb21)
+				}
+			} else { // Both computed elements of Uᵀ*A are zero, so build Q from Vᵀ*B.
+				csq, snq, _ = impl.Dlartg(vb22r, vb21)
+			}
+
+			csu = csr
+			snu = -snr
+			csv = csl
+			snv = -snl
+		} else {
+			// Compute the [0, 0] and [0, 1] elements of Uᵀ*A and Vᵀ*B,
+			// and [0, 0] element of |U|ᵀ*|A| and |V|ᵀ*|B|.
+
+			ua11 := csr*a1 + snr*a2
+			ua12 := snr * a3
+
+			vb11 := csl*b1 + snl*b2
+			vb12 := snl * b3
+
+			aua11 := math.Abs(csr)*math.Abs(a1) + math.Abs(snr)*math.Abs(a2)
+			avb11 := math.Abs(csl)*math.Abs(b1) + math.Abs(snl)*math.Abs(b2)
+
+			// Zero [0, 0] elements of Uᵀ*A and Vᵀ*B, and then swap.
+			if (math.Abs(ua11) + math.Abs(ua12)) != 0 {
+				if aua11/(math.Abs(ua11)+math.Abs(ua12)) <= avb11/(math.Abs(vb11)+math.Abs(vb12)) {
+					csq, snq, _ = impl.Dlartg(ua12, ua11)
+				} else {
+					csq, snq, _ = impl.Dlartg(vb12, vb11)
+				}
+			} else { // Both computed elements of Uᵀ*A are zero, so build Q from Vᵀ*B.
+				csq, snq, _ = impl.Dlartg(vb12, vb11)
+			}
+
+			csu = snr
+			snu = csr
+			csv = snl
+			snv = csl
+		}
+	}
+
+	return csu, snu, csv, snv, csq, snq
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlagtm.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlagtm.go
new file mode 100644
index 00000000000..fc8c8eb403b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlagtm.go
@@ -0,0 +1,111 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dlagtm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C   if trans == blas.NoTrans
+//	C = alpha * Aᵀ * B + beta * C  if trans == blas.Trans or blas.ConjTrans
+//
+// where A is an m×m tridiagonal matrix given by its sub-diagonal dl (at least m-1 elements), diagonal d (at least m elements) and super-diagonal du (at least m-1 elements),
+// B and C are m×n dense row-major matrices with strides ldb and ldc, and alpha and beta are scalars. C is updated in place; Dlagtm panics on invalid arguments or short slices.
+func (impl Implementation) Dlagtm(trans blas.Transpose, m, n int, alpha float64, dl, d, du []float64, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	switch {
+	case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case ldb < max(1, n):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	if m == 0 || n == 0 { // Quick return: C has no elements, so slice lengths are not checked.
+		return
+	}
+
+	switch {
+	case len(dl) < m-1:
+		panic(shortDL)
+	case len(d) < m:
+		panic(shortD)
+	case len(du) < m-1:
+		panic(shortDU)
+	case len(b) < (m-1)*ldb+n:
+		panic(shortB)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	}
+
+	if beta != 1 { // Form beta*C first; beta == 0 assigns zero instead of multiplying.
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				for j := range ci {
+					ci[j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				ci := c[i*ldc : i*ldc+n]
+				for j := range ci {
+					ci[j] *= beta
+				}
+			}
+		}
+	}
+
+	if alpha == 0 { // Nothing to add: only the beta*C term remains.
+		return
+	}
+
+	if m == 1 { // A is the 1×1 matrix [d[0]]; dl and du are not referenced.
+		if alpha == 1 {
+			for j := 0; j < n; j++ {
+				c[j] += d[0] * b[j]
+			}
+		} else {
+			for j := 0; j < n; j++ {
+				c[j] += alpha * d[0] * b[j]
+			}
+		}
+		return
+	}
+
+	if trans != blas.NoTrans { // Transposing A exchanges its sub- and super-diagonals.
+		dl, du = du, dl
+	}
+
+	if alpha == 1 { // Same update as the else branch without the multiplication by alpha.
+		for j := 0; j < n; j++ { // First row: no dl term.
+			c[j] += d[0]*b[j] + du[0]*b[ldb+j]
+		}
+		for i := 1; i < m-1; i++ { // Interior rows use all three diagonals.
+			for j := 0; j < n; j++ {
+				c[i*ldc+j] += dl[i-1]*b[(i-1)*ldb+j] + d[i]*b[i*ldb+j] + du[i]*b[(i+1)*ldb+j]
+			}
+		}
+		for j := 0; j < n; j++ { // Last row: no du term.
+			c[(m-1)*ldc+j] += dl[m-2]*b[(m-2)*ldb+j] + d[m-1]*b[(m-1)*ldb+j]
+		}
+	} else {
+		for j := 0; j < n; j++ { // First row: no dl term.
+			c[j] += alpha * (d[0]*b[j] + du[0]*b[ldb+j])
+		}
+		for i := 1; i < m-1; i++ { // Interior rows use all three diagonals.
+			for j := 0; j < n; j++ {
+				c[i*ldc+j] += alpha * (dl[i-1]*b[(i-1)*ldb+j] + d[i]*b[i*ldb+j] + du[i]*b[(i+1)*ldb+j])
+			}
+		}
+		for j := 0; j < n; j++ { // Last row: no du term.
+			c[(m-1)*ldc+j] += alpha * (dl[m-2]*b[(m-2)*ldb+j] + d[m-1]*b[(m-1)*ldb+j])
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlahqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlahqr.go
new file mode 100644
index 00000000000..6f1202547e4
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlahqr.go
@@ -0,0 +1,449 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlahqr computes the eigenvalues and Schur factorization of a block of an n×n
+// upper Hessenberg matrix H, using the double-shift/single-shift QR algorithm.
+//
+// h and ldh represent the matrix H. Dlahqr works primarily with the Hessenberg
+// submatrix H[ilo:ihi+1,ilo:ihi+1], but applies transformations to all of H if
+// wantt is true. It is assumed that H[ihi+1:n,ihi+1:n] is already upper
+// quasi-triangular, although this is not checked.
+//
+// It must hold that
+//
+//	0 <= ilo <= max(0,ihi), and ihi < n,
+//
+// and that
+//
+//	H[ilo,ilo-1] == 0,  if ilo > 0,
+//
+// otherwise Dlahqr will panic.
+//
+// If unconverged is zero on return, wr[ilo:ihi+1] and wi[ilo:ihi+1] will contain
+// respectively the real and imaginary parts of the computed eigenvalues ilo
+// to ihi. If two eigenvalues are computed as a complex conjugate pair, they are
+// stored in consecutive elements of wr and wi, say the i-th and (i+1)th, with
+// wi[i] > 0 and wi[i+1] < 0. If wantt is true, the eigenvalues are stored in
+// the same order as on the diagonal of the Schur form returned in H, with
+// wr[i] = H[i,i], and, if H[i:i+2,i:i+2] is a 2×2 diagonal block,
+// wi[i] = sqrt(abs(H[i+1,i]*H[i,i+1])) and wi[i+1] = -wi[i].
+//
+// wr and wi must have length ihi+1.
+//
+// z and ldz represent an n×n matrix Z. If wantz is true, the transformations
+// will be applied to the submatrix Z[iloz:ihiz+1,ilo:ihi+1] and it must hold that
+//
+//	0 <= iloz <= ilo, and ihi <= ihiz < n.
+//
+// If wantz is false, z is not referenced.
+//
+// unconverged indicates whether Dlahqr computed all the eigenvalues ilo to ihi
+// in a total of 30 iterations per eigenvalue.
+//
+// If unconverged is zero, all the eigenvalues ilo to ihi have been computed and
+// will be stored on return in wr[ilo:ihi+1] and wi[ilo:ihi+1].
+//
+// If unconverged is zero and wantt is true, H[ilo:ihi+1,ilo:ihi+1] will be
+// overwritten on return by upper quasi-triangular full Schur form with any
+// 2×2 diagonal blocks in standard form.
+//
+// If unconverged is zero and if wantt is false, the contents of h on return is
+// unspecified.
+//
+// If unconverged is positive, some eigenvalues have not converged, and
+// wr[unconverged:ihi+1] and wi[unconverged:ihi+1] contain those eigenvalues
+// which have been successfully computed.
+//
+// If unconverged is positive and wantt is true, then on return
+//
+//	(initial H)*U = U*(final H),   (*)
+//
+// where U is an orthogonal matrix. The final H is upper Hessenberg and
+// H[unconverged:ihi+1,unconverged:ihi+1] is upper quasi-triangular.
+//
+// If unconverged is positive and wantt is false, on return the remaining
+// unconverged eigenvalues are the eigenvalues of the upper Hessenberg matrix
+// H[ilo:unconverged,ilo:unconverged].
+//
+// If unconverged is positive and wantz is true, then on return
+//
+//	(final Z) = (initial Z)*U,
+//
+// where U is the orthogonal matrix in (*) regardless of the value of wantt.
+//
+// Dlahqr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlahqr(wantt, wantz bool, n, ilo, ihi int, h []float64, ldh int, wr, wi []float64, iloz, ihiz int, z []float64, ldz int) (unconverged int) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case ilo < 0, max(0, ihi) < ilo:
+		panic(badIlo)
+	case ihi >= n:
+		panic(badIhi)
+	case ldh < max(1, n):
+		panic(badLdH)
+	case wantz && (iloz < 0 || ilo < iloz):
+		panic(badIloz)
+	case wantz && (ihiz < ihi || n <= ihiz):
+		panic(badIhiz)
+	case ldz < 1, wantz && ldz < n:
+		panic(badLdZ)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return 0
+	}
+
+	switch {
+	case len(h) < (n-1)*ldh+n:
+		panic(shortH)
+	case len(wr) != ihi+1:
+		panic(shortWr)
+	case len(wi) != ihi+1:
+		panic(shortWi)
+	case wantz && len(z) < (n-1)*ldz+n:
+		panic(shortZ)
+	case ilo > 0 && h[ilo*ldh+ilo-1] != 0:
+		panic(notIsolated)
+	}
+
+	if ilo == ihi { // 1×1 active block: its single entry is the (real) eigenvalue.
+		wr[ilo] = h[ilo*ldh+ilo]
+		wi[ilo] = 0
+		return 0
+	}
+
+	// Clear out the trash: zero the second and third subdiagonals of H within rows and columns ilo to ihi.
+	for j := ilo; j < ihi-2; j++ {
+		h[(j+2)*ldh+j] = 0
+		h[(j+3)*ldh+j] = 0
+	}
+	if ilo <= ihi-2 {
+		h[ihi*ldh+ihi-2] = 0
+	}
+
+	nh := ihi - ilo + 1
+	nz := ihiz - iloz + 1
+
+	// Set machine-dependent constants for the stopping criterion.
+	ulp := dlamchP
+	smlnum := float64(nh) / ulp * dlamchS
+
+	// i1 and i2 are the indices of the first row and last column of H to
+	// which transformations must be applied. If eigenvalues only are being
+	// computed, i1 and i2 are set inside the main loop.
+	var i1, i2 int
+	if wantt {
+		i1 = 0
+		i2 = n - 1
+	}
+
+	itmax := 30 * max(10, nh) // Upper bound on the its counter below, which restarts at 0 on every pass of the main loop.
+
+	// kdefl counts the number of iterations since a deflation.
+	kdefl := 0
+
+	// The main loop begins here. i is the loop index and decreases from ihi
+	// to ilo in steps of 1 or 2. Each iteration of the loop works with the
+	// active submatrix in rows and columns l to i. Eigenvalues i+1 to ihi
+	// have already converged. Either l = ilo or H[l,l-1] is negligible so
+	// that the matrix splits.
+	bi := blas64.Implementation()
+	i := ihi
+	for i >= ilo {
+		l := ilo
+
+		// Perform QR iterations on rows and columns ilo to i until a
+		// submatrix of order 1 or 2 splits off at the bottom because a
+		// subdiagonal element has become negligible.
+		converged := false
+		for its := 0; its <= itmax; its++ {
+			// Look for a single small subdiagonal element.
+			var k int
+			for k = i; k > l; k-- {
+				if math.Abs(h[k*ldh+k-1]) <= smlnum {
+					break
+				}
+				tst := math.Abs(h[(k-1)*ldh+k-1]) + math.Abs(h[k*ldh+k])
+				if tst == 0 {
+					if k-2 >= ilo {
+						tst += math.Abs(h[(k-1)*ldh+k-2])
+					}
+					if k+1 <= ihi {
+						tst += math.Abs(h[(k+1)*ldh+k])
+					}
+				}
+				// The following is a conservative small
+				// subdiagonal deflation criterion due to Ahues
+				// & Tisseur (LAWN 122, 1997). It has better
+				// mathematical foundation and improves accuracy
+				// in some cases.
+				if math.Abs(h[k*ldh+k-1]) <= ulp*tst {
+					ab := math.Max(math.Abs(h[k*ldh+k-1]), math.Abs(h[(k-1)*ldh+k]))
+					ba := math.Min(math.Abs(h[k*ldh+k-1]), math.Abs(h[(k-1)*ldh+k]))
+					aa := math.Max(math.Abs(h[k*ldh+k]), math.Abs(h[(k-1)*ldh+k-1]-h[k*ldh+k]))
+					bb := math.Min(math.Abs(h[k*ldh+k]), math.Abs(h[(k-1)*ldh+k-1]-h[k*ldh+k]))
+					s := aa + ab
+					if ab/s*ba <= math.Max(smlnum, aa/s*bb*ulp) {
+						break
+					}
+				}
+			}
+			l = k
+			if l > ilo {
+				// H[l,l-1] is negligible.
+				h[l*ldh+l-1] = 0
+			}
+			if l >= i-1 {
+				// Break the loop because a submatrix of order 1
+				// or 2 has split off.
+				converged = true
+				break
+			}
+			kdefl++
+
+			// Now the active submatrix is in rows and columns l to
+			// i. If eigenvalues only are being computed, only the
+			// active submatrix need be transformed.
+			if !wantt {
+				i1 = l
+				i2 = i
+			}
+
+			const (
+				dat1  = 0.75
+				dat2  = -0.4375
+				kexsh = 10
+			)
+			var h11, h21, h12, h22 float64
+			switch {
+			case kdefl%(2*kexsh) == 0: // Exceptional shift.
+				s := math.Abs(h[i*ldh+i-1]) + math.Abs(h[(i-1)*ldh+i-2])
+				h11 = dat1*s + h[i*ldh+i]
+				h12 = dat2 * s
+				h21 = s
+				h22 = h11
+			case kdefl%kexsh == 0: // Exceptional shift.
+				s := math.Abs(h[(l+1)*ldh+l]) + math.Abs(h[(l+2)*ldh+l+1])
+				h11 = dat1*s + h[l*ldh+l]
+				h12 = dat2 * s
+				h21 = s
+				h22 = h11
+			default: // Prepare to use Francis' double shift (i.e.,
+				// 2nd degree generalized Rayleigh quotient).
+				h11 = h[(i-1)*ldh+i-1]
+				h21 = h[i*ldh+i-1]
+				h12 = h[(i-1)*ldh+i]
+				h22 = h[i*ldh+i]
+			}
+			s := math.Abs(h11) + math.Abs(h12) + math.Abs(h21) + math.Abs(h22)
+			var (
+				rt1r, rt1i float64
+				rt2r, rt2i float64
+			)
+			if s != 0 {
+				h11 /= s
+				h21 /= s
+				h12 /= s
+				h22 /= s
+				tr := (h11 + h22) / 2
+				det := (h11-tr)*(h22-tr) - h12*h21
+				rtdisc := math.Sqrt(math.Abs(det))
+				if det >= 0 {
+					// Complex conjugate shifts.
+					rt1r = tr * s
+					rt2r = rt1r
+					rt1i = rtdisc * s
+					rt2i = -rt1i
+				} else {
+					// Real shifts (use only one of them).
+					rt1r = tr + rtdisc
+					rt2r = tr - rtdisc
+					if math.Abs(rt1r-h22) <= math.Abs(rt2r-h22) {
+						rt1r *= s
+						rt2r = rt1r
+					} else {
+						rt2r *= s
+						rt1r = rt2r
+					}
+					rt1i = 0
+					rt2i = 0
+				}
+			}
+
+			// Look for two consecutive small subdiagonal elements.
+			var m int
+			var v [3]float64
+			for m = i - 2; m >= l; m-- {
+				// Determine the effect of starting the
+				// double-shift QR iteration at row m, and see
+				// if this would make H[m,m-1] negligible. The
+				// following uses scaling to avoid overflows and
+				// most underflows.
+				h21s := h[(m+1)*ldh+m]
+				s := math.Abs(h[m*ldh+m]-rt2r) + math.Abs(rt2i) + math.Abs(h21s)
+				h21s /= s
+				v[0] = h21s*h[m*ldh+m+1] + (h[m*ldh+m]-rt1r)*((h[m*ldh+m]-rt2r)/s) - rt2i/s*rt1i
+				v[1] = h21s * (h[m*ldh+m] + h[(m+1)*ldh+m+1] - rt1r - rt2r)
+				v[2] = h21s * h[(m+2)*ldh+m+1]
+				s = math.Abs(v[0]) + math.Abs(v[1]) + math.Abs(v[2])
+				v[0] /= s
+				v[1] /= s
+				v[2] /= s
+				if m == l {
+					break
+				}
+				dsum := math.Abs(h[(m-1)*ldh+m-1]) + math.Abs(h[m*ldh+m]) + math.Abs(h[(m+1)*ldh+m+1])
+				if math.Abs(h[m*ldh+m-1])*(math.Abs(v[1])+math.Abs(v[2])) <= ulp*math.Abs(v[0])*dsum {
+					break
+				}
+			}
+
+			// Double-shift QR step.
+			for k := m; k < i; k++ {
+				// The first iteration of this loop determines a
+				// reflection G from the vector V and applies it
+				// from left and right to H, thus creating a
+				// non-zero bulge below the subdiagonal.
+				//
+				// Each subsequent iteration determines a
+				// reflection G to restore the Hessenberg form
+				// in the (k-1)th column, and thus chases the
+				// bulge one step toward the bottom of the
+				// active submatrix. nr is the order of G.
+
+				nr := min(3, i-k+1)
+				if k > m {
+					bi.Dcopy(nr, h[k*ldh+k-1:], ldh, v[:], 1)
+				}
+				var t0 float64
+				v[0], t0 = impl.Dlarfg(nr, v[0], v[1:], 1)
+				if k > m {
+					h[k*ldh+k-1] = v[0]
+					h[(k+1)*ldh+k-1] = 0
+					if k < i-1 {
+						h[(k+2)*ldh+k-1] = 0
+					}
+				} else if m > l {
+					// Use the following instead of H[k,k-1] = -H[k,k-1]
+					// to avoid a bug when v[1] and v[2] underflow.
+					h[k*ldh+k-1] *= 1 - t0
+				}
+				t1 := t0 * v[1]
+				if nr == 3 {
+					t2 := t0 * v[2]
+
+					// Apply G from the left to transform
+					// the rows of the matrix in columns k
+					// to i2.
+					for j := k; j <= i2; j++ {
+						sum := h[k*ldh+j] + v[1]*h[(k+1)*ldh+j] + v[2]*h[(k+2)*ldh+j]
+						h[k*ldh+j] -= sum * t0
+						h[(k+1)*ldh+j] -= sum * t1
+						h[(k+2)*ldh+j] -= sum * t2
+					}
+
+					// Apply G from the right to transform
+					// the columns of the matrix in rows i1
+					// to min(k+3,i).
+					for j := i1; j <= min(k+3, i); j++ {
+						sum := h[j*ldh+k] + v[1]*h[j*ldh+k+1] + v[2]*h[j*ldh+k+2]
+						h[j*ldh+k] -= sum * t0
+						h[j*ldh+k+1] -= sum * t1
+						h[j*ldh+k+2] -= sum * t2
+					}
+
+					if wantz {
+						// Accumulate transformations in the matrix Z.
+						for j := iloz; j <= ihiz; j++ {
+							sum := z[j*ldz+k] + v[1]*z[j*ldz+k+1] + v[2]*z[j*ldz+k+2]
+							z[j*ldz+k] -= sum * t0
+							z[j*ldz+k+1] -= sum * t1
+							z[j*ldz+k+2] -= sum * t2
+						}
+					}
+				} else if nr == 2 {
+					// Apply G from the left to transform
+					// the rows of the matrix in columns k
+					// to i2.
+					for j := k; j <= i2; j++ {
+						sum := h[k*ldh+j] + v[1]*h[(k+1)*ldh+j]
+						h[k*ldh+j] -= sum * t0
+						h[(k+1)*ldh+j] -= sum * t1
+					}
+
+					// Apply G from the right to transform
+					// the columns of the matrix in rows i1
+					// to i (nr == 2 only occurs when k == i-1).
+					for j := i1; j <= i; j++ {
+						sum := h[j*ldh+k] + v[1]*h[j*ldh+k+1]
+						h[j*ldh+k] -= sum * t0
+						h[j*ldh+k+1] -= sum * t1
+					}
+
+					if wantz {
+						// Accumulate transformations in the matrix Z.
+						for j := iloz; j <= ihiz; j++ {
+							sum := z[j*ldz+k] + v[1]*z[j*ldz+k+1]
+							z[j*ldz+k] -= sum * t0
+							z[j*ldz+k+1] -= sum * t1
+						}
+					}
+				}
+			}
+		}
+
+		if !converged {
+			// The QR iteration finished without splitting off a
+			// submatrix of order 1 or 2.
+			return i + 1
+		}
+
+		if l == i {
+			// H[i,i-1] is negligible: one eigenvalue has converged.
+			wr[i] = h[i*ldh+i]
+			wi[i] = 0
+		} else if l == i-1 {
+			// H[i-1,i-2] is negligible: a pair of eigenvalues have converged.
+
+			// Transform the 2×2 submatrix to standard Schur form,
+			// and compute and store the eigenvalues.
+			var cs, sn float64
+			a, b := h[(i-1)*ldh+i-1], h[(i-1)*ldh+i]
+			c, d := h[i*ldh+i-1], h[i*ldh+i]
+			a, b, c, d, wr[i-1], wi[i-1], wr[i], wi[i], cs, sn = impl.Dlanv2(a, b, c, d)
+			h[(i-1)*ldh+i-1], h[(i-1)*ldh+i] = a, b
+			h[i*ldh+i-1], h[i*ldh+i] = c, d
+
+			if wantt {
+				// Apply the transformation to the rest of H.
+				if i2 > i {
+					bi.Drot(i2-i, h[(i-1)*ldh+i+1:], 1, h[i*ldh+i+1:], 1, cs, sn)
+				}
+				bi.Drot(i-i1-1, h[i1*ldh+i-1:], ldh, h[i1*ldh+i:], ldh, cs, sn)
+			}
+
+			if wantz {
+				// Apply the transformation to Z.
+				bi.Drot(nz, z[iloz*ldz+i-1:], ldz, z[iloz*ldz+i:], ldz, cs, sn)
+			}
+		}
+
+		// Reset deflation counter.
+		kdefl = 0
+
+		// Return to start of the main loop with new value of i.
+		i = l - 1
+	}
+	return 0
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlahr2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlahr2.go
new file mode 100644
index 00000000000..5921473342b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlahr2.go
@@ -0,0 +1,202 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlahr2 reduces the first nb columns of a real general n×(n-k+1) matrix A so
+// that elements below the k-th subdiagonal are zero. The reduction is performed
+// by an orthogonal similarity transformation Qᵀ * A * Q. Dlahr2 returns the
+// matrices V and T which determine Q as a block reflector I - V*T*Vᵀ, and
+// also the matrix Y = A * V * T.
+//
+// The matrix Q is represented as a product of nb elementary reflectors
+//
+//	Q = H_0 * H_1 * ... * H_{nb-1}.
+//
+// Each H_i has the form
+//
+//	H_i = I - tau[i] * v * vᵀ,
+//
+// where v is a real vector with v[0:i+k-1] = 0 and v[i+k-1] = 1. v[i+k:n] is
+// stored on exit in A[i+k+1:n,i].
+//
+// The elements of the vectors v together form the (n-k+1)×nb matrix
+// V which is needed, with T and Y, to apply the transformation to the
+// unreduced part of the matrix, using an update of the form
+//
+//	A = (I - V*T*Vᵀ) * (A - Y*Vᵀ).
+//
+// On entry, a contains the n×(n-k+1) general matrix A. On return, the elements
+// on and above the k-th subdiagonal in the first nb columns are overwritten
+// with the corresponding elements of the reduced matrix; the elements below the
+// k-th subdiagonal, with the slice tau, represent the matrix Q as a product of
+// elementary reflectors. The other columns of A are unchanged.
+//
+// The contents of A on exit are illustrated by the following example
+// with n = 7, k = 3 and nb = 2:
+//
+//	[ a   a   a   a   a ]
+//	[ a   a   a   a   a ]
+//	[ a   a   a   a   a ]
+//	[ h   h   a   a   a ]
+//	[ v0  h   a   a   a ]
+//	[ v0  v1  a   a   a ]
+//	[ v0  v1  a   a   a ]
+//
+// where a denotes an element of the original matrix A, h denotes a
+// modified element of the upper Hessenberg matrix H, and vi denotes an
+// element of the vector defining H_i.
+//
+// k is the offset for the reduction. Elements below the k-th subdiagonal in the
+// first nb columns are reduced to zero.
+//
+// nb is the number of columns to be reduced.
+//
+// On entry, a represents the n×(n-k+1) matrix A. On return, the elements on and
+// above the k-th subdiagonal in the first nb columns are overwritten with the
+// corresponding elements of the reduced matrix. The elements below the k-th
+// subdiagonal, with the slice tau, represent the matrix Q as a product of
+// elementary reflectors. The other columns of A are unchanged.
+//
+// tau will contain the scalar factors of the elementary reflectors. It must
+// have length at least nb.
+//
+// t and ldt represent the nb×nb upper triangular matrix T, and y and ldy
+// represent the n×nb matrix Y.
+//
+// Dlahr2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlahr2(n, k, nb int, a []float64, lda int, tau, t []float64, ldt int, y []float64, ldy int) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case nb < 0:
+		panic(nbLT0)
+	case nb > n:
+		panic(nbGTN)
+	case lda < max(1, n-k+1):
+		panic(badLdA)
+	case ldt < max(1, nb):
+		panic(badLdT)
+	case ldy < max(1, nb):
+		panic(badLdY)
+	}
+
+	// Quick return if possible. n == 0 forces nb == 0, so there is nothing to reduce.
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n-k+1:
+		panic(shortA)
+	case len(tau) < nb:
+		panic(shortTau)
+	case len(t) < (nb-1)*ldt+nb:
+		panic(shortT)
+	case len(y) < (n-1)*ldy+nb:
+		panic(shortY)
+	}
+
+	// Quick return if possible (done only after the slice lengths have been validated).
+	if n == 1 {
+		return
+	}
+
+	bi := blas64.Implementation()
+	var ei float64
+	for i := 0; i < nb; i++ {
+		if i > 0 {
+			// Update A[k:n,i].
+
+			// Update i-th column of A - Y * Vᵀ.
+			bi.Dgemv(blas.NoTrans, n-k, i,
+				-1, y[k*ldy:], ldy,
+				a[(k+i-1)*lda:], 1,
+				1, a[k*lda+i:], lda)
+
+			// Apply I - V * Tᵀ * Vᵀ to this column (call it b)
+			// from the left, using the last column of T as
+			// workspace.
+			// Let V = [ V1 ]   and   b = [ b1 ]   (first i rows)
+			//         [ V2 ]             [ b2 ]
+			// where V1 is unit lower triangular.
+			//
+			// w := V1ᵀ * b1.
+			bi.Dcopy(i, a[k*lda+i:], lda, t[nb-1:], ldt)
+			bi.Dtrmv(blas.Lower, blas.Trans, blas.Unit, i,
+				a[k*lda:], lda, t[nb-1:], ldt)
+
+			// w := w + V2ᵀ * b2.
+			bi.Dgemv(blas.Trans, n-k-i, i,
+				1, a[(k+i)*lda:], lda,
+				a[(k+i)*lda+i:], lda,
+				1, t[nb-1:], ldt)
+
+			// w := Tᵀ * w.
+			bi.Dtrmv(blas.Upper, blas.Trans, blas.NonUnit, i,
+				t, ldt, t[nb-1:], ldt)
+
+			// b2 := b2 - V2*w.
+			bi.Dgemv(blas.NoTrans, n-k-i, i,
+				-1, a[(k+i)*lda:], lda,
+				t[nb-1:], ldt,
+				1, a[(k+i)*lda+i:], lda)
+
+			// b1 := b1 - V1*w.
+			bi.Dtrmv(blas.Lower, blas.NoTrans, blas.Unit, i,
+				a[k*lda:], lda, t[nb-1:], ldt)
+			bi.Daxpy(i, -1, t[nb-1:], ldt, a[k*lda+i:], lda)
+
+			a[(k+i-1)*lda+i-1] = ei
+		}
+
+		// Generate the elementary reflector H_i to annihilate
+		// A[k+i+1:n,i].
+		ei, tau[i] = impl.Dlarfg(n-k-i, a[(k+i)*lda+i], a[min(k+i+1, n-1)*lda+i:], lda)
+		a[(k+i)*lda+i] = 1
+
+		// Compute Y[k:n,i].
+		bi.Dgemv(blas.NoTrans, n-k, n-k-i,
+			1, a[k*lda+i+1:], lda,
+			a[(k+i)*lda+i:], lda,
+			0, y[k*ldy+i:], ldy)
+		bi.Dgemv(blas.Trans, n-k-i, i,
+			1, a[(k+i)*lda:], lda,
+			a[(k+i)*lda+i:], lda,
+			0, t[i:], ldt)
+		bi.Dgemv(blas.NoTrans, n-k, i,
+			-1, y[k*ldy:], ldy,
+			t[i:], ldt,
+			1, y[k*ldy+i:], ldy)
+		bi.Dscal(n-k, tau[i], y[k*ldy+i:], ldy)
+
+		// Compute T[0:i,i].
+		bi.Dscal(i, -tau[i], t[i:], ldt)
+		bi.Dtrmv(blas.Upper, blas.NoTrans, blas.NonUnit, i,
+			t, ldt, t[i:], ldt)
+
+		t[i*ldt+i] = tau[i]
+	}
+	a[(k+nb-1)*lda+nb-1] = ei
+
+	// Compute Y[0:k,0:nb].
+	impl.Dlacpy(blas.All, k, nb, a[1:], lda, y, ldy)
+	bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, k, nb,
+		1, a[k*lda:], lda, y, ldy)
+	if n > k+nb {
+		bi.Dgemm(blas.NoTrans, blas.NoTrans, k, nb, n-k-nb,
+			1, a[1+nb:], lda,
+			a[(k+nb)*lda:], lda,
+			1, y, ldy)
+	}
+	bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.NonUnit, k, nb,
+		1, t, ldt, y, ldy)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaln2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaln2.go
new file mode 100644
index 00000000000..54d443988b7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaln2.go
@@ -0,0 +1,407 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlaln2 solves a linear equation or a system of 2 linear equations of the form
+//
+//	(ca A   - w D) X = scale B  if trans == false,
+//	(ca Aᵀ - w D) X = scale B   if trans == true,
+//
+// where A is a na×na real matrix, ca is a real scalar, D is a na×na diagonal
+// real matrix, w is a scalar, real if nw == 1, complex if nw == 2, and X and B
+// are na×1 matrices, real if w is real, complex if w is complex.
+//
+// If w is complex, X and B are represented as na×2 matrices, the first column
+// of each being the real part and the second being the imaginary part.
+//
+// na and nw must be 1 or 2, otherwise Dlaln2 will panic.
+//
+// d1 and d2 are the diagonal elements of D. d2 is not used if na == 1.
+//
+// wr and wi represent the real and imaginary part, respectively, of the scalar
+// w. wi is not used if nw == 1.
+//
+// smin is the desired lower bound on the singular values of A. This should be
+// a safe distance away from underflow or overflow, say, between
+// (underflow/machine precision) and (overflow*machine precision).
+//
+// If both singular values of (ca A - w D) are less than smin, smin*identity
+// will be used instead of (ca A - w D). If only one singular value is less than
+// smin, one element of (ca A - w D) will be perturbed enough to make the
+// smallest singular value roughly smin. If both singular values are at least
+// smin, (ca A - w D) will not be perturbed. In any case, the perturbation will
+// be at most some small multiple of max(smin, ulp*norm(ca A - w D)). The
+// singular values are computed by infinity-norm approximations, and thus will
+// only be correct to a factor of 2 or so.
+//
+// All input quantities are assumed to be smaller than overflow by a reasonable
+// factor.
+//
+// scale is a scaling factor less than or equal to 1 which is chosen so that X
+// can be computed without overflow. X is further scaled if necessary to assure
+// that norm(ca A - w D)*norm(X) is less than overflow.
+//
+// xnorm contains the infinity-norm of X when X is regarded as a na×nw real
+// matrix.
+//
+// ok will be false if (ca A - w D) had to be perturbed to make its smallest
+// singular value greater than smin, otherwise ok will be true.
+//
+// Dlaln2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaln2(trans bool, na, nw int, smin, ca float64, a []float64, lda int, d1, d2 float64, b []float64, ldb int, wr, wi float64, x []float64, ldx int) (scale, xnorm float64, ok bool) {
+	// TODO(vladimir-ch): Consider splitting this function into two, one
+	// handling the real case (nw == 1) and the other handling the complex
+	// case (nw == 2). Given that Go has complex types, their signatures
+	// would be simpler and more natural, and the implementation not as
+	// convoluted.
+
+	switch {
+	case na != 1 && na != 2:
+		panic(badNa)
+	case nw != 1 && nw != 2:
+		panic(badNw)
+	case lda < na:
+		panic(badLdA)
+	case len(a) < (na-1)*lda+na:
+		panic(shortA)
+	case ldb < nw:
+		panic(badLdB)
+	case len(b) < (na-1)*ldb+nw:
+		panic(shortB)
+	case ldx < nw:
+		panic(badLdX)
+	case len(x) < (na-1)*ldx+nw:
+		panic(shortX)
+	}
+
+	smlnum := 2 * dlamchS
+	bignum := 1 / smlnum
+	smini := math.Max(smin, smlnum)
+
+	ok = true
+	scale = 1
+
+	if na == 1 {
+		// 1×1 (i.e., scalar) system C X = B.
+
+		if nw == 1 {
+			// Real 1×1 system.
+
+			// C = ca A - w D.
+			csr := ca*a[0] - wr*d1
+			cnorm := math.Abs(csr)
+
+			// If |C| < smini, use C = smini.
+			if cnorm < smini {
+				csr = smini
+				cnorm = smini
+				ok = false
+			}
+
+			// Check scaling for X = B / C.
+			bnorm := math.Abs(b[0])
+			if cnorm < 1 && bnorm > math.Max(1, bignum*cnorm) {
+				scale = 1 / bnorm
+			}
+
+			// Compute X.
+			x[0] = b[0] * scale / csr
+			xnorm = math.Abs(x[0])
+
+			return scale, xnorm, ok
+		}
+
+		// Complex 1×1 system (w is complex).
+
+		// C = ca A - w D.
+		csr := ca*a[0] - wr*d1
+		csi := -wi * d1
+		cnorm := math.Abs(csr) + math.Abs(csi)
+
+		// If |C| < smini, use C = smini.
+		if cnorm < smini {
+			csr = smini
+			csi = 0
+			cnorm = smini
+			ok = false
+		}
+
+		// Check scaling for X = B / C.
+		bnorm := math.Abs(b[0]) + math.Abs(b[1])
+		if cnorm < 1 && bnorm > math.Max(1, bignum*cnorm) {
+			scale = 1 / bnorm
+		}
+
+		// Compute X.
+		cx := complex(scale*b[0], scale*b[1]) / complex(csr, csi)
+		x[0], x[1] = real(cx), imag(cx)
+		xnorm = math.Abs(x[0]) + math.Abs(x[1])
+
+		return scale, xnorm, ok
+	}
+
+	// 2×2 system.
+
+	// Compute the real part of
+	//  C = ca A   - w D
+	// or
+	//  C = ca Aᵀ - w D.
+	crv := [4]float64{
+		ca*a[0] - wr*d1,
+		ca * a[1],
+		ca * a[lda],
+		ca*a[lda+1] - wr*d2,
+	}
+	if trans {
+		crv[1] = ca * a[lda]
+		crv[2] = ca * a[1]
+	}
+
+	pivot := [4][4]int{
+		{0, 1, 2, 3},
+		{1, 0, 3, 2},
+		{2, 3, 0, 1},
+		{3, 2, 1, 0},
+	}
+
+	if nw == 1 {
+		// Real 2×2 system (w is real).
+
+		// Find the largest element in C.
+		var cmax float64
+		var icmax int
+		for j, v := range crv {
+			v = math.Abs(v)
+			if v > cmax {
+				cmax = v
+				icmax = j
+			}
+		}
+
+		// If norm(C) < smini, use smini*identity.
+		if cmax < smini {
+			bnorm := math.Max(math.Abs(b[0]), math.Abs(b[ldb]))
+			if smini < 1 && bnorm > math.Max(1, bignum*smini) {
+				scale = 1 / bnorm
+			}
+			temp := scale / smini
+			x[0] = temp * b[0]
+			x[ldx] = temp * b[ldb]
+			xnorm = temp * bnorm
+			ok = false
+
+			return scale, xnorm, ok
+		}
+
+		// Gaussian elimination with complete pivoting.
+		// Form upper triangular matrix
+		//  [ur11 ur12]
+		//  [   0 ur22]
+		ur11 := crv[icmax]
+		ur12 := crv[pivot[icmax][1]]
+		cr21 := crv[pivot[icmax][2]]
+		cr22 := crv[pivot[icmax][3]]
+		ur11r := 1 / ur11
+		lr21 := ur11r * cr21
+		ur22 := cr22 - ur12*lr21
+
+		// If smaller pivot < smini, use smini.
+		if math.Abs(ur22) < smini {
+			ur22 = smini
+			ok = false
+		}
+
+		var br1, br2 float64
+		if icmax > 1 {
+			// If the pivot lies in the second row, swap the rows.
+			br1 = b[ldb]
+			br2 = b[0]
+		} else {
+			br1 = b[0]
+			br2 = b[ldb]
+		}
+		br2 -= lr21 * br1 // Apply the Gaussian elimination step to the right-hand side.
+
+		bbnd := math.Max(math.Abs(ur22*ur11r*br1), math.Abs(br2))
+		if bbnd > 1 && math.Abs(ur22) < 1 && bbnd >= bignum*math.Abs(ur22) {
+			scale = 1 / bbnd
+		}
+
+		// Solve the linear system ur*xr=br.
+		xr2 := br2 * scale / ur22
+		xr1 := scale*br1*ur11r - ur11r*ur12*xr2
+		if icmax&0x1 != 0 {
+			// If the pivot lies in the second column, swap the components of the solution.
+			x[0] = xr2
+			x[ldx] = xr1
+		} else {
+			x[0] = xr1
+			x[ldx] = xr2
+		}
+		xnorm = math.Max(math.Abs(xr1), math.Abs(xr2))
+
+		// Further scaling if norm(A)*norm(X) > overflow.
+		if xnorm > 1 && cmax > 1 && xnorm > bignum/cmax {
+			temp := cmax / bignum
+			x[0] *= temp
+			x[ldx] *= temp
+			xnorm *= temp
+			scale *= temp
+		}
+
+		return scale, xnorm, ok
+	}
+
+	// Complex 2×2 system (w is complex).
+
+	// Find the largest element in C.
+	civ := [4]float64{
+		-wi * d1,
+		0,
+		0,
+		-wi * d2,
+	}
+	var cmax float64
+	var icmax int
+	for j, v := range crv {
+		v := math.Abs(v)
+		if v+math.Abs(civ[j]) > cmax {
+			cmax = v + math.Abs(civ[j])
+			icmax = j
+		}
+	}
+
+	// If norm(C) < smini, use smini*identity. X is indexed with its own stride ldx, not ldb.
+	if cmax < smini {
+		br1 := math.Abs(b[0]) + math.Abs(b[1])
+		br2 := math.Abs(b[ldb]) + math.Abs(b[ldb+1])
+		bnorm := math.Max(br1, br2)
+		if smini < 1 && bnorm > 1 && bnorm > bignum*smini {
+			scale = 1 / bnorm
+		}
+		temp := scale / smini
+		x[0] = temp * b[0]
+		x[1] = temp * b[1]
+		x[ldx] = temp * b[ldb]
+		x[ldx+1] = temp * b[ldb+1]
+		xnorm = temp * bnorm
+		ok = false
+
+		return scale, xnorm, ok
+	}
+
+	// Gaussian elimination with complete pivoting.
+	ur11 := crv[icmax]
+	ui11 := civ[icmax]
+	ur12 := crv[pivot[icmax][1]]
+	ui12 := civ[pivot[icmax][1]]
+	cr21 := crv[pivot[icmax][2]]
+	ci21 := civ[pivot[icmax][2]]
+	cr22 := crv[pivot[icmax][3]]
+	ci22 := civ[pivot[icmax][3]]
+	var (
+		ur11r, ui11r float64
+		lr21, li21   float64
+		ur12s, ui12s float64
+		ur22, ui22   float64
+	)
+	if icmax == 0 || icmax == 3 {
+		// Off-diagonals of pivoted C are real.
+		if math.Abs(ur11) > math.Abs(ui11) {
+			temp := ui11 / ur11
+			ur11r = 1 / (ur11 * (1 + temp*temp))
+			ui11r = -temp * ur11r
+		} else {
+			temp := ur11 / ui11
+			ui11r = -1 / (ui11 * (1 + temp*temp))
+			ur11r = -temp * ui11r
+		}
+		lr21 = cr21 * ur11r
+		li21 = cr21 * ui11r
+		ur12s = ur12 * ur11r
+		ui12s = ur12 * ui11r
+		ur22 = cr22 - ur12*lr21
+		ui22 = ci22 - ur12*li21
+	} else {
+		// Diagonals of pivoted C are real.
+		ur11r = 1 / ur11
+		// ui11r is already 0.
+		lr21 = cr21 * ur11r
+		li21 = ci21 * ur11r
+		ur12s = ur12 * ur11r
+		ui12s = ui12 * ur11r
+		ur22 = cr22 - ur12*lr21 + ui12*li21
+		ui22 = -ur12*li21 - ui12*lr21
+	}
+	u22abs := math.Abs(ur22) + math.Abs(ui22)
+
+	// If smaller pivot < smini, use smini.
+	if u22abs < smini {
+		ur22 = smini
+		ui22 = 0
+		ok = false
+	}
+
+	var br1, bi1 float64
+	var br2, bi2 float64
+	if icmax > 1 {
+		// If the pivot lies in the second row, swap the rows.
+		br1 = b[ldb]
+		bi1 = b[ldb+1]
+		br2 = b[0]
+		bi2 = b[1]
+	} else {
+		br1 = b[0]
+		bi1 = b[1]
+		br2 = b[ldb]
+		bi2 = b[ldb+1]
+	}
+	br2 += -lr21*br1 + li21*bi1
+	bi2 += -li21*br1 - lr21*bi1
+
+	bbnd1 := u22abs * (math.Abs(ur11r) + math.Abs(ui11r)) * (math.Abs(br1) + math.Abs(bi1))
+	bbnd2 := math.Abs(br2) + math.Abs(bi2)
+	bbnd := math.Max(bbnd1, bbnd2)
+	if bbnd > 1 && u22abs < 1 && bbnd >= bignum*u22abs {
+		scale = 1 / bbnd
+		br1 *= scale
+		bi1 *= scale
+		br2 *= scale
+		bi2 *= scale
+	}
+
+	cx2 := complex(br2, bi2) / complex(ur22, ui22)
+	xr2, xi2 := real(cx2), imag(cx2)
+	xr1 := ur11r*br1 - ui11r*bi1 - ur12s*xr2 + ui12s*xi2
+	xi1 := ui11r*br1 + ur11r*bi1 - ui12s*xr2 - ur12s*xi2
+	if icmax&0x1 != 0 {
+		// If the pivot lies in the second column, swap the components of the solution.
+		x[0] = xr2
+		x[1] = xi2
+		x[ldx] = xr1
+		x[ldx+1] = xi1
+	} else {
+		x[0] = xr1
+		x[1] = xi1
+		x[ldx] = xr2
+		x[ldx+1] = xi2
+	}
+	xnorm = math.Max(math.Abs(xr1)+math.Abs(xi1), math.Abs(xr2)+math.Abs(xi2))
+
+	// Further scaling if norm(A)*norm(X) > overflow.
+	if xnorm > 1 && cmax > 1 && xnorm > bignum/cmax {
+		temp := cmax / bignum
+		x[0] *= temp
+		x[1] *= temp
+		x[ldx] *= temp
+		x[ldx+1] *= temp
+		xnorm *= temp
+		scale *= temp
+	}
+
+	return scale, xnorm, ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlangb.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlangb.go
new file mode 100644
index 00000000000..4b7b449f639
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlangb.go
@@ -0,0 +1,87 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/internal/asm/f64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlangb returns the given norm of an m×n band matrix with kl sub-diagonals and
+// ku super-diagonals.
+func (impl Implementation) Dlangb(norm lapack.MatrixNorm, m, n, kl, ku int, ab []float64, ldab int) float64 {
+	ncol := kl + 1 + ku // number of band columns stored per row of ab
+	switch {
+	case norm != lapack.MaxAbs && norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius:
+		panic(badNorm)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case kl < 0:
+		panic(klLT0)
+	case ku < 0:
+		panic(kuLT0)
+	case ldab < ncol:
+		panic(badLdA)
+	}
+
+	// Quick return if possible: the norm of an empty matrix is 0.
+	if m == 0 || n == 0 {
+		return 0
+	}
+
+	switch {
+	case len(ab) < min(m, n+kl)*ldab:
+		panic(shortAB)
+	}
+
+	var value float64
+	switch norm {
+	case lapack.MaxAbs:
+		for i := 0; i < min(m, n+kl); i++ {
+			l := max(0, kl-i)
+			u := min(n+kl-i, ncol)
+			for _, aij := range ab[i*ldab+l : i*ldab+u] {
+				aij = math.Abs(aij)
+				if aij > value || math.IsNaN(aij) { // a NaN element makes the result NaN
+					value = aij
+				}
+			}
+		}
+	case lapack.MaxRowSum:
+		for i := 0; i < min(m, n+kl); i++ {
+			l := max(0, kl-i)
+			u := min(n+kl-i, ncol)
+			sum := f64.L1Norm(ab[i*ldab+l : i*ldab+u])
+			if sum > value || math.IsNaN(sum) { // a NaN row sum makes the result NaN
+				value = sum
+			}
+		}
+	case lapack.MaxColumnSum:
+		for j := 0; j < min(m+ku, n); j++ {
+			jb := min(kl+j, ncol-1)
+			ib := max(0, j-ku)
+			jlen := min(j+kl, m-1) - ib + 1
+			sum := f64.L1NormInc(ab[ib*ldab+jb:], jlen, max(1, ldab-1))
+			if sum > value || math.IsNaN(sum) { // a NaN column sum makes the result NaN
+				value = sum
+			}
+		}
+	case lapack.Frobenius:
+		scale := 0.0
+		sum := 1.0
+		for i := 0; i < min(m, n+kl); i++ {
+			l := max(0, kl-i)
+			u := min(n+kl-i, ncol)
+			ilen := u - l // number of band elements read from row i of ab
+			scale, sum = impl.Dlassq(ilen, ab[i*ldab+l:], 1, scale, sum)
+		}
+		value = scale * math.Sqrt(sum)
+	}
+	return value
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlange.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlange.go
new file mode 100644
index 00000000000..3a00dce1dac
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlange.go
@@ -0,0 +1,89 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlange returns the value of the specified norm of a general m×n matrix A:
+//
+//	lapack.MaxAbs:       the maximum absolute value of any element.
+//	lapack.MaxColumnSum: the maximum column sum of the absolute values of the elements (1-norm).
+//	lapack.MaxRowSum:    the maximum row sum of the absolute values of the elements (infinity-norm).
+//	lapack.Frobenius:    the square root of the sum of the squares of the elements (Frobenius norm).
+//
+// If norm == lapack.MaxColumnSum, work must have length at least n, and this function
+// will panic otherwise. There are no restrictions on work for the other matrix norms.
+func (impl Implementation) Dlange(norm lapack.MatrixNorm, m, n int, a []float64, lda int, work []float64) float64 {
+	// TODO(btracey): These should probably be refactored to use BLAS calls.
+	switch {
+	case norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius && norm != lapack.MaxAbs:
+		panic(badNorm)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible: the norm of an empty matrix is 0.
+	if m == 0 || n == 0 {
+		return 0
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case norm == lapack.MaxColumnSum && len(work) < n:
+		panic(shortWork)
+	}
+
+	switch norm {
+	case lapack.MaxAbs:
+		var value float64
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				value = math.Max(value, math.Abs(a[i*lda+j]))
+			}
+		}
+		return value
+	case lapack.MaxColumnSum:
+		for i := 0; i < n; i++ {
+			work[i] = 0
+		}
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				work[j] += math.Abs(a[i*lda+j]) // work[j] accumulates the absolute sum of column j
+			}
+		}
+		var value float64
+		for i := 0; i < n; i++ {
+			value = math.Max(value, work[i])
+		}
+		return value
+	case lapack.MaxRowSum:
+		var value float64
+		for i := 0; i < m; i++ {
+			var sum float64
+			for j := 0; j < n; j++ {
+				sum += math.Abs(a[i*lda+j])
+			}
+			value = math.Max(value, sum)
+		}
+		return value
+	default:
+		// lapack.Frobenius
+		scale := 0.0
+		sum := 1.0
+		for i := 0; i < m; i++ {
+			scale, sum = impl.Dlassq(n, a[i*lda:], 1, scale, sum)
+		}
+		return scale * math.Sqrt(sum)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlangt.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlangt.go
new file mode 100644
index 00000000000..cd1c49b5c3a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlangt.go
@@ -0,0 +1,115 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlangt returns the value of the given norm of an n×n tridiagonal matrix
+// represented by the three diagonals.
+//
+// d must have length at least n and dl and du must have length at least n-1.
+func (impl Implementation) Dlangt(norm lapack.MatrixNorm, n int, dl, d, du []float64) float64 {
+	switch {
+	case norm != lapack.MaxAbs && norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius:
+		panic(badNorm)
+	case n < 0:
+		panic(nLT0)
+	}
+
+	if n == 0 { // Quick return: the norm of an empty matrix is 0.
+		return 0
+	}
+
+	switch {
+	case len(dl) < n-1:
+		panic(shortDL)
+	case len(d) < n:
+		panic(shortD)
+	case len(du) < n-1:
+		panic(shortDU)
+	}
+
+	dl = dl[:n-1]
+	d = d[:n]
+	du = du[:n-1]
+
+	var anorm float64
+	switch norm {
+	case lapack.MaxAbs:
+		for _, diag := range [][]float64{dl, d, du} {
+			for _, di := range diag {
+				if math.IsNaN(di) { // a NaN element is returned immediately as the norm
+					return di
+				}
+				di = math.Abs(di)
+				if di > anorm {
+					anorm = di
+				}
+			}
+		}
+	case lapack.MaxColumnSum:
+		if n == 1 { // 1×1 matrix: dl and du are empty here
+			return math.Abs(d[0])
+		}
+		anorm = math.Abs(d[0]) + math.Abs(dl[0]) // first column
+		if math.IsNaN(anorm) {
+			return anorm
+		}
+		tmp := math.Abs(du[n-2]) + math.Abs(d[n-1]) // last column
+		if math.IsNaN(tmp) {
+			return tmp
+		}
+		if tmp > anorm {
+			anorm = tmp
+		}
+		for i := 1; i < n-1; i++ { // interior columns have three entries
+			tmp = math.Abs(du[i-1]) + math.Abs(d[i]) + math.Abs(dl[i])
+			if math.IsNaN(tmp) {
+				return tmp
+			}
+			if tmp > anorm {
+				anorm = tmp
+			}
+		}
+	case lapack.MaxRowSum:
+		if n == 1 { // 1×1 matrix: dl and du are empty here
+			return math.Abs(d[0])
+		}
+		anorm = math.Abs(d[0]) + math.Abs(du[0]) // first row
+		if math.IsNaN(anorm) {
+			return anorm
+		}
+		tmp := math.Abs(dl[n-2]) + math.Abs(d[n-1]) // last row
+		if math.IsNaN(tmp) {
+			return tmp
+		}
+		if tmp > anorm {
+			anorm = tmp
+		}
+		for i := 1; i < n-1; i++ { // interior rows have three entries
+			tmp = math.Abs(dl[i-1]) + math.Abs(d[i]) + math.Abs(du[i])
+			if math.IsNaN(tmp) {
+				return tmp
+			}
+			if tmp > anorm {
+				anorm = tmp
+			}
+		}
+	case lapack.Frobenius:
+		scale := 0.0
+		ssq := 1.0
+		scale, ssq = impl.Dlassq(n, d, 1, scale, ssq)
+		if n > 1 {
+			scale, ssq = impl.Dlassq(n-1, dl, 1, scale, ssq)
+			scale, ssq = impl.Dlassq(n-1, du, 1, scale, ssq)
+		}
+		anorm = scale * math.Sqrt(ssq)
+	}
+	return anorm
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlanhs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanhs.go
new file mode 100644
index 00000000000..054b90f02ba
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanhs.go
@@ -0,0 +1,78 @@
+// Copyright ©2023 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlanhs returns the value of the one norm, or the Frobenius norm, or the
+// infinity norm, or the element of largest absolute value of a Hessenberg
+// matrix A.
+//
+// If norm is lapack.MaxColumnSum, work must have length at least n.
+func (impl Implementation) Dlanhs(norm lapack.MatrixNorm, n int, a []float64, lda int, work []float64) float64 {
+	switch {
+	case norm != lapack.MaxRowSum && norm != lapack.MaxAbs && norm != lapack.MaxColumnSum && norm != lapack.Frobenius:
+		panic(badNorm)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 { // Quick return: the norm of an empty matrix is 0.
+		return 0
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case norm == lapack.MaxColumnSum && len(work) < n:
+		panic(shortWork)
+	}
+
+	bi := blas64.Implementation()
+	var value float64
+	switch norm {
+	case lapack.MaxAbs:
+		for i := 0; i < n; i++ {
+			minj := max(0, i-1) // first column of row i that is read; entries left of it are skipped
+			for _, v := range a[i*lda+minj : i*lda+n] {
+				value = math.Max(value, math.Abs(v))
+			}
+		}
+	case lapack.MaxColumnSum:
+		for i := 0; i < n; i++ {
+			work[i] = 0
+		}
+		for i := 0; i < n; i++ {
+			for j := max(0, i-1); j < n; j++ {
+				work[j] += math.Abs(a[i*lda+j]) // work[j] accumulates the absolute sum of column j
+			}
+		}
+		for _, v := range work[:n] {
+			value = math.Max(value, v)
+		}
+	case lapack.MaxRowSum:
+		for i := 0; i < n; i++ {
+			minj := max(0, i-1)
+			sum := bi.Dasum(n-minj, a[i*lda+minj:], 1)
+			value = math.Max(value, sum)
+		}
+	case lapack.Frobenius:
+		scale := 0.0
+		sum := 1.0
+		for i := 0; i < n; i++ {
+			minj := max(0, i-1)
+			scale, sum = impl.Dlassq(n-minj, a[i*lda+minj:], 1, scale, sum)
+		}
+		value = scale * math.Sqrt(sum)
+	}
+	return value
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlansb.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlansb.go
new file mode 100644
index 00000000000..17801f84b64
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlansb.go
@@ -0,0 +1,131 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlansb returns the given norm of an n×n symmetric band matrix with kd
+// super-diagonals.
+//
+// When norm is lapack.MaxColumnSum or lapack.MaxRowSum, the length of work must
+// be at least n.
+func (impl Implementation) Dlansb(norm lapack.MatrixNorm, uplo blas.Uplo, n, kd int, ab []float64, ldab int, work []float64) float64 {
+	switch {
+	case norm != lapack.MaxAbs && norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius:
+		panic(badNorm)
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case kd < 0:
+		panic(kdLT0)
+	case ldab < kd+1:
+		panic(badLdA)
+	}
+
+	// Quick return if possible: the norm of an empty matrix is 0.
+	if n == 0 {
+		return 0
+	}
+
+	switch {
+	case len(ab) < (n-1)*ldab+kd+1:
+		panic(shortAB)
+	case len(work) < n && (norm == lapack.MaxColumnSum || norm == lapack.MaxRowSum):
+		panic(shortWork)
+	}
+
+	var value float64
+	switch norm {
+	case lapack.MaxAbs:
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				for j := 0; j < min(n-i, kd+1); j++ {
+					aij := math.Abs(ab[i*ldab+j])
+					if aij > value || math.IsNaN(aij) {
+						value = aij
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				for j := max(0, kd-i); j < kd+1; j++ {
+					aij := math.Abs(ab[i*ldab+j])
+					if aij > value || math.IsNaN(aij) {
+						value = aij
+					}
+				}
+			}
+		}
+	case lapack.MaxColumnSum, lapack.MaxRowSum:
+		work = work[:n]
+		var sum float64
+		if uplo == blas.Upper {
+			for i := range work {
+				work[i] = 0
+			}
+			for i := 0; i < n; i++ {
+				sum := work[i] + math.Abs(ab[i*ldab]) // contributions from earlier rows plus the diagonal
+				for j := i + 1; j < min(i+kd+1, n); j++ {
+					aij := math.Abs(ab[i*ldab+j-i])
+					sum += aij
+					work[j] += aij
+				}
+				if sum > value || math.IsNaN(sum) {
+					value = sum
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				sum = 0
+				for j := max(0, i-kd); j < i; j++ {
+					aij := math.Abs(ab[i*ldab+kd+j-i])
+					sum += aij
+					work[j] += aij
+				}
+				work[i] = sum + math.Abs(ab[i*ldab+kd]) // assigned, not accumulated, so work needs no zeroing here
+			}
+			for _, sum := range work {
+				if sum > value || math.IsNaN(sum) {
+					value = sum
+				}
+			}
+		}
+	case lapack.Frobenius:
+		scale := 0.0
+		sum := 1.0
+		if uplo == blas.Upper {
+			if kd > 0 {
+				// Sum off-diagonals.
+				for i := 0; i < n-1; i++ {
+					ilen := min(n-i-1, kd)
+					scale, sum = impl.Dlassq(ilen, ab[i*ldab+1:], 1, scale, sum)
+				}
+				sum *= 2 // each stored off-diagonal element occurs twice in the symmetric matrix
+			}
+			// Sum diagonal.
+			scale, sum = impl.Dlassq(n, ab, ldab, scale, sum)
+		} else {
+			if kd > 0 {
+				// Sum off-diagonals.
+				for i := 1; i < n; i++ {
+					ilen := min(i, kd)
+					scale, sum = impl.Dlassq(ilen, ab[i*ldab+kd-ilen:], 1, scale, sum)
+				}
+				sum *= 2 // each stored off-diagonal element occurs twice in the symmetric matrix
+			}
+			// Sum diagonal.
+			scale, sum = impl.Dlassq(n, ab[kd:], ldab, scale, sum)
+		}
+		value = scale * math.Sqrt(sum)
+	}
+
+	return value
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlanst.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanst.go
new file mode 100644
index 00000000000..9ca1897e34b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanst.go
@@ -0,0 +1,75 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlanst computes the specified norm of a symmetric tridiagonal matrix A.
+// The diagonal elements of A are stored in d and the off-diagonal elements
+// are stored in e.
+func (impl Implementation) Dlanst(norm lapack.MatrixNorm, n int, d, e []float64) float64 {
+	switch {
+	case norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius && norm != lapack.MaxAbs:
+		panic(badNorm)
+	case n < 0:
+		panic(nLT0)
+	}
+	if n == 0 { // Quick return: the norm of an empty matrix is 0.
+		return 0
+	}
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	}
+
+	switch norm {
+	default: // not reachable: norm was validated at the top of the function
+		panic(badNorm)
+	case lapack.MaxAbs:
+		anorm := math.Abs(d[n-1])
+		for i := 0; i < n-1; i++ {
+			sum := math.Abs(d[i])
+			if anorm < sum || math.IsNaN(sum) {
+				anorm = sum
+			}
+			sum = math.Abs(e[i])
+			if anorm < sum || math.IsNaN(sum) {
+				anorm = sum
+			}
+		}
+		return anorm
+	case lapack.MaxColumnSum, lapack.MaxRowSum:
+		if n == 1 { // 1×1 matrix: e is not read
+			return math.Abs(d[0])
+		}
+		anorm := math.Abs(d[0]) + math.Abs(e[0])
+		sum := math.Abs(e[n-2]) + math.Abs(d[n-1])
+		if anorm < sum || math.IsNaN(sum) {
+			anorm = sum
+		}
+		for i := 1; i < n-1; i++ {
+			sum := math.Abs(d[i]) + math.Abs(e[i]) + math.Abs(e[i-1])
+			if anorm < sum || math.IsNaN(sum) {
+				anorm = sum
+			}
+		}
+		return anorm
+	case lapack.Frobenius:
+		var scale float64
+		sum := 1.0
+		if n > 1 {
+			scale, sum = impl.Dlassq(n-1, e, 1, scale, sum)
+			sum = 2 * sum // each off-diagonal element occurs twice in the symmetric matrix
+		}
+		scale, sum = impl.Dlassq(n, d, 1, scale, sum)
+		return scale * math.Sqrt(sum)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlansy.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlansy.go
new file mode 100644
index 00000000000..b972c72e558
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlansy.go
@@ -0,0 +1,125 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlansy returns the value of the specified norm of an n×n symmetric matrix A, reading
+// only the triangle of a selected by uplo. If norm == lapack.MaxColumnSum or norm ==
+// lapack.MaxRowSum, work must have length at least n, otherwise work is unused.
+func (impl Implementation) Dlansy(norm lapack.MatrixNorm, uplo blas.Uplo, n int, a []float64, lda int, work []float64) float64 {
+	switch {
+	case norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius && norm != lapack.MaxAbs:
+		panic(badNorm)
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return 0
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case (norm == lapack.MaxColumnSum || norm == lapack.MaxRowSum) && len(work) < n:
+		panic(shortWork)
+	}
+
+	switch norm {
+	case lapack.MaxAbs:
+		if uplo == blas.Upper {
+			var max float64
+			for i := 0; i < n; i++ {
+				for j := i; j < n; j++ {
+					v := math.Abs(a[i*lda+j])
+					if math.IsNaN(v) {
+						return math.NaN() // Any NaN element makes the norm NaN.
+					}
+					if v > max {
+						max = v
+					}
+				}
+			}
+			return max
+		}
+		var max float64
+		for i := 0; i < n; i++ {
+			for j := 0; j <= i; j++ {
+				v := math.Abs(a[i*lda+j])
+				if math.IsNaN(v) {
+					return math.NaN()
+				}
+				if v > max {
+					max = v
+				}
+			}
+		}
+		return max
+	case lapack.MaxRowSum, lapack.MaxColumnSum:
+		// A symmetric matrix has the same 1-norm and ∞-norm.
+		for i := 0; i < n; i++ {
+			work[i] = 0
+		}
+		if uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				work[i] += math.Abs(a[i*lda+i])
+				for j := i + 1; j < n; j++ {
+					v := math.Abs(a[i*lda+j]) // By symmetry this element counts towards sums i and j.
+					work[i] += v
+					work[j] += v
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				for j := 0; j < i; j++ {
+					v := math.Abs(a[i*lda+j])
+					work[i] += v
+					work[j] += v
+				}
+				work[i] += math.Abs(a[i*lda+i])
+			}
+		}
+		var max float64
+		for i := 0; i < n; i++ {
+			v := work[i]
+			if math.IsNaN(v) {
+				return math.NaN()
+			}
+			if v > max {
+				max = v
+			}
+		}
+		return max
+	default:
+		// lapack.Frobenius:
+		scale := 0.0
+		sum := 1.0
+		// Sum off-diagonals.
+		if uplo == blas.Upper {
+			for i := 0; i < n-1; i++ {
+				scale, sum = impl.Dlassq(n-i-1, a[i*lda+i+1:], 1, scale, sum)
+			}
+		} else {
+			for i := 1; i < n; i++ {
+				scale, sum = impl.Dlassq(i, a[i*lda:], 1, scale, sum)
+			}
+		}
+		sum *= 2 // Each off-diagonal element occurs twice in the full matrix.
+		// Sum diagonal.
+		scale, sum = impl.Dlassq(n, a, lda+1, scale, sum) // Stride lda+1 steps along the diagonal.
+		return scale * math.Sqrt(sum)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlantb.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlantb.go
new file mode 100644
index 00000000000..ceab2a6af3f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlantb.go
@@ -0,0 +1,209 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlantb returns the value of the given norm of an n×n triangular band matrix A
+// with k+1 diagonals. If diag is blas.Unit, the diagonal is taken as ones and not read.
+//
+// When norm is lapack.MaxColumnSum, the length of work must be at least n, otherwise work is unused.
+func (impl Implementation) Dlantb(norm lapack.MatrixNorm, uplo blas.Uplo, diag blas.Diag, n, k int, a []float64, lda int, work []float64) float64 {
+	switch {
+	case norm != lapack.MaxAbs && norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius:
+		panic(badNorm)
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kdLT0)
+	case lda < k+1:
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return 0
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+k+1:
+		panic(shortAB)
+	case len(work) < n && norm == lapack.MaxColumnSum:
+		panic(shortWork)
+	}
+
+	var value float64
+	switch norm {
+	case lapack.MaxAbs:
+		if uplo == blas.Upper {
+			var jfirst int
+			if diag == blas.Unit {
+				value = 1
+				jfirst = 1 // Skip the diagonal, which is stored in band column 0.
+			}
+			for i := 0; i < n; i++ {
+				for _, aij := range a[i*lda+jfirst : i*lda+min(n-i, k+1)] {
+					if math.IsNaN(aij) {
+						return aij
+					}
+					aij = math.Abs(aij)
+					if aij > value {
+						value = aij
+					}
+				}
+			}
+		} else {
+			jlast := k + 1
+			if diag == blas.Unit {
+				value = 1
+				jlast = k // Skip the diagonal, which is stored in band column k.
+			}
+			for i := 0; i < n; i++ {
+				for _, aij := range a[i*lda+max(0, k-i) : i*lda+jlast] {
+					if math.IsNaN(aij) {
+						return math.NaN()
+					}
+					aij = math.Abs(aij)
+					if aij > value {
+						value = aij
+					}
+				}
+			}
+		}
+	case lapack.MaxRowSum:
+		var sum float64
+		if uplo == blas.Upper {
+			var jfirst int
+			if diag == blas.Unit {
+				jfirst = 1
+			}
+			for i := 0; i < n; i++ {
+				sum = 0
+				if diag == blas.Unit {
+					sum = 1 // Account for the implicit unit diagonal element of row i.
+				}
+				for _, aij := range a[i*lda+jfirst : i*lda+min(n-i, k+1)] {
+					sum += math.Abs(aij)
+				}
+				if math.IsNaN(sum) {
+					return math.NaN()
+				}
+				if sum > value {
+					value = sum
+				}
+			}
+		} else {
+			jlast := k + 1
+			if diag == blas.Unit {
+				jlast = k
+			}
+			for i := 0; i < n; i++ {
+				sum = 0
+				if diag == blas.Unit {
+					sum = 1
+				}
+				for _, aij := range a[i*lda+max(0, k-i) : i*lda+jlast] {
+					sum += math.Abs(aij)
+				}
+				if math.IsNaN(sum) {
+					return math.NaN()
+				}
+				if sum > value {
+					value = sum
+				}
+			}
+		}
+	case lapack.MaxColumnSum:
+		work = work[:n]
+		if diag == blas.Unit {
+			for i := range work {
+				work[i] = 1 // Every column starts with its implicit unit diagonal element.
+			}
+		} else {
+			for i := range work {
+				work[i] = 0
+			}
+		}
+		if uplo == blas.Upper {
+			var jfirst int
+			if diag == blas.Unit {
+				jfirst = 1
+			}
+			for i := 0; i < n; i++ {
+				for j, aij := range a[i*lda+jfirst : i*lda+min(n-i, k+1)] {
+					work[i+jfirst+j] += math.Abs(aij) // Band column jfirst+j of row i is matrix column i+jfirst+j.
+				}
+			}
+		} else {
+			jlast := k + 1
+			if diag == blas.Unit {
+				jlast = k
+			}
+			for i := 0; i < n; i++ {
+				off := max(0, k-i)
+				for j, aij := range a[i*lda+off : i*lda+jlast] {
+					work[i+j+off-k] += math.Abs(aij) // Band column off+j of row i is matrix column i+off+j-k.
+				}
+			}
+		}
+		for _, wi := range work {
+			if math.IsNaN(wi) {
+				return math.NaN()
+			}
+			if wi > value {
+				value = wi
+			}
+		}
+	case lapack.Frobenius:
+		var scale, sum float64
+		switch uplo {
+		case blas.Upper:
+			if diag == blas.Unit {
+				scale = 1
+				sum = float64(n) // Contribution of the n unit diagonal elements.
+				if k > 0 {
+					for i := 0; i < n-1; i++ {
+						ilen := min(n-i-1, k)
+						scale, sum = impl.Dlassq(ilen, a[i*lda+1:], 1, scale, sum)
+					}
+				}
+			} else {
+				scale = 0
+				sum = 1
+				for i := 0; i < n; i++ {
+					ilen := min(n-i, k+1)
+					scale, sum = impl.Dlassq(ilen, a[i*lda:], 1, scale, sum)
+				}
+			}
+		case blas.Lower:
+			if diag == blas.Unit {
+				scale = 1
+				sum = float64(n)
+				if k > 0 {
+					for i := 1; i < n; i++ {
+						ilen := min(i, k)
+						scale, sum = impl.Dlassq(ilen, a[i*lda+k-ilen:], 1, scale, sum)
+					}
+				}
+			} else {
+				scale = 0
+				sum = 1
+				for i := 0; i < n; i++ {
+					ilen := min(i, k) + 1
+					scale, sum = impl.Dlassq(ilen, a[i*lda+k+1-ilen:], 1, scale, sum)
+				}
+			}
+		}
+		value = scale * math.Sqrt(sum)
+	}
+	return value
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlantr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlantr.go
new file mode 100644
index 00000000000..33569832fd3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlantr.go
@@ -0,0 +1,252 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlantr computes the specified norm of an m×n trapezoidal matrix A and returns 0 if
+// m == 0 or n == 0. If diag == blas.Unit the diagonal of A is taken as ones and not read.
+// If norm == lapack.MaxColumnSum work must have length at least n, otherwise work is unused.
+func (impl Implementation) Dlantr(norm lapack.MatrixNorm, uplo blas.Uplo, diag blas.Diag, m, n int, a []float64, lda int, work []float64) float64 {
+	switch {
+	case norm != lapack.MaxRowSum && norm != lapack.MaxColumnSum && norm != lapack.Frobenius && norm != lapack.MaxAbs:
+		panic(badNorm)
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case diag != blas.Unit && diag != blas.NonUnit:
+		panic(badDiag)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	minmn := min(m, n)
+	if minmn == 0 {
+		return 0
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case norm == lapack.MaxColumnSum && len(work) < n:
+		panic(shortWork)
+	}
+
+	switch norm {
+	case lapack.MaxAbs:
+		if diag == blas.Unit {
+			value := 1.0 // The implicit unit diagonal.
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					for j := i + 1; j < n; j++ {
+						tmp := math.Abs(a[i*lda+j])
+						if math.IsNaN(tmp) {
+							return tmp
+						}
+						if tmp > value {
+							value = tmp
+						}
+					}
+				}
+				return value
+			}
+			for i := 1; i < m; i++ {
+				for j := 0; j < min(i, n); j++ {
+					tmp := math.Abs(a[i*lda+j])
+					if math.IsNaN(tmp) {
+						return tmp
+					}
+					if tmp > value {
+						value = tmp
+					}
+				}
+			}
+			return value
+		}
+		var value float64
+		if uplo == blas.Upper {
+			for i := 0; i < m; i++ {
+				for j := i; j < n; j++ {
+					tmp := math.Abs(a[i*lda+j])
+					if math.IsNaN(tmp) {
+						return tmp
+					}
+					if tmp > value {
+						value = tmp
+					}
+				}
+			}
+			return value
+		}
+		for i := 0; i < m; i++ {
+			for j := 0; j <= min(i, n-1); j++ {
+				tmp := math.Abs(a[i*lda+j])
+				if math.IsNaN(tmp) {
+					return tmp
+				}
+				if tmp > value {
+					value = tmp
+				}
+			}
+		}
+		return value
+	case lapack.MaxColumnSum:
+		if diag == blas.Unit {
+			for i := 0; i < minmn; i++ {
+				work[i] = 1 // Columns that contain a unit diagonal element start at 1.
+			}
+			for i := minmn; i < n; i++ {
+				work[i] = 0
+			}
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					for j := i + 1; j < n; j++ {
+						work[j] += math.Abs(a[i*lda+j])
+					}
+				}
+			} else {
+				for i := 1; i < m; i++ {
+					for j := 0; j < min(i, n); j++ {
+						work[j] += math.Abs(a[i*lda+j])
+					}
+				}
+			}
+		} else {
+			for i := 0; i < n; i++ {
+				work[i] = 0
+			}
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					for j := i; j < n; j++ {
+						work[j] += math.Abs(a[i*lda+j])
+					}
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					for j := 0; j <= min(i, n-1); j++ {
+						work[j] += math.Abs(a[i*lda+j])
+					}
+				}
+			}
+		}
+		var max float64
+		for _, v := range work[:n] {
+			if math.IsNaN(v) {
+				return math.NaN()
+			}
+			if v > max {
+				max = v
+			}
+		}
+		return max
+	case lapack.MaxRowSum:
+		var maxsum float64
+		if diag == blas.Unit {
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					var sum float64
+					if i < minmn {
+						sum = 1 // Rows that contain a unit diagonal element start at 1.
+					}
+					for j := i + 1; j < n; j++ {
+						sum += math.Abs(a[i*lda+j])
+					}
+					if math.IsNaN(sum) {
+						return math.NaN()
+					}
+					if sum > maxsum {
+						maxsum = sum
+					}
+				}
+				return maxsum
+			} else {
+				for i := 0; i < m; i++ {
+					var sum float64
+					if i < minmn {
+						sum = 1
+					}
+					for j := 0; j < min(i, n); j++ {
+						sum += math.Abs(a[i*lda+j])
+					}
+					if math.IsNaN(sum) {
+						return math.NaN()
+					}
+					if sum > maxsum {
+						maxsum = sum
+					}
+				}
+				return maxsum
+			}
+		} else {
+			if uplo == blas.Upper {
+				for i := 0; i < m; i++ {
+					var sum float64
+					for j := i; j < n; j++ {
+						sum += math.Abs(a[i*lda+j])
+					}
+					if math.IsNaN(sum) {
+						return sum
+					}
+					if sum > maxsum {
+						maxsum = sum
+					}
+				}
+				return maxsum
+			} else {
+				for i := 0; i < m; i++ {
+					var sum float64
+					for j := 0; j <= min(i, n-1); j++ {
+						sum += math.Abs(a[i*lda+j])
+					}
+					if math.IsNaN(sum) {
+						return sum
+					}
+					if sum > maxsum {
+						maxsum = sum
+					}
+				}
+				return maxsum
+			}
+		}
+	default:
+		// lapack.Frobenius:
+		var scale, sum float64
+		if diag == blas.Unit {
+			scale = 1
+			sum = float64(min(m, n)) // Contribution of the unit diagonal elements.
+			if uplo == blas.Upper {
+				for i := 0; i < min(m, n); i++ {
+					scale, sum = impl.Dlassq(n-i-1, a[i*lda+i+1:], 1, scale, sum)
+				}
+			} else {
+				for i := 1; i < m; i++ {
+					scale, sum = impl.Dlassq(min(i, n), a[i*lda:], 1, scale, sum)
+				}
+			}
+		} else {
+			scale = 0
+			sum = 1
+			if uplo == blas.Upper {
+				for i := 0; i < min(m, n); i++ {
+					scale, sum = impl.Dlassq(n-i, a[i*lda+i:], 1, scale, sum)
+				}
+			} else {
+				for i := 0; i < m; i++ {
+					scale, sum = impl.Dlassq(min(i+1, n), a[i*lda:], 1, scale, sum)
+				}
+			}
+		}
+		return scale * math.Sqrt(sum)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlanv2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanv2.go
new file mode 100644
index 00000000000..360f71b1d39
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlanv2.go
@@ -0,0 +1,151 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlanv2 computes the Schur factorization of a real 2×2 matrix:
+//
+//	[ a b ] = [ cs -sn ] * [ aa bb ] * [ cs sn ]
+//	[ c d ]   [ sn  cs ]   [ cc dd ]   [-sn cs ]
+//
+// If cc is zero, aa and dd are real eigenvalues of the matrix. Otherwise it
+// holds that aa = dd and bb*cc < 0, and aa ± sqrt(bb*cc) are complex conjugate
+// eigenvalues. The real and imaginary parts of the eigenvalues are returned in
+// (rt1r,rt1i) and (rt2r,rt2i).
+func (impl Implementation) Dlanv2(a, b, c, d float64) (aa, bb, cc, dd float64, rt1r, rt1i, rt2r, rt2i float64, cs, sn float64) {
+	switch {
+	case c == 0: // Matrix is already upper triangular.
+		aa = a
+		bb = b
+		cc = 0
+		dd = d
+		cs = 1
+		sn = 0
+	case b == 0: // Matrix is lower triangular, swap rows and columns.
+		aa = d
+		bb = -c
+		cc = 0
+		dd = a
+		cs = 0
+		sn = 1
+	case a == d && math.Signbit(b) != math.Signbit(c): // Matrix is already in the standard Schur form.
+		aa = a
+		bb = b
+		cc = c
+		dd = d
+		cs = 1
+		sn = 0
+	default:
+		temp := a - d
+		p := temp / 2
+		bcmax := math.Max(math.Abs(b), math.Abs(c))
+		bcmis := math.Min(math.Abs(b), math.Abs(c))
+		if b*c < 0 {
+			bcmis *= -1 // Give bcmis the sign of the product b*c.
+		}
+		scale := math.Max(math.Abs(p), bcmax)
+		z := p/scale*p + bcmax/scale*bcmis
+		eps := dlamchP
+
+		if z >= 4*eps {
+			// Real eigenvalues. Compute aa and dd.
+			if p > 0 {
+				z = p + math.Sqrt(scale)*math.Sqrt(z)
+			} else {
+				z = p - math.Sqrt(scale)*math.Sqrt(z)
+			}
+			aa = d + z
+			dd = d - bcmax/z*bcmis
+			// Compute bb and the rotation matrix.
+			tau := impl.Dlapy2(c, z)
+			cs = z / tau
+			sn = c / tau
+			bb = b - c
+			cc = 0
+		} else {
+			// Complex eigenvalues, or real (almost) equal eigenvalues.
+			// Make diagonal elements equal.
+			safmn2 := math.Pow(dlamchB, math.Log(dlamchS/dlamchE)/math.Log(dlamchB)/2)
+			safmx2 := 1 / safmn2
+			sigma := b + c
+		loop:
+			for iter := 0; iter < 20; iter++ { // Rescale sigma and temp at most 20 times.
+				scale = math.Max(math.Abs(temp), math.Abs(sigma))
+				switch {
+				case scale >= safmx2:
+					sigma *= safmn2
+					temp *= safmn2
+				case scale <= safmn2:
+					sigma *= safmx2
+					temp *= safmx2
+				default:
+					break loop
+				}
+			}
+			p = temp / 2
+			tau := impl.Dlapy2(sigma, temp)
+			cs = math.Sqrt((1 + math.Abs(sigma)/tau) / 2)
+			sn = -p / (tau * cs)
+			if sigma < 0 {
+				sn *= -1
+			}
+			// Compute [ aa bb ] = [ a b ] [ cs -sn ]
+			//         [ cc dd ]   [ c d ] [ sn  cs ]
+			aa = a*cs + b*sn
+			bb = -a*sn + b*cs
+			cc = c*cs + d*sn
+			dd = -c*sn + d*cs
+			// Compute [ a b ] = [ cs sn ] [ aa bb ]
+			//         [ c d ]   [-sn cs ] [ cc dd ]
+			a = aa*cs + cc*sn
+			b = bb*cs + dd*sn
+			c = -aa*sn + cc*cs
+			d = -bb*sn + dd*cs
+
+			temp = (a + d) / 2 // Both diagonal elements are set to this mean below.
+			aa = temp
+			bb = b
+			cc = c
+			dd = temp
+
+			if cc != 0 {
+				if bb != 0 {
+					if math.Signbit(bb) == math.Signbit(cc) {
+						// Real eigenvalues, reduce to
+						// upper triangular form.
+						sab := math.Sqrt(math.Abs(bb))
+						sac := math.Sqrt(math.Abs(cc))
+						p = sab * sac
+						if cc < 0 {
+							p *= -1
+						}
+						tau = 1 / math.Sqrt(math.Abs(bb+cc))
+						aa = temp + p
+						bb = bb - cc
+						cc = 0
+						dd = temp - p
+						cs1 := sab * tau
+						sn1 := sac * tau
+						cs, sn = cs*cs1-sn*sn1, cs*sn1+sn*cs1
+					}
+				} else {
+					bb = -cc
+					cc = 0
+					cs, sn = -sn, cs
+				}
+			}
+		}
+	}
+
+	// Store eigenvalues in (rt1r,rt1i) and (rt2r,rt2i).
+	rt1r = aa
+	rt2r = dd
+	if cc != 0 {
+		rt1i = math.Sqrt(math.Abs(bb)) * math.Sqrt(math.Abs(cc))
+		rt2i = -rt1i
+	}
+	return
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlapll.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapll.go
new file mode 100644
index 00000000000..bf98c338ebf
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapll.go
@@ -0,0 +1,55 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas/blas64"
+
+// Dlapll returns the smallest singular value of the n×2 matrix A = [ x y ].
+// The function first computes the QR factorization of A = Q*R, and then computes
+// the SVD of the 2-by-2 upper triangular matrix R. It returns 0 if n is 0 or 1.
+//
+// The contents of x and y are overwritten during the call.
+//
+// Dlapll is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlapll(n int, x []float64, incX int, y []float64, incY int) float64 {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case incX <= 0:
+		panic(badIncX)
+	case incY <= 0:
+		panic(badIncY)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return 0
+	}
+
+	switch {
+	case len(x) < 1+(n-1)*incX:
+		panic(shortX)
+	case len(y) < 1+(n-1)*incY:
+		panic(shortY)
+	}
+
+	// Quick return if possible.
+	if n == 1 {
+		return 0
+	}
+
+	// Compute the QR factorization of the N-by-2 matrix [ X Y ].
+	a00, tau := impl.Dlarfg(n, x[0], x[incX:], incX)
+	x[0] = 1
+
+	bi := blas64.Implementation()
+	c := -tau * bi.Ddot(n, x, incX, y, incY)
+	bi.Daxpy(n, c, x, incX, y, incY) // y += c*x, overwriting y in place.
+	a11, _ := impl.Dlarfg(n-1, y[incY], y[2*incY:], incY)
+
+	// Compute the SVD of 2-by-2 upper triangular matrix.
+	ssmin, _ := impl.Dlas2(a00, y[0], a11) // Only the first result of Dlas2 is used.
+	return ssmin
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmr.go
new file mode 100644
index 00000000000..73cd82db96a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmr.go
@@ -0,0 +1,88 @@
+// Copyright ©2022 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas/blas64"
+
+// Dlapmr rearranges the rows of the m×n matrix X as specified by the permutation
+// k[0],k[1],...,k[m-1] of the integers 0,...,m-1.
+//
+// If forward is true, a forward permutation is applied:
+//
+//	X[k[i],0:n] is moved to X[i,0:n] for i=0,1,...,m-1.
+//
+// If forward is false, a backward permutation is applied:
+//
+//	X[i,0:n] is moved to X[k[i],0:n] for i=0,1,...,m-1.
+//
+// k must have length m, otherwise Dlapmr will panic.
+func (impl Implementation) Dlapmr(forward bool, m, n int, x []float64, ldx int, k []int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case ldx < max(1, n):
+		panic(badLdX)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	switch {
+	case len(x) < (m-1)*ldx+n:
+		panic(shortX)
+	case len(k) != m:
+		panic(badLenK)
+	}
+
+	// Quick return if possible.
+	if m == 1 {
+		return
+	}
+
+	bi := blas64.Implementation()
+
+	for i, ki := range k {
+		k[i] = -(ki + 1) // Mark every entry as not yet visited by making it negative.
+	}
+	if forward {
+		for i, ki := range k {
+			if ki >= 0 {
+				continue // Already handled as part of an earlier cycle.
+			}
+			j := i
+			k[j] = -k[j] - 1 // Restore the original index, marking entry j as visited.
+			in := k[j]
+			for {
+				if k[in] >= 0 {
+					break
+				}
+				bi.Dswap(n, x[j*ldx:], 1, x[in*ldx:], 1)
+				k[in] = -k[in] - 1
+				j = in
+				in = k[in]
+			}
+		}
+	} else {
+		for i, ki := range k {
+			if ki >= 0 {
+				continue
+			}
+			k[i] = -ki - 1
+			j := k[i]
+			for {
+				if j == i {
+					break // The cycle has returned to its starting row.
+				}
+				bi.Dswap(n, x[i*ldx:], 1, x[j*ldx:], 1)
+				k[j] = -k[j] - 1
+				j = k[j]
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmt.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmt.go
new file mode 100644
index 00000000000..4a70e68f041
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapmt.go
@@ -0,0 +1,89 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas/blas64"
+
+// Dlapmt rearranges the columns of the m×n matrix X as specified by the
+// permutation k_0, k_1, ..., k_n-1 of the integers 0, ..., n-1.
+//
+// If forward is true a forward permutation is performed:
+//
+//	X[0:m, k[j]] is moved to X[0:m, j] for j = 0, 1, ..., n-1.
+//
+// otherwise a backward permutation is performed:
+//
+//	X[0:m, j] is moved to X[0:m, k[j]] for j = 0, 1, ..., n-1.
+//
+// k must have length n, otherwise Dlapmt will panic. k is zero-indexed.
+func (impl Implementation) Dlapmt(forward bool, m, n int, x []float64, ldx int, k []int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case ldx < max(1, n):
+		panic(badLdX)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	switch {
+	case len(x) < (m-1)*ldx+n:
+		panic(shortX)
+	case len(k) != n:
+		panic(badLenK)
+	}
+
+	// Quick return if possible.
+	if n == 1 {
+		return
+	}
+
+	for i, v := range k {
+		v++
+		k[i] = -v // Negated one-based index marks the entry as not yet visited.
+	}
+
+	bi := blas64.Implementation()
+
+	if forward {
+		for j, v := range k {
+			if v >= 0 {
+				continue // Already handled as part of an earlier cycle.
+			}
+			k[j] = -v
+			i := -v - 1
+			for k[i] < 0 {
+				bi.Dswap(m, x[j:], ldx, x[i:], ldx)
+
+				k[i] = -k[i]
+				j = i
+				i = k[i] - 1
+			}
+		}
+	} else {
+		for i, v := range k {
+			if v >= 0 {
+				continue
+			}
+			k[i] = -v
+			j := -v - 1
+			for j != i {
+				bi.Dswap(m, x[j:], ldx, x[i:], ldx)
+
+				k[j] = -k[j]
+				j = k[j] - 1
+			}
+		}
+	}
+
+	for i := range k {
+		k[i]-- // Convert the one-based indices back to zero-based.
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlapy2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapy2.go
new file mode 100644
index 00000000000..19f73ffabd9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlapy2.go
@@ -0,0 +1,14 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlapy2 is the LAPACK version of math.Hypot, to which it delegates directly.
+//
+// Dlapy2 is an internal routine. It is exported for testing purposes.
+func (Implementation) Dlapy2(x, y float64) float64 {
+	return math.Hypot(x, y)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqp2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqp2.go
new file mode 100644
index 00000000000..cc3bc06db66
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqp2.go
@@ -0,0 +1,127 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlaqp2 computes a QR factorization with column pivoting of the block A[offset:m, 0:n]
+// of the m×n matrix A. The block A[0:offset, 0:n] is accordingly pivoted, but not factorized.
+//
+// On exit, the upper triangle of block A[offset:m, 0:n] is the triangular factor obtained.
+// The elements in block A[offset:m, 0:n] below the diagonal, together with tau, represent
+// the orthogonal matrix Q as a product of elementary reflectors.
+//
+// offset is number of rows of the matrix A that must be pivoted but not factorized.
+// offset must not be negative or greater than m, otherwise Dlaqp2 will panic.
+//
+// On exit, jpvt holds the permutation that was applied; the jth column of A*P was the
+// jpvt[j] column of A. jpvt must have length n, otherwise Dlaqp2 will panic.
+//
+// On exit tau holds the scalar factors of the elementary reflectors. It must have length
+// at least min(m-offset, n) otherwise Dlaqp2 will panic.
+//
+// vn1 and vn2 hold the partial and complete column norms respectively. They must have length
+// at least n, otherwise Dlaqp2 will panic.
+//
+// work must have length at least n, otherwise Dlaqp2 will panic.
+//
+// Dlaqp2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaqp2(m, n, offset int, a []float64, lda int, jpvt []int, tau, vn1, vn2, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case offset < 0:
+		panic(offsetLT0)
+	case offset > m:
+		panic(offsetGTM)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	mn := min(m-offset, n)
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(jpvt) != n:
+		panic(badLenJpvt)
+	case len(tau) < mn:
+		panic(shortTau)
+	case len(vn1) < n:
+		panic(shortVn1)
+	case len(vn2) < n:
+		panic(shortVn2)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	tol3z := math.Sqrt(dlamchE)
+
+	bi := blas64.Implementation()
+
+	// Compute factorization.
+	for i := 0; i < mn; i++ {
+		offpi := offset + i
+
+		// Determine ith pivot column and swap if necessary.
+		p := i + bi.Idamax(n-i, vn1[i:], 1)
+		if p != i {
+			bi.Dswap(m, a[p:], lda, a[i:], lda)
+			jpvt[p], jpvt[i] = jpvt[i], jpvt[p]
+			vn1[p] = vn1[i]
+			vn2[p] = vn2[i]
+		}
+
+		// Generate elementary reflector H_i.
+		if offpi < m-1 {
+			a[offpi*lda+i], tau[i] = impl.Dlarfg(m-offpi, a[offpi*lda+i], a[(offpi+1)*lda+i:], lda)
+		} else {
+			tau[i] = 0 // No reflector is generated when only the last row remains.
+		}
+
+		if i < n-1 {
+			// Apply H_iᵀ to A[offset+i:m, i:n] from the left.
+			aii := a[offpi*lda+i]
+			a[offpi*lda+i] = 1
+			impl.Dlarf(blas.Left, m-offpi, n-i-1, a[offpi*lda+i:], lda, tau[i], a[offpi*lda+i+1:], lda, work)
+			a[offpi*lda+i] = aii // Restore the diagonal element temporarily replaced by 1.
+		}
+
+		// Update partial column norms.
+		for j := i + 1; j < n; j++ {
+			if vn1[j] == 0 {
+				continue
+			}
+
+			// The following marked lines follow from the
+			// analysis in Lapack Working Note 176.
+			r := math.Abs(a[offpi*lda+j]) / vn1[j] // *
+			temp := math.Max(0, 1-r*r)             // *
+			r = vn1[j] / vn2[j]                    // *
+			temp2 := temp * r * r                  // *
+			if temp2 < tol3z {
+				var v float64
+				if offpi < m-1 {
+					v = bi.Dnrm2(m-offpi-1, a[(offpi+1)*lda+j:], lda)
+				}
+				vn1[j] = v
+				vn2[j] = v
+			} else {
+				vn1[j] *= math.Sqrt(temp) // *
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqps.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqps.go
new file mode 100644
index 00000000000..da1a60e5cf5
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqps.go
@@ -0,0 +1,244 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlaqps computes a step of QR factorization with column pivoting
+// of an m×n matrix A by using Blas-3. It tries to factorize nb
+// columns from A starting from the row offset, and updates all
+// of the matrix with Dgemm.
+//
+// In some cases, due to catastrophic cancellations, it cannot
+// factorize nb columns. Hence, the actual number of factorized
+// columns is returned in kb.
+//
+// Dlaqps computes a QR factorization with column pivoting of the
+// block A[offset:m, 0:nb] of the m×n matrix A. The block
+// A[0:offset, 0:n] is accordingly pivoted, but not factorized.
+//
+// On exit, the upper triangle of block A[offset:m, 0:kb] is the
+// triangular factor obtained. The elements in block A[offset:m, 0:n]
+// below the diagonal, together with tau, represent the orthogonal
+// matrix Q as a product of elementary reflectors.
+//
+// offset is number of rows of the matrix A that must be pivoted but
+// not factorized. offset must not be negative otherwise Dlaqps will panic.
+//
+// On exit, jpvt holds the permutation that was applied; the jth column
+// of A*P was the jpvt[j] column of A. jpvt must have length n,
+// otherwise Dlaqps will panic.
+//
+// On exit tau holds the scalar factors of the elementary reflectors.
+// It must have length at least nb, otherwise Dlaqps will panic.
+//
+// vn1 and vn2 hold the partial and complete column norms respectively.
+// They must have length at least n, otherwise Dlaqps will panic.
+//
+// auxv must have length at least nb, otherwise Dlaqps will panic.
+//
+// f and ldf represent an n×nb matrix F that is overwritten during the
+// call.
+//
+// Dlaqps is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaqps(m, n, offset, nb int, a []float64, lda int, jpvt []int, tau, vn1, vn2, auxv, f []float64, ldf int) (kb int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case offset < 0:
+		panic(offsetLT0)
+	case offset > m:
+		panic(offsetGTM)
+	case nb < 0:
+		panic(nbLT0)
+	case nb > n:
+		panic(nbGTN)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldf < max(1, nb):
+		panic(badLdF)
+	}
+
+	if m == 0 || n == 0 {
+		return 0
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(jpvt) != n:
+		panic(badLenJpvt)
+	case len(vn1) < n:
+		panic(shortVn1)
+	case len(vn2) < n:
+		panic(shortVn2)
+	}
+
+	if nb == 0 {
+		return 0
+	}
+
+	switch {
+	case len(tau) < nb:
+		panic(shortTau)
+	case len(auxv) < nb:
+		panic(shortAuxv)
+	case len(f) < (n-1)*ldf+nb:
+		panic(shortF)
+	}
+
+	if offset == m {
+		return 0
+	}
+
+	lastrk := min(m, n+offset)
+	lsticc := -1 // Head of the list of columns whose norms must be recomputed; -1 means empty.
+	tol3z := math.Sqrt(dlamchE)
+
+	bi := blas64.Implementation()
+
+	var k, rk int
+	for ; k < nb && lsticc == -1; k++ { // Stop early once a column norm needs recomputation.
+		rk = offset + k
+
+		// Determine kth pivot column and swap if necessary.
+		p := k + bi.Idamax(n-k, vn1[k:], 1)
+		if p != k {
+			bi.Dswap(m, a[p:], lda, a[k:], lda)
+			bi.Dswap(k, f[p*ldf:], 1, f[k*ldf:], 1)
+			jpvt[p], jpvt[k] = jpvt[k], jpvt[p]
+			vn1[p] = vn1[k]
+			vn2[p] = vn2[k]
+		}
+
+		// Apply previous Householder reflectors to column K:
+		//
+		// A[rk:m, k] = A[rk:m, k] - A[rk:m, 0:k-1]*F[k, 0:k-1]ᵀ.
+		if k > 0 {
+			bi.Dgemv(blas.NoTrans, m-rk, k, -1,
+				a[rk*lda:], lda,
+				f[k*ldf:], 1,
+				1,
+				a[rk*lda+k:], lda)
+		}
+
+		// Generate elementary reflector H_k.
+		if rk < m-1 {
+			a[rk*lda+k], tau[k] = impl.Dlarfg(m-rk, a[rk*lda+k], a[(rk+1)*lda+k:], lda)
+		} else {
+			tau[k] = 0
+		}
+
+		akk := a[rk*lda+k]
+		a[rk*lda+k] = 1
+
+		// Compute kth column of F:
+		//
+		// Compute F[k+1:n, k] = tau[k]*A[rk:m, k+1:n]ᵀ*A[rk:m, k].
+		if k < n-1 {
+			bi.Dgemv(blas.Trans, m-rk, n-k-1, tau[k],
+				a[rk*lda+k+1:], lda,
+				a[rk*lda+k:], lda,
+				0,
+				f[(k+1)*ldf+k:], ldf)
+		}
+
+		// Padding F[0:k, k] with zeros.
+		for j := 0; j < k; j++ {
+			f[j*ldf+k] = 0
+		}
+
+		// Incremental updating of F:
+		//
+		// F[0:n, k] := F[0:n, k] - tau[k]*F[0:n, 0:k-1]*A[rk:m, 0:k-1]ᵀ*A[rk:m,k].
+		if k > 0 {
+			bi.Dgemv(blas.Trans, m-rk, k, -tau[k],
+				a[rk*lda:], lda,
+				a[rk*lda+k:], lda,
+				0,
+				auxv, 1)
+			bi.Dgemv(blas.NoTrans, n, k, 1,
+				f, ldf,
+				auxv, 1,
+				1,
+				f[k:], ldf)
+		}
+
+		// Update the current row of A:
+		//
+		// A[rk, k+1:n] = A[rk, k+1:n] - A[rk, 0:k]*F[k+1:n, 0:k]ᵀ.
+		if k < n-1 {
+			bi.Dgemv(blas.NoTrans, n-k-1, k+1, -1,
+				f[(k+1)*ldf:], ldf,
+				a[rk*lda:], 1,
+				1,
+				a[rk*lda+k+1:], 1)
+		}
+
+		// Update partial column norms.
+		if rk < lastrk-1 {
+			for j := k + 1; j < n; j++ {
+				if vn1[j] == 0 {
+					continue
+				}
+
+				// The following marked lines follow from the
+				// analysis in Lapack Working Note 176.
+				r := math.Abs(a[rk*lda+j]) / vn1[j] // *
+				temp := math.Max(0, 1-r*r)          // *
+				r = vn1[j] / vn2[j]                 // *
+				temp2 := temp * r * r               // *
+				if temp2 < tol3z {
+					// vn2 is used here as a collection of
+					// indices into vn2 and also a collection
+					// of column norms.
+					vn2[j] = float64(lsticc)
+					lsticc = j
+				} else {
+					vn1[j] *= math.Sqrt(temp) // *
+				}
+			}
+		}
+
+		a[rk*lda+k] = akk // Restore the diagonal element temporarily replaced by 1.
+	}
+	kb = k
+	rk = offset + kb
+
+	// Apply the block reflector to the rest of the matrix:
+	//
+	// A[offset+kb+1:m, kb+1:n] := A[offset+kb+1:m, kb+1:n] - A[offset+kb+1:m, 1:kb]*F[kb+1:n, 1:kb]ᵀ.
+	if kb < min(n, m-offset) {
+		bi.Dgemm(blas.NoTrans, blas.Trans,
+			m-rk, n-kb, kb, -1,
+			a[rk*lda:], lda,
+			f[kb*ldf:], ldf,
+			1,
+			a[rk*lda+kb:], lda)
+	}
+
+	// Recomputation of difficult columns.
+	for lsticc >= 0 {
+		itemp := int(vn2[lsticc]) // vn2[lsticc] stores the next list index; read it before overwriting.
+
+		// NOTE: The computation of vn1[lsticc] relies on the fact that
+		// Dnrm2 does not fail on vectors with norm below the value of
+		// sqrt(dlamchS)
+		v := bi.Dnrm2(m-rk, a[rk*lda+lsticc:], lda)
+		vn1[lsticc] = v
+		vn2[lsticc] = v
+
+		lsticc = itemp
+	}
+
+	return kb
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr04.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr04.go
new file mode 100644
index 00000000000..8e4b266b857
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr04.go
@@ -0,0 +1,493 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+)
+
+// Dlaqr04 computes the eigenvalues of a block of an n×n upper Hessenberg matrix
+// H, and optionally the matrices T and Z from the Schur decomposition
+//
+//	H = Z T Zᵀ
+//
+// where T is an upper quasi-triangular matrix (the Schur form), and Z is the
+// orthogonal matrix of Schur vectors.
+//
+// wantt indicates whether the full Schur form T is required. If wantt is false,
+// then only enough of H will be updated to preserve the eigenvalues.
+//
+// wantz indicates whether the n×n matrix of Schur vectors Z is required. If it
+// is true, the orthogonal similarity transformation will be accumulated into
+// Z[iloz:ihiz+1,ilo:ihi+1], otherwise Z will not be referenced.
+//
+// ilo and ihi determine the block of H on which Dlaqr04 operates. It must hold that
+//
+//	0 <= ilo <= ihi < n     if n > 0,
+//	ilo == 0 and ihi == -1  if n == 0,
+//
+// and the block must be isolated, that is,
+//
+//	ilo == 0   or H[ilo,ilo-1] == 0,
+//	ihi == n-1 or H[ihi+1,ihi] == 0,
+//
+// otherwise Dlaqr04 will panic.
+//
+// wr and wi must have length ihi+1.
+//
+// iloz and ihiz specify the rows of Z to which transformations will be applied
+// if wantz is true. It must hold that
+//
+//	0 <= iloz <= ilo,  and  ihi <= ihiz < n,
+//
+// otherwise Dlaqr04 will panic.
+//
+// work must have length at least lwork and lwork must be
+//
+//	lwork >= 1  if n <= 15,
+//	lwork >= n  if n > 15,
+//
+// otherwise Dlaqr04 will panic. lwork as large as 6*n may be required for
+// optimal performance. On return, work[0] will contain the optimal value of
+// lwork.
+//
+// If lwork is -1, instead of performing Dlaqr04, the function only estimates the
+// optimal workspace size and stores it into work[0]. Neither h nor z are
+// accessed.
+//
+// recur is the non-negative recursion depth. For recur > 0, Dlaqr04 behaves
+// as DLAQR0, for recur == 0 it behaves as DLAQR4.
+//
+// unconverged indicates whether Dlaqr04 computed all the eigenvalues of H[ilo:ihi+1,ilo:ihi+1].
+//
+// If unconverged is zero and wantt is true, H will contain on return the upper
+// quasi-triangular matrix T from the Schur decomposition. 2×2 diagonal blocks
+// (corresponding to complex conjugate pairs of eigenvalues) will be returned in
+// standard form, with H[i,i] == H[i+1,i+1] and H[i+1,i]*H[i,i+1] < 0.
+//
+// If unconverged is zero and if wantt is false, the contents of h on return is
+// unspecified.
+//
+// If unconverged is zero, all the eigenvalues have been computed and their real
+// and imaginary parts will be stored on return in wr[ilo:ihi+1] and
+// wi[ilo:ihi+1], respectively. If two eigenvalues are computed as a complex
+// conjugate pair, they are stored in consecutive elements of wr and wi, say the
+// i-th and (i+1)th, with wi[i] > 0 and wi[i+1] < 0. If wantt is true, then the
+// eigenvalues are stored in the same order as on the diagonal of the Schur form
+// returned in H, with wr[i] = H[i,i] and, if H[i:i+2,i:i+2] is a 2×2 diagonal
+// block, wi[i] = sqrt(-H[i+1,i]*H[i,i+1]) and wi[i+1] = -wi[i].
+//
+// If unconverged is positive, some eigenvalues have not converged, and
+// wr[unconverged:ihi+1] and wi[unconverged:ihi+1] will contain those
+// eigenvalues which have been successfully computed. Failures are rare.
+//
+// If unconverged is positive and wantt is true, then on return
+//
+//	(initial H)*U = U*(final H),   (*)
+//
+// where U is an orthogonal matrix. The final H is upper Hessenberg and
+// H[unconverged:ihi+1,unconverged:ihi+1] is upper quasi-triangular.
+//
+// If unconverged is positive and wantt is false, on return the remaining
+// unconverged eigenvalues are the eigenvalues of the upper Hessenberg matrix
+// H[ilo:unconverged,ilo:unconverged].
+//
+// If unconverged is positive and wantz is true, then on return
+//
+//	(final Z) = (initial Z)*U,
+//
+// where U is the orthogonal matrix in (*) regardless of the value of wantt.
+//
+// References:
+//
+//	[1] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part I:
+//	    Maintaining Well-Focused Shifts and Level 3 Performance. SIAM J. Matrix
+//	    Anal. Appl. 23(4) (2002), pp. 929—947
+//	    URL: http://dx.doi.org/10.1137/S0895479801384573
+//	[2] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part II:
+//	    Aggressive Early Deflation. SIAM J. Matrix Anal. Appl. 23(4) (2002), pp. 948—973
+//	    URL: http://dx.doi.org/10.1137/S0895479801384585
+//
+// Dlaqr04 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaqr04(wantt, wantz bool, n, ilo, ihi int, h []float64, ldh int, wr, wi []float64, iloz, ihiz int, z []float64, ldz int, work []float64, lwork int, recur int) (unconverged int) {
+	const (
+		// Matrices of order ntiny or smaller must be processed by
+		// Dlahqr because of insufficient subdiagonal scratch space.
+		// This is a hard limit.
+		ntiny = 15
+		// Exceptional deflation windows: try to cure rare slow
+		// convergence by varying the size of the deflation window after
+		// kexnw iterations.
+		kexnw = 5
+		// Exceptional shifts: try to cure rare slow convergence with
+		// ad-hoc exceptional shifts every kexsh iterations.
+		kexsh = 6
+
+		// See https://github.com/gonum/lapack/pull/151#discussion_r68162802
+		// and the surrounding discussion for an explanation where these
+		// constants come from.
+		// TODO(vladimir-ch): Similar constants for exceptional shifts
+		// are used also in dlahqr.go. The first constant is different
+		// there, it is equal to 3. Why? And does it matter?
+		wilk1 = 0.75
+		wilk2 = -0.4375
+	)
+
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case ilo < 0 || max(0, n-1) < ilo:
+		panic(badIlo)
+	case ihi < min(ilo, n-1) || n <= ihi:
+		panic(badIhi)
+	case ldh < max(1, n):
+		panic(badLdH)
+	case wantz && (iloz < 0 || ilo < iloz):
+		panic(badIloz)
+	case wantz && (ihiz < ihi || n <= ihiz):
+		panic(badIhiz)
+	case ldz < 1, wantz && ldz < n:
+		panic(badLdZ)
+	case lwork < 1 && lwork != -1:
+		panic(badLWork)
+	// TODO(vladimir-ch): Enable if and when we figure out what the minimum
+	// necessary lwork value is. Dlaqr04 says that the minimum is n which
+	// clashes with Dlaqr23's opinion about optimal work when nw <= 2
+	// (independent of n).
+	// case lwork < n && n > ntiny && lwork != -1:
+	// 	panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	case recur < 0:
+		panic(recurLT0)
+	}
+
+	// Quick return.
+	if n == 0 {
+		work[0] = 1
+		return 0
+	}
+
+	// Slice lengths and block isolation are only checked for a real
+	// computation; a workspace query must not touch h, wr, wi or z.
+	if lwork != -1 {
+		switch {
+		case len(h) < (n-1)*ldh+n:
+			panic(shortH)
+		case len(wr) != ihi+1:
+			panic(badLenWr)
+		case len(wi) != ihi+1:
+			panic(badLenWi)
+		case wantz && len(z) < (n-1)*ldz+n:
+			panic(shortZ)
+		case ilo > 0 && h[ilo*ldh+ilo-1] != 0:
+			panic(notIsolated)
+		case ihi+1 < n && h[(ihi+1)*ldh+ihi] != 0:
+			panic(notIsolated)
+		}
+	}
+
+	if n <= ntiny {
+		// Tiny matrices must use Dlahqr.
+		if lwork == -1 {
+			work[0] = 1
+			return 0
+		}
+		return impl.Dlahqr(wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, iloz, ihiz, z, ldz)
+	}
+
+	// Use small bulge multi-shift QR with aggressive early deflation on
+	// larger-than-tiny matrices.
+	var jbcmpz string
+	if wantt {
+		jbcmpz = "S"
+	} else {
+		jbcmpz = "E"
+	}
+	if wantz {
+		jbcmpz += "V"
+	} else {
+		jbcmpz += "N"
+	}
+
+	var fname string
+	if recur > 0 {
+		fname = "DLAQR0"
+	} else {
+		fname = "DLAQR4"
+	}
+	// nwr is the recommended deflation window size. n is greater than ntiny,
+	// so there is enough subdiagonal workspace for nwr >= 2 as required.
+	// (In fact, there is enough subdiagonal space for nwr >= 4.)
+	// TODO(vladimir-ch): If there is enough space for nwr >= 4, should we
+	// use it?
+	nwr := impl.Ilaenv(13, fname, jbcmpz, n, ilo, ihi, lwork)
+	nwr = max(2, nwr)
+	nwr = min(ihi-ilo+1, min((n-1)/3, nwr))
+
+	// nsr is the recommended number of simultaneous shifts. n is greater than
+	// ntiny, so there is enough subdiagonal workspace for nsr to be even and
+	// greater than or equal to two as required.
+	nsr := impl.Ilaenv(15, fname, jbcmpz, n, ilo, ihi, lwork)
+	nsr = min(nsr, min((n-3)/6, ihi-ilo))
+	nsr = max(2, nsr&^1)
+
+	// Workspace query call to Dlaqr23.
+	impl.Dlaqr23(wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz, ihiz, z, ldz,
+		wr, wi, h, ldh, n, h, ldh, n, h, ldh, work, -1, recur)
+	// Optimal workspace is max(Dlaqr5, Dlaqr23).
+	lwkopt := max(3*nsr/2, int(work[0]))
+	// Quick return in case of workspace query.
+	if lwork == -1 {
+		work[0] = float64(lwkopt)
+		return 0
+	}
+
+	// Dlahqr/Dlaqr04 crossover point.
+	nmin := impl.Ilaenv(12, fname, jbcmpz, n, ilo, ihi, lwork)
+	nmin = max(ntiny, nmin)
+
+	// Nibble determines when to skip a multi-shift QR sweep (Dlaqr5).
+	nibble := impl.Ilaenv(14, fname, jbcmpz, n, ilo, ihi, lwork)
+	nibble = max(0, nibble)
+
+	// Computation mode of far-from-diagonal orthogonal updates in Dlaqr5.
+	kacc22 := impl.Ilaenv(16, fname, jbcmpz, n, ilo, ihi, lwork)
+	kacc22 = max(0, min(kacc22, 2))
+
+	// nwmax is the largest possible deflation window for which there is
+	// sufficient workspace.
+	nwmax := min((n-1)/3, lwork/2)
+	nw := nwmax // Start with maximum deflation window size.
+
+	// nsmax is the largest number of simultaneous shifts for which there is
+	// sufficient workspace.
+	nsmax := min((n-3)/6, 2*lwork/3) &^ 1
+
+	ndfl := 1 // Number of iterations since last deflation.
+	ndec := 0 // Deflation window size decrement.
+
+	// Main loop.
+	var (
+		itmax = max(30, 2*kexsh) * max(10, (ihi-ilo+1))
+		it    = 0
+	)
+	for kbot := ihi; kbot >= ilo; {
+		if it == itmax {
+			// Iteration budget exhausted: report the size of the
+			// leading part that has not converged.
+			unconverged = kbot + 1
+			break
+		}
+		it++
+
+		// Locate active block.
+		ktop := ilo
+		for k := kbot; k >= ilo+1; k-- {
+			if h[k*ldh+k-1] == 0 {
+				ktop = k
+				break
+			}
+		}
+
+		// Select deflation window size nw.
+		//
+		// Typical Case:
+		//  If possible and advisable, nibble the entire active block.
+		//  If not, use size min(nwr,nwmax) or min(nwr+1,nwmax)
+		//  depending upon which has the smaller corresponding
+		//  subdiagonal entry (a heuristic).
+		//
+		// Exceptional Case:
+		//  If there have been no deflations in kexnw or more
+		//  iterations, then vary the deflation window size. At first,
+		//  because larger windows are, in general, more powerful than
+		//  smaller ones, rapidly increase the window to the maximum
+		//  possible. Then, gradually reduce the window size.
+		nh := kbot - ktop + 1
+		nwupbd := min(nh, nwmax)
+		if ndfl < kexnw {
+			nw = min(nwupbd, nwr)
+		} else {
+			nw = min(nwupbd, 2*nw)
+		}
+		if nw < nwmax {
+			if nw >= nh-1 {
+				nw = nh
+			} else {
+				kwtop := kbot - nw + 1
+				if math.Abs(h[kwtop*ldh+kwtop-1]) > math.Abs(h[(kwtop-1)*ldh+kwtop-2]) {
+					nw++
+				}
+			}
+		}
+		if ndfl < kexnw {
+			ndec = -1
+		} else if ndec >= 0 || nw >= nwupbd {
+			ndec++
+			if nw-ndec < 2 {
+				ndec = 0
+			}
+			nw -= ndec
+		}
+
+		// Split workspace under the subdiagonal of H into:
+		//  - an nw×nw work array V in the lower left-hand corner,
+		//  - an nw×nhv horizontal work array along the bottom edge (nhv
+		//    must be at least nw but more is better),
+		//  - an nhv×nw vertical work array along the left-hand-edge
+		//    (nhv can be any positive integer but more is better).
+		kv := n - nw
+		kt := nw
+		kwv := nw + 1
+		nhv := n - kwv - kt
+		// Aggressive early deflation.
+		ls, ld := impl.Dlaqr23(wantt, wantz, n, ktop, kbot, nw,
+			h, ldh, iloz, ihiz, z, ldz, wr[:kbot+1], wi[:kbot+1],
+			h[kv*ldh:], ldh, nhv, h[kv*ldh+kt:], ldh, nhv, h[kwv*ldh:], ldh, work, lwork, recur)
+
+		// Adjust kbot accounting for new deflations.
+		kbot -= ld
+		// ks points to the shifts.
+		ks := kbot - ls + 1
+
+		// Skip an expensive QR sweep if there is a (partly heuristic)
+		// reason to expect that many eigenvalues will deflate without
+		// it. Here, the QR sweep is skipped if many eigenvalues have
+		// just been deflated or if the remaining active block is small.
+		if ld > 0 && (100*ld > nw*nibble || kbot-ktop+1 <= min(nmin, nwmax)) {
+			// ld is positive, note progress.
+			ndfl = 1
+			continue
+		}
+
+		// ns is the nominal number of simultaneous shifts. This may be
+		// lowered (slightly) if Dlaqr23 did not provide that many
+		// shifts.
+		ns := min(min(nsmax, nsr), max(2, kbot-ktop)) &^ 1
+
+		// If there have been no deflations in a multiple of kexsh
+		// iterations, then try exceptional shifts. Otherwise use shifts
+		// provided by Dlaqr23 above or from the eigenvalues of a
+		// trailing principal submatrix.
+		if ndfl%kexsh == 0 {
+			ks = kbot - ns + 1
+			for i := kbot; i > max(ks, ktop+1); i -= 2 {
+				ss := math.Abs(h[i*ldh+i-1]) + math.Abs(h[(i-1)*ldh+i-2])
+				aa := wilk1*ss + h[i*ldh+i]
+				_, _, _, _, wr[i-1], wi[i-1], wr[i], wi[i], _, _ =
+					impl.Dlanv2(aa, ss, wilk2*ss, aa)
+			}
+			if ks == ktop {
+				wr[ks+1] = h[(ks+1)*ldh+ks+1]
+				wi[ks+1] = 0
+				wr[ks] = wr[ks+1]
+				wi[ks] = wi[ks+1]
+			}
+		} else {
+			// If we got ns/2 or fewer shifts, use Dlahqr or recur
+			// into Dlaqr04 on a trailing principal submatrix to get
+			// more. Since ns <= nsmax <= (n-3)/6, there is enough
+			// space below the subdiagonal to fit an ns×ns scratch
+			// array.
+			if kbot-ks+1 <= ns/2 {
+				ks = kbot - ns + 1
+				kt = n - ns
+				impl.Dlacpy(blas.All, ns, ns, h[ks*ldh+ks:], ldh, h[kt*ldh:], ldh)
+				if ns > nmin && recur > 0 {
+					// The scratch copy is a standalone ns×ns
+					// matrix, so the active block is the whole
+					// of it: ilo = 0 and ihi = ns-1, exactly as
+					// in the Dlahqr call below. Z is not
+					// referenced, but ldz must still be at
+					// least 1 to pass the argument checks at
+					// the top of this function.
+					ks += impl.Dlaqr04(false, false, ns, 0, ns-1, h[kt*ldh:], ldh,
+						wr[ks:ks+ns], wi[ks:ks+ns], 0, 0, nil, 1, work, lwork, recur-1)
+				} else {
+					ks += impl.Dlahqr(false, false, ns, 0, ns-1, h[kt*ldh:], ldh,
+						wr[ks:ks+ns], wi[ks:ks+ns], 0, 0, nil, 1)
+				}
+				// In case of a rare QR failure use eigenvalues
+				// of the trailing 2×2 principal submatrix.
+				if ks >= kbot {
+					aa := h[(kbot-1)*ldh+kbot-1]
+					bb := h[(kbot-1)*ldh+kbot]
+					cc := h[kbot*ldh+kbot-1]
+					dd := h[kbot*ldh+kbot]
+					_, _, _, _, wr[kbot-1], wi[kbot-1], wr[kbot], wi[kbot], _, _ =
+						impl.Dlanv2(aa, bb, cc, dd)
+					ks = kbot - 1
+				}
+			}
+
+			if kbot-ks+1 > ns {
+				// Sorting the shifts helps a little. Bubble
+				// sort keeps complex conjugate pairs together.
+				sorted := false
+				for k := kbot; k > ks; k-- {
+					if sorted {
+						break
+					}
+					sorted = true
+					for i := ks; i < k; i++ {
+						if math.Abs(wr[i])+math.Abs(wi[i]) >= math.Abs(wr[i+1])+math.Abs(wi[i+1]) {
+							continue
+						}
+						sorted = false
+						wr[i], wr[i+1] = wr[i+1], wr[i]
+						wi[i], wi[i+1] = wi[i+1], wi[i]
+					}
+				}
+			}
+
+			// Shuffle shifts into pairs of real shifts and pairs of
+			// complex conjugate shifts using the fact that complex
+			// conjugate shifts are already adjacent to one another.
+			// TODO(vladimir-ch): The shuffling here could probably
+			// be removed but I'm not sure right now and it's safer
+			// to leave it.
+			for i := kbot; i > ks+1; i -= 2 {
+				if wi[i] == -wi[i-1] {
+					continue
+				}
+				wr[i], wr[i-1], wr[i-2] = wr[i-1], wr[i-2], wr[i]
+				wi[i], wi[i-1], wi[i-2] = wi[i-1], wi[i-2], wi[i]
+			}
+		}
+
+		// If there are only two shifts and both are real, then use only one.
+		if kbot-ks+1 == 2 && wi[kbot] == 0 {
+			if math.Abs(wr[kbot]-h[kbot*ldh+kbot]) < math.Abs(wr[kbot-1]-h[kbot*ldh+kbot]) {
+				wr[kbot-1] = wr[kbot]
+			} else {
+				wr[kbot] = wr[kbot-1]
+			}
+		}
+
+		// Use up to ns of the smallest magnitude shifts. If there
+		// aren't ns shifts available, then use them all, possibly
+		// dropping one to make the number of shifts even.
+		ns = min(ns, kbot-ks+1) &^ 1
+		ks = kbot - ns + 1
+
+		// Split workspace under the subdiagonal into:
+		// - a kdu×kdu work array U in the lower left-hand-corner,
+		// - a kdu×nhv horizontal work array WH along the bottom edge
+		//   (nhv must be at least kdu but more is better),
+		// - an nhv×kdu vertical work array WV along the left-hand-edge
+		//   (nhv must be at least kdu but more is better).
+		kdu := 2 * ns
+		ku := n - kdu
+		kwh := kdu
+		kwv = kdu + 3
+		nhv = n - kwv - kdu
+		// Small-bulge multi-shift QR sweep.
+		impl.Dlaqr5(wantt, wantz, kacc22, n, ktop, kbot, ns,
+			wr[ks:ks+ns], wi[ks:ks+ns], h, ldh, iloz, ihiz, z, ldz,
+			work, 3, h[ku*ldh:], ldh, nhv, h[kwv*ldh:], ldh, nhv, h[ku*ldh+kwh:], ldh)
+
+		// Note progress (or the lack of it).
+		if ld > 0 {
+			ndfl = 1
+		} else {
+			ndfl++
+		}
+	}
+
+	work[0] = float64(lwkopt)
+	return unconverged
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr1.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr1.go
new file mode 100644
index 00000000000..c20c88fdb4f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr1.go
@@ -0,0 +1,61 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlaqr1 stores in v a scalar multiple of the first column of
+//
+//	(H - (sr1 + i*si1)*I)*(H - (sr2 + i*si2)*I)
+//
+// where H is the 2×2 or 3×3 matrix held in h with row stride ldh, I is the
+// identity of matching order and i is the imaginary unit. The result is
+// scaled so that overflow, and most underflow, is avoided.
+//
+// n is the order of H and has to be 2 or 3. The shifts must either form a
+// complex conjugate pair (sr1 = sr2 and si1 = -si2) or both be real
+// (si1 = si2 = 0). v must have length exactly n. Dlaqr1 panics when any of
+// these requirements is violated.
+//
+// Dlaqr1 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaqr1(n int, h []float64, ldh int, sr1, si1, sr2, si2 float64, v []float64) {
+	// Argument validation, in the same order as the panics are documented.
+	if n != 2 && n != 3 {
+		panic("lapack: n must be 2 or 3")
+	}
+	if ldh < n {
+		panic(badLdH)
+	}
+	if len(h) < (n-1)*ldh+n {
+		panic(shortH)
+	}
+	conjugatePair := sr1 == sr2 && si1 == -si2
+	bothReal := si1 == 0 && si2 == 0
+	if !conjugatePair && !bothReal {
+		panic(badShifts)
+	}
+	if len(v) != n {
+		panic(shortV)
+	}
+
+	// scale sums the magnitudes of the first column of H - (sr2 + i*si2)*I;
+	// the 3×3 case contributes one more subdiagonal entry.
+	scale := math.Abs(h[0]-sr2) + math.Abs(si2) + math.Abs(h[ldh])
+	if n == 3 {
+		scale += math.Abs(h[2*ldh])
+	}
+	if scale == 0 {
+		// Nothing to scale by: the requested column is zero.
+		for i := range v {
+			v[i] = 0
+		}
+		return
+	}
+
+	h21s := h[ldh] / scale
+	if n == 2 {
+		v[0] = h21s*h[1] + (h[0]-sr1)*((h[0]-sr2)/scale) - si1*(si2/scale)
+		v[1] = h21s * (h[0] + h[ldh+1] - sr1 - sr2)
+		return
+	}
+
+	h31s := h[2*ldh] / scale
+	v[0] = (h[0]-sr1)*((h[0]-sr2)/scale) - si1*(si2/scale) + h[1]*h21s + h[2]*h31s
+	v[1] = h21s*(h[0]+h[ldh+1]-sr1-sr2) + h[ldh+2]*h31s
+	v[2] = h31s*(h[0]+h[2*ldh+2]-sr1-sr2) + h21s*h[2*ldh+1]
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr23.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr23.go
new file mode 100644
index 00000000000..a3fa6661c60
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr23.go
@@ -0,0 +1,423 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlaqr23 performs the orthogonal similarity transformation of an n×n upper
+// Hessenberg matrix to detect and deflate fully converged eigenvalues from a
+// trailing principal submatrix using aggressive early deflation [1].
+//
+// On return, H will be overwritten by a new Hessenberg matrix that is a
+// perturbation of an orthogonal similarity transformation of H. It is hoped
+// that on output H will have many zero subdiagonal entries.
+//
+// If wantt is true, the matrix H will be fully updated so that the
+// quasi-triangular Schur factor can be computed. If wantt is false, then only
+// enough of H will be updated to preserve the eigenvalues.
+//
+// If wantz is true, the orthogonal similarity transformation will be
+// accumulated into Z[iloz:ihiz+1,ktop:kbot+1], otherwise Z is not referenced.
+//
+// ktop and kbot determine a block [ktop:kbot+1,ktop:kbot+1] along the diagonal
+// of H. It must hold that
+//
+//	0 <= ktop <= kbot < n      if n > 0,
+//	ktop == 0 and kbot == -1   if n == 0,
+//
+// and the block must be isolated, that is, it must hold that
+//
+//	ktop == 0   or H[ktop,ktop-1] == 0,
+//	kbot == n-1 or H[kbot+1,kbot] == 0,
+//
+// otherwise Dlaqr23 will panic.
+//
+// nw is the deflation window size. It must hold that
+//
+//	0 <= nw <= kbot-ktop+1,
+//
+// otherwise Dlaqr23 will panic (the argument check tolerates nw == kbot-ktop+2).
+//
+// iloz and ihiz specify the rows of the n×n matrix Z to which transformations
+// will be applied if wantz is true. It must hold that
+//
+//	0 <= iloz <= ktop,  and  kbot <= ihiz < n,
+//
+// otherwise Dlaqr23 will panic.
+//
+// sr and si must have length kbot+1, otherwise Dlaqr23 will panic.
+//
+// v and ldv represent an nw×nw work matrix.
+// t and ldt represent an nw×nh work matrix, and nh must be at least nw.
+// wv and ldwv represent an nv×nw work matrix.
+//
+// work must have length at least lwork and lwork must be at least max(1,2*nw),
+// otherwise Dlaqr23 will panic. Larger values of lwork may result in greater
+// efficiency. On return, work[0] will contain the optimal value of lwork.
+//
+// If lwork is -1, instead of performing Dlaqr23, the function only estimates the
+// optimal workspace size and stores it into work[0]. Neither h nor z are
+// accessed.
+//
+// recur is the non-negative recursion depth. For recur > 0, Dlaqr23 behaves
+// as DLAQR3, for recur == 0 it behaves as DLAQR2.
+//
+// On return, ns and nd will contain respectively the number of unconverged
+// (i.e., approximate) eigenvalues and converged eigenvalues that are stored in
+// sr and si.
+//
+// On return, the real and imaginary parts of approximate eigenvalues that may
+// be used for shifts will be stored respectively in sr[kbot-nd-ns+1:kbot-nd+1]
+// and si[kbot-nd-ns+1:kbot-nd+1].
+//
+// On return, the real and imaginary parts of converged eigenvalues will be
+// stored respectively in sr[kbot-nd+1:kbot+1] and si[kbot-nd+1:kbot+1].
+//
+// References:
+//
+//	[1] K. Braman, R. Byers, R. Mathias. The Multishift QR Algorithm. Part II:
+//	    Aggressive Early Deflation. SIAM J. Matrix Anal. Appl 23(4) (2002), pp. 948—973
+//	    URL: http://dx.doi.org/10.1137/S0895479801384585
+func (impl Implementation) Dlaqr23(wantt, wantz bool, n, ktop, kbot, nw int, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, sr, si []float64, v []float64, ldv int, nh int, t []float64, ldt int, nv int, wv []float64, ldwv int, work []float64, lwork int, recur int) (ns, nd int) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case ktop < 0 || max(0, n-1) < ktop:
+		panic(badKtop)
+	case kbot < min(ktop, n-1) || n <= kbot:
+		panic(badKbot)
+	case nw < 0 || kbot-ktop+1+1 < nw:
+		panic(badNw)
+	case ldh < max(1, n):
+		panic(badLdH)
+	case wantz && (iloz < 0 || ktop < iloz):
+		panic(badIloz)
+	case wantz && (ihiz < kbot || n <= ihiz):
+		panic(badIhiz)
+	case ldz < 1, wantz && ldz < n:
+		panic(badLdZ)
+	case ldv < max(1, nw):
+		panic(badLdV)
+	case nh < nw:
+		panic(badNh)
+	case ldt < max(1, nh):
+		panic(badLdT)
+	case nv < 0:
+		panic(nvLT0)
+	case ldwv < max(1, nw):
+		panic(badLdWV)
+	case lwork < max(1, 2*nw) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	case recur < 0:
+		panic(recurLT0)
+	}
+
+	// Quick return for zero window size.
+	if nw == 0 {
+		work[0] = 1
+		return 0, 0
+	}
+
+	// LAPACK code does not enforce the documented behavior
+	//  nw <= kbot-ktop+1
+	// but the check above does, with a slack of one (nw <= kbot-ktop+2).
+	jw := nw
+	lwkopt := max(1, 2*nw)
+	if jw > 2 {
+		// Workspace query call to Dgehrd.
+		impl.Dgehrd(jw, 0, jw-2, t, ldt, work, work, -1)
+		lwk1 := int(work[0])
+		// Workspace query call to Dormhr.
+		impl.Dormhr(blas.Right, blas.NoTrans, jw, jw, 0, jw-2, t, ldt, work, v, ldv, work, -1)
+		lwk2 := int(work[0])
+		if recur > 0 {
+			// Workspace query call to Dlaqr04.
+			impl.Dlaqr04(true, true, jw, 0, jw-1, t, ldt, sr, si, 0, jw-1, v, ldv, work, -1, recur-1)
+			lwk3 := int(work[0])
+			// Optimal workspace.
+			lwkopt = max(jw+max(lwk1, lwk2), lwk3)
+		} else {
+			// Optimal workspace.
+			lwkopt = jw + max(lwk1, lwk2)
+		}
+	}
+	// Quick return in case of workspace query.
+	if lwork == -1 {
+		work[0] = float64(lwkopt)
+		return 0, 0
+	}
+
+	// Check input slices only if not doing workspace query.
+	switch {
+	case len(h) < (n-1)*ldh+n:
+		panic(shortH)
+	case len(v) < (nw-1)*ldv+nw:
+		panic(shortV)
+	case len(t) < (nw-1)*ldt+nh:
+		panic(shortT)
+	case len(wv) < (nv-1)*ldwv+nw:
+		panic(shortWV)
+	case wantz && len(z) < (n-1)*ldz+n:
+		panic(shortZ)
+	case len(sr) != kbot+1:
+		panic(badLenSr)
+	case len(si) != kbot+1:
+		panic(badLenSi)
+	case ktop > 0 && h[ktop*ldh+ktop-1] != 0:
+		panic(notIsolated)
+	case kbot+1 < n && h[(kbot+1)*ldh+kbot] != 0:
+		panic(notIsolated)
+	}
+
+	// Machine constants.
+	ulp := dlamchP
+	smlnum := float64(n) / ulp * dlamchS
+
+	// Setup deflation window. s is the subdiagonal entry just left of the window (zero if the window starts at ktop).
+	var s float64
+	kwtop := kbot - jw + 1
+	if kwtop != ktop {
+		s = h[kwtop*ldh+kwtop-1]
+	}
+	if kwtop == kbot {
+		// 1×1 deflation window.
+		sr[kwtop] = h[kwtop*ldh+kwtop]
+		si[kwtop] = 0
+		ns = 1
+		nd = 0
+		if math.Abs(s) <= math.Max(smlnum, ulp*math.Abs(h[kwtop*ldh+kwtop])) {
+			ns = 0
+			nd = 1
+			if kwtop > ktop {
+				h[kwtop*ldh+kwtop-1] = 0
+			}
+		}
+		work[0] = 1
+		return ns, nd
+	}
+
+	// Convert to spike-triangular form. In case of a rare QR failure, this
+	// routine continues to do aggressive early deflation using that part of
+	// the deflation window that converged using infqr here and there to
+	// keep track.
+	impl.Dlacpy(blas.Upper, jw, jw, h[kwtop*ldh+kwtop:], ldh, t, ldt)
+	bi := blas64.Implementation()
+	bi.Dcopy(jw-1, h[(kwtop+1)*ldh+kwtop:], ldh+1, t[ldt:], ldt+1)
+	impl.Dlaset(blas.All, jw, jw, 0, 1, v, ldv)
+	nmin := impl.Ilaenv(12, "DLAQR3", "SV", jw, 0, jw-1, lwork)
+	var infqr int
+	if recur > 0 && jw > nmin {
+		infqr = impl.Dlaqr04(true, true, jw, 0, jw-1, t, ldt, sr[kwtop:], si[kwtop:], 0, jw-1, v, ldv, work, lwork, recur-1)
+	} else {
+		infqr = impl.Dlahqr(true, true, jw, 0, jw-1, t, ldt, sr[kwtop:], si[kwtop:], 0, jw-1, v, ldv)
+	}
+	// Note that ilo == 0 which conveniently coincides with the success
+	// value of infqr, that is, infqr as an index always points to the first
+	// converged eigenvalue.
+
+	// Dtrexc needs a clean margin near the diagonal.
+	for j := 0; j < jw-3; j++ {
+		t[(j+2)*ldt+j] = 0
+		t[(j+3)*ldt+j] = 0
+	}
+	if jw >= 3 {
+		t[(jw-1)*ldt+jw-3] = 0
+	}
+
+	ns = jw
+	ilst := infqr
+	// Deflation detection loop.
+	for ilst < ns {
+		bulge := false
+		if ns >= 2 {
+			bulge = t[(ns-1)*ldt+ns-2] != 0
+		}
+		if !bulge {
+			// Real eigenvalue.
+			abst := math.Abs(t[(ns-1)*ldt+ns-1])
+			if abst == 0 {
+				abst = math.Abs(s)
+			}
+			if math.Abs(s*v[ns-1]) <= math.Max(smlnum, ulp*abst) {
+				// Deflatable.
+				ns--
+			} else {
+				// Undeflatable, move it up out of the way.
+				// Dtrexc can not fail in this case.
+				_, ilst, _ = impl.Dtrexc(lapack.UpdateSchur, jw, t, ldt, v, ldv, ns-1, ilst, work)
+				ilst++
+			}
+			continue
+		}
+		// Complex conjugate pair.
+		abst := math.Abs(t[(ns-1)*ldt+ns-1]) + math.Sqrt(math.Abs(t[(ns-1)*ldt+ns-2]))*math.Sqrt(math.Abs(t[(ns-2)*ldt+ns-1]))
+		if abst == 0 {
+			abst = math.Abs(s)
+		}
+		if math.Max(math.Abs(s*v[ns-1]), math.Abs(s*v[ns-2])) <= math.Max(smlnum, ulp*abst) {
+			// Deflatable.
+			ns -= 2
+		} else {
+			// Undeflatable, move them up out of the way.
+			// Dtrexc does the right thing with ilst in case of a
+			// rare exchange failure.
+			_, ilst, _ = impl.Dtrexc(lapack.UpdateSchur, jw, t, ldt, v, ldv, ns-1, ilst, work)
+			ilst += 2
+		}
+	}
+
+	// Return to Hessenberg form.
+	if ns == 0 {
+		s = 0
+	}
+	if ns < jw {
+		// Sorting diagonal blocks of T improves accuracy for graded
+		// matrices. Bubble sort deals well with exchange failures.
+		sorted := false
+		i := ns
+		for !sorted {
+			sorted = true
+			kend := i - 1
+			i = infqr
+			var k int
+			if i == ns-1 || t[(i+1)*ldt+i] == 0 {
+				k = i + 1
+			} else {
+				k = i + 2
+			}
+			for k <= kend {
+				var evi float64
+				if k == i+1 {
+					evi = math.Abs(t[i*ldt+i])
+				} else {
+					evi = math.Abs(t[i*ldt+i]) + math.Sqrt(math.Abs(t[(i+1)*ldt+i]))*math.Sqrt(math.Abs(t[i*ldt+i+1]))
+				}
+
+				var evk float64
+				if k == kend || t[(k+1)*ldt+k] == 0 {
+					evk = math.Abs(t[k*ldt+k])
+				} else {
+					evk = math.Abs(t[k*ldt+k]) + math.Sqrt(math.Abs(t[(k+1)*ldt+k]))*math.Sqrt(math.Abs(t[k*ldt+k+1]))
+				}
+
+				if evi >= evk {
+					i = k
+				} else {
+					sorted = false
+					_, ilst, ok := impl.Dtrexc(lapack.UpdateSchur, jw, t, ldt, v, ldv, i, k, work)
+					if ok {
+						i = ilst
+					} else {
+						i = k
+					}
+				}
+				if i == kend || t[(i+1)*ldt+i] == 0 {
+					k = i + 1
+				} else {
+					k = i + 2
+				}
+			}
+		}
+	}
+
+	// Restore shift/eigenvalue array from T, walking the diagonal blocks from the bottom up.
+	for i := jw - 1; i >= infqr; {
+		if i == infqr || t[i*ldt+i-1] == 0 {
+			sr[kwtop+i] = t[i*ldt+i]
+			si[kwtop+i] = 0
+			i--
+			continue
+		}
+		aa := t[(i-1)*ldt+i-1]
+		bb := t[(i-1)*ldt+i]
+		cc := t[i*ldt+i-1]
+		dd := t[i*ldt+i]
+		_, _, _, _, sr[kwtop+i-1], si[kwtop+i-1], sr[kwtop+i], si[kwtop+i], _, _ = impl.Dlanv2(aa, bb, cc, dd)
+		i -= 2
+	}
+
+	if ns < jw || s == 0 {
+		if ns > 1 && s != 0 {
+			// Reflect spike back into lower triangle.
+			bi.Dcopy(ns, v[:ns], 1, work[:ns], 1)
+			_, tau := impl.Dlarfg(ns, work[0], work[1:ns], 1)
+			work[0] = 1
+			impl.Dlaset(blas.Lower, jw-2, jw-2, 0, 0, t[2*ldt:], ldt)
+			impl.Dlarf(blas.Left, ns, jw, work[:ns], 1, tau, t, ldt, work[jw:])
+			impl.Dlarf(blas.Right, ns, ns, work[:ns], 1, tau, t, ldt, work[jw:])
+			impl.Dlarf(blas.Right, jw, ns, work[:ns], 1, tau, v, ldv, work[jw:])
+			impl.Dgehrd(jw, 0, ns-1, t, ldt, work[:jw-1], work[jw:], lwork-jw)
+		}
+
+		// Copy updated reduced window into place.
+		if kwtop > 0 {
+			h[kwtop*ldh+kwtop-1] = s * v[0]
+		}
+		impl.Dlacpy(blas.Upper, jw, jw, t, ldt, h[kwtop*ldh+kwtop:], ldh)
+		bi.Dcopy(jw-1, t[ldt:], ldt+1, h[(kwtop+1)*ldh+kwtop:], ldh+1)
+
+		// Accumulate orthogonal matrix in order to update H and Z, if
+		// requested.
+		if ns > 1 && s != 0 {
+			// work[:ns-1] contains the elementary reflectors stored
+			// by a call to Dgehrd above.
+			impl.Dormhr(blas.Right, blas.NoTrans, jw, ns, 0, ns-1,
+				t, ldt, work[:ns-1], v, ldv, work[jw:], lwork-jw)
+		}
+
+		// Update vertical slab in H.
+		var ltop int
+		if !wantt {
+			ltop = ktop
+		}
+		for krow := ltop; krow < kwtop; krow += nv {
+			kln := min(nv, kwtop-krow)
+			bi.Dgemm(blas.NoTrans, blas.NoTrans, kln, jw, jw,
+				1, h[krow*ldh+kwtop:], ldh, v, ldv,
+				0, wv, ldwv)
+			impl.Dlacpy(blas.All, kln, jw, wv, ldwv, h[krow*ldh+kwtop:], ldh)
+		}
+
+		// Update horizontal slab in H.
+		if wantt {
+			for kcol := kbot + 1; kcol < n; kcol += nh {
+				kln := min(nh, n-kcol)
+				bi.Dgemm(blas.Trans, blas.NoTrans, jw, kln, jw,
+					1, v, ldv, h[kwtop*ldh+kcol:], ldh,
+					0, t, ldt)
+				impl.Dlacpy(blas.All, jw, kln, t, ldt, h[kwtop*ldh+kcol:], ldh)
+			}
+		}
+
+		// Update vertical slab in Z.
+		if wantz {
+			for krow := iloz; krow <= ihiz; krow += nv {
+				kln := min(nv, ihiz-krow+1)
+				bi.Dgemm(blas.NoTrans, blas.NoTrans, kln, jw, jw,
+					1, z[krow*ldz+kwtop:], ldz, v, ldv,
+					0, wv, ldwv)
+				impl.Dlacpy(blas.All, kln, jw, wv, ldwv, z[krow*ldz+kwtop:], ldz)
+			}
+		}
+	}
+
+	// The number of deflations.
+	nd = jw - ns
+	// Shifts are converged eigenvalues that could not be deflated.
+	// Subtracting infqr from the spike length takes care of the case of a
+	// rare QR failure while calculating eigenvalues of the deflation
+	// window.
+	ns -= infqr
+	work[0] = float64(lwkopt)
+	return ns, nd
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr5.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr5.go
new file mode 100644
index 00000000000..443a53d5c4b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaqr5.go
@@ -0,0 +1,560 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlaqr5 performs a single small-bulge multi-shift QR sweep on an isolated
+// block of a Hessenberg matrix.
+//
+// wantt and wantz determine whether the quasi-triangular Schur factor and the
+// orthogonal Schur factor, respectively, will be computed.
+//
+// kacc22 specifies the computation mode of far-from-diagonal orthogonal
+// updates. Permitted values are:
+//
+//	0: Dlaqr5 will not accumulate reflections and will not use matrix-matrix
+//	   multiply to update far-from-diagonal matrix entries.
+//	1: Dlaqr5 will accumulate reflections and use matrix-matrix multiply to
+//	   update far-from-diagonal matrix entries.
+//	2: Same as kacc22=1. This option used to enable exploiting the 2×2 structure
+//	   during matrix multiplications, but this is no longer supported.
+//
+// For other values of kacc22 Dlaqr5 will panic.
+//
+// n is the order of the Hessenberg matrix H.
+//
+// ktop and kbot are indices of the first and last row and column of an isolated
+// diagonal block upon which the QR sweep will be applied. It must hold that
+//
+//	ktop == 0,   or 0 < ktop <= n-1 and H[ktop, ktop-1] == 0, and
+//	kbot == n-1, or 0 <= kbot < n-1 and H[kbot+1, kbot] == 0,
+//
+// otherwise Dlaqr5 will panic.
+//
+// nshfts is the number of simultaneous shifts. It must be positive and even,
+// otherwise Dlaqr5 will panic.
+//
+// sr and si contain the real and imaginary parts, respectively, of the shifts
+// of origin that define the multi-shift QR sweep. On return both slices may be
+// reordered by Dlaqr5. Their length must be equal to nshfts, otherwise Dlaqr5
+// will panic.
+//
+// h and ldh represent the Hessenberg matrix H of size n×n. On return
+// multi-shift QR sweep with shifts sr+i*si has been applied to the isolated
+// diagonal block in rows and columns ktop through kbot, inclusive.
+//
+// iloz and ihiz specify the rows of Z to which transformations will be applied
+// if wantz is true. It must hold that 0 <= iloz <= ihiz < n, otherwise Dlaqr5
+// will panic.
+//
+// z and ldz represent the matrix Z of size n×n. If wantz is true, the QR sweep
+// orthogonal similarity transformation is accumulated into
+// z[iloz:ihiz,iloz:ihiz] from the right, otherwise z is not referenced.
+//
+// v and ldv represent an auxiliary matrix V of size (nshfts/2)×3. Note that V
+// is transposed with respect to the reference netlib implementation.
+//
+// u and ldu represent an auxiliary matrix of size (2*nshfts)×(2*nshfts).
+//
+// wh and ldwh represent an auxiliary matrix of size (2*nshfts)×nh.
+//
+// wv and ldwv represent an auxiliary matrix of size nv×(2*nshfts).
+//
+// Dlaqr5 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaqr5(wantt, wantz bool, kacc22 int, n, ktop, kbot, nshfts int, sr, si []float64, h []float64, ldh int, iloz, ihiz int, z []float64, ldz int, v []float64, ldv int, u []float64, ldu int, nv int, wv []float64, ldwv int, nh int, wh []float64, ldwh int) {
+	switch {
+	case kacc22 != 0 && kacc22 != 1 && kacc22 != 2:
+		panic(badKacc22)
+	case n < 0:
+		panic(nLT0)
+	case ktop < 0 || n <= ktop:
+		panic(badKtop)
+	case kbot < 0 || n <= kbot:
+		panic(badKbot)
+
+	case nshfts < 0:
+		panic(nshftsLT0)
+	case nshfts&0x1 != 0:
+		panic(nshftsOdd)
+	case len(sr) != nshfts:
+		panic(badLenSr)
+	case len(si) != nshfts:
+		panic(badLenSi)
+
+	case ldh < max(1, n):
+		panic(badLdH)
+	case len(h) < (n-1)*ldh+n:
+		panic(shortH)
+
+	case wantz && ihiz >= n:
+		panic(badIhiz)
+	case wantz && iloz < 0 || ihiz < iloz: // NOTE(review): && binds tighter than ||, so ihiz < iloz panics even when !wantz.
+		panic(badIloz)
+	case ldz < 1, wantz && ldz < n:
+		panic(badLdZ)
+	case wantz && len(z) < (n-1)*ldz+n:
+		panic(shortZ)
+
+	case ldv < 3:
+		// V is transposed w.r.t. reference lapack.
+		panic(badLdV)
+	case len(v) < (nshfts/2-1)*ldv+3:
+		panic(shortV)
+
+	case ldu < max(1, 2*nshfts):
+		panic(badLdU)
+	case len(u) < (2*nshfts-1)*ldu+2*nshfts:
+		panic(shortU)
+
+	case nv < 0:
+		panic(nvLT0)
+	case ldwv < max(1, 2*nshfts):
+		panic(badLdWV)
+	case len(wv) < (nv-1)*ldwv+2*nshfts:
+		panic(shortWV)
+
+	case nh < 0:
+		panic(nhLT0)
+	case ldwh < max(1, nh):
+		panic(badLdWH)
+	case len(wh) < (2*nshfts-1)*ldwh+nh:
+		panic(shortWH)
+
+	case ktop > 0 && h[ktop*ldh+ktop-1] != 0:
+		panic(notIsolated)
+	case kbot < n-1 && h[(kbot+1)*ldh+kbot] != 0:
+		panic(notIsolated)
+	}
+
+	// If there are no shifts, then there is nothing to do.
+	if nshfts < 2 {
+		return
+	}
+	// If the active block is empty or 1×1, then there is nothing to do.
+	if ktop >= kbot {
+		return
+	}
+
+	// Shuffle shifts into pairs of real shifts and pairs of complex
+	// conjugate shifts assuming complex conjugate shifts are already
+	// adjacent to one another.
+	for i := 0; i < nshfts-2; i += 2 {
+		if si[i] == -si[i+1] {
+			continue
+		}
+		sr[i], sr[i+1], sr[i+2] = sr[i+1], sr[i+2], sr[i]
+		si[i], si[i+1], si[i+2] = si[i+1], si[i+2], si[i]
+	}
+
+	// Note: lapack says that nshfts must be even but allows it to be odd
+	// anyway. We panic above if nshfts is not even, so reducing it by one
+	// is unnecessary. The only caller Dlaqr04 uses only even nshfts.
+	//
+	// The original comment and code from lapack-3.6.0/SRC/dlaqr5.f:341:
+	// *     ==== NSHFTS is supposed to be even, but if it is odd,
+	// *     .    then simply reduce it by one.  The shuffle above
+	// *     .    ensures that the dropped shift is real and that
+	// *     .    the remaining shifts are paired. ====
+	// *
+	//      NS = NSHFTS - MOD( NSHFTS, 2 )
+	ns := nshfts
+
+	safmin := dlamchS
+	ulp := dlamchP
+	smlnum := safmin * float64(n) / ulp
+
+	// Use accumulated reflections to update far-from-diagonal entries?
+	accum := kacc22 == 1 || kacc22 == 2
+
+	// Clear trash.
+	if ktop+2 <= kbot {
+		h[(ktop+2)*ldh+ktop] = 0
+	}
+
+	// nbmps = number of 2-shift bulges in the chain.
+	nbmps := ns / 2
+
+	// kdu = width of slab.
+	kdu := 4 * nbmps
+
+	// Create and chase chains of nbmps bulges.
+	for incol := ktop - 2*nbmps + 1; incol <= kbot-2; incol += 2 * nbmps {
+		// jtop is an index from which updates from the right start.
+		var jtop int
+		switch {
+		case accum:
+			jtop = max(ktop, incol)
+		case wantt: // jtop remains 0.
+		default:
+			jtop = ktop
+		}
+		ndcol := incol + kdu
+		if accum {
+			impl.Dlaset(blas.All, kdu, kdu, 0, 1, u, ldu)
+		}
+		// Near-the-diagonal bulge chase. The following loop performs
+		// the near-the-diagonal part of a small bulge multi-shift QR
+		// sweep. Each 4*nbmps column diagonal chunk extends from
+		// column incol to column ndcol (including both column incol and
+		// column ndcol). The following loop chases a 2*nbmps+1 column
+		// long chain of nbmps bulges 2*nbmps columns to the right.
+		// (incol may be less than ktop and ndcol may be greater than
+		// kbot indicating phantom columns from which to chase bulges
+		// before they are actually introduced or to which to chase
+		// bulges beyond column kbot.)
+		for krcol := incol; krcol <= min(incol+2*nbmps-1, kbot-2); krcol++ {
+			// Bulges number mtop to mbot are active double implicit
+			// shift bulges. There may or may not also be small 2×2
+			// bulge, if there is room. The inactive bulges (if any)
+			// must wait until the active bulges have moved down the
+			// diagonal to make room. The phantom matrix paradigm
+			// described above helps keep track.
+			mtop := max(0, (ktop-krcol)/2)
+			mbot := min(nbmps, (kbot-krcol-1)/2) - 1
+			m22 := mbot + 1
+			bmp22 := (mbot < nbmps-1) && (krcol+2*m22 == kbot-2)
+			// Generate reflections to chase the chain right one column.
+			// The minimum value of k is ktop-1.
+			if bmp22 {
+				// Special case: 2×2 reflection at bottom treated separately.
+				k := krcol + 2*m22
+				if k == ktop-1 {
+					impl.Dlaqr1(2, h[(k+1)*ldh+k+1:], ldh,
+						sr[2*m22], si[2*m22], sr[2*m22+1], si[2*m22+1],
+						v[m22*ldv:m22*ldv+2])
+					beta := v[m22*ldv]
+					_, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
+				} else {
+					beta := h[(k+1)*ldh+k]
+					v[m22*ldv+1] = h[(k+2)*ldh+k]
+					beta, v[m22*ldv] = impl.Dlarfg(2, beta, v[m22*ldv+1:m22*ldv+2], 1)
+					h[(k+1)*ldh+k] = beta
+					h[(k+2)*ldh+k] = 0
+				}
+				// Perform update from right within computational window.
+				t1 := v[m22*ldv]
+				t2 := t1 * v[m22*ldv+1]
+				for j := jtop; j <= min(kbot, k+3); j++ {
+					refsum := h[j*ldh+k+1] + v[m22*ldv+1]*h[j*ldh+k+2]
+					h[j*ldh+k+1] -= refsum * t1
+					h[j*ldh+k+2] -= refsum * t2
+				}
+				// Perform update from left within computational window.
+				var jbot int
+				switch {
+				case accum:
+					jbot = min(ndcol, kbot)
+				case wantt:
+					jbot = n - 1
+				default:
+					jbot = kbot
+				}
+				t1 = v[m22*ldv]
+				t2 = t1 * v[m22*ldv+1]
+				for j := k + 1; j <= jbot; j++ {
+					refsum := h[(k+1)*ldh+j] + v[m22*ldv+1]*h[(k+2)*ldh+j]
+					h[(k+1)*ldh+j] -= refsum * t1
+					h[(k+2)*ldh+j] -= refsum * t2
+				}
+				// The following convergence test requires that the traditional
+				// small-compared-to-nearby-diagonals criterion and the Ahues &
+				// Tisseur (LAWN 122, 1997) criteria both be satisfied. The latter
+				// improves accuracy in some examples. Falling back on an alternate
+				// convergence criterion when tst1 or tst2 is zero (as done here) is
+				// traditional but probably unnecessary.
+				if k >= ktop && h[(k+1)*ldh+k] != 0 {
+					tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1])
+					if tst1 == 0 {
+						if k >= ktop+1 {
+							tst1 += math.Abs(h[k*ldh+k-1])
+						}
+						if k >= ktop+2 {
+							tst1 += math.Abs(h[k*ldh+k-2])
+						}
+						if k >= ktop+3 {
+							tst1 += math.Abs(h[k*ldh+k-3])
+						}
+						if k <= kbot-2 {
+							tst1 += math.Abs(h[(k+2)*ldh+k+1])
+						}
+						if k <= kbot-3 {
+							tst1 += math.Abs(h[(k+3)*ldh+k+1])
+						}
+						if k <= kbot-4 {
+							tst1 += math.Abs(h[(k+4)*ldh+k+1])
+						}
+					}
+					if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) {
+						h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
+						h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
+						h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
+						h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
+						scl := h11 + h12
+						tst2 := h22 * (h11 / scl)
+						if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) {
+							h[(k+1)*ldh+k] = 0
+						}
+					}
+				}
+				// Accumulate orthogonal transformations.
+				if accum {
+					kms := k - incol - 1
+					t1 = v[m22*ldv]
+					t2 = t1 * v[m22*ldv+1]
+					for j := max(0, ktop-incol-1); j < kdu; j++ {
+						refsum := u[j*ldu+kms+1] + v[m22*ldv+1]*u[j*ldu+kms+2]
+						u[j*ldu+kms+1] -= refsum * t1
+						u[j*ldu+kms+2] -= refsum * t2
+					}
+				} else if wantz {
+					t1 = v[m22*ldv]
+					t2 = t1 * v[m22*ldv+1]
+					for j := iloz; j <= ihiz; j++ {
+						refsum := z[j*ldz+k+1] + v[m22*ldv+1]*z[j*ldz+k+2]
+						z[j*ldz+k+1] -= refsum * t1
+						z[j*ldz+k+2] -= refsum * t2
+					}
+				}
+			}
+			// Normal case: Chain of 3×3 reflections.
+			for m := mbot; m >= mtop; m-- {
+				k := krcol + 2*m
+				if k == ktop-1 {
+					impl.Dlaqr1(3, h[ktop*ldh+ktop:], ldh,
+						sr[2*m], si[2*m], sr[2*m+1], si[2*m+1],
+						v[m*ldv:m*ldv+3])
+					alpha := v[m*ldv]
+					_, v[m*ldv] = impl.Dlarfg(3, alpha, v[m*ldv+1:m*ldv+3], 1)
+				} else {
+					// Perform delayed transformation of row below m-th bulge.
+					// Exploit fact that first two elements of row are actually
+					// zero.
+					t1 := v[m*ldv]
+					t2 := t1 * v[m*ldv+1]
+					t3 := t1 * v[m*ldv+2]
+					refsum := v[m*ldv+2] * h[(k+3)*ldh+k+2]
+					h[(k+3)*ldh+k] = -refsum * t1
+					h[(k+3)*ldh+k+1] = -refsum * t2
+					h[(k+3)*ldh+k+2] -= refsum * t3
+					// Calculate reflection to move m-th bulge one step.
+					beta := h[(k+1)*ldh+k]
+					v[m*ldv+1] = h[(k+2)*ldh+k]
+					v[m*ldv+2] = h[(k+3)*ldh+k]
+					beta, v[m*ldv] = impl.Dlarfg(3, beta, v[m*ldv+1:m*ldv+3], 1)
+					// A bulge may collapse because of vigilant deflation or
+					// destructive underflow. In the underflow case, try the
+					// two-small-subdiagonals trick to try to reinflate the
+					// bulge.
+					if h[(k+3)*ldh+k] != 0 || h[(k+3)*ldh+k+1] != 0 || h[(k+3)*ldh+k+2] == 0 {
+						// Typical case: not collapsed (yet).
+						h[(k+1)*ldh+k] = beta
+						h[(k+2)*ldh+k] = 0
+						h[(k+3)*ldh+k] = 0
+					} else {
+						// Atypical case: collapsed. Attempt to reintroduce
+						// ignoring H[k+1,k] and H[k+2,k]. If the fill resulting
+						// from the new reflector is too large, then abandon it.
+						// Otherwise, use the new one.
+						var vt [3]float64
+						impl.Dlaqr1(3, h[(k+1)*ldh+k+1:], ldh,
+							sr[2*m], si[2*m], sr[2*m+1], si[2*m+1],
+							vt[:])
+						_, vt[0] = impl.Dlarfg(3, vt[0], vt[1:3], 1)
+						t1 = vt[0]
+						t2 = t1 * vt[1]
+						t3 = t1 * vt[2]
+						refsum = h[(k+1)*ldh+k] + vt[1]*h[(k+2)*ldh+k]
+						dsum := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1]) + math.Abs(h[(k+2)*ldh+k+2])
+						if math.Abs(h[(k+2)*ldh+k]-refsum*t2)+math.Abs(refsum*t3) > ulp*dsum {
+							// Starting a new bulge here would create
+							// non-negligible fill. Use the old one with
+							// trepidation.
+							h[(k+1)*ldh+k] = beta
+							h[(k+2)*ldh+k] = 0
+							h[(k+3)*ldh+k] = 0
+						} else {
+							// Starting a new bulge here would create only
+							// negligible fill. Replace the old reflector with
+							// the new one.
+							h[(k+1)*ldh+k] -= refsum * t1
+							h[(k+2)*ldh+k] = 0
+							h[(k+3)*ldh+k] = 0
+							v[m*ldv] = vt[0]
+							v[m*ldv+1] = vt[1]
+							v[m*ldv+2] = vt[2]
+						}
+					}
+				}
+				// Apply reflection from the right and the first column of
+				// update from the left. These updates are required for the
+				// vigilant deflation check. We still delay most of the updates
+				// from the left for efficiency.
+				t1 := v[m*ldv]
+				t2 := t1 * v[m*ldv+1]
+				t3 := t1 * v[m*ldv+2]
+				for j := jtop; j <= min(kbot, k+3); j++ {
+					refsum := h[j*ldh+k+1] + v[m*ldv+1]*h[j*ldh+k+2] + v[m*ldv+2]*h[j*ldh+k+3]
+					h[j*ldh+k+1] -= refsum * t1
+					h[j*ldh+k+2] -= refsum * t2
+					h[j*ldh+k+3] -= refsum * t3
+				}
+				// Perform update from left for subsequent column.
+				refsum := h[(k+1)*ldh+k+1] + v[m*ldv+1]*h[(k+2)*ldh+k+1] + v[m*ldv+2]*h[(k+3)*ldh+k+1]
+				h[(k+1)*ldh+k+1] -= refsum * t1
+				h[(k+2)*ldh+k+1] -= refsum * t2
+				h[(k+3)*ldh+k+1] -= refsum * t3
+				// The following convergence test requires that the traditional
+				// small-compared-to-nearby-diagonals criterion and the Ahues &
+				// Tisseur (LAWN 122, 1997) criteria both be satisfied. The
+				// latter improves accuracy in some examples. Falling back on an
+				// alternate convergence criterion when tst1 or tst2 is zero (as
+				// done here) is traditional but probably unnecessary.
+				if k < ktop {
+					continue
+				}
+				if h[(k+1)*ldh+k] != 0 {
+					tst1 := math.Abs(h[k*ldh+k]) + math.Abs(h[(k+1)*ldh+k+1])
+					if tst1 == 0 {
+						if k >= ktop+1 {
+							tst1 += math.Abs(h[k*ldh+k-1])
+						}
+						if k >= ktop+2 {
+							tst1 += math.Abs(h[k*ldh+k-2])
+						}
+						if k >= ktop+3 {
+							tst1 += math.Abs(h[k*ldh+k-3])
+						}
+						if k <= kbot-2 {
+							tst1 += math.Abs(h[(k+2)*ldh+k+1])
+						}
+						if k <= kbot-3 {
+							tst1 += math.Abs(h[(k+3)*ldh+k+1])
+						}
+						if k <= kbot-4 {
+							tst1 += math.Abs(h[(k+4)*ldh+k+1])
+						}
+					}
+					if math.Abs(h[(k+1)*ldh+k]) <= math.Max(smlnum, ulp*tst1) {
+						h12 := math.Max(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
+						h21 := math.Min(math.Abs(h[(k+1)*ldh+k]), math.Abs(h[k*ldh+k+1]))
+						h11 := math.Max(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
+						h22 := math.Min(math.Abs(h[(k+1)*ldh+k+1]), math.Abs(h[k*ldh+k]-h[(k+1)*ldh+k+1]))
+						scl := h11 + h12
+						tst2 := h22 * (h11 / scl)
+						if tst2 == 0 || h21*(h12/scl) <= math.Max(smlnum, ulp*tst2) {
+							h[(k+1)*ldh+k] = 0
+						}
+					}
+				}
+			}
+			// Multiply H by reflections from the left.
+			var jbot int
+			switch {
+			case accum:
+				jbot = min(ndcol, kbot)
+			case wantt:
+				jbot = n - 1
+			default:
+				jbot = kbot
+			}
+			for m := mbot; m >= mtop; m-- {
+				k := krcol + 2*m
+				t1 := v[m*ldv]
+				t2 := t1 * v[m*ldv+1]
+				t3 := t1 * v[m*ldv+2]
+				for j := max(ktop, krcol+2*(m+1)); j <= jbot; j++ {
+					refsum := h[(k+1)*ldh+j] + v[m*ldv+1]*h[(k+2)*ldh+j] + v[m*ldv+2]*h[(k+3)*ldh+j]
+					h[(k+1)*ldh+j] -= refsum * t1
+					h[(k+2)*ldh+j] -= refsum * t2
+					h[(k+3)*ldh+j] -= refsum * t3
+				}
+			}
+			// Accumulate orthogonal transformations.
+			if accum {
+				// Accumulate U. If necessary, update Z later with an
+				// efficient matrix-matrix multiply.
+				for m := mbot; m >= mtop; m-- {
+					k := krcol + 2*m
+					kms := k - incol - 1
+					i2 := max(0, ktop-incol-1)
+					i2 = max(i2, kms-(krcol-incol))
+					i4 := min(kdu, krcol+2*mbot-incol+5)
+					t1 := v[m*ldv]
+					t2 := t1 * v[m*ldv+1]
+					t3 := t1 * v[m*ldv+2]
+					for j := i2; j < i4; j++ {
+						refsum := u[j*ldu+kms+1] + v[m*ldv+1]*u[j*ldu+kms+2] + v[m*ldv+2]*u[j*ldu+kms+3]
+						u[j*ldu+kms+1] -= refsum * t1
+						u[j*ldu+kms+2] -= refsum * t2
+						u[j*ldu+kms+3] -= refsum * t3
+					}
+				}
+			} else if wantz {
+				// U is not accumulated, so update Z now by multiplying by
+				// reflections from the right.
+				for m := mbot; m >= mtop; m-- {
+					k := krcol + 2*m
+					t1 := v[m*ldv]
+					t2 := t1 * v[m*ldv+1]
+					t3 := t1 * v[m*ldv+2]
+					for j := iloz; j <= ihiz; j++ {
+						refsum := z[j*ldz+k+1] + v[m*ldv+1]*z[j*ldz+k+2] + v[m*ldv+2]*z[j*ldz+k+3]
+						z[j*ldz+k+1] -= refsum * t1
+						z[j*ldz+k+2] -= refsum * t2
+						z[j*ldz+k+3] -= refsum * t3
+					}
+				}
+			}
+		}
+		// Use U (if accumulated) to update far-from-diagonal entries in H.
+		// If required, use U to update Z as well.
+		if !accum {
+			continue
+		}
+		jtop, jbot := ktop, kbot
+		if wantt {
+			jtop = 0
+			jbot = n - 1
+		}
+		bi := blas64.Implementation()
+		k1 := max(0, ktop-incol-1)
+		nu := kdu - max(0, ndcol-kbot) - k1
+		// Horizontal multiply.
+		for jcol := min(ndcol, kbot) + 1; jcol <= jbot; jcol += nh {
+			jlen := min(nh, jbot-jcol+1)
+			bi.Dgemm(blas.Trans, blas.NoTrans, nu, jlen, nu,
+				1, u[k1*ldu+k1:], ldu,
+				h[(incol+k1+1)*ldh+jcol:], ldh,
+				0, wh, ldwh)
+			impl.Dlacpy(blas.All, nu, jlen, wh, ldwh, h[(incol+k1+1)*ldh+jcol:], ldh)
+		}
+		// Vertical multiply.
+		for jrow := jtop; jrow < max(ktop, incol); jrow += nv {
+			jlen := min(nv, max(ktop, incol)-jrow)
+			bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
+				1, h[jrow*ldh+incol+k1+1:], ldh,
+				u[k1*ldu+k1:], ldu,
+				0, wv, ldwv)
+			impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, h[jrow*ldh+incol+k1+1:], ldh)
+		}
+		// Z multiply (also vertical).
+		if wantz {
+			for jrow := iloz; jrow <= ihiz; jrow += nv {
+				jlen := min(nv, ihiz-jrow+1)
+				bi.Dgemm(blas.NoTrans, blas.NoTrans, jlen, nu, nu,
+					1, z[jrow*ldz+incol+k1+1:], ldz,
+					u[k1*ldu+k1:], ldu,
+					0, wv, ldwv)
+				impl.Dlacpy(blas.All, jlen, nu, wv, ldwv, z[jrow*ldz+incol+k1+1:], ldz)
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarf.go
new file mode 100644
index 00000000000..16581a1b4e3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarf.go
@@ -0,0 +1,102 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlarf applies an elementary reflector H to an m×n matrix C:
+//
+//	C = H * C  if side == blas.Left
+//	C = C * H  if side == blas.Right
+//
+// H is represented in the form
+//
+//	H = I - tau * v * vᵀ
+//
+// where tau is a scalar and v is a vector.
+//
+// work must have length at least n if side == blas.Left and
+// at least m if side == blas.Right.
+//
+// Dlarf is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlarf(side blas.Side, m, n int, v []float64, incv int, tau float64, c []float64, ldc int, work []float64) {
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case incv == 0:
+		panic(zeroIncV)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	if m == 0 || n == 0 {
+		return
+	}
+
+	applyleft := side == blas.Left
+	lenV := n
+	if applyleft {
+		lenV = m
+	}
+
+	switch {
+	case len(v) < 1+(lenV-1)*abs(incv):
+		panic(shortV)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	case (applyleft && len(work) < n) || (!applyleft && len(work) < m):
+		panic(shortWork)
+	}
+
+	lastv := -1 // last non-zero element of v
+	lastc := -1 // last non-zero row/column of C
+	if tau != 0 {
+		if applyleft {
+			lastv = m - 1
+		} else {
+			lastv = n - 1
+		}
+		var i int
+		if incv > 0 {
+			i = lastv * incv
+		}
+		// Look for the last non-zero row in v.
+		for lastv >= 0 && v[i] == 0 {
+			lastv--
+			i -= incv
+		}
+		if applyleft {
+			// Scan for the last non-zero column in C[0:lastv, :]
+			lastc = impl.Iladlc(lastv+1, n, c, ldc)
+		} else {
+			// Scan for the last non-zero row in C[:, 0:lastv]
+			lastc = impl.Iladlr(m, lastv+1, c, ldc)
+		}
+	}
+	if lastv == -1 || lastc == -1 {
+		return
+	}
+	bi := blas64.Implementation()
+	if applyleft {
+		// Form H * C
+		// w[0:lastc+1] = c[0:lastv+1, 0:lastc+1]ᵀ * v[0:lastv+1]
+		bi.Dgemv(blas.Trans, lastv+1, lastc+1, 1, c, ldc, v, incv, 0, work, 1)
+		// c[0:lastv+1, 0:lastc+1] -= tau * v[0:lastv+1] * w[0:lastc+1]ᵀ
+		bi.Dger(lastv+1, lastc+1, -tau, v, incv, work, 1, c, ldc)
+	} else {
+		// Form C * H
+		// w[0:lastc+1] = c[0:lastc+1, 0:lastv+1] * v[0:lastv+1]
+		bi.Dgemv(blas.NoTrans, lastc+1, lastv+1, 1, c, ldc, v, incv, 0, work, 1)
+		// c[0:lastc+1, 0:lastv+1] -= tau * w[0:lastc+1] * v[0:lastv+1]ᵀ
+		bi.Dger(lastc+1, lastv+1, -tau, work, 1, v, incv, c, ldc)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfb.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfb.go
new file mode 100644
index 00000000000..eb43ca74ceb
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfb.go
@@ -0,0 +1,461 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlarfb applies a block reflector to a matrix.
+//
+// In the call to Dlarfb, the m×n matrix c is multiplied by the implicitly defined matrix h as follows:
+//
+//	c = h * c   if side == Left and trans == NoTrans
+//	c = c * h   if side == Right and trans == NoTrans
+//	c = hᵀ * c  if side == Left and trans == Trans
+//	c = c * hᵀ  if side == Right and trans == Trans
+//
+// h is a product of elementary reflectors. direct sets the direction of multiplication
+//
+//	h = h_1 * h_2 * ... * h_k    if direct == Forward
+//	h = h_k * h_k-1 * ... * h_1  if direct == Backward
+//
+// The combination of direct and store defines the orientation of the elementary
+// reflectors. In all cases the ones on the diagonal are implicitly represented.
+//
+// If direct == lapack.Forward and store == lapack.ColumnWise
+//
+//	V = [ 1        ]
+//	    [v1   1    ]
+//	    [v1  v2   1]
+//	    [v1  v2  v3]
+//	    [v1  v2  v3]
+//
+// If direct == lapack.Forward and store == lapack.RowWise
+//
+//	V = [ 1  v1  v1  v1  v1]
+//	    [     1  v2  v2  v2]
+//	    [         1  v3  v3]
+//
+// If direct == lapack.Backward and store == lapack.ColumnWise
+//
+//	V = [v1  v2  v3]
+//	    [v1  v2  v3]
+//	    [ 1  v2  v3]
+//	    [     1  v3]
+//	    [         1]
+//
+// If direct == lapack.Backward and store == lapack.RowWise
+//
+//	V = [v1  v1   1        ]
+//	    [v2  v2  v2   1    ]
+//	    [v3  v3  v3  v3   1]
+//
+// An elementary reflector can be explicitly constructed by extracting the
+// corresponding elements of v, placing a 1 where the diagonal would be, and
+// placing zeros in the remaining elements.
+//
+// t is a k×k matrix containing the block reflector, and this function will panic
+// if t is not of sufficient size. See Dlarft for more information.
+//
+// work is a temporary storage matrix with stride ldwork.
+// work must be of size at least n×k if side == Left and m×k if side == Right, and
+// this function will panic if this size is not met.
+//
+// Dlarfb is an internal routine. It is exported for testing purposes.
+func (Implementation) Dlarfb(side blas.Side, trans blas.Transpose, direct lapack.Direct, store lapack.StoreV, m, n, k int, v []float64, ldv int, t []float64, ldt int, c []float64, ldc int, work []float64, ldwork int) {
+	nv := m
+	if side == blas.Right {
+		nv = n
+	}
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.Trans && trans != blas.NoTrans:
+		panic(badTrans)
+	case direct != lapack.Forward && direct != lapack.Backward:
+		panic(badDirect)
+	case store != lapack.ColumnWise && store != lapack.RowWise:
+		panic(badStoreV)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case store == lapack.ColumnWise && ldv < max(1, k):
+		panic(badLdV)
+	case store == lapack.RowWise && ldv < max(1, nv):
+		panic(badLdV)
+	case ldt < max(1, k):
+		panic(badLdT)
+	case ldc < max(1, n):
+		panic(badLdC)
+	case ldwork < max(1, k):
+		panic(badLdWork)
+	}
+
+	if m == 0 || n == 0 {
+		return
+	}
+
+	nw := n
+	if side == blas.Right {
+		nw = m
+	}
+	switch {
+	case store == lapack.ColumnWise && len(v) < (nv-1)*ldv+k:
+		panic(shortV)
+	case store == lapack.RowWise && len(v) < (k-1)*ldv+nv:
+		panic(shortV)
+	case len(t) < (k-1)*ldt+k:
+		panic(shortT)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	case len(work) < (nw-1)*ldwork+k:
+		panic(shortWork)
+	}
+
+	bi := blas64.Implementation()
+
+	transt := blas.Trans
+	if trans == blas.Trans {
+		transt = blas.NoTrans
+	}
+	// TODO(btracey): This follows the original Lapack code where the
+	// elements are copied into the columns of the working array. The
+	// loops should go in the other direction so the data is written
+	// into the rows of work so the copy is not strided. A bigger change
+	// would be to replace work with workᵀ, but benchmarks would be
+	// needed to see if the change is merited.
+	if store == lapack.ColumnWise {
+		if direct == lapack.Forward {
+			// V1 is the first k rows of V. V2 is the remaining rows.
+			if side == blas.Left {
+				// W = Cᵀ V = C1ᵀ V1 + C2ᵀ V2 (stored in work).
+
+				// W = C1.
+				for j := 0; j < k; j++ {
+					bi.Dcopy(n, c[j*ldc:], 1, work[j:], ldwork)
+				}
+				// W = W * V1.
+				bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit,
+					n, k, 1,
+					v, ldv,
+					work, ldwork)
+				if m > k {
+					// W = W + C2ᵀ V2.
+					bi.Dgemm(blas.Trans, blas.NoTrans, n, k, m-k,
+						1, c[k*ldc:], ldc, v[k*ldv:], ldv,
+						1, work, ldwork)
+				}
+				// W = W * Tᵀ or W * T.
+				bi.Dtrmm(blas.Right, blas.Upper, transt, blas.NonUnit, n, k,
+					1, t, ldt,
+					work, ldwork)
+				// C -= V * Wᵀ.
+				if m > k {
+					// C2 -= V2 * Wᵀ.
+					bi.Dgemm(blas.NoTrans, blas.Trans, m-k, n, k,
+						-1, v[k*ldv:], ldv, work, ldwork,
+						1, c[k*ldc:], ldc)
+				}
+				// W *= V1ᵀ.
+				bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, n, k,
+					1, v, ldv,
+					work, ldwork)
+				// C1 -= Wᵀ.
+				// TODO(btracey): This should use blas.Axpy.
+				for i := 0; i < n; i++ {
+					for j := 0; j < k; j++ {
+						c[j*ldc+i] -= work[i*ldwork+j]
+					}
+				}
+				return
+			}
+			// Form C = C * H or C * Hᵀ, where C = (C1 C2).
+
+			// W = C1.
+			for i := 0; i < k; i++ {
+				bi.Dcopy(m, c[i:], ldc, work[i:], ldwork)
+			}
+			// W *= V1.
+			bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, m, k,
+				1, v, ldv,
+				work, ldwork)
+			if n > k {
+				bi.Dgemm(blas.NoTrans, blas.NoTrans, m, k, n-k,
+					1, c[k:], ldc, v[k*ldv:], ldv,
+					1, work, ldwork)
+			}
+			// W *= T or Tᵀ.
+			bi.Dtrmm(blas.Right, blas.Upper, trans, blas.NonUnit, m, k,
+				1, t, ldt,
+				work, ldwork)
+			if n > k {
+				bi.Dgemm(blas.NoTrans, blas.Trans, m, n-k, k,
+					-1, work, ldwork, v[k*ldv:], ldv,
+					1, c[k:], ldc)
+			}
+			// W *= V1ᵀ.
+			bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, m, k,
+				1, v, ldv,
+				work, ldwork)
+			// C1 -= W.
+			// TODO(btracey): This should use blas.Axpy.
+			for i := 0; i < m; i++ {
+				for j := 0; j < k; j++ {
+					c[i*ldc+j] -= work[i*ldwork+j]
+				}
+			}
+			return
+		}
+		// V = (V1)
+		//   = (V2) (last k rows)
+		// Where V2 is unit upper triangular.
+		if side == blas.Left {
+			// Form H * C or
+			// W = Cᵀ V.
+
+			// W = C2ᵀ.
+			for j := 0; j < k; j++ {
+				bi.Dcopy(n, c[(m-k+j)*ldc:], 1, work[j:], ldwork)
+			}
+			// W *= V2.
+			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, n, k,
+				1, v[(m-k)*ldv:], ldv,
+				work, ldwork)
+			if m > k {
+				// W += C1ᵀ * V1.
+				bi.Dgemm(blas.Trans, blas.NoTrans, n, k, m-k,
+					1, c, ldc, v, ldv,
+					1, work, ldwork)
+			}
+			// W *= T or Tᵀ.
+			bi.Dtrmm(blas.Right, blas.Lower, transt, blas.NonUnit, n, k,
+				1, t, ldt,
+				work, ldwork)
+			// C -= V * Wᵀ.
+			if m > k {
+				bi.Dgemm(blas.NoTrans, blas.Trans, m-k, n, k,
+					-1, v, ldv, work, ldwork,
+					1, c, ldc)
+			}
+			// W *= V2ᵀ.
+			bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, n, k,
+				1, v[(m-k)*ldv:], ldv,
+				work, ldwork)
+			// C2 -= Wᵀ.
+			// TODO(btracey): This should use blas.Axpy.
+			for i := 0; i < n; i++ {
+				for j := 0; j < k; j++ {
+					c[(m-k+j)*ldc+i] -= work[i*ldwork+j]
+				}
+			}
+			return
+		}
+		// Form C * H or C * Hᵀ where C = (C1 C2).
+		// W = C * V.
+
+		// W = C2.
+		for j := 0; j < k; j++ {
+			bi.Dcopy(m, c[n-k+j:], ldc, work[j:], ldwork)
+		}
+
+		// W = W * V2.
+		bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, m, k,
+			1, v[(n-k)*ldv:], ldv,
+			work, ldwork)
+		if n > k {
+			bi.Dgemm(blas.NoTrans, blas.NoTrans, m, k, n-k,
+				1, c, ldc, v, ldv,
+				1, work, ldwork)
+		}
+		// W *= T or Tᵀ.
+		bi.Dtrmm(blas.Right, blas.Lower, trans, blas.NonUnit, m, k,
+			1, t, ldt,
+			work, ldwork)
+		// C -= W * Vᵀ.
+		if n > k {
+			// C1 -= W * V1ᵀ.
+			bi.Dgemm(blas.NoTrans, blas.Trans, m, n-k, k,
+				-1, work, ldwork, v, ldv,
+				1, c, ldc)
+		}
+		// W *= V2ᵀ.
+		bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, m, k,
+			1, v[(n-k)*ldv:], ldv,
+			work, ldwork)
+		// C2 -= W.
+		// TODO(btracey): This should use blas.Axpy.
+		for i := 0; i < m; i++ {
+			for j := 0; j < k; j++ {
+				c[i*ldc+n-k+j] -= work[i*ldwork+j]
+			}
+		}
+		return
+	}
+	// Store = Rowwise.
+	if direct == lapack.Forward {
+		// V = (V1 V2) where V1 is unit upper triangular.
+		if side == blas.Left {
+			// Form H * C or Hᵀ * C where C = (C1; C2).
+			// W = Cᵀ * Vᵀ.
+
+			// W = C1ᵀ.
+			for j := 0; j < k; j++ {
+				bi.Dcopy(n, c[j*ldc:], 1, work[j:], ldwork)
+			}
+			// W *= V1ᵀ.
+			bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, n, k,
+				1, v, ldv,
+				work, ldwork)
+			if m > k {
+				bi.Dgemm(blas.Trans, blas.Trans, n, k, m-k,
+					1, c[k*ldc:], ldc, v[k:], ldv,
+					1, work, ldwork)
+			}
+			// W *= T or Tᵀ.
+			bi.Dtrmm(blas.Right, blas.Upper, transt, blas.NonUnit, n, k,
+				1, t, ldt,
+				work, ldwork)
+			// C -= Vᵀ * Wᵀ.
+			if m > k {
+				bi.Dgemm(blas.Trans, blas.Trans, m-k, n, k,
+					-1, v[k:], ldv, work, ldwork,
+					1, c[k*ldc:], ldc)
+			}
+			// W *= V1.
+			bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, n, k,
+				1, v, ldv,
+				work, ldwork)
+			// C1 -= Wᵀ.
+			// TODO(btracey): This should use blas.Axpy.
+			for i := 0; i < n; i++ {
+				for j := 0; j < k; j++ {
+					c[j*ldc+i] -= work[i*ldwork+j]
+				}
+			}
+			return
+		}
+		// Form C * H or C * Hᵀ where C = (C1 C2).
+		// W = C * Vᵀ.
+
+		// W = C1.
+		for j := 0; j < k; j++ {
+			bi.Dcopy(m, c[j:], ldc, work[j:], ldwork)
+		}
+		// W *= V1ᵀ.
+		bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.Unit, m, k,
+			1, v, ldv,
+			work, ldwork)
+		if n > k {
+			bi.Dgemm(blas.NoTrans, blas.Trans, m, k, n-k,
+				1, c[k:], ldc, v[k:], ldv,
+				1, work, ldwork)
+		}
+		// W *= T or Tᵀ.
+		bi.Dtrmm(blas.Right, blas.Upper, trans, blas.NonUnit, m, k,
+			1, t, ldt,
+			work, ldwork)
+		// C -= W * V.
+		if n > k {
+			bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n-k, k,
+				-1, work, ldwork, v[k:], ldv,
+				1, c[k:], ldc)
+		}
+		// W *= V1.
+		bi.Dtrmm(blas.Right, blas.Upper, blas.NoTrans, blas.Unit, m, k,
+			1, v, ldv,
+			work, ldwork)
+		// C1 -= W.
+		// TODO(btracey): This should use blas.Axpy.
+		for i := 0; i < m; i++ {
+			for j := 0; j < k; j++ {
+				c[i*ldc+j] -= work[i*ldwork+j]
+			}
+		}
+		return
+	}
+	// V = (V1 V2) where V2 is the last k columns and is lower unit triangular.
+	if side == blas.Left {
+		// Form H * C or Hᵀ C where C = (C1 ; C2).
+		// W = Cᵀ * Vᵀ.
+
+		// W = C2ᵀ.
+		for j := 0; j < k; j++ {
+			bi.Dcopy(n, c[(m-k+j)*ldc:], 1, work[j:], ldwork)
+		}
+		// W *= V2ᵀ.
+		bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, n, k,
+			1, v[m-k:], ldv,
+			work, ldwork)
+		if m > k {
+			bi.Dgemm(blas.Trans, blas.Trans, n, k, m-k,
+				1, c, ldc, v, ldv,
+				1, work, ldwork)
+		}
+		// W *= T or Tᵀ.
+		bi.Dtrmm(blas.Right, blas.Lower, transt, blas.NonUnit, n, k,
+			1, t, ldt,
+			work, ldwork)
+		// C -= Vᵀ * Wᵀ.
+		if m > k {
+			bi.Dgemm(blas.Trans, blas.Trans, m-k, n, k,
+				-1, v, ldv, work, ldwork,
+				1, c, ldc)
+		}
+		// W *= V2.
+		bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, n, k,
+			1, v[m-k:], ldv,
+			work, ldwork)
+		// C2 -= Wᵀ.
+		// TODO(btracey): This should use blas.Axpy.
+		for i := 0; i < n; i++ {
+			for j := 0; j < k; j++ {
+				c[(m-k+j)*ldc+i] -= work[i*ldwork+j]
+			}
+		}
+		return
+	}
+	// Form C * H or C * Hᵀ where C = (C1 C2).
+	// W = C * Vᵀ.
+	// W = C2.
+	for j := 0; j < k; j++ {
+		bi.Dcopy(m, c[n-k+j:], ldc, work[j:], ldwork)
+	}
+	// W *= V2ᵀ.
+	bi.Dtrmm(blas.Right, blas.Lower, blas.Trans, blas.Unit, m, k,
+		1, v[n-k:], ldv,
+		work, ldwork)
+	if n > k {
+		bi.Dgemm(blas.NoTrans, blas.Trans, m, k, n-k,
+			1, c, ldc, v, ldv,
+			1, work, ldwork)
+	}
+	// W *= T or Tᵀ.
+	bi.Dtrmm(blas.Right, blas.Lower, trans, blas.NonUnit, m, k,
+		1, t, ldt,
+		work, ldwork)
+	// C -= W * V.
+	if n > k {
+		bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n-k, k,
+			-1, work, ldwork, v, ldv,
+			1, c, ldc)
+	}
+	// W *= V2.
+	bi.Dtrmm(blas.Right, blas.Lower, blas.NoTrans, blas.Unit, m, k,
+		1, v[n-k:], ldv,
+		work, ldwork)
+	// C2 -= W.
+	// TODO(btracey): This should use blas.Axpy.
+	for i := 0; i < m; i++ {
+		for j := 0; j < k; j++ {
+			c[i*ldc+n-k+j] -= work[i*ldwork+j]
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfg.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfg.go
new file mode 100644
index 00000000000..74ad111d41a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfg.go
@@ -0,0 +1,75 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlarfg generates an elementary reflector for a Householder matrix. It creates
+// a real elementary reflector of order n such that
+//
+//	H * (alpha) = (beta)
+//	    (    x)   (   0)
+//	Hᵀ * H = I
+//
+// H is represented in the form
+//
+//	H = 1 - tau * (1; v) * (1 vᵀ)
+//
+// where tau is a real scalar. If n <= 1 or x has zero norm, (alpha, 0) is returned.
+//
+// On entry, x holds the n-1 elements of x with increment incX; on exit it holds v.
+// Dlarfg panics if n < 0, if incX <= 0, or if x is too short for n and incX.
+// Dlarfg is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlarfg(n int, alpha float64, x []float64, incX int) (beta, tau float64) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case incX <= 0:
+		panic(badIncX)
+	}
+
+	if n <= 1 {
+		return alpha, 0
+	}
+
+	if len(x) < 1+(n-2)*abs(incX) {
+		panic(shortX)
+	}
+
+	bi := blas64.Implementation()
+
+	xnorm := bi.Dnrm2(n-1, x, incX)
+	if xnorm == 0 {
+		return alpha, 0
+	}
+	beta = -math.Copysign(impl.Dlapy2(alpha, xnorm), alpha) // beta gets the sign opposite to alpha
+	safmin := dlamchS / dlamchE
+	knt := 0 // number of times x, alpha and beta were scaled by 1/safmin
+	if math.Abs(beta) < safmin {
+		// |beta| is below safmin, so xnorm and beta may be inaccurate: scale x, alpha and beta by 1/safmin and recompute.
+		rsafmn := 1 / safmin
+		for {
+			knt++
+			bi.Dscal(n-1, rsafmn, x, incX)
+			beta *= rsafmn
+			alpha *= rsafmn
+			if math.Abs(beta) >= safmin {
+				break
+			}
+		}
+		xnorm = bi.Dnrm2(n-1, x, incX)
+		beta = -math.Copysign(impl.Dlapy2(alpha, xnorm), alpha)
+	}
+	tau = (beta - alpha) / beta
+	bi.Dscal(n-1, 1/(alpha-beta), x, incX) // x now holds v
+	for j := 0; j < knt; j++ {
+		beta *= safmin // undo one of the knt scalings applied above
+	}
+	return beta, tau
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarft.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarft.go
new file mode 100644
index 00000000000..921a5a3d217
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarft.go
@@ -0,0 +1,169 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlarft forms the triangular factor T of a block reflector H, storing the answer
+// in t.
+//
+//	H = I - V * T * Vᵀ  if store == lapack.ColumnWise
+//	H = I - Vᵀ * T * V  if store == lapack.RowWise
+//
+// H is defined by a product of the elementary reflectors where
+//
+//	H = H_0 * H_1 * ... * H_{k-1}  if direct == lapack.Forward
+//	H = H_{k-1} * ... * H_1 * H_0  if direct == lapack.Backward
+//
+// t is a k×k triangular matrix stored with leading dimension ldt. t is upper
+// triangular if direct == lapack.Forward and lower triangular otherwise. Dlarft
+// will panic if t is not of sufficient size.
+//
+// store describes the storage of the elementary reflectors in v: v is n×k for
+// lapack.ColumnWise and k×n for lapack.RowWise. See Dlarfb for the layout.
+//
+// tau contains the k scalar factors of the elementary reflectors H_i.
+//
+// Dlarft is an internal routine. It is exported for testing purposes.
+func (Implementation) Dlarft(direct lapack.Direct, store lapack.StoreV, n, k int, v []float64, ldv int, tau []float64, t []float64, ldt int) {
+	mv, nv := n, k // dimensions of v for lapack.ColumnWise storage
+	if store == lapack.RowWise {
+		mv, nv = k, n
+	}
+	switch {
+	case direct != lapack.Forward && direct != lapack.Backward:
+		panic(badDirect)
+	case store != lapack.RowWise && store != lapack.ColumnWise:
+		panic(badStoreV)
+	case n < 0:
+		panic(nLT0)
+	case k < 1:
+		panic(kLT1)
+	case ldv < max(1, nv):
+		panic(badLdV)
+	case len(tau) < k:
+		panic(shortTau)
+	case ldt < max(1, k):
+		panic(shortT)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(v) < (mv-1)*ldv+nv:
+		panic(shortV)
+	case len(t) < (k-1)*ldt+k:
+		panic(shortT)
+	}
+
+	bi := blas64.Implementation()
+
+	// TODO(btracey): There are a number of minor obvious loop optimizations here.
+	// TODO(btracey): It may be possible to rearrange some of the code so that
+	// index of 1 is more common in the Dgemv.
+	if direct == lapack.Forward {
+		prevlastv := n - 1
+		for i := 0; i < k; i++ {
+			prevlastv = max(i, prevlastv)
+			if tau[i] == 0 {
+				for j := 0; j <= i; j++ {
+					t[j*ldt+i] = 0 // zero column i of t down to the diagonal
+				}
+				continue
+			}
+			var lastv int
+			if store == lapack.ColumnWise {
+				// Skip trailing zeros of column i of v.
+				for lastv = n - 1; lastv >= i+1; lastv-- {
+					if v[lastv*ldv+i] != 0 {
+						break
+					}
+				}
+				for j := 0; j < i; j++ {
+					t[j*ldt+i] = -tau[i] * v[i*ldv+j]
+				}
+				j := min(lastv, prevlastv)
+				bi.Dgemv(blas.Trans, j-i, i,
+					-tau[i], v[(i+1)*ldv:], ldv, v[(i+1)*ldv+i:], ldv,
+					1, t[i:], ldt)
+			} else {
+				for lastv = n - 1; lastv >= i+1; lastv-- { // skip trailing zeros of row i of v
+					if v[i*ldv+lastv] != 0 {
+						break
+					}
+				}
+				for j := 0; j < i; j++ {
+					t[j*ldt+i] = -tau[i] * v[j*ldv+i]
+				}
+				j := min(lastv, prevlastv)
+				bi.Dgemv(blas.NoTrans, i, j-i,
+					-tau[i], v[i+1:], ldv, v[i*ldv+i+1:], 1,
+					1, t[i:], ldt)
+			}
+			bi.Dtrmv(blas.Upper, blas.NoTrans, blas.NonUnit, i, t, ldt, t[i:], ldt)
+			t[i*ldt+i] = tau[i]
+			if i > 1 {
+				prevlastv = max(prevlastv, lastv)
+			} else {
+				prevlastv = lastv
+			}
+		}
+		return
+	}
+	prevlastv := 0
+	for i := k - 1; i >= 0; i-- {
+		if tau[i] == 0 {
+			for j := i; j < k; j++ {
+				t[j*ldt+i] = 0 // zero column i of t from the diagonal down
+			}
+			continue
+		}
+		var lastv int
+		if i < k-1 {
+			if store == lapack.ColumnWise {
+				for lastv = 0; lastv < i; lastv++ { // skip leading zeros of column i of v
+					if v[lastv*ldv+i] != 0 {
+						break
+					}
+				}
+				for j := i + 1; j < k; j++ {
+					t[j*ldt+i] = -tau[i] * v[(n-k+i)*ldv+j]
+				}
+				j := max(lastv, prevlastv)
+				bi.Dgemv(blas.Trans, n-k+i-j, k-i-1,
+					-tau[i], v[j*ldv+i+1:], ldv, v[j*ldv+i:], ldv,
+					1, t[(i+1)*ldt+i:], ldt)
+			} else {
+				for lastv = 0; lastv < i; lastv++ { // skip leading zeros of row i of v
+					if v[i*ldv+lastv] != 0 {
+						break
+					}
+				}
+				for j := i + 1; j < k; j++ {
+					t[j*ldt+i] = -tau[i] * v[j*ldv+n-k+i]
+				}
+				j := max(lastv, prevlastv)
+				bi.Dgemv(blas.NoTrans, k-i-1, n-k+i-j,
+					-tau[i], v[(i+1)*ldv+j:], ldv, v[i*ldv+j:], 1,
+					1, t[(i+1)*ldt+i:], ldt)
+			}
+			bi.Dtrmv(blas.Lower, blas.NoTrans, blas.NonUnit, k-i-1,
+				t[(i+1)*ldt+i+1:], ldt,
+				t[(i+1)*ldt+i:], ldt)
+			if i > 0 {
+				prevlastv = min(prevlastv, lastv)
+			} else {
+				prevlastv = lastv
+			}
+		}
+		t[i*ldt+i] = tau[i]
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfx.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfx.go
new file mode 100644
index 00000000000..4e40dad1888
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlarfx.go
@@ -0,0 +1,552 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dlarfx applies an elementary reflector H to a real m×n matrix C, from either
+// the left or the right, with loop unrolling when the reflector has order less
+// than 11.
+//
+// H is represented in the form
+//
+//	H = I - tau * v * vᵀ,
+//
+// where tau is a real scalar and v is a real vector. If tau = 0, then H is
+// taken to be the identity matrix.
+//
+// v must have length at least m if side == blas.Left, and at least n if side ==
+// blas.Right, otherwise Dlarfx will panic.
+//
+// c and ldc represent the m×n matrix C. On return, C is overwritten by the
+// matrix H * C if side == blas.Left, or C * H if side == blas.Right.
+//
+// work must have length at least n if side == blas.Left, and at least m if side
+// == blas.Right, otherwise Dlarfx will panic. work is not referenced if H has
+// order < 11.
+//
+// Dlarfx is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlarfx(side blas.Side, m, n int, v []float64, tau float64, c []float64, ldc int, work []float64) {
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if C has no rows or no columns.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	nh := m
+	lwork := n
+	if side == blas.Right {
+		nh = n
+		lwork = m
+	}
+	switch {
+	case len(v) < nh:
+		panic(shortV)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	case nh > 10 && len(work) < lwork:
+		panic(shortWork)
+	}
+
+	if tau == 0 { // H is the identity, so C is left unchanged
+		return
+	}
+
+	if side == blas.Left {
+		// Form H * C, where H has order m.
+		switch m {
+		default: // Code for general m (m > 10): delegate to Dlarf.
+			impl.Dlarf(side, m, n, v, 1, tau, c, ldc, work)
+			return
+
+		case 0: // No-op for zero size matrix (m == 0 has already returned above).
+			return
+
+		case 1: // Special code for 1×1 Householder matrix.
+			t0 := 1 - tau*v[0]*v[0]
+			for j := 0; j < n; j++ {
+				c[j] *= t0
+			}
+			return
+
+		case 2: // Special code for 2×2 Householder matrix.
+			v0 := v[0]
+			t0 := tau * v0
+			v1 := v[1]
+			t1 := tau * v1
+			for j := 0; j < n; j++ {
+				sum := v0*c[j] + v1*c[ldc+j]
+				c[j] -= sum * t0
+				c[ldc+j] -= sum * t1
+			}
+			return
+
+		case 3: // Special code for 3×3 Householder matrix.
+			v0 := v[0]
+			t0 := tau * v0
+			v1 := v[1]
+			t1 := tau * v1
+			v2 := v[2]
+			t2 := tau * v2
+			for j := 0; j < n; j++ {
+				sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j]
+				c[j] -= sum * t0
+				c[ldc+j] -= sum * t1
+				c[2*ldc+j] -= sum * t2
+			}
+			return
+
+		case 4: // Special code for 4×4 Householder matrix.
+			v0 := v[0]
+			t0 := tau * v0
+			v1 := v[1]
+			t1 := tau * v1
+			v2 := v[2]
+			t2 := tau * v2
+			v3 := v[3]
+			t3 := tau * v3
+			for j := 0; j < n; j++ {
+				sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j]
+				c[j] -= sum * t0
+				c[ldc+j] -= sum * t1
+				c[2*ldc+j] -= sum * t2
+				c[3*ldc+j] -= sum * t3
+			}
+			return
+
+		case 5: // Special code for 5×5 Householder matrix.
+			v0 := v[0]
+			t0 := tau * v0
+			v1 := v[1]
+			t1 := tau * v1
+			v2 := v[2]
+			t2 := tau * v2
+			v3 := v[3]
+			t3 := tau * v3
+			v4 := v[4]
+			t4 := tau * v4
+			for j := 0; j < n; j++ {
+				sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j]
+				c[j] -= sum * t0
+				c[ldc+j] -= sum * t1
+				c[2*ldc+j] -= sum * t2
+				c[3*ldc+j] -= sum * t3
+				c[4*ldc+j] -= sum * t4
+			}
+			return
+
+		case 6: // Special code for 6×6 Householder matrix.
+			v0 := v[0]
+			t0 := tau * v0
+			v1 := v[1]
+			t1 := tau * v1
+			v2 := v[2]
+			t2 := tau * v2
+			v3 := v[3]
+			t3 := tau * v3
+			v4 := v[4]
+			t4 := tau * v4
+			v5 := v[5]
+			t5 := tau * v5
+			for j := 0; j < n; j++ {
+				sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] +
+					v5*c[5*ldc+j]
+				c[j] -= sum * t0
+				c[ldc+j] -= sum * t1
+				c[2*ldc+j] -= sum * t2
+				c[3*ldc+j] -= sum * t3
+				c[4*ldc+j] -= sum * t4
+				c[5*ldc+j] -= sum * t5
+			}
+			return
+
+		case 7: // Special code for 7×7 Householder matrix.
+			v0 := v[0]
+			t0 := tau * v0
+			v1 := v[1]
+			t1 := tau * v1
+			v2 := v[2]
+			t2 := tau * v2
+			v3 := v[3]
+			t3 := tau * v3
+			v4 := v[4]
+			t4 := tau * v4
+			v5 := v[5]
+			t5 := tau * v5
+			v6 := v[6]
+			t6 := tau * v6
+			for j := 0; j < n; j++ {
+				sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] +
+					v5*c[5*ldc+j] + v6*c[6*ldc+j]
+				c[j] -= sum * t0
+				c[ldc+j] -= sum * t1
+				c[2*ldc+j] -= sum * t2
+				c[3*ldc+j] -= sum * t3
+				c[4*ldc+j] -= sum * t4
+				c[5*ldc+j] -= sum * t5
+				c[6*ldc+j] -= sum * t6
+			}
+			return
+
+		case 8: // Special code for 8×8 Householder matrix.
+			v0 := v[0]
+			t0 := tau * v0
+			v1 := v[1]
+			t1 := tau * v1
+			v2 := v[2]
+			t2 := tau * v2
+			v3 := v[3]
+			t3 := tau * v3
+			v4 := v[4]
+			t4 := tau * v4
+			v5 := v[5]
+			t5 := tau * v5
+			v6 := v[6]
+			t6 := tau * v6
+			v7 := v[7]
+			t7 := tau * v7
+			for j := 0; j < n; j++ {
+				sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] +
+					v5*c[5*ldc+j] + v6*c[6*ldc+j] + v7*c[7*ldc+j]
+				c[j] -= sum * t0
+				c[ldc+j] -= sum * t1
+				c[2*ldc+j] -= sum * t2
+				c[3*ldc+j] -= sum * t3
+				c[4*ldc+j] -= sum * t4
+				c[5*ldc+j] -= sum * t5
+				c[6*ldc+j] -= sum * t6
+				c[7*ldc+j] -= sum * t7
+			}
+			return
+
+		case 9: // Special code for 9×9 Householder matrix.
+			v0 := v[0]
+			t0 := tau * v0
+			v1 := v[1]
+			t1 := tau * v1
+			v2 := v[2]
+			t2 := tau * v2
+			v3 := v[3]
+			t3 := tau * v3
+			v4 := v[4]
+			t4 := tau * v4
+			v5 := v[5]
+			t5 := tau * v5
+			v6 := v[6]
+			t6 := tau * v6
+			v7 := v[7]
+			t7 := tau * v7
+			v8 := v[8]
+			t8 := tau * v8
+			for j := 0; j < n; j++ {
+				sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] +
+					v5*c[5*ldc+j] + v6*c[6*ldc+j] + v7*c[7*ldc+j] + v8*c[8*ldc+j]
+				c[j] -= sum * t0
+				c[ldc+j] -= sum * t1
+				c[2*ldc+j] -= sum * t2
+				c[3*ldc+j] -= sum * t3
+				c[4*ldc+j] -= sum * t4
+				c[5*ldc+j] -= sum * t5
+				c[6*ldc+j] -= sum * t6
+				c[7*ldc+j] -= sum * t7
+				c[8*ldc+j] -= sum * t8
+			}
+			return
+
+		case 10: // Special code for 10×10 Householder matrix.
+			v0 := v[0]
+			t0 := tau * v0
+			v1 := v[1]
+			t1 := tau * v1
+			v2 := v[2]
+			t2 := tau * v2
+			v3 := v[3]
+			t3 := tau * v3
+			v4 := v[4]
+			t4 := tau * v4
+			v5 := v[5]
+			t5 := tau * v5
+			v6 := v[6]
+			t6 := tau * v6
+			v7 := v[7]
+			t7 := tau * v7
+			v8 := v[8]
+			t8 := tau * v8
+			v9 := v[9]
+			t9 := tau * v9
+			for j := 0; j < n; j++ {
+				sum := v0*c[j] + v1*c[ldc+j] + v2*c[2*ldc+j] + v3*c[3*ldc+j] + v4*c[4*ldc+j] +
+					v5*c[5*ldc+j] + v6*c[6*ldc+j] + v7*c[7*ldc+j] + v8*c[8*ldc+j] + v9*c[9*ldc+j]
+				c[j] -= sum * t0
+				c[ldc+j] -= sum * t1
+				c[2*ldc+j] -= sum * t2
+				c[3*ldc+j] -= sum * t3
+				c[4*ldc+j] -= sum * t4
+				c[5*ldc+j] -= sum * t5
+				c[6*ldc+j] -= sum * t6
+				c[7*ldc+j] -= sum * t7
+				c[8*ldc+j] -= sum * t8
+				c[9*ldc+j] -= sum * t9
+			}
+			return
+		}
+	}
+
+	// Form C * H, where H has order n.
+	switch n {
+	default: // Code for general n (n > 10): delegate to Dlarf.
+		impl.Dlarf(side, m, n, v, 1, tau, c, ldc, work)
+		return
+
+	case 0: // No-op for zero size matrix (n == 0 has already returned above).
+		return
+
+	case 1: // Special code for 1×1 Householder matrix.
+		t0 := 1 - tau*v[0]*v[0]
+		for j := 0; j < m; j++ {
+			c[j*ldc] *= t0
+		}
+		return
+
+	case 2: // Special code for 2×2 Householder matrix.
+		v0 := v[0]
+		t0 := tau * v0
+		v1 := v[1]
+		t1 := tau * v1
+		for j := 0; j < m; j++ {
+			cs := c[j*ldc:]
+			sum := v0*cs[0] + v1*cs[1]
+			cs[0] -= sum * t0
+			cs[1] -= sum * t1
+		}
+		return
+
+	case 3: // Special code for 3×3 Householder matrix.
+		v0 := v[0]
+		t0 := tau * v0
+		v1 := v[1]
+		t1 := tau * v1
+		v2 := v[2]
+		t2 := tau * v2
+		for j := 0; j < m; j++ {
+			cs := c[j*ldc:]
+			sum := v0*cs[0] + v1*cs[1] + v2*cs[2]
+			cs[0] -= sum * t0
+			cs[1] -= sum * t1
+			cs[2] -= sum * t2
+		}
+		return
+
+	case 4: // Special code for 4×4 Householder matrix.
+		v0 := v[0]
+		t0 := tau * v0
+		v1 := v[1]
+		t1 := tau * v1
+		v2 := v[2]
+		t2 := tau * v2
+		v3 := v[3]
+		t3 := tau * v3
+		for j := 0; j < m; j++ {
+			cs := c[j*ldc:]
+			sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3]
+			cs[0] -= sum * t0
+			cs[1] -= sum * t1
+			cs[2] -= sum * t2
+			cs[3] -= sum * t3
+		}
+		return
+
+	case 5: // Special code for 5×5 Householder matrix.
+		v0 := v[0]
+		t0 := tau * v0
+		v1 := v[1]
+		t1 := tau * v1
+		v2 := v[2]
+		t2 := tau * v2
+		v3 := v[3]
+		t3 := tau * v3
+		v4 := v[4]
+		t4 := tau * v4
+		for j := 0; j < m; j++ {
+			cs := c[j*ldc:]
+			sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4]
+			cs[0] -= sum * t0
+			cs[1] -= sum * t1
+			cs[2] -= sum * t2
+			cs[3] -= sum * t3
+			cs[4] -= sum * t4
+		}
+		return
+
+	case 6: // Special code for 6×6 Householder matrix.
+		v0 := v[0]
+		t0 := tau * v0
+		v1 := v[1]
+		t1 := tau * v1
+		v2 := v[2]
+		t2 := tau * v2
+		v3 := v[3]
+		t3 := tau * v3
+		v4 := v[4]
+		t4 := tau * v4
+		v5 := v[5]
+		t5 := tau * v5
+		for j := 0; j < m; j++ {
+			cs := c[j*ldc:]
+			sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] + v5*cs[5]
+			cs[0] -= sum * t0
+			cs[1] -= sum * t1
+			cs[2] -= sum * t2
+			cs[3] -= sum * t3
+			cs[4] -= sum * t4
+			cs[5] -= sum * t5
+		}
+		return
+
+	case 7: // Special code for 7×7 Householder matrix.
+		v0 := v[0]
+		t0 := tau * v0
+		v1 := v[1]
+		t1 := tau * v1
+		v2 := v[2]
+		t2 := tau * v2
+		v3 := v[3]
+		t3 := tau * v3
+		v4 := v[4]
+		t4 := tau * v4
+		v5 := v[5]
+		t5 := tau * v5
+		v6 := v[6]
+		t6 := tau * v6
+		for j := 0; j < m; j++ {
+			cs := c[j*ldc:]
+			sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] +
+				v5*cs[5] + v6*cs[6]
+			cs[0] -= sum * t0
+			cs[1] -= sum * t1
+			cs[2] -= sum * t2
+			cs[3] -= sum * t3
+			cs[4] -= sum * t4
+			cs[5] -= sum * t5
+			cs[6] -= sum * t6
+		}
+		return
+
+	case 8: // Special code for 8×8 Householder matrix.
+		v0 := v[0]
+		t0 := tau * v0
+		v1 := v[1]
+		t1 := tau * v1
+		v2 := v[2]
+		t2 := tau * v2
+		v3 := v[3]
+		t3 := tau * v3
+		v4 := v[4]
+		t4 := tau * v4
+		v5 := v[5]
+		t5 := tau * v5
+		v6 := v[6]
+		t6 := tau * v6
+		v7 := v[7]
+		t7 := tau * v7
+		for j := 0; j < m; j++ {
+			cs := c[j*ldc:]
+			sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] +
+				v5*cs[5] + v6*cs[6] + v7*cs[7]
+			cs[0] -= sum * t0
+			cs[1] -= sum * t1
+			cs[2] -= sum * t2
+			cs[3] -= sum * t3
+			cs[4] -= sum * t4
+			cs[5] -= sum * t5
+			cs[6] -= sum * t6
+			cs[7] -= sum * t7
+		}
+		return
+
+	case 9: // Special code for 9×9 Householder matrix.
+		v0 := v[0]
+		t0 := tau * v0
+		v1 := v[1]
+		t1 := tau * v1
+		v2 := v[2]
+		t2 := tau * v2
+		v3 := v[3]
+		t3 := tau * v3
+		v4 := v[4]
+		t4 := tau * v4
+		v5 := v[5]
+		t5 := tau * v5
+		v6 := v[6]
+		t6 := tau * v6
+		v7 := v[7]
+		t7 := tau * v7
+		v8 := v[8]
+		t8 := tau * v8
+		for j := 0; j < m; j++ {
+			cs := c[j*ldc:]
+			sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] +
+				v5*cs[5] + v6*cs[6] + v7*cs[7] + v8*cs[8]
+			cs[0] -= sum * t0
+			cs[1] -= sum * t1
+			cs[2] -= sum * t2
+			cs[3] -= sum * t3
+			cs[4] -= sum * t4
+			cs[5] -= sum * t5
+			cs[6] -= sum * t6
+			cs[7] -= sum * t7
+			cs[8] -= sum * t8
+		}
+		return
+
+	case 10: // Special code for 10×10 Householder matrix.
+		v0 := v[0]
+		t0 := tau * v0
+		v1 := v[1]
+		t1 := tau * v1
+		v2 := v[2]
+		t2 := tau * v2
+		v3 := v[3]
+		t3 := tau * v3
+		v4 := v[4]
+		t4 := tau * v4
+		v5 := v[5]
+		t5 := tau * v5
+		v6 := v[6]
+		t6 := tau * v6
+		v7 := v[7]
+		t7 := tau * v7
+		v8 := v[8]
+		t8 := tau * v8
+		v9 := v[9]
+		t9 := tau * v9
+		for j := 0; j < m; j++ {
+			cs := c[j*ldc:]
+			sum := v0*cs[0] + v1*cs[1] + v2*cs[2] + v3*cs[3] + v4*cs[4] +
+				v5*cs[5] + v6*cs[6] + v7*cs[7] + v8*cs[8] + v9*cs[9]
+			cs[0] -= sum * t0
+			cs[1] -= sum * t1
+			cs[2] -= sum * t2
+			cs[3] -= sum * t3
+			cs[4] -= sum * t4
+			cs[5] -= sum * t5
+			cs[6] -= sum * t6
+			cs[7] -= sum * t7
+			cs[8] -= sum * t8
+			cs[9] -= sum * t9
+		}
+		return
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlartg.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlartg.go
new file mode 100644
index 00000000000..93416c6f5f1
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlartg.go
@@ -0,0 +1,73 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlartg generates a plane rotation so that
+//
+//	[ cs sn] * [f] = [r]
+//	[-sn cs]   [g]   [0]
+//
+// where cs*cs + sn*sn = 1.
+//
+// This is a more accurate version of BLAS Drotg that uses scaling to avoid
+// overflow or underflow, with the other differences that
+//   - cs >= 0
+//   - if g = 0, then cs = 1, sn = 0 and r = f
+//   - if f = 0 and g != 0, then cs = 0, sn = sign(1,g) and r = |g|
+//
+// Dlartg is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlartg(f, g float64) (cs, sn, r float64) {
+	// Implementation based on Supplemental Material to:
+	//
+	// Edward Anderson
+	// Algorithm 978: Safe Scaling in the Level 1 BLAS
+	// ACM Trans. Math. Softw. 44, 1, Article 12 (2017)
+	// DOI: https://doi.org/10.1145/3061665
+	//
+	// For further details see:
+	//
+	// W. Pereira, A. Lotfi, J. Langou
+	// Numerical analysis of Givens rotation
+	// DOI: https://doi.org/10.48550/arXiv.2211.04010
+
+	if g == 0 {
+		return 1, 0, f
+	}
+
+	g1 := math.Abs(g)
+
+	if f == 0 {
+		return 0, math.Copysign(1, g), g1
+	}
+
+	const safmin = dlamchS
+	const safmax = 1 / safmin
+	rtmin := math.Sqrt(safmin)
+	rtmax := math.Sqrt(safmax / 2)
+
+	f1 := math.Abs(f)
+
+	if rtmin < f1 && f1 < rtmax && rtmin < g1 && g1 < rtmax { // |f| and |g| both lie in (rtmin, rtmax): compute without scaling
+		d := math.Sqrt(f*f + g*g)
+		cs = f1 / d
+		r = math.Copysign(d, f) // r takes the sign of f
+		sn = g / r
+
+		return cs, sn, r
+	}
+
+	u := math.Min(math.Max(safmin, math.Max(f1, g1)), safmax) // scale factor: max(|f|, |g|) clamped to [safmin, safmax]
+	fs := f / u
+	gs := g / u
+	d := math.Sqrt(fs*fs + gs*gs)
+	cs = math.Abs(fs) / d
+	r = math.Copysign(d, f)
+	sn = gs / r
+	r *= u // undo the scaling by u
+
+	return cs, sn, r
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlas2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlas2.go
new file mode 100644
index 00000000000..a819fa3536c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlas2.go
@@ -0,0 +1,45 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlas2 computes the singular values of the 2×2 upper triangular matrix
+//
+//	[F G]
+//	[0 H]
+//
+// The smaller singular value ssmin and the larger ssmax are returned in that order.
+//
+// Dlas2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlas2(f, g, h float64) (ssmin, ssmax float64) {
+	fa := math.Abs(f)
+	ga := math.Abs(g)
+	ha := math.Abs(h)
+	fhmin := math.Min(fa, ha)
+	fhmax := math.Max(fa, ha)
+	if fhmin == 0 { // F or H is zero: the smaller singular value is returned as 0
+		if fhmax == 0 {
+			return 0, ga
+		}
+		v := math.Min(fhmax, ga) / math.Max(fhmax, ga) // ratio <= 1 of the two remaining magnitudes
+		return 0, math.Max(fhmax, ga) * math.Sqrt(1+v*v)
+	}
+	if ga < fhmax {
+		as := 1 + fhmin/fhmax
+		at := (fhmax - fhmin) / fhmax
+		au := (ga / fhmax) * (ga / fhmax)
+		c := 2 / (math.Sqrt(as*as+au) + math.Sqrt(at*at+au))
+		return fhmin * c, fhmax / c
+	}
+	au := fhmax / ga
+	if au == 0 { // fhmax/ga evaluated to zero, i.e. ga is vastly larger than fhmax
+		return fhmin * fhmax / ga, ga
+	}
+	as := 1 + fhmin/fhmax
+	at := (fhmax - fhmin) / fhmax
+	c := 1 / (math.Sqrt(1+(as*au)*(as*au)) + math.Sqrt(1+(at*au)*(at*au)))
+	return 2 * (fhmin * c) * au, ga / (c + c)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlascl.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlascl.go
new file mode 100644
index 00000000000..61c4eb79cb3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlascl.go
@@ -0,0 +1,111 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlascl multiplies the m×n matrix A by the scalar cto/cfrom in one or more
+// passes. kind must be lapack.General, lapack.UpperTri or lapack.LowerTri.
+// kl and ku are not referenced. cfrom must not be zero, and cto and cfrom
+// must not be NaN, otherwise Dlascl will panic.
+//
+// Dlascl is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlascl(kind lapack.MatrixType, kl, ku int, cfrom, cto float64, m, n int, a []float64, lda int) {
+	switch kind {
+	default:
+		panic(badMatrixType)
+	case 'H', 'B', 'Q', 'Z': // See dlascl.f.
+		panic("not implemented")
+	case lapack.General, lapack.UpperTri, lapack.LowerTri:
+		if lda < max(1, n) {
+			panic(badLdA)
+		}
+	}
+	switch {
+	case cfrom == 0:
+		panic(zeroCFrom)
+	case math.IsNaN(cfrom):
+		panic(nanCFrom)
+	case math.IsNaN(cto):
+		panic(nanCTo)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	}
+
+	if n == 0 || m == 0 {
+		return
+	}
+
+	switch kind {
+	case lapack.General, lapack.UpperTri, lapack.LowerTri:
+		if len(a) < (m-1)*lda+n {
+			panic(shortA)
+		}
+	}
+
+	smlnum := dlamchS
+	bignum := 1 / smlnum
+	cfromc := cfrom
+	ctoc := cto
+	cfrom1 := cfromc * smlnum
+	for { // each pass multiplies A by mul; the loop ends on the pass that sets done
+		var done bool
+		var mul, ctol float64
+		if cfrom1 == cfromc {
+			// cfromc is inf.
+			mul = ctoc / cfromc
+			done = true
+			ctol = ctoc
+		} else {
+			ctol = ctoc / bignum
+			if ctol == ctoc {
+				// ctoc is either 0 or inf.
+				mul = ctoc
+				done = true
+				cfromc = 1
+			} else if math.Abs(cfrom1) > math.Abs(ctoc) && ctoc != 0 {
+				mul = smlnum // apply a factor of smlnum this pass
+				done = false
+				cfromc = cfrom1
+			} else if math.Abs(ctol) > math.Abs(cfromc) {
+				mul = bignum // apply a factor of bignum this pass
+				done = false
+				ctoc = ctol
+			} else {
+				mul = ctoc / cfromc // remaining ratio, applied in this final pass
+				done = true
+			}
+		}
+		switch kind {
+		case lapack.General:
+			for i := 0; i < m; i++ {
+				for j := 0; j < n; j++ {
+					a[i*lda+j] = a[i*lda+j] * mul
+				}
+			}
+		case lapack.UpperTri:
+			for i := 0; i < m; i++ {
+				for j := i; j < n; j++ { // diagonal and above only
+					a[i*lda+j] = a[i*lda+j] * mul
+				}
+			}
+		case lapack.LowerTri:
+			for i := 0; i < m; i++ {
+				for j := 0; j <= min(i, n-1); j++ { // diagonal and below only
+					a[i*lda+j] = a[i*lda+j] * mul
+				}
+			}
+		}
+		if done {
+			break
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaset.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaset.go
new file mode 100644
index 00000000000..b8b6b0f4dbb
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaset.go
@@ -0,0 +1,58 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dlaset sets the off-diagonal elements of the m×n matrix A to alpha, and the
+// diagonal elements to beta. If uplo == blas.Upper, only the elements in the
+// upper triangular part are set. If uplo == blas.Lower, only the elements in
+// the lower triangular part are set. Otherwise, all of the elements of A are
+// set. Dlaset panics if m < 0, n < 0, lda < max(1, n) or a is too short.
+//
+// Dlaset is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaset(uplo blas.Uplo, m, n int, alpha, beta float64, a []float64, lda int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	minmn := min(m, n)
+	if minmn == 0 {
+		return
+	}
+
+	if len(a) < (m-1)*lda+n {
+		panic(shortA)
+	}
+
+	switch uplo {
+	case blas.Upper:
+		for i := 0; i < m; i++ {
+			for j := i + 1; j < n; j++ { // strictly above the diagonal
+				a[i*lda+j] = alpha
+			}
+		}
+	case blas.Lower:
+		for i := 0; i < m; i++ {
+			for j := 0; j < min(i, n); j++ { // strictly below the diagonal
+				a[i*lda+j] = alpha
+			}
+		}
+	default:
+		for i := 0; i < m; i++ {
+			for j := 0; j < n; j++ {
+				a[i*lda+j] = alpha
+			}
+		}
+	}
+	for i := 0; i < minmn; i++ { // the diagonal is set to beta for every uplo
+		a[i*lda+i] = beta
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq1.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq1.go
new file mode 100644
index 00000000000..1f1d1dc42e3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq1.go
@@ -0,0 +1,100 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlasq1 computes the singular values of an n×n bidiagonal matrix with diagonal
+// d and off-diagonal e. On exit, d contains the singular values in decreasing
+// order, and e is overwritten. d must have length at least n, e at least n-1
+// and work at least 4*n, otherwise Dlasq1 will panic. info is 0 on success or
+// the non-zero status returned by Dlasq2.
+//
+// Dlasq1 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasq1(n int, d, e, work []float64) (info int) {
+	if n < 0 {
+		panic(nLT0)
+	}
+
+	if n == 0 {
+		return info
+	}
+
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(work) < 4*n:
+		panic(shortWork)
+	}
+
+	if n == 1 {
+		d[0] = math.Abs(d[0])
+		return info
+	}
+
+	if n == 2 {
+		d[1], d[0] = impl.Dlas2(d[0], e[0], d[1]) // Dlas2 returns (ssmin, ssmax), so d[0] receives the larger value
+		return info
+	}
+
+	// Estimate the largest singular value.
+	var sigmx float64
+	for i := 0; i < n-1; i++ {
+		d[i] = math.Abs(d[i])
+		sigmx = math.Max(sigmx, math.Abs(e[i]))
+	}
+	d[n-1] = math.Abs(d[n-1])
+	// Early return if sigmx is zero (matrix is already diagonal).
+	if sigmx == 0 {
+		impl.Dlasrt(lapack.SortDecreasing, n, d)
+		return info
+	}
+
+	for i := 0; i < n; i++ {
+		sigmx = math.Max(sigmx, d[i])
+	}
+
+	// Copy D and E into WORK (in the Z format) and scale (squaring the
+	// input data makes scaling by a power of the radix pointless).
+
+	eps := dlamchP
+	safmin := dlamchS
+	scale := math.Sqrt(eps / safmin)
+	bi := blas64.Implementation()
+	bi.Dcopy(n, d, 1, work, 2)       // d goes to the even positions of work
+	bi.Dcopy(n-1, e, 1, work[1:], 2) // e goes to the odd positions of work
+	impl.Dlascl(lapack.General, 0, 0, sigmx, scale, 2*n-1, 1, work, 1)
+
+	// Compute the q's and e's.
+	for i := 0; i < 2*n-1; i++ {
+		work[i] *= work[i]
+	}
+	work[2*n-1] = 0
+
+	info = impl.Dlasq2(n, work)
+	if info == 0 {
+		for i := 0; i < n; i++ {
+			d[i] = math.Sqrt(work[i])
+		}
+		impl.Dlascl(lapack.General, 0, 0, scale, sigmx, n, 1, d, 1) // undo the scaling applied before squaring
+	} else if info == 2 {
+		// Maximum number of iterations exceeded. Move data from work
+		// into D and E so the calling subroutine can try to finish.
+		for i := 0; i < n; i++ {
+			d[i] = math.Sqrt(work[2*i])
+			e[i] = math.Sqrt(work[2*i+1]) // NOTE(review): i reaches n-1, but only len(e) >= n-1 is checked above — confirm e has length n on this path
+		}
+		impl.Dlascl(lapack.General, 0, 0, scale, sigmx, n, 1, d, 1)
+		impl.Dlascl(lapack.General, 0, 0, scale, sigmx, n, 1, e, 1)
+	}
+	return info
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq2.go
new file mode 100644
index 00000000000..e3870b1d962
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq2.go
@@ -0,0 +1,370 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlasq2 computes all the eigenvalues of the symmetric positive
+// definite tridiagonal matrix associated with the qd array Z. Eigenvalues
+// are computed to high relative accuracy avoiding denormalization, underflow
+// and overflow.
+//
+// To see the relation of Z to the tridiagonal matrix, let L be a
+// unit lower bidiagonal matrix with sub-diagonals Z(2,4,6,...) and
+// let U be an upper bidiagonal matrix with 1's above and diagonal
+// Z(1,3,5,...). The tridiagonal is L*U or, if you prefer, the
+// symmetric tridiagonal to which it is similar.
+//
+// info reports the exit status. The return codes have the following meanings:
+//
+//	0: The algorithm completed successfully.
+//	1: A split was marked by a positive value in e.
+//	2: Current block of Z not diagonalized after 100*n iterations (in inner
+//	   while loop). On exit Z holds a qd array with the same eigenvalues as
+//	   the given Z.
+//	3: Termination criterion of outer while loop not met (program created more
+//	   than N unreduced blocks).
+//
+// z must have length at least 4*n, and must not contain any negative elements.
+// Dlasq2 will panic otherwise.
+//
+// Dlasq2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasq2(n int, z []float64) (info int) {
+	if n < 0 {
+		panic(nLT0)
+	}
+
+	if n == 0 {
+		return info
+	}
+
+	if len(z) < 4*n {
+		panic(shortZ)
+	}
+
+	if n == 1 {
+		if z[0] < 0 {
+			panic(negZ)
+		}
+		return info
+	}
+
+	const cbias = 1.5
+
+	eps := dlamchP
+	safmin := dlamchS
+	tol := eps * 100
+	tol2 := tol * tol
+	if n == 2 {
+		if z[1] < 0 || z[2] < 0 {
+			panic(negZ)
+		} else if z[2] > z[0] {
+			z[0], z[2] = z[2], z[0]
+		}
+		z[4] = z[0] + z[1] + z[2]
+		if z[1] > z[2]*tol2 {
+			t := 0.5 * (z[0] - z[2] + z[1])
+			s := z[2] * (z[1] / t)
+			if s <= t {
+				s = z[2] * (z[1] / (t * (1 + math.Sqrt(1+s/t))))
+			} else {
+				s = z[2] * (z[1] / (t + math.Sqrt(t)*math.Sqrt(t+s)))
+			}
+			t = z[0] + s + z[1]
+			z[2] *= z[0] / t
+			z[0] = t
+		}
+		z[1] = z[2]
+		z[5] = z[1] + z[0]
+		return info
+	}
+	// Check for negative data and compute sums of q's and e's.
+	z[2*n-1] = 0
+	emin := z[1]
+	var d, e, qmax float64
+	var i1, n1 int
+	for k := 0; k < 2*(n-1); k += 2 {
+		if z[k] < 0 || z[k+1] < 0 {
+			panic(negZ)
+		}
+		d += z[k]
+		e += z[k+1]
+		qmax = math.Max(qmax, z[k])
+		emin = math.Min(emin, z[k+1])
+	}
+	if z[2*(n-1)] < 0 {
+		panic(negZ)
+	}
+	d += z[2*(n-1)]
+	// Check for diagonality.
+	if e == 0 {
+		for k := 1; k < n; k++ {
+			z[k] = z[2*k]
+		}
+		impl.Dlasrt(lapack.SortDecreasing, n, z)
+		z[2*(n-1)] = d
+		return info
+	}
+	trace := d + e
+	// Check for zero data.
+	if trace == 0 {
+		z[2*(n-1)] = 0
+		return info
+	}
+	// Rearrange data for locality: Z=(q1,qq1,e1,ee1,q2,qq2,e2,ee2,...).
+	for k := 2 * n; k >= 2; k -= 2 {
+		z[2*k-1] = 0
+		z[2*k-2] = z[k-1]
+		z[2*k-3] = 0
+		z[2*k-4] = z[k-2]
+	}
+	i0 := 0
+	n0 := n - 1
+
+	// Reverse the qd-array, if warranted.
+	// z[4*i0-3] --> z[4*(i0+1)-3-1] --> z[4*i0]
+	if cbias*z[4*i0] < z[4*n0] {
+		ipn4Out := 4 * (i0 + n0 + 2)
+		for i4loop := 4 * (i0 + 1); i4loop <= 2*(i0+n0+1); i4loop += 4 {
+			i4 := i4loop - 1
+			ipn4 := ipn4Out - 1
+			z[i4-3], z[ipn4-i4-4] = z[ipn4-i4-4], z[i4-3]
+			z[i4-1], z[ipn4-i4-6] = z[ipn4-i4-6], z[i4-1]
+		}
+	}
+
+	// Initial split checking via dqd and Li's test.
+	pp := 0
+	for k := 0; k < 2; k++ {
+		d = z[4*n0+pp]
+		for i4loop := 4*n0 + pp; i4loop >= 4*(i0+1)+pp; i4loop -= 4 {
+			i4 := i4loop - 1
+			if z[i4-1] <= tol2*d {
+				z[i4-1] = math.Copysign(0, -1)
+				d = z[i4-3]
+			} else {
+				d = z[i4-3] * (d / (d + z[i4-1]))
+			}
+		}
+		// dqd maps Z to ZZ plus Li's test.
+		emin = z[4*(i0+1)+pp]
+		d = z[4*i0+pp]
+		for i4loop := 4*(i0+1) + pp; i4loop <= 4*n0+pp; i4loop += 4 {
+			i4 := i4loop - 1
+			z[i4-2*pp-2] = d + z[i4-1]
+			if z[i4-1] <= tol2*d {
+				z[i4-1] = math.Copysign(0, -1)
+				z[i4-2*pp-2] = d
+				z[i4-2*pp] = 0
+				d = z[i4+1]
+			} else if safmin*z[i4+1] < z[i4-2*pp-2] && safmin*z[i4-2*pp-2] < z[i4+1] {
+				tmp := z[i4+1] / z[i4-2*pp-2]
+				z[i4-2*pp] = z[i4-1] * tmp
+				d *= tmp
+			} else {
+				z[i4-2*pp] = z[i4+1] * (z[i4-1] / z[i4-2*pp-2])
+				d = z[i4+1] * (d / z[i4-2*pp-2])
+			}
+			emin = math.Min(emin, z[i4-2*pp])
+		}
+		z[4*(n0+1)-pp-3] = d
+
+		// Now find qmax.
+		qmax = z[4*(i0+1)-pp-3]
+		for i4loop := 4*(i0+1) - pp + 2; i4loop <= 4*(n0+1)+pp-2; i4loop += 4 {
+			i4 := i4loop - 1
+			qmax = math.Max(qmax, z[i4])
+		}
+		// Prepare for the next iteration on K.
+		pp = 1 - pp
+	}
+
+	// Initialise variables to pass to DLASQ3.
+	var ttype int
+	var dmin1, dmin2, dn, dn1, dn2, g, tau float64
+	var tempq float64
+	iter := 2
+	var nFail int
+	nDiv := 2 * (n0 - i0)
+	var i4 int
+outer:
+	for iwhila := 1; iwhila <= n+1; iwhila++ {
+		// Test for completion.
+		if n0 < 0 {
+			// Move q's to the front.
+			for k := 1; k < n; k++ {
+				z[k] = z[4*k]
+			}
+			// Sort and compute sum of eigenvalues.
+			impl.Dlasrt(lapack.SortDecreasing, n, z)
+			e = 0
+			for k := n - 1; k >= 0; k-- {
+				e += z[k]
+			}
+			// Store trace, sum(eigenvalues) and information on performance.
+			z[2*n] = trace
+			z[2*n+1] = e
+			z[2*n+2] = float64(iter)
+			z[2*n+3] = float64(nDiv) / float64(n*n)
+			z[2*n+4] = 100 * float64(nFail) / float64(iter)
+			return info
+		}
+
+		// While array unfinished do
+		// e[n0] holds the value of sigma when submatrix in i0:n0
+		// splits from the rest of the array, but is negated.
+		var desig float64
+		var sigma float64
+		if n0 != n-1 {
+			sigma = -z[4*(n0+1)-2]
+		}
+		if sigma < 0 {
+			info = 1
+			return info
+		}
+		// Find last unreduced submatrix's top index i0, find qmax and
+		// emin. Find Gershgorin-type bound if Q's much greater than E's.
+		var emax float64
+		if n0 > i0 {
+			emin = math.Abs(z[4*(n0+1)-6])
+		} else {
+			emin = 0
+		}
+		qmin := z[4*(n0+1)-4]
+		qmax = qmin
+		zSmall := false
+		for i4loop := 4 * (n0 + 1); i4loop >= 8; i4loop -= 4 {
+			i4 = i4loop - 1
+			if z[i4-5] <= 0 {
+				zSmall = true
+				break
+			}
+			if qmin >= 4*emax {
+				qmin = math.Min(qmin, z[i4-3])
+				emax = math.Max(emax, z[i4-5])
+			}
+			qmax = math.Max(qmax, z[i4-7]+z[i4-5])
+			emin = math.Min(emin, z[i4-5])
+		}
+		if !zSmall {
+			i4 = 3
+		}
+		i0 = (i4+1)/4 - 1
+		pp = 0
+		if n0-i0 > 1 {
+			dee := z[4*i0]
+			deemin := dee
+			kmin := i0
+			for i4loop := 4*(i0+1) + 1; i4loop <= 4*(n0+1)-3; i4loop += 4 {
+				i4 := i4loop - 1
+				dee = z[i4] * (dee / (dee + z[i4-2]))
+				if dee <= deemin {
+					deemin = dee
+					kmin = (i4+4)/4 - 1
+				}
+			}
+			if (kmin-i0)*2 < n0-kmin && deemin <= 0.5*z[4*n0] {
+				ipn4Out := 4 * (i0 + n0 + 2)
+				pp = 2
+				for i4loop := 4 * (i0 + 1); i4loop <= 2*(i0+n0+1); i4loop += 4 {
+					i4 := i4loop - 1
+					ipn4 := ipn4Out - 1
+					z[i4-3], z[ipn4-i4-4] = z[ipn4-i4-4], z[i4-3]
+					z[i4-2], z[ipn4-i4-3] = z[ipn4-i4-3], z[i4-2]
+					z[i4-1], z[ipn4-i4-6] = z[ipn4-i4-6], z[i4-1]
+					z[i4], z[ipn4-i4-5] = z[ipn4-i4-5], z[i4]
+				}
+			}
+		}
+		// Put -(initial shift) into DMIN.
+		dmin := -math.Max(0, qmin-2*math.Sqrt(qmin)*math.Sqrt(emax))
+
+		// Now i0:n0 is unreduced.
+		// PP = 0 for ping, PP = 1 for pong.
+		// PP = 2 indicates that flipping was applied to the Z array and
+		// 		that the tests for deflation upon entry in Dlasq3 should
+		// 		not be performed.
+		nbig := 100 * (n0 - i0 + 1)
+		for iwhilb := 0; iwhilb < nbig; iwhilb++ {
+			if i0 > n0 {
+				continue outer
+			}
+
+			// While submatrix unfinished take a good dqds step.
+			i0, n0, pp, dmin, sigma, desig, qmax, nFail, iter, nDiv, ttype, dmin1, dmin2, dn, dn1, dn2, g, tau =
+				impl.Dlasq3(i0, n0, z, pp, dmin, sigma, desig, qmax, nFail, iter, nDiv, ttype, dmin1, dmin2, dn, dn1, dn2, g, tau)
+
+			pp = 1 - pp
+			// When emin is very small check for splits.
+			if pp == 0 && n0-i0 >= 3 {
+				if z[4*(n0+1)-1] <= tol2*qmax || z[4*(n0+1)-2] <= tol2*sigma {
+					splt := i0 - 1
+					qmax = z[4*i0]
+					emin = z[4*(i0+1)-2]
+					oldemn := z[4*(i0+1)-1]
+					for i4loop := 4 * (i0 + 1); i4loop <= 4*(n0-2); i4loop += 4 {
+						i4 := i4loop - 1
+						if z[i4] <= tol2*z[i4-3] || z[i4-1] <= tol2*sigma {
+							z[i4-1] = -sigma
+							splt = i4 / 4
+							qmax = 0
+							emin = z[i4+3]
+							oldemn = z[i4+4]
+						} else {
+							qmax = math.Max(qmax, z[i4+1])
+							emin = math.Min(emin, z[i4-1])
+							oldemn = math.Min(oldemn, z[i4])
+						}
+					}
+					z[4*(n0+1)-2] = emin
+					z[4*(n0+1)-1] = oldemn
+					i0 = splt + 1
+				}
+			}
+		}
+		// Maximum number of iterations exceeded, restore the shift
+		// sigma and place the new d's and e's in a qd array.
+		// This might need to be done for several blocks.
+		info = 2
+		i1 = i0
+		for {
+			tempq = z[4*i0]
+			z[4*i0] += sigma
+			for k := i0 + 1; k <= n0; k++ {
+				tempe := z[4*(k+1)-6]
+				z[4*(k+1)-6] *= tempq / z[4*(k+1)-8]
+				tempq = z[4*k]
+				z[4*k] += sigma + tempe - z[4*(k+1)-6]
+			}
+			// Prepare to do this on the previous block if there is one.
+			if i1 <= 0 {
+				break
+			}
+			n1 = i1 - 1
+			for i1 >= 1 && z[4*(i1+1)-6] >= 0 {
+				i1 -= 1
+			}
+			sigma = -z[4*(n1+1)-2]
+		}
+		for k := 0; k < n; k++ {
+			z[2*k] = z[4*k]
+			// Only the block 1..N0 is unfinished.  The rest of the e's
+			// must be essentially zero, although sometimes other data
+			// has been stored in them.
+			if k < n0 {
+				z[2*(k+1)-1] = z[4*(k+1)-1]
+			} else {
+				z[2*(k+1)] = 0
+			}
+		}
+		return info
+	}
+	info = 3
+	return info
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq3.go
new file mode 100644
index 00000000000..a05e94ef173
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq3.go
@@ -0,0 +1,172 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlasq3 checks for deflation, computes a shift (tau) and calls dqds.
+// In case of failure it changes shifts, and tries again until output
+// is positive.
+//
+// Dlasq3 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasq3(i0, n0 int, z []float64, pp int, dmin, sigma, desig, qmax float64, nFail, iter, nDiv int, ttype int, dmin1, dmin2, dn, dn1, dn2, g, tau float64) (
+	i0Out, n0Out, ppOut int, dminOut, sigmaOut, desigOut, qmaxOut float64, nFailOut, iterOut, nDivOut, ttypeOut int, dmin1Out, dmin2Out, dnOut, dn1Out, dn2Out, gOut, tauOut float64) {
+	switch {
+	case i0 < 0:
+		panic(i0LT0)
+	case n0 < 0:
+		panic(n0LT0)
+	case len(z) < 4*n0:
+		panic(shortZ)
+	case pp != 0 && pp != 1 && pp != 2:
+		panic(badPp)
+	}
+
+	const cbias = 1.5
+
+	n0in := n0
+	eps := dlamchP
+	tol := eps * 100
+	tol2 := tol * tol
+	var nn int
+	var t float64
+	for {
+		if n0 < i0 {
+			return i0, n0, pp, dmin, sigma, desig, qmax, nFail, iter, nDiv, ttype, dmin1, dmin2, dn, dn1, dn2, g, tau
+		}
+		if n0 == i0 {
+			z[4*(n0+1)-4] = z[4*(n0+1)+pp-4] + sigma
+			n0--
+			continue
+		}
+		nn = 4*(n0+1) + pp - 1
+		if n0 != i0+1 {
+			// Check whether e[n0-1] is negligible, 1 eigenvalue.
+			if z[nn-5] > tol2*(sigma+z[nn-3]) && z[nn-2*pp-4] > tol2*z[nn-7] {
+				// Check whether e[n0-2] is negligible, 2 eigenvalues.
+				if z[nn-9] > tol2*sigma && z[nn-2*pp-8] > tol2*z[nn-11] {
+					break
+				}
+			} else {
+				z[4*(n0+1)-4] = z[4*(n0+1)+pp-4] + sigma
+				n0--
+				continue
+			}
+		}
+		if z[nn-3] > z[nn-7] {
+			z[nn-3], z[nn-7] = z[nn-7], z[nn-3]
+		}
+		t = 0.5 * (z[nn-7] - z[nn-3] + z[nn-5])
+		if z[nn-5] > z[nn-3]*tol2 && t != 0 {
+			s := z[nn-3] * (z[nn-5] / t)
+			if s <= t {
+				s = z[nn-3] * (z[nn-5] / (t * (1 + math.Sqrt(1+s/t))))
+			} else {
+				s = z[nn-3] * (z[nn-5] / (t + math.Sqrt(t)*math.Sqrt(t+s)))
+			}
+			t = z[nn-7] + (s + z[nn-5])
+			z[nn-3] *= z[nn-7] / t
+			z[nn-7] = t
+		}
+		z[4*(n0+1)-8] = z[nn-7] + sigma
+		z[4*(n0+1)-4] = z[nn-3] + sigma
+		n0 -= 2
+	}
+	if pp == 2 {
+		pp = 0
+	}
+
+	// Reverse the qd-array, if warranted.
+	if dmin <= 0 || n0 < n0in {
+		if cbias*z[4*(i0+1)+pp-4] < z[4*(n0+1)+pp-4] {
+			ipn4Out := 4 * (i0 + n0 + 2)
+			for j4loop := 4 * (i0 + 1); j4loop <= 2*((i0+1)+(n0+1)-1); j4loop += 4 {
+				ipn4 := ipn4Out - 1
+				j4 := j4loop - 1
+
+				z[j4-3], z[ipn4-j4-4] = z[ipn4-j4-4], z[j4-3]
+				z[j4-2], z[ipn4-j4-3] = z[ipn4-j4-3], z[j4-2]
+				z[j4-1], z[ipn4-j4-6] = z[ipn4-j4-6], z[j4-1]
+				z[j4], z[ipn4-j4-5] = z[ipn4-j4-5], z[j4]
+			}
+			if n0-i0 <= 4 {
+				z[4*(n0+1)+pp-2] = z[4*(i0+1)+pp-2]
+				z[4*(n0+1)-pp-1] = z[4*(i0+1)-pp-1]
+			}
+			dmin2 = math.Min(dmin2, z[4*(i0+1)-pp-2])
+			z[4*(n0+1)+pp-2] = math.Min(math.Min(z[4*(n0+1)+pp-2], z[4*(i0+1)+pp-2]), z[4*(i0+1)+pp+2])
+			z[4*(n0+1)-pp-1] = math.Min(math.Min(z[4*(n0+1)-pp-1], z[4*(i0+1)-pp-1]), z[4*(i0+1)-pp+3])
+			qmax = math.Max(math.Max(qmax, z[4*(i0+1)+pp-4]), z[4*(i0+1)+pp])
+			dmin = math.Copysign(0, -1) // Fortran code has -zero, but -0 in go is 0
+		}
+	}
+
+	// Choose a shift.
+	tau, ttype, g = impl.Dlasq4(i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn, dn1, dn2, tau, ttype, g)
+
+	// Call dqds until dmin > 0.
+loop:
+	for {
+		i0, n0, pp, tau, sigma, dmin, dmin1, dmin2, dn, dn1, dn2 = impl.Dlasq5(i0, n0, z, pp, tau, sigma)
+
+		nDiv += n0 - i0 + 2
+		iter++
+		switch {
+		case dmin >= 0 && dmin1 >= 0:
+			// Success.
+			goto done
+
+		case dmin < 0 && dmin1 > 0 && z[4*n0-pp-1] < tol*(sigma+dn1) && math.Abs(dn) < tol*sigma:
+			// Convergence hidden by negative dn.
+			z[4*n0-pp+1] = 0
+			dmin = 0
+			goto done
+
+		case dmin < 0:
+			// Tau too big. Select new Tau and try again.
+			nFail++
+			if ttype < -22 {
+				// Failed twice. Play it safe.
+				tau = 0
+			} else if dmin1 > 0 {
+				// Late failure. Gives excellent shift.
+				tau = (tau + dmin) * (1 - 2*eps)
+				ttype -= 11
+			} else {
+				// Early failure. Divide by 4.
+				tau = tau / 4
+				ttype -= 12
+			}
+
+		case math.IsNaN(dmin):
+			if tau == 0 {
+				break loop
+			}
+			tau = 0
+
+		default:
+			// Possible underflow. Play it safe.
+			break loop
+		}
+	}
+
+	// Risk of underflow.
+	dmin, dmin1, dmin2, dn, dn1, dn2 = impl.Dlasq6(i0, n0, z, pp)
+	nDiv += n0 - i0 + 2
+	iter++
+	tau = 0
+
+done:
+	if tau < sigma {
+		desig += tau
+		t = sigma + desig
+		desig -= t - sigma
+	} else {
+		t = sigma + tau
+		desig += sigma - (t - tau)
+	}
+	sigma = t
+	return i0, n0, pp, dmin, sigma, desig, qmax, nFail, iter, nDiv, ttype, dmin1, dmin2, dn, dn1, dn2, g, tau
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq4.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq4.go
new file mode 100644
index 00000000000..f6dbb31b98a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq4.go
@@ -0,0 +1,249 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlasq4 computes an approximation to the smallest eigenvalue using values of d
+// from the previous transform.
+// i0, n0, and n0in are zero-indexed.
+//
+// Dlasq4 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasq4(i0, n0 int, z []float64, pp int, n0in int, dmin, dmin1, dmin2, dn, dn1, dn2, tau float64, ttype int, g float64) (tauOut float64, ttypeOut int, gOut float64) {
+	switch {
+	case i0 < 0:
+		panic(i0LT0)
+	case n0 < 0:
+		panic(n0LT0)
+	case len(z) < 4*n0:
+		panic(shortZ)
+	case pp != 0 && pp != 1:
+		panic(badPp)
+	}
+
+	const (
+		cnst1 = 0.563
+		cnst2 = 1.01
+		cnst3 = 1.05
+
+		cnstthird = 0.333 // TODO(btracey): Fix?
+	)
+	// A negative dmin forces the shift to take that absolute value
+	// ttype records the type of shift.
+	if dmin <= 0 {
+		tau = -dmin
+		ttype = -1
+		return tau, ttype, g
+	}
+	nn := 4*(n0+1) + pp - 1 // -1 for zero indexing
+	s := math.NaN()         // Poison s so that failure to take a path below is obvious
+	if n0in == n0 {
+		// No eigenvalues deflated.
+		if dmin == dn || dmin == dn1 {
+			b1 := math.Sqrt(z[nn-3]) * math.Sqrt(z[nn-5])
+			b2 := math.Sqrt(z[nn-7]) * math.Sqrt(z[nn-9])
+			a2 := z[nn-7] + z[nn-5]
+			if dmin == dn && dmin1 == dn1 {
+				gap2 := dmin2 - a2 - dmin2/4
+				var gap1 float64
+				if gap2 > 0 && gap2 > b2 {
+					gap1 = a2 - dn - (b2/gap2)*b2
+				} else {
+					gap1 = a2 - dn - (b1 + b2)
+				}
+				if gap1 > 0 && gap1 > b1 {
+					s = math.Max(dn-(b1/gap1)*b1, 0.5*dmin)
+					ttype = -2
+				} else {
+					s = 0
+					if dn > b1 {
+						s = dn - b1
+					}
+					if a2 > b1+b2 {
+						s = math.Min(s, a2-(b1+b2))
+					}
+					s = math.Max(s, cnstthird*dmin)
+					ttype = -3
+				}
+			} else {
+				ttype = -4
+				s = dmin / 4
+				var gam float64
+				var np int
+				if dmin == dn {
+					gam = dn
+					a2 = 0
+					if z[nn-5] > z[nn-7] {
+						return tau, ttype, g
+					}
+					b2 = z[nn-5] / z[nn-7]
+					np = nn - 9
+				} else {
+					np = nn - 2*pp
+					gam = dn1
+					if z[np-4] > z[np-2] {
+						return tau, ttype, g
+					}
+					a2 = z[np-4] / z[np-2]
+					if z[nn-9] > z[nn-11] {
+						return tau, ttype, g
+					}
+					b2 = z[nn-9] / z[nn-11]
+					np = nn - 13
+				}
+				// Approximate contribution to norm squared from i < nn-1.
+				a2 += b2
+				for i4loop := np + 1; i4loop >= 4*(i0+1)-1+pp; i4loop -= 4 {
+					i4 := i4loop - 1
+					if b2 == 0 {
+						break
+					}
+					b1 = b2
+					if z[i4] > z[i4-2] {
+						return tau, ttype, g
+					}
+					b2 *= z[i4] / z[i4-2]
+					a2 += b2
+					if 100*math.Max(b2, b1) < a2 || cnst1 < a2 {
+						break
+					}
+				}
+				a2 *= cnst3
+				// Rayleigh quotient residual bound.
+				if a2 < cnst1 {
+					s = gam * (1 - math.Sqrt(a2)) / (1 + a2)
+				}
+			}
+		} else if dmin == dn2 {
+			ttype = -5
+			s = dmin / 4
+			// Compute contribution to norm squared from i > nn-2.
+			np := nn - 2*pp
+			b1 := z[np-2]
+			b2 := z[np-6]
+			gam := dn2
+			if z[np-8] > b2 || z[np-4] > b1 {
+				return tau, ttype, g
+			}
+			a2 := (z[np-8] / b2) * (1 + z[np-4]/b1)
+			// Approximate contribution to norm squared from i < nn-2.
+			if n0-i0 > 2 {
+				b2 = z[nn-13] / z[nn-15]
+				a2 += b2
+				for i4loop := (nn + 1) - 17; i4loop >= 4*(i0+1)-1+pp; i4loop -= 4 {
+					i4 := i4loop - 1
+					if b2 == 0 {
+						break
+					}
+					b1 = b2
+					if z[i4] > z[i4-2] {
+						return tau, ttype, g
+					}
+					b2 *= z[i4] / z[i4-2]
+					a2 += b2
+					if 100*math.Max(b2, b1) < a2 || cnst1 < a2 {
+						break
+					}
+				}
+				a2 *= cnst3
+			}
+			if a2 < cnst1 {
+				s = gam * (1 - math.Sqrt(a2)) / (1 + a2)
+			}
+		} else {
+			// Case 6, no information to guide us.
+			if ttype == -6 {
+				g += cnstthird * (1 - g)
+			} else if ttype == -18 {
+				g = cnstthird / 4
+			} else {
+				g = 1.0 / 4
+			}
+			s = g * dmin
+			ttype = -6
+		}
+	} else if n0in == (n0 + 1) {
+		// One eigenvalue just deflated. Use DMIN1, DN1 for DMIN and DN.
+		if dmin1 == dn1 && dmin2 == dn2 {
+			ttype = -7
+			s = cnstthird * dmin1
+			if z[nn-5] > z[nn-7] {
+				return tau, ttype, g
+			}
+			b1 := z[nn-5] / z[nn-7]
+			b2 := b1
+			if b2 != 0 {
+				for i4loop := 4*(n0+1) - 9 + pp; i4loop >= 4*(i0+1)-1+pp; i4loop -= 4 {
+					i4 := i4loop - 1
+					a2 := b1
+					if z[i4] > z[i4-2] {
+						return tau, ttype, g
+					}
+					b1 *= z[i4] / z[i4-2]
+					b2 += b1
+					if 100*math.Max(b1, a2) < b2 {
+						break
+					}
+				}
+			}
+			b2 = math.Sqrt(cnst3 * b2)
+			a2 := dmin1 / (1 + b2*b2)
+			gap2 := 0.5*dmin2 - a2
+			if gap2 > 0 && gap2 > b2*a2 {
+				s = math.Max(s, a2*(1-cnst2*a2*(b2/gap2)*b2))
+			} else {
+				s = math.Max(s, a2*(1-cnst2*b2))
+				ttype = -8
+			}
+		} else {
+			s = dmin1 / 4
+			if dmin1 == dn1 {
+				s = 0.5 * dmin1
+			}
+			ttype = -9
+		}
+	} else if n0in == (n0 + 2) {
+		// Two eigenvalues deflated. Use DMIN2, DN2 for DMIN and DN.
+		if dmin2 == dn2 && 2*z[nn-5] < z[nn-7] {
+			ttype = -10
+			s = cnstthird * dmin2
+			if z[nn-5] > z[nn-7] {
+				return tau, ttype, g
+			}
+			b1 := z[nn-5] / z[nn-7]
+			b2 := b1
+			if b2 != 0 {
+				for i4loop := 4*(n0+1) - 9 + pp; i4loop >= 4*(i0+1)-1+pp; i4loop -= 4 {
+					i4 := i4loop - 1
+					if z[i4] > z[i4-2] {
+						return tau, ttype, g
+					}
+					b1 *= z[i4] / z[i4-2]
+					b2 += b1
+					if 100*b1 < b2 {
+						break
+					}
+				}
+			}
+			b2 = math.Sqrt(cnst3 * b2)
+			a2 := dmin2 / (1 + b2*b2)
+			gap2 := z[nn-7] + z[nn-9] - math.Sqrt(z[nn-11])*math.Sqrt(z[nn-9]) - a2
+			if gap2 > 0 && gap2 > b2*a2 {
+				s = math.Max(s, a2*(1-cnst2*a2*(b2/gap2)*b2))
+			} else {
+				s = math.Max(s, a2*(1-cnst2*b2))
+			}
+		} else {
+			s = dmin2 / 4
+			ttype = -11
+		}
+	} else if n0in > n0+2 {
+		// Case 12, more than two eigenvalues deflated. No information.
+		s = 0
+		ttype = -12
+	}
+	tau = s
+	return tau, ttype, g
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq5.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq5.go
new file mode 100644
index 00000000000..d3826d9186e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq5.go
@@ -0,0 +1,140 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlasq5 computes one dqds transform in ping-pong form.
+// i0 and n0 are zero-indexed.
+//
+// Dlasq5 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasq5(i0, n0 int, z []float64, pp int, tau, sigma float64) (i0Out, n0Out, ppOut int, tauOut, sigmaOut, dmin, dmin1, dmin2, dn, dnm1, dnm2 float64) {
+	// The lapack function has inputs for ieee and eps, but Go requires ieee so
+	// these are unnecessary.
+
+	switch {
+	case i0 < 0:
+		panic(i0LT0)
+	case n0 < 0:
+		panic(n0LT0)
+	case len(z) < 4*n0:
+		panic(shortZ)
+	case pp != 0 && pp != 1:
+		panic(badPp)
+	}
+
+	if n0-i0-1 <= 0 {
+		return i0, n0, pp, tau, sigma, dmin, dmin1, dmin2, dn, dnm1, dnm2
+	}
+
+	eps := dlamchP
+	dthresh := eps * (sigma + tau)
+	if tau < dthresh*0.5 {
+		tau = 0
+	}
+	var j4 int
+	var emin float64
+	if tau != 0 {
+		j4 = 4*i0 + pp
+		emin = z[j4+4]
+		d := z[j4] - tau
+		dmin = d
+		// In the reference there are code paths that actually return this value.
+		// dmin1 = -z[j4]
+		if pp == 0 {
+			for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 {
+				j4 := j4loop - 1
+				z[j4-2] = d + z[j4-1]
+				tmp := z[j4+1] / z[j4-2]
+				d = d*tmp - tau
+				dmin = math.Min(dmin, d)
+				z[j4] = z[j4-1] * tmp
+				emin = math.Min(z[j4], emin)
+			}
+		} else {
+			for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 {
+				j4 := j4loop - 1
+				z[j4-3] = d + z[j4]
+				tmp := z[j4+2] / z[j4-3]
+				d = d*tmp - tau
+				dmin = math.Min(dmin, d)
+				z[j4-1] = z[j4] * tmp
+				emin = math.Min(z[j4-1], emin)
+			}
+		}
+		// Unroll the last two steps.
+		dnm2 = d
+		dmin2 = dmin
+		j4 = 4*((n0+1)-2) - pp - 1
+		j4p2 := j4 + 2*pp - 1
+		z[j4-2] = dnm2 + z[j4p2]
+		z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2])
+		dnm1 = z[j4p2+2]*(dnm2/z[j4-2]) - tau
+		dmin = math.Min(dmin, dnm1)
+
+		dmin1 = dmin
+		j4 += 4
+		j4p2 = j4 + 2*pp - 1
+		z[j4-2] = dnm1 + z[j4p2]
+		z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2])
+		dn = z[j4p2+2]*(dnm1/z[j4-2]) - tau
+		dmin = math.Min(dmin, dn)
+	} else {
+		// This is the version that sets d's to zero if they are small enough.
+		j4 = 4*(i0+1) + pp - 4
+		emin = z[j4+4]
+		d := z[j4] - tau
+		dmin = d
+		// In the reference there are code paths that actually return this value.
+		// dmin1 = -z[j4]
+		if pp == 0 {
+			for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 {
+				j4 := j4loop - 1
+				z[j4-2] = d + z[j4-1]
+				tmp := z[j4+1] / z[j4-2]
+				d = d*tmp - tau
+				if d < dthresh {
+					d = 0
+				}
+				dmin = math.Min(dmin, d)
+				z[j4] = z[j4-1] * tmp
+				emin = math.Min(z[j4], emin)
+			}
+		} else {
+			for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 {
+				j4 := j4loop - 1
+				z[j4-3] = d + z[j4]
+				tmp := z[j4+2] / z[j4-3]
+				d = d*tmp - tau
+				if d < dthresh {
+					d = 0
+				}
+				dmin = math.Min(dmin, d)
+				z[j4-1] = z[j4] * tmp
+				emin = math.Min(z[j4-1], emin)
+			}
+		}
+		// Unroll the last two steps.
+		dnm2 = d
+		dmin2 = dmin
+		j4 = 4*((n0+1)-2) - pp - 1
+		j4p2 := j4 + 2*pp - 1
+		z[j4-2] = dnm2 + z[j4p2]
+		z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2])
+		dnm1 = z[j4p2+2]*(dnm2/z[j4-2]) - tau
+		dmin = math.Min(dmin, dnm1)
+
+		dmin1 = dmin
+		j4 += 4
+		j4p2 = j4 + 2*pp - 1
+		z[j4-2] = dnm1 + z[j4p2]
+		z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2])
+		dn = z[j4p2+2]*(dnm1/z[j4-2]) - tau
+		dmin = math.Min(dmin, dn)
+	}
+	z[j4+2] = dn
+	z[4*(n0+1)-pp-1] = emin
+	return i0, n0, pp, tau, sigma, dmin, dmin1, dmin2, dn, dnm1, dnm2
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq6.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq6.go
new file mode 100644
index 00000000000..54bf5875629
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasq6.go
@@ -0,0 +1,118 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlasq6 computes one dqd transform in ping-pong form with protection against
+// overflow and underflow. z has length at least 4*(n0+1) and holds the qd array.
+// i0 is the zero-based first index.
+// n0 is the zero-based last index.
+//
+// Dlasq6 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasq6(i0, n0 int, z []float64, pp int) (dmin, dmin1, dmin2, dn, dnm1, dnm2 float64) {
+	switch {
+	case i0 < 0:
+		panic(i0LT0)
+	case n0 < 0:
+		panic(n0LT0)
+	case len(z) < 4*n0:
+		panic(shortZ)
+	case pp != 0 && pp != 1:
+		panic(badPp)
+	}
+
+	if n0-i0-1 <= 0 {
+		return dmin, dmin1, dmin2, dn, dnm1, dnm2
+	}
+
+	safmin := dlamchS
+	j4 := 4*(i0+1) + pp - 4 // -4 rather than -3 for zero indexing
+	emin := z[j4+4]
+	d := z[j4]
+	dmin = d
+	if pp == 0 {
+		for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 {
+			j4 := j4loop - 1 // Translate back to zero-indexed.
+			z[j4-2] = d + z[j4-1]
+			if z[j4-2] == 0 {
+				z[j4] = 0
+				d = z[j4+1]
+				dmin = d
+				emin = 0
+			} else if safmin*z[j4+1] < z[j4-2] && safmin*z[j4-2] < z[j4+1] {
+				tmp := z[j4+1] / z[j4-2]
+				z[j4] = z[j4-1] * tmp
+				d *= tmp
+			} else {
+				z[j4] = z[j4+1] * (z[j4-1] / z[j4-2])
+				d = z[j4+1] * (d / z[j4-2])
+			}
+			dmin = math.Min(dmin, d)
+			emin = math.Min(emin, z[j4])
+		}
+	} else {
+		for j4loop := 4 * (i0 + 1); j4loop <= 4*((n0+1)-3); j4loop += 4 {
+			j4 := j4loop - 1
+			z[j4-3] = d + z[j4]
+			if z[j4-3] == 0 {
+				z[j4-1] = 0
+				d = z[j4+2]
+				dmin = d
+				emin = 0
+			} else if safmin*z[j4+2] < z[j4-3] && safmin*z[j4-3] < z[j4+2] {
+				tmp := z[j4+2] / z[j4-3]
+				z[j4-1] = z[j4] * tmp
+				d *= tmp
+			} else {
+				z[j4-1] = z[j4+2] * (z[j4] / z[j4-3])
+				d = z[j4+2] * (d / z[j4-3])
+			}
+			dmin = math.Min(dmin, d)
+			emin = math.Min(emin, z[j4-1])
+		}
+	}
+	// Unroll last two steps.
+	dnm2 = d
+	dmin2 = dmin
+	j4 = 4*(n0-1) - pp - 1
+	j4p2 := j4 + 2*pp - 1
+	z[j4-2] = dnm2 + z[j4p2]
+	if z[j4-2] == 0 {
+		z[j4] = 0
+		dnm1 = z[j4p2+2]
+		dmin = dnm1
+		emin = 0
+	} else if safmin*z[j4p2+2] < z[j4-2] && safmin*z[j4-2] < z[j4p2+2] {
+		tmp := z[j4p2+2] / z[j4-2]
+		z[j4] = z[j4p2] * tmp
+		dnm1 = dnm2 * tmp
+	} else {
+		z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2])
+		dnm1 = z[j4p2+2] * (dnm2 / z[j4-2])
+	}
+	dmin = math.Min(dmin, dnm1)
+	dmin1 = dmin
+	j4 += 4
+	j4p2 = j4 + 2*pp - 1
+	z[j4-2] = dnm1 + z[j4p2]
+	if z[j4-2] == 0 {
+		z[j4] = 0
+		dn = z[j4p2+2]
+		dmin = dn
+		emin = 0
+	} else if safmin*z[j4p2+2] < z[j4-2] && safmin*z[j4-2] < z[j4p2+2] {
+		tmp := z[j4p2+2] / z[j4-2]
+		z[j4] = z[j4p2] * tmp
+		dn = dnm1 * tmp
+	} else {
+		z[j4] = z[j4p2+2] * (z[j4p2] / z[j4-2])
+		dn = z[j4p2+2] * (dnm1 / z[j4-2])
+	}
+	dmin = math.Min(dmin, dn)
+	z[j4+2] = dn
+	z[4*(n0+1)-pp-1] = emin
+	return dmin, dmin1, dmin2, dn, dnm1, dnm2
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasr.go
new file mode 100644
index 00000000000..3aab41f8e3e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasr.go
@@ -0,0 +1,287 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlasr applies a sequence of plane rotations to the m×n matrix A. This series
+// of plane rotations is implicitly represented by a matrix P. P is multiplied
+// by A depending on the value of side -- A = P * A if side == blas.Left,
+// A = A * Pᵀ if side == blas.Right.
+//
+// The exact value of P depends on the value of pivot, but in all cases P is
+// implicitly represented by a series of 2×2 rotation matrices. The entries of
+// rotation matrix k are defined by s[k] and c[k]
+//
+//	R(k) = [ c[k] s[k]]
+//	       [-s[k] c[k]]
+//
+// If direct == lapack.Forward, the rotation matrices are applied as
+// P = P(z-1) * ... * P(2) * P(1), while if direct == lapack.Backward they are
+// applied as P = P(1) * P(2) * ... * P(z-1). Here z is m if side == blas.Left
+// and n if side == blas.Right.
+//
+// pivot defines the mapping of the elements in R(k) to P(k).
+// If pivot == lapack.Variable, the rotation is performed for the (k, k+1) plane.
+//
+//	P(k) = [1                    ]
+//	       [    ...              ]
+//	       [     1               ]
+//	       [       c[k] s[k]     ]
+//	       [      -s[k] c[k]     ]
+//	       [                 1   ]
+//	       [                ...  ]
+//	       [                    1]
+//
+// if pivot == lapack.Top, the rotation is performed for the (1, k+1) plane,
+//
+//	P(k) = [c[k]        s[k]     ]
+//	       [    1                ]
+//	       [     ...             ]
+//	       [         1           ]
+//	       [-s[k]       c[k]     ]
+//	       [                 1   ]
+//	       [                ...  ]
+//	       [                    1]
+//
+// and if pivot == lapack.Bottom, the rotation is performed for the (k, z) plane.
+//
+//	P(k) = [1                    ]
+//	       [  ...                ]
+//	       [      1              ]
+//	       [        c[k]     s[k]]
+//	       [           1         ]
+//	       [            ...      ]
+//	       [              1      ]
+//	       [       -s[k]     c[k]]
+//
+// c and s must have length at least m-1 if side == blas.Left and n-1 otherwise.
+//
+// Dlasr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasr(side blas.Side, pivot lapack.Pivot, direct lapack.Direct, m, n int, c, s, a []float64, lda int) {
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case pivot != lapack.Variable && pivot != lapack.Top && pivot != lapack.Bottom:
+		panic(badPivot)
+	case direct != lapack.Forward && direct != lapack.Backward:
+		panic(badDirect)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	if side == blas.Left {
+		if len(c) < m-1 {
+			panic(shortC)
+		}
+		if len(s) < m-1 {
+			panic(shortS)
+		}
+	} else {
+		if len(c) < n-1 {
+			panic(shortC)
+		}
+		if len(s) < n-1 {
+			panic(shortS)
+		}
+	}
+	if len(a) < (m-1)*lda+n {
+		panic(shortA)
+	}
+
+	// side == blas.Left: every rotation mixes two rows of A.
+	if side == blas.Left {
+		if pivot == lapack.Variable {
+			if direct == lapack.Forward {
+				for j := 0; j < m-1; j++ {
+					ctmp := c[j]
+					stmp := s[j]
+					if ctmp != 1 || stmp != 0 { // Skip identity rotations.
+						for i := 0; i < n; i++ {
+							tmp2 := a[j*lda+i]
+							tmp := a[(j+1)*lda+i]
+							a[(j+1)*lda+i] = ctmp*tmp - stmp*tmp2
+							a[j*lda+i] = stmp*tmp + ctmp*tmp2
+						}
+					}
+				}
+				return
+			}
+			for j := m - 2; j >= 0; j-- {
+				ctmp := c[j]
+				stmp := s[j]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < n; i++ {
+						tmp2 := a[j*lda+i]
+						tmp := a[(j+1)*lda+i]
+						a[(j+1)*lda+i] = ctmp*tmp - stmp*tmp2
+						a[j*lda+i] = stmp*tmp + ctmp*tmp2
+					}
+				}
+			}
+			return
+		} else if pivot == lapack.Top {
+			if direct == lapack.Forward {
+				for j := 1; j < m; j++ {
+					ctmp := c[j-1]
+					stmp := s[j-1]
+					if ctmp != 1 || stmp != 0 {
+						for i := 0; i < n; i++ {
+							tmp := a[j*lda+i]
+							tmp2 := a[i]
+							a[j*lda+i] = ctmp*tmp - stmp*tmp2
+							a[i] = stmp*tmp + ctmp*tmp2
+						}
+					}
+				}
+				return
+			}
+			// Apply each rotation once to rows 0 and j, like the forward sweep above.
+			for j := m - 1; j >= 1; j-- {
+				ctmp := c[j-1]
+				stmp := s[j-1]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < n; i++ {
+						tmp := a[j*lda+i]
+						tmp2 := a[i]
+						a[j*lda+i] = ctmp*tmp - stmp*tmp2
+						a[i] = stmp*tmp + ctmp*tmp2
+					}
+				}
+			}
+			return
+		}
+		// pivot == lapack.Bottom: rotate row j against the last row, m-1.
+		if direct == lapack.Forward {
+			for j := 0; j < m-1; j++ {
+				ctmp := c[j]
+				stmp := s[j]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < n; i++ {
+						tmp := a[j*lda+i]
+						tmp2 := a[(m-1)*lda+i]
+						a[j*lda+i] = stmp*tmp2 + ctmp*tmp
+						a[(m-1)*lda+i] = ctmp*tmp2 - stmp*tmp
+					}
+				}
+			}
+			return
+		}
+		for j := m - 2; j >= 0; j-- {
+			ctmp := c[j]
+			stmp := s[j]
+			if ctmp != 1 || stmp != 0 {
+				for i := 0; i < n; i++ {
+					tmp := a[j*lda+i]
+					tmp2 := a[(m-1)*lda+i]
+					a[j*lda+i] = stmp*tmp2 + ctmp*tmp
+					a[(m-1)*lda+i] = ctmp*tmp2 - stmp*tmp
+				}
+			}
+		}
+		return
+	}
+	// side == blas.Right: every rotation mixes two columns of A.
+	if pivot == lapack.Variable {
+		if direct == lapack.Forward {
+			for j := 0; j < n-1; j++ {
+				ctmp := c[j]
+				stmp := s[j]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < m; i++ {
+						tmp := a[i*lda+j+1]
+						tmp2 := a[i*lda+j]
+						a[i*lda+j+1] = ctmp*tmp - stmp*tmp2
+						a[i*lda+j] = stmp*tmp + ctmp*tmp2
+					}
+				}
+			}
+			return
+		}
+		for j := n - 2; j >= 0; j-- {
+			ctmp := c[j]
+			stmp := s[j]
+			if ctmp != 1 || stmp != 0 {
+				for i := 0; i < m; i++ {
+					tmp := a[i*lda+j+1]
+					tmp2 := a[i*lda+j]
+					a[i*lda+j+1] = ctmp*tmp - stmp*tmp2
+					a[i*lda+j] = stmp*tmp + ctmp*tmp2
+				}
+			}
+		}
+		return
+	} else if pivot == lapack.Top {
+		if direct == lapack.Forward {
+			for j := 1; j < n; j++ {
+				ctmp := c[j-1]
+				stmp := s[j-1]
+				if ctmp != 1 || stmp != 0 {
+					for i := 0; i < m; i++ {
+						tmp := a[i*lda+j]
+						tmp2 := a[i*lda]
+						a[i*lda+j] = ctmp*tmp - stmp*tmp2
+						a[i*lda] = stmp*tmp + ctmp*tmp2
+					}
+				}
+			}
+			return
+		}
+		for j := n - 1; j >= 1; j-- {
+			ctmp := c[j-1]
+			stmp := s[j-1]
+			if ctmp != 1 || stmp != 0 {
+				for i := 0; i < m; i++ {
+					tmp := a[i*lda+j]
+					tmp2 := a[i*lda]
+					a[i*lda+j] = ctmp*tmp - stmp*tmp2
+					a[i*lda] = stmp*tmp + ctmp*tmp2
+				}
+			}
+		}
+		return
+	}
+	// pivot == lapack.Bottom: rotate column j against the last column, n-1.
+	if direct == lapack.Forward {
+		for j := 0; j < n-1; j++ {
+			ctmp := c[j]
+			stmp := s[j]
+			if ctmp != 1 || stmp != 0 {
+				for i := 0; i < m; i++ {
+					tmp := a[i*lda+j]
+					tmp2 := a[i*lda+n-1]
+					a[i*lda+j] = stmp*tmp2 + ctmp*tmp
+					a[i*lda+n-1] = ctmp*tmp2 - stmp*tmp
+				}
+
+			}
+		}
+		return
+	}
+	for j := n - 2; j >= 0; j-- {
+		ctmp := c[j]
+		stmp := s[j]
+		if ctmp != 1 || stmp != 0 {
+			for i := 0; i < m; i++ {
+				tmp := a[i*lda+j]
+				tmp2 := a[i*lda+n-1]
+				a[i*lda+j] = stmp*tmp2 + ctmp*tmp
+				a[i*lda+n-1] = ctmp*tmp2 - stmp*tmp
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasrt.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasrt.go
new file mode 100644
index 00000000000..be472805bf6
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasrt.go
@@ -0,0 +1,36 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"sort"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlasrt sorts the first n elements of d in place. The order is ascending when
+// s == lapack.SortIncreasing and descending when s == lapack.SortDecreasing.
+// Dlasrt panics if n is negative, if d holds fewer than n elements, or if s is
+// any other value.
+//
+// Dlasrt is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasrt(s lapack.Sort, n int, d []float64) {
+	if n < 0 {
+		panic(nLT0)
+	}
+	if len(d) < n {
+		panic(shortD)
+	}
+
+	head := d[:n]
+	switch s {
+	case lapack.SortIncreasing:
+		sort.Float64s(head)
+	case lapack.SortDecreasing:
+		sort.Sort(sort.Reverse(sort.Float64Slice(head)))
+	default:
+		panic(badSort)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlassq.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlassq.go
new file mode 100644
index 00000000000..8f8cda8789f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlassq.go
@@ -0,0 +1,131 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlassq updates a sum of squares represented in scaled form. Dlassq returns
+// the values scl and smsq such that
+//
+//	scl^2*smsq = X[0]^2 + ... + X[n-1]^2 + scale^2*sumsq
+//
+// where X[i] is x[i*incx]. The value of sumsq is assumed to be non-negative.
+//
+// Dlassq is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlassq(n int, x []float64, incx int, scale float64, sumsq float64) (scl, smsq float64) {
+	// Implementation based on Supplemental Material to:
+	// Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS.
+	// ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages.
+	// DOI: https://doi.org/10.1145/3061665
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case incx <= 0:
+		panic(badIncX)
+	case len(x) < 1+(n-1)*incx:
+		panic(shortX)
+	}
+
+	if math.IsNaN(scale) || math.IsNaN(sumsq) { // NaN inputs are handed back unchanged.
+		return scale, sumsq
+	}
+
+	if sumsq == 0 { // The running sum is zero, so its scale carries no information.
+		scale = 1
+	}
+	if scale == 0 { // A zero scale means the running sum contributes nothing.
+		scale = 1
+		sumsq = 0
+	}
+
+	if n == 0 { // No new elements to add.
+		return scale, sumsq
+	}
+
+	// Compute the sum of squares in 3 accumulators:
+	//  - abig: sum of squares scaled down to avoid overflow
+	//  - asml: sum of squares scaled up to avoid underflow
+	//  - amed: sum of squares that do not require scaling
+	// The thresholds and multipliers are:
+	//  - values bigger than dtbig are scaled down by dsbig
+	//  - values smaller than dtsml are scaled up by dssml
+	var (
+		isBig            bool
+		asml, amed, abig float64
+	)
+	for i, ix := 0, 0; i < n; i++ {
+		ax := math.Abs(x[ix])
+		switch {
+		case ax > dtbig:
+			ax *= dsbig
+			abig += ax * ax
+			isBig = true // From now on values below dtsml are skipped.
+		case ax < dtsml:
+			if !isBig {
+				ax *= dssml
+				asml += ax * ax
+			}
+		default:
+			amed += ax * ax
+		}
+		ix += incx
+	}
+	// Put the existing sum of squares into one of the accumulators.
+	if sumsq > 0 {
+		ax := scale * math.Sqrt(sumsq)
+		switch {
+		case ax > dtbig:
+			if scale > 1 {
+				scale *= dsbig
+				abig += scale * (scale * sumsq)
+			} else {
+				// sumsq > dtbig^2 => (dsbig * (dsbig * sumsq)) is representable.
+				abig += scale * (scale * (dsbig * (dsbig * sumsq)))
+			}
+		case ax < dtsml:
+			if !isBig {
+				if scale < 1 {
+					scale *= dssml
+					asml += scale * (scale * sumsq)
+				} else {
+					// sumsq < dtsml^2 => (dssml * (dssml * sumsq)) is representable.
+					asml += scale * (scale * (dssml * (dssml * sumsq)))
+				}
+			}
+		default:
+			amed += scale * (scale * sumsq)
+		}
+	}
+	// Combine abig and amed or amed and asml if more than one accumulator was
+	// used.
+	switch {
+	case abig > 0:
+		// Combine abig and amed:
+		if amed > 0 || math.IsNaN(amed) {
+			abig += (amed * dsbig) * dsbig
+		}
+		scale = 1 / dsbig // abig holds squares of values multiplied by dsbig.
+		sumsq = abig
+	case asml > 0:
+		// Combine amed and asml:
+		if amed > 0 || math.IsNaN(amed) {
+			amed = math.Sqrt(amed)
+			asml = math.Sqrt(asml) / dssml
+			ymin, ymax := asml, amed
+			if asml > amed {
+				ymin, ymax = amed, asml
+			}
+			scale = 1
+			sumsq = ymax * ymax * (1 + (ymin/ymax)*(ymin/ymax)) // ymax^2 + ymin^2 in exact arithmetic.
+		} else {
+			scale = 1 / dssml // asml holds squares of values multiplied by dssml.
+			sumsq = asml
+		}
+	default:
+		scale = 1
+		sumsq = amed
+	}
+	return scale, sumsq
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasv2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasv2.go
new file mode 100644
index 00000000000..cc7ceea0b85
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasv2.go
@@ -0,0 +1,117 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Dlasv2 computes the singular value decomposition of a 2×2 upper triangular matrix.
+//
+//	[ csl snl] [f g] [csr -snr] = [ssmax     0]
+//	[-snl csl] [0 h] [snr  csr]   [    0 ssmin]
+//
+// ssmax is the larger absolute singular value, and ssmin is the smaller absolute
+// singular value. [csl, snl] and [csr, snr] are the left and right singular vectors.
+//
+// Dlasv2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasv2(f, g, h float64) (ssmin, ssmax, snr, csr, snl, csl float64) {
+	ft := f
+	fa := math.Abs(ft)
+	ht := h
+	ha := math.Abs(h)
+	// pmax points to the largest element of the matrix in terms of absolute value.
+	// 1 if F, 2 if G, 3 if H.
+	pmax := 1
+	swap := ha > fa // Arrange for fa >= ha by exchanging the roles of f and h.
+	if swap {
+		pmax = 3
+		ft, ht = ht, ft
+		fa, ha = ha, fa
+	}
+	gt := g
+	ga := math.Abs(gt)
+	var clt, crt, slt, srt float64
+	if ga == 0 { // Diagonal matrix: no rotation is needed.
+		ssmin = ha
+		ssmax = fa
+		clt = 1
+		crt = 1
+		slt = 0
+		srt = 0
+	} else {
+		gasmall := true
+		if ga > fa {
+			pmax = 2
+			if (fa / ga) < dlamchE { // ga is very large compared with fa.
+				gasmall = false
+				ssmax = ga
+				if ha > 1 {
+					ssmin = fa / (ga / ha)
+				} else {
+					ssmin = (fa / ga) * ha
+				}
+				clt = 1
+				slt = ht / gt
+				srt = 1
+				crt = ft / gt
+			}
+		}
+		if gasmall { // General case.
+			d := fa - ha
+			l := d / fa
+			if d == fa { // deal with inf
+				l = 1
+			}
+			m := gt / ft
+			t := 2 - l
+			s := math.Hypot(t, m)
+			var r float64
+			if l == 0 {
+				r = math.Abs(m)
+			} else {
+				r = math.Hypot(l, m)
+			}
+			a := 0.5 * (s + r)
+			ssmin = ha / a
+			ssmax = fa * a
+			if m == 0 {
+				if l == 0 {
+					t = math.Copysign(2, ft) * math.Copysign(1, gt)
+				} else {
+					t = gt/math.Copysign(d, ft) + m/t
+				}
+			} else {
+				t = (m/(s+t) + m/(r+l)) * (1 + a)
+			}
+			l = math.Hypot(t, 2)
+			crt = 2 / l
+			srt = t / l
+			clt = (crt + srt*m) / a
+			slt = (ht / ft) * srt / a
+		}
+	}
+	if swap { // f and h were exchanged, so the left and right rotations trade places.
+		csl = srt
+		snl = crt
+		csr = slt
+		snr = clt
+	} else {
+		csl = clt
+		snl = slt
+		csr = crt
+		snr = srt
+	}
+	var tsign float64 // Sign correction chosen according to the largest entry (see pmax).
+	switch pmax {
+	case 1:
+		tsign = math.Copysign(1, csr) * math.Copysign(1, csl) * math.Copysign(1, f)
+	case 2:
+		tsign = math.Copysign(1, snr) * math.Copysign(1, csl) * math.Copysign(1, g)
+	case 3:
+		tsign = math.Copysign(1, snr) * math.Copysign(1, snl) * math.Copysign(1, h)
+	}
+	ssmax = math.Copysign(ssmax, tsign)
+	ssmin = math.Copysign(ssmin, tsign*math.Copysign(1, f)*math.Copysign(1, h))
+	return ssmin, ssmax, snr, csr, snl, csl
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlaswp.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaswp.go
new file mode 100644
index 00000000000..88600ac17bc
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlaswp.go
@@ -0,0 +1,58 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas/blas64"
+
+// Dlaswp performs a series of row interchanges on a rectangular matrix A. For
+// each k in the range k1 to k2, row k is exchanged with row ipiv[k].
+//
+// n is the number of columns of A. incX selects the order in which the
+// interchanges are carried out: ascending from k1 to k2 when incX is 1 and
+// descending from k2 to k1 when incX is -1.
+//
+// Dlaswp panics if incX is neither 1 nor -1, if the length of ipiv is not
+// k2+1, or if any other argument fails validation.
+//
+// The indices k1, k2, and the elements of ipiv are zero-based.
+//
+// Dlaswp is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlaswp(n int, a []float64, lda int, k1, k2 int, ipiv []int, incX int) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case k1 < 0:
+		panic(badK1)
+	case k2 < k1:
+		panic(badK2)
+	case lda < max(1, n):
+		panic(badLdA)
+	case len(a) < k2*lda+n: // A must have at least k2+1 rows.
+		panic(shortA)
+	case len(ipiv) != k2+1:
+		panic(badLenIpiv)
+	case incX != 1 && incX != -1:
+		panic(absIncNotOne)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	// Walk the rows k1..k2 in the direction given by incX; k2 >= k1 holds here.
+	first, last := k1, k2
+	if incX == -1 {
+		first, last = k2, k1
+	}
+	swapper := blas64.Implementation()
+	for k := first; ; k += incX {
+		if p := ipiv[k]; p != k {
+			swapper.Dswap(n, a[k*lda:], 1, a[p*lda:], 1)
+		}
+		if k == last {
+			break
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlasy2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasy2.go
new file mode 100644
index 00000000000..160b68b84ab
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlasy2.go
@@ -0,0 +1,292 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlasy2 solves the Sylvester matrix equation where the matrices are of order 1
+// or 2. It computes the unknown n1×n2 matrix X so that
+//
+//	TL*X   + sgn*X*TR  = scale*B  if tranl == false and tranr == false,
+//	TLᵀ*X + sgn*X*TR   = scale*B  if tranl == true  and tranr == false,
+//	TL*X   + sgn*X*TRᵀ = scale*B  if tranl == false and tranr == true,
+//	TLᵀ*X + sgn*X*TRᵀ  = scale*B  if tranl == true  and tranr == true,
+//
+// where TL is n1×n1, TR is n2×n2, B is n1×n2, sgn is isgn, and 1 <= n1,n2 <= 2.
+//
+// isgn must be 1 or -1, and n1 and n2 must be 0, 1, or 2, but these conditions
+// are not checked.
+//
+// Dlasy2 returns three values, a scale factor that is chosen less than or equal
+// to 1 to prevent the solution overflowing, the infinity norm of the solution,
+// and an indicator of success. If ok is false, TL and TR have eigenvalues that
+// are too close, so TL or TR is perturbed to get a non-singular equation.
+//
+// Dlasy2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlasy2(tranl, tranr bool, isgn, n1, n2 int, tl []float64, ldtl int, tr []float64, ldtr int, b []float64, ldb int, x []float64, ldx int) (scale, xnorm float64, ok bool) {
+	// TODO(vladimir-ch): Add input validation checks conditionally skipped
+	// using the build tag mechanism.
+
+	ok = true
+	// Quick return if possible.
+	if n1 == 0 || n2 == 0 { // Nothing to solve; scale and xnorm are returned as zero.
+		return scale, xnorm, ok
+	}
+
+	// Set constants to control overflow.
+	eps := dlamchP
+	smlnum := dlamchS / eps
+	sgn := float64(isgn)
+
+	if n1 == 1 && n2 == 1 {
+		// 1×1 case: TL11*X + sgn*X*TR11 = B11.
+		tau1 := tl[0] + sgn*tr[0]
+		bet := math.Abs(tau1)
+		if bet <= smlnum { // Nearly singular: perturb tau1 and report it through ok.
+			tau1 = smlnum
+			bet = smlnum
+			ok = false
+		}
+		scale = 1
+		gam := math.Abs(b[0])
+		if smlnum*gam > bet { // Scale the right-hand side down before dividing.
+			scale = 1 / gam
+		}
+		x[0] = b[0] * scale / tau1
+		xnorm = math.Abs(x[0])
+		return scale, xnorm, ok
+	}
+
+	if n1+n2 == 3 {
+		// 1×2 or 2×1 case.
+		var (
+			smin float64
+			tmp  [4]float64 // tmp is used as a 2×2 row-major matrix.
+			btmp [2]float64
+		)
+		if n1 == 1 && n2 == 2 {
+			// 1×2 case: TL11*[X11 X12] + sgn*[X11 X12]*op[TR11 TR12] = [B11 B12].
+			//                                            [TR21 TR22]
+			smin = math.Abs(tl[0])
+			smin = math.Max(smin, math.Max(math.Abs(tr[0]), math.Abs(tr[1])))
+			smin = math.Max(smin, math.Max(math.Abs(tr[ldtr]), math.Abs(tr[ldtr+1])))
+			smin = math.Max(eps*smin, smlnum)
+			tmp[0] = tl[0] + sgn*tr[0]
+			tmp[3] = tl[0] + sgn*tr[ldtr+1]
+			if tranr {
+				tmp[1] = sgn * tr[1]
+				tmp[2] = sgn * tr[ldtr]
+			} else {
+				tmp[1] = sgn * tr[ldtr]
+				tmp[2] = sgn * tr[1]
+			}
+			btmp[0] = b[0]
+			btmp[1] = b[1]
+		} else {
+			// 2×1 case: op[TL11 TL12]*[X11] + sgn*[X11]*TR11 = [B11].
+			//             [TL21 TL22] [X21]       [X21]        [B21]
+			smin = math.Abs(tr[0])
+			smin = math.Max(smin, math.Max(math.Abs(tl[0]), math.Abs(tl[1])))
+			smin = math.Max(smin, math.Max(math.Abs(tl[ldtl]), math.Abs(tl[ldtl+1])))
+			smin = math.Max(eps*smin, smlnum)
+			tmp[0] = tl[0] + sgn*tr[0]
+			tmp[3] = tl[ldtl+1] + sgn*tr[0]
+			if tranl {
+				tmp[1] = tl[ldtl]
+				tmp[2] = tl[1]
+			} else {
+				tmp[1] = tl[1]
+				tmp[2] = tl[ldtl]
+			}
+			btmp[0] = b[0]
+			btmp[1] = b[ldb]
+		}
+
+		// Solve 2×2 system using complete pivoting.
+		// Set pivots less than smin to smin.
+
+		bi := blas64.Implementation()
+		ipiv := bi.Idamax(len(tmp), tmp[:], 1)
+		// Compute the upper triangular matrix [u11 u12].
+		//                                     [  0 u22]
+		u11 := tmp[ipiv]
+		if math.Abs(u11) <= smin { // Replace a too-small pivot and flag it through ok.
+			ok = false
+			u11 = smin
+		}
+		locu12 := [4]int{1, 0, 3, 2} // Index in tmp of the element on the same row as the pivot.
+		u12 := tmp[locu12[ipiv]]
+		locl21 := [4]int{2, 3, 0, 1} // Index in tmp of the element on the same column as the pivot.
+		l21 := tmp[locl21[ipiv]] / u11
+		locu22 := [4]int{3, 2, 1, 0} // Index in tmp of the remaining element.
+		u22 := tmp[locu22[ipiv]] - l21*u12
+		if math.Abs(u22) <= smin {
+			ok = false
+			u22 = smin
+		}
+		if ipiv&0x2 != 0 { // true for ipiv equal to 2 and 3.
+			// The pivot was in the second row, swap the elements of
+			// the right-hand side.
+			btmp[0], btmp[1] = btmp[1], btmp[0]-l21*btmp[1]
+		} else {
+			btmp[1] -= l21 * btmp[0]
+		}
+		scale = 1
+		if 2*smlnum*math.Abs(btmp[1]) > math.Abs(u22) || 2*smlnum*math.Abs(btmp[0]) > math.Abs(u11) {
+			scale = 0.5 / math.Max(math.Abs(btmp[0]), math.Abs(btmp[1]))
+			btmp[0] *= scale
+			btmp[1] *= scale
+		}
+		// Solve the system [u11 u12] [x21] = [ btmp[0] ].
+		//                  [  0 u22] [x22]   [ btmp[1] ]
+		x22 := btmp[1] / u22
+		x21 := btmp[0]/u11 - (u12/u11)*x22
+		if ipiv&0x1 != 0 { // true for ipiv equal to 1 and 3.
+			// The pivot was in the second column, swap the elements
+			// of the solution.
+			x21, x22 = x22, x21
+		}
+		x[0] = x21
+		if n1 == 1 {
+			x[1] = x22
+			xnorm = math.Abs(x[0]) + math.Abs(x[1])
+		} else {
+			x[ldx] = x22
+			xnorm = math.Max(math.Abs(x[0]), math.Abs(x[ldx]))
+		}
+		return scale, xnorm, ok
+	}
+
+	// 2×2 case: op[TL11 TL12]*[X11 X12] + SGN*[X11 X12]*op[TR11 TR12] = [B11 B12].
+	//             [TL21 TL22] [X21 X22]       [X21 X22]   [TR21 TR22]   [B21 B22]
+	//
+	// Solve equivalent 4×4 system using complete pivoting.
+	// Set pivots less than smin to smin.
+
+	smin := math.Max(math.Abs(tr[0]), math.Abs(tr[1]))
+	smin = math.Max(smin, math.Max(math.Abs(tr[ldtr]), math.Abs(tr[ldtr+1])))
+	smin = math.Max(smin, math.Max(math.Abs(tl[0]), math.Abs(tl[1])))
+	smin = math.Max(smin, math.Max(math.Abs(tl[ldtl]), math.Abs(tl[ldtl+1])))
+	smin = math.Max(eps*smin, smlnum)
+
+	var t [4][4]float64 // Coefficient matrix of the equivalent 4×4 linear system.
+	t[0][0] = tl[0] + sgn*tr[0]
+	t[1][1] = tl[0] + sgn*tr[ldtr+1]
+	t[2][2] = tl[ldtl+1] + sgn*tr[0]
+	t[3][3] = tl[ldtl+1] + sgn*tr[ldtr+1]
+	if tranl {
+		t[0][2] = tl[ldtl]
+		t[1][3] = tl[ldtl]
+		t[2][0] = tl[1]
+		t[3][1] = tl[1]
+	} else {
+		t[0][2] = tl[1]
+		t[1][3] = tl[1]
+		t[2][0] = tl[ldtl]
+		t[3][1] = tl[ldtl]
+	}
+	if tranr {
+		t[0][1] = sgn * tr[1]
+		t[1][0] = sgn * tr[ldtr]
+		t[2][3] = sgn * tr[1]
+		t[3][2] = sgn * tr[ldtr]
+	} else {
+		t[0][1] = sgn * tr[ldtr]
+		t[1][0] = sgn * tr[1]
+		t[2][3] = sgn * tr[ldtr]
+		t[3][2] = sgn * tr[1]
+	}
+
+	var btmp [4]float64 // Right-hand side: B flattened in row-major order.
+	btmp[0] = b[0]
+	btmp[1] = b[1]
+	btmp[2] = b[ldb]
+	btmp[3] = b[ldb+1]
+
+	// Perform elimination.
+	var jpiv [4]int // jpiv records any column swaps for pivoting.
+	for i := 0; i < 3; i++ {
+		var (
+			xmax       float64
+			ipsv, jpsv int
+		)
+		for ip := i; ip < 4; ip++ { // Search the unprocessed block for the entry of largest magnitude.
+			for jp := i; jp < 4; jp++ {
+				if math.Abs(t[ip][jp]) >= xmax {
+					xmax = math.Abs(t[ip][jp])
+					ipsv = ip
+					jpsv = jp
+				}
+			}
+		}
+		if ipsv != i {
+			// The pivot is not in the top row of the unprocessed
+			// block, swap rows ipsv and i of t and btmp.
+			t[ipsv], t[i] = t[i], t[ipsv]
+			btmp[ipsv], btmp[i] = btmp[i], btmp[ipsv]
+		}
+		if jpsv != i {
+			// The pivot is not in the left column of the
+			// unprocessed block, swap columns jpsv and i of t.
+			for k := 0; k < 4; k++ {
+				t[k][jpsv], t[k][i] = t[k][i], t[k][jpsv]
+			}
+		}
+		jpiv[i] = jpsv
+		if math.Abs(t[i][i]) < smin { // Replace a too-small pivot and flag it through ok.
+			ok = false
+			t[i][i] = smin
+		}
+		for k := i + 1; k < 4; k++ {
+			t[k][i] /= t[i][i]
+			btmp[k] -= t[k][i] * btmp[i]
+			for j := i + 1; j < 4; j++ {
+				t[k][j] -= t[k][i] * t[i][j]
+			}
+		}
+	}
+	if math.Abs(t[3][3]) < smin {
+		ok = false
+		t[3][3] = smin
+	}
+	scale = 1
+	if 8*smlnum*math.Abs(btmp[0]) > math.Abs(t[0][0]) ||
+		8*smlnum*math.Abs(btmp[1]) > math.Abs(t[1][1]) ||
+		8*smlnum*math.Abs(btmp[2]) > math.Abs(t[2][2]) ||
+		8*smlnum*math.Abs(btmp[3]) > math.Abs(t[3][3]) {
+
+		maxbtmp := math.Max(math.Abs(btmp[0]), math.Abs(btmp[1]))
+		maxbtmp = math.Max(maxbtmp, math.Max(math.Abs(btmp[2]), math.Abs(btmp[3])))
+		scale = (1.0 / 8.0) / maxbtmp
+		btmp[0] *= scale
+		btmp[1] *= scale
+		btmp[2] *= scale
+		btmp[3] *= scale
+	}
+	// Compute the solution of the upper triangular system t * tmp = btmp.
+	var tmp [4]float64
+	for i := 3; i >= 0; i-- {
+		temp := 1 / t[i][i]
+		tmp[i] = btmp[i] * temp
+		for j := i + 1; j < 4; j++ {
+			tmp[i] -= temp * t[i][j] * tmp[j]
+		}
+	}
+	for i := 2; i >= 0; i-- { // Undo the recorded column swaps in reverse order.
+		if jpiv[i] != i {
+			tmp[i], tmp[jpiv[i]] = tmp[jpiv[i]], tmp[i]
+		}
+	}
+	x[0] = tmp[0]
+	x[1] = tmp[1]
+	x[ldx] = tmp[2]
+	x[ldx+1] = tmp[3]
+	xnorm = math.Max(math.Abs(tmp[0])+math.Abs(tmp[1]), math.Abs(tmp[2])+math.Abs(tmp[3]))
+	return scale, xnorm, ok
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlatbs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatbs.go
new file mode 100644
index 00000000000..e0e809cf90d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatbs.go
@@ -0,0 +1,454 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlatbs solves a triangular banded system of equations
+//
+//	A * x = s*b    if trans == blas.NoTrans
+//	Aᵀ * x = s*b  if trans == blas.Trans or blas.ConjTrans
+//
+// where A is an upper or lower triangular band matrix, x and b are n-element
+// vectors, and s is a scaling factor chosen so that the components of x will be
+// less than the overflow threshold.
+//
+// On entry, x contains the right-hand side b of the triangular system.
+// On return, x is overwritten by the solution vector x.
+//
+// normin specifies whether the cnorm parameter contains the column norms of A on
+// entry. If it is true, cnorm[j] contains the norm of the off-diagonal part of
+// the j-th column of A. If it is false, the norms will be computed and stored
+// in cnorm.
+//
+// Dlatbs returns the scaling factor s for the triangular system. If the matrix
+// A is singular (A[j,j]==0 for some j), then scale is set to 0 and a
+// non-trivial solution to A*x = 0 is returned.
+//
+// Dlatbs is an internal routine. It is exported for testing purposes.
+func (Implementation) Dlatbs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, normin bool, n, kd int, ab []float64, ldab int, x, cnorm []float64) (scale float64) {
+	noTran := trans == blas.NoTrans
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case !noTran && trans != blas.Trans && trans != blas.ConjTrans:
+		panic(badTrans)
+	case diag != blas.NonUnit && diag != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case kd < 0:
+		panic(kdLT0)
+	case ldab < kd+1:
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return 1
+	}
+
+	switch {
+	case len(ab) < (n-1)*ldab+kd+1:
+		panic(shortAB)
+	case len(x) < n:
+		panic(shortX)
+	case len(cnorm) < n:
+		panic(shortCNorm)
+	}
+
+	// Parameters to control overflow.
+	smlnum := dlamchS / dlamchP
+	bignum := 1 / smlnum
+
+	bi := blas64.Implementation()
+	kld := max(1, ldab-1)
+	if !normin {
+		// Compute the 1-norm of each column, not including the diagonal.
+		if uplo == blas.Upper {
+			for j := 0; j < n; j++ {
+				jlen := min(j, kd)
+				if jlen > 0 {
+					cnorm[j] = bi.Dasum(jlen, ab[(j-jlen)*ldab+jlen:], kld)
+				} else {
+					cnorm[j] = 0
+				}
+			}
+		} else {
+			for j := 0; j < n; j++ {
+				jlen := min(n-j-1, kd)
+				if jlen > 0 {
+					cnorm[j] = bi.Dasum(jlen, ab[(j+1)*ldab+kd-1:], kld)
+				} else {
+					cnorm[j] = 0
+				}
+			}
+		}
+	}
+
+	// Set up indices and increments for loops below.
+	var (
+		jFirst, jLast, jInc int
+		maind               int
+	)
+	if noTran {
+		if uplo == blas.Upper {
+			jFirst = n - 1
+			jLast = -1
+			jInc = -1
+			maind = 0
+		} else {
+			jFirst = 0
+			jLast = n
+			jInc = 1
+			maind = kd
+		}
+	} else {
+		if uplo == blas.Upper {
+			jFirst = 0
+			jLast = n
+			jInc = 1
+			maind = 0
+		} else {
+			jFirst = n - 1
+			jLast = -1
+			jInc = -1
+			maind = kd
+		}
+	}
+
+	// Scale the column norms by tscal if the maximum element in cnorm is
+	// greater than bignum.
+	tmax := cnorm[bi.Idamax(n, cnorm, 1)]
+	tscal := 1.0
+	if tmax > bignum {
+		tscal = 1 / (smlnum * tmax)
+		bi.Dscal(n, tscal, cnorm, 1)
+	}
+
+	// Compute a bound on the computed solution vector to see if the Level 2
+	// BLAS routine Dtbsv can be used.
+
+	xMax := math.Abs(x[bi.Idamax(n, x, 1)])
+	xBnd := xMax
+	grow := 0.0
+	// Compute the growth only if the maximum element in cnorm is NOT greater
+	// than bignum.
+	if tscal != 1 {
+		goto skipComputeGrow
+	}
+	if noTran {
+		// Compute the growth in A * x = b.
+		if diag == blas.NonUnit {
+			// A is non-unit triangular.
+			//
+			// Compute grow = 1/G_j and xBnd = 1/M_j.
+			// Initially, G_0 = max{x(i), i=1,...,n}.
+			grow = 1 / math.Max(xBnd, smlnum)
+			xBnd = grow
+			for j := jFirst; j != jLast; j += jInc {
+				if grow <= smlnum {
+					// Exit the loop because the growth factor is too small.
+					goto skipComputeGrow
+				}
+				// M_j = G_{j-1} / abs(A[j,j])
+				tjj := math.Abs(ab[j*ldab+maind])
+				xBnd = math.Min(xBnd, math.Min(1, tjj)*grow)
+				if tjj+cnorm[j] >= smlnum {
+					// G_j = G_{j-1}*( 1 + cnorm[j] / abs(A[j,j]) )
+					grow *= tjj / (tjj + cnorm[j])
+				} else {
+					// G_j could overflow, set grow to 0.
+					grow = 0
+				}
+			}
+			grow = xBnd
+		} else {
+			// A is unit triangular.
+			//
+			// Compute grow = 1/G_j, where G_0 = max{x(i), i=1,...,n}.
+			grow = math.Min(1, 1/math.Max(xBnd, smlnum))
+			for j := jFirst; j != jLast; j += jInc {
+				if grow <= smlnum {
+					// Exit the loop because the growth factor is too small.
+					goto skipComputeGrow
+				}
+				// G_j = G_{j-1}*( 1 + cnorm[j] )
+				grow /= 1 + cnorm[j]
+			}
+		}
+	} else {
+		// Compute the growth in Aᵀ * x = b.
+		if diag == blas.NonUnit {
+			// A is non-unit triangular.
+			//
+			// Compute grow = 1/G_j and xBnd = 1/M_j.
+			// Initially, G_0 = max{x(i), i=1,...,n}.
+			grow = 1 / math.Max(xBnd, smlnum)
+			xBnd = grow
+			for j := jFirst; j != jLast; j += jInc {
+				if grow <= smlnum {
+					// Exit the loop because the growth factor is too small.
+					goto skipComputeGrow
+				}
+				// G_j = max( G_{j-1}, M_{j-1}*( 1 + cnorm[j] ) )
+				xj := 1 + cnorm[j]
+				grow = math.Min(grow, xBnd/xj)
+				// M_j = M_{j-1}*( 1 + cnorm[j] ) / abs(A[j,j])
+				tjj := math.Abs(ab[j*ldab+maind])
+				if xj > tjj {
+					xBnd *= tjj / xj
+				}
+			}
+			grow = math.Min(grow, xBnd)
+		} else {
+			// A is unit triangular.
+			//
+			// Compute grow = 1/G_j, where G_0 = max{x(i), i=1,...,n}.
+			grow = math.Min(1, 1/math.Max(xBnd, smlnum))
+			for j := jFirst; j != jLast; j += jInc {
+				if grow <= smlnum {
+					// Exit the loop because the growth factor is too small.
+					goto skipComputeGrow
+				}
+				// G_j = G_{j-1}*( 1 + cnorm[j] )
+				grow /= 1 + cnorm[j]
+			}
+		}
+	}
+skipComputeGrow:
+
+	if grow*tscal > smlnum {
+		// The reciprocal of the bound on elements of X is not too small, use
+		// the Level 2 BLAS solve.
+		bi.Dtbsv(uplo, trans, diag, n, kd, ab, ldab, x, 1)
+		// Scale the column norms by 1/tscal for return.
+		if tscal != 1 {
+			bi.Dscal(n, 1/tscal, cnorm, 1)
+		}
+		return 1
+	}
+
+	// Use a Level 1 BLAS solve, scaling intermediate results.
+
+	scale = 1
+	if xMax > bignum {
+		// Scale x so that its components are less than or equal to bignum in
+		// absolute value.
+		scale = bignum / xMax
+		bi.Dscal(n, scale, x, 1)
+		xMax = bignum
+	}
+
+	if noTran {
+		// Solve A * x = b.
+		for j := jFirst; j != jLast; j += jInc {
+			// Compute x[j] = b[j] / A[j,j], scaling x if necessary.
+			xj := math.Abs(x[j])
+			tjjs := tscal
+			if diag == blas.NonUnit {
+				tjjs *= ab[j*ldab+maind]
+			}
+			tjj := math.Abs(tjjs)
+			switch {
+			case tjj > smlnum:
+				// smlnum < abs(A[j,j])
+				if tjj < 1 && xj > tjj*bignum {
+					// Scale x by 1/b[j].
+					rec := 1 / xj
+					bi.Dscal(n, rec, x, 1)
+					scale *= rec
+					xMax *= rec
+				}
+				x[j] /= tjjs
+				xj = math.Abs(x[j])
+			case tjj > 0:
+				// 0 < abs(A[j,j]) <= smlnum
+				if xj > tjj*bignum {
+					// Scale x by (1/abs(x[j]))*abs(A[j,j])*bignum to avoid
+					// overflow when dividing by A[j,j].
+					rec := tjj * bignum / xj
+					if cnorm[j] > 1 {
+						// Scale by 1/cnorm[j] to avoid overflow when
+						// multiplying x[j] times column j.
+						rec /= cnorm[j]
+					}
+					bi.Dscal(n, rec, x, 1)
+					scale *= rec
+					xMax *= rec
+				}
+				x[j] /= tjjs
+				xj = math.Abs(x[j])
+			default:
+				// A[j,j] == 0: Set x[0:n] = 0, x[j] = 1, and scale = 0, and
+				// compute a solution to A*x = 0.
+				for i := range x[:n] {
+					x[i] = 0
+				}
+				x[j] = 1
+				xj = 1
+				scale = 0
+				xMax = 0
+			}
+
+			// Scale x if necessary to avoid overflow when adding a multiple of
+			// column j of A.
+			switch {
+			case xj > 1:
+				rec := 1 / xj
+				if cnorm[j] > (bignum-xMax)*rec {
+					// Scale x by 1/(2*abs(x[j])).
+					rec *= 0.5
+					bi.Dscal(n, rec, x, 1)
+					scale *= rec
+				}
+			case xj*cnorm[j] > bignum-xMax:
+				// Scale x by 1/2.
+				bi.Dscal(n, 0.5, x, 1)
+				scale *= 0.5
+			}
+
+			if uplo == blas.Upper {
+				if j > 0 {
+					// Compute the update
+					//  x[max(0,j-kd):j] := x[max(0,j-kd):j] - x[j] * A[max(0,j-kd):j,j]
+					jlen := min(j, kd)
+					if jlen > 0 {
+						bi.Daxpy(jlen, -x[j]*tscal, ab[(j-jlen)*ldab+jlen:], kld, x[j-jlen:], 1)
+					}
+					i := bi.Idamax(j, x, 1)
+					xMax = math.Abs(x[i])
+				}
+			} else if j < n-1 {
+				// Compute the update
+				//  x[j+1:min(j+kd,n)] := x[j+1:min(j+kd,n)] - x[j] * A[j+1:min(j+kd,n),j]
+				jlen := min(kd, n-j-1)
+				if jlen > 0 {
+					bi.Daxpy(jlen, -x[j]*tscal, ab[(j+1)*ldab+kd-1:], kld, x[j+1:], 1)
+				}
+				i := j + 1 + bi.Idamax(n-j-1, x[j+1:], 1)
+				xMax = math.Abs(x[i])
+			}
+		}
+	} else {
+		// Solve Aᵀ * x = b.
+		for j := jFirst; j != jLast; j += jInc {
+			// Compute x[j] = b[j] - sum A[k,j]*x[k].
+			//                       k!=j
+			xj := math.Abs(x[j])
+			tjjs := tscal
+			if diag == blas.NonUnit {
+				tjjs *= ab[j*ldab+maind]
+			}
+			tjj := math.Abs(tjjs)
+			rec := 1 / math.Max(1, xMax)
+			uscal := tscal
+			if cnorm[j] > (bignum-xj)*rec {
+				// If x[j] could overflow, scale x by 1/(2*xMax).
+				rec *= 0.5
+				if tjj > 1 {
+					// Divide by A[j,j] when scaling x if A[j,j] > 1.
+					rec = math.Min(1, rec*tjj)
+					uscal /= tjjs
+				}
+				if rec < 1 {
+					bi.Dscal(n, rec, x, 1)
+					scale *= rec
+					xMax *= rec
+				}
+			}
+
+			var sumj float64
+			if uscal == 1 {
+				// If the scaling needed for A in the dot product is 1, call
+				// Ddot to perform the dot product...
+				if uplo == blas.Upper {
+					jlen := min(j, kd)
+					if jlen > 0 {
+						sumj = bi.Ddot(jlen, ab[(j-jlen)*ldab+jlen:], kld, x[j-jlen:], 1)
+					}
+				} else {
+					jlen := min(n-j-1, kd)
+					if jlen > 0 {
+						sumj = bi.Ddot(jlen, ab[(j+1)*ldab+kd-1:], kld, x[j+1:], 1)
+					}
+				}
+			} else {
+				// ...otherwise, use in-line code for the dot product.
+				if uplo == blas.Upper {
+					jlen := min(j, kd)
+					for i := 0; i < jlen; i++ {
+						sumj += (ab[(j-jlen+i)*ldab+jlen-i] * uscal) * x[j-jlen+i]
+					}
+				} else {
+					jlen := min(n-j-1, kd)
+					for i := 0; i < jlen; i++ {
+						sumj += (ab[(j+1+i)*ldab+kd-1-i] * uscal) * x[j+i+1]
+					}
+				}
+			}
+
+			if uscal == tscal {
+				// Compute x[j] := ( x[j] - sumj ) / A[j,j]
+				// if 1/A[j,j] was not used to scale the dot product.
+				x[j] -= sumj
+				xj = math.Abs(x[j])
+				// Compute x[j] = x[j] / A[j,j], scaling if necessary.
+				// Note: the reference implementation skips this step for blas.Unit matrices
+				// when tscal is equal to 1 but it complicates the logic and only saves
+				// the comparison and division in the first switch-case. Not skipping it
+				// is also consistent with the NoTrans case above.
+				switch {
+				case tjj > smlnum:
+					// smlnum < abs(A[j,j]):
+					if tjj < 1 && xj > tjj*bignum {
+						// Scale x by 1/abs(x[j]).
+						rec := 1 / xj
+						bi.Dscal(n, rec, x, 1)
+						scale *= rec
+						xMax *= rec
+					}
+					x[j] /= tjjs
+				case tjj > 0:
+					// 0 < abs(A[j,j]) <= smlnum:
+					if xj > tjj*bignum {
+						// Scale x by (1/abs(x[j]))*abs(A[j,j])*bignum.
+						rec := (tjj * bignum) / xj
+						bi.Dscal(n, rec, x, 1)
+						scale *= rec
+						xMax *= rec
+					}
+					x[j] /= tjjs
+				default:
+					// A[j,j] == 0: Set x[0:n] = 0, x[j] = 1, and scale = 0, and
+					// compute a solution Aᵀ * x = 0.
+					for i := range x[:n] {
+						x[i] = 0
+					}
+					x[j] = 1
+					scale = 0
+					xMax = 0
+				}
+			} else {
+				// Compute x[j] := x[j] / A[j,j] - sumj
+				// if the dot product has already been divided by 1/A[j,j].
+				x[j] = x[j]/tjjs - sumj
+			}
+			xMax = math.Max(xMax, math.Abs(x[j]))
+		}
+	}
+	scale /= tscal // Both branches solved with tscal*A, so undo that scaling for NoTrans and Trans alike.
+
+	// Scale the column norms by 1/tscal for return.
+	if tscal != 1 {
+		bi.Dscal(n, 1/tscal, cnorm, 1)
+	}
+	return scale
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlatdf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatdf.go
new file mode 100644
index 00000000000..83422912b9b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatdf.go
@@ -0,0 +1,175 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlatdf computes a contribution to the reciprocal Dif-estimate by solving
+//
+//	Z * x = h - f
+//
+// and choosing the vector h such that the norm of x is as large as possible.
+//
+// The n×n matrix Z is represented by its LU factorization as computed by Dgetc2
+// and has the form
+//
+//	Z = P * L * U * Q
+//
+// where P and Q are permutation matrices, L is lower triangular with unit
+// diagonal elements and U is upper triangular.
+//
+// job specifies the heuristic method for computing the contribution.
+//
+// If job is lapack.LocalLookAhead, all entries of h are chosen as either +1 or
+// -1.
+//
+// If job is lapack.NormalizedNullVector, an approximate null-vector e of Z is
+// computed using Dgecon and normalized. h is chosen as ±e with the sign giving
+// the greater value of 2-norm(x). This strategy is about 5 times as expensive
+// as LocalLookAhead.
+//
+// On entry, rhs holds the contribution f from earlier solved sub-systems. On
+// return, rhs holds the solution x.
+//
+// ipiv and jpiv contain the pivot indices as returned by Dgetc2: row i of the
+// matrix has been interchanged with row ipiv[i] and column j of the matrix has
+// been interchanged with column jpiv[j].
+//
+// n must be at most 8, ipiv and jpiv must have length n, and rhs must have
+// length at least n, otherwise Dlatdf will panic.
+//
+// rdsum and rdscal represent the sum of squares of computed contributions to
+// the Dif-estimate from earlier solved sub-systems. rdscal is the scaling
+// factor used to prevent overflow in rdsum. Dlatdf returns this sum of squares
+// updated with the contributions from the current sub-system.
+//
+// Dlatdf is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlatdf(job lapack.MaximizeNormXJob, n int, z []float64, ldz int, rhs []float64, rdsum, rdscal float64, ipiv, jpiv []int) (scale, sum float64) {
+	switch {
+	case job != lapack.LocalLookAhead && job != lapack.NormalizedNullVector:
+		panic(badMaximizeNormXJob)
+	case n < 0:
+		panic(nLT0)
+	case n > 8:
+		panic("lapack: n > 8")
+	case ldz < max(1, n):
+		panic(badLdZ)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(z) < (n-1)*ldz+n:
+		panic(shortZ)
+	case len(rhs) < n:
+		panic(shortRHS)
+	case len(ipiv) != n:
+		panic(badLenIpiv)
+	case len(jpiv) != n:
+		panic(badLenJpiv)
+	}
+
+	const maxdim = 8
+	var (
+		xps   [maxdim]float64
+		xms   [maxdim]float64
+		work  [4 * maxdim]float64
+		iwork [maxdim]int
+	)
+	bi := blas64.Implementation()
+	xp := xps[:n]
+	xm := xms[:n]
+	if job == lapack.NormalizedNullVector {
+		// Compute approximate nullvector xm of Z.
+		_ = impl.Dgecon(lapack.MaxRowSum, n, z, ldz, 1, work[:], iwork[:])
+		// This relies on undocumented content in work[n:2*n] stored by Dgecon.
+		bi.Dcopy(n, work[n:], 1, xm, 1)
+
+		// Compute rhs.
+		impl.Dlaswp(1, xm, 1, 0, n-2, ipiv[:n-1], -1)
+		tmp := 1 / bi.Dnrm2(n, xm, 1)
+		bi.Dscal(n, tmp, xm, 1)
+		bi.Dcopy(n, xm, 1, xp, 1)
+		bi.Daxpy(n, 1, rhs, 1, xp, 1)
+		bi.Daxpy(n, -1.0, xm, 1, rhs, 1)
+		_ = impl.Dgesc2(n, z, ldz, rhs, ipiv, jpiv)
+		_ = impl.Dgesc2(n, z, ldz, xp, ipiv, jpiv)
+		if bi.Dasum(n, xp, 1) > bi.Dasum(n, rhs, 1) {
+			bi.Dcopy(n, xp, 1, rhs, 1)
+		}
+
+		// Compute and return the updated sum of squares.
+		return impl.Dlassq(n, rhs, 1, rdscal, rdsum)
+	}
+
+	// Apply permutations ipiv to rhs
+	impl.Dlaswp(1, rhs, 1, 0, n-2, ipiv[:n-1], 1)
+
+	// Solve for L-part choosing rhs either to +1 or -1.
+	pmone := -1.0
+	for j := 0; j < n-1; j++ { // Every column of L except the last; rhs[n-1] is handled in the U-part.
+		bp := rhs[j] + 1
+		bm := rhs[j] - 1
+
+		// Look-ahead for L-part rhs[0:n-1] = +1 or -1, splus and sminu computed
+		// more efficiently than in https://doi.org/10.1109/9.29404.
+		splus := 1 + bi.Ddot(n-j-1, z[(j+1)*ldz+j:], ldz, z[(j+1)*ldz+j:], ldz)
+		sminu := bi.Ddot(n-j-1, z[(j+1)*ldz+j:], ldz, rhs[j+1:], 1)
+		splus *= rhs[j]
+		switch {
+		case splus > sminu:
+			rhs[j] = bp
+		case sminu > splus:
+			rhs[j] = bm
+		default:
+			// In this case the updating sums are equal and we can choose rhs[j]
+			// +1 or -1. The first time this happens we choose -1, thereafter
+			// +1. This is a simple way to get good estimates of matrices like
+			// Byers well-known example (see https://doi.org/10.1109/9.29404).
+			rhs[j] += pmone
+			pmone = 1
+		}
+
+		// Compute remaining rhs.
+		bi.Daxpy(n-j-1, -rhs[j], z[(j+1)*ldz+j:], ldz, rhs[j+1:], 1)
+	}
+
+	// Solve for U-part, look-ahead for rhs[n-1] = ±1. This is not done in
+	// Bsolve and will hopefully give us a better estimate because any
+	// ill-conditioning of the original matrix is transferred to U and not to L.
+	// U[n-1,n-1] is an approximation to sigma_min(LU).
+	bi.Dcopy(n-1, rhs, 1, xp, 1)
+	xp[n-1] = rhs[n-1] + 1
+	rhs[n-1] -= 1
+	var splus, sminu float64
+	for i := n - 1; i >= 0; i-- {
+		tmp := 1 / z[i*ldz+i]
+		xp[i] *= tmp
+		rhs[i] *= tmp
+		for k := i + 1; k < n; k++ {
+			xp[i] -= xp[k] * (z[i*ldz+k] * tmp)
+			rhs[i] -= rhs[k] * (z[i*ldz+k] * tmp)
+		}
+		splus += math.Abs(xp[i])
+		sminu += math.Abs(rhs[i])
+	}
+	if splus > sminu {
+		bi.Dcopy(n, xp, 1, rhs, 1)
+	}
+
+	// Apply the permutations jpiv to the computed solution (rhs).
+	impl.Dlaswp(1, rhs, 1, 0, n-2, jpiv[:n-1], -1)
+
+	// Compute and return the updated sum of squares.
+	return impl.Dlassq(n, rhs, 1, rdscal, rdsum)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrd.go
new file mode 100644
index 00000000000..195be09c9b7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrd.go
@@ -0,0 +1,176 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlatrd reduces nb rows and columns of a real n×n symmetric matrix A to symmetric
+// tridiagonal form. It computes the orthonormal similarity transformation
+//
+//	Qᵀ * A * Q
+//
+// and returns the matrices V and W to apply to the unreduced part of A. If
+// uplo == blas.Upper, the upper triangle is supplied and the last nb rows are
+// reduced. If uplo == blas.Lower, the lower triangle is supplied and the first
+// nb rows are reduced.
+//
+// a contains the symmetric matrix on entry with active triangular half specified
+// by uplo. On exit, the nb columns have been reduced to tridiagonal form. The
+// diagonal contains the diagonal of the reduced matrix, the off-diagonal is
+// set to 1, and the remaining elements contain the data to construct Q.
+//
+// If uplo == blas.Upper, with n = 5 and nb = 2 on exit a is
+//
+//	[ a   a   a  v4  v5]
+//	[     a   a  v4  v5]
+//	[         a   1  v5]
+//	[             d   1]
+//	[                 d]
+//
+// If uplo == blas.Lower, with n = 5 and nb = 2, on exit a is
+//
+//	[ d                ]
+//	[ 1   d            ]
+//	[v1   1   a        ]
+//	[v1  v2   a   a    ]
+//	[v1  v2   a   a   a]
+//
+// e contains the superdiagonal elements of the reduced matrix. If uplo == blas.Upper,
+// e[n-nb:n-1] contains the last nb columns of the reduced matrix, while if
+// uplo == blas.Lower, e[:nb] contains the first nb columns of the reduced matrix.
+// e must have length at least n-1, and Dlatrd will panic otherwise.
+//
+// tau contains the scalar factors of the elementary reflectors needed to construct Q.
+// The reflectors are stored in tau[n-nb:n-1] if uplo == blas.Upper, and in
+// tau[:nb] if uplo == blas.Lower. tau must have length at least n-1, and Dlatrd
+// will panic otherwise.
+//
+// w is an n×nb matrix. On exit it contains the data to update the unreduced part
+// of A.
+//
+// The matrix Q is represented as a product of elementary reflectors. Each reflector
+// H has the form
+//
+//	I - tau * v * vᵀ
+//
+// If uplo == blas.Upper,
+//
+//	Q = H_{n-1} * H_{n-2} * ... * H_{n-nb}
+//
+// where v[:i-1] is stored in A[:i-1,i], v[i-1] = 1, and v[i:n] = 0.
+//
+// If uplo == blas.Lower,
+//
+//	Q = H_0 * H_1 * ... * H_{nb-1}
+//
+// where v[:i+1] = 0, v[i+1] = 1, and v[i+2:n] is stored in A[i+2:n,i].
+//
+// The vectors v form the n×nb matrix V which is used with W to apply a
+// symmetric rank-2 update to the unreduced part of A
+//
+//	A = A - V * Wᵀ - W * Vᵀ
+//
+// Dlatrd is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlatrd(uplo blas.Uplo, n, nb int, a []float64, lda int, e, tau, w []float64, ldw int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case nb < 0:
+		panic(nbLT0)
+	case nb > n:
+		panic(nbGTN)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldw < max(1, nb):
+		panic(badLdW)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(w) < (n-1)*ldw+nb:
+		panic(shortW)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(tau) < n-1:
+		panic(shortTau)
+	}
+
+	bi := blas64.Implementation()
+
+	if uplo == blas.Upper {
+		for i := n - 1; i >= n-nb; i-- {
+			iw := i - n + nb // Column of w paired with column i of a.
+			if i < n-1 {
+				// Update A(0:i, i).
+				bi.Dgemv(blas.NoTrans, i+1, n-i-1, -1, a[i+1:], lda,
+					w[i*ldw+iw+1:], 1, 1, a[i:], lda)
+				bi.Dgemv(blas.NoTrans, i+1, n-i-1, -1, w[iw+1:], ldw,
+					a[i*lda+i+1:], 1, 1, a[i:], lda)
+			}
+			if i > 0 {
+				// Generate elementary reflector H_i to annihilate A(0:i-2,i).
+				e[i-1], tau[i-1] = impl.Dlarfg(i, a[(i-1)*lda+i], a[i:], lda)
+				a[(i-1)*lda+i] = 1
+
+				// Compute W(0:i-1, i).
+				bi.Dsymv(blas.Upper, i, 1, a, lda, a[i:], lda, 0, w[iw:], ldw)
+				if i < n-1 {
+					bi.Dgemv(blas.Trans, i, n-i-1, 1, w[iw+1:], ldw,
+						a[i:], lda, 0, w[(i+1)*ldw+iw:], ldw)
+					bi.Dgemv(blas.NoTrans, i, n-i-1, -1, a[i+1:], lda,
+						w[(i+1)*ldw+iw:], ldw, 1, w[iw:], ldw)
+					bi.Dgemv(blas.Trans, i, n-i-1, 1, a[i+1:], lda,
+						a[i:], lda, 0, w[(i+1)*ldw+iw:], ldw)
+					bi.Dgemv(blas.NoTrans, i, n-i-1, -1, w[iw+1:], ldw,
+						w[(i+1)*ldw+iw:], ldw, 1, w[iw:], ldw)
+				}
+				bi.Dscal(i, tau[i-1], w[iw:], ldw)
+				alpha := -0.5 * tau[i-1] * bi.Ddot(i, w[iw:], ldw, a[i:], lda)
+				bi.Daxpy(i, alpha, a[i:], lda, w[iw:], ldw)
+			}
+		}
+	} else {
+		// Reduce first nb columns of lower triangle.
+		for i := 0; i < nb; i++ {
+			// Update A(i:n, i)
+			bi.Dgemv(blas.NoTrans, n-i, i, -1, a[i*lda:], lda,
+				w[i*ldw:], 1, 1, a[i*lda+i:], lda)
+			bi.Dgemv(blas.NoTrans, n-i, i, -1, w[i*ldw:], ldw,
+				a[i*lda:], 1, 1, a[i*lda+i:], lda)
+			if i < n-1 {
+				// Generate elementary reflector H_i to annihilate A(i+2:n,i).
+				e[i], tau[i] = impl.Dlarfg(n-i-1, a[(i+1)*lda+i], a[min(i+2, n-1)*lda+i:], lda)
+				a[(i+1)*lda+i] = 1
+
+				// Compute W(i+1:n,i).
+				bi.Dsymv(blas.Lower, n-i-1, 1, a[(i+1)*lda+i+1:], lda,
+					a[(i+1)*lda+i:], lda, 0, w[(i+1)*ldw+i:], ldw)
+				bi.Dgemv(blas.Trans, n-i-1, i, 1, w[(i+1)*ldw:], ldw,
+					a[(i+1)*lda+i:], lda, 0, w[i:], ldw)
+				bi.Dgemv(blas.NoTrans, n-i-1, i, -1, a[(i+1)*lda:], lda,
+					w[i:], ldw, 1, w[(i+1)*ldw+i:], ldw)
+				bi.Dgemv(blas.Trans, n-i-1, i, 1, a[(i+1)*lda:], lda,
+					a[(i+1)*lda+i:], lda, 0, w[i:], ldw)
+				bi.Dgemv(blas.NoTrans, n-i-1, i, -1, w[(i+1)*ldw:], ldw,
+					w[i:], ldw, 1, w[(i+1)*ldw+i:], ldw)
+				bi.Dscal(n-i-1, tau[i], w[(i+1)*ldw+i:], ldw)
+				alpha := -0.5 * tau[i] * bi.Ddot(n-i-1, w[(i+1)*ldw+i:], ldw,
+					a[(i+1)*lda+i:], lda)
+				bi.Daxpy(n-i-1, alpha, a[(i+1)*lda+i:], lda,
+					w[(i+1)*ldw+i:], ldw)
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrs.go
new file mode 100644
index 00000000000..f13b7d57c09
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlatrs.go
@@ -0,0 +1,410 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dlatrs solves a triangular system of equations scaled to prevent overflow. It
+// solves
+//
+//	A * x = scale * b if trans == blas.NoTrans
+//	Aᵀ * x = scale * b if trans == blas.Trans or blas.ConjTrans
+//
+// where scale is the returned scaling factor, set for numeric stability.
+//
+// A is an n×n triangular matrix. On entry, the slice x contains the values of
+// b, and on exit it contains the solution vector x.
+//
+// If normin == true, cnorm is an input and cnorm[j] contains the norm of the off-diagonal
+// part of the j^th column of A. If trans == blas.NoTrans, cnorm[j] must be greater
+// than or equal to the infinity norm, and greater than or equal to the one-norm
+// otherwise. If normin == false, then cnorm is treated as an output, and is set
+// to contain the 1-norm of the off-diagonal part of the j^th column of A.
+//
+// Dlatrs is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dlatrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, normin bool, n int, a []float64, lda int, x []float64, cnorm []float64) (scale float64) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans:
+		panic(badTrans)
+	case diag != blas.Unit && diag != blas.NonUnit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return 1
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(x) < n:
+		panic(shortX)
+	case len(cnorm) < n:
+		panic(shortCNorm)
+	}
+
+	upper := uplo == blas.Upper
+	nonUnit := diag == blas.NonUnit
+
+	smlnum := dlamchS / dlamchP
+	bignum := 1 / smlnum
+	scale = 1
+
+	bi := blas64.Implementation()
+
+	if !normin {
+		if upper {
+			cnorm[0] = 0
+			for j := 1; j < n; j++ {
+				cnorm[j] = bi.Dasum(j, a[j:], lda)
+			}
+		} else {
+			for j := 0; j < n-1; j++ {
+				cnorm[j] = bi.Dasum(n-j-1, a[(j+1)*lda+j:], lda)
+			}
+			cnorm[n-1] = 0
+		}
+	}
+	// Scale the column norms by tscal if the maximum element in cnorm is greater than bignum.
+	imax := bi.Idamax(n, cnorm, 1)
+	var tscal float64
+	if cnorm[imax] <= bignum {
+		tscal = 1
+	} else {
+		tmax := cnorm[imax]
+		// Avoid NaN generation if entries in cnorm exceed the overflow
+		// threshold.
+		if tmax <= math.MaxFloat64 {
+			// Case 1: All entries in cnorm are valid floating-point numbers.
+			tscal = 1 / (smlnum * tmax)
+			bi.Dscal(n, tscal, cnorm, 1)
+		} else {
+			// Case 2: At least one column norm of A cannot be represented as
+			// floating-point number. Find the offdiagonal entry A[i,j] with the
+			// largest absolute value. If this entry is not +/- Infinity, use
+			// this value as tscal.
+			tmax = 0
+			if upper {
+				// A is upper triangular.
+				for j := 1; j < n; j++ {
+					tmax = math.Max(impl.Dlange(lapack.MaxAbs, j, 1, a[j:], lda, nil), tmax)
+				}
+			} else {
+				// A is lower triangular.
+				for j := 0; j < n-1; j++ {
+					tmax = math.Max(impl.Dlange(lapack.MaxAbs, n-j-1, 1, a[(j+1)*lda+j:], lda, nil), tmax)
+				}
+			}
+			if tmax <= math.MaxFloat64 {
+				tscal = 1 / (smlnum * tmax)
+				for j := 0; j < n; j++ {
+					if cnorm[j] <= math.MaxFloat64 {
+						cnorm[j] *= tscal
+					} else {
+						// Recompute the 1-norm without introducing Infinity in
+						// the summation.
+						cnorm[j] = 0
+						if upper {
+							for i := 0; i < j; i++ {
+								cnorm[j] += tscal * math.Abs(a[i*lda+j])
+							}
+						} else {
+							for i := j + 1; i < n; i++ {
+								cnorm[j] += tscal * math.Abs(a[i*lda+j])
+							}
+						}
+					}
+				}
+			} else {
+				// At least one entry of A is not a valid floating-point entry.
+				// Rely on Dtrsv to propagate Inf and NaN.
+				bi.Dtrsv(uplo, trans, diag, n, a, lda, x, 1)
+				return
+			}
+		}
+	}
+
+	// Compute a bound on the computed solution vector to see if bi.Dtrsv can be used.
+	j := bi.Idamax(n, x, 1)
+	xmax := math.Abs(x[j])
+	xbnd := xmax
+	var grow float64
+	var jfirst, jlast, jinc int
+	if trans == blas.NoTrans {
+		if upper {
+			jfirst = n - 1
+			jlast = -1
+			jinc = -1
+		} else {
+			jfirst = 0
+			jlast = n
+			jinc = 1
+		}
+		// Compute the growth in A * x = b.
+		if tscal != 1 {
+			grow = 0
+			goto Solve
+		}
+		if nonUnit {
+			grow = 1 / math.Max(xbnd, smlnum)
+			xbnd = grow
+			for j := jfirst; j != jlast; j += jinc {
+				if grow <= smlnum {
+					goto Solve
+				}
+				tjj := math.Abs(a[j*lda+j])
+				xbnd = math.Min(xbnd, math.Min(1, tjj)*grow)
+				if tjj+cnorm[j] >= smlnum {
+					grow *= tjj / (tjj + cnorm[j])
+				} else {
+					grow = 0
+				}
+			}
+			grow = xbnd
+		} else {
+			grow = math.Min(1, 1/math.Max(xbnd, smlnum))
+			for j := jfirst; j != jlast; j += jinc {
+				if grow <= smlnum {
+					goto Solve
+				}
+				grow *= 1 / (1 + cnorm[j])
+			}
+		}
+	} else {
+		if upper {
+			jfirst = 0
+			jlast = n
+			jinc = 1
+		} else {
+			jfirst = n - 1
+			jlast = -1
+			jinc = -1
+		}
+		if tscal != 1 {
+			grow = 0
+			goto Solve
+		}
+		if nonUnit {
+			grow = 1 / (math.Max(xbnd, smlnum))
+			xbnd = grow
+			for j := jfirst; j != jlast; j += jinc {
+				if grow <= smlnum {
+					goto Solve
+				}
+				xj := 1 + cnorm[j]
+				grow = math.Min(grow, xbnd/xj)
+				tjj := math.Abs(a[j*lda+j])
+				if xj > tjj {
+					xbnd *= tjj / xj
+				}
+			}
+			grow = math.Min(grow, xbnd)
+		} else {
+			grow = math.Min(1, 1/math.Max(xbnd, smlnum))
+			for j := jfirst; j != jlast; j += jinc {
+				if grow <= smlnum {
+					goto Solve
+				}
+				xj := 1 + cnorm[j]
+				grow /= xj
+			}
+		}
+	}
+
+Solve:
+	if grow*tscal > smlnum {
+		// Use the Level 2 BLAS solve if the reciprocal of the bound on
+		// elements of X is not too small.
+		bi.Dtrsv(uplo, trans, diag, n, a, lda, x, 1)
+		if tscal != 1 {
+			bi.Dscal(n, 1/tscal, cnorm, 1)
+		}
+		return scale
+	}
+
+	// Use a Level 1 BLAS solve, scaling intermediate results.
+	if xmax > bignum {
+		scale = bignum / xmax
+		bi.Dscal(n, scale, x, 1)
+		xmax = bignum
+	}
+	if trans == blas.NoTrans {
+		for j := jfirst; j != jlast; j += jinc {
+			xj := math.Abs(x[j])
+			var tjj, tjjs float64
+			if nonUnit {
+				tjjs = a[j*lda+j] * tscal
+			} else {
+				tjjs = tscal
+				if tscal == 1 {
+					goto Skip1
+				}
+			}
+			tjj = math.Abs(tjjs)
+			if tjj > smlnum {
+				if tjj < 1 {
+					if xj > tjj*bignum {
+						rec := 1 / xj
+						bi.Dscal(n, rec, x, 1)
+						scale *= rec
+						xmax *= rec
+					}
+				}
+				x[j] /= tjjs
+				xj = math.Abs(x[j])
+			} else if tjj > 0 {
+				if xj > tjj*bignum {
+					rec := (tjj * bignum) / xj
+					if cnorm[j] > 1 {
+						rec /= cnorm[j]
+					}
+					bi.Dscal(n, rec, x, 1)
+					scale *= rec
+					xmax *= rec
+				}
+				x[j] /= tjjs
+				xj = math.Abs(x[j])
+			} else {
+				for i := 0; i < n; i++ {
+					x[i] = 0
+				}
+				x[j] = 1
+				xj = 1
+				scale = 0
+				xmax = 0
+			}
+		Skip1:
+			if xj > 1 {
+				rec := 1 / xj
+				if cnorm[j] > (bignum-xmax)*rec {
+					rec *= 0.5
+					bi.Dscal(n, rec, x, 1)
+					scale *= rec
+				}
+			} else if xj*cnorm[j] > bignum-xmax {
+				bi.Dscal(n, 0.5, x, 1)
+				scale *= 0.5
+			}
+			if upper {
+				if j > 0 {
+					bi.Daxpy(j, -x[j]*tscal, a[j:], lda, x, 1)
+					i := bi.Idamax(j, x, 1)
+					xmax = math.Abs(x[i])
+				}
+			} else {
+				if j < n-1 {
+					bi.Daxpy(n-j-1, -x[j]*tscal, a[(j+1)*lda+j:], lda, x[j+1:], 1)
+					i := j + 1 + bi.Idamax(n-j-1, x[j+1:], 1) // Idamax indexes into x[j+1:], so offset by j+1.
+					xmax = math.Abs(x[i])
+				}
+			}
+		}
+	} else {
+		for j := jfirst; j != jlast; j += jinc {
+			xj := math.Abs(x[j])
+			uscal := tscal
+			rec := 1 / math.Max(xmax, 1)
+			var tjjs float64
+			if cnorm[j] > (bignum-xj)*rec {
+				rec *= 0.5
+				if nonUnit {
+					tjjs = a[j*lda+j] * tscal
+				} else {
+					tjjs = tscal
+				}
+				tjj := math.Abs(tjjs)
+				if tjj > 1 {
+					rec = math.Min(1, rec*tjj)
+					uscal /= tjjs
+				}
+				if rec < 1 {
+					bi.Dscal(n, rec, x, 1)
+					scale *= rec
+					xmax *= rec
+				}
+			}
+			var sumj float64
+			if uscal == 1 {
+				if upper {
+					sumj = bi.Ddot(j, a[j:], lda, x, 1)
+				} else if j < n-1 {
+					sumj = bi.Ddot(n-j-1, a[(j+1)*lda+j:], lda, x[j+1:], 1)
+				}
+			} else {
+				if upper {
+					for i := 0; i < j; i++ {
+						sumj += (a[i*lda+j] * uscal) * x[i]
+					}
+				} else if j < n {
+					for i := j + 1; i < n; i++ {
+						sumj += (a[i*lda+j] * uscal) * x[i]
+					}
+				}
+			}
+			if uscal == tscal {
+				x[j] -= sumj
+				xj := math.Abs(x[j])
+				var tjjs float64
+				if nonUnit {
+					tjjs = a[j*lda+j] * tscal
+				} else {
+					tjjs = tscal
+					if tscal == 1 {
+						goto Skip2
+					}
+				}
+				tjj := math.Abs(tjjs)
+				if tjj > smlnum {
+					if tjj < 1 {
+						if xj > tjj*bignum {
+							rec = 1 / xj
+							bi.Dscal(n, rec, x, 1)
+							scale *= rec
+							xmax *= rec
+						}
+					}
+					x[j] /= tjjs
+				} else if tjj > 0 {
+					if xj > tjj*bignum {
+						rec = (tjj * bignum) / xj
+						bi.Dscal(n, rec, x, 1)
+						scale *= rec
+						xmax *= rec
+					}
+					x[j] /= tjjs
+				} else {
+					for i := 0; i < n; i++ {
+						x[i] = 0
+					}
+					x[j] = 1
+					scale = 0
+					xmax = 0
+				}
+			} else {
+				x[j] = x[j]/tjjs - sumj
+			}
+		Skip2:
+			xmax = math.Max(xmax, math.Abs(x[j]))
+		}
+	}
+	scale /= tscal
+	if tscal != 1 {
+		bi.Dscal(n, 1/tscal, cnorm, 1)
+	}
+	return scale
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlauu2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlauu2.go
new file mode 100644
index 00000000000..b70a8420d58
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlauu2.go
@@ -0,0 +1,66 @@
+// Copyright ©2018 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlauu2 computes the product
+//
+//	U * Uᵀ  if uplo is blas.Upper
+//	Lᵀ * L  if uplo is blas.Lower
+//
+// where U or L is stored in the upper or lower triangular part of A.
+// Only the upper or lower triangle of the result is stored, overwriting
+// the corresponding factor in A. Dlauu2 panics if uplo is invalid, n < 0, lda < max(1,n), or (for n > 0) len(a) < (n-1)*lda+n.
+func (impl Implementation) Dlauu2(uplo blas.Uplo, n int, a []float64, lda int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+
+	bi := blas64.Implementation()
+
+	if uplo == blas.Upper {
+		// Compute the product U*Uᵀ.
+		for i := 0; i < n; i++ {
+			aii := a[i*lda+i] // Save the diagonal element; a[i*lda+i] is overwritten below.
+			if i < n-1 {
+				a[i*lda+i] = bi.Ddot(n-i, a[i*lda+i:], 1, a[i*lda+i:], 1)
+				bi.Dgemv(blas.NoTrans, i, n-i-1, 1, a[i+1:], lda, a[i*lda+i+1:], 1,
+					aii, a[i:], lda)
+			} else {
+				bi.Dscal(i+1, aii, a[i:], lda) // i == n-1: scale column i, rows 0 through i, by aii.
+			}
+		}
+	} else {
+		// Compute the product Lᵀ*L.
+		for i := 0; i < n; i++ {
+			aii := a[i*lda+i] // Save the diagonal element; a[i*lda+i] is overwritten below.
+			if i < n-1 {
+				a[i*lda+i] = bi.Ddot(n-i, a[i*lda+i:], lda, a[i*lda+i:], lda)
+				bi.Dgemv(blas.Trans, n-i-1, i, 1, a[(i+1)*lda:], lda, a[(i+1)*lda+i:], lda,
+					aii, a[i*lda:], 1)
+			} else {
+				bi.Dscal(i+1, aii, a[i*lda:], 1) // i == n-1: scale row i, columns 0 through i, by aii.
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dlauum.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dlauum.go
new file mode 100644
index 00000000000..575ed7c88fe
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dlauum.go
@@ -0,0 +1,83 @@
+// Copyright ©2018 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dlauum computes the product
+//
+//	U * Uᵀ  if uplo is blas.Upper
+//	Lᵀ * L  if uplo is blas.Lower
+//
+// where U or L is stored in the upper or lower triangular part of A.
+// Only the upper or lower triangle of the result is stored, overwriting
+// the corresponding factor in A. Dlauum panics if uplo is invalid, n < 0, lda < max(1,n), or (for n > 0) len(a) < (n-1)*lda+n.
+func (impl Implementation) Dlauum(uplo blas.Uplo, n int, a []float64, lda int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+
+	// Determine the block size.
+	opts := "U"
+	if uplo == blas.Lower {
+		opts = "L"
+	}
+	nb := impl.Ilaenv(1, "DLAUUM", opts, n, -1, -1, -1)
+
+	if nb <= 1 || n <= nb {
+		// Use unblocked code.
+		impl.Dlauu2(uplo, n, a, lda)
+		return
+	}
+
+	// Use blocked code.
+	bi := blas64.Implementation()
+	if uplo == blas.Upper {
+		// Compute the product U*Uᵀ.
+		for i := 0; i < n; i += nb {
+			ib := min(nb, n-i) // Order of the current diagonal block; the last block may be smaller than nb.
+			bi.Dtrmm(blas.Right, blas.Upper, blas.Trans, blas.NonUnit,
+				i, ib, 1, a[i*lda+i:], lda, a[i:], lda)
+			impl.Dlauu2(blas.Upper, ib, a[i*lda+i:], lda)
+			if n-i-ib > 0 {
+				bi.Dgemm(blas.NoTrans, blas.Trans, i, ib, n-i-ib,
+					1, a[i+ib:], lda, a[i*lda+i+ib:], lda, 1, a[i:], lda)
+				bi.Dsyrk(blas.Upper, blas.NoTrans, ib, n-i-ib,
+					1, a[i*lda+i+ib:], lda, 1, a[i*lda+i:], lda)
+			}
+		}
+	} else {
+		// Compute the product Lᵀ*L.
+		for i := 0; i < n; i += nb {
+			ib := min(nb, n-i) // Order of the current diagonal block; the last block may be smaller than nb.
+			bi.Dtrmm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit,
+				ib, i, 1, a[i*lda+i:], lda, a[i*lda:], lda)
+			impl.Dlauu2(blas.Lower, ib, a[i*lda+i:], lda)
+			if n-i-ib > 0 {
+				bi.Dgemm(blas.Trans, blas.NoTrans, ib, i, n-i-ib,
+					1, a[(i+ib)*lda+i:], lda, a[(i+ib)*lda:], lda, 1, a[i*lda:], lda)
+				bi.Dsyrk(blas.Lower, blas.Trans, ib, n-i-ib,
+					1, a[(i+ib)*lda+i:], lda, 1, a[i*lda+i:], lda)
+			}
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/doc.go b/vendor/gonum.org/v1/gonum/lapack/gonum/doc.go
new file mode 100644
index 00000000000..087f63cc6ee
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/doc.go
@@ -0,0 +1,28 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package gonum is a pure-go implementation of the LAPACK API. The LAPACK API defines
+// a set of algorithms for advanced matrix operations.
+//
+// The function definitions and implementations follow that of the netlib reference
+// implementation. See http://www.netlib.org/lapack/explore-html/ for more
+// information, and http://www.netlib.org/lapack/explore-html/d4/de1/_l_i_c_e_n_s_e_source.html
+// for more license information.
+//
+// Slice function arguments frequently represent vectors and matrices. The data
+// layout is identical to that found in https://pkg.go.dev/gonum.org/v1/gonum/blas/gonum.
+//
+// Most LAPACK functions are built on top of the routines defined in the BLAS API,
+// and as such the computation time for many LAPACK functions is
+// dominated by BLAS calls. Here, BLAS is accessed through the
+// blas64 package (https://pkg.go.dev/gonum.org/v1/gonum/blas/blas64). In particular,
+// this implies that an external BLAS library will be used if it is
+// registered in blas64.
+//
+// The full LAPACK capability has not been implemented at present. The full
+// API is very large, containing approximately 200 functions for double precision
+// alone. Future additions will be focused on supporting the Gonum matrix
+// package (https://pkg.go.dev/gonum.org/v1/gonum/mat), though pull requests
+// with implementations and tests for LAPACK functions are encouraged.
+package gonum // import "gonum.org/v1/gonum/lapack/gonum"
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorg2l.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorg2l.go
new file mode 100644
index 00000000000..fdb37af2a70
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorg2l.go
@@ -0,0 +1,78 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dorg2l generates an m×n matrix Q with orthonormal columns which is defined
+// as the last n columns of a product of k elementary reflectors of order m.
+//
+//	Q = H_{k-1} * ... * H_1 * H_0
+//
+// See Dgelqf for more information. It must be that m >= n >= k.
+//
+// tau contains the scalar reflectors computed by Dgeqlf. tau must have length
+// at least k, and Dorg2l will panic otherwise.
+//
+// work contains temporary memory, and must have length at least n. Dorg2l will
+// panic otherwise.
+//
+// Dorg2l is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorg2l(m, n, k int, a []float64, lda int, tau, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case n > m:
+		panic(nGTM)
+	case k < 0:
+		panic(kLT0)
+	case k > n:
+		panic(kGTN)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	// Initialize columns 0:n-k to columns of the unit matrix.
+	for j := 0; j < n-k; j++ {
+		for l := 0; l < m; l++ {
+			a[l*lda+j] = 0
+		}
+		a[(m-n+j)*lda+j] = 1
+	}
+
+	bi := blas64.Implementation()
+	for i := 0; i < k; i++ {
+		ii := n - k + i
+
+		// Apply H_i to A[0:m-k+i+1, 0:n-k+i] from the left.
+		a[(m-n+ii)*lda+ii] = 1
+		impl.Dlarf(blas.Left, m-n+ii+1, ii, a[ii:], lda, tau[i], a, lda, work)
+		bi.Dscal(m-n+ii, -tau[i], a[ii:], lda)
+		a[(m-n+ii)*lda+ii] = 1 - tau[i]
+
+		// Set A[m-k+i+1:m, n-k+i] to zero.
+		for l := m - n + ii + 1; l < m; l++ {
+			a[l*lda+ii] = 0
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorg2r.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorg2r.go
new file mode 100644
index 00000000000..c56f24cbd96
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorg2r.go
@@ -0,0 +1,77 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dorg2r generates an m×n matrix Q with orthonormal columns defined by the
+// product of elementary reflectors as computed by Dgeqrf.
+//
+//	Q = H_0 * H_1 * ... * H_{k-1}
+//
+// len(tau) = k, 0 <= k <= n, 0 <= n <= m, len(work) >= n.
+// Dorg2r will panic if these conditions are not met.
+//
+// Dorg2r is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorg2r(m, n, k int, a []float64, lda int, tau []float64, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case n > m:
+		panic(nGTM)
+	case k < 0:
+		panic(kLT0)
+	case k > n:
+		panic(kGTN)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) != k:
+		panic(badLenTau)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	bi := blas64.Implementation()
+
+	// Initialize columns k:n to columns of the unit matrix.
+	for l := 0; l < m; l++ {
+		for j := k; j < n; j++ {
+			a[l*lda+j] = 0
+		}
+	}
+	for j := k; j < n; j++ {
+		a[j*lda+j] = 1
+	}
+	for i := k - 1; i >= 0; i-- {
+		for i := range work { // This i shadows the outer index only inside the loop that clears work.
+			work[i] = 0
+		}
+		if i < n-1 {
+			a[i*lda+i] = 1
+			impl.Dlarf(blas.Left, m-i, n-i-1, a[i*lda+i:], lda, tau[i], a[i*lda+i+1:], lda, work)
+		}
+		if i < m-1 {
+			bi.Dscal(m-i-1, -tau[i], a[(i+1)*lda+i:], lda)
+		}
+		a[i*lda+i] = 1 - tau[i]
+		for l := 0; l < i; l++ {
+			a[l*lda+i] = 0
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgbr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgbr.go
new file mode 100644
index 00000000000..35535100b61
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgbr.go
@@ -0,0 +1,138 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/lapack"
+
+// Dorgbr generates one of the matrices Q or Pᵀ determined by the
+// decomposition computed by Dgebrd. See Dgebd2 for the description of
+// Q and Pᵀ.
+//
+// If vect == lapack.GenerateQ, then a is assumed to have been an m×k matrix and
+// Q is of order m. If m >= k, then Dorgbr returns the first n columns of Q
+// where m >= n >= k. If m < k, then Dorgbr returns Q as an m×m matrix.
+//
+// If vect == lapack.GeneratePT, then A is assumed to have been a k×n matrix, and
+// Pᵀ is of order n. If k < n, then Dorgbr returns the first m rows of Pᵀ,
+// where n >= m >= k. If k >= n, then Dorgbr returns Pᵀ as an n×n matrix.
+//
+// Dorgbr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorgbr(vect lapack.GenOrtho, m, n, k int, a []float64, lda int, tau, work []float64, lwork int) {
+	wantq := vect == lapack.GenerateQ
+	mn := min(m, n)
+	switch {
+	case vect != lapack.GenerateQ && vect != lapack.GeneratePT:
+		panic(badGenOrtho)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case wantq && n > m:
+		panic(nGTM)
+	case wantq && n < min(m, k):
+		panic("lapack: n < min(m,k)")
+	case !wantq && m > n:
+		panic(mGTN)
+	case !wantq && m < min(n, k):
+		panic("lapack: m < min(n,k)")
+	case lda < max(1, n) && lwork != -1:
+		// Normally, we follow the reference and require the leading
+		// dimension to be always valid, even in case of workspace
+		// queries. However, if a caller provided a placeholder value
+		// for lda (and a) when doing a workspace query that didn't
+		// fulfill the condition here, it would cause a panic. This is
+		// exactly what Dgesvd does.
+		panic(badLdA)
+	case lwork < max(1, mn) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	work[0] = 1
+	if m == 0 || n == 0 {
+		return
+	}
+
+	if wantq { // Workspace queries (lwork == -1): the callee stores its optimal lwork in work[0].
+		if m >= k {
+			impl.Dorgqr(m, n, k, a, lda, tau, work, -1)
+		} else if m > 1 {
+			impl.Dorgqr(m-1, m-1, m-1, a[lda+1:], lda, tau, work, -1)
+		}
+	} else {
+		if k < n {
+			impl.Dorglq(m, n, k, a, lda, tau, work, -1)
+		} else if n > 1 {
+			impl.Dorglq(n-1, n-1, n-1, a[lda+1:], lda, tau, work, -1)
+		}
+	}
+	lworkopt := int(work[0])
+	lworkopt = max(lworkopt, mn)
+	if lwork == -1 {
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case wantq && len(tau) < min(m, k):
+		panic(shortTau)
+	case !wantq && len(tau) < min(n, k):
+		panic(shortTau)
+	}
+
+	if wantq {
+		// Form Q, determined by a call to Dgebrd to reduce an m×k matrix.
+		if m >= k {
+			impl.Dorgqr(m, n, k, a, lda, tau[:k], work, lwork)
+		} else {
+			// Shift the vectors which define the elementary reflectors one
+			// column to the right, and set the first row and column of Q to
+			// those of the unit matrix.
+			for j := m - 1; j >= 1; j-- {
+				a[j] = 0
+				for i := j + 1; i < m; i++ {
+					a[i*lda+j] = a[i*lda+j-1]
+				}
+			}
+			a[0] = 1
+			for i := 1; i < m; i++ {
+				a[i*lda] = 0
+			}
+			if m > 1 {
+				// Form Q[1:m, 1:m].
+				impl.Dorgqr(m-1, m-1, m-1, a[lda+1:], lda, tau[:m-1], work, lwork)
+			}
+		}
+	} else {
+		// Form Pᵀ, determined by a call to Dgebrd to reduce a k×n matrix.
+		if k < n {
+			impl.Dorglq(m, n, k, a, lda, tau, work, lwork)
+		} else {
+			// Shift the vectors which define the elementary reflectors one
+			// row downward, and set the first row and column of Pᵀ to
+			// those of the unit matrix.
+			a[0] = 1
+			for i := 1; i < n; i++ {
+				a[i*lda] = 0
+			}
+			for j := 1; j < n; j++ {
+				for i := j - 1; i >= 1; i-- {
+					a[i*lda+j] = a[(i-1)*lda+j]
+				}
+				a[j] = 0
+			}
+			if n > 1 {
+				impl.Dorglq(n-1, n-1, n-1, a[lda+1:], lda, tau, work, lwork)
+			}
+		}
+	}
+	work[0] = float64(lworkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorghr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorghr.go
new file mode 100644
index 00000000000..8f0dd452ec3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorghr.go
@@ -0,0 +1,103 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Dorghr generates an n×n orthogonal matrix Q which is defined as the product
+// of ihi-ilo elementary reflectors:
+//
+//	Q = H_{ilo} H_{ilo+1} ... H_{ihi-1}.
+//
+// a and lda represent an n×n matrix that contains the elementary reflectors, as
+// returned by Dgehrd. On return, a is overwritten by the n×n orthogonal matrix
+// Q. Q will be equal to the identity matrix except in the submatrix
+// Q[ilo+1:ihi+1,ilo+1:ihi+1].
+//
+// ilo and ihi must have the same values as in the previous call of Dgehrd. It
+// must hold that
+//
+//	0 <= ilo <= ihi < n  if n > 0,
+//	ilo = 0, ihi = -1    if n == 0.
+//
+// tau contains the scalar factors of the elementary reflectors, as returned by
+// Dgehrd. tau must have length at least n-1.
+//
+// work must have length at least max(1,lwork) and lwork must be at least
+// ihi-ilo. For optimum performance lwork must be at least (ihi-ilo)*nb where nb
+// is the optimal blocksize. On return, work[0] will contain the optimal value
+// of lwork.
+//
+// If lwork == -1, instead of performing Dorghr, only the optimal value of lwork
+// will be stored into work[0].
+//
+// If any requirement on input sizes is not met, Dorghr will panic.
+//
+// Dorghr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorghr(n, ilo, ihi int, a []float64, lda int, tau, work []float64, lwork int) {
+	nh := ihi - ilo
+	switch {
+	case ilo < 0 || max(1, n) <= ilo:
+		panic(badIlo)
+	case ihi < min(ilo, n-1) || n <= ihi:
+		panic(badIhi)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, nh) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		work[0] = 1
+		return
+	}
+
+	lwkopt := max(1, nh) * impl.Ilaenv(1, "DORGQR", " ", nh, nh, nh, -1)
+	if lwork == -1 {
+		work[0] = float64(lwkopt)
+		return
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(tau) < n-1:
+		panic(shortTau)
+	}
+
+	// Shift the vectors which define the elementary reflectors one column
+	// to the right.
+	for i := ilo + 2; i < ihi+1; i++ {
+		copy(a[i*lda+ilo+1:i*lda+i], a[i*lda+ilo:i*lda+i-1])
+	}
+	// Set the first ilo+1 and the last n-ihi-1 rows and columns to those of
+	// the identity matrix.
+	for i := 0; i < ilo+1; i++ {
+		for j := 0; j < n; j++ {
+			a[i*lda+j] = 0
+		}
+		a[i*lda+i] = 1
+	}
+	for i := ilo + 1; i < ihi+1; i++ {
+		for j := 0; j <= ilo; j++ {
+			a[i*lda+j] = 0
+		}
+		for j := i; j < n; j++ {
+			a[i*lda+j] = 0
+		}
+	}
+	for i := ihi + 1; i < n; i++ {
+		for j := 0; j < n; j++ {
+			a[i*lda+j] = 0
+		}
+		a[i*lda+i] = 1
+	}
+	if nh > 0 {
+		// Generate Q[ilo+1:ihi+1,ilo+1:ihi+1].
+		impl.Dorgqr(nh, nh, nh, a[(ilo+1)*lda+ilo+1:], lda, tau[ilo:ihi], work, lwork)
+	}
+	work[0] = float64(lwkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgl2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgl2.go
new file mode 100644
index 00000000000..6dd9a888630
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgl2.go
@@ -0,0 +1,79 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dorgl2 generates an m×n matrix Q with orthonormal rows defined as the first m
+// rows of a product of k elementary reflectors of order n
+//
+//	Q = H_{k-1} * ... * H_0
+//
+// as returned by Dgelqf.
+//
+// On entry, tau and the first k rows of A must contain the scalar factors and
+// the vectors, respectively, which define the elementary reflectors H_i,
+// i=0,...,k-1, as returned by Dgelqf. On return, A contains the matrix Q.
+//
+// tau must have length at least k, work must have length at least m, and it
+// must hold that 0 <= k <= m <= n, otherwise Dorgl2 will panic.
+//
+// Dorgl2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorgl2(m, n, k int, a []float64, lda int, tau, work []float64) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < m:
+		panic(nLTM)
+	case k < 0:
+		panic(kLT0)
+	case k > m:
+		panic(kGTM)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if m == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	case len(work) < m:
+		panic(shortWork)
+	}
+
+	bi := blas64.Implementation()
+
+	if k < m { // Initialise rows k:m to rows of the unit matrix.
+		for i := k; i < m; i++ {
+			for j := 0; j < n; j++ {
+				a[i*lda+j] = 0
+			}
+		}
+		for j := k; j < m; j++ {
+			a[j*lda+j] = 1
+		}
+	}
+	for i := k - 1; i >= 0; i-- {
+		if i < n-1 {
+			if i < m-1 {
+				a[i*lda+i] = 1
+				impl.Dlarf(blas.Right, m-i-1, n-i, a[i*lda+i:], 1, tau[i], a[(i+1)*lda+i:], lda, work)
+			}
+			bi.Dscal(n-i-1, -tau[i], a[i*lda+i+1:], 1)
+		}
+		a[i*lda+i] = 1 - tau[i]
+		for l := 0; l < i; l++ { // Zero the entries of row i to the left of the diagonal.
+			a[i*lda+l] = 0
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorglq.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorglq.go
new file mode 100644
index 00000000000..d6b3aadfca0
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorglq.go
@@ -0,0 +1,125 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dorglq generates an m×n matrix Q with orthonormal rows defined as the first m
+// rows of a product of k elementary reflectors of order n
+//
+//	Q = H_{k-1} * ... * H_0
+//
+// as returned by Dgelqf.
+//
+// On entry, tau and the first k rows of A must contain the scalar factors and
+// the vectors, respectively, which define the elementary reflectors H_i,
+// i=0,...,k-1, as returned by Dgelqf. On return, A contains the matrix Q.
+//
+// tau must have length at least k, work must have length at least lwork and
+// lwork must be at least max(1,m). On return, optimal value of lwork will be
+// stored in work[0]. It must also hold that 0 <= k <= m <= n, otherwise Dorglq
+// will panic.
+//
+// If lwork == -1, instead of performing Dorglq, the function only calculates
+// the optimal value of lwork and stores it into work[0].
+func (impl Implementation) Dorglq(m, n, k int, a []float64, lda int, tau, work []float64, lwork int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < m:
+		panic(nLTM)
+	case k < 0:
+		panic(kLT0)
+	case k > m:
+		panic(kGTM)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, m) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	if m == 0 {
+		work[0] = 1
+		return
+	}
+
+	nb := impl.Ilaenv(1, "DORGLQ", " ", m, n, k, -1)
+	if lwork == -1 {
+		work[0] = float64(m * nb)
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	}
+
+	nbmin := 2 // Minimum block size
+	var nx int // Crossover size from blocked to unblocked code
+	iws := m   // Length of work needed
+	var ldwork int
+	if 1 < nb && nb < k {
+		nx = max(0, impl.Ilaenv(3, "DORGLQ", " ", m, n, k, -1))
+		if nx < k {
+			ldwork = nb
+			iws = m * ldwork
+			if lwork < iws {
+				nb = lwork / m
+				ldwork = nb
+				nbmin = max(2, impl.Ilaenv(2, "DORGLQ", " ", m, n, k, -1))
+			}
+		}
+	}
+
+	var ki, kk int
+	if nbmin <= nb && nb < k && nx < k {
+		// The first kk rows are handled by the blocked method.
+		ki = ((k - nx - 1) / nb) * nb
+		kk = min(k, ki+nb)
+		for i := kk; i < m; i++ {
+			for j := 0; j < kk; j++ {
+				a[i*lda+j] = 0
+			}
+		}
+	}
+	if kk < m {
+		// Perform the operation on rows kk to the end.
+		impl.Dorgl2(m-kk, n-kk, k-kk, a[kk*lda+kk:], lda, tau[kk:], work)
+	}
+	if kk > 0 {
+		// Perform the operation on row-blocks.
+		for i := ki; i >= 0; i -= nb {
+			ib := min(nb, k-i)
+			if i+ib < m {
+				impl.Dlarft(lapack.Forward, lapack.RowWise,
+					n-i, ib,
+					a[i*lda+i:], lda,
+					tau[i:],
+					work, ldwork)
+
+				impl.Dlarfb(blas.Right, blas.Trans, lapack.Forward, lapack.RowWise,
+					m-i-ib, n-i, ib,
+					a[i*lda+i:], lda,
+					work, ldwork,
+					a[(i+ib)*lda+i:], lda,
+					work[ib*ldwork:], ldwork)
+			}
+			impl.Dorgl2(ib, n-i, ib, a[i*lda+i:], lda, tau[i:], work)
+			for l := i; l < i+ib; l++ {
+				for j := 0; j < i; j++ {
+					a[l*lda+j] = 0
+				}
+			}
+		}
+	}
+	work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgql.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgql.go
new file mode 100644
index 00000000000..d5ef17f3b6c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgql.go
@@ -0,0 +1,139 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dorgql generates the m×n matrix Q with orthonormal columns defined as the
+// last n columns of a product of k elementary reflectors of order m
+//
+//	Q = H_{k-1} * ... * H_1 * H_0.
+//
+// It must hold that
+//
+//	0 <= k <= n <= m,
+//
+// and Dorgql will panic otherwise.
+//
+// On entry, the (n-k+i)-th column of A must contain the vector which defines
+// the elementary reflector H_i, for i=0,...,k-1, and tau[i] must contain its
+// scalar factor. On return, a contains the m×n matrix Q.
+//
+// tau must have length at least k, and Dorgql will panic otherwise.
+//
+// work must have length at least max(1,lwork), and lwork must be at least
+// max(1,n), otherwise Dorgql will panic. For optimum performance lwork must
+// be a sufficiently large multiple of n.
+//
+// If lwork == -1, instead of computing Dorgql the optimal work length is stored
+// into work[0].
+//
+// Dorgql is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorgql(m, n, k int, a []float64, lda int, tau, work []float64, lwork int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case n > m:
+		panic(nGTM)
+	case k < 0:
+		panic(kLT0)
+	case k > n:
+		panic(kGTN)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, n) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		work[0] = 1
+		return
+	}
+
+	nb := impl.Ilaenv(1, "DORGQL", " ", m, n, k, -1)
+	if lwork == -1 {
+		work[0] = float64(n * nb)
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	}
+
+	nbmin := 2
+	var nx, ldwork int
+	iws := n
+	if 1 < nb && nb < k {
+		// Determine when to cross over from blocked to unblocked code.
+		nx = max(0, impl.Ilaenv(3, "DORGQL", " ", m, n, k, -1))
+		if nx < k {
+			// Determine if workspace is large enough for blocked code.
+			iws = n * nb
+			if lwork < iws {
+				// Not enough workspace to use optimal nb: reduce nb and determine
+				// the minimum value of nb.
+				nb = lwork / n
+				nbmin = max(2, impl.Ilaenv(2, "DORGQL", " ", m, n, k, -1))
+			}
+			ldwork = nb
+		}
+	}
+
+	var kk int
+	if nbmin <= nb && nb < k && nx < k {
+		// Use blocked code after the first block. The last kk columns are handled
+		// by the block method.
+		kk = min(k, ((k-nx+nb-1)/nb)*nb)
+
+		// Set A[m-kk:m, 0:n-kk] to zero.
+		for i := m - kk; i < m; i++ {
+			for j := 0; j < n-kk; j++ {
+				a[i*lda+j] = 0
+			}
+		}
+	}
+
+	// Use unblocked code for the first or only block.
+	impl.Dorg2l(m-kk, n-kk, k-kk, a, lda, tau, work)
+	if kk > 0 {
+		// Use blocked code.
+		for i := k - kk; i < k; i += nb {
+			ib := min(nb, k-i)
+			if n-k+i > 0 {
+				// Form the triangular factor of the block reflector
+				// H = H_{i+ib-1} * ... * H_{i+1} * H_i.
+				impl.Dlarft(lapack.Backward, lapack.ColumnWise, m-k+i+ib, ib,
+					a[n-k+i:], lda, tau[i:], work, ldwork)
+
+				// Apply H to A[0:m-k+i+ib, 0:n-k+i] from the left.
+				impl.Dlarfb(blas.Left, blas.NoTrans, lapack.Backward, lapack.ColumnWise,
+					m-k+i+ib, n-k+i, ib, a[n-k+i:], lda, work, ldwork,
+					a, lda, work[ib*ldwork:], ldwork)
+			}
+
+			// Apply H to rows 0:m-k+i+ib of current block.
+			impl.Dorg2l(m-k+i+ib, ib, ib, a[n-k+i:], lda, tau[i:], work)
+
+			// Set rows m-k+i+ib:m of current block to zero.
+			for j := n - k + i; j < n-k+i+ib; j++ {
+				for l := m - k + i + ib; l < m; l++ {
+					a[l*lda+j] = 0
+				}
+			}
+		}
+	}
+	work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgqr.go
new file mode 100644
index 00000000000..a1e0fa87164
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgqr.go
@@ -0,0 +1,136 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dorgqr generates an m×n matrix Q with orthonormal columns defined by the
+// product of elementary reflectors
+//
+//	Q = H_0 * H_1 * ... * H_{k-1}
+//
+// as computed by Dgeqrf.
+// Dorgqr is the blocked version of Dorg2r that makes greater use of level-3 BLAS
+// routines.
+//
+// The length of tau must be k, and the length of work must be at least n.
+// It also must be that 0 <= k <= n and 0 <= n <= m.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At
+// minimum, lwork >= n, and the amount of blocking is limited by the usable
+// length. If lwork == -1, instead of computing Dorgqr the optimal work length
+// is stored into work[0].
+//
+// Dorgqr will panic if the conditions on input values are not met.
+//
+// Dorgqr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorgqr(m, n, k int, a []float64, lda int, tau, work []float64, lwork int) {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case n > m:
+		panic(nGTM)
+	case k < 0:
+		panic(kLT0)
+	case k > n:
+		panic(kGTN)
+	case lda < max(1, n) && lwork != -1:
+		// Normally, we follow the reference and require the leading
+		// dimension to be always valid, even in case of workspace
+		// queries. However, if a caller provided a placeholder value
+		// for lda (and a) when doing a workspace query that didn't
+		// fulfill the condition here, it would cause a panic. This is
+		// exactly what Dgesvd does.
+		panic(badLdA)
+	case lwork < max(1, n) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	if n == 0 {
+		work[0] = 1
+		return
+	}
+
+	nb := impl.Ilaenv(1, "DORGQR", " ", m, n, k, -1)
+	// work is treated as an n×nb matrix
+	if lwork == -1 {
+		work[0] = float64(n * nb)
+		return
+	}
+
+	switch {
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(tau) != k:
+		panic(badLenTau)
+	}
+
+	nbmin := 2 // Minimum block size
+	var nx int // Crossover size from blocked to unblocked code
+	iws := n   // Length of work needed
+	var ldwork int
+	if 1 < nb && nb < k {
+		nx = max(0, impl.Ilaenv(3, "DORGQR", " ", m, n, k, -1))
+		if nx < k {
+			ldwork = nb
+			iws = n * ldwork
+			if lwork < iws {
+				nb = lwork / n
+				ldwork = nb
+				nbmin = max(2, impl.Ilaenv(2, "DORGQR", " ", m, n, k, -1))
+			}
+		}
+	}
+	var ki, kk int
+	if nbmin <= nb && nb < k && nx < k {
+		// The first kk columns are handled by the blocked method.
+		ki = ((k - nx - 1) / nb) * nb
+		kk = min(k, ki+nb)
+		for i := 0; i < kk; i++ {
+			for j := kk; j < n; j++ {
+				a[i*lda+j] = 0
+			}
+		}
+	}
+	if kk < n {
+		// Perform the operation on columns kk to the end.
+		impl.Dorg2r(m-kk, n-kk, k-kk, a[kk*lda+kk:], lda, tau[kk:], work)
+	}
+	if kk > 0 {
+		// Perform the operation on column-blocks.
+		for i := ki; i >= 0; i -= nb {
+			ib := min(nb, k-i)
+			if i+ib < n {
+				impl.Dlarft(lapack.Forward, lapack.ColumnWise,
+					m-i, ib,
+					a[i*lda+i:], lda,
+					tau[i:],
+					work, ldwork)
+
+				impl.Dlarfb(blas.Left, blas.NoTrans, lapack.Forward, lapack.ColumnWise,
+					m-i, n-i-ib, ib,
+					a[i*lda+i:], lda,
+					work, ldwork,
+					a[i*lda+i+ib:], lda,
+					work[ib*ldwork:], ldwork)
+			}
+			impl.Dorg2r(m-i, ib, ib, a[i*lda+i:], lda, tau[i:i+ib], work)
+			// Set rows 0:i of current block to zero.
+			for j := i; j < i+ib; j++ {
+				for l := 0; l < i; l++ {
+					a[l*lda+j] = 0
+				}
+			}
+		}
+	}
+	work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgr2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgr2.go
new file mode 100644
index 00000000000..6f2790cb8ff
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgr2.go
@@ -0,0 +1,83 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dorgr2 generates an m×n real matrix Q with orthonormal rows, which is defined
+// as the last m rows of a product of k elementary reflectors of order n
+//
+//	Q = H_0 * H_1 * ... * H_{k-1}
+//
+// as returned by Dgerqf.
+//
+// On entry, the (m-k+i)-th row of A must contain the vector which defines the
+// elementary reflector H_i, for i = 0,1,...,k-1, as returned by Dgerqf. On
+// return, A will contain the m×n matrix Q.
+//
+// The i-th element of tau must contain the scalar factor of the elementary
+// reflector H_i, as returned by Dgerqf.
+//
+// It must hold that
+//
+//	n >= m >= k >= 0,
+//
+// the length of tau must be k, the length of work must be at least m and lda
+// must be at least max(1,n), otherwise Dorgr2 will panic.
+//
+// Dorgr2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorgr2(m, n, k int, a []float64, lda int, tau, work []float64) {
+	switch {
+	case k < 0:
+		panic(kLT0)
+	case m < k:
+		panic(kGTM)
+	case n < m:
+		panic(mGTN)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if m == 0 {
+		return
+	}
+
+	switch {
+	case len(tau) != k:
+		panic(badLenTau)
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+	case len(work) < m:
+		panic(shortWork)
+	}
+
+	// Initialise rows 0:m-k to rows of the unit matrix.
+	for l := 0; l < m-k; l++ {
+		row := a[l*lda : l*lda+n]
+		for j := range row {
+			row[j] = 0
+		}
+		a[l*lda+n-m+l] = 1
+	}
+	bi := blas64.Implementation()
+	for i := 0; i < k; i++ {
+		ii := m - k + i
+
+		// Apply H_i to A[0:m-k+i, 0:n-k+i+1] from the right.
+		a[ii*lda+n-m+ii] = 1
+		impl.Dlarf(blas.Right, ii, n-m+ii+1, a[ii*lda:], 1, tau[i], a, lda, work)
+		bi.Dscal(n-m+ii, -tau[i], a[ii*lda:], 1)
+		a[ii*lda+n-m+ii] = 1 - tau[i]
+
+		// Set A[m-k+i, n-k+i+1:n] to zero.
+		for l := n - m + ii + 1; l < n; l++ {
+			a[ii*lda+l] = 0
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorgtr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgtr.go
new file mode 100644
index 00000000000..7021ae53d30
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorgtr.go
@@ -0,0 +1,106 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dorgtr generates a real orthogonal matrix Q which is defined as the product
+// of n-1 elementary reflectors of order n as returned by Dsytrd.
+//
+// The construction of Q depends on the value of uplo:
+//
+//	Q = H_{n-1} * ... * H_1 * H_0  if uplo == blas.Upper
+//	Q = H_0 * H_1 * ... * H_{n-1}  if uplo == blas.Lower
+//
+// where H_i is constructed from the elementary reflectors as computed by Dsytrd.
+// See the documentation for Dsytrd for more information.
+//
+// tau must have length at least n-1, and Dorgtr will panic otherwise.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At
+// minimum, lwork >= max(1,n-1), and Dorgtr will panic otherwise. The amount of blocking
+// is limited by the usable length.
+// If lwork == -1, instead of computing Dorgtr the optimal work length is stored
+// into work[0].
+//
+// Dorgtr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorgtr(uplo blas.Uplo, n int, a []float64, lda int, tau, work []float64, lwork int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, n-1) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	if n == 0 {
+		work[0] = 1
+		return
+	}
+
+	var nb int
+	if uplo == blas.Upper {
+		nb = impl.Ilaenv(1, "DORGQL", " ", n-1, n-1, n-1, -1)
+	} else {
+		nb = impl.Ilaenv(1, "DORGQR", " ", n-1, n-1, n-1, -1)
+	}
+	lworkopt := max(1, n-1) * nb
+	if lwork == -1 {
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(tau) < n-1:
+		panic(shortTau)
+	}
+
+	if uplo == blas.Upper {
+		// Q was determined by a call to Dsytrd with uplo == blas.Upper.
+		// Shift the vectors which define the elementary reflectors one column
+		// to the left, and set the last row and column of Q to those of the unit
+		// matrix.
+		for j := 0; j < n-1; j++ {
+			for i := 0; i < j; i++ {
+				a[i*lda+j] = a[i*lda+j+1]
+			}
+			a[(n-1)*lda+j] = 0
+		}
+		for i := 0; i < n-1; i++ {
+			a[i*lda+n-1] = 0
+		}
+		a[(n-1)*lda+n-1] = 1
+
+		// Generate Q[0:n-1, 0:n-1].
+		impl.Dorgql(n-1, n-1, n-1, a, lda, tau, work, lwork)
+	} else {
+		// Q was determined by a call to Dsytrd with uplo == blas.Lower.
+		// Shift the vectors which define the elementary reflectors one column
+		// to the right, and set the first row and column of Q to those of the unit
+		// matrix.
+		for j := n - 1; j > 0; j-- {
+			a[j] = 0
+			for i := j + 1; i < n; i++ {
+				a[i*lda+j] = a[i*lda+j-1]
+			}
+		}
+		a[0] = 1
+		for i := 1; i < n; i++ {
+			a[i*lda] = 0
+		}
+		if n > 1 {
+			// Generate Q[1:n, 1:n].
+			impl.Dorgqr(n-1, n-1, n-1, a[lda+1:], lda, tau[:n-1], work, lwork)
+		}
+	}
+	work[0] = float64(lworkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorm2r.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorm2r.go
new file mode 100644
index 00000000000..aea77a70d24
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorm2r.go
@@ -0,0 +1,103 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dorm2r multiplies a general matrix C by an orthogonal matrix from a QR factorization
+// determined by Dgeqrf.
+//
+//	C = Q * C   if side == blas.Left and trans == blas.NoTrans
+//	C = Qᵀ * C  if side == blas.Left and trans == blas.Trans
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans
+//
+// If side == blas.Left, a is a matrix of size m×k with k <= m, and if
+// side == blas.Right a is of size n×k with k <= n.
+//
+// tau contains the Householder factors and must have length k and this function
+// will panic otherwise.
+//
+// work is temporary storage of length at least n if side == blas.Left
+// and at least m if side == blas.Right and this function will panic otherwise.
+//
+// Dorm2r is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorm2r(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64) {
+	left := side == blas.Left
+	switch {
+	case !left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.Trans && trans != blas.NoTrans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case left && k > m:
+		panic(kGTM)
+	case !left && k > n:
+		panic(kGTN)
+	case lda < max(1, k):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 || k == 0 {
+		return
+	}
+
+	switch {
+	case left && len(a) < (m-1)*lda+k:
+		panic(shortA)
+	case !left && len(a) < (n-1)*lda+k:
+		panic(shortA)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	case len(tau) != k:
+		panic(badLenTau)
+	case left && len(work) < n:
+		panic(shortWork)
+	case !left && len(work) < m:
+		panic(shortWork)
+	}
+
+	if left {
+		if trans == blas.NoTrans {
+			for i := k - 1; i >= 0; i-- { // Reflectors applied from the left, last one first.
+				aii := a[i*lda+i]
+				a[i*lda+i] = 1 // Temporarily overwritten for the Dlarf call; restored from aii below.
+				impl.Dlarf(side, m-i, n, a[i*lda+i:], lda, tau[i], c[i*ldc:], ldc, work)
+				a[i*lda+i] = aii
+			}
+			return
+		}
+		for i := 0; i < k; i++ { // Reflectors applied from the left, first one first.
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1
+			impl.Dlarf(side, m-i, n, a[i*lda+i:], lda, tau[i], c[i*ldc:], ldc, work)
+			a[i*lda+i] = aii
+		}
+		return
+	}
+	if trans == blas.NoTrans {
+		for i := 0; i < k; i++ { // Reflectors applied from the right, first one first.
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1
+			impl.Dlarf(side, m, n-i, a[i*lda+i:], lda, tau[i], c[i:], ldc, work)
+			a[i*lda+i] = aii
+		}
+		return
+	}
+	for i := k - 1; i >= 0; i-- { // Reflectors applied from the right, last one first.
+		aii := a[i*lda+i]
+		a[i*lda+i] = 1
+		impl.Dlarf(side, m, n-i, a[i*lda+i:], lda, tau[i], c[i:], ldc, work)
+		a[i*lda+i] = aii
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormbr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormbr.go
new file mode 100644
index 00000000000..8be7040c92d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormbr.go
@@ -0,0 +1,180 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dormbr applies a multiplicative update to the matrix C based on a
+// decomposition computed by Dgebrd.
+//
+// Dormbr overwrites the m×n matrix C with
+//
+//	Q * C   if vect == lapack.ApplyQ, side == blas.Left, and trans == blas.NoTrans
+//	C * Q   if vect == lapack.ApplyQ, side == blas.Right, and trans == blas.NoTrans
+//	Qᵀ * C  if vect == lapack.ApplyQ, side == blas.Left, and trans == blas.Trans
+//	C * Qᵀ  if vect == lapack.ApplyQ, side == blas.Right, and trans == blas.Trans
+//
+//	P * C   if vect == lapack.ApplyP, side == blas.Left, and trans == blas.NoTrans
+//	C * P   if vect == lapack.ApplyP, side == blas.Right, and trans == blas.NoTrans
+//	Pᵀ * C  if vect == lapack.ApplyP, side == blas.Left, and trans == blas.Trans
+//	C * Pᵀ  if vect == lapack.ApplyP, side == blas.Right, and trans == blas.Trans
+//
+// where P and Q are the orthogonal matrices determined by Dgebrd when reducing
+// a matrix A to bidiagonal form: A = Q * B * Pᵀ. See Dgebrd for the
+// definitions of Q and P.
+//
+// If vect == lapack.ApplyQ, A is assumed to have been an nq×k matrix, while if
+// vect == lapack.ApplyP, A is assumed to have been a k×nq matrix. nq = m if
+// side == blas.Left, while nq = n if side == blas.Right.
+//
+// tau must have length at least min(nq,k), and Dormbr will panic otherwise. tau
+// contains the elementary reflectors to construct Q or P depending on the value
+// of vect.
+//
+// work must have length at least max(1,lwork), and lwork must be either -1 or
+// at least max(1,n) if side == blas.Left, and at least max(1,m) if side ==
+// blas.Right. For optimum performance lwork should be at least n*nb if side ==
+// blas.Left, and at least m*nb if side == blas.Right, where nb is the optimal
+// block size. On return, work[0] will contain the optimal value of lwork.
+//
+// If lwork == -1, the function only calculates the optimal value of lwork and
+// returns it in work[0].
+//
+// Dormbr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dormbr(vect lapack.ApplyOrtho, side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) {
+	nq := n
+	nw := m
+	if side == blas.Left {
+		nq = m
+		nw = n
+	}
+	applyQ := vect == lapack.ApplyQ
+	switch {
+	case !applyQ && vect != lapack.ApplyP:
+		panic(badApplyOrtho)
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.NoTrans && trans != blas.Trans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case applyQ && lda < max(1, min(nq, k)):
+		panic(badLdA)
+	case !applyQ && lda < max(1, nq):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	case lwork < max(1, nw) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		work[0] = 1
+		return
+	}
+
+	// The current implementation does not use opts, but a future change may
+	// use these options so construct them.
+	var opts string
+	if side == blas.Left {
+		opts = "L"
+	} else {
+		opts = "R"
+	}
+	if trans == blas.Trans {
+		opts += "T"
+	} else {
+		opts += "N"
+	}
+	var nb int
+	if applyQ {
+		if side == blas.Left {
+			nb = impl.Ilaenv(1, "DORMQR", opts, m-1, n, m-1, -1)
+		} else {
+			nb = impl.Ilaenv(1, "DORMQR", opts, m, n-1, n-1, -1)
+		}
+	} else {
+		if side == blas.Left {
+			nb = impl.Ilaenv(1, "DORMLQ", opts, m-1, n, m-1, -1)
+		} else {
+			nb = impl.Ilaenv(1, "DORMLQ", opts, m, n-1, n-1, -1)
+		}
+	}
+	lworkopt := max(1, nw) * nb
+	if lwork == -1 {
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	minnqk := min(nq, k)
+	switch {
+	case applyQ && len(a) < (nq-1)*lda+minnqk:
+		panic(shortA)
+	case !applyQ && len(a) < (minnqk-1)*lda+nq:
+		panic(shortA)
+	case len(tau) < minnqk:
+		panic(shortTau)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	}
+
+	if applyQ {
+		// Change the operation to get Q depending on the size of the initial
+		// matrix to Dgebrd. The size matters due to the storage location of
+		// the off-diagonal elements.
+		if nq >= k {
+			impl.Dormqr(side, trans, m, n, k, a, lda, tau[:k], c, ldc, work, lwork)
+		} else if nq > 1 {
+			mi := m
+			ni := n - 1
+			i1 := 0
+			i2 := 1
+			if side == blas.Left {
+				mi = m - 1
+				ni = n
+				i1 = 1
+				i2 = 0
+			}
+			impl.Dormqr(side, trans, mi, ni, nq-1, a[lda:], lda, tau[:nq-1], c[i1*ldc+i2:], ldc, work, lwork)
+		}
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	transt := blas.Trans // Dormlq is called with the opposite of trans.
+	if trans == blas.Trans {
+		transt = blas.NoTrans
+	}
+
+	// Change the operation to get P depending on the size of the initial
+	// matrix to Dgebrd. The size matters due to the storage location of
+	// the off-diagonal elements.
+	if nq > k {
+		impl.Dormlq(side, transt, m, n, k, a, lda, tau, c, ldc, work, lwork)
+	} else if nq > 1 {
+		mi := m
+		ni := n - 1
+		i1 := 0
+		i2 := 1
+		if side == blas.Left {
+			mi = m - 1
+			ni = n
+			i1 = 1
+			i2 = 0
+		}
+		impl.Dormlq(side, transt, mi, ni, nq-1, a[1:], lda, tau, c[i1*ldc+i2:], ldc, work, lwork)
+	}
+	work[0] = float64(lworkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormhr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormhr.go
new file mode 100644
index 00000000000..318a57adcac
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormhr.go
@@ -0,0 +1,134 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dormhr multiplies an m×n general matrix C with an nq×nq orthogonal matrix Q
+//
+//	Q * C   if side == blas.Left  and trans == blas.NoTrans,
+//	Qᵀ * C  if side == blas.Left  and trans == blas.Trans,
+//	C * Q   if side == blas.Right and trans == blas.NoTrans,
+//	C * Qᵀ  if side == blas.Right and trans == blas.Trans,
+//
+// where nq == m if side == blas.Left and nq == n if side == blas.Right.
+//
+// Q is defined implicitly as the product of ihi-ilo elementary reflectors, as
+// returned by Dgehrd:
+//
+//	Q = H_{ilo} H_{ilo+1} ... H_{ihi-1}.
+//
+// Q is equal to the identity matrix except in the submatrix
+// Q[ilo+1:ihi+1,ilo+1:ihi+1].
+//
+// ilo and ihi must have the same values as in the previous call of Dgehrd. It
+// must hold that
+//
+//	0 <= ilo <= ihi < m   if m > 0 and side == blas.Left,
+//	ilo = 0 and ihi = -1  if m = 0 and side == blas.Left,
+//	0 <= ilo <= ihi < n   if n > 0 and side == blas.Right,
+//	ilo = 0 and ihi = -1  if n = 0 and side == blas.Right.
+//
+// a and lda represent an m×m matrix if side == blas.Left and an n×n matrix if
+// side == blas.Right. The matrix contains vectors which define the elementary
+// reflectors, as returned by Dgehrd.
+//
+// tau contains the scalar factors of the elementary reflectors, as returned by
+// Dgehrd. tau must have length m-1 if side == blas.Left and n-1 if side ==
+// blas.Right.
+//
+// c and ldc represent the m×n matrix C. On return, c is overwritten by the
+// product with Q.
+//
+// work must have length at least max(1,lwork), and lwork must be at least
+// max(1,n), if side == blas.Left, and max(1,m), if side == blas.Right. For
+// optimum performance lwork should be at least n*nb if side == blas.Left and
+// m*nb if side == blas.Right, where nb is the optimal block size. On return,
+// work[0] will contain the optimal value of lwork.
+//
+// If lwork == -1, instead of performing Dormhr, only the optimal value of lwork
+// will be stored in work[0].
+//
+// If any requirement on input sizes is not met, Dormhr will panic.
+//
+// Dormhr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dormhr(side blas.Side, trans blas.Transpose, m, n, ilo, ihi int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) {
+	nq := n // The order of Q.
+	nw := m // The minimum length of work.
+	if side == blas.Left {
+		nq = m
+		nw = n
+	}
+	switch {
+	case side != blas.Left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.NoTrans && trans != blas.Trans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case ilo < 0 || max(1, nq) <= ilo:
+		panic(badIlo)
+	case ihi < min(ilo, nq-1) || nq <= ihi:
+		panic(badIhi)
+	case lda < max(1, nq):
+		panic(badLdA)
+	case lwork < max(1, nw) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		work[0] = 1
+		return
+	}
+
+	nh := ihi - ilo // The number of elementary reflectors that define Q.
+	var nb int
+	if side == blas.Left {
+		opts := "LN"
+		if trans == blas.Trans {
+			opts = "LT"
+		}
+		nb = impl.Ilaenv(1, "DORMQR", opts, nh, n, nh, -1)
+	} else {
+		opts := "RN"
+		if trans == blas.Trans {
+			opts = "RT"
+		}
+		nb = impl.Ilaenv(1, "DORMQR", opts, m, nh, nh, -1)
+	}
+	lwkopt := max(1, nw) * nb
+	if lwork == -1 {
+		work[0] = float64(lwkopt)
+		return
+	}
+
+	if nh == 0 { // No reflectors to apply; c is left unchanged.
+		work[0] = 1
+		return
+	}
+
+	switch {
+	case len(a) < (nq-1)*lda+nq:
+		panic(shortA)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	case len(tau) != nq-1:
+		panic(badLenTau)
+	}
+
+	if side == blas.Left {
+		impl.Dormqr(side, trans, nh, n, nh, a[(ilo+1)*lda+ilo:], lda,
+			tau[ilo:ihi], c[(ilo+1)*ldc:], ldc, work, lwork)
+	} else {
+		impl.Dormqr(side, trans, m, nh, nh, a[(ilo+1)*lda+ilo:], lda,
+			tau[ilo:ihi], c[ilo+1:], ldc, work, lwork)
+	}
+	work[0] = float64(lwkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dorml2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dorml2.go
new file mode 100644
index 00000000000..665e2102c8c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dorml2.go
@@ -0,0 +1,104 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dorml2 multiplies a general matrix C by an orthogonal matrix from an LQ factorization
+// determined by Dgelqf.
+//
+//	C = Q * C   if side == blas.Left and trans == blas.NoTrans
+//	C = Qᵀ * C  if side == blas.Left and trans == blas.Trans
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans
+//
+// If side == blas.Left, a is a matrix of size k×m with k <= m, and if
+// side == blas.Right a is of size k×n with k <= n.
+//
+// tau contains the Householder factors and is of length at least k and this function will
+// panic otherwise.
+//
+// work is temporary storage of length at least n if side == blas.Left
+// and at least m if side == blas.Right and this function will panic otherwise.
+//
+// Dorml2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dorml2(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64) {
+	left := side == blas.Left
+	switch {
+	case !left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.Trans && trans != blas.NoTrans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case left && k > m:
+		panic(kGTM)
+	case !left && k > n:
+		panic(kGTN)
+	case left && lda < max(1, m):
+		panic(badLdA)
+	case !left && lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 || k == 0 {
+		return
+	}
+
+	switch {
+	case left && len(a) < (k-1)*lda+m:
+		panic(shortA)
+	case !left && len(a) < (k-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	case left && len(work) < n:
+		panic(shortWork)
+	case !left && len(work) < m:
+		panic(shortWork)
+	}
+
+	notrans := trans == blas.NoTrans
+	switch {
+	case left && notrans:
+		for i := 0; i < k; i++ {
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1 // Temporarily overwritten for the Dlarf call; restored from aii below.
+			impl.Dlarf(side, m-i, n, a[i*lda+i:], 1, tau[i], c[i*ldc:], ldc, work)
+			a[i*lda+i] = aii
+		}
+
+	case left && !notrans:
+		for i := k - 1; i >= 0; i-- {
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1
+			impl.Dlarf(side, m-i, n, a[i*lda+i:], 1, tau[i], c[i*ldc:], ldc, work)
+			a[i*lda+i] = aii
+		}
+
+	case !left && notrans:
+		for i := k - 1; i >= 0; i-- {
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1
+			impl.Dlarf(side, m, n-i, a[i*lda+i:], 1, tau[i], c[i:], ldc, work)
+			a[i*lda+i] = aii
+		}
+
+	case !left && !notrans:
+		for i := 0; i < k; i++ {
+			aii := a[i*lda+i]
+			a[i*lda+i] = 1
+			impl.Dlarf(side, m, n-i, a[i*lda+i:], 1, tau[i], c[i:], ldc, work)
+			a[i*lda+i] = aii
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormlq.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormlq.go
new file mode 100644
index 00000000000..37b499739a1
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormlq.go
@@ -0,0 +1,176 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dormlq multiplies the matrix C by the orthogonal matrix Q defined by the
+// slices a and tau. A and tau are as returned from Dgelqf.
+//
+//	C = Q * C   if side == blas.Left and trans == blas.NoTrans
+//	C = Qᵀ * C  if side == blas.Left and trans == blas.Trans
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans
+//
+// If side == blas.Left, A is a matrix of size k×m, and if side == blas.Right
+// A is of size k×n. This uses a blocked algorithm.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= max(1,n) if side == blas.Left and lwork >= max(1,m) if
+// side == blas.Right, and this function will panic otherwise.
+// Dormlq uses a block algorithm, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Dormlq,
+// the optimal work length will be stored into work[0].
+//
+// tau contains the Householder scales and must have length at least k, and
+// this function will panic otherwise.
+func (impl Implementation) Dormlq(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) {
+	left := side == blas.Left
+	nw := m
+	if left {
+		nw = n
+	}
+	switch {
+	case !left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.Trans && trans != blas.NoTrans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case left && k > m:
+		panic(kGTM)
+	case !left && k > n:
+		panic(kGTN)
+	case left && lda < max(1, m):
+		panic(badLdA)
+	case !left && lda < max(1, n):
+		panic(badLdA)
+	case lwork < max(1, nw) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 || k == 0 {
+		work[0] = 1
+		return
+	}
+
+	const (
+		nbmax = 64
+		ldt   = nbmax
+		tsize = nbmax * ldt
+	)
+	opts := string(side) + string(trans)
+	nb := min(nbmax, impl.Ilaenv(1, "DORMLQ", opts, m, n, k, -1))
+	lworkopt := max(1, nw)*nb + tsize
+	if lwork == -1 {
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	switch {
+	case left && len(a) < (k-1)*lda+m:
+		panic(shortA)
+	case !left && len(a) < (k-1)*lda+n:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	}
+
+	nbmin := 2
+	if 1 < nb && nb < k {
+		iws := nw*nb + tsize
+		if lwork < iws {
+			nb = (lwork - tsize) / nw // Shrink the block size to fit the supplied workspace.
+			nbmin = max(2, impl.Ilaenv(2, "DORMLQ", opts, m, n, k, -1))
+		}
+	}
+	if nb < nbmin || k <= nb {
+		// Call unblocked code.
+		impl.Dorml2(side, trans, m, n, k, a, lda, tau, c, ldc, work)
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	t := work[:tsize]
+	wrk := work[tsize:]
+	ldwrk := nb
+
+	notrans := trans == blas.NoTrans
+	transt := blas.NoTrans // Transpose option passed to Dlarfb: the opposite of trans.
+	if notrans {
+		transt = blas.Trans
+	}
+
+	switch {
+	case left && notrans:
+		for i := 0; i < k; i += nb {
+			ib := min(nb, k-i)
+			impl.Dlarft(lapack.Forward, lapack.RowWise, m-i, ib,
+				a[i*lda+i:], lda,
+				tau[i:],
+				t, ldt)
+			impl.Dlarfb(side, transt, lapack.Forward, lapack.RowWise, m-i, n, ib,
+				a[i*lda+i:], lda,
+				t, ldt,
+				c[i*ldc:], ldc,
+				wrk, ldwrk)
+		}
+
+	case left && !notrans:
+		for i := ((k - 1) / nb) * nb; i >= 0; i -= nb {
+			ib := min(nb, k-i)
+			impl.Dlarft(lapack.Forward, lapack.RowWise, m-i, ib,
+				a[i*lda+i:], lda,
+				tau[i:],
+				t, ldt)
+			impl.Dlarfb(side, transt, lapack.Forward, lapack.RowWise, m-i, n, ib,
+				a[i*lda+i:], lda,
+				t, ldt,
+				c[i*ldc:], ldc,
+				wrk, ldwrk)
+		}
+
+	case !left && notrans:
+		for i := ((k - 1) / nb) * nb; i >= 0; i -= nb {
+			ib := min(nb, k-i)
+			impl.Dlarft(lapack.Forward, lapack.RowWise, n-i, ib,
+				a[i*lda+i:], lda,
+				tau[i:],
+				t, ldt)
+			impl.Dlarfb(side, transt, lapack.Forward, lapack.RowWise, m, n-i, ib,
+				a[i*lda+i:], lda,
+				t, ldt,
+				c[i:], ldc,
+				wrk, ldwrk)
+		}
+
+	case !left && !notrans:
+		for i := 0; i < k; i += nb {
+			ib := min(nb, k-i)
+			impl.Dlarft(lapack.Forward, lapack.RowWise, n-i, ib,
+				a[i*lda+i:], lda,
+				tau[i:],
+				t, ldt)
+			impl.Dlarfb(side, transt, lapack.Forward, lapack.RowWise, m, n-i, ib,
+				a[i*lda+i:], lda,
+				t, ldt,
+				c[i:], ldc,
+				wrk, ldwrk)
+		}
+	}
+	work[0] = float64(lworkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormqr.go
new file mode 100644
index 00000000000..c1e5668be5c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormqr.go
@@ -0,0 +1,180 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dormqr multiplies an m×n matrix C by an orthogonal matrix Q as
+//
+//	C = Q * C   if side == blas.Left  and trans == blas.NoTrans,
+//	C = Qᵀ * C  if side == blas.Left  and trans == blas.Trans,
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans,
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans,
+//
+// where Q is defined as the product of k elementary reflectors
+//
+//	Q = H_0 * H_1 * ... * H_{k-1}.
+//
+// If side == blas.Left, A is an m×k matrix and 0 <= k <= m.
+// If side == blas.Right, A is an n×k matrix and 0 <= k <= n.
+// The ith column of A contains the vector which defines the elementary
+// reflector H_i and tau[i] contains its scalar factor. tau must have length k
+// and Dormqr will panic otherwise. Dgeqrf returns A and tau in the required
+// form.
+//
+// work must have length at least max(1,lwork), and lwork must be at least
+// max(1,n) if side == blas.Left and at least max(1,m) if side == blas.Right,
+// otherwise Dormqr will panic.
+//
+// work is temporary storage, and lwork specifies the usable memory length. When
+// lwork is smaller than the optimal value, the block size is reduced and the
+// unblocked algorithm may be used instead, so larger values of lwork will
+// generally give better performance. On return, work[0] will contain the
+// optimal value of lwork.
+//
+// If lwork is -1, instead of performing Dormqr, the optimal workspace size will
+// be stored into work[0].
+func (impl Implementation) Dormqr(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int) {
+	left := side == blas.Left
+	nq := n
+	nw := m
+	if left {
+		nq = m
+		nw = n
+	}
+	switch {
+	case !left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.NoTrans && trans != blas.Trans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case left && k > m:
+		panic(kGTM)
+	case !left && k > n:
+		panic(kGTN)
+	case lda < max(1, k):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	case lwork < max(1, nw) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 || k == 0 {
+		work[0] = 1
+		return
+	}
+
+	const (
+		nbmax = 64
+		ldt   = nbmax
+		tsize = nbmax * ldt
+	)
+	opts := string(side) + string(trans)
+	nb := min(nbmax, impl.Ilaenv(1, "DORMQR", opts, m, n, k, -1))
+	lworkopt := max(1, nw)*nb + tsize
+	if lwork == -1 {
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	switch {
+	case len(a) < (nq-1)*lda+k:
+		panic(shortA)
+	case len(tau) != k:
+		panic(badLenTau)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	}
+
+	nbmin := 2
+	if 1 < nb && nb < k {
+		if lwork < nw*nb+tsize {
+			nb = (lwork - tsize) / nw // Shrink the block size to fit the supplied workspace.
+			nbmin = max(2, impl.Ilaenv(2, "DORMQR", opts, m, n, k, -1))
+		}
+	}
+
+	if nb < nbmin || k <= nb {
+		// Call unblocked code.
+		impl.Dorm2r(side, trans, m, n, k, a, lda, tau, c, ldc, work)
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	var (
+		ldwork  = nb
+		notrans = trans == blas.NoTrans
+	)
+	switch {
+	case left && notrans:
+		for i := ((k - 1) / nb) * nb; i >= 0; i -= nb {
+			ib := min(nb, k-i)
+			impl.Dlarft(lapack.Forward, lapack.ColumnWise, m-i, ib,
+				a[i*lda+i:], lda,
+				tau[i:],
+				work[:tsize], ldt)
+			impl.Dlarfb(side, trans, lapack.Forward, lapack.ColumnWise, m-i, n, ib,
+				a[i*lda+i:], lda,
+				work[:tsize], ldt,
+				c[i*ldc:], ldc,
+				work[tsize:], ldwork)
+		}
+
+	case left && !notrans:
+		for i := 0; i < k; i += nb {
+			ib := min(nb, k-i)
+			impl.Dlarft(lapack.Forward, lapack.ColumnWise, m-i, ib,
+				a[i*lda+i:], lda,
+				tau[i:],
+				work[:tsize], ldt)
+			impl.Dlarfb(side, trans, lapack.Forward, lapack.ColumnWise, m-i, n, ib,
+				a[i*lda+i:], lda,
+				work[:tsize], ldt,
+				c[i*ldc:], ldc,
+				work[tsize:], ldwork)
+		}
+
+	case !left && notrans:
+		for i := 0; i < k; i += nb {
+			ib := min(nb, k-i)
+			impl.Dlarft(lapack.Forward, lapack.ColumnWise, n-i, ib,
+				a[i*lda+i:], lda,
+				tau[i:],
+				work[:tsize], ldt)
+			impl.Dlarfb(side, trans, lapack.Forward, lapack.ColumnWise, m, n-i, ib,
+				a[i*lda+i:], lda,
+				work[:tsize], ldt,
+				c[i:], ldc,
+				work[tsize:], ldwork)
+		}
+
+	case !left && !notrans:
+		for i := ((k - 1) / nb) * nb; i >= 0; i -= nb {
+			ib := min(nb, k-i)
+			impl.Dlarft(lapack.Forward, lapack.ColumnWise, n-i, ib,
+				a[i*lda+i:], lda,
+				tau[i:],
+				work[:tsize], ldt)
+			impl.Dlarfb(side, trans, lapack.Forward, lapack.ColumnWise, m, n-i, ib,
+				a[i*lda+i:], lda,
+				work[:tsize], ldt,
+				c[i:], ldc,
+				work[tsize:], ldwork)
+		}
+	}
+	work[0] = float64(lworkopt)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dormr2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dormr2.go
new file mode 100644
index 00000000000..59d4d4f17e7
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dormr2.go
@@ -0,0 +1,105 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dormr2 multiplies a general matrix C by an orthogonal matrix from a RQ factorization
+// determined by Dgerqf.
+//
+//	C = Q * C   if side == blas.Left and trans == blas.NoTrans
+//	C = Qᵀ * C  if side == blas.Left and trans == blas.Trans
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans
+//
+// If side == blas.Left, a is a matrix of size k×m, and if side == blas.Right
+// a is of size k×n.
+//
+// tau contains the Householder factors and is of length at least k and this function
+// will panic otherwise.
+//
+// work is temporary storage of length at least n if side == blas.Left
+// and at least m if side == blas.Right and this function will panic otherwise.
+//
+// Dormr2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dormr2(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64) {
+	left := side == blas.Left
+	nq := n
+	nw := m
+	if left {
+		nq = m
+		nw = n
+	}
+	switch {
+	case !left && side != blas.Right:
+		panic(badSide)
+	case trans != blas.NoTrans && trans != blas.Trans:
+		panic(badTrans)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	case left && k > m:
+		panic(kGTM)
+	case !left && k > n:
+		panic(kGTN)
+	case lda < max(1, nq):
+		panic(badLdA)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 || k == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (k-1)*lda+nq:
+		panic(shortA)
+	case len(tau) < k:
+		panic(shortTau)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	case len(work) < nw:
+		panic(shortWork)
+	}
+
+	if left {
+		if trans == blas.NoTrans {
+			for i := k - 1; i >= 0; i-- {
+				aii := a[i*lda+(m-k+i)]
+				a[i*lda+(m-k+i)] = 1
+				impl.Dlarf(side, m-k+i+1, n, a[i*lda:], 1, tau[i], c, ldc, work)
+				a[i*lda+(m-k+i)] = aii
+			}
+			return
+		}
+		for i := 0; i < k; i++ {
+			aii := a[i*lda+(m-k+i)]
+			a[i*lda+(m-k+i)] = 1
+			impl.Dlarf(side, m-k+i+1, n, a[i*lda:], 1, tau[i], c, ldc, work)
+			a[i*lda+(m-k+i)] = aii
+		}
+		return
+	}
+	if trans == blas.NoTrans {
+		for i := 0; i < k; i++ {
+			aii := a[i*lda+(n-k+i)]
+			a[i*lda+(n-k+i)] = 1
+			impl.Dlarf(side, m, n-k+i+1, a[i*lda:], 1, tau[i], c, ldc, work)
+			a[i*lda+(n-k+i)] = aii
+		}
+		return
+	}
+	for i := k - 1; i >= 0; i-- {
+		aii := a[i*lda+(n-k+i)]
+		a[i*lda+(n-k+i)] = 1
+		impl.Dlarf(side, m, n-k+i+1, a[i*lda:], 1, tau[i], c, ldc, work)
+		a[i*lda+(n-k+i)] = aii
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpbcon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbcon.go
new file mode 100644
index 00000000000..0ed63e62dd1
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbcon.go
@@ -0,0 +1,111 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpbcon returns an estimate of the reciprocal of the condition number (in the
+// 1-norm) of an n×n symmetric positive definite band matrix using the Cholesky
+// factorization
+//
+//	A = Uᵀ*U  if uplo == blas.Upper
+//	A = L*Lᵀ  if uplo == blas.Lower
+//
+// computed by Dpbtrf. The estimate is obtained for norm(inv(A)), and the
+// reciprocal of the condition number is computed as
+//
+//	rcond = 1 / (anorm * norm(inv(A))).
+//
+// The length of work must be at least 3*n and the length of iwork must be at
+// least n.
+func (impl Implementation) Dpbcon(uplo blas.Uplo, n, kd int, ab []float64, ldab int, anorm float64, work []float64, iwork []int) (rcond float64) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case kd < 0:
+		panic(kdLT0)
+	case ldab < kd+1:
+		panic(badLdA)
+	case anorm < 0:
+		panic(badNorm)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return 1
+	}
+
+	switch {
+	case len(ab) < (n-1)*ldab+kd+1:
+		panic(shortAB)
+	case len(work) < 3*n:
+		panic(shortWork)
+	case len(iwork) < n:
+		panic(shortIWork)
+	}
+
+	// Quick return if possible.
+	if anorm == 0 {
+		return 0
+	}
+
+	const smlnum = dlamchS
+
+	var (
+		ainvnm float64
+		kase   int
+		isave  [3]int
+		normin bool
+
+		// Denote work slices.
+		x     = work[:n]
+		v     = work[n : 2*n]
+		cnorm = work[2*n : 3*n]
+	)
+	// Estimate the 1-norm of the inverse.
+	bi := blas64.Implementation()
+	for {
+		ainvnm, kase = impl.Dlacn2(n, v, x, iwork, ainvnm, kase, &isave)
+		if kase == 0 {
+			break
+		}
+		var op1, op2 blas.Transpose
+		if uplo == blas.Upper {
+			// Multiply x by inv(Uᵀ),
+			op1 = blas.Trans
+			// then by inv(U).
+			op2 = blas.NoTrans
+		} else {
+			// Multiply x by inv(L),
+			op1 = blas.NoTrans
+			// then by inv(Lᵀ).
+			op2 = blas.Trans
+		}
+		scaleL := impl.Dlatbs(uplo, op1, blas.NonUnit, normin, n, kd, ab, ldab, x, cnorm)
+		normin = true
+		scaleU := impl.Dlatbs(uplo, op2, blas.NonUnit, normin, n, kd, ab, ldab, x, cnorm)
+		// Multiply x by 1/scale if doing so will not cause overflow.
+		scale := scaleL * scaleU
+		if scale != 1 {
+			ix := bi.Idamax(n, x, 1)
+			if scale < math.Abs(x[ix])*smlnum || scale == 0 {
+				return 0
+			}
+			impl.Drscl(n, scale, x, 1)
+		}
+	}
+	if ainvnm == 0 {
+		return 0
+	}
+	// Return the estimate of the reciprocal condition number.
+	return (1 / ainvnm) / anorm
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtf2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtf2.go
new file mode 100644
index 00000000000..8150e568025
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtf2.go
@@ -0,0 +1,114 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpbtf2 computes the Cholesky factorization of a symmetric positive definite
+// banded matrix ab. The matrix ab is n×n with kd diagonal bands. The Cholesky
+// factorization computed is
+//
+//	A = Uᵀ * U  if uplo == blas.Upper
+//	A = L * Lᵀ  if uplo == blas.Lower
+//
+// uplo also specifies the storage of ab. If uplo == blas.Upper, then
+// ab is stored as an upper-triangular banded matrix with kd super-diagonals,
+// and if uplo == blas.Lower, ab is stored as a lower-triangular banded matrix
+// with kd sub-diagonals. On exit, the banded matrix U or L is stored in-place
+// into ab depending on the value of uplo. Dpbtf2 returns whether the
+// factorization was successfully completed.
+//
+// The band storage scheme is illustrated below when n = 6, and kd = 2.
+// The resulting Cholesky decomposition is stored in the same elements as the
+// input band matrix (a11 becomes u11 or l11, etc.).
+//
+//	uplo == blas.Upper
+//	a11 a12 a13
+//	a22 a23 a24
+//	a33 a34 a35
+//	a44 a45 a46
+//	a55 a56  *
+//	a66  *   *
+//
+//	uplo == blas.Lower
+//	 *   *  a11
+//	 *  a21 a22
+//	a31 a32 a33
+//	a42 a43 a44
+//	a53 a54 a55
+//	a64 a65 a66
+//
+// Dpbtf2 is the unblocked version of the algorithm, see Dpbtrf for the blocked
+// version.
+//
+// Dpbtf2 is an internal routine, exported for testing purposes.
+func (Implementation) Dpbtf2(uplo blas.Uplo, n, kd int, ab []float64, ldab int) (ok bool) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case kd < 0:
+		panic(kdLT0)
+	case ldab < kd+1:
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return true
+	}
+
+	if len(ab) < (n-1)*ldab+kd+1 {
+		panic(shortAB)
+	}
+
+	bi := blas64.Implementation()
+
+	kld := max(1, ldab-1)
+	if uplo == blas.Upper {
+		// Compute the Cholesky factorization A = Uᵀ * U.
+		for j := 0; j < n; j++ {
+			// Compute U(j,j) and test for non-positive-definiteness.
+			ajj := ab[j*ldab]
+			if ajj <= 0 {
+				return false
+			}
+			ajj = math.Sqrt(ajj)
+			ab[j*ldab] = ajj
+			// Compute elements j+1:j+kn of row j and update the trailing submatrix
+			// within the band.
+			kn := min(kd, n-j-1)
+			if kn > 0 {
+				bi.Dscal(kn, 1/ajj, ab[j*ldab+1:], 1)
+				bi.Dsyr(blas.Upper, kn, -1, ab[j*ldab+1:], 1, ab[(j+1)*ldab:], kld)
+			}
+		}
+		return true
+	}
+	// Compute the Cholesky factorization A = L * Lᵀ.
+	for j := 0; j < n; j++ {
+		// Compute L(j,j) and test for non-positive-definiteness.
+		ajj := ab[j*ldab+kd]
+		if ajj <= 0 {
+			return false
+		}
+		ajj = math.Sqrt(ajj)
+		ab[j*ldab+kd] = ajj
+		// Compute elements j+1:j+kn of column j and update the trailing submatrix
+		// within the band.
+		kn := min(kd, n-j-1)
+		if kn > 0 {
+			bi.Dscal(kn, 1/ajj, ab[(j+1)*ldab+kd-1:], kld)
+			bi.Dsyr(blas.Lower, kn, -1, ab[(j+1)*ldab+kd-1:], kld, ab[(j+1)*ldab+kd:], kld)
+		}
+	}
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrf.go
new file mode 100644
index 00000000000..12cdfc0fabf
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrf.go
@@ -0,0 +1,216 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpbtrf computes the Cholesky factorization of an n×n symmetric positive
+// definite band matrix
+//
+//	A = Uᵀ * U  if uplo == blas.Upper
+//	A = L * Lᵀ  if uplo == blas.Lower
+//
+// where U is an upper triangular band matrix and L is lower triangular. kd is
+// the number of super- or sub-diagonals of A.
+//
+// The band storage scheme is illustrated below when n = 6 and kd = 2. Elements
+// marked * are not used by the function.
+//
+//	uplo == blas.Upper
+//	On entry:         On return:
+//	 a00  a01  a02     u00  u01  u02
+//	 a11  a12  a13     u11  u12  u13
+//	 a22  a23  a24     u22  u23  u24
+//	 a33  a34  a35     u33  u34  u35
+//	 a44  a45   *      u44  u45   *
+//	 a55   *    *      u55   *    *
+//
+//	uplo == blas.Lower
+//	On entry:         On return:
+//	  *    *   a00       *    *   l00
+//	  *   a10  a11       *   l10  l11
+//	 a20  a21  a22      l20  l21  l22
+//	 a31  a32  a33      l31  l32  l33
+//	 a42  a43  a44      l42  l43  l44
+//	 a53  a54  a55      l53  l54  l55
+func (impl Implementation) Dpbtrf(uplo blas.Uplo, n, kd int, ab []float64, ldab int) (ok bool) {
+	const nbmax = 32
+
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case kd < 0:
+		panic(kdLT0)
+	case ldab < kd+1:
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return true
+	}
+
+	if len(ab) < (n-1)*ldab+kd+1 {
+		panic(shortAB)
+	}
+
+	opts := string(blas.Upper)
+	if uplo == blas.Lower {
+		opts = string(blas.Lower)
+	}
+	nb := impl.Ilaenv(1, "DPBTRF", opts, n, kd, -1, -1)
+	// The block size must not exceed the semi-bandwidth kd, and must not
+	// exceed the limit set by the size of the local array work.
+	nb = min(nb, nbmax)
+
+	if nb <= 1 || kd < nb {
+		// Use unblocked code.
+		return impl.Dpbtf2(uplo, n, kd, ab, ldab)
+	}
+
+	// Use blocked code.
+	ldwork := nb
+	work := make([]float64, nb*ldwork)
+	bi := blas64.Implementation()
+	if uplo == blas.Upper {
+		// Compute the Cholesky factorization of a symmetric band
+		// matrix, given the upper triangle of the matrix in band
+		// storage.
+
+		// Process the band matrix one diagonal block at a time.
+		for i := 0; i < n; i += nb {
+			ib := min(nb, n-i)
+			// Factorize the diagonal block.
+			ok := impl.Dpotf2(uplo, ib, ab[i*ldab:], ldab-1)
+			if !ok {
+				return false
+			}
+			if i+ib >= n {
+				continue
+			}
+			// Update the relevant part of the trailing submatrix.
+			// If A11 denotes the diagonal block which has just been
+			// factorized, then we need to update the remaining
+			// blocks in the diagram:
+			//
+			//  A11   A12   A13
+			//        A22   A23
+			//              A33
+			//
+			// The numbers of rows and columns in the partitioning
+			// are ib, i2, i3 respectively. The blocks A12, A22 and
+			// A23 are empty if ib = kd. The upper triangle of A13
+			// lies outside the band.
+			i2 := min(kd-ib, n-i-ib)
+			if i2 > 0 {
+				// Update A12.
+				bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, ib, i2,
+					1, ab[i*ldab:], ldab-1, ab[i*ldab+ib:], ldab-1)
+				// Update A22.
+				bi.Dsyrk(blas.Upper, blas.Trans, i2, ib,
+					-1, ab[i*ldab+ib:], ldab-1, 1, ab[(i+ib)*ldab:], ldab-1)
+			}
+			i3 := min(ib, n-i-kd)
+			if i3 > 0 {
+				// Copy the lower triangle of A13 into the work array.
+				for ii := 0; ii < ib; ii++ {
+					for jj := 0; jj <= min(ii, i3-1); jj++ {
+						work[ii*ldwork+jj] = ab[(i+ii)*ldab+kd-ii+jj]
+					}
+				}
+				// Update A13 (in the work array).
+				bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, ib, i3,
+					1, ab[i*ldab:], ldab-1, work, ldwork)
+				// Update A23.
+				if i2 > 0 {
+					bi.Dgemm(blas.Trans, blas.NoTrans, i2, i3, ib,
+						-1, ab[i*ldab+ib:], ldab-1, work, ldwork,
+						1, ab[(i+ib)*ldab+kd-ib:], ldab-1)
+				}
+				// Update A33.
+				bi.Dsyrk(blas.Upper, blas.Trans, i3, ib,
+					-1, work, ldwork, 1, ab[(i+kd)*ldab:], ldab-1)
+				// Copy the lower triangle of A13 back into place.
+				for ii := 0; ii < ib; ii++ {
+					for jj := 0; jj <= min(ii, i3-1); jj++ {
+						ab[(i+ii)*ldab+kd-ii+jj] = work[ii*ldwork+jj]
+					}
+				}
+			}
+		}
+	} else {
+		// Compute the Cholesky factorization of a symmetric band
+		// matrix, given the lower triangle of the matrix in band
+		// storage.
+
+		// Process the band matrix one diagonal block at a time.
+		for i := 0; i < n; i += nb {
+			ib := min(nb, n-i)
+			// Factorize the diagonal block.
+			ok := impl.Dpotf2(uplo, ib, ab[i*ldab+kd:], ldab-1)
+			if !ok {
+				return false
+			}
+			if i+ib >= n {
+				continue
+			}
+			// Update the relevant part of the trailing submatrix.
+			// If A11 denotes the diagonal block which has just been
+			// factorized, then we need to update the remaining
+			// blocks in the diagram:
+			//
+			//  A11
+			//  A21   A22
+			//  A31   A32   A33
+			//
+			// The numbers of rows and columns in the partitioning
+			// are ib, i2, i3 respectively. The blocks A21, A22 and
+			// A32 are empty if ib = kd. The lower triangle of A31
+			// lies outside the band.
+			i2 := min(kd-ib, n-i-ib)
+			if i2 > 0 {
+				// Update A21.
+				bi.Dtrsm(blas.Right, blas.Lower, blas.Trans, blas.NonUnit, i2, ib,
+					1, ab[i*ldab+kd:], ldab-1, ab[(i+ib)*ldab+kd-ib:], ldab-1)
+				// Update A22.
+				bi.Dsyrk(blas.Lower, blas.NoTrans, i2, ib,
+					-1, ab[(i+ib)*ldab+kd-ib:], ldab-1, 1, ab[(i+ib)*ldab+kd:], ldab-1)
+			}
+			i3 := min(ib, n-i-kd)
+			if i3 > 0 {
+				// Copy the upper triangle of A31 into the work array.
+				for ii := 0; ii < i3; ii++ {
+					for jj := ii; jj < ib; jj++ {
+						work[ii*ldwork+jj] = ab[(ii+i+kd)*ldab+jj-ii]
+					}
+				}
+				// Update A31 (in the work array).
+				bi.Dtrsm(blas.Right, blas.Lower, blas.Trans, blas.NonUnit, i3, ib,
+					1, ab[i*ldab+kd:], ldab-1, work, ldwork)
+				// Update A32.
+				if i2 > 0 {
+					bi.Dgemm(blas.NoTrans, blas.Trans, i3, i2, ib,
+						-1, work, ldwork, ab[(i+ib)*ldab+kd-ib:], ldab-1,
+						1, ab[(i+kd)*ldab+ib:], ldab-1)
+				}
+				// Update A33.
+				bi.Dsyrk(blas.Lower, blas.NoTrans, i3, ib,
+					-1, work, ldwork, 1, ab[(i+kd)*ldab+kd:], ldab-1)
+				// Copy the upper triangle of A31 back into place.
+				for ii := 0; ii < i3; ii++ {
+					for jj := ii; jj < ib; jj++ {
+						ab[(ii+i+kd)*ldab+jj-ii] = work[ii*ldwork+jj]
+					}
+				}
+			}
+		}
+	}
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrs.go
new file mode 100644
index 00000000000..97c9ada00bb
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpbtrs.go
@@ -0,0 +1,69 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpbtrs solves a system of linear equations A*X = B with an n×n symmetric
+// positive definite band matrix A using the Cholesky factorization
+//
+//	A = Uᵀ * U  if uplo == blas.Upper
+//	A = L * Lᵀ  if uplo == blas.Lower
+//
+// computed by Dpbtrf. kd is the number of super- or sub-diagonals of A. See the
+// documentation for Dpbtrf for a description of the band storage format of A.
+//
+// On entry, b contains the n×nrhs right hand side matrix B. On return, it is
+// overwritten with the solution matrix X.
+func (Implementation) Dpbtrs(uplo blas.Uplo, n, kd, nrhs int, ab []float64, ldab int, b []float64, ldb int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case kd < 0:
+		panic(kdLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case ldab < kd+1:
+		panic(badLdA)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if n == 0 || nrhs == 0 {
+		return
+	}
+
+	if len(ab) < (n-1)*ldab+kd+1 {
+		panic(shortAB)
+	}
+	if len(b) < (n-1)*ldb+nrhs {
+		panic(shortB)
+	}
+
+	bi := blas64.Implementation()
+	if uplo == blas.Upper {
+		// Solve A*X = B where A = Uᵀ*U.
+		for j := 0; j < nrhs; j++ {
+			// Solve Uᵀ*Y = B, overwriting B with Y.
+			bi.Dtbsv(blas.Upper, blas.Trans, blas.NonUnit, n, kd, ab, ldab, b[j:], ldb)
+			// Solve U*X = Y, overwriting Y with X.
+			bi.Dtbsv(blas.Upper, blas.NoTrans, blas.NonUnit, n, kd, ab, ldab, b[j:], ldb)
+		}
+	} else {
+		// Solve A*X = B where A = L*Lᵀ.
+		for j := 0; j < nrhs; j++ {
+			// Solve L*Y = B, overwriting B with Y.
+			bi.Dtbsv(blas.Lower, blas.NoTrans, blas.NonUnit, n, kd, ab, ldab, b[j:], ldb)
+			// Solve Lᵀ*X = Y, overwriting Y with X.
+			bi.Dtbsv(blas.Lower, blas.Trans, blas.NonUnit, n, kd, ab, ldab, b[j:], ldb)
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpocon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpocon.go
new file mode 100644
index 00000000000..7af4c18728e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpocon.go
@@ -0,0 +1,90 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpocon estimates the reciprocal of the condition number of a positive-definite
+// matrix A given the Cholesky decomposition of A. The condition number computed
+// is based on the 1-norm and the ∞-norm.
+//
+// anorm is the 1-norm and the ∞-norm of the original matrix A.
+//
+// work is a temporary data slice of length at least 3*n and Dpocon will panic otherwise.
+//
+// iwork is a temporary data slice of length at least n and Dpocon will panic otherwise.
+func (impl Implementation) Dpocon(uplo blas.Uplo, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64 {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case anorm < 0:
+		panic(negANorm)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return 1
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(work) < 3*n:
+		panic(shortWork)
+	case len(iwork) < n:
+		panic(shortIWork)
+	}
+
+	if anorm == 0 {
+		return 0
+	}
+
+	bi := blas64.Implementation()
+
+	var (
+		smlnum = dlamchS
+		rcond  float64
+		sl, su float64
+		normin bool
+		ainvnm float64
+		kase   int
+		isave  [3]int
+	)
+	for {
+		ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, &isave)
+		if kase == 0 {
+			if ainvnm != 0 {
+				rcond = (1 / ainvnm) / anorm
+			}
+			return rcond
+		}
+		if uplo == blas.Upper {
+			sl = impl.Dlatrs(blas.Upper, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[2*n:])
+			normin = true
+			su = impl.Dlatrs(blas.Upper, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[2*n:])
+		} else {
+			sl = impl.Dlatrs(blas.Lower, blas.NoTrans, blas.NonUnit, normin, n, a, lda, work, work[2*n:])
+			normin = true
+			su = impl.Dlatrs(blas.Lower, blas.Trans, blas.NonUnit, normin, n, a, lda, work, work[2*n:])
+		}
+		scale := sl * su
+		if scale != 1 {
+			ix := bi.Idamax(n, work, 1)
+			if scale == 0 || scale < math.Abs(work[ix])*smlnum {
+				return rcond
+			}
+			impl.Drscl(n, scale, work, 1)
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpotf2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotf2.go
new file mode 100644
index 00000000000..83411f1cf18
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotf2.go
@@ -0,0 +1,82 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpotf2 computes the Cholesky decomposition of the symmetric positive definite
+// matrix a. If ul == blas.Upper, then a is stored as an upper-triangular matrix,
+// and a = Uᵀ U is stored in place into a. If ul == blas.Lower, then a = L Lᵀ
+// is computed and stored in-place into a. If a is not positive definite, false
+// is returned. This is the unblocked version of the algorithm.
+//
+// Dpotf2 is an internal routine. It is exported for testing purposes.
+func (Implementation) Dpotf2(ul blas.Uplo, n int, a []float64, lda int) (ok bool) {
+	switch {
+	case ul != blas.Upper && ul != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return true
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+
+	bi := blas64.Implementation()
+
+	if ul == blas.Upper {
+		for j := 0; j < n; j++ {
+			ajj := a[j*lda+j]
+			if j != 0 {
+				ajj -= bi.Ddot(j, a[j:], lda, a[j:], lda)
+			}
+			if ajj <= 0 || math.IsNaN(ajj) {
+				a[j*lda+j] = ajj
+				return false
+			}
+			ajj = math.Sqrt(ajj)
+			a[j*lda+j] = ajj
+			if j < n-1 {
+				bi.Dgemv(blas.Trans, j, n-j-1,
+					-1, a[j+1:], lda, a[j:], lda,
+					1, a[j*lda+j+1:], 1)
+				bi.Dscal(n-j-1, 1/ajj, a[j*lda+j+1:], 1)
+			}
+		}
+		return true
+	}
+	for j := 0; j < n; j++ {
+		ajj := a[j*lda+j]
+		if j != 0 {
+			ajj -= bi.Ddot(j, a[j*lda:], 1, a[j*lda:], 1)
+		}
+		if ajj <= 0 || math.IsNaN(ajj) {
+			a[j*lda+j] = ajj
+			return false
+		}
+		ajj = math.Sqrt(ajj)
+		a[j*lda+j] = ajj
+		if j < n-1 {
+			bi.Dgemv(blas.NoTrans, n-j-1, j,
+				-1, a[(j+1)*lda:], lda, a[j*lda:], 1,
+				1, a[(j+1)*lda+j:], lda)
+			bi.Dscal(n-j-1, 1/ajj, a[(j+1)*lda+j:], lda)
+		}
+	}
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrf.go
new file mode 100644
index 00000000000..7c81680166b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrf.go
@@ -0,0 +1,81 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpotrf computes the Cholesky decomposition of the symmetric positive definite
+// matrix a. If ul == blas.Upper, then a is stored as an upper-triangular matrix,
+// and a = Uᵀ U is stored in place into a. If ul == blas.Lower, then a = L Lᵀ
+// is computed and stored in-place into a. If a is not positive definite, false
+// is returned. This is the blocked version of the algorithm.
+func (impl Implementation) Dpotrf(ul blas.Uplo, n int, a []float64, lda int) (ok bool) {
+	switch {
+	case ul != blas.Upper && ul != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return true
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+
+	nb := impl.Ilaenv(1, "DPOTRF", string(ul), n, -1, -1, -1)
+	if nb <= 1 || n <= nb {
+		return impl.Dpotf2(ul, n, a, lda)
+	}
+	bi := blas64.Implementation()
+	if ul == blas.Upper {
+		for j := 0; j < n; j += nb {
+			jb := min(nb, n-j)
+			bi.Dsyrk(blas.Upper, blas.Trans, jb, j,
+				-1, a[j:], lda,
+				1, a[j*lda+j:], lda)
+			ok = impl.Dpotf2(blas.Upper, jb, a[j*lda+j:], lda)
+			if !ok {
+				return ok
+			}
+			if j+jb < n {
+				bi.Dgemm(blas.Trans, blas.NoTrans, jb, n-j-jb, j,
+					-1, a[j:], lda, a[j+jb:], lda,
+					1, a[j*lda+j+jb:], lda)
+				bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, jb, n-j-jb,
+					1, a[j*lda+j:], lda,
+					a[j*lda+j+jb:], lda)
+			}
+		}
+		return true
+	}
+	for j := 0; j < n; j += nb {
+		jb := min(nb, n-j)
+		bi.Dsyrk(blas.Lower, blas.NoTrans, jb, j,
+			-1, a[j*lda:], lda,
+			1, a[j*lda+j:], lda)
+		ok := impl.Dpotf2(blas.Lower, jb, a[j*lda+j:], lda)
+		if !ok {
+			return ok
+		}
+		if j+jb < n {
+			bi.Dgemm(blas.NoTrans, blas.Trans, n-j-jb, jb, j,
+				-1, a[(j+jb)*lda:], lda, a[j*lda:], lda,
+				1, a[(j+jb)*lda+j:], lda)
+			bi.Dtrsm(blas.Right, blas.Lower, blas.Trans, blas.NonUnit, n-j-jb, jb,
+				1, a[j*lda+j:], lda,
+				a[(j+jb)*lda+j:], lda)
+		}
+	}
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpotri.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotri.go
new file mode 100644
index 00000000000..6fa981c1309
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotri.go
@@ -0,0 +1,44 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas"
+
+// Dpotri computes the inverse of a real symmetric positive definite matrix A
+// using its Cholesky factorization.
+//
+// On entry, a contains the triangular factor U or L from the Cholesky
+// factorization A = Uᵀ*U or A = L*Lᵀ, as computed by Dpotrf.
+// On return, a contains the upper or lower triangle of the (symmetric)
+// inverse of A, overwriting the input factor U or L.
+func (impl Implementation) Dpotri(uplo blas.Uplo, n int, a []float64, lda int) (ok bool) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return true
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+
+	// Invert the triangular Cholesky factor U or L.
+	ok = impl.Dtrtri(uplo, blas.NonUnit, n, a, lda)
+	if !ok {
+		return false
+	}
+
+	// Form inv(U)*inv(U)ᵀ or inv(L)ᵀ*inv(L).
+	impl.Dlauum(uplo, n, a, lda)
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrs.go
new file mode 100644
index 00000000000..77d070001a9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpotrs.go
@@ -0,0 +1,64 @@
+// Copyright ©2018 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpotrs solves a system of n linear equations A*X = B where A is an n×n
+// symmetric positive definite matrix and B is an n×nrhs matrix. The matrix A is
+// represented by its Cholesky factorization
+//
+//	A = Uᵀ*U  if uplo == blas.Upper
+//	A = L*Lᵀ  if uplo == blas.Lower
+//
+// as computed by Dpotrf. On entry, B contains the right-hand side matrix B, on
+// return it contains the solution matrix X.
+func (Implementation) Dpotrs(uplo blas.Uplo, n, nrhs int, a []float64, lda int, b []float64, ldb int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if n == 0 || nrhs == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(b) < (n-1)*ldb+nrhs:
+		panic(shortB)
+	}
+
+	bi := blas64.Implementation()
+
+	if uplo == blas.Upper {
+		// Solve Uᵀ * U * X = B where U is stored in the upper triangle of A.
+
+		// Solve Uᵀ * X = B, overwriting B with X.
+		bi.Dtrsm(blas.Left, blas.Upper, blas.Trans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb)
+		// Solve U * X = B, overwriting B with X.
+		bi.Dtrsm(blas.Left, blas.Upper, blas.NoTrans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb)
+	} else {
+		// Solve L * Lᵀ * X = B where L is stored in the lower triangle of A.
+
+		// Solve L * X = B, overwriting B with X.
+		bi.Dtrsm(blas.Left, blas.Lower, blas.NoTrans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb)
+		// Solve Lᵀ * X = B, overwriting B with X.
+		bi.Dtrsm(blas.Left, blas.Lower, blas.Trans, blas.NonUnit, n, nrhs, 1, a, lda, b, ldb)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpstf2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpstf2.go
new file mode 100644
index 00000000000..79b607ddc90
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpstf2.go
@@ -0,0 +1,202 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpstf2 computes the Cholesky factorization with complete pivoting of an n×n
+// symmetric positive semidefinite matrix A using unblocked code.
+//
+// The factorization has the form
+//
+//	Pᵀ * A * P = Uᵀ * U ,  if uplo = blas.Upper,
+//	Pᵀ * A * P = L  * Lᵀ,  if uplo = blas.Lower,
+//
+// where U is an upper triangular matrix, L is lower triangular, and P is a
+// permutation matrix.
+//
+// tol is a user-defined tolerance. The algorithm terminates if the pivot is
+// less than or equal to tol. If tol is negative, then n*eps*max(A[k,k]) will be
+// used instead.
+//
+// On return, A contains the factor U or L from the Cholesky factorization and
+// piv contains P stored such that P[piv[k],k] = 1.
+//
+// Dpstf2 returns the computed rank of A and whether the factorization can be
+// used to solve a system, that is, whether it ran to completion. Dpstf2 does
+// not check that A is positive semidefinite, so if ok is false, the matrix A
+// is either rank deficient or is not positive semidefinite.
+//
+// The length of piv must be n and the length of work must be at least 2*n,
+// otherwise Dpstf2 will panic.
+//
+// Dpstf2 is an internal routine. It is exported for testing purposes.
+func (Implementation) Dpstf2(uplo blas.Uplo, n int, a []float64, lda int, piv []int, tol float64, work []float64) (rank int, ok bool) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible: an empty matrix has rank 0.
+	if n == 0 {
+		return 0, true
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(piv) != n:
+		panic(badLenPiv)
+	case len(work) < 2*n:
+		panic(shortWork)
+	}
+
+	// Initialize piv to the identity permutation.
+	for i := range piv[:n] {
+		piv[i] = i
+	}
+
+	// Find the largest diagonal element of A, which is the first pivot.
+	pvt := 0
+	ajj := a[0]
+	for i := 1; i < n; i++ {
+		aii := a[i*lda+i]
+		if aii > ajj {
+			pvt = i
+			ajj = aii
+		}
+	}
+	if ajj <= 0 || math.IsNaN(ajj) {
+		return 0, false
+	}
+
+	// Compute the stopping value from the first pivot if tol is negative.
+	dstop := tol
+	if dstop < 0 {
+		dstop = float64(n) * dlamchE * ajj
+	}
+
+	// Zero the first half of work, which accumulates the dot products.
+	dots := work[:n]
+	for i := range dots {
+		dots[i] = 0
+	}
+	work2 := work[n : 2*n]
+
+	bi := blas64.Implementation()
+	if uplo == blas.Upper {
+		// Compute the Cholesky factorization  Pᵀ * A * P = Uᵀ * U.
+		for j := 0; j < n; j++ {
+			// Update dot products and compute possible pivots which are stored
+			// in the second half of work.
+			for i := j; i < n; i++ {
+				if j > 0 {
+					tmp := a[(j-1)*lda+i]
+					dots[i] += tmp * tmp
+				}
+				work2[i] = a[i*lda+i] - dots[i]
+			}
+			if j > 0 {
+				// Find the largest candidate pivot among indices j to n-1.
+				pvt = j
+				ajj = work2[pvt]
+				for k := j + 1; k < n; k++ {
+					wk := work2[k]
+					if wk > ajj {
+						pvt = k
+						ajj = wk
+					}
+				}
+				// Stop with rank j if the pivot is too small or NaN.
+				if ajj <= dstop || math.IsNaN(ajj) {
+					a[j*lda+j] = ajj
+					return j, false
+				}
+			}
+			if j != pvt {
+				// Swap rows and columns j and pvt within the upper triangle.
+				a[pvt*lda+pvt] = a[j*lda+j]
+				bi.Dswap(j, a[j:], lda, a[pvt:], lda)
+				if pvt < n-1 {
+					bi.Dswap(n-pvt-1, a[j*lda+(pvt+1):], 1, a[pvt*lda+(pvt+1):], 1)
+				}
+				bi.Dswap(pvt-j-1, a[j*lda+(j+1):], 1, a[(j+1)*lda+pvt:], lda)
+				// Swap dot products and piv.
+				dots[j], dots[pvt] = dots[pvt], dots[j]
+				piv[j], piv[pvt] = piv[pvt], piv[j]
+			}
+			ajj = math.Sqrt(ajj)
+			a[j*lda+j] = ajj
+			// Compute elements j+1:n of row j.
+			if j < n-1 {
+				bi.Dgemv(blas.Trans, j, n-j-1,
+					-1, a[j+1:], lda, a[j:], lda,
+					1, a[j*lda+j+1:], 1)
+				bi.Dscal(n-j-1, 1/ajj, a[j*lda+j+1:], 1)
+			}
+		}
+	} else {
+		// Compute the Cholesky factorization  Pᵀ * A * P = L * Lᵀ.
+		for j := 0; j < n; j++ {
+			// Update dot products and compute possible pivots which are stored
+			// in the second half of work.
+			for i := j; i < n; i++ {
+				if j > 0 {
+					tmp := a[i*lda+(j-1)]
+					dots[i] += tmp * tmp
+				}
+				work2[i] = a[i*lda+i] - dots[i]
+			}
+			if j > 0 {
+				// Find the largest candidate pivot among indices j to n-1.
+				pvt = j
+				ajj = work2[pvt]
+				for k := j + 1; k < n; k++ {
+					wk := work2[k]
+					if wk > ajj {
+						pvt = k
+						ajj = wk
+					}
+				}
+				// Stop with rank j if the pivot is too small or NaN.
+				if ajj <= dstop || math.IsNaN(ajj) {
+					a[j*lda+j] = ajj
+					return j, false
+				}
+			}
+			if j != pvt {
+				// Swap rows and columns j and pvt within the lower triangle.
+				a[pvt*lda+pvt] = a[j*lda+j]
+				bi.Dswap(j, a[j*lda:], 1, a[pvt*lda:], 1)
+				if pvt < n-1 {
+					bi.Dswap(n-pvt-1, a[(pvt+1)*lda+j:], lda, a[(pvt+1)*lda+pvt:], lda)
+				}
+				bi.Dswap(pvt-j-1, a[(j+1)*lda+j:], lda, a[pvt*lda+(j+1):], 1)
+				// Swap dot products and piv.
+				dots[j], dots[pvt] = dots[pvt], dots[j]
+				piv[j], piv[pvt] = piv[pvt], piv[j]
+			}
+			ajj = math.Sqrt(ajj)
+			a[j*lda+j] = ajj
+			// Compute elements j+1:n of column j.
+			if j < n-1 {
+				bi.Dgemv(blas.NoTrans, n-j-1, j,
+					-1, a[(j+1)*lda:], lda, a[j*lda:], 1,
+					1, a[(j+1)*lda+j:], lda)
+				bi.Dscal(n-j-1, 1/ajj, a[(j+1)*lda+j:], lda)
+			}
+		}
+	}
+	return n, true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpstrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpstrf.go
new file mode 100644
index 00000000000..46a2fd4b77e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpstrf.go
@@ -0,0 +1,233 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dpstrf computes the Cholesky factorization with complete pivoting of an n×n
+// symmetric positive semidefinite matrix A using blocked code where possible.
+//
+// The factorization has the form
+//
+//	Pᵀ * A * P = Uᵀ * U ,  if uplo = blas.Upper,
+//	Pᵀ * A * P = L  * Lᵀ,  if uplo = blas.Lower,
+//
+// where U is an upper triangular matrix, L is lower triangular, and P is a
+// permutation matrix.
+//
+// tol is a user-defined tolerance. The algorithm terminates if the pivot is
+// less than or equal to tol. If tol is negative, then n*eps*max(A[k,k]) will be
+// used instead.
+//
+// On return, A contains the factor U or L from the Cholesky factorization and
+// piv contains P stored such that P[piv[k],k] = 1.
+//
+// Dpstrf returns the computed rank of A and whether the factorization can be
+// used to solve a system, that is, whether it ran to completion. Dpstrf does
+// not check that A is positive semidefinite, so if ok is false, the matrix A
+// is either rank deficient or is not positive semidefinite.
+//
+// The length of piv must be n and the length of work must be at least 2*n,
+// otherwise Dpstrf will panic.
+//
+// Dpstrf is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dpstrf(uplo blas.Uplo, n int, a []float64, lda int, piv []int, tol float64, work []float64) (rank int, ok bool) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible: an empty matrix has rank 0.
+	if n == 0 {
+		return 0, true
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(piv) != n:
+		panic(badLenPiv)
+	case len(work) < 2*n:
+		panic(shortWork)
+	}
+
+	// Get the block size from the Ilaenv entry for DPOTRF.
+	nb := impl.Ilaenv(1, "DPOTRF", string(uplo), n, -1, -1, -1)
+	if nb <= 1 || n <= nb {
+		// Use unblocked code.
+		return impl.Dpstf2(uplo, n, a, lda, piv, tol, work)
+	}
+
+	// Initialize piv to the identity permutation.
+	for i := range piv[:n] {
+		piv[i] = i
+	}
+
+	// Find the largest diagonal element of A, which is the first pivot.
+	pvt := 0
+	ajj := a[0]
+	for i := 1; i < n; i++ {
+		aii := a[i*lda+i]
+		if aii > ajj {
+			pvt = i
+			ajj = aii
+		}
+	}
+	if ajj <= 0 || math.IsNaN(ajj) {
+		return 0, false
+	}
+
+	// Compute the stopping value from the first pivot if tol is negative.
+	dstop := tol
+	if dstop < 0 {
+		dstop = float64(n) * dlamchE * ajj
+	}
+
+	bi := blas64.Implementation()
+	// Split work in half, the first half holds dot products.
+	dots := work[:n]
+	work2 := work[n : 2*n]
+	if uplo == blas.Upper {
+		// Compute the Cholesky factorization  Pᵀ * A * P = Uᵀ * U.
+		for k := 0; k < n; k += nb {
+			// Account for last block not being nb wide.
+			jb := min(nb, n-k)
+			// Zero the dot products for indices k to n-1 before each block.
+			for i := k; i < n; i++ {
+				dots[i] = 0
+			}
+			for j := k; j < k+jb; j++ {
+				// Update dot products and compute possible pivots which are stored
+				// in the second half of work.
+				for i := j; i < n; i++ {
+					if j > k {
+						tmp := a[(j-1)*lda+i]
+						dots[i] += tmp * tmp
+					}
+					work2[i] = a[i*lda+i] - dots[i]
+				}
+				if j > 0 {
+					// Find the largest candidate pivot among indices j to n-1.
+					pvt = j
+					ajj = work2[pvt]
+					for l := j + 1; l < n; l++ {
+						wl := work2[l]
+						if wl > ajj {
+							pvt = l
+							ajj = wl
+						}
+					}
+					// Stop with rank j if the pivot is too small or NaN.
+					if ajj <= dstop || math.IsNaN(ajj) {
+						a[j*lda+j] = ajj
+						return j, false
+					}
+				}
+				if j != pvt {
+					// Swap rows and columns j and pvt within the upper triangle.
+					a[pvt*lda+pvt] = a[j*lda+j]
+					bi.Dswap(j, a[j:], lda, a[pvt:], lda)
+					if pvt < n-1 {
+						bi.Dswap(n-pvt-1, a[j*lda+(pvt+1):], 1, a[pvt*lda+(pvt+1):], 1)
+					}
+					bi.Dswap(pvt-j-1, a[j*lda+(j+1):], 1, a[(j+1)*lda+pvt:], lda)
+					// Swap dot products and piv.
+					dots[j], dots[pvt] = dots[pvt], dots[j]
+					piv[j], piv[pvt] = piv[pvt], piv[j]
+				}
+				ajj = math.Sqrt(ajj)
+				a[j*lda+j] = ajj
+				// Compute elements j+1:n of row j.
+				if j < n-1 {
+					bi.Dgemv(blas.Trans, j-k, n-j-1,
+						-1, a[k*lda+j+1:], lda, a[k*lda+j:], lda,
+						1, a[j*lda+j+1:], 1)
+					bi.Dscal(n-j-1, 1/ajj, a[j*lda+j+1:], 1)
+				}
+			}
+			// Update the trailing matrix using the jb rows just computed.
+			if k+jb < n {
+				j := k + jb
+				bi.Dsyrk(blas.Upper, blas.Trans, n-j, jb,
+					-1, a[k*lda+j:], lda, 1, a[j*lda+j:], lda)
+			}
+		}
+	} else {
+		// Compute the Cholesky factorization  Pᵀ * A * P = L * Lᵀ.
+		for k := 0; k < n; k += nb {
+			// Account for last block not being nb wide.
+			jb := min(nb, n-k)
+			// Zero the dot products for indices k to n-1 before each block.
+			for i := k; i < n; i++ {
+				dots[i] = 0
+			}
+			for j := k; j < k+jb; j++ {
+				// Update dot products and compute possible pivots which are stored
+				// in the second half of work.
+				for i := j; i < n; i++ {
+					if j > k {
+						tmp := a[i*lda+(j-1)]
+						dots[i] += tmp * tmp
+					}
+					work2[i] = a[i*lda+i] - dots[i]
+				}
+				if j > 0 {
+					// Find the largest candidate pivot among indices j to n-1.
+					pvt = j
+					ajj = work2[pvt]
+					for l := j + 1; l < n; l++ {
+						wl := work2[l]
+						if wl > ajj {
+							pvt = l
+							ajj = wl
+						}
+					}
+					// Stop with rank j if the pivot is too small or NaN.
+					if ajj <= dstop || math.IsNaN(ajj) {
+						a[j*lda+j] = ajj
+						return j, false
+					}
+				}
+				if j != pvt {
+					// Swap rows and columns j and pvt within the lower triangle.
+					a[pvt*lda+pvt] = a[j*lda+j]
+					bi.Dswap(j, a[j*lda:], 1, a[pvt*lda:], 1)
+					if pvt < n-1 {
+						bi.Dswap(n-pvt-1, a[(pvt+1)*lda+j:], lda, a[(pvt+1)*lda+pvt:], lda)
+					}
+					bi.Dswap(pvt-j-1, a[(j+1)*lda+j:], lda, a[pvt*lda+(j+1):], 1)
+					// Swap dot products and piv.
+					dots[j], dots[pvt] = dots[pvt], dots[j]
+					piv[j], piv[pvt] = piv[pvt], piv[j]
+				}
+				ajj = math.Sqrt(ajj)
+				a[j*lda+j] = ajj
+				// Compute elements j+1:n of column j.
+				if j < n-1 {
+					bi.Dgemv(blas.NoTrans, n-j-1, j-k,
+						-1, a[(j+1)*lda+k:], lda, a[j*lda+k:], 1,
+						1, a[(j+1)*lda+j:], lda)
+					bi.Dscal(n-j-1, 1/ajj, a[(j+1)*lda+j:], lda)
+				}
+			}
+			// Update the trailing matrix using the jb columns just computed.
+			if k+jb < n {
+				j := k + jb
+				bi.Dsyrk(blas.Lower, blas.NoTrans, n-j, jb,
+					-1, a[j*lda+k:], lda, 1, a[j*lda+j:], lda)
+			}
+		}
+	}
+	return n, true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dptcon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dptcon.go
new file mode 100644
index 00000000000..cd41e3175a3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dptcon.go
@@ -0,0 +1,99 @@
+// Copyright ©2023 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dptcon returns the reciprocal of the 1-norm condition number of a symmetric
+// positive definite tridiagonal matrix A, given the factorization A = L*D*Lᵀ
+// or A = Uᵀ*D*U computed by Dpttrf. The returned value is
+//
+//	rcond = 1 / (anorm * ‖A⁻¹‖)
+//
+// where ‖A⁻¹‖ is obtained by a direct method.
+//
+// d holds the n diagonal elements of D and e holds the n-1 off-diagonal
+// elements of the unit bidiagonal factor U or L, both as produced by Dpttrf.
+// anorm is the 1-norm of the original matrix A and must not be negative.
+//
+// Dptcon returns 1 if n is 0, and 0 if anorm is 0 or +Inf or if any element of
+// d[:n] is zero or negative. A NaN anorm is returned unchanged.
+//
+// d must have length at least n, e at least n-1 and work at least n,
+// otherwise Dptcon will panic.
+func (impl Implementation) Dptcon(n int, d, e []float64, anorm float64, work []float64) (rcond float64) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case anorm < 0:
+		panic(badNorm)
+	}
+
+	// The reciprocal condition number of an empty matrix is taken to be 1.
+	if n == 0 {
+		return 1
+	}
+
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	// A NaN norm is propagated to the caller.
+	if math.IsNaN(anorm) {
+		return anorm
+	}
+
+	// A zero or infinite norm yields a zero reciprocal condition number.
+	if anorm == 0 || math.IsInf(anorm, 1) {
+		return 0
+	}
+
+	// Return early if any element of d[0:n] is zero or negative.
+	for i := 0; i < n; i++ {
+		if d[i] <= 0 {
+			return 0
+		}
+	}
+
+	// The norm of A⁻¹ is found by solving M(A) * x = e, where e is the vector
+	// [1,1,...,1]ᵀ and M(A) = (m[i,j]) is given by
+	//
+	// 	m[i,j] =  abs(A[i,j]), i == j,
+	// 	m[i,j] = -abs(A[i,j]), i != j,
+	//
+	// which factors as M(A) = M(L)*D*M(L)ᵀ. The vector x is built up in work.
+
+	// Forward substitution: solve M(L) * b = e.
+	work[0] = 1
+	for i, ei := range e[:n-1] {
+		work[i+1] = 1 + work[i]*math.Abs(ei)
+	}
+
+	// Backward substitution: solve D * M(L)ᵀ * x = b.
+	last := n - 1
+	work[last] /= d[last]
+	for i := last - 1; i >= 0; i-- {
+		work[i] = work[i]/d[i] + work[i+1]*math.Abs(e[i])
+	}
+
+	// ainvnm is the largest magnitude element of x.
+	imax := blas64.Implementation().Idamax(n, work, 1)
+	ainvnm := math.Abs(work[imax])
+	if ainvnm == 0 {
+		return 0
+	}
+
+	// Form the reciprocal condition number from the two norms.
+	return 1 / ainvnm / anorm
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dptsv.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dptsv.go
new file mode 100644
index 00000000000..37851b63d54
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dptsv.go
@@ -0,0 +1,49 @@
+// Copyright ©2023 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Dptsv solves the system of linear equations
+//
+//	A * X = B
+//
+// where A is an n×n symmetric positive definite tridiagonal matrix and X and B
+// are n×nrhs matrices, by factoring A = L*D*Lᵀ with Dpttrf and then solving
+// with Dpttrs.
+//
+// On entry, d holds the n diagonal elements of A, e its n-1 subdiagonal
+// elements and b the matrix B. On return, d holds the n diagonal elements of D
+// and e the n-1 subdiagonal elements of the unit bidiagonal factor L.
+//
+// Dptsv returns whether the solution X has been successfully computed.
+func (impl Implementation) Dptsv(n, nrhs int, d, e []float64, b []float64, ldb int) (ok bool) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	if n == 0 || nrhs == 0 {
+		return true
+	}
+
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(b) < (n-1)*ldb+nrhs:
+		panic(shortB)
+	}
+
+	// Factor A and, only if that succeeds, solve using the factorization.
+	if !impl.Dpttrf(n, d, e) {
+		return false
+	}
+	impl.Dpttrs(n, nrhs, d, e, b, ldb)
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpttrf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpttrf.go
new file mode 100644
index 00000000000..8ff2c2e4baa
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpttrf.go
@@ -0,0 +1,80 @@
+// Copyright ©2023 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Dpttrf factors an n×n symmetric positive definite tridiagonal matrix A as
+// A = L*D*Lᵀ and reports whether the factorization succeeded.
+//
+// On entry, d holds the n diagonal elements of A and e holds its n-1
+// subdiagonal elements.
+//
+// On return, d holds the n diagonal elements of the diagonal matrix D and e
+// holds the n-1 subdiagonal elements of the unit bidiagonal matrix L. If ok is
+// false, a non-positive element of D was found: A is not positive definite.
+func (impl Implementation) Dpttrf(n int, d, e []float64) (ok bool) {
+	if n < 0 {
+		panic(nLT0)
+	}
+
+	if n == 0 {
+		return true
+	}
+
+	if len(d) < n {
+		panic(shortD)
+	}
+	if len(e) < n-1 {
+		panic(shortE)
+	}
+
+	// The elimination below is unrolled four steps at a time. The first
+	// (n-1)%4 steps are done singly so that the remainder divides evenly.
+	lead := (n - 1) % 4
+	for i := 0; i < lead; i++ {
+		if d[i] <= 0 {
+			return false
+		}
+		sub := e[i]
+		e[i] = sub / d[i]
+		d[i+1] -= e[i] * sub
+	}
+	for i := lead; i < n-4; i += 4 {
+		// A non-positive d[i] means the matrix is not positive definite.
+		if d[i] <= 0 {
+			return false
+		}
+
+		// Eliminate with pivot d[i], giving e[i] and d[i+1].
+		sub := e[i]
+		e[i] = sub / d[i]
+		d[i+1] -= e[i] * sub
+		if d[i+1] <= 0 {
+			return false
+		}
+
+		// Eliminate with pivot d[i+1], giving e[i+1] and d[i+2].
+		sub = e[i+1]
+		e[i+1] = sub / d[i+1]
+		d[i+2] -= e[i+1] * sub
+		if d[i+2] <= 0 {
+			return false
+		}
+
+		// Eliminate with pivot d[i+2], giving e[i+2] and d[i+3].
+		sub = e[i+2]
+		e[i+2] = sub / d[i+2]
+		d[i+3] -= e[i+2] * sub
+		if d[i+3] <= 0 {
+			return false
+		}
+
+		// Eliminate with pivot d[i+3], giving e[i+3] and d[i+4].
+		sub = e[i+3]
+		e[i+3] = sub / d[i+3]
+		d[i+4] -= e[i+3] * sub
+	}
+	// The last diagonal element decides positive definiteness.
+	return d[n-1] > 0
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dpttrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dpttrs.go
new file mode 100644
index 00000000000..7bdee6f937d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dpttrs.go
@@ -0,0 +1,51 @@
+// Copyright ©2023 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Dpttrs solves a tridiagonal system of the form
+//
+//	A * X = B
+//
+// using the L*D*Lᵀ factorization of A computed by Dpttrf. d holds the diagonal
+// matrix D, e holds the subdiagonal of the unit bidiagonal matrix L, and b
+// holds the n×nrhs matrix B, which is passed on to dptts2 in column blocks.
+func (impl Implementation) Dpttrs(n, nrhs int, d, e []float64, b []float64, ldb int) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	// Nothing to solve for an empty system or without right-hand sides.
+	if n == 0 || nrhs == 0 {
+		return
+	}
+
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(b) < (n-1)*ldb+nrhs:
+		panic(shortB)
+	}
+
+	blockSize := 1
+	if nrhs > 1 {
+		blockSize = max(1, impl.Ilaenv(1, "DPTTRS", " ", n, nrhs, -1, -1))
+	}
+	if blockSize >= nrhs {
+		impl.dptts2(n, nrhs, d, e, b, ldb)
+		return
+	}
+	// Solve for at most blockSize right-hand side columns at a time.
+	for col := 0; col < nrhs; col += blockSize {
+		width := min(nrhs-col, blockSize)
+		impl.dptts2(n, width, d, e, b[col:], ldb)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dptts2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dptts2.go
new file mode 100644
index 00000000000..ff1df168f23
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dptts2.go
@@ -0,0 +1,39 @@
+// Copyright ©2023 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/blas/blas64"
+
+// dptts2 solves a tridiagonal system of the form
+//
+//	A * X = B
+//
+// using the L*D*Lᵀ factorization of A computed by Dpttrf. d holds the diagonal
+// matrix D, e holds the subdiagonal of the unit bidiagonal matrix L, and b
+// holds the n×nrhs matrix B, which is overwritten with the solution X.
+func (impl Implementation) dptts2(n, nrhs int, d, e []float64, b []float64, ldb int) {
+	switch {
+	case n <= 0:
+		return
+	case n == 1:
+		// A 1×1 system only needs its single row divided by d[0].
+		blas64.Implementation().Dscal(nrhs, 1/d[0], b, 1)
+		return
+	}
+
+	// Each right-hand side column is handled independently and is
+	// overwritten with its solution.
+	for col := 0; col < nrhs; col++ {
+		// Forward substitution: solve L * x = b.
+		for i := 0; i < n-1; i++ {
+			b[(i+1)*ldb+col] -= b[i*ldb+col] * e[i]
+		}
+		// Backward substitution: solve D * Lᵀ * x = b.
+		b[(n-1)*ldb+col] /= d[n-1]
+		for i := n - 2; i >= 0; i-- {
+			b[i*ldb+col] = b[i*ldb+col]/d[i] - b[(i+1)*ldb+col]*e[i]
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/drscl.go b/vendor/gonum.org/v1/gonum/lapack/gonum/drscl.go
new file mode 100644
index 00000000000..b2772dbc224
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/drscl.go
@@ -0,0 +1,63 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Drscl multiplies the vector x by 1/a being careful to avoid overflow or
+// underflow where possible.
+//
+// Drscl is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Drscl(n int, a float64, x []float64, incX int) {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case incX <= 0:
+		panic(badIncX)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	if len(x) < 1+(n-1)*incX {
+		panic(shortX)
+	}
+
+	bi := blas64.Implementation()
+
+	cden := a
+	cnum := 1.0
+	smlnum := dlamchS
+	bignum := 1 / smlnum
+	for {
+		cden1 := cden * smlnum
+		cnum1 := cnum / bignum
+		var mul float64
+		var done bool
+		switch {
+		case cnum != 0 && math.Abs(cden1) > math.Abs(cnum):
+			mul = smlnum
+			done = false
+			cden = cden1
+		case math.Abs(cnum1) > math.Abs(cden):
+			mul = bignum
+			done = false
+			cnum = cnum1
+		default:
+			mul = cnum / cden
+			done = true
+		}
+		bi.Dscal(n, mul, x, incX)
+		if done {
+			break
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsteqr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsteqr.go
new file mode 100644
index 00000000000..d6c7861ab5b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsteqr.go
@@ -0,0 +1,376 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dsteqr computes the eigenvalues and optionally the eigenvectors of a symmetric
+// tridiagonal matrix using the implicit QL or QR method. The eigenvectors of a
+// full or band symmetric matrix can also be found if Dsytrd, Dsptrd, or Dsbtrd
+// have been used to reduce this matrix to tridiagonal form.
+//
+// d, on entry, contains the diagonal elements of the tridiagonal matrix. On exit,
+// d contains the eigenvalues in ascending order. d must have length at least n
+// and Dsteqr will panic otherwise.
+//
+// e, on entry, contains the off-diagonal elements of the tridiagonal matrix, and
+// is overwritten during the call to Dsteqr. e must have length at least n-1 and
+// Dsteqr will panic otherwise.
+//
+// If compz == lapack.EVOrig, z contains on entry the n×n orthogonal matrix used
+// in the reduction to tridiagonal form and on exit the orthonormal eigenvectors
+// of the original symmetric matrix. If compz == lapack.EVTridiag, z contains on
+// exit the orthonormal eigenvectors of the symmetric tridiagonal matrix. z is
+// not used if compz == lapack.EVCompNone.
+//
+// work must have length at least max(1, 2*n-2) if the eigenvectors are computed,
+// and Dsteqr will panic otherwise.
+//
+// Dsteqr returns false if the algorithm failed to converge in 30*n iterations.
+//
+// Dsteqr is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dsteqr(compz lapack.EVComp, n int, d, e, z []float64, ldz int, work []float64) (ok bool) {
+	switch {
+	case compz != lapack.EVCompNone && compz != lapack.EVTridiag && compz != lapack.EVOrig:
+		panic(badEVComp)
+	case n < 0:
+		panic(nLT0)
+	case ldz < 1, compz != lapack.EVCompNone && ldz < n:
+		panic(badLdZ)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return true
+	}
+
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case compz != lapack.EVCompNone && len(z) < (n-1)*ldz+n:
+		panic(shortZ)
+	case compz != lapack.EVCompNone && len(work) < max(1, 2*n-2):
+		panic(shortWork)
+	}
+
+	var icompz int
+	if compz == lapack.EVOrig {
+		icompz = 1
+	} else if compz == lapack.EVTridiag {
+		icompz = 2
+	}
+
+	if n == 1 {
+		if icompz == 2 {
+			z[0] = 1
+		}
+		return true
+	}
+
+	bi := blas64.Implementation()
+
+	eps := dlamchE
+	eps2 := eps * eps
+	safmin := dlamchS
+	safmax := 1 / safmin
+	ssfmax := math.Sqrt(safmax) / 3
+	ssfmin := math.Sqrt(safmin) / eps2
+
+	// Compute the eigenvalues and eigenvectors of the tridiagonal matrix.
+	if icompz == 2 {
+		impl.Dlaset(blas.All, n, n, 0, 1, z, ldz)
+	}
+	const maxit = 30
+	nmaxit := n * maxit
+	jtot := 0
+
+	// Determine where the matrix splits and choose QL or QR iteration for each
+	// block, according to whether top or bottom diagonal element is smaller.
+	l1 := 0
+	nm1 := n - 1
+
+	type scaletype int
+	const (
+		down scaletype = iota + 1
+		up
+	)
+
+	for {
+		if l1 > n-1 {
+			// Order eigenvalues and eigenvectors.
+			if icompz == 0 {
+				impl.Dlasrt(lapack.SortIncreasing, n, d)
+			} else {
+				// TODO(btracey): Consider replacing this sort with a call to sort.Sort.
+				for ii := 1; ii < n; ii++ {
+					i := ii - 1
+					k := i
+					p := d[i]
+					for j := ii; j < n; j++ {
+						if d[j] < p {
+							k = j
+							p = d[j]
+						}
+					}
+					if k != i {
+						d[k] = d[i]
+						d[i] = p
+						bi.Dswap(n, z[i:], ldz, z[k:], ldz)
+					}
+				}
+			}
+			return true
+		}
+		if l1 > 0 {
+			e[l1-1] = 0
+		}
+		var m int
+		if l1 <= nm1 {
+			for m = l1; m < nm1; m++ {
+				test := math.Abs(e[m])
+				if test == 0 {
+					break
+				}
+				if test <= (math.Sqrt(math.Abs(d[m]))*math.Sqrt(math.Abs(d[m+1])))*eps {
+					e[m] = 0
+					break
+				}
+			}
+		}
+		l := l1
+		lsv := l
+		lend := m
+		lendsv := lend
+		l1 = m + 1
+		if lend == l {
+			continue
+		}
+
+		// Scale submatrix in rows and columns L to Lend.
+		anorm := impl.Dlanst(lapack.MaxAbs, lend-l+1, d[l:], e[l:])
+		var iscale scaletype // Reset per block so stale scaling is never undone.
+		switch {
+		case anorm == 0:
+			continue
+		case anorm > ssfmax:
+			iscale = down
+			// Pretend that d and e are matrices with 1 column.
+			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l+1, 1, d[l:], 1)
+			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l, 1, e[l:], 1)
+		case anorm < ssfmin:
+			iscale = up
+			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l+1, 1, d[l:], 1)
+			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l, 1, e[l:], 1)
+		}
+
+		// Choose between QL and QR.
+		if math.Abs(d[lend]) < math.Abs(d[l]) {
+			lend = lsv
+			l = lendsv
+		}
+		if lend > l {
+			// QL Iteration. Look for small subdiagonal element.
+			for {
+				if l != lend {
+					for m = l; m < lend; m++ {
+						v := math.Abs(e[m])
+						if v*v <= (eps2*math.Abs(d[m]))*math.Abs(d[m+1])+safmin {
+							break
+						}
+					}
+				} else {
+					m = lend
+				}
+				if m < lend {
+					e[m] = 0
+				}
+				p := d[l]
+				if m == l {
+					// Eigenvalue found.
+					l++
+					if l > lend {
+						break
+					}
+					continue
+				}
+
+				// If remaining matrix is 2×2, use Dlae2 to compute its eigensystem.
+				if m == l+1 {
+					if icompz > 0 {
+						d[l], d[l+1], work[l], work[n-1+l] = impl.Dlaev2(d[l], e[l], d[l+1])
+						impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward,
+							n, 2, work[l:], work[n-1+l:], z[l:], ldz)
+					} else {
+						d[l], d[l+1] = impl.Dlae2(d[l], e[l], d[l+1])
+					}
+					e[l] = 0
+					l += 2
+					if l > lend {
+						break
+					}
+					continue
+				}
+
+				if jtot == nmaxit {
+					break
+				}
+				jtot++
+
+				// Form shift
+				g := (d[l+1] - p) / (2 * e[l])
+				r := impl.Dlapy2(g, 1)
+				g = d[m] - p + e[l]/(g+math.Copysign(r, g))
+				s := 1.0
+				c := 1.0
+				p = 0.0
+
+				// Inner loop
+				for i := m - 1; i >= l; i-- {
+					f := s * e[i]
+					b := c * e[i]
+					c, s, r = impl.Dlartg(g, f)
+					if i != m-1 {
+						e[i+1] = r
+					}
+					g = d[i+1] - p
+					r = (d[i]-g)*s + 2*c*b
+					p = s * r
+					d[i+1] = g + p
+					g = c*r - b
+
+					// If eigenvectors are desired, then save rotations.
+					if icompz > 0 {
+						work[i] = c
+						work[n-1+i] = -s
+					}
+				}
+				// If eigenvectors are desired, then apply saved rotations.
+				if icompz > 0 {
+					mm := m - l + 1
+					impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward,
+						n, mm, work[l:], work[n-1+l:], z[l:], ldz)
+				}
+				d[l] -= p
+				e[l] = g
+			}
+		} else {
+			// QR Iteration.
+			// Look for small superdiagonal element.
+			for {
+				if l != lend {
+					for m = l; m > lend; m-- {
+						v := math.Abs(e[m-1])
+						if v*v <= (eps2*math.Abs(d[m])*math.Abs(d[m-1]) + safmin) {
+							break
+						}
+					}
+				} else {
+					m = lend
+				}
+				if m > lend {
+					e[m-1] = 0
+				}
+				p := d[l]
+				if m == l {
+					// Eigenvalue found
+					l--
+					if l < lend {
+						break
+					}
+					continue
+				}
+
+				// If remaining matrix is 2×2, use Dlae2 to compute its eigenvalues.
+				if m == l-1 {
+					if icompz > 0 {
+						d[l-1], d[l], work[m], work[n-1+m] = impl.Dlaev2(d[l-1], e[l-1], d[l])
+						impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward,
+							n, 2, work[m:], work[n-1+m:], z[l-1:], ldz)
+					} else {
+						d[l-1], d[l] = impl.Dlae2(d[l-1], e[l-1], d[l])
+					}
+					e[l-1] = 0
+					l -= 2
+					if l < lend {
+						break
+					}
+					continue
+				}
+				if jtot == nmaxit {
+					break
+				}
+				jtot++
+
+				// Form shift.
+				g := (d[l-1] - p) / (2 * e[l-1])
+				r := impl.Dlapy2(g, 1)
+				g = d[m] - p + (e[l-1])/(g+math.Copysign(r, g))
+				s := 1.0
+				c := 1.0
+				p = 0.0
+
+				// Inner loop.
+				for i := m; i < l; i++ {
+					f := s * e[i]
+					b := c * e[i]
+					c, s, r = impl.Dlartg(g, f)
+					if i != m {
+						e[i-1] = r
+					}
+					g = d[i] - p
+					r = (d[i+1]-g)*s + 2*c*b
+					p = s * r
+					d[i] = g + p
+					g = c*r - b
+
+					// If eigenvectors are desired, then save rotations.
+					if icompz > 0 {
+						work[i] = c
+						work[n-1+i] = s
+					}
+				}
+
+				// If eigenvectors are desired, then apply saved rotations.
+				if icompz > 0 {
+					mm := l - m + 1
+					impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward,
+						n, mm, work[m:], work[n-1+m:], z[m:], ldz)
+				}
+				d[l] -= p
+				e[l-1] = g
+			}
+		}
+
+		// Undo scaling if necessary.
+		switch iscale {
+		case down:
+			// Pretend that d and e are matrices with 1 column.
+			impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv+1, 1, d[lsv:], 1)
+			impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv, 1, e[lsv:], 1)
+		case up:
+			impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv+1, 1, d[lsv:], 1)
+			impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv, 1, e[lsv:], 1)
+		}
+
+		// Check for no convergence to an eigenvalue after a total of n*maxit iterations.
+		if jtot >= nmaxit {
+			break
+		}
+	}
+	for i := 0; i < n-1; i++ {
+		if e[i] != 0 {
+			return false
+		}
+	}
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsterf.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsterf.go
new file mode 100644
index 00000000000..dc1e178dfa1
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsterf.go
@@ -0,0 +1,285 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dsterf computes all eigenvalues of a symmetric tridiagonal matrix using the
+// Pal-Walker-Kahan variant of the QL or QR algorithm.
+//
+// d contains the diagonal elements of the tridiagonal matrix on entry and, when
+// Dsterf returns true, the eigenvalues in ascending order on exit. e contains
+// the off-diagonal elements on entry and is overwritten during the call. Dsterf
+// panics if n < 0, or if n > 0 and len(d) < n or len(e) < n-1.
+//
+// Dsterf returns false if the limit of 30*n iterations is reached while
+// off-diagonal elements remain nonzero, and true otherwise.
+//
+// Dsterf is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dsterf(n int, d, e []float64) (ok bool) {
+	if n < 0 {
+		panic(nLT0)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return true
+	}
+
+	switch {
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	}
+
+	if n == 1 {
+		return true
+	}
+
+	const (
+		none = 0 // The values are not scaled.
+		down = 1 // Set when anorm > ssfmax; the block is rescaled by Dlascl from anorm to ssfmax.
+		up   = 2 // Set when anorm < ssfmin; the block is rescaled by Dlascl from anorm to ssfmin.
+	)
+
+	// Determine the unit roundoff for this environment.
+	eps := dlamchE
+	eps2 := eps * eps
+	safmin := dlamchS
+	safmax := 1 / safmin
+	ssfmax := math.Sqrt(safmax) / 3
+	ssfmin := math.Sqrt(safmin) / eps2
+
+	// Compute the eigenvalues of the tridiagonal matrix.
+	maxit := 30
+	nmaxit := n * maxit
+	jtot := 0
+
+	l1 := 0
+
+	for {
+		if l1 > n-1 {
+			impl.Dlasrt(lapack.SortIncreasing, n, d)
+			return true
+		}
+		if l1 > 0 {
+			e[l1-1] = 0
+		}
+		var m int // After the scan below, m is the last index of the current unreduced block.
+		for m = l1; m < n-1; m++ {
+			if math.Abs(e[m]) <= math.Sqrt(math.Abs(d[m]))*math.Sqrt(math.Abs(d[m+1]))*eps {
+				e[m] = 0
+				break
+			}
+		}
+
+		l := l1
+		lsv := l
+		lend := m
+		lendsv := lend
+		l1 = m + 1
+		if lend == 0 { // NOTE(review): reference DSTERF appears to skip any 1×1 block (lend == l) here — confirm.
+			continue
+		}
+
+		// Scale submatrix in rows and columns l to lend.
+		anorm := impl.Dlanst(lapack.MaxAbs, lend-l+1, d[l:], e[l:])
+		iscale := none
+		if anorm == 0 {
+			continue
+		}
+		if anorm > ssfmax {
+			iscale = down
+			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l+1, 1, d[l:], n)
+			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmax, lend-l, 1, e[l:], n)
+		} else if anorm < ssfmin {
+			iscale = up
+			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l+1, 1, d[l:], n)
+			impl.Dlascl(lapack.General, 0, 0, anorm, ssfmin, lend-l, 1, e[l:], n)
+		}
+
+		el := e[l:lend] // Square this block's off-diagonals; the iterations below work on the squares (hence the math.Sqrt calls).
+		for i, v := range el {
+			el[i] *= v
+		}
+
+		// Choose between QL and QR iteration.
+		if math.Abs(d[lend]) < math.Abs(d[l]) {
+			lend = lsv
+			l = lendsv
+		}
+		if lend >= l {
+			// QL Iteration.
+			// Look for small sub-diagonal element.
+			for {
+				if l != lend {
+					for m = l; m < lend; m++ {
+						if math.Abs(e[m]) <= eps2*(math.Abs(d[m]*d[m+1])) {
+							break
+						}
+					}
+				} else {
+					m = lend
+				}
+				if m < lend {
+					e[m] = 0
+				}
+				p := d[l]
+				if m == l {
+					// Eigenvalue found.
+					l++
+					if l > lend {
+						break
+					}
+					continue
+				}
+				// If remaining matrix is 2 by 2, use Dlae2 to compute its eigenvalues.
+				if m == l+1 {
+					d[l], d[l+1] = impl.Dlae2(d[l], math.Sqrt(e[l]), d[l+1])
+					e[l] = 0
+					l += 2
+					if l > lend {
+						break
+					}
+					continue
+				}
+				if jtot == nmaxit {
+					break
+				}
+				jtot++
+
+				// Form shift.
+				rte := math.Sqrt(e[l])
+				sigma := (d[l+1] - p) / (2 * rte)
+				r := impl.Dlapy2(sigma, 1)
+				sigma = p - (rte / (sigma + math.Copysign(r, sigma)))
+
+				c := 1.0
+				s := 0.0
+				gamma := d[m] - sigma
+				p = gamma * gamma
+
+				// Inner loop.
+				for i := m - 1; i >= l; i-- {
+					bb := e[i]
+					r := p + bb
+					if i != m-1 {
+						e[i+1] = s * r
+					}
+					oldc := c
+					c = p / r
+					s = bb / r
+					oldgam := gamma
+					alpha := d[i]
+					gamma = c*(alpha-sigma) - s*oldgam
+					d[i+1] = oldgam + (alpha - gamma)
+					if c != 0 {
+						p = (gamma * gamma) / c
+					} else {
+						p = oldc * bb
+					}
+				}
+				e[l] = s * p
+				d[l] = sigma + gamma
+			}
+		} else {
+			for {
+				// QR Iteration.
+				// Look for small super-diagonal element.
+				for m = l; m > lend; m-- {
+					if math.Abs(e[m-1]) <= eps2*math.Abs(d[m]*d[m-1]) {
+						break
+					}
+				}
+				if m > lend {
+					e[m-1] = 0
+				}
+				p := d[l]
+				if m == l {
+					// Eigenvalue found.
+					l--
+					if l < lend {
+						break
+					}
+					continue
+				}
+
+				// If remaining matrix is 2 by 2, use Dlae2 to compute its eigenvalues.
+				if m == l-1 {
+					d[l], d[l-1] = impl.Dlae2(d[l], math.Sqrt(e[l-1]), d[l-1])
+					e[l-1] = 0
+					l -= 2
+					if l < lend {
+						break
+					}
+					continue
+				}
+				if jtot == nmaxit {
+					break
+				}
+				jtot++
+
+				// Form shift.
+				rte := math.Sqrt(e[l-1])
+				sigma := (d[l-1] - p) / (2 * rte)
+				r := impl.Dlapy2(sigma, 1)
+				sigma = p - (rte / (sigma + math.Copysign(r, sigma)))
+
+				c := 1.0
+				s := 0.0
+				gamma := d[m] - sigma
+				p = gamma * gamma
+
+				// Inner loop.
+				for i := m; i < l; i++ {
+					bb := e[i]
+					r := p + bb
+					if i != m {
+						e[i-1] = s * r
+					}
+					oldc := c
+					c = p / r
+					s = bb / r
+					oldgam := gamma
+					alpha := d[i+1]
+					gamma = c*(alpha-sigma) - s*oldgam
+					d[i] = oldgam + alpha - gamma
+					if c != 0 {
+						p = (gamma * gamma) / c
+					} else {
+						p = oldc * bb
+					}
+				}
+				e[l-1] = s * p
+				d[l] = sigma + gamma
+			}
+		}
+
+		// Undo scaling if necessary. Only d is rescaled; e is not restored.
+		switch iscale {
+		case down:
+			impl.Dlascl(lapack.General, 0, 0, ssfmax, anorm, lendsv-lsv+1, 1, d[lsv:], n)
+		case up:
+			impl.Dlascl(lapack.General, 0, 0, ssfmin, anorm, lendsv-lsv+1, 1, d[lsv:], n)
+		}
+
+		// Check for no convergence to an eigenvalue after a total of n*maxit iterations.
+		if jtot >= nmaxit {
+			break
+		}
+	}
+	for _, v := range e[:n-1] { // Any remaining nonzero off-diagonal is reported as failure.
+		if v != 0 {
+			return false
+		}
+	}
+	impl.Dlasrt(lapack.SortIncreasing, n, d)
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsyev.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsyev.go
new file mode 100644
index 00000000000..5f57f3a5c97
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsyev.go
@@ -0,0 +1,130 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dsyev finds every eigenvalue of a real symmetric n×n matrix A and, when
+// requested, a full set of orthonormal eigenvectors.
+//
+// Only the triangle of a selected by uplo holds A on entry. jobz must be
+// lapack.EVNone or lapack.EVCompute. With lapack.EVCompute, a contains the
+// orthonormal eigenvectors of A on exit; with lapack.EVNone the selected
+// triangle of a is overwritten.
+//
+// On return w holds the eigenvalues in ascending order. w must have length
+// at least n, otherwise Dsyev panics.
+//
+// work is scratch storage of which lwork elements may be used. lwork must be
+// at least 3*n-1, otherwise Dsyev panics; the usable length limits the amount
+// of blocking. If lwork == -1, nothing is computed except the optimal work
+// length, which is stored into work[0].
+func (impl Implementation) Dsyev(jobz lapack.EVJob, uplo blas.Uplo, n int, a []float64, lda int, w, work []float64, lwork int) (ok bool) {
+	wantVectors := jobz == lapack.EVCompute
+	upper := uplo == blas.Upper
+	if !wantVectors && jobz != lapack.EVNone {
+		panic(badEVJob)
+	}
+	if !upper && uplo != blas.Lower {
+		panic(badUplo)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if lda < max(1, n) {
+		panic(badLdA)
+	}
+	if lwork != -1 && lwork < max(1, 3*n-1) {
+		panic(badLWork)
+	}
+	if len(work) < max(1, lwork) {
+		panic(shortWork)
+	}
+	if n == 0 {
+		return true // Nothing to do for an empty matrix.
+	}
+
+	uploName := "L"
+	if upper {
+		uploName = "U"
+	}
+	blockSize := impl.Ilaenv(1, "DSYTRD", uploName, n, -1, -1, -1)
+	optWork := max(1, (blockSize+2)*n)
+	if lwork == -1 {
+		work[0] = float64(optWork)
+		return false // Workspace query only: nothing was computed.
+	}
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+	if len(w) < n {
+		panic(shortW)
+	}
+
+	if n == 1 {
+		w[0] = a[0]
+		work[0] = 2
+		if wantVectors {
+			a[0] = 1
+		}
+		return true
+	}
+
+	safeMin := dlamchS
+	precision := dlamchP
+	smallNum := safeMin / precision
+	bigNum := 1 / smallNum
+	lowNorm := math.Sqrt(smallNum)
+	highNorm := math.Sqrt(bigNum)
+	// Scale matrix to allowable range, if necessary.
+	norm := impl.Dlansy(lapack.MaxAbs, uplo, n, a, lda, work)
+	var factor float64
+	needScale := true
+	switch {
+	case norm > 0 && norm < lowNorm:
+		factor = lowNorm / norm
+	case norm > highNorm:
+		factor = highNorm / norm
+	default:
+		needScale = false
+	}
+	if needScale {
+		tri := lapack.LowerTri
+		if upper {
+			tri = lapack.UpperTri
+		}
+		impl.Dlascl(tri, 0, 0, 1, factor, n, n, a, lda)
+	}
+	const eOff = 0
+	tauOff := eOff + n
+	restOff := tauOff + n
+	restLen := lwork - restOff
+	impl.Dsytrd(uplo, n, a, lda, w, work[eOff:], work[tauOff:], work[restOff:], restLen)
+
+	// Eigenvalues alone come from Dsterf. When eigenvectors are wanted, Dorgtr
+	// first generates the orthogonal matrix and Dsteqr is called afterwards.
+	if wantVectors {
+		impl.Dorgtr(uplo, n, a, lda, work[tauOff:], work[restOff:], restLen)
+		ok = impl.Dsteqr(lapack.EVComp(jobz), n, w, work[eOff:], a, lda, work[tauOff:])
+	} else {
+		ok = impl.Dsterf(n, w, work[eOff:])
+	}
+	if !ok {
+		return false
+	}
+
+	// Undo the scaling on the eigenvalues if the matrix was scaled.
+	if needScale {
+		blas64.Implementation().Dscal(n, 1/factor, w, 1)
+	}
+	work[0] = float64(optWork)
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsytd2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsytd2.go
new file mode 100644
index 00000000000..03e7cc07b07
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsytd2.go
@@ -0,0 +1,147 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dsytd2 reduces a symmetric n×n matrix A to symmetric tridiagonal form T by
+// an orthogonal similarity transformation
+//
+//	Qᵀ * A * Q = T
+//
+// On entry, the matrix is contained in the specified triangle of a. On exit,
+// if uplo == blas.Upper, the diagonal and first super-diagonal of a are
+// overwritten with the elements of T. The elements above the first super-diagonal
+// are overwritten with the elementary reflectors that are used with
+// the elements written to tau in order to construct Q. If uplo == blas.Lower,
+// the elements are written in the lower triangular region.
+//
+// d must have length at least n. e and tau must have length at least n-1. Dsytd2
+// will panic if these sizes are not met.
+//
+// Q is represented as a product of elementary reflectors.
+// If uplo == blas.Upper
+//
+//	Q = H_{n-2} * ... * H_1 * H_0
+//
+// and if uplo == blas.Lower
+//
+//	Q = H_0 * H_1 * ... * H_{n-2}
+//
+// where
+//
+//	H_i = I - tau * v * vᵀ
+//
+// where tau is stored in tau[i], and v is stored in a.
+//
+// If uplo == blas.Upper, v[0:i] is stored in A[0:i,i+1], v[i] = 1, and
+// v[i+1:] = 0. With vi marking the entries of the v of H_i, the elements of a are
+//
+//	[ d   e  v1  v2  v3]
+//	[     d   e  v2  v3]
+//	[         d   e  v3]
+//	[             d   e]
+//	[                 d]
+//
+// If uplo == blas.Lower, v[0:i+1] = 0, v[i+1] = 1, and v[i+2:] is stored in
+// A[i+2:n,i].
+// The elements of a are
+//
+//	[ d                ]
+//	[ e   d            ]
+//	[v0   e   d        ]
+//	[v0  v1   e   d    ]
+//	[v0  v1  v2   e   d]
+//
+// Dsytd2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dsytd2(uplo blas.Uplo, n int, a []float64, lda int, d, e, tau []float64) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(tau) < n-1:
+		panic(shortTau)
+	}
+
+	bi := blas64.Implementation()
+
+	if uplo == blas.Upper {
+		// Reduce the upper triangle of A.
+		for i := n - 2; i >= 0; i-- {
+			// Generate elementary reflector H_i = I - tau * v * vᵀ to
+			// annihilate A[0:i, i+1].
+			var taui float64
+			a[i*lda+i+1], taui = impl.Dlarfg(i+1, a[i*lda+i+1], a[i+1:], lda)
+			e[i] = a[i*lda+i+1]
+			if taui != 0 {
+				// Apply H_i from both sides to A[0:i+1, 0:i+1].
+				a[i*lda+i+1] = 1
+
+				// Compute x := tau * A * v storing x in tau[0:i+1].
+				bi.Dsymv(uplo, i+1, taui, a, lda, a[i+1:], lda, 0, tau, 1)
+
+				// Compute w := x - 1/2 * tau * (xᵀ * v) * v.
+				alpha := -0.5 * taui * bi.Ddot(i+1, tau, 1, a[i+1:], lda)
+				bi.Daxpy(i+1, alpha, a[i+1:], lda, tau, 1)
+
+				// Apply the transformation as a rank-2 update
+				// A = A - v * wᵀ - w * vᵀ.
+				bi.Dsyr2(uplo, i+1, -1, a[i+1:], lda, tau, 1, a, lda)
+				a[i*lda+i+1] = e[i]
+			}
+			d[i+1] = a[(i+1)*lda+i+1]
+			tau[i] = taui
+		}
+		d[0] = a[0]
+		return
+	}
+	// Reduce the lower triangle of A.
+	for i := 0; i < n-1; i++ {
+		// Generate elementary reflector H_i = I - tau * v * vᵀ to
+		// annihilate A[i+2:n, i].
+		var taui float64
+		a[(i+1)*lda+i], taui = impl.Dlarfg(n-i-1, a[(i+1)*lda+i], a[min(i+2, n-1)*lda+i:], lda)
+		e[i] = a[(i+1)*lda+i]
+		if taui != 0 {
+			// Apply H_i from both sides to A[i+1:n, i+1:n].
+			a[(i+1)*lda+i] = 1
+
+			// Compute x := tau * A * v, storing x in tau[i:n-1].
+			bi.Dsymv(uplo, n-i-1, taui, a[(i+1)*lda+i+1:], lda, a[(i+1)*lda+i:], lda, 0, tau[i:], 1)
+
+			// Compute w := x - 1/2 * tau * (xᵀ * v) * v.
+			alpha := -0.5 * taui * bi.Ddot(n-i-1, tau[i:], 1, a[(i+1)*lda+i:], lda)
+			bi.Daxpy(n-i-1, alpha, a[(i+1)*lda+i:], lda, tau[i:], 1)
+
+			// Apply the transformation as a rank-2 update
+			// A = A - v * wᵀ - w * vᵀ.
+			bi.Dsyr2(uplo, n-i-1, -1, a[(i+1)*lda+i:], lda, tau[i:], 1, a[(i+1)*lda+i+1:], lda)
+			a[(i+1)*lda+i] = e[i]
+		}
+		d[i] = a[i*lda+i]
+		tau[i] = taui
+	}
+	d[n-1] = a[(n-1)*lda+n-1]
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dsytrd.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dsytrd.go
new file mode 100644
index 00000000000..74d2287ed2b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dsytrd.go
@@ -0,0 +1,184 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dsytrd reduces a symmetric n×n matrix A to symmetric tridiagonal form by an
+// orthogonal similarity transformation
+//
+//	Qᵀ * A * Q = T
+//
+// where Q is an orthonormal matrix and T is symmetric and tridiagonal.
+//
+// On entry, a contains the elements of the input matrix in the triangle specified
+// by uplo. On exit, the diagonal and sub/super-diagonal are overwritten by the
+// corresponding elements of the tridiagonal matrix T. The remaining elements in
+// the triangle, along with the array tau, contain the data to construct Q as
+// the product of elementary reflectors.
+//
+// If uplo == blas.Upper, Q is constructed with
+//
+//	Q = H_{n-2} * ... * H_1 * H_0
+//
+// where
+//
+//	H_i = I - tau_i * v * vᵀ
+//
+// v is constructed as v[i+1:n] = 0, v[i] = 1, v[0:i] is stored in A[0:i, i+1].
+// The elements of A are
+//
+//	[ d   e  v1  v2  v3]
+//	[     d   e  v2  v3]
+//	[         d   e  v3]
+//	[             d   e]
+//	[                 d]
+//
+// If uplo == blas.Lower, Q is constructed with
+//
+//	Q = H_0 * H_1 * ... * H_{n-2}
+//
+// where
+//
+//	H_i = I - tau_i * v * vᵀ
+//
+// v is constructed as v[0:i+1] = 0, v[i+1] = 1, v[i+2:n] is stored in A[i+2:n, i].
+// The elements of A are
+//
+//	[ d                ]
+//	[ e   d            ]
+//	[v0   e   d        ]
+//	[v0  v1   e   d    ]
+//	[v0  v1  v2   e   d]
+//
+// d must have length at least n, and e and tau must have length at least n-1.
+// Dsytrd will panic if these conditions are not met.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At minimum,
+// lwork >= 1, and Dsytrd will panic otherwise. The amount of blocking is
+// limited by the usable length.
+// If lwork == -1, instead of computing Dsytrd the optimal work length is stored
+// into work[0].
+//
+// Dsytrd is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dsytrd(uplo blas.Uplo, n int, a []float64, lda int, d, e, tau, work []float64, lwork int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case lwork < 1 && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		work[0] = 1
+		return
+	}
+
+	nb := impl.Ilaenv(1, "DSYTRD", string(uplo), n, -1, -1, -1)
+	lworkopt := n * nb
+	if lwork == -1 {
+		work[0] = float64(lworkopt)
+		return
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(d) < n:
+		panic(shortD)
+	case len(e) < n-1:
+		panic(shortE)
+	case len(tau) < n-1:
+		panic(shortTau)
+	}
+
+	bi := blas64.Implementation()
+
+	nx := n
+	iws := 1
+	var ldwork int
+	if 1 < nb && nb < n {
+		// Determine when to cross over from blocked to unblocked code. The last
+		// block is always handled by unblocked code.
+		nx = max(nb, impl.Ilaenv(3, "DSYTRD", string(uplo), n, -1, -1, -1))
+		if nx < n {
+			// Determine if workspace is large enough for blocked code.
+			ldwork = nb
+			iws = n * ldwork
+			if lwork < iws {
+				// Not enough workspace to use optimal nb: determine the minimum
+				// value of nb and reduce nb or force use of unblocked code by
+				// setting nx = n.
+				nb = max(lwork/n, 1)
+				nbmin := impl.Ilaenv(2, "DSYTRD", string(uplo), n, -1, -1, -1)
+				if nb < nbmin {
+					nx = n
+				}
+			}
+		} else {
+			nx = n
+		}
+	} else {
+		nb = 1
+	}
+	ldwork = nb
+
+	if uplo == blas.Upper {
+		// Reduce the upper triangle of A. Columns 0:kk are handled by the
+		// unblocked method.
+		var i int
+		kk := n - ((n-nx+nb-1)/nb)*nb
+		for i = n - nb; i >= kk; i -= nb {
+			// Reduce columns i:i+nb to tridiagonal form and form the matrix W
+			// which is needed to update the unreduced part of the matrix.
+			impl.Dlatrd(uplo, i+nb, nb, a, lda, e, tau, work, ldwork)
+
+			// Update the unreduced submatrix A[0:i,0:i], using an update
+			// of the form A = A - V*Wᵀ - W*Vᵀ.
+			bi.Dsyr2k(uplo, blas.NoTrans, i, nb, -1, a[i:], lda, work, ldwork, 1, a, lda)
+
+			// Copy superdiagonal elements back into A, and diagonal elements into D.
+			for j := i; j < i+nb; j++ {
+				a[(j-1)*lda+j] = e[j-1]
+				d[j] = a[j*lda+j]
+			}
+		}
+		// Use unblocked code to reduce the last or only block, which is the
+		// leading kk×kk submatrix.
+		impl.Dsytd2(uplo, kk, a, lda, d, e, tau)
+	} else {
+		var i int
+		// Reduce the lower triangle of A.
+		for i = 0; i < n-nx; i += nb {
+			// Reduce columns i:i+nb to tridiagonal form and form the matrix W
+			// which is needed to update the unreduced part of the matrix.
+			impl.Dlatrd(uplo, n-i, nb, a[i*lda+i:], lda, e[i:], tau[i:], work, ldwork)
+
+			// Update the unreduced submatrix A[i+nb:n, i+nb:n], using an update
+			// of the form A = A - V*Wᵀ - W*Vᵀ.
+			bi.Dsyr2k(uplo, blas.NoTrans, n-i-nb, nb, -1, a[(i+nb)*lda+i:], lda,
+				work[nb*ldwork:], ldwork, 1, a[(i+nb)*lda+i+nb:], lda)
+
+			// Copy subdiagonal elements back into A, and diagonal elements into D.
+			for j := i; j < i+nb; j++ {
+				a[(j+1)*lda+j] = e[j]
+				d[j] = a[j*lda+j]
+			}
+		}
+		// Use unblocked code to reduce the last or only block.
+		impl.Dsytd2(uplo, n-i, a[i*lda+i:], lda, d[i:], e[i:], tau[i:])
+	}
+	work[0] = float64(iws)
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtbtrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtbtrs.go
new file mode 100644
index 00000000000..6b56d9e0f01
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtbtrs.go
@@ -0,0 +1,77 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dtbtrs solves a triangular banded system of equations
+//
+//	A * X = B   if trans == blas.NoTrans
+//	Aᵀ * X = B  if trans == blas.Trans or blas.ConjTrans
+//
+// in which A is an n×n triangular band matrix with kd super- or subdiagonals
+// and B is an n×nrhs matrix.
+//
+// The returned value reports whether A is non-singular. When A is singular,
+// Dtbtrs returns false without computing a solution X.
+func (impl Implementation) Dtbtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, kd, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool) {
+	if uplo != blas.Upper && uplo != blas.Lower {
+		panic(badUplo)
+	}
+	if trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans {
+		panic(badTrans)
+	}
+	if diag != blas.NonUnit && diag != blas.Unit {
+		panic(badDiag)
+	}
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case kd < 0:
+		panic(kdLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case lda < kd+1:
+		panic(badLdA)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	// An empty system needs no work.
+	if n == 0 {
+		return true
+	}
+
+	if len(a) < (n-1)*lda+kd+1 {
+		panic(shortA)
+	}
+	if len(b) < (n-1)*ldb+nrhs {
+		panic(shortB)
+	}
+
+	// A zero on a non-unit diagonal means A is singular. The diagonal entry of
+	// row i is read from a[i*lda] for upper and a[i*lda+kd] for lower storage.
+	if diag == blas.NonUnit {
+		diagCol := kd
+		if uplo == blas.Upper {
+			diagCol = 0
+		}
+		for row := 0; row < n; row++ {
+			if a[row*lda+diagCol] == 0 {
+				return false
+			}
+		}
+	}
+
+	// Solve A * X = B  or Aᵀ * X = B, one column of B per Dtbsv call.
+	solver := blas64.Implementation()
+	for col := 0; col < nrhs; col++ {
+		solver.Dtbsv(uplo, trans, diag, n, kd, a, lda, b[col:], ldb)
+	}
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtgsja.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtgsja.go
new file mode 100644
index 00000000000..b3f0208a355
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtgsja.go
@@ -0,0 +1,389 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dtgsja computes the generalized singular value decomposition (GSVD)
+// of two real upper triangular or trapezoidal matrices A and B.
+//
+// A and B have the following forms, which may be obtained by the
+// preprocessing subroutine Dggsvp from a general m×n matrix A and p×n
+// matrix B:
+//
+//	          n-k-l  k    l
+//	A =    k [  0   A12  A13 ] if m-k-l >= 0;
+//	       l [  0    0   A23 ]
+//	   m-k-l [  0    0    0  ]
+//
+//	          n-k-l  k    l
+//	A =    k [  0   A12  A13 ] if m-k-l < 0;
+//	     m-k [  0    0   A23 ]
+//
+//	          n-k-l  k    l
+//	B =    l [  0    0   B13 ]
+//	     p-l [  0    0    0  ]
+//
+// where the k×k matrix A12 and l×l matrix B13 are non-singular
+// upper triangular. A23 is l×l upper triangular if m-k-l >= 0,
+// otherwise A23 is (m-k)×l upper trapezoidal.
+//
+// On exit,
+//
+//	Uᵀ*A*Q = D1*[ 0 R ], Vᵀ*B*Q = D2*[ 0 R ],
+//
+// where U, V and Q are orthogonal matrices.
+// R is a non-singular upper triangular matrix, and D1 and D2 are
+// diagonal matrices, which are of the following structures:
+//
+// If m-k-l >= 0,
+//
+//	                  k  l
+//	     D1 =     k [ I  0 ]
+//	              l [ 0  C ]
+//	          m-k-l [ 0  0 ]
+//
+//	                k  l
+//	     D2 = l   [ 0  S ]
+//	          p-l [ 0  0 ]
+//
+//	             n-k-l  k    l
+//	[ 0 R ] = k [  0   R11  R12 ] k
+//	          l [  0    0   R22 ] l
+//
+// where
+//
+//	C = diag( alpha_k, ... , alpha_{k+l} ),
+//	S = diag( beta_k,  ... , beta_{k+l} ),
+//	C^2 + S^2 = I.
+//
+// R is stored in
+//
+//	A[0:k+l, n-k-l:n]
+//
+// on exit.
+//
+// If m-k-l < 0,
+//
+//	               k m-k k+l-m
+//	    D1 =   k [ I  0    0  ]
+//	         m-k [ 0  C    0  ]
+//
+//	                 k m-k k+l-m
+//	    D2 =   m-k [ 0  S    0  ]
+//	         k+l-m [ 0  0    I  ]
+//	           p-l [ 0  0    0  ]
+//
+//	               n-k-l  k   m-k  k+l-m
+//	[ 0 R ] =    k [ 0    R11  R12  R13 ]
+//	           m-k [ 0     0   R22  R23 ]
+//	         k+l-m [ 0     0    0   R33 ]
+//
+// where
+//
+//	C = diag( alpha_k, ... , alpha_m ),
+//	S = diag( beta_k,  ... , beta_m ),
+//	C^2 + S^2 = I.
+//
+//	R = [ R11 R12 R13 ] is stored in A[0:m, n-k-l:n]
+//	    [  0  R22 R23 ]
+//
+// and R33 is stored in
+//
+//	B[m-k:l, n+m-k-l:n] on exit.
+//
+// The computation of the orthogonal transformation matrices U, V or Q
+// is optional. These matrices may either be formed explicitly, or they
+// may be post-multiplied into input matrices U1, V1, or Q1.
+//
+// Dtgsja essentially uses a variant of Kogbetliantz algorithm to reduce
+// min(l,m-k)×l triangular or trapezoidal matrix A23 and l×l
+// matrix B13 to the form:
+//
+//	U1ᵀ*A23*Q1 = C1*R1; V1ᵀ*B13*Q1 = S1*R1,
+//
+// where U1, V1 and Q1 are orthogonal matrices. C1 and S1 are diagonal
+// matrices satisfying
+//
+//	C1^2 + S1^2 = I,
+//
+// and R1 is an l×l non-singular upper triangular matrix.
+//
+// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior
+// is as follows
+//
+//	jobU == lapack.GSVDU        Compute orthogonal matrix U
+//	jobU == lapack.GSVDUnit     Use unit-initialized matrix
+//	jobU == lapack.GSVDNone     Do not compute orthogonal matrix.
+//
+// The behavior is the same for jobV and jobQ with the exception that instead of
+// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively.
+// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the
+// relevant job parameter is lapack.GSVDNone.
+//
+// k and l specify the sub-blocks in the input matrices A and B:
+//
+//	A23 = A[k:min(k+l,m), n-l:n] and B13 = B[0:l, n-l:n]
+//
+// of A and B, whose GSVD is going to be computed by Dtgsja.
+//
+// tola and tolb are the convergence criteria for the Jacobi-Kogbetliantz
+// iteration procedure. Generally, they are the same as used in the preprocessing
+// step, for example,
+//
+//	tola = max(m, n)*norm(A)*eps,
+//	tolb = max(p, n)*norm(B)*eps,
+//
+// where eps is the machine epsilon.
+//
+// work must have length at least 2*n, otherwise Dtgsja will panic.
+//
+// alpha and beta must have length n or Dtgsja will panic. On exit, alpha and
+// beta contain the generalized singular value pairs of A and B
+//
+//	alpha[0:k] = 1,
+//	beta[0:k]  = 0,
+//
+// if m-k-l >= 0,
+//
+//	alpha[k:k+l] = diag(C),
+//	beta[k:k+l]  = diag(S),
+//
+// if m-k-l < 0,
+//
+//	alpha[k:m]= C, alpha[m:k+l]= 0
+//	beta[k:m] = S, beta[m:k+l] = 1.
+//
+// if k+l < n,
+//
+//	alpha[k+l:n] = 0 and
+//	beta[k+l:n]  = 0.
+//
+// On exit, A[0:min(k+l,m), n-k-l:n] contains the triangular matrix R or part of R
+// and if necessary, B[m-k:l, n+m-k-l:n] contains a part of R.
+//
+// Dtgsja returns whether the routine converged and the number of iteration cycles
+// that were run.
+//
+// Dtgsja is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dtgsja(jobU, jobV, jobQ lapack.GSVDJob, m, p, n, k, l int, a []float64, lda int, b []float64, ldb int, tola, tolb float64, alpha, beta, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, work []float64) (cycles int, ok bool) {
+	const maxit = 40
+
+	initu := jobU == lapack.GSVDUnit
+	wantu := initu || jobU == lapack.GSVDU
+
+	initv := jobV == lapack.GSVDUnit
+	wantv := initv || jobV == lapack.GSVDV
+
+	initq := jobQ == lapack.GSVDUnit
+	wantq := initq || jobQ == lapack.GSVDQ
+
+	switch {
+	case !initu && !wantu && jobU != lapack.GSVDNone:
+		panic(badGSVDJob + "U")
+	case !initv && !wantv && jobV != lapack.GSVDNone:
+		panic(badGSVDJob + "V")
+	case !initq && !wantq && jobQ != lapack.GSVDNone:
+		panic(badGSVDJob + "Q")
+	case m < 0:
+		panic(mLT0)
+	case p < 0:
+		panic(pLT0)
+	case n < 0:
+		panic(nLT0)
+
+	case lda < max(1, n):
+		panic(badLdA)
+	case len(a) < (m-1)*lda+n:
+		panic(shortA)
+
+	case ldb < max(1, n):
+		panic(badLdB)
+	case len(b) < (p-1)*ldb+n:
+		panic(shortB)
+
+	case len(alpha) != n:
+		panic(badLenAlpha)
+	case len(beta) != n:
+		panic(badLenBeta)
+
+	case ldu < 1, wantu && ldu < m:
+		panic(badLdU)
+	case wantu && len(u) < (m-1)*ldu+m:
+		panic(shortU)
+
+	case ldv < 1, wantv && ldv < p:
+		panic(badLdV)
+	case wantv && len(v) < (p-1)*ldv+p:
+		panic(shortV)
+
+	case ldq < 1, wantq && ldq < n:
+		panic(badLdQ)
+	case wantq && len(q) < (n-1)*ldq+n:
+		panic(shortQ)
+
+	case len(work) < 2*n:
+		panic(shortWork)
+	}
+
+	// Initialize U, V and Q, if necessary
+	if initu {
+		impl.Dlaset(blas.All, m, m, 0, 1, u, ldu)
+	}
+	if initv {
+		impl.Dlaset(blas.All, p, p, 0, 1, v, ldv)
+	}
+	if initq {
+		impl.Dlaset(blas.All, n, n, 0, 1, q, ldq)
+	}
+
+	bi := blas64.Implementation()
+	minTol := math.Min(tola, tolb)
+
+	// Loop until convergence.
+	upper := false
+	for cycles = 1; cycles <= maxit; cycles++ {
+		upper = !upper
+
+		for i := 0; i < l-1; i++ {
+			for j := i + 1; j < l; j++ {
+				var a1, a2, a3 float64
+				if k+i < m {
+					a1 = a[(k+i)*lda+n-l+i]
+				}
+				if k+j < m {
+					a3 = a[(k+j)*lda+n-l+j]
+				}
+
+				b1 := b[i*ldb+n-l+i]
+				b3 := b[j*ldb+n-l+j]
+
+				var b2 float64
+				if upper {
+					if k+i < m {
+						a2 = a[(k+i)*lda+n-l+j]
+					}
+					b2 = b[i*ldb+n-l+j]
+				} else {
+					if k+j < m {
+						a2 = a[(k+j)*lda+n-l+i]
+					}
+					b2 = b[j*ldb+n-l+i]
+				}
+
+				csu, snu, csv, snv, csq, snq := impl.Dlags2(upper, a1, a2, a3, b1, b2, b3)
+
+				// Update (k+i)-th and (k+j)-th rows of matrix A: Uᵀ*A.
+				if k+j < m {
+					bi.Drot(l, a[(k+j)*lda+n-l:], 1, a[(k+i)*lda+n-l:], 1, csu, snu)
+				}
+
+				// Update i-th and j-th rows of matrix B: Vᵀ*B.
+				bi.Drot(l, b[j*ldb+n-l:], 1, b[i*ldb+n-l:], 1, csv, snv)
+
+				// Update (n-l+i)-th and (n-l+j)-th columns of matrices
+				// A and B: A*Q and B*Q.
+				bi.Drot(min(k+l, m), a[n-l+j:], lda, a[n-l+i:], lda, csq, snq)
+				bi.Drot(l, b[n-l+j:], ldb, b[n-l+i:], ldb, csq, snq)
+
+				if upper {
+					if k+i < m {
+						a[(k+i)*lda+n-l+j] = 0
+					}
+					b[i*ldb+n-l+j] = 0
+				} else {
+					if k+j < m {
+						a[(k+j)*lda+n-l+i] = 0
+					}
+					b[j*ldb+n-l+i] = 0
+				}
+
+				// Update orthogonal matrices U, V, Q, if desired.
+				if wantu && k+j < m {
+					bi.Drot(m, u[k+j:], ldu, u[k+i:], ldu, csu, snu)
+				}
+				if wantv {
+					bi.Drot(p, v[j:], ldv, v[i:], ldv, csv, snv)
+				}
+				if wantq {
+					bi.Drot(n, q[n-l+j:], ldq, q[n-l+i:], ldq, csq, snq)
+				}
+			}
+		}
+
+		if !upper {
+			// The matrices A23 and B13 were lower triangular at the start
+			// of the cycle, and are now upper triangular.
+			//
+			// Convergence test: test the parallelism of the corresponding
+			// rows of A and B.
+			var error float64
+			for i := 0; i < min(l, m-k); i++ {
+				bi.Dcopy(l-i, a[(k+i)*lda+n-l+i:], 1, work, 1)
+				bi.Dcopy(l-i, b[i*ldb+n-l+i:], 1, work[l:], 1)
+				ssmin := impl.Dlapll(l-i, work, 1, work[l:], 1)
+				error = math.Max(error, ssmin)
+			}
+			if math.Abs(error) <= minTol {
+				// The algorithm has converged.
+				// Compute the generalized singular value pairs (alpha, beta)
+				// and set the triangular matrix R to array A.
+				for i := 0; i < k; i++ {
+					alpha[i] = 1
+					beta[i] = 0
+				}
+
+				for i := 0; i < min(l, m-k); i++ {
+					a1 := a[(k+i)*lda+n-l+i]
+					b1 := b[i*ldb+n-l+i]
+					gamma := b1 / a1 // ±Inf when a1 == 0 and b1 != 0; that case takes the else branch below.
+					if !math.IsInf(gamma, 0) {
+						// Change sign if necessary.
+						if gamma < 0 {
+							bi.Dscal(l-i, -1, b[i*ldb+n-l+i:], 1)
+							if wantv {
+								bi.Dscal(p, -1, v[i:], ldv)
+							}
+						}
+						beta[k+i], alpha[k+i], _ = impl.Dlartg(math.Abs(gamma), 1)
+
+						if alpha[k+i] >= beta[k+i] {
+							bi.Dscal(l-i, 1/alpha[k+i], a[(k+i)*lda+n-l+i:], 1)
+						} else {
+							bi.Dscal(l-i, 1/beta[k+i], b[i*ldb+n-l+i:], 1)
+							bi.Dcopy(l-i, b[i*ldb+n-l+i:], 1, a[(k+i)*lda+n-l+i:], 1)
+						}
+					} else {
+						alpha[k+i] = 0
+						beta[k+i] = 1
+						bi.Dcopy(l-i, b[i*ldb+n-l+i:], 1, a[(k+i)*lda+n-l+i:], 1)
+					}
+				}
+
+				for i := m; i < k+l; i++ {
+					alpha[i] = 0
+					beta[i] = 1
+				}
+				if k+l < n {
+					for i := k + l; i < n; i++ {
+						alpha[i] = 0
+						beta[i] = 0
+					}
+				}
+
+				return cycles, true
+			}
+		}
+	}
+
+	// The algorithm has not converged after maxit cycles.
+	return cycles, false // cycles is maxit+1 here, since the loop ran to completion.
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrcon.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrcon.go
new file mode 100644
index 00000000000..899c95dd58c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrcon.go
@@ -0,0 +1,90 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dtrcon estimates the reciprocal of the condition number of an n×n triangular
+// matrix A in the 1-norm (lapack.MaxColumnSum) or ∞-norm (lapack.MaxRowSum).
+// It returns 1 if n == 0, and 0 if the norm of A is not positive, the
+// estimated norm of its inverse is zero, or a solve is scaled down too far.
+// work is a temporary data slice of length at least 3*n and iwork one of
+// length at least n, otherwise Dtrcon will panic.
+func (impl Implementation) Dtrcon(norm lapack.MatrixNorm, uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int, work []float64, iwork []int) float64 {
+	switch {
+	case norm != lapack.MaxColumnSum && norm != lapack.MaxRowSum:
+		panic(badNorm)
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case diag != blas.NonUnit && diag != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 {
+		return 1
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(work) < 3*n:
+		panic(shortWork)
+	case len(iwork) < n:
+		panic(shortIWork)
+	}
+
+	bi := blas64.Implementation()
+
+	var rcond float64
+	smlnum := dlamchS * float64(n)
+
+	anorm := impl.Dlantr(norm, uplo, diag, n, n, a, lda, work)
+
+	if anorm <= 0 {
+		return rcond
+	}
+	var ainvnm float64
+	var normin bool
+	kase1 := 2
+	if norm == lapack.MaxColumnSum {
+		kase1 = 1
+	}
+	var kase int
+	isave := new([3]int)
+	var scale float64
+	for {
+		ainvnm, kase = impl.Dlacn2(n, work[n:], work, iwork, ainvnm, kase, isave)
+		if kase == 0 {
+			if ainvnm != 0 {
+				rcond = (1 / anorm) / ainvnm
+			}
+			return rcond
+		}
+		if kase == kase1 {
+			scale = impl.Dlatrs(uplo, blas.NoTrans, diag, normin, n, a, lda, work, work[2*n:])
+		} else {
+			scale = impl.Dlatrs(uplo, blas.Trans, diag, normin, n, a, lda, work, work[2*n:])
+		}
+		normin = true
+		if scale != 1 {
+			ix := bi.Idamax(n, work, 1)
+			xnorm := math.Abs(work[ix])
+			if scale == 0 || scale < xnorm*smlnum {
+				return rcond
+			}
+			impl.Drscl(n, scale, work, 1)
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrevc3.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrevc3.go
new file mode 100644
index 00000000000..86197d3af55
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrevc3.go
@@ -0,0 +1,894 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Dtrevc3 computes some or all of the right and/or left eigenvectors of an n×n
+// upper quasi-triangular matrix T in Schur canonical form. Matrices of this
+// type are produced by the Schur factorization of a real general matrix A
+//
+//	A = Q T Qᵀ,
+//
+// as computed by Dhseqr.
+//
+// The right eigenvector x of T corresponding to an
+// eigenvalue λ is defined by
+//
+//	T x = λ x,
+//
+// and the left eigenvector y is defined by
+//
+//	yᵀ T = λ yᵀ.
+//
+// The eigenvalues are read directly from the diagonal blocks of T.
+//
+// This routine returns the matrices X and/or Y of right and left eigenvectors
+// of T, or the products Q*X and/or Q*Y, where Q is an input matrix. If Q is the
+// orthogonal factor that reduces a matrix A to Schur form T, then Q*X and Q*Y
+// are the matrices of right and left eigenvectors of A.
+//
+// If side == lapack.EVRight, only right eigenvectors will be computed.
+// If side == lapack.EVLeft, only left eigenvectors will be computed.
+// If side == lapack.EVBoth, both right and left eigenvectors will be computed.
+// For other values of side, Dtrevc3 will panic.
+//
+// If howmny == lapack.EVAll, all right and/or left eigenvectors will be
+// computed.
+// If howmny == lapack.EVAllMulQ, all right and/or left eigenvectors will be
+// computed and multiplied from left by the matrices in VR and/or VL.
+// If howmny == lapack.EVSelected, right and/or left eigenvectors will be
+// computed as indicated by selected.
+// For other values of howmny, Dtrevc3 will panic.
+//
+// selected specifies which eigenvectors will be computed. It must have length n
+// if howmny == lapack.EVSelected, and it is not referenced otherwise.
+// If w_j is a real eigenvalue, the corresponding real eigenvector will be
+// computed if selected[j] is true.
+// If w_j and w_{j+1} are the real and imaginary parts of a complex eigenvalue,
+// the corresponding complex eigenvector is computed if either selected[j] or
+// selected[j+1] is true, and on return selected[j] will be set to true and
+// selected[j+1] will be set to false.
+//
+// VL and VR are n×mm matrices. If howmny is lapack.EVAll or
+// lapack.EVAllMulQ, mm must be at least n. If howmny is
+// lapack.EVSelected, mm must be large enough to store the selected
+// eigenvectors. Each selected real eigenvector occupies one column and each
+// selected complex eigenvector occupies two columns. If mm is not sufficiently
+// large, Dtrevc3 will panic.
+//
+// On entry, if howmny is lapack.EVAllMulQ, it is assumed that VL (if side
+// is lapack.EVLeft or lapack.EVBoth) contains an n×n matrix QL,
+// and that VR (if side is lapack.EVRight or lapack.EVBoth) contains
+// an n×n matrix QR. QL and QR are typically the orthogonal matrix Q of Schur
+// vectors returned by Dhseqr.
+//
+// On return, if side is lapack.EVLeft or lapack.EVBoth,
+// VL will contain:
+//
+//	if howmny == lapack.EVAll,      the matrix Y of left eigenvectors of T,
+//	if howmny == lapack.EVAllMulQ,  the matrix Q*Y,
+//	if howmny == lapack.EVSelected, the left eigenvectors of T specified by
+//	                                selected, stored consecutively in the
+//	                                columns of VL, in the same order as their
+//	                                eigenvalues.
+//
+// VL is not referenced if side == lapack.EVRight.
+//
+// On return, if side is lapack.EVRight or lapack.EVBoth,
+// VR will contain:
+//
+//	if howmny == lapack.EVAll,      the matrix X of right eigenvectors of T,
+//	if howmny == lapack.EVAllMulQ,  the matrix Q*X,
+//	if howmny == lapack.EVSelected, the right eigenvectors of T specified by
+//	                                selected, stored consecutively in the
+//	                                columns of VR, in the same order as their
+//	                                eigenvalues.
+//
+// VR is not referenced if side == lapack.EVLeft.
+//
+// Complex eigenvectors corresponding to a complex eigenvalue are stored in VL
+// and VR in two consecutive columns, the first holding the real part, and the
+// second the imaginary part.
+//
+// Each eigenvector will be normalized so that the element of largest magnitude
+// has magnitude 1. Here the magnitude of a complex number (x,y) is taken to be
+// |x| + |y|.
+//
+// work must have length at least lwork and lwork must be at least max(1,3*n),
+// otherwise Dtrevc3 will panic. For optimum performance, lwork should be at
+// least n+2*n*nb, where nb is the optimal blocksize.
+//
+// If lwork == -1, instead of performing Dtrevc3, the function only estimates
+// the optimal workspace size based on n and stores it into work[0].
+//
+// Dtrevc3 returns the number of columns in VL and/or VR actually used to store
+// the eigenvectors.
+//
+// Dtrevc3 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dtrevc3(side lapack.EVSide, howmny lapack.EVHowMany, selected []bool, n int, t []float64, ldt int, vl []float64, ldvl int, vr []float64, ldvr int, mm int, work []float64, lwork int) (m int) {
+	bothv := side == lapack.EVBoth
+	rightv := side == lapack.EVRight || bothv
+	leftv := side == lapack.EVLeft || bothv
+	switch {
+	case !rightv && !leftv:
+		panic(badEVSide)
+	case howmny != lapack.EVAll && howmny != lapack.EVAllMulQ && howmny != lapack.EVSelected:
+		panic(badEVHowMany)
+	case n < 0:
+		panic(nLT0)
+	case ldt < max(1, n):
+		panic(badLdT)
+	case mm < 0:
+		panic(mmLT0)
+	case ldvl < 1:
+		// ldvl and ldvr are also checked below after the computation of
+		// m (number of columns of VL and VR) in case of howmny == EVSelected.
+		panic(badLdVL)
+	case ldvr < 1:
+		panic(badLdVR)
+	case lwork < max(1, 3*n) && lwork != -1:
+		panic(badLWork)
+	case len(work) < max(1, lwork):
+		panic(shortWork)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		work[0] = 1
+		return 0
+	}
+
+	// Normally we don't check slice lengths until after the workspace
+	// query. However, even in case of the workspace query we need to
+	// compute and return the value of m, and since the computation accesses t,
+	// we put the length check of t here.
+	if len(t) < (n-1)*ldt+n {
+		panic(shortT)
+	}
+
+	if howmny == lapack.EVSelected {
+		if len(selected) != n {
+			panic(badLenSelected)
+		}
+		// Set m to the number of columns required to store the selected
+		// eigenvectors, and standardize the slice selected.
+		// Each selected real eigenvector occupies one column and each
+		// selected complex eigenvector occupies two columns.
+		for j := 0; j < n; {
+			if j == n-1 || t[(j+1)*ldt+j] == 0 {
+				// Diagonal 1×1 block corresponding to a
+				// real eigenvalue.
+				if selected[j] {
+					m++
+				}
+				j++
+			} else {
+				// Diagonal 2×2 block corresponding to a
+				// complex eigenvalue.
+				if selected[j] || selected[j+1] {
+					selected[j] = true
+					selected[j+1] = false
+					m += 2
+				}
+				j += 2
+			}
+		}
+	} else {
+		m = n
+	}
+	if mm < m {
+		panic(badMm)
+	}
+
+	// Quick return in case of a workspace query.
+	nb := impl.Ilaenv(1, "DTREVC", string(side)+string(howmny), n, -1, -1, -1)
+	if lwork == -1 {
+		work[0] = float64(n + 2*n*nb)
+		return m
+	}
+
+	// Quick return if no eigenvectors were selected.
+	if m == 0 {
+		return 0
+	}
+
+	switch {
+	case leftv && ldvl < mm:
+		panic(badLdVL)
+	case leftv && len(vl) < (n-1)*ldvl+mm:
+		panic(shortVL)
+
+	case rightv && ldvr < mm:
+		panic(badLdVR)
+	case rightv && len(vr) < (n-1)*ldvr+mm:
+		panic(shortVR)
+	}
+
+	// Use blocked version of back-transformation if sufficient workspace.
+	// Zero-out the workspace to avoid potential NaN propagation.
+	const (
+		nbmin = 8
+		nbmax = 128
+	)
+	if howmny == lapack.EVAllMulQ && lwork >= n+2*n*nbmin {
+		nb = min((lwork-n)/(2*n), nbmax)
+		impl.Dlaset(blas.All, n, 1+2*nb, 0, 0, work[:n+2*nb*n], 1+2*nb)
+	} else {
+		nb = 1
+	}
+
+	// Set the constants to control overflow.
+	ulp := dlamchP
+	smlnum := float64(n) / ulp * dlamchS
+	bignum := (1 - ulp) / smlnum
+
+	// Split work into a vector of column norms and an n×2*nb matrix b.
+	norms := work[:n]
+	ldb := 2 * nb
+	b := work[n : n+n*ldb]
+
+	// Compute 1-norm of each column of strictly upper triangular part of T
+	// to control overflow in triangular solver.
+	norms[0] = 0
+	for j := 1; j < n; j++ {
+		var cn float64
+		for i := 0; i < j; i++ {
+			cn += math.Abs(t[i*ldt+j])
+		}
+		norms[j] = cn
+	}
+
+	bi := blas64.Implementation()
+
+	var (
+		x [4]float64
+
+		iv int // Index of column in current block.
+		is int
+
+		// ip is used below to specify the real or complex eigenvalue:
+		//  ip == 0, real eigenvalue,
+		//        1, first  of conjugate complex pair (wr,wi),
+		//       -1, second of conjugate complex pair (wr,wi).
+		ip        int
+		iscomplex [nbmax]int // Stores ip for each column in current block.
+	)
+
+	if side == lapack.EVLeft {
+		goto leftev
+	}
+
+	// Compute right eigenvectors.
+
+	// For complex right vector, iv-1 is for real part and iv for imaginary
+	// part. Non-blocked version always uses iv=1, blocked version starts
+	// with iv=nb-1 and goes down to 0 or 1.
+	iv = max(2, nb) - 1
+	ip = 0
+	is = m - 1
+	for ki := n - 1; ki >= 0; ki-- {
+		if ip == -1 {
+			// Previous iteration (ki+1) was second of
+			// conjugate pair, so this ki is first of
+			// conjugate pair.
+			ip = 1
+			continue
+		}
+
+		if ki == 0 || t[ki*ldt+ki-1] == 0 {
+			// Last column or zero on sub-diagonal, so this
+			// ki must be real eigenvalue.
+			ip = 0
+		} else {
+			// Non-zero on sub-diagonal, so this ki is
+			// second of conjugate pair.
+			ip = -1
+		}
+
+		if howmny == lapack.EVSelected {
+			if ip == 0 {
+				if !selected[ki] {
+					continue
+				}
+			} else if !selected[ki-1] {
+				continue
+			}
+		}
+
+		// Compute the ki-th eigenvalue (wr,wi).
+		wr := t[ki*ldt+ki]
+		var wi float64
+		if ip != 0 {
+			wi = math.Sqrt(math.Abs(t[ki*ldt+ki-1])) * math.Sqrt(math.Abs(t[(ki-1)*ldt+ki]))
+		}
+		smin := math.Max(ulp*(math.Abs(wr)+math.Abs(wi)), smlnum)
+
+		if ip == 0 {
+			// Real right eigenvector.
+
+			b[ki*ldb+iv] = 1
+			// Form right-hand side.
+			for k := 0; k < ki; k++ {
+				b[k*ldb+iv] = -t[k*ldt+ki]
+			}
+			// Solve upper quasi-triangular system:
+			//  [ T[0:ki,0:ki] - wr ]*X = scale*b.
+			for j := ki - 1; j >= 0; {
+				if j == 0 || t[j*ldt+j-1] == 0 {
+					// 1×1 diagonal block.
+					scale, xnorm, _ := impl.Dlaln2(false, 1, 1, smin, 1, t[j*ldt+j:], ldt,
+						1, 1, b[j*ldb+iv:], ldb, wr, 0, x[:1], 2)
+					// Scale X[0,0] to avoid overflow when updating the
+					// right-hand side.
+					if xnorm > 1 && norms[j] > bignum/xnorm {
+						x[0] /= xnorm
+						scale /= xnorm
+					}
+					// Scale if necessary.
+					if scale != 1 {
+						bi.Dscal(ki+1, scale, b[iv:], ldb)
+					}
+					b[j*ldb+iv] = x[0]
+					// Update right-hand side.
+					bi.Daxpy(j, -x[0], t[j:], ldt, b[iv:], ldb)
+					j--
+				} else {
+					// 2×2 diagonal block.
+					scale, xnorm, _ := impl.Dlaln2(false, 2, 1, smin, 1, t[(j-1)*ldt+j-1:], ldt,
+						1, 1, b[(j-1)*ldb+iv:], ldb, wr, 0, x[:3], 2)
+					// Scale X[0,0] and X[1,0] to avoid overflow
+					// when updating the right-hand side.
+					if xnorm > 1 {
+						beta := math.Max(norms[j-1], norms[j])
+						if beta > bignum/xnorm {
+							x[0] /= xnorm
+							x[2] /= xnorm
+							scale /= xnorm
+						}
+					}
+					// Scale if necessary.
+					if scale != 1 {
+						bi.Dscal(ki+1, scale, b[iv:], ldb)
+					}
+					b[(j-1)*ldb+iv] = x[0]
+					b[j*ldb+iv] = x[2]
+					// Update right-hand side.
+					bi.Daxpy(j-1, -x[0], t[j-1:], ldt, b[iv:], ldb)
+					bi.Daxpy(j-1, -x[2], t[j:], ldt, b[iv:], ldb)
+					j -= 2
+				}
+			}
+			// Copy the vector x or Q*x to VR and normalize.
+			switch {
+			case howmny != lapack.EVAllMulQ:
+				// No back-transform: copy x to VR and normalize.
+				bi.Dcopy(ki+1, b[iv:], ldb, vr[is:], ldvr)
+				ii := bi.Idamax(ki+1, vr[is:], ldvr)
+				remax := 1 / math.Abs(vr[ii*ldvr+is])
+				bi.Dscal(ki+1, remax, vr[is:], ldvr)
+				for k := ki + 1; k < n; k++ {
+					vr[k*ldvr+is] = 0
+				}
+			case nb == 1:
+				// Version 1: back-transform each vector with GEMV, Q*x.
+				if ki > 0 {
+					bi.Dgemv(blas.NoTrans, n, ki, 1, vr, ldvr, b[iv:], ldb,
+						b[ki*ldb+iv], vr[ki:], ldvr)
+				}
+				ii := bi.Idamax(n, vr[ki:], ldvr)
+				remax := 1 / math.Abs(vr[ii*ldvr+ki])
+				bi.Dscal(n, remax, vr[ki:], ldvr)
+			default:
+				// Version 2: back-transform block of vectors with GEMM.
+				// Zero out below vector.
+				for k := ki + 1; k < n; k++ {
+					b[k*ldb+iv] = 0
+				}
+				iscomplex[iv] = ip
+				// Back-transform and normalization is done below.
+			}
+		} else {
+			// Complex right eigenvector.
+
+			// Initial solve
+			//  [ ( T[ki-1,ki-1] T[ki-1,ki] ) - (wr + i*wi) ]*X = 0.
+			//  [ ( T[ki,  ki-1] T[ki,  ki] )               ]
+			if math.Abs(t[(ki-1)*ldt+ki]) >= math.Abs(t[ki*ldt+ki-1]) {
+				b[(ki-1)*ldb+iv-1] = 1
+				b[ki*ldb+iv] = wi / t[(ki-1)*ldt+ki]
+			} else {
+				b[(ki-1)*ldb+iv-1] = -wi / t[ki*ldt+ki-1]
+				b[ki*ldb+iv] = 1
+			}
+			b[ki*ldb+iv-1] = 0
+			b[(ki-1)*ldb+iv] = 0
+			// Form right-hand side.
+			for k := 0; k < ki-1; k++ {
+				b[k*ldb+iv-1] = -b[(ki-1)*ldb+iv-1] * t[k*ldt+ki-1]
+				b[k*ldb+iv] = -b[ki*ldb+iv] * t[k*ldt+ki]
+			}
+			// Solve upper quasi-triangular system:
+			//  [ T[0:ki-1,0:ki-1] - (wr+i*wi) ]*X = scale*(b1+i*b2)
+			for j := ki - 2; j >= 0; {
+				if j == 0 || t[j*ldt+j-1] == 0 {
+					// 1×1 diagonal block.
+
+					scale, xnorm, _ := impl.Dlaln2(false, 1, 2, smin, 1, t[j*ldt+j:], ldt,
+						1, 1, b[j*ldb+iv-1:], ldb, wr, wi, x[:2], 2)
+					// Scale X[0,0] and X[0,1] to avoid
+					// overflow when updating the right-hand side.
+					if xnorm > 1 && norms[j] > bignum/xnorm {
+						x[0] /= xnorm
+						x[1] /= xnorm
+						scale /= xnorm
+					}
+					// Scale if necessary.
+					if scale != 1 {
+						bi.Dscal(ki+1, scale, b[iv-1:], ldb)
+						bi.Dscal(ki+1, scale, b[iv:], ldb)
+					}
+					b[j*ldb+iv-1] = x[0]
+					b[j*ldb+iv] = x[1]
+					// Update the right-hand side.
+					bi.Daxpy(j, -x[0], t[j:], ldt, b[iv-1:], ldb)
+					bi.Daxpy(j, -x[1], t[j:], ldt, b[iv:], ldb)
+					j--
+				} else {
+					// 2×2 diagonal block.
+
+					scale, xnorm, _ := impl.Dlaln2(false, 2, 2, smin, 1, t[(j-1)*ldt+j-1:], ldt,
+						1, 1, b[(j-1)*ldb+iv-1:], ldb, wr, wi, x[:], 2)
+					// Scale X to avoid overflow when updating
+					// the right-hand side.
+					if xnorm > 1 {
+						beta := math.Max(norms[j-1], norms[j])
+						if beta > bignum/xnorm {
+							rec := 1 / xnorm
+							x[0] *= rec
+							x[1] *= rec
+							x[2] *= rec
+							x[3] *= rec
+							scale *= rec
+						}
+					}
+					// Scale if necessary.
+					if scale != 1 {
+						bi.Dscal(ki+1, scale, b[iv-1:], ldb)
+						bi.Dscal(ki+1, scale, b[iv:], ldb)
+					}
+					b[(j-1)*ldb+iv-1] = x[0]
+					b[(j-1)*ldb+iv] = x[1]
+					b[j*ldb+iv-1] = x[2]
+					b[j*ldb+iv] = x[3]
+					// Update the right-hand side.
+					bi.Daxpy(j-1, -x[0], t[j-1:], ldt, b[iv-1:], ldb)
+					bi.Daxpy(j-1, -x[1], t[j-1:], ldt, b[iv:], ldb)
+					bi.Daxpy(j-1, -x[2], t[j:], ldt, b[iv-1:], ldb)
+					bi.Daxpy(j-1, -x[3], t[j:], ldt, b[iv:], ldb)
+					j -= 2
+				}
+			}
+
+			// Copy the vector x or Q*x to VR and normalize.
+			switch {
+			case howmny != lapack.EVAllMulQ:
+				// No back-transform: copy x to VR and normalize.
+				bi.Dcopy(ki+1, b[iv-1:], ldb, vr[is-1:], ldvr)
+				bi.Dcopy(ki+1, b[iv:], ldb, vr[is:], ldvr)
+				emax := 0.0
+				for k := 0; k <= ki; k++ {
+					emax = math.Max(emax, math.Abs(vr[k*ldvr+is-1])+math.Abs(vr[k*ldvr+is]))
+				}
+				remax := 1 / emax
+				bi.Dscal(ki+1, remax, vr[is-1:], ldvr)
+				bi.Dscal(ki+1, remax, vr[is:], ldvr)
+				for k := ki + 1; k < n; k++ {
+					vr[k*ldvr+is-1] = 0
+					vr[k*ldvr+is] = 0
+				}
+			case nb == 1:
+				// Version 1: back-transform each vector with GEMV, Q*x.
+				if ki-1 > 0 {
+					bi.Dgemv(blas.NoTrans, n, ki-1, 1, vr, ldvr, b[iv-1:], ldb,
+						b[(ki-1)*ldb+iv-1], vr[ki-1:], ldvr)
+					bi.Dgemv(blas.NoTrans, n, ki-1, 1, vr, ldvr, b[iv:], ldb,
+						b[ki*ldb+iv], vr[ki:], ldvr)
+				} else {
+					bi.Dscal(n, b[(ki-1)*ldb+iv-1], vr[ki-1:], ldvr)
+					bi.Dscal(n, b[ki*ldb+iv], vr[ki:], ldvr)
+				}
+				emax := 0.0
+				for k := 0; k < n; k++ {
+					emax = math.Max(emax, math.Abs(vr[k*ldvr+ki-1])+math.Abs(vr[k*ldvr+ki]))
+				}
+				remax := 1 / emax
+				bi.Dscal(n, remax, vr[ki-1:], ldvr)
+				bi.Dscal(n, remax, vr[ki:], ldvr)
+			default:
+				// Version 2: back-transform block of vectors with GEMM.
+				// Zero out below vector.
+				for k := ki + 1; k < n; k++ {
+					b[k*ldb+iv-1] = 0
+					b[k*ldb+iv] = 0
+				}
+				iscomplex[iv-1] = -ip
+				iscomplex[iv] = ip
+				iv--
+				// Back-transform and normalization is done below.
+			}
+		}
+		if nb > 1 {
+			// Blocked version of back-transform.
+
+			// For complex case, ki2 includes both vectors (ki-1 and ki).
+			ki2 := ki
+			if ip != 0 {
+				ki2--
+			}
+			// Columns iv:nb of b are valid vectors.
+			// When the number of vectors stored reaches nb-1 or nb,
+			// or if this was last vector, do the Gemm.
+			if iv < 2 || ki2 == 0 {
+				bi.Dgemm(blas.NoTrans, blas.NoTrans, n, nb-iv, ki2+nb-iv,
+					1, vr, ldvr, b[iv:], ldb,
+					0, b[nb+iv:], ldb)
+				// Normalize vectors.
+				var remax float64
+				for k := iv; k < nb; k++ {
+					if iscomplex[k] == 0 {
+						// Real eigenvector.
+						ii := bi.Idamax(n, b[nb+k:], ldb)
+						remax = 1 / math.Abs(b[ii*ldb+nb+k])
+					} else if iscomplex[k] == 1 {
+						// First eigenvector of conjugate pair.
+						emax := 0.0
+						for ii := 0; ii < n; ii++ {
+							emax = math.Max(emax, math.Abs(b[ii*ldb+nb+k])+math.Abs(b[ii*ldb+nb+k+1]))
+						}
+						remax = 1 / emax
+						// Second eigenvector of conjugate pair
+						// will reuse this value of remax.
+					}
+					bi.Dscal(n, remax, b[nb+k:], ldb)
+				}
+				impl.Dlacpy(blas.All, n, nb-iv, b[nb+iv:], ldb, vr[ki2:], ldvr)
+				iv = nb - 1
+			} else {
+				iv--
+			}
+		}
+		is--
+		if ip != 0 {
+			is--
+		}
+	}
+
+	if side == lapack.EVRight {
+		return m
+	}
+
+leftev:
+	// Compute left eigenvectors.
+
+	// For complex left vector, iv is for real part and iv+1 for imaginary
+	// part. Non-blocked version always uses iv=0. Blocked version starts
+	// with iv=0, goes up to nb-2 or nb-1.
+	iv = 0
+	ip = 0
+	is = 0
+	for ki := 0; ki < n; ki++ {
+		if ip == 1 {
+			// Previous iteration ki-1 was first of conjugate pair,
+			// so this ki is second of conjugate pair.
+			ip = -1
+			continue
+		}
+
+		if ki == n-1 || t[(ki+1)*ldt+ki] == 0 {
+			// Last column or zero on sub-diagonal, so this ki must
+			// be real eigenvalue.
+			ip = 0
+		} else {
+			// Non-zero on sub-diagonal, so this ki is first of
+			// conjugate pair.
+			ip = 1
+		}
+		if howmny == lapack.EVSelected && !selected[ki] {
+			continue
+		}
+
+		// Compute the ki-th eigenvalue (wr,wi).
+		wr := t[ki*ldt+ki]
+		var wi float64
+		if ip != 0 {
+			wi = math.Sqrt(math.Abs(t[ki*ldt+ki+1])) * math.Sqrt(math.Abs(t[(ki+1)*ldt+ki]))
+		}
+		smin := math.Max(ulp*(math.Abs(wr)+math.Abs(wi)), smlnum)
+
+		if ip == 0 {
+			// Real left eigenvector.
+
+			b[ki*ldb+iv] = 1
+			// Form right-hand side.
+			for k := ki + 1; k < n; k++ {
+				b[k*ldb+iv] = -t[ki*ldt+k]
+			}
+			// Solve transposed quasi-triangular system:
+			//  [ T[ki+1:n,ki+1:n] - wr ]ᵀ * X = scale*b
+			vmax := 1.0
+			vcrit := bignum
+			for j := ki + 1; j < n; {
+				if j == n-1 || t[(j+1)*ldt+j] == 0 {
+					// 1×1 diagonal block.
+
+					// Scale if necessary to avoid overflow
+					// when forming the right-hand side.
+					if norms[j] > vcrit {
+						rec := 1 / vmax
+						bi.Dscal(n-ki, rec, b[ki*ldb+iv:], ldb)
+						vmax = 1
+					}
+					b[j*ldb+iv] -= bi.Ddot(j-ki-1, t[(ki+1)*ldt+j:], ldt, b[(ki+1)*ldb+iv:], ldb)
+					// Solve [ T[j,j] - wr ]ᵀ * X = b.
+					scale, _, _ := impl.Dlaln2(false, 1, 1, smin, 1, t[j*ldt+j:], ldt,
+						1, 1, b[j*ldb+iv:], ldb, wr, 0, x[:1], 2)
+					// Scale if necessary.
+					if scale != 1 {
+						bi.Dscal(n-ki, scale, b[ki*ldb+iv:], ldb)
+					}
+					b[j*ldb+iv] = x[0]
+					vmax = math.Max(math.Abs(b[j*ldb+iv]), vmax)
+					vcrit = bignum / vmax
+					j++
+				} else {
+					// 2×2 diagonal block.
+
+					// Scale if necessary to avoid overflow
+					// when forming the right-hand side.
+					beta := math.Max(norms[j], norms[j+1])
+					if beta > vcrit {
+						bi.Dscal(n-ki, 1/vmax, b[ki*ldb+iv:], ldb)
+						vmax = 1
+					}
+					b[j*ldb+iv] -= bi.Ddot(j-ki-1, t[(ki+1)*ldt+j:], ldt, b[(ki+1)*ldb+iv:], ldb)
+					b[(j+1)*ldb+iv] -= bi.Ddot(j-ki-1, t[(ki+1)*ldt+j+1:], ldt, b[(ki+1)*ldb+iv:], ldb)
+					// Solve
+					//  [ T[j,j]-wr  T[j,j+1]      ]ᵀ * X = scale*[ b1 ]
+					//  [ T[j+1,j]   T[j+1,j+1]-wr ]              [ b2 ]
+					scale, _, _ := impl.Dlaln2(true, 2, 1, smin, 1, t[j*ldt+j:], ldt,
+						1, 1, b[j*ldb+iv:], ldb, wr, 0, x[:3], 2)
+					// Scale if necessary.
+					if scale != 1 {
+						bi.Dscal(n-ki, scale, b[ki*ldb+iv:], ldb)
+					}
+					b[j*ldb+iv] = x[0]
+					b[(j+1)*ldb+iv] = x[2]
+					vmax = math.Max(vmax, math.Max(math.Abs(b[j*ldb+iv]), math.Abs(b[(j+1)*ldb+iv])))
+					vcrit = bignum / vmax
+					j += 2
+				}
+			}
+			// Copy the vector x or Q*x to VL and normalize.
+			switch {
+			case howmny != lapack.EVAllMulQ:
+				// No back-transform: copy x to VL and normalize.
+				bi.Dcopy(n-ki, b[ki*ldb+iv:], ldb, vl[ki*ldvl+is:], ldvl)
+				ii := bi.Idamax(n-ki, vl[ki*ldvl+is:], ldvl) + ki
+				remax := 1 / math.Abs(vl[ii*ldvl+is])
+				bi.Dscal(n-ki, remax, vl[ki*ldvl+is:], ldvl)
+				for k := 0; k < ki; k++ {
+					vl[k*ldvl+is] = 0
+				}
+			case nb == 1:
+				// Version 1: back-transform each vector with Gemv, Q*x.
+				if n-ki-1 > 0 {
+					bi.Dgemv(blas.NoTrans, n, n-ki-1,
+						1, vl[ki+1:], ldvl, b[(ki+1)*ldb+iv:], ldb,
+						b[ki*ldb+iv], vl[ki:], ldvl)
+				}
+				ii := bi.Idamax(n, vl[ki:], ldvl)
+				remax := 1 / math.Abs(vl[ii*ldvl+ki])
+				bi.Dscal(n, remax, vl[ki:], ldvl)
+			default:
+				// Version 2: back-transform block of vectors with GEMM.
+				// Zero out above vector.
+				for k := 0; k < ki; k++ {
+					b[k*ldb+iv] = 0
+				}
+				iscomplex[iv] = ip
+				// Back-transform and normalization is done below.
+			}
+		} else {
+			// Complex left eigenvector.
+
+			// Initial solve:
+			// [ [ T[ki,ki]   T[ki,ki+1]   ]ᵀ - (wr - i* wi) ]*X = 0.
+			// [ [ T[ki+1,ki] T[ki+1,ki+1] ]                 ]
+			if math.Abs(t[ki*ldt+ki+1]) >= math.Abs(t[(ki+1)*ldt+ki]) {
+				b[ki*ldb+iv] = wi / t[ki*ldt+ki+1]
+				b[(ki+1)*ldb+iv+1] = 1
+			} else {
+				b[ki*ldb+iv] = 1
+				b[(ki+1)*ldb+iv+1] = -wi / t[(ki+1)*ldt+ki]
+			}
+			b[(ki+1)*ldb+iv] = 0
+			b[ki*ldb+iv+1] = 0
+			// Form right-hand side.
+			for k := ki + 2; k < n; k++ {
+				b[k*ldb+iv] = -b[ki*ldb+iv] * t[ki*ldt+k]
+				b[k*ldb+iv+1] = -b[(ki+1)*ldb+iv+1] * t[(ki+1)*ldt+k]
+			}
+			// Solve transposed quasi-triangular system:
+			// [ T[ki+2:n,ki+2:n]ᵀ - (wr-i*wi) ]*X = b1+i*b2
+			vmax := 1.0
+			vcrit := bignum
+			for j := ki + 2; j < n; {
+				if j == n-1 || t[(j+1)*ldt+j] == 0 {
+					// 1×1 diagonal block.
+
+					// Scale if necessary to avoid overflow
+					// when forming the right-hand side elements.
+					if norms[j] > vcrit {
+						rec := 1 / vmax
+						bi.Dscal(n-ki, rec, b[ki*ldb+iv:], ldb)
+						bi.Dscal(n-ki, rec, b[ki*ldb+iv+1:], ldb)
+						vmax = 1
+					}
+					b[j*ldb+iv] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j:], ldt, b[(ki+2)*ldb+iv:], ldb)
+					b[j*ldb+iv+1] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j:], ldt, b[(ki+2)*ldb+iv+1:], ldb)
+					// Solve [ T[j,j]-(wr-i*wi) ]*(X11+i*X12) = b1+i*b2.
+					scale, _, _ := impl.Dlaln2(false, 1, 2, smin, 1, t[j*ldt+j:], ldt,
+						1, 1, b[j*ldb+iv:], ldb, wr, -wi, x[:2], 2)
+					// Scale if necessary.
+					if scale != 1 {
+						bi.Dscal(n-ki, scale, b[ki*ldb+iv:], ldb)
+						bi.Dscal(n-ki, scale, b[ki*ldb+iv+1:], ldb)
+					}
+					b[j*ldb+iv] = x[0]
+					b[j*ldb+iv+1] = x[1]
+					vmax = math.Max(vmax, math.Max(math.Abs(b[j*ldb+iv]), math.Abs(b[j*ldb+iv+1])))
+					vcrit = bignum / vmax
+					j++
+				} else {
+					// 2×2 diagonal block.
+
+					// Scale if necessary to avoid overflow
+					// when forming the right-hand side elements.
+					if math.Max(norms[j], norms[j+1]) > vcrit {
+						rec := 1 / vmax
+						bi.Dscal(n-ki, rec, b[ki*ldb+iv:], ldb)
+						bi.Dscal(n-ki, rec, b[ki*ldb+iv+1:], ldb)
+						vmax = 1
+					}
+					b[j*ldb+iv] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j:], ldt, b[(ki+2)*ldb+iv:], ldb)
+					b[j*ldb+iv+1] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j:], ldt, b[(ki+2)*ldb+iv+1:], ldb)
+					b[(j+1)*ldb+iv] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j+1:], ldt, b[(ki+2)*ldb+iv:], ldb)
+					b[(j+1)*ldb+iv+1] -= bi.Ddot(j-ki-2, t[(ki+2)*ldt+j+1:], ldt, b[(ki+2)*ldb+iv+1:], ldb)
+					// Solve 2×2 complex linear equation
+					//  [ [T[j,j]   T[j,j+1]  ]ᵀ - (wr-i*wi)*I ]*X = scale*b
+					//  [ [T[j+1,j] T[j+1,j+1]]                ]
+					scale, _, _ := impl.Dlaln2(true, 2, 2, smin, 1, t[j*ldt+j:], ldt,
+						1, 1, b[j*ldb+iv:], ldb, wr, -wi, x[:], 2)
+					// Scale if necessary.
+					if scale != 1 {
+						bi.Dscal(n-ki, scale, b[ki*ldb+iv:], ldb)
+						bi.Dscal(n-ki, scale, b[ki*ldb+iv+1:], ldb)
+					}
+					b[j*ldb+iv] = x[0]
+					b[j*ldb+iv+1] = x[1]
+					b[(j+1)*ldb+iv] = x[2]
+					b[(j+1)*ldb+iv+1] = x[3]
+					vmax01 := math.Max(math.Abs(x[0]), math.Abs(x[1]))
+					vmax23 := math.Max(math.Abs(x[2]), math.Abs(x[3]))
+					vmax = math.Max(vmax, math.Max(vmax01, vmax23))
+					vcrit = bignum / vmax
+					j += 2
+				}
+			}
+			// Copy the vector x or Q*x to VL and normalize.
+			switch {
+			case howmny != lapack.EVAllMulQ:
+				// No back-transform: copy x to VL and normalize.
+				bi.Dcopy(n-ki, b[ki*ldb+iv:], ldb, vl[ki*ldvl+is:], ldvl)
+				bi.Dcopy(n-ki, b[ki*ldb+iv+1:], ldb, vl[ki*ldvl+is+1:], ldvl)
+				emax := 0.0
+				for k := ki; k < n; k++ {
+					emax = math.Max(emax, math.Abs(vl[k*ldvl+is])+math.Abs(vl[k*ldvl+is+1]))
+				}
+				remax := 1 / emax
+				bi.Dscal(n-ki, remax, vl[ki*ldvl+is:], ldvl)
+				bi.Dscal(n-ki, remax, vl[ki*ldvl+is+1:], ldvl)
+				for k := 0; k < ki; k++ {
+					vl[k*ldvl+is] = 0
+					vl[k*ldvl+is+1] = 0
+				}
+			case nb == 1:
+				// Version 1: back-transform each vector with GEMV, Q*x.
+				if n-ki-2 > 0 {
+					bi.Dgemv(blas.NoTrans, n, n-ki-2,
+						1, vl[ki+2:], ldvl, b[(ki+2)*ldb+iv:], ldb,
+						b[ki*ldb+iv], vl[ki:], ldvl)
+					bi.Dgemv(blas.NoTrans, n, n-ki-2,
+						1, vl[ki+2:], ldvl, b[(ki+2)*ldb+iv+1:], ldb,
+						b[(ki+1)*ldb+iv+1], vl[ki+1:], ldvl)
+				} else {
+					bi.Dscal(n, b[ki*ldb+iv], vl[ki:], ldvl)
+					bi.Dscal(n, b[(ki+1)*ldb+iv+1], vl[ki+1:], ldvl)
+				}
+				emax := 0.0
+				for k := 0; k < n; k++ {
+					emax = math.Max(emax, math.Abs(vl[k*ldvl+ki])+math.Abs(vl[k*ldvl+ki+1]))
+				}
+				remax := 1 / emax
+				bi.Dscal(n, remax, vl[ki:], ldvl)
+				bi.Dscal(n, remax, vl[ki+1:], ldvl)
+			default:
+				// Version 2: back-transform block of vectors with GEMM.
+				// Zero out above vector.
+				// Could go from ki-nv+1 to ki-1.
+				for k := 0; k < ki; k++ {
+					b[k*ldb+iv] = 0
+					b[k*ldb+iv+1] = 0
+				}
+				iscomplex[iv] = ip
+				iscomplex[iv+1] = -ip
+				iv++
+				// Back-transform and normalization is done below.
+			}
+		}
+		if nb > 1 {
+			// Blocked version of back-transform.
+			// For complex case, ki2 includes both vectors ki and ki+1.
+			ki2 := ki
+			if ip != 0 {
+				ki2++
+			}
+			// Columns [0:iv+1] of b are valid vectors. When the
+			// number of vectors stored reaches nb-1 or nb, or if
+			// this was last vector, do the Gemm.
+			if iv >= nb-2 || ki2 == n-1 {
+				bi.Dgemm(blas.NoTrans, blas.NoTrans, n, iv+1, n-ki2+iv,
+					1, vl[ki2-iv:], ldvl, b[(ki2-iv)*ldb:], ldb,
+					0, b[nb:], ldb)
+				// Normalize vectors.
+				var remax float64
+				for k := 0; k <= iv; k++ {
+					if iscomplex[k] == 0 {
+						// Real eigenvector.
+						ii := bi.Idamax(n, b[nb+k:], ldb)
+						remax = 1 / math.Abs(b[ii*ldb+nb+k])
+					} else if iscomplex[k] == 1 {
+						// First eigenvector of conjugate pair.
+						emax := 0.0
+						for ii := 0; ii < n; ii++ {
+							emax = math.Max(emax, math.Abs(b[ii*ldb+nb+k])+math.Abs(b[ii*ldb+nb+k+1]))
+						}
+						remax = 1 / emax
+						// Second eigenvector of conjugate pair
+						// will reuse this value of remax.
+					}
+					bi.Dscal(n, remax, b[nb+k:], ldb)
+				}
+				impl.Dlacpy(blas.All, n, iv+1, b[nb:], ldb, vl[ki2-iv:], ldvl)
+				iv = 0
+			} else {
+				iv++
+			}
+		}
+		is++
+		if ip != 0 {
+			is++
+		}
+	}
+
+	return m
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrexc.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrexc.go
new file mode 100644
index 00000000000..2a0a5e7c6d0
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrexc.go
@@ -0,0 +1,230 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/lapack"
+
+// Dtrexc reorders the real Schur factorization of an n×n real matrix
+//
+//	A = Q*T*Qᵀ
+//
+// so that the diagonal block of T with row index ifst is moved to row ilst.
+//
+// On entry, T must be in Schur canonical form, that is, block upper triangular
+// with 1×1 and 2×2 diagonal blocks; each 2×2 diagonal block has its diagonal
+// elements equal and its off-diagonal elements of opposite sign.
+//
+// On return, T will be reordered by an orthogonal similarity transformation Z
+// as Zᵀ*T*Z, and will be again in Schur canonical form.
+//
+// If compq is lapack.UpdateSchur, on return the matrix Q of Schur vectors will be
+// updated by post-multiplying it with Z. If compq is lapack.UpdateSchurNone, the
+// matrix Q is not referenced and will not be updated, but ldq must still be at
+// least 1. For other values of compq Dtrexc will panic.
+//
+// ifst and ilst specify the reordering of the diagonal blocks of T. The block
+// with row index ifst is moved to row ilst, by a sequence of transpositions
+// between adjacent blocks.
+//
+// If ifst points to the second row of a 2×2 block, ifstOut will point to the
+// first row, otherwise it will be equal to ifst.
+//
+// ilstOut will point to the first row of the block in its final position. If ok
+// is true, ilstOut may differ from ilst by +1 or -1.
+//
+// If n > 0, it must hold that
+//
+//	0 <= ifst < n, and  0 <= ilst < n,
+//
+// otherwise Dtrexc will panic.
+//
+// If ok is false, two adjacent blocks were too close to swap because the
+// problem is very ill-conditioned. T may have been partially reordered, and
+// ilstOut will point to the first row of the block at the position to which it
+// has been moved.
+//
+// If n > 0, t must have length at least (n-1)*ldt+n, q (when compq is
+// lapack.UpdateSchur) at least (n-1)*ldq+n, and work at least n, otherwise Dtrexc will panic.
+//
+// Dtrexc is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dtrexc(compq lapack.UpdateSchurComp, n int, t []float64, ldt int, q []float64, ldq int, ifst, ilst int, work []float64) (ifstOut, ilstOut int, ok bool) {
+	switch {
+	case compq != lapack.UpdateSchur && compq != lapack.UpdateSchurNone:
+		panic(badUpdateSchurComp)
+	case n < 0:
+		panic(nLT0)
+	case ldt < max(1, n):
+		panic(badLdT)
+	case ldq < 1, compq == lapack.UpdateSchur && ldq < n:
+		panic(badLdQ)
+	case (ifst < 0 || n <= ifst) && n > 0:
+		panic(badIfst)
+	case (ilst < 0 || n <= ilst) && n > 0:
+		panic(badIlst)
+	}
+
+	// Nothing to reorder in an empty matrix; slice lengths are not checked in this case.
+	if n == 0 {
+		return ifst, ilst, true
+	}
+
+	switch {
+	case len(t) < (n-1)*ldt+n:
+		panic(shortT)
+	case compq == lapack.UpdateSchur && len(q) < (n-1)*ldq+n:
+		panic(shortQ)
+	case len(work) < n:
+		panic(shortWork)
+	}
+
+	// A 1×1 matrix has a single block, so there is nothing to move.
+	if n == 1 {
+		return ifst, ilst, true
+	}
+
+	// Determine the first row of the specified block
+	// and find out whether it is 1×1 or 2×2.
+	if ifst > 0 && t[ifst*ldt+ifst-1] != 0 {
+		ifst--
+	}
+	nbf := 1 // Size of the block at ifst, i.e. the block being moved.
+	if ifst+1 < n && t[(ifst+1)*ldt+ifst] != 0 {
+		nbf = 2
+	}
+	// Determine the first row of the block at the destination
+	// and find out whether it is 1×1 or 2×2.
+	if ilst > 0 && t[ilst*ldt+ilst-1] != 0 {
+		ilst--
+	}
+	nbl := 1 // Size of the block at ilst, i.e. at the destination.
+	if ilst+1 < n && t[(ilst+1)*ldt+ilst] != 0 {
+		nbl = 2
+	}
+
+	ok = true
+	wantq := compq == lapack.UpdateSchur
+
+	switch {
+	case ifst == ilst:
+		return ifst, ilst, true
+
+	case ifst < ilst:
+		// Adjust the target row when the moved block and the destination block differ in size.
+		switch {
+		case nbf == 2 && nbl == 1:
+			ilst--
+		case nbf == 1 && nbl == 2:
+			ilst++
+		}
+		here := ifst
+		for here < ilst {
+			// Swap block with next one below.
+			if nbf == 1 || nbf == 2 {
+				// Current block either 1×1 or 2×2.
+				nbnext := 1 // Size of the next block.
+				if here+nbf+1 < n && t[(here+nbf+1)*ldt+here+nbf] != 0 {
+					nbnext = 2
+				}
+				ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, nbf, nbnext, work)
+				if !ok {
+					return ifst, here, false
+				}
+				here += nbnext
+				// Test if 2×2 block breaks into two 1×1 blocks.
+				if nbf == 2 && t[(here+1)*ldt+here] == 0 {
+					nbf = 3 // Sentinel: the moved block is now two separate 1×1 blocks.
+				}
+				continue
+			}
+
+			// Current block consists of two 1×1 blocks each of
+			// which must be swapped individually.
+			nbnext := 1 // Size of the next block.
+			if here+3 < n && t[(here+3)*ldt+here+2] != 0 {
+				nbnext = 2
+			}
+			ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here+1, 1, nbnext, work)
+			if !ok {
+				return ifst, here, false
+			}
+			if nbnext == 1 {
+				// Swap two 1×1 blocks, no problems possible.
+				impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, 1, nbnext, work)
+				here++
+				continue
+			}
+			// Recompute nbnext in case 2×2 split.
+			if t[(here+2)*ldt+here+1] == 0 {
+				nbnext = 1
+			}
+			if nbnext == 2 {
+				// 2×2 block did not split.
+				ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, 1, nbnext, work)
+				if !ok {
+					return ifst, here, false
+				}
+			} else {
+				// 2×2 block did split.
+				impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, 1, 1, work)
+				impl.Dlaexc(wantq, n, t, ldt, q, ldq, here+1, 1, 1, work)
+			}
+			here += 2
+		}
+		return ifst, here, true
+
+	default: // ifst > ilst
+		here := ifst
+		for here > ilst {
+			// Swap block with next one above.
+			nbnext := 1
+			if here >= 2 && t[(here-1)*ldt+here-2] != 0 {
+				nbnext = 2
+			}
+			if nbf == 1 || nbf == 2 {
+				// Current block either 1×1 or 2×2.
+				ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here-nbnext, nbnext, nbf, work)
+				if !ok {
+					return ifst, here, false
+				}
+				here -= nbnext
+				// Test if 2×2 block breaks into two 1×1 blocks.
+				if nbf == 2 && t[(here+1)*ldt+here] == 0 {
+					nbf = 3 // Sentinel: the moved block is now two separate 1×1 blocks.
+				}
+				continue
+			}
+
+			// Current block consists of two 1×1 blocks each of
+			// which must be swapped individually.
+			ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here-nbnext, nbnext, 1, work)
+			if !ok {
+				return ifst, here, false
+			}
+			if nbnext == 1 {
+				// Swap two 1×1 blocks, no problems possible.
+				impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, nbnext, 1, work)
+				here--
+				continue
+			}
+			// Recompute nbnext in case 2×2 split.
+			if t[here*ldt+here-1] == 0 {
+				nbnext = 1
+			}
+			if nbnext == 2 {
+				// 2×2 block did not split.
+				ok = impl.Dlaexc(wantq, n, t, ldt, q, ldq, here-1, 2, 1, work)
+				if !ok {
+					return ifst, here, false
+				}
+			} else {
+				// 2×2 block did split.
+				impl.Dlaexc(wantq, n, t, ldt, q, ldq, here, 1, 1, work)
+				impl.Dlaexc(wantq, n, t, ldt, q, ldq, here-1, 1, 1, work)
+			}
+			here -= 2
+		}
+		return ifst, here, true
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrti2.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrti2.go
new file mode 100644
index 00000000000..efc24b65ea2
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrti2.go
@@ -0,0 +1,69 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dtrti2 computes the inverse of an n×n triangular matrix, storing the result in place
+// into a. This is the BLAS level 2 version of the algorithm. Diagonal elements are not checked for zero.
+//
+// Dtrti2 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dtrti2(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case diag != blas.NonUnit && diag != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 {
+		return
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+
+	bi := blas64.Implementation()
+
+	nonUnit := diag == blas.NonUnit
+	// TODO(btracey): Replace this with a row-major ordering.
+	if uplo == blas.Upper {
+		for j := 0; j < n; j++ {
+			var ajj float64
+			if nonUnit {
+				ajj = 1 / a[j*lda+j]
+				a[j*lda+j] = ajj
+				ajj *= -1 // The rest of column j is scaled by the negated inverse below.
+			} else {
+				ajj = -1
+			}
+			bi.Dtrmv(blas.Upper, blas.NoTrans, diag, j, a, lda, a[j:], lda) // Rows 0..j-1 of column j.
+			bi.Dscal(j, ajj, a[j:], lda)
+		}
+		return
+	}
+	for j := n - 1; j >= 0; j-- {
+		var ajj float64
+		if nonUnit {
+			ajj = 1 / a[j*lda+j]
+			a[j*lda+j] = ajj
+			ajj *= -1 // The rest of column j is scaled by the negated inverse below.
+		} else {
+			ajj = -1
+		}
+		if j < n-1 {
+			bi.Dtrmv(blas.Lower, blas.NoTrans, diag, n-j-1, a[(j+1)*lda+j+1:], lda, a[(j+1)*lda+j:], lda) // Rows j+1..n-1 of column j.
+			bi.Dscal(n-j-1, ajj, a[(j+1)*lda+j:], lda)
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtri.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtri.go
new file mode 100644
index 00000000000..6ec3663c35d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtri.go
@@ -0,0 +1,72 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dtrtri computes the inverse of a triangular matrix, storing the result in place
+// into a. This is the BLAS level 3 version of the algorithm which builds upon
+// Dtrti2 to operate on matrix blocks instead of only individual columns.
+//
+// If diag is blas.NonUnit and a has a zero diagonal element, Dtrtri returns false
+// without modifying a. Otherwise it performs the inversion and returns true.
+func (impl Implementation) Dtrtri(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) (ok bool) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case diag != blas.NonUnit && diag != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 {
+		return true
+	}
+
+	if len(a) < (n-1)*lda+n {
+		panic(shortA)
+	}
+
+	if diag == blas.NonUnit {
+		for i := 0; i < n; i++ {
+			if a[i*lda+i] == 0 {
+				return false
+			}
+		}
+	}
+
+	bi := blas64.Implementation()
+
+	nb := impl.Ilaenv(1, "DTRTRI", "UD", n, -1, -1, -1) // Block size for the blocked algorithm.
+	if nb <= 1 || nb > n {
+		impl.Dtrti2(uplo, diag, n, a, lda) // Blocking is not worthwhile: use the unblocked routine.
+		return true
+	}
+	if uplo == blas.Upper {
+		for j := 0; j < n; j += nb {
+			jb := min(nb, n-j) // Size of the current diagonal block.
+			bi.Dtrmm(blas.Left, blas.Upper, blas.NoTrans, diag, j, jb, 1, a, lda, a[j:], lda)
+			bi.Dtrsm(blas.Right, blas.Upper, blas.NoTrans, diag, j, jb, -1, a[j*lda+j:], lda, a[j:], lda)
+			impl.Dtrti2(blas.Upper, diag, jb, a[j*lda+j:], lda)
+		}
+		return true
+	}
+	nn := ((n - 1) / nb) * nb // First row of the last diagonal block.
+	for j := nn; j >= 0; j -= nb {
+		jb := min(nb, n-j) // Size of the current diagonal block.
+		if j+jb <= n-1 {
+			bi.Dtrmm(blas.Left, blas.Lower, blas.NoTrans, diag, n-j-jb, jb, 1, a[(j+jb)*lda+j+jb:], lda, a[(j+jb)*lda+j:], lda)
+			bi.Dtrsm(blas.Right, blas.Lower, blas.NoTrans, diag, n-j-jb, jb, -1, a[j*lda+j:], lda, a[(j+jb)*lda+j:], lda)
+		}
+		impl.Dtrti2(blas.Lower, diag, jb, a[j*lda+j:], lda)
+	}
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtrs.go b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtrs.go
new file mode 100644
index 00000000000..2145fbd5fd1
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/dtrtrs.go
@@ -0,0 +1,55 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// Dtrtrs solves a triangular system of the form A * X = B or Aᵀ * X = B. Dtrtrs returns false
+// without performing the solve if diag is blas.NonUnit and A has a zero diagonal element, and true otherwise.
+func (impl Implementation) Dtrtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool) {
+	switch {
+	case uplo != blas.Upper && uplo != blas.Lower:
+		panic(badUplo)
+	case trans != blas.NoTrans && trans != blas.Trans && trans != blas.ConjTrans:
+		panic(badTrans)
+	case diag != blas.NonUnit && diag != blas.Unit:
+		panic(badDiag)
+	case n < 0:
+		panic(nLT0)
+	case nrhs < 0:
+		panic(nrhsLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case ldb < max(1, nrhs):
+		panic(badLdB)
+	}
+
+	if n == 0 {
+		return true
+	}
+
+	switch {
+	case len(a) < (n-1)*lda+n:
+		panic(shortA)
+	case len(b) < (n-1)*ldb+nrhs:
+		panic(shortB)
+	}
+
+	// Check for singularity: with a non-unit diagonal, any zero diagonal element makes A singular.
+	nounit := diag == blas.NonUnit
+	if nounit {
+		for i := 0; i < n; i++ {
+			if a[i*lda+i] == 0 {
+				return false
+			}
+		}
+	}
+	bi := blas64.Implementation()
+	bi.Dtrsm(blas.Left, uplo, trans, diag, n, nrhs, 1, a, lda, b, ldb)
+	return true
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/errors.go b/vendor/gonum.org/v1/gonum/lapack/gonum/errors.go
new file mode 100644
index 00000000000..711cc2d5ad5
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/errors.go
@@ -0,0 +1,183 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Panic messages for invalid arguments. This list is duplicated in netlib/lapack/netlib. Keep in sync.
+const (
+	// Panic strings for arguments that are not a valid enumeration value.
+	badApplyOrtho       = "lapack: bad ApplyOrtho"
+	badBalanceJob       = "lapack: bad BalanceJob"
+	badDiag             = "lapack: bad Diag"
+	badDirect           = "lapack: bad Direct"
+	badEVComp           = "lapack: bad EVComp"
+	badEVHowMany        = "lapack: bad EVHowMany"
+	badEVJob            = "lapack: bad EVJob"
+	badEVSide           = "lapack: bad EVSide"
+	badGSVDJob          = "lapack: bad GSVDJob"
+	badGenOrtho         = "lapack: bad GenOrtho"
+	badLeftEVJob        = "lapack: bad LeftEVJob"
+	badMatrixType       = "lapack: bad MatrixType"
+	badMaximizeNormXJob = "lapack: bad MaximizeNormXJob"
+	badNorm             = "lapack: bad Norm"
+	badOrthoComp        = "lapack: bad OrthoComp"
+	badPivot            = "lapack: bad Pivot"
+	badRightEVJob       = "lapack: bad RightEVJob"
+	badSVDJob           = "lapack: bad SVDJob"
+	badSchurComp        = "lapack: bad SchurComp"
+	badSchurJob         = "lapack: bad SchurJob"
+	badSide             = "lapack: bad Side"
+	badSort             = "lapack: bad Sort"
+	badStoreV           = "lapack: bad StoreV"
+	badTrans            = "lapack: bad Trans"
+	badUpdateSchurComp  = "lapack: bad UpdateSchurComp"
+	badUplo             = "lapack: bad Uplo"
+	bothSVDOver         = "lapack: both jobU and jobVT are lapack.SVDOverwrite"
+
+	// Panic strings for numerical and string arguments with invalid values.
+	badIfst     = "lapack: ifst out of range"
+	badIhi      = "lapack: ihi out of range"
+	badIhiz     = "lapack: ihiz out of range"
+	badIlo      = "lapack: ilo out of range"
+	badIloz     = "lapack: iloz out of range"
+	badIlst     = "lapack: ilst out of range"
+	badIsave    = "lapack: bad isave value"
+	badIspec    = "lapack: bad ispec value"
+	badJ1       = "lapack: j1 out of range"
+	badJpvt     = "lapack: bad element of jpvt"
+	badK1       = "lapack: k1 out of range"
+	badK2       = "lapack: k2 out of range"
+	badKacc22   = "lapack: invalid value of kacc22"
+	badKbot     = "lapack: kbot out of range"
+	badKtop     = "lapack: ktop out of range"
+	badLWork    = "lapack: insufficient declared workspace length"
+	badMm       = "lapack: mm out of range"
+	badN1       = "lapack: bad value of n1"
+	badN2       = "lapack: bad value of n2"
+	badNa       = "lapack: bad value of na"
+	badName     = "lapack: bad name"
+	badNh       = "lapack: bad value of nh"
+	badNw       = "lapack: bad value of nw"
+	badPp       = "lapack: bad value of pp"
+	badShifts   = "lapack: bad shifts"
+	i0LT0       = "lapack: i0 < 0"
+	kGTM        = "lapack: k > m"
+	kGTN        = "lapack: k > n"
+	kLT0        = "lapack: k < 0"
+	kLT1        = "lapack: k < 1"
+	kdLT0       = "lapack: kd < 0"
+	klLT0       = "lapack: kl < 0"
+	kuLT0       = "lapack: ku < 0"
+	mGTN        = "lapack: m > n"
+	mLT0        = "lapack: m < 0"
+	mmLT0       = "lapack: mm < 0"
+	n0LT0       = "lapack: n0 < 0"
+	nGTM        = "lapack: n > m"
+	nLT0        = "lapack: n < 0"
+	nLT1        = "lapack: n < 1"
+	nLTM        = "lapack: n < m"
+	nanCFrom    = "lapack: cfrom is NaN"
+	nanCTo      = "lapack: cto is NaN"
+	nbGTM       = "lapack: nb > m"
+	nbGTN       = "lapack: nb > n"
+	nbLT0       = "lapack: nb < 0"
+	nccLT0      = "lapack: ncc < 0"
+	ncvtLT0     = "lapack: ncvt < 0"
+	negANorm    = "lapack: anorm < 0"
+	negZ        = "lapack: negative z value"
+	nhLT0       = "lapack: nh < 0"
+	notIsolated = "lapack: block is not isolated"
+	nrhsLT0     = "lapack: nrhs < 0"
+	nruLT0      = "lapack: nru < 0"
+	nshftsLT0   = "lapack: nshfts < 0"
+	nshftsOdd   = "lapack: nshfts must be even"
+	nvLT0       = "lapack: nv < 0"
+	offsetGTM   = "lapack: offset > m"
+	offsetLT0   = "lapack: offset < 0"
+	pLT0        = "lapack: p < 0"
+	recurLT0    = "lapack: recur < 0"
+	zeroCFrom   = "lapack: zero cfrom"
+
+	// Panic strings for slices whose length does not match what is required.
+	badLenAlpha    = "lapack: bad length of alpha"
+	badLenBeta     = "lapack: bad length of beta"
+	badLenIpiv     = "lapack: bad length of ipiv"
+	badLenJpiv     = "lapack: bad length of jpiv"
+	badLenJpvt     = "lapack: bad length of jpvt"
+	badLenK        = "lapack: bad length of k"
+	badLenPiv      = "lapack: bad length of piv"
+	badLenSelected = "lapack: bad length of selected"
+	badLenSi       = "lapack: bad length of si"
+	badLenSr       = "lapack: bad length of sr"
+	badLenTau      = "lapack: bad length of tau"
+	badLenWi       = "lapack: bad length of wi"
+	badLenWr       = "lapack: bad length of wr"
+
+	// Panic strings for slices that are too short for the given dimensions.
+	shortA     = "lapack: insufficient length of a"
+	shortAB    = "lapack: insufficient length of ab"
+	shortAuxv  = "lapack: insufficient length of auxv"
+	shortB     = "lapack: insufficient length of b"
+	shortC     = "lapack: insufficient length of c"
+	shortCNorm = "lapack: insufficient length of cnorm"
+	shortD     = "lapack: insufficient length of d"
+	shortDL    = "lapack: insufficient length of dl"
+	shortDU    = "lapack: insufficient length of du"
+	shortE     = "lapack: insufficient length of e"
+	shortF     = "lapack: insufficient length of f"
+	shortH     = "lapack: insufficient length of h"
+	shortIWork = "lapack: insufficient length of iwork"
+	shortIsgn  = "lapack: insufficient length of isgn"
+	shortQ     = "lapack: insufficient length of q"
+	shortRHS   = "lapack: insufficient length of rhs"
+	shortS     = "lapack: insufficient length of s"
+	shortScale = "lapack: insufficient length of scale"
+	shortT     = "lapack: insufficient length of t"
+	shortTau   = "lapack: insufficient length of tau"
+	shortTauP  = "lapack: insufficient length of tauP"
+	shortTauQ  = "lapack: insufficient length of tauQ"
+	shortU     = "lapack: insufficient length of u"
+	shortV     = "lapack: insufficient length of v"
+	shortVL    = "lapack: insufficient length of vl"
+	shortVR    = "lapack: insufficient length of vr"
+	shortVT    = "lapack: insufficient length of vt"
+	shortVn1   = "lapack: insufficient length of vn1"
+	shortVn2   = "lapack: insufficient length of vn2"
+	shortW     = "lapack: insufficient length of w"
+	shortWH    = "lapack: insufficient length of wh"
+	shortWV    = "lapack: insufficient length of wv"
+	shortWi    = "lapack: insufficient length of wi"
+	shortWork  = "lapack: insufficient length of work"
+	shortWr    = "lapack: insufficient length of wr"
+	shortX     = "lapack: insufficient length of x"
+	shortY     = "lapack: insufficient length of y"
+	shortZ     = "lapack: insufficient length of z"
+
+	// Panic strings for invalid leading dimensions (row strides) of matrices.
+	badLdA    = "lapack: bad leading dimension of A"
+	badLdB    = "lapack: bad leading dimension of B"
+	badLdC    = "lapack: bad leading dimension of C"
+	badLdF    = "lapack: bad leading dimension of F"
+	badLdH    = "lapack: bad leading dimension of H"
+	badLdQ    = "lapack: bad leading dimension of Q"
+	badLdT    = "lapack: bad leading dimension of T"
+	badLdU    = "lapack: bad leading dimension of U"
+	badLdV    = "lapack: bad leading dimension of V"
+	badLdVL   = "lapack: bad leading dimension of VL"
+	badLdVR   = "lapack: bad leading dimension of VR"
+	badLdVT   = "lapack: bad leading dimension of VT"
+	badLdW    = "lapack: bad leading dimension of W"
+	badLdWH   = "lapack: bad leading dimension of WH"
+	badLdWV   = "lapack: bad leading dimension of WV"
+	badLdWork = "lapack: bad leading dimension of Work"
+	badLdX    = "lapack: bad leading dimension of X"
+	badLdY    = "lapack: bad leading dimension of Y"
+	badLdZ    = "lapack: bad leading dimension of Z"
+
+	// Panic strings for invalid vector increments.
+	absIncNotOne = "lapack: increment not one or negative one"
+	badIncX      = "lapack: incX <= 0"
+	badIncY      = "lapack: incY <= 0"
+	zeroIncV     = "lapack: incv == 0"
+)
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/iladlc.go b/vendor/gonum.org/v1/gonum/lapack/gonum/iladlc.go
new file mode 100644
index 00000000000..b251d72691e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/iladlc.go
@@ -0,0 +1,45 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Iladlc scans the m×n matrix a for its last non-zero column and returns the zero-based
+// index of that column. It returns -1 if the matrix is all zeros or if m or n is zero.
+//
+// Iladlc is an internal routine. It is exported for testing purposes.
+func (Implementation) Iladlc(m, n int, a []float64, lda int) int {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 || m == 0 {
+		return -1
+	}
+
+	if len(a) < (m-1)*lda+n {
+		panic(shortA)
+	}
+
+	// Fast path: a non-zero top-right or bottom-right element means the last column is non-zero.
+	if a[n-1] != 0 || a[(m-1)*lda+(n-1)] != 0 {
+		return n - 1
+	}
+
+	// Scan each row from the right, tracking the highest non-zero column seen in any row.
+	highest := -1
+	for i := 0; i < m; i++ {
+		for j := n - 1; j >= 0; j-- {
+			if a[i*lda+j] != 0 {
+				highest = max(highest, j)
+				break
+			}
+		}
+	}
+	return highest
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/iladlr.go b/vendor/gonum.org/v1/gonum/lapack/gonum/iladlr.go
new file mode 100644
index 00000000000..b73fe18ea2a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/iladlr.go
@@ -0,0 +1,41 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Iladlr scans the m×n matrix a for its last non-zero row and returns the zero-based
+// index of that row. It returns -1 if the matrix is all zeros or if m or n is zero.
+//
+// Iladlr is an internal routine. It is exported for testing purposes.
+func (Implementation) Iladlr(m, n int, a []float64, lda int) int {
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	}
+
+	if n == 0 || m == 0 {
+		return -1
+	}
+
+	if len(a) < (m-1)*lda+n {
+		panic(shortA)
+	}
+
+	// Fast path: a non-zero bottom-left or bottom-right element means the last row is non-zero.
+	if a[(m-1)*lda] != 0 || a[(m-1)*lda+n-1] != 0 {
+		return m - 1
+	}
+	for i := m - 1; i >= 0; i-- {
+		for j := 0; j < n; j++ {
+			if a[i*lda+j] != 0 {
+				return i
+			}
+		}
+	}
+	return -1
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/ilaenv.go b/vendor/gonum.org/v1/gonum/lapack/gonum/ilaenv.go
new file mode 100644
index 00000000000..fc70806c458
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/ilaenv.go
@@ -0,0 +1,395 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Ilaenv returns algorithm tuning parameters for the routine given by name, an upper-case
+// name of at least six characters starting with S, D, C or Z. ispec specifies the parameter to return:
+//
+//	1: The optimal block size for a blocked algorithm.
+//	2: The minimum block size for a blocked algorithm.
+//	3: The block size of unprocessed data at which a blocked algorithm should
+//	   crossover to an unblocked version.
+//	4: The number of shifts.
+//	5: The minimum column dimension for blocking to be used.
+//	6: The crossover point for SVD (to use QR factorization or not).
+//	7: The number of processors.
+//	8: The crossover point for multi-shift in QR and QZ methods for non-symmetric eigenvalue problems.
+//	9: Maximum size of the subproblems in divide-and-conquer algorithms.
+//	10: ieee infinity and NaN arithmetic can be trusted not to trap.
+//	11: ieee infinity arithmetic can be trusted not to trap.
+//	12...16: parameters for Dhseqr and related functions. See Iparmq for more information.
+//
+// Ilaenv panics if ispec is not listed above or, for ispec 1 to 3, if name is not a supported routine.
+// Ilaenv is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Ilaenv(ispec int, name string, opts string, n1, n2, n3, n4 int) int {
+	// TODO(btracey): Replace this with a constant lookup? A list of constants?
+	sname := name[0] == 'S' || name[0] == 'D'
+	cname := name[0] == 'C' || name[0] == 'Z'
+	if !sname && !cname {
+		panic(badName)
+	}
+	c2 := name[1:3] // Two-character code matched by the outer switches, e.g. "GE".
+	c3 := name[3:6] // Three-character code matched by the inner switches, e.g. "TRF".
+	c4 := c3[1:3]   // Last two characters of c3.
+
+	switch ispec {
+	default:
+		panic(badIspec)
+	case 1:
+		switch c2 {
+		default:
+			panic(badName)
+		case "GE":
+			switch c3 {
+			default:
+				panic(badName)
+			case "TRF":
+				if sname {
+					return 64
+				}
+				return 64
+			case "QRF", "RQF", "LQF", "QLF":
+				if sname {
+					return 32
+				}
+				return 32
+			case "HRD":
+				if sname {
+					return 32
+				}
+				return 32
+			case "BRD":
+				if sname {
+					return 32
+				}
+				return 32
+			case "TRI":
+				if sname {
+					return 64
+				}
+				return 64
+			}
+		case "PO":
+			switch c3 {
+			default:
+				panic(badName)
+			case "TRF":
+				if sname {
+					return 64
+				}
+				return 64
+			}
+		case "SY":
+			switch c3 {
+			default:
+				panic(badName)
+			case "TRF":
+				if sname {
+					return 64
+				}
+				return 64
+			case "TRD":
+				return 32
+			case "GST":
+				return 64
+			}
+		case "HE":
+			switch c3 {
+			default:
+				panic(badName)
+			case "TRF":
+				return 64
+			case "TRD":
+				return 32
+			case "GST":
+				return 64
+			}
+		case "OR":
+			switch c3[0] {
+			default:
+				panic(badName)
+			case 'G':
+				switch c3[1:] {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 32
+				}
+			case 'M':
+				switch c3[1:] {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 32
+				}
+			}
+		case "UN":
+			switch c3[0] {
+			default:
+				panic(badName)
+			case 'G':
+				switch c3[1:] {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 32
+				}
+			case 'M':
+				switch c3[1:] {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 32
+				}
+			}
+		case "GB":
+			switch c3 {
+			default:
+				panic(badName)
+			case "TRF":
+				if sname {
+					if n4 <= 64 {
+						return 1
+					}
+					return 32
+				}
+				if n4 <= 64 {
+					return 1
+				}
+				return 32
+			}
+		case "PB":
+			switch c3 {
+			default:
+				panic(badName)
+			case "TRF":
+				if sname {
+					if n2 <= 64 {
+						return 1
+					}
+					return 32
+				}
+				if n2 <= 64 {
+					return 1
+				}
+				return 32
+			}
+		case "PT":
+			switch c3 {
+			default:
+				panic(badName)
+			case "TRS":
+				return 1
+			}
+		case "TR":
+			switch c3 {
+			default:
+				panic(badName)
+			case "TRI":
+				if sname {
+					return 64
+				}
+				return 64
+			case "EVC":
+				if sname {
+					return 64
+				}
+				return 64
+			}
+		case "LA":
+			switch c3 {
+			default:
+				panic(badName)
+			case "UUM":
+				if sname {
+					return 64
+				}
+				return 64
+			}
+		case "ST":
+			if sname && c3 == "EBZ" {
+				return 1
+			}
+			panic(badName)
+		}
+	case 2:
+		switch c2 {
+		default:
+			panic(badName)
+		case "GE":
+			switch c3 {
+			default:
+				panic(badName)
+			case "QRF", "RQF", "LQF", "QLF":
+				if sname {
+					return 2
+				}
+				return 2
+			case "HRD":
+				if sname {
+					return 2
+				}
+				return 2
+			case "BRD":
+				if sname {
+					return 2
+				}
+				return 2
+			case "TRI":
+				if sname {
+					return 2
+				}
+				return 2
+			}
+		case "SY":
+			switch c3 {
+			default:
+				panic(badName)
+			case "TRF":
+				if sname {
+					return 8
+				}
+				return 8
+			case "TRD":
+				if sname {
+					return 2
+				}
+				panic(badName)
+			}
+		case "HE":
+			if c3 == "TRD" {
+				return 2
+			}
+			panic(badName)
+		case "OR":
+			if !sname {
+				panic(badName)
+			}
+			switch c3[0] {
+			default:
+				panic(badName)
+			case 'G':
+				switch c4 {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 2
+				}
+			case 'M':
+				switch c4 {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 2
+				}
+			}
+		case "UN":
+			switch c3[0] {
+			default:
+				panic(badName)
+			case 'G':
+				switch c4 {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 2
+				}
+			case 'M':
+				switch c4 {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 2
+				}
+			}
+		}
+	case 3:
+		switch c2 {
+		default:
+			panic(badName)
+		case "GE":
+			switch c3 {
+			default:
+				panic(badName)
+			case "QRF", "RQF", "LQF", "QLF":
+				if sname {
+					return 128
+				}
+				return 128
+			case "HRD":
+				if sname {
+					return 128
+				}
+				return 128
+			case "BRD":
+				if sname {
+					return 128
+				}
+				return 128
+			}
+		case "SY":
+			if sname && c3 == "TRD" {
+				return 32
+			}
+			panic(badName)
+		case "HE":
+			if c3 == "TRD" {
+				return 32
+			}
+			panic(badName)
+		case "OR":
+			switch c3[0] {
+			default:
+				panic(badName)
+			case 'G':
+				switch c4 {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 128
+				}
+			}
+		case "UN":
+			switch c3[0] {
+			default:
+				panic(badName)
+			case 'G':
+				switch c4 {
+				default:
+					panic(badName)
+				case "QR", "RQ", "LQ", "QL", "HR", "TR", "BR":
+					return 128
+				}
+			}
+		}
+	case 4:
+		// Number of shifts; used by xHSEQR.
+		return 6
+	case 5:
+		// Minimum column dimension for blocking; not used.
+		return 2
+	case 6:
+		// SVD crossover point; used by xGELSS and xGESVD.
+		return int(float64(min(n1, n2)) * 1.6)
+	case 7:
+		// Number of processors; not used.
+		return 1
+	case 8:
+		// Multi-shift crossover point; used by xHSEQR.
+		return 50
+	case 9:
+		// Maximum subproblem size; used by xGELSD and xGESDD.
+		return 25
+	case 10:
+		// Go guarantees ieee, so infinity and NaN arithmetic is reported as trusted.
+		return 1
+	case 11:
+		// Go guarantees ieee, so infinity arithmetic is reported as trusted.
+		return 1
+	case 12, 13, 14, 15, 16:
+		// Dhseqr and related functions for eigenvalue problems.
+		return impl.Iparmq(ispec, name, opts, n1, n2, n3, n4)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/iparmq.go b/vendor/gonum.org/v1/gonum/lapack/gonum/iparmq.go
new file mode 100644
index 00000000000..65d105245ed
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/iparmq.go
@@ -0,0 +1,117 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "math"
+
+// Iparmq returns problem and machine dependent parameters useful for Dhseqr and
+// related subroutines for eigenvalue problems.
+//
+// ispec specifies the parameter to return:
+//
+//	12: Crossover point between Dlahqr and Dlaqr0. Will be at least 11.
+//	13: Deflation window size.
+//	14: Nibble crossover point. Determines when to skip a multi-shift QR sweep.
+//	15: Number of simultaneous shifts in a multishift QR iteration.
+//	16: Select structured matrix multiply.
+//
+// For other values of ispec Iparmq will panic.
+//
+// name is the name of the calling function. It must be in uppercase, which is
+// not checked, and for ispec 16 it must have length 6 or Iparmq will panic.
+//
+// ilo and ihi specify the block [ilo:ihi+1,ilo:ihi+1] that is being processed.
+//
+// n is the order of the Hessenberg matrix H and lwork is the amount of
+// workspace available; neither is currently used. opts is not used and exists
+// for future use. Apart from the panics above, inputs are not checked.
+//
+// Iparmq is an internal routine. It is exported for testing purposes.
+func (Implementation) Iparmq(ispec int, name, opts string, n, ilo, ihi, lwork int) int {
+	// ns is the number of simultaneous shifts. The thresholds are tested from
+	// the largest down so that the largest one satisfied by nh selects ns, as
+	// the sequence of independent IF statements does in the reference LAPACK.
+	nh := ihi - ilo + 1
+	ns := 2
+	switch {
+	case nh >= 6000:
+		ns = 256
+	case nh >= 3000:
+		ns = 128
+	case nh >= 590:
+		ns = 64
+	case nh >= 150:
+		// The reference rounds log2(nh) to the nearest integer.
+		ns = max(10, nh/int(math.Round(math.Log(float64(nh))/math.Ln2)))
+	case nh >= 60:
+		ns = 10
+	case nh >= 30:
+		ns = 4
+	}
+	ns = max(2, ns-(ns%2))
+
+	switch ispec {
+	default:
+		panic(badIspec)
+
+	case 12:
+		// Matrices of order smaller than nmin get sent to Dlahqr, the
+		// classic double shift algorithm. This must be at least 11.
+		const nmin = 75
+		return nmin
+
+	case 13:
+		const knwswp = 500
+		if nh <= knwswp {
+			return ns
+		}
+		return 3 * ns / 2
+
+	case 14:
+		// Skip a computationally expensive multi-shift QR sweep with
+		// Dlaqr5 whenever aggressive early deflation finds at least
+		// nibble*(window size)/100 deflations. The default, small,
+		// value reflects the expectation that the cost of looking
+		// through the deflation window with Dlaqr3 will be
+		// substantially smaller.
+		const nibble = 14
+		return nibble
+
+	case 15:
+		return ns
+
+	case 16:
+		if len(name) != 6 {
+			panic(badName)
+		}
+		const (
+			k22min = 14
+			kacmin = 14
+		)
+		var acc22 int
+		switch {
+		case name[1:] == "GGHRD" || name[1:] == "GGHD3":
+			acc22 = 1
+			if nh >= k22min {
+				acc22 = 2
+			}
+		case name[3:] == "EXC":
+			if nh >= kacmin {
+				acc22 = 1
+			}
+			if nh >= k22min {
+				acc22 = 2
+			}
+		case name[1:] == "HSEQR" || name[1:5] == "LAQR":
+			if ns >= kacmin {
+				acc22 = 1
+			}
+			if ns >= k22min {
+				acc22 = 2
+			}
+		}
+		return acc22
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/lapack/gonum/lapack.go b/vendor/gonum.org/v1/gonum/lapack/gonum/lapack.go
new file mode 100644
index 00000000000..fef4f5583de
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/gonum/lapack.go
@@ -0,0 +1,78 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import "gonum.org/v1/gonum/lapack"
+
+// Implementation is the native Go implementation of LAPACK routines. It carries
+// no state and is built on top of calls to the return of blas64.Implementation(),
+// so while this code is in pure Go, the underlying BLAS implementation may not be.
+type Implementation struct{}
+
+var _ lapack.Float64 = Implementation{} // Compile-time check that Implementation satisfies lapack.Float64.
+
+func min(a, b int) int {
+	if b > a { // a is the smaller value
+		b = a
+	}
+	return b
+}
+
+func max(a, b int) int {
+	if b < a { // a is the larger value
+		b = a
+	}
+	return b
+}
+
+func abs(a int) int {
+	if a >= 0 { // already non-negative
+		return a
+	}
+	return -a
+}
+
+const (
+	// dlamchE is the machine epsilon. For IEEE this is 2^{-53}.
+	dlamchE = 0x1p-53
+
+	// dlamchB is the radix of the machine (the base of the number system).
+	dlamchB = 2
+
+	// dlamchP is base * eps, which is 2^{-52} for the values above.
+	dlamchP = dlamchB * dlamchE
+
+	// dlamchS is the "safe minimum", that is, the lowest number such that
+	// 1/dlamchS does not overflow, or also the smallest normal number.
+	// For IEEE this is 2^{-1022}.
+	dlamchS = 0x1p-1022
+
+	// Blue's scaling constants.
+	//
+	// An n-vector x is well-scaled if
+	//  dtsml ≤ |xᵢ| ≤ dtbig for 0 ≤ i < n and n ≤ 1/dlamchP,
+	// where
+	//  dtsml = 2^ceil((expmin-1)/2) = 2^ceil((-1021-1)/2) = 2^{-511} = 1.4916681462400413e-154
+	//  dtbig = 2^floor((expmax-digits+1)/2) = 2^floor((1024-53+1)/2) = 2^{486} = 1.997919072202235e+146
+	// If any xᵢ is not well-scaled, then multiplying small values by dssml and
+	// large values by dsbig avoids underflow or overflow when computing the sum
+	// of squares \sum_0^{n-1} (xᵢ)².
+	//  dssml = 2^{-floor((expmin-digits)/2)} = 2^{-floor((-1021-53)/2)} = 2^537 = 4.4989137945431964e+161
+	//  dsbig = 2^{-ceil((expmax+digits-1)/2)} = 2^{-ceil((1024+53-1)/2)} = 2^{-538} = 1.1113793747425387e-162
+	//
+	// References:
+	//  - Anderson E. (2017)
+	//    Algorithm 978: Safe Scaling in the Level 1 BLAS
+	//    ACM Trans Math Softw 44:1--28
+	//    https://doi.org/10.1145/3061665
+	//  - Blue, James L. (1978)
+	//    A Portable Fortran Program to Find the Euclidean Norm of a Vector
+	//    ACM Trans Math Softw 4:15--23
+	//    https://doi.org/10.1145/355769.355771
+	dtsml = 0x1p-511
+	dtbig = 0x1p486
+	dssml = 0x1p537
+	dsbig = 0x1p-538
+)
diff --git a/vendor/gonum.org/v1/gonum/lapack/lapack.go b/vendor/gonum.org/v1/gonum/lapack/lapack.go
new file mode 100644
index 00000000000..60b5d0d363b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/lapack.go
@@ -0,0 +1,240 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package lapack
+
+import "gonum.org/v1/gonum/blas"
+
+// Complex128 defines the public complex128 LAPACK API supported by gonum/lapack.
+type Complex128 interface{} // It currently declares no methods.
+
+// Float64 is the set of float64 LAPACK routines that an implementation backing gonum/lapack must provide.
+type Float64 interface {
+	Dgecon(norm MatrixNorm, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64
+	Dgeev(jobvl LeftEVJob, jobvr RightEVJob, n int, a []float64, lda int, wr, wi []float64, vl []float64, ldvl int, vr []float64, ldvr int, work []float64, lwork int) (first int)
+	Dgels(trans blas.Transpose, m, n, nrhs int, a []float64, lda int, b []float64, ldb int, work []float64, lwork int) bool
+	Dgelqf(m, n int, a []float64, lda int, tau, work []float64, lwork int)
+	Dgeqp3(m, n int, a []float64, lda int, jpvt []int, tau, work []float64, lwork int)
+	Dgeqrf(m, n int, a []float64, lda int, tau, work []float64, lwork int)
+	Dgesvd(jobU, jobVT SVDJob, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, lwork int) (ok bool)
+	Dgetrf(m, n int, a []float64, lda int, ipiv []int) (ok bool)
+	Dgetri(n int, a []float64, lda int, ipiv []int, work []float64, lwork int) (ok bool)
+	Dgetrs(trans blas.Transpose, n, nrhs int, a []float64, lda int, ipiv []int, b []float64, ldb int)
+	Dggsvd3(jobU, jobV, jobQ GSVDJob, m, n, p int, a []float64, lda int, b []float64, ldb int, alpha, beta, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, work []float64, lwork int, iwork []int) (k, l int, ok bool)
+	Dlantr(norm MatrixNorm, uplo blas.Uplo, diag blas.Diag, m, n int, a []float64, lda int, work []float64) float64
+	Dlange(norm MatrixNorm, m, n int, a []float64, lda int, work []float64) float64
+	Dlansy(norm MatrixNorm, uplo blas.Uplo, n int, a []float64, lda int, work []float64) float64
+	Dlapmr(forward bool, m, n int, x []float64, ldx int, k []int)
+	Dlapmt(forward bool, m, n int, x []float64, ldx int, k []int)
+	Dorgqr(m, n, k int, a []float64, lda int, tau, work []float64, lwork int)
+	Dormqr(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int)
+	Dorglq(m, n, k int, a []float64, lda int, tau, work []float64, lwork int)
+	Dormlq(side blas.Side, trans blas.Transpose, m, n, k int, a []float64, lda int, tau, c []float64, ldc int, work []float64, lwork int)
+	Dpbcon(uplo blas.Uplo, n, kd int, ab []float64, ldab int, anorm float64, work []float64, iwork []int) float64
+	Dpbtrf(uplo blas.Uplo, n, kd int, ab []float64, ldab int) (ok bool)
+	Dpbtrs(uplo blas.Uplo, n, kd, nrhs int, ab []float64, ldab int, b []float64, ldb int)
+	Dpocon(uplo blas.Uplo, n int, a []float64, lda int, anorm float64, work []float64, iwork []int) float64
+	Dpotrf(ul blas.Uplo, n int, a []float64, lda int) (ok bool)
+	Dpotri(ul blas.Uplo, n int, a []float64, lda int) (ok bool)
+	Dpotrs(ul blas.Uplo, n, nrhs int, a []float64, lda int, b []float64, ldb int)
+	Dpstrf(uplo blas.Uplo, n int, a []float64, lda int, piv []int, tol float64, work []float64) (rank int, ok bool)
+	Dsyev(jobz EVJob, uplo blas.Uplo, n int, a []float64, lda int, w, work []float64, lwork int) (ok bool)
+	Dtbtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, kd, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool)
+	Dtrcon(norm MatrixNorm, uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int, work []float64, iwork []int) float64
+	Dtrtri(uplo blas.Uplo, diag blas.Diag, n int, a []float64, lda int) (ok bool)
+	Dtrtrs(uplo blas.Uplo, trans blas.Transpose, diag blas.Diag, n, nrhs int, a []float64, lda int, b []float64, ldb int) (ok bool)
+}
+
+// Direct specifies the order in which the Householder reflectors are multiplied.
+type Direct byte
+
+const (
+	Forward  Direct = 'F' // Reflectors are right-multiplied, H_0 * H_1 * ... * H_{k-1}.
+	Backward Direct = 'B' // Reflectors are left-multiplied, H_{k-1} * ... * H_1 * H_0.
+)
+
+// Sort is the sorting order.
+type Sort byte
+
+const (
+	SortIncreasing Sort = 'I' // Sort in increasing order.
+	SortDecreasing Sort = 'D' // Sort in decreasing order.
+)
+
+// StoreV indicates whether elementary reflectors are stored in columns or rows.
+type StoreV byte
+
+const (
+	ColumnWise StoreV = 'C' // Reflector stored in a column of the matrix.
+	RowWise    StoreV = 'R' // Reflector stored in a row of the matrix.
+)
+
+// MatrixNorm represents the kind of matrix norm to compute.
+type MatrixNorm byte
+
+const (
+	MaxAbs       MatrixNorm = 'M' // Largest absolute value of any element, max(abs(A(i,j))).
+	MaxColumnSum MatrixNorm = 'O' // Maximum absolute column sum (one norm)
+	MaxRowSum    MatrixNorm = 'I' // Maximum absolute row sum (infinity norm)
+	Frobenius    MatrixNorm = 'F' // Frobenius norm (sqrt of sum of squares)
+)
+
+// MatrixType represents the structure of the matrix represented in the data.
+type MatrixType byte
+
+const (
+	General  MatrixType = 'G' // A general dense matrix.
+	UpperTri MatrixType = 'U' // An upper triangular matrix.
+	LowerTri MatrixType = 'L' // A lower triangular matrix.
+)
+
+// Pivot specifies the pivot type for plane rotations.
+type Pivot byte
+
+const (
+	Variable Pivot = 'V' // Variable pivot.
+	Top      Pivot = 'T' // Top pivot.
+	Bottom   Pivot = 'B' // Bottom pivot.
+)
+
+// ApplyOrtho selects the orthogonal matrix, P or Q, that is applied in Dormbr.
+type ApplyOrtho byte
+
+const (
+	ApplyP ApplyOrtho = 'P' // Apply P or Pᵀ.
+	ApplyQ ApplyOrtho = 'Q' // Apply Q or Qᵀ.
+)
+
+// GenOrtho selects the orthogonal matrix, Pᵀ or Q, that is generated in Dorgbr.
+type GenOrtho byte
+
+const (
+	GeneratePT GenOrtho = 'P' // Generate Pᵀ.
+	GenerateQ  GenOrtho = 'Q' // Generate Q.
+)
+
+// SVDJob specifies the singular vector computation type for SVD routines such as Dgesvd.
+type SVDJob byte
+
+const (
+	SVDAll       SVDJob = 'A' // Compute all columns of the orthogonal matrix U or V.
+	SVDStore     SVDJob = 'S' // Compute the singular vectors and store them in the orthogonal matrix U or V.
+	SVDOverwrite SVDJob = 'O' // Compute the singular vectors and overwrite them on the input matrix A.
+	SVDNone      SVDJob = 'N' // Do not compute singular vectors.
+)
+
+// GSVDJob specifies the singular vector computation type for Generalized SVD routines such as Dggsvd3.
+type GSVDJob byte
+
+const (
+	GSVDU    GSVDJob = 'U' // Compute orthogonal matrix U.
+	GSVDV    GSVDJob = 'V' // Compute orthogonal matrix V.
+	GSVDQ    GSVDJob = 'Q' // Compute orthogonal matrix Q.
+	GSVDUnit GSVDJob = 'I' // Use unit-initialized matrix.
+	GSVDNone GSVDJob = 'N' // Do not compute orthogonal matrix.
+)
+
+// EVComp specifies whether and how eigenvectors are computed in Dsteqr.
+type EVComp byte
+
+const (
+	EVOrig     EVComp = 'V' // Compute eigenvectors of the original symmetric matrix.
+	EVTridiag  EVComp = 'I' // Compute eigenvectors of the tridiagonal matrix.
+	EVCompNone EVComp = 'N' // Do not compute eigenvectors.
+)
+
+// EVJob specifies, as the jobz argument of Dsyev, whether eigenvectors are computed.
+type EVJob byte
+
+const (
+	EVCompute EVJob = 'V' // Compute eigenvectors.
+	EVNone    EVJob = 'N' // Do not compute eigenvectors.
+)
+
+// LeftEVJob specifies, as the jobvl argument of Dgeev, whether left eigenvectors are computed.
+type LeftEVJob byte
+
+const (
+	LeftEVCompute LeftEVJob = 'V' // Compute left eigenvectors.
+	LeftEVNone    LeftEVJob = 'N' // Do not compute left eigenvectors.
+)
+
+// RightEVJob specifies, as the jobvr argument of Dgeev, whether right eigenvectors are computed.
+type RightEVJob byte
+
+const (
+	RightEVCompute RightEVJob = 'V' // Compute right eigenvectors.
+	RightEVNone    RightEVJob = 'N' // Do not compute right eigenvectors.
+)
+
+// BalanceJob specifies matrix balancing operation.
+type BalanceJob byte
+
+const (
+	Permute      BalanceJob = 'P' // Permute only.
+	Scale        BalanceJob = 'S' // Scale only.
+	PermuteScale BalanceJob = 'B' // Both permute and scale.
+	BalanceNone  BalanceJob = 'N' // Neither permute nor scale.
+)
+
+// SchurJob specifies whether the Schur form is computed in Dhseqr.
+type SchurJob byte
+
+const (
+	EigenvaluesOnly     SchurJob = 'E' // Compute eigenvalues only.
+	EigenvaluesAndSchur SchurJob = 'S' // Compute eigenvalues and the Schur form.
+)
+
+// SchurComp specifies whether and how the Schur vectors are computed in Dhseqr.
+type SchurComp byte
+
+const (
+	SchurOrig SchurComp = 'V' // Compute Schur vectors of the original matrix.
+	SchurHess SchurComp = 'I' // Compute Schur vectors of the upper Hessenberg matrix.
+	SchurNone SchurComp = 'N' // Do not compute Schur vectors.
+)
+
+// UpdateSchurComp specifies whether the matrix of Schur vectors is updated in Dtrexc.
+type UpdateSchurComp byte
+
+const (
+	UpdateSchur     UpdateSchurComp = 'V' // Update the matrix of Schur vectors.
+	UpdateSchurNone UpdateSchurComp = 'N' // Do not update the matrix of Schur vectors.
+)
+
+// EVSide specifies which eigenvectors (right, left or both) are computed in Dtrevc3.
+type EVSide byte
+
+const (
+	EVRight EVSide = 'R' // Compute only right eigenvectors.
+	EVLeft  EVSide = 'L' // Compute only left eigenvectors.
+	EVBoth  EVSide = 'B' // Compute both right and left eigenvectors.
+)
+
+// EVHowMany specifies which eigenvectors are computed in Dtrevc3 and how.
+type EVHowMany byte
+
+const (
+	EVAll      EVHowMany = 'A' // Compute all right and/or left eigenvectors.
+	EVAllMulQ  EVHowMany = 'B' // Compute all right and/or left eigenvectors multiplied by an input matrix.
+	EVSelected EVHowMany = 'S' // Compute selected right and/or left eigenvectors.
+)
+
+// MaximizeNormXJob specifies the heuristic method for computing a contribution to
+// the reciprocal Dif-estimate in Dlatdf.
+type MaximizeNormXJob byte
+
+const (
+	LocalLookAhead       MaximizeNormXJob = 0 // Solve Z*x=h-f where h is a vector of ±1.
+	NormalizedNullVector MaximizeNormXJob = 2 // Compute an approximate null-vector e of Z, normalize e and solve Z*x=±e-f.
+)
+
+// OrthoComp specifies whether and how the orthogonal matrix is computed in Dgghrd.
+type OrthoComp byte
+
+const (
+	OrthoNone     OrthoComp = 'N' // Do not compute the orthogonal matrix.
+	OrthoExplicit OrthoComp = 'I' // The orthogonal matrix is formed explicitly and returned in the argument.
+	OrthoPostmul  OrthoComp = 'V' // The orthogonal matrix is post-multiplied into the matrix stored in the argument on entry.
+)
diff --git a/vendor/gonum.org/v1/gonum/lapack/lapack64/doc.go b/vendor/gonum.org/v1/gonum/lapack/lapack64/doc.go
new file mode 100644
index 00000000000..da19e3ec781
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/lapack64/doc.go
@@ -0,0 +1,20 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package lapack64 provides a set of convenient wrapper functions for LAPACK
+// calls, as specified in the netlib standard (www.netlib.org).
+//
+// The native Go routines are used by default, and the Use function can be used
+// to set an alternative implementation.
+//
+// If the type of matrix (General, Symmetric, etc.) is known and fixed, it is
+// used in the wrapper signature. In many cases, however, the type of the matrix
+// changes during the call to the routine, for example the matrix is symmetric on
+// entry and is triangular on exit. In these cases the correct types should be checked
+// in the documentation.
+//
+// The full set of LAPACK functions is very large, and it is not clear that a
+// full implementation is desirable, let alone feasible. Please open an issue
+// if there is a specific function you need and/or are willing to implement.
+package lapack64 // import "gonum.org/v1/gonum/lapack/lapack64"
diff --git a/vendor/gonum.org/v1/gonum/lapack/lapack64/lapack64.go b/vendor/gonum.org/v1/gonum/lapack/lapack64/lapack64.go
new file mode 100644
index 00000000000..d0afab119cd
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/lapack/lapack64/lapack64.go
@@ -0,0 +1,915 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package lapack64
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/gonum"
+)
+
+var lapack64 lapack.Float64 = gonum.Implementation{} // Backend the wrappers in this file dispatch to; replaced by Use.
+
+// Use sets the LAPACK float64 implementation to be used by subsequent LAPACK calls.
+// The default implementation is gonum.Implementation.
+func Use(l lapack.Float64) {
+	lapack64 = l
+}
+
+// Tridiagonal represents a tridiagonal matrix using its three diagonals.
+type Tridiagonal struct {
+	N  int       // presumably the order of the matrix — TODO confirm
+	DL []float64 // presumably the sub-diagonal — TODO confirm
+	D  []float64 // presumably the main diagonal — TODO confirm
+	DU []float64 // presumably the super-diagonal — TODO confirm
+}
+
+func max(a, b int) int {
+	if a <= b { // b is at least as large
+		return b
+	}
+	return a
+}
+
+// Potrf computes the Cholesky factorization of a, which has the form
+//
+//	A = Uᵀ * U  if a.Uplo == blas.Upper, or
+//	A = L * Lᵀ  if a.Uplo == blas.Lower,
+//
+// where U is an upper triangular matrix and L is lower triangular. The factor
+// is returned in t, which shares its underlying data with a. The returned bool
+// reports whether a is positive definite and the factorization was completed.
+func Potrf(a blas64.Symmetric) (t blas64.Triangular, ok bool) {
+	ok = lapack64.Dpotrf(a.Uplo, a.N, a.Data, max(1, a.Stride))
+	t = blas64.Triangular{
+		Uplo:   a.Uplo,
+		Diag:   blas.NonUnit,
+		N:      a.N,
+		Data:   a.Data,
+		Stride: a.Stride,
+	}
+	return t, ok
+}
+
+// Potri computes the inverse of a real symmetric positive definite matrix A
+// using its Cholesky factorization.
+//
+// On entry, t contains the triangular factor U or L from the Cholesky
+// factorization A = Uᵀ*U or A = L*Lᵀ, as computed by Potrf. On return, that
+// factor is overwritten by the upper or lower triangle of the (symmetric)
+// inverse of A, which is also returned in a; a and t share underlying data.
+//
+// The returned bool indicates whether the inverse was computed successfully.
+func Potri(t blas64.Triangular) (a blas64.Symmetric, ok bool) {
+	ok = lapack64.Dpotri(t.Uplo, t.N, t.Data, max(1, t.Stride))
+	a = blas64.Symmetric{
+		Uplo:   t.Uplo,
+		N:      t.N,
+		Data:   t.Data,
+		Stride: t.Stride,
+	}
+	return a, ok
+}
+
+// Potrs solves a system of n linear equations A*X = B where A is an n×n symmetric
+// positive definite matrix and B is an n×nrhs matrix, using the Cholesky
+// factorization A = Uᵀ*U or A = L*Lᵀ held in t as returned by Potrf. On entry, b
+// contains the right-hand side matrix B, on return it contains the solution X.
+func Potrs(t blas64.Triangular, b blas64.General) {
+	ldt, ldb := max(1, t.Stride), max(1, b.Stride)
+	lapack64.Dpotrs(t.Uplo, t.N, b.Cols, t.Data, ldt, b.Data, ldb)
+}
+
+// Pbcon returns an estimate of the reciprocal of the condition number (in the
+// 1-norm) of an n×n symmetric positive definite band matrix using the Cholesky
+// factorization
+//
+//	A = Uᵀ*U  if a.Uplo == blas.Upper
+//	A = L*Lᵀ  if a.Uplo == blas.Lower
+//
+// computed by Pbtrf. The estimate is obtained for norm(inv(A)), and the
+// reciprocal of the condition number is computed as
+//
+//	rcond = 1 / (anorm * norm(inv(A))).
+//
+// The length of work must be at least 3*n and the length of iwork at least n.
+// As in Pbtrf and Pbtrs, a.Stride is passed to the implementation as max(1, a.Stride).
+func Pbcon(a blas64.SymmetricBand, anorm float64, work []float64, iwork []int) float64 {
+	return lapack64.Dpbcon(a.Uplo, a.N, a.K, a.Data, max(1, a.Stride), anorm, work, iwork)
+}
+
+// Pbtrf computes the Cholesky factorization of an n×n symmetric positive
+// definite band matrix
+//
+//	A = Uᵀ * U  if a.Uplo == blas.Upper
+//	A = L * Lᵀ  if a.Uplo == blas.Lower
+//
+// where U and L are upper, respectively lower, triangular band matrices. The
+// factor is returned in t, sharing its underlying data with a. The returned
+// bool indicates whether A is positive definite and the factorization finished.
+func Pbtrf(a blas64.SymmetricBand) (t blas64.TriangularBand, ok bool) {
+	ok = lapack64.Dpbtrf(a.Uplo, a.N, a.K, a.Data, max(1, a.Stride))
+	t = blas64.TriangularBand{
+		Uplo:   a.Uplo,
+		Diag:   blas.NonUnit,
+		N:      a.N,
+		K:      a.K,
+		Data:   a.Data,
+		Stride: a.Stride,
+	}
+	return t, ok
+}
+
+// Pbtrs solves a system of linear equations A*X = B with an n×n symmetric
+// positive definite band matrix A using the Cholesky factorization
+//
+//	A = Uᵀ * U  if t.Uplo == blas.Upper
+//	A = L * Lᵀ  if t.Uplo == blas.Lower
+//
+// t contains the corresponding triangular factor as returned by Pbtrf. On
+// entry, b contains the right hand side matrix B. On return, it is
+// overwritten with the solution matrix X.
+func Pbtrs(t blas64.TriangularBand, b blas64.General) {
+	ldt, ldb := max(1, t.Stride), max(1, b.Stride)
+	lapack64.Dpbtrs(t.Uplo, t.N, t.K, b.Cols, t.Data, ldt, b.Data, ldb)
+}
+
+// Pstrf computes the Cholesky factorization with complete pivoting of an n×n
+// symmetric positive semidefinite matrix A. The factorization has the form
+//
+//	Pᵀ * A * P = Uᵀ * U ,  if a.Uplo == blas.Upper,
+//	Pᵀ * A * P = L  * Lᵀ,  if a.Uplo == blas.Lower,
+//
+// where U is an upper triangular matrix, L is lower triangular, and P is a
+// permutation matrix.
+//
+// tol is a user-defined tolerance. The algorithm terminates if the pivot is
+// less than or equal to tol. If tol is negative, then n*eps*max(A[k,k]) will be
+// used instead.
+//
+// The triangular factor U or L from the Cholesky factorization is returned in t
+// and the underlying data between a and t is shared. P is stored on return in
+// vector piv such that P[piv[k],k] = 1.
+//
+// Pstrf returns the computed rank of A and whether the factorization can be
+// used to solve a system. Pstrf does not attempt to check that A is positive
+// semi-definite, so if ok is false, the matrix A is either rank deficient or is
+// not positive semidefinite.
+//
+// The length of piv must be n and the length of work must be at least 2*n,
+// otherwise Pstrf will panic.
+func Pstrf(a blas64.Symmetric, piv []int, tol float64, work []float64) (t blas64.Triangular, rank int, ok bool) {
+	rank, ok = lapack64.Dpstrf(a.Uplo, a.N, a.Data, max(1, a.Stride), piv, tol, work)
+	t = blas64.Triangular{
+		Uplo:   a.Uplo,
+		Diag:   blas.NonUnit,
+		N:      a.N,
+		Data:   a.Data,
+		Stride: a.Stride,
+	}
+	return t, rank, ok
+}
+
+// Gecon estimates the reciprocal of the condition number of the n×n matrix A
+// given the LU decomposition of the matrix. The condition number computed may
+// be based on the 1-norm or the ∞-norm.
+//
+// a contains the result of the LU decomposition of A as computed by Getrf, and
+// anorm is the corresponding 1-norm or ∞-norm of the original matrix A.
+//
+// work is a temporary data slice of length at least 4*n and iwork is a
+// temporary data slice of length at least n. Gecon will panic if either is
+// shorter.
+func Gecon(norm lapack.MatrixNorm, a blas64.General, anorm float64, work []float64, iwork []int) float64 {
+	lda := max(1, a.Stride)
+	return lapack64.Dgecon(norm, a.Cols, a.Data, lda, anorm, work, iwork)
+}
+
+// Gels finds a minimum-norm solution based on the matrices A and B using the
+// QR or LQ factorization. Gels returns false if the matrix A is singular, and
+// true if this solution was successfully found.
+//
+// The minimization problem solved depends on the input parameters.
+//
+//  1. If m >= n and trans == blas.NoTrans, Gels finds X such that || A*X - B||_2
+//     is minimized.
+//  2. If m < n and trans == blas.NoTrans, Gels finds the minimum norm solution of
+//     A * X = B.
+//  3. If m >= n and trans == blas.Trans, Gels finds the minimum norm solution of
+//     Aᵀ * X = B.
+//  4. If m < n and trans == blas.Trans, Gels finds X such that || Aᵀ*X - B||_2
+//     is minimized.
+//
+// Note that the least-squares solutions (cases 1 and 4) perform the minimization
+// per column of B. This is not the same as finding the minimum-norm matrix.
+//
+// The matrix A is a general matrix of size m×n and is modified during this call.
+// The input matrix B is of size max(m,n)×nrhs, and serves two purposes. On entry,
+// the elements of b specify the input matrix B. B has size m×nrhs if
+// trans == blas.NoTrans, and n×nrhs if trans == blas.Trans. On exit, the
+// leading submatrix of b contains the solution vectors X. If trans == blas.NoTrans,
+// this submatrix is of size n×nrhs, and of size m×nrhs otherwise.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At
+// minimum, lwork >= max(m,n) + max(m,n,nrhs), and this function will panic
+// otherwise. A longer work will enable blocked algorithms to be called. In the
+// special case that lwork == -1, work[0] will be set to the optimal working length.
+func Gels(trans blas.Transpose, a blas64.General, b blas64.General, work []float64, lwork int) bool {
+	lda, ldb := max(1, a.Stride), max(1, b.Stride)
+	return lapack64.Dgels(trans, a.Rows, a.Cols, b.Cols, a.Data, lda, b.Data, ldb, work, lwork)
+}
+
+// Geqp3 computes a QR factorization with column pivoting of the m×n matrix A:
+//
+//	A*P = Q*R
+//
+// where P is a permutation matrix, Q is an orthogonal matrix and R is a
+// min(m,n)×n upper trapezoidal matrix.
+//
+// On return, the upper triangle of A contains the matrix R. The elements below
+// the diagonal together with tau represent the matrix Q as a product of
+// elementary reflectors
+//
+//	Q = H_0 * H_1 * ... * H_{k-1}, where k = min(m,n).
+//
+// Each H_i has the form
+//
+//	H_i = I - tau * v * vᵀ
+//
+// where tau is a scalar and v is a vector with v[0:i] = 0 and v[i] = 1;
+// v[i+1:m] is stored on exit in A[i+1:m,i], and tau in tau[i].
+//
+// jpvt specifies a column pivot to be applied to A. On entry, if jpvt[j] is at
+// least zero, the jth column of A is permuted to the front of A*P (a leading
+// column), if jpvt[j] is -1 the jth column of A is a free column. If jpvt[j] <
+// -1, Geqp3 will panic. On return, jpvt holds the permutation that was applied;
+// the jth column of A*P was the jpvt[j] column of A. jpvt must have length n or
+// Geqp3 will panic.
+//
+// tau holds the scalar factors of the elementary reflectors. It must have
+// length min(m,n), otherwise Geqp3 will panic.
+//
+// work must have length at least max(1,lwork), and lwork must be at least
+// 3*n+1, otherwise Geqp3 will panic. For optimal performance lwork must be at
+// least 2*n+(n+1)*nb, where nb is the optimal blocksize. On return, work[0]
+// will contain the optimal value of lwork. If lwork == -1, instead of
+// performing Geqp3, only the optimal value of lwork will be stored in work[0].
+func Geqp3(a blas64.General, jpvt []int, tau, work []float64, lwork int) {
+	m, n := a.Rows, a.Cols
+	lda := max(1, a.Stride)
+	lapack64.Dgeqp3(m, n, a.Data, lda, jpvt, tau, work, lwork)
+}
+
+// Geqrf computes the QR factorization of the m×n matrix A using a blocked
+// algorithm. A is modified to contain the information to construct Q and R.
+// The upper triangle of a contains the matrix R. The lower triangular elements
+// (not including the diagonal) contain the elementary reflectors. tau is modified
+// to contain the reflector scales. tau must have length min(m,n), and
+// this function will panic otherwise.
+//
+// The ith elementary reflector can be explicitly constructed by first forming
+//
+//	v[j] = 0           j < i
+//	v[j] = 1           j == i
+//	v[j] = a[j*lda+i]  j > i
+//
+// and computing H_i = I - tau[i] * v * vᵀ.
+//
+// The orthonormal matrix Q can be constructed from a product of these elementary
+// reflectors, Q = H_0 * H_1 * ... * H_{k-1}, where k = min(m,n).
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= m and this function will panic otherwise.
+// Geqrf is a blocked QR factorization, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Geqrf,
+// the optimal work length will be stored into work[0].
+func Geqrf(a blas64.General, tau, work []float64, lwork int) {
+	lda := max(1, a.Stride)
+	lapack64.Dgeqrf(a.Rows, a.Cols, a.Data, lda, tau, work, lwork)
+}
+
+// Gelqf computes the LQ factorization of the m×n matrix A using a blocked
+// algorithm. A is modified to contain the information to construct L and Q: the
+// lower triangle of a contains the matrix L, and the elements above the diagonal
+// together with the reflector scales written to tau represent the matrix Q. tau
+// must have length at least min(m,n), and this function will panic otherwise.
+//
+// See Geqrf for a description of the elementary reflectors and orthonormal
+// matrix Q. Q is constructed as a product of these elementary reflectors,
+// Q = H_{k-1} * ... * H_1 * H_0.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= m and this function will panic otherwise.
+// Gelqf is a blocked LQ factorization, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Gelqf,
+// the optimal work length will be stored into work[0].
+func Gelqf(a blas64.General, tau, work []float64, lwork int) {
+	lda := max(1, a.Stride)
+	lapack64.Dgelqf(a.Rows, a.Cols, a.Data, lda, tau, work, lwork)
+}
+
+// Gesvd computes the singular value decomposition of the input matrix A
+//
+//	A = U * Sigma * Vᵀ
+//
+// where Sigma is an m×n diagonal matrix containing the singular values of A,
+// U is an m×m orthogonal matrix and V is an n×n orthogonal matrix. The first
+// min(m,n) columns of U and V are the left and right singular vectors of A
+// respectively.
+//
+// jobU and jobVT are options for computing the singular vectors. The behavior
+// is as follows
+//
+//	jobU == lapack.SVDAll       All m columns of U are returned in u
+//	jobU == lapack.SVDStore     The first min(m,n) columns are returned in u
+//	jobU == lapack.SVDOverwrite The first min(m,n) columns of U are written into a
+//	jobU == lapack.SVDNone      The columns of U are not computed.
+//
+// The behavior is the same for jobVT and the rows of Vᵀ. At most one of jobU
+// and jobVT can equal lapack.SVDOverwrite, and Gesvd will panic otherwise.
+//
+// On entry, a contains the data for the m×n matrix A. During the call to Gesvd
+// the data is overwritten. On exit, A contains the appropriate singular vectors
+// if either job is lapack.SVDOverwrite.
+//
+// s is a slice of length at least min(m,n) and on exit contains the singular
+// values in decreasing order.
+//
+// u contains the left singular vectors on exit, stored columnwise. If
+// jobU == lapack.SVDAll, u is of size m×m. If jobU == lapack.SVDStore u is
+// of size m×min(m,n). If jobU == lapack.SVDOverwrite or lapack.SVDNone, u is
+// not used.
+//
+// vt contains the right singular vectors on exit, stored rowwise. If
+// jobVT == lapack.SVDAll, vt is of size n×n. If jobVT == lapack.SVDStore vt is
+// of size min(m,n)×n. If jobVT == lapack.SVDOverwrite or lapack.SVDNone, vt is
+// not used.
+//
+// work is a slice for storing temporary memory, and lwork is the usable size of
+// the slice. lwork must be at least max(5*min(m,n), 3*min(m,n)+max(m,n)).
+// If lwork == -1, instead of performing Gesvd, the optimal work length will be
+// stored into work[0]. Gesvd will panic if the working memory has insufficient
+// storage.
+//
+// Gesvd returns whether the decomposition successfully completed.
+func Gesvd(jobU, jobVT lapack.SVDJob, a, u, vt blas64.General, s, work []float64, lwork int) (ok bool) {
+	lda, ldu := max(1, a.Stride), max(1, u.Stride)
+	ldvt := max(1, vt.Stride)
+	return lapack64.Dgesvd(jobU, jobVT, a.Rows, a.Cols, a.Data, lda, s, u.Data, ldu, vt.Data, ldvt, work, lwork)
+}
+
+// Getrf computes the LU decomposition of an m×n matrix A using partial
+// pivoting with row interchanges.
+//
+// The LU decomposition is a factorization of A into
+//
+//	A = P * L * U
+//
+// where P is a permutation matrix, L is lower triangular with unit diagonal
+// elements (lower trapezoidal if m > n), and U is upper triangular (upper
+// trapezoidal if m < n).
+//
+// On entry, a contains the matrix A. On return, L and U are stored in place
+// into a, and P is represented by ipiv.
+//
+// ipiv contains a sequence of row swaps. It indicates that row i of the matrix
+// was interchanged with ipiv[i]. ipiv must have length min(m,n), and Getrf will
+// panic otherwise. ipiv is zero-indexed.
+//
+// Getrf returns whether the matrix A is nonsingular. The LU decomposition will
+// be computed regardless of the singularity of A, but when A is singular the
+// result should not be used to solve a system of equations.
+func Getrf(a blas64.General, ipiv []int) bool {
+	return lapack64.Dgetrf(a.Rows, a.Cols, a.Data, max(1, a.Stride), ipiv)
+}
+
+// Getri computes the inverse of the matrix A using the LU factorization computed
+// by Getrf. On entry, a contains the PLU decomposition of A as computed by
+// Getrf and on exit contains the inverse of the original matrix.
+//
+// Getri will not perform the inversion if the matrix is singular, and returns
+// a boolean indicating whether the inversion was successful.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= n and this function will panic otherwise.
+// Getri is a blocked inversion, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Getri,
+// the optimal work length will be stored into work[0].
+func Getri(a blas64.General, ipiv []int, work []float64, lwork int) (ok bool) {
+	return lapack64.Dgetri(a.Cols, a.Data, max(1, a.Stride), ipiv, work, lwork) // the order n is taken from a.Cols
+}
+
+// Getrs solves a system of equations using an LU factorization.
+// The system of equations solved is
+//
+//	A * X = B   if trans == blas.NoTrans
+//	Aᵀ * X = B  if trans == blas.Trans
+//
+// A is a general n×n matrix. B is a general matrix of size n×nrhs.
+//
+// On entry b contains the elements of the matrix B. On exit, b contains the
+// elements of X, the solution to the system of equations.
+//
+// a and ipiv contain the LU factorization of A and the permutation indices as
+// computed by Getrf. ipiv is zero-indexed.
+func Getrs(trans blas.Transpose, a blas64.General, b blas64.General, ipiv []int) {
+	lapack64.Dgetrs(trans, a.Cols, b.Cols, a.Data, max(1, a.Stride), ipiv, b.Data, max(1, b.Stride))
+}
+
+// Ggsvd3 computes the generalized singular value decomposition (GSVD)
+// of an m×n matrix A and p×n matrix B:
+//
+//	Uᵀ*A*Q = D1*[ 0 R ]
+//
+//	Vᵀ*B*Q = D2*[ 0 R ]
+//
+// where U, V and Q are orthogonal matrices.
+//
+// Ggsvd3 returns k and l, the dimensions of the sub-blocks. k+l
+// is the effective numerical rank of the (m+p)×n matrix [ Aᵀ Bᵀ ]ᵀ.
+// R is a (k+l)×(k+l) nonsingular upper triangular matrix, D1 and
+// D2 are m×(k+l) and p×(k+l) diagonal matrices and of the following
+// structures, respectively:
+//
+// If m-k-l >= 0,
+//
+//	                  k  l
+//	     D1 =     k [ I  0 ]
+//	              l [ 0  C ]
+//	          m-k-l [ 0  0 ]
+//
+//	                k  l
+//	     D2 = l   [ 0  S ]
+//	          p-l [ 0  0 ]
+//
+//	             n-k-l  k    l
+//	[ 0 R ] = k [  0   R11  R12 ] k
+//	          l [  0    0   R22 ] l
+//
+// where
+//
+//	C = diag( alpha_k, ... , alpha_{k+l} ),
+//	S = diag( beta_k,  ... , beta_{k+l} ),
+//	C^2 + S^2 = I.
+//
+// R is stored in
+//
+//	A[0:k+l, n-k-l:n]
+//
+// on exit.
+//
+// If m-k-l < 0,
+//
+//	               k m-k k+l-m
+//	    D1 =   k [ I  0    0  ]
+//	         m-k [ 0  C    0  ]
+//
+//	                 k m-k k+l-m
+//	    D2 =   m-k [ 0  S    0  ]
+//	         k+l-m [ 0  0    I  ]
+//	           p-l [ 0  0    0  ]
+//
+//	               n-k-l  k   m-k  k+l-m
+//	[ 0 R ] =    k [ 0    R11  R12  R13 ]
+//	           m-k [ 0     0   R22  R23 ]
+//	         k+l-m [ 0     0    0   R33 ]
+//
+// where
+//
+//	C = diag( alpha_k, ... , alpha_m ),
+//	S = diag( beta_k,  ... , beta_m ),
+//	C^2 + S^2 = I.
+//
+//	R = [ R11 R12 R13 ] is stored in A[0:m, n-k-l:n]
+//	    [  0  R22 R23 ]
+//
+// and R33 is stored in
+//
+//	B[m-k:l, n+m-k-l:n] on exit.
+//
+// Ggsvd3 computes C, S, R, and optionally the orthogonal transformation
+// matrices U, V and Q.
+//
+// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior
+// is as follows
+//
+//	jobU == lapack.GSVDU        Compute orthogonal matrix U
+//	jobU == lapack.GSVDNone     Do not compute orthogonal matrix.
+//
+// The behavior is the same for jobV and jobQ with the exception that instead of
+// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively.
+// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the
+// relevant job parameter is lapack.GSVDNone.
+//
+// alpha and beta must have length n or Ggsvd3 will panic. On exit, alpha and
+// beta contain the generalized singular value pairs of A and B
+//
+//	alpha[0:k] = 1,
+//	beta[0:k]  = 0,
+//
+// if m-k-l >= 0,
+//
+//	alpha[k:k+l] = diag(C),
+//	beta[k:k+l]  = diag(S),
+//
+// if m-k-l < 0,
+//
+//	alpha[k:m]= C, alpha[m:k+l]= 0
+//	beta[k:m] = S, beta[m:k+l] = 1.
+//
+// if k+l < n,
+//
+//	alpha[k+l:n] = 0 and
+//	beta[k+l:n]  = 0.
+//
+// On exit, iwork contains the permutation required to sort alpha descending.
+//
+// iwork must have length n, work must have length at least max(1, lwork), and
+// lwork must be -1 or greater than n, otherwise Ggsvd3 will panic. If
+// lwork is -1, work[0] holds the optimal lwork on return, but Ggsvd3 does
+// not perform the GSVD.
+func Ggsvd3(jobU, jobV, jobQ lapack.GSVDJob, a, b blas64.General, alpha, beta []float64, u, v, q blas64.General, work []float64, lwork int, iwork []int) (k, l int, ok bool) {
+	return lapack64.Dggsvd3(jobU, jobV, jobQ, a.Rows, a.Cols, b.Rows, a.Data, max(1, a.Stride), b.Data, max(1, b.Stride), alpha, beta, u.Data, max(1, u.Stride), v.Data, max(1, v.Stride), q.Data, max(1, q.Stride), work, lwork, iwork)
+}
+
+// Gtsv solves one of the equations
+//
+//	A * X = B   if trans == blas.NoTrans
+//	Aᵀ * X = B  if trans == blas.Trans or blas.ConjTrans
+//
+// where A is an n×n tridiagonal matrix. It uses Gaussian elimination with
+// partial pivoting.
+//
+// On entry, a contains the matrix A; on return it will be overwritten.
+//
+// On entry, b contains the n×nrhs right-hand side matrix B. On return, it will
+// be overwritten. If ok is true, it will be overwritten by the solution matrix X.
+//
+// Gtsv returns whether the solution X has been successfully computed.
+//
+// Dgtsv is not part of the lapack.Float64 interface and so calls to Gtsv are
+// always executed by the Gonum implementation.
+func Gtsv(trans blas.Transpose, a Tridiagonal, b blas64.General) (ok bool) {
+	if trans != blas.NoTrans { // the transposed system is handled by exchanging DL and DU
+		a.DL, a.DU = a.DU, a.DL // a is a by-value copy, so the caller's struct fields are not swapped
+	}
+	return gonum.Implementation{}.Dgtsv(a.N, b.Cols, a.DL, a.D, a.DU, b.Data, max(1, b.Stride))
+}
+
+// Lagtm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C   if trans == blas.NoTrans
+//	C = alpha * Aᵀ * B + beta * C  if trans == blas.Trans or blas.ConjTrans
+//
+// where A is an m×m tridiagonal matrix represented by its diagonals dl, d, du,
+// B and C are m×n dense matrices, and alpha and beta are scalars.
+//
+// Dlagtm is not part of the lapack.Float64 interface and so calls to Lagtm are
+// always executed by the Gonum implementation.
+func Lagtm(trans blas.Transpose, alpha float64, a Tridiagonal, b blas64.General, beta float64, c blas64.General) {
+	gonum.Implementation{}.Dlagtm(trans, c.Rows, c.Cols, alpha, a.DL, a.D, a.DU, b.Data, max(1, b.Stride), beta, c.Data, max(1, c.Stride)) // the dimensions passed are those of c
+}
+
+// Lange computes the matrix norm of the general m×n matrix A. The input norm
+// specifies the norm computed:
+//
+//	lapack.MaxAbs: the maximum absolute value of an element.
+//	lapack.MaxColumnSum: the maximum column sum of the absolute values of the entries.
+//	lapack.MaxRowSum: the maximum row sum of the absolute values of the entries.
+//	lapack.Frobenius: the square root of the sum of the squares of the entries.
+//
+// If norm == lapack.MaxColumnSum, work must be of length n, and this function will panic otherwise.
+// There are no restrictions on work for the other matrix norms.
+func Lange(norm lapack.MatrixNorm, a blas64.General, work []float64) float64 {
+	return lapack64.Dlange(norm, a.Rows, a.Cols, a.Data, max(1, a.Stride), work)
+}
+
+// Langb returns the given norm of a general m×n band matrix with kl sub-diagonals and
+// ku super-diagonals.
+//
+// Dlangb is not part of the lapack.Float64 interface and so calls to Langb are always
+// executed by the Gonum implementation.
+func Langb(norm lapack.MatrixNorm, a blas64.Band) float64 {
+	return gonum.Implementation{}.Dlangb(norm, a.Rows, a.Cols, a.KL, a.KU, a.Data, max(1, a.Stride)) // the stride passed on is clamped to at least 1
+}
+
+// Langt computes the specified norm of an n×n tridiagonal matrix.
+//
+// Dlangt is not part of the lapack.Float64 interface and so calls to Langt are
+// always executed by the Gonum implementation.
+func Langt(norm lapack.MatrixNorm, a Tridiagonal) float64 {
+	return gonum.Implementation{}.Dlangt(norm, a.N, a.DL, a.D, a.DU) // the three diagonals are passed as separate slices
+}
+
+// Lansb computes the specified norm of an n×n symmetric band matrix. If
+// norm == lapack.MaxColumnSum or norm == lapack.MaxRowSum, work must have length
+// at least n, and this function will panic otherwise.
+// There are no restrictions on work for the other matrix norms.
+//
+// Dlansb is not part of the lapack.Float64 interface and so calls to Lansb are always
+// executed by the Gonum implementation.
+func Lansb(norm lapack.MatrixNorm, a blas64.SymmetricBand, work []float64) float64 {
+	return gonum.Implementation{}.Dlansb(norm, a.Uplo, a.N, a.K, a.Data, max(1, a.Stride), work)
+}
+
+// Lansy computes the specified norm of an n×n symmetric matrix. If
+// norm == lapack.MaxColumnSum or norm == lapack.MaxRowSum, work must have length
+// at least n, and this function will panic otherwise.
+// There are no restrictions on work for the other matrix norms.
+func Lansy(norm lapack.MatrixNorm, a blas64.Symmetric, work []float64) float64 {
+	return lapack64.Dlansy(norm, a.Uplo, a.N, a.Data, max(1, a.Stride), work)
+}
+
+// Lantr computes the specified norm of an n×n triangular matrix A. If
+// norm == lapack.MaxColumnSum, work must have length at least n and this function
+// will panic otherwise. There are no restrictions on work for the other matrix norms.
+func Lantr(norm lapack.MatrixNorm, a blas64.Triangular, work []float64) float64 {
+	return lapack64.Dlantr(norm, a.Uplo, a.Diag, a.N, a.N, a.Data, max(1, a.Stride), work) // a.N is passed for both the row and column count
+}
+
+// Lantb computes the specified norm of an n×n triangular band matrix A. If
+// norm == lapack.MaxColumnSum, work must have length at least n and this function
+// will panic otherwise; other norms place no restriction on work. Lantb always
+// calls Dlantb on the Gonum implementation directly.
+func Lantb(norm lapack.MatrixNorm, a blas64.TriangularBand, work []float64) float64 {
+	return gonum.Implementation{}.Dlantb(norm, a.Uplo, a.Diag, a.N, a.K, a.Data, max(1, a.Stride), work)
+}
+
+// Lapmr rearranges the rows of the m×n matrix X as specified by the permutation
+// k[0],k[1],...,k[m-1] of the integers 0,...,m-1.
+//
+// If forward is true, a forward permutation is applied:
+//
+//	X[k[i],0:n] is moved to X[i,0:n] for i=0,1,...,m-1.
+//
+// If forward is false, a backward permutation is applied:
+//
+//	X[i,0:n] is moved to X[k[i],0:n] for i=0,1,...,m-1.
+//
+// k must have length m, otherwise Lapmr will panic. k is zero-indexed.
+func Lapmr(forward bool, x blas64.General, k []int) {
+	lapack64.Dlapmr(forward, x.Rows, x.Cols, x.Data, max(1, x.Stride), k) // x.Data is passed directly, so x is permuted in place
+}
+
+// Lapmt rearranges the columns of the m×n matrix X as specified by the
+// permutation k[0],k[1],...,k[n-1] of the integers 0,...,n-1.
+//
+// If forward is true, a forward permutation is applied:
+//
+//	X[0:m,k[j]] is moved to X[0:m,j] for j=0,1,...,n-1.
+//
+// If forward is false, a backward permutation is applied:
+//
+//	X[0:m,j] is moved to X[0:m,k[j]] for j=0,1,...,n-1.
+//
+// k must have length n, otherwise Lapmt will panic. k is zero-indexed.
+func Lapmt(forward bool, x blas64.General, k []int) {
+	lapack64.Dlapmt(forward, x.Rows, x.Cols, x.Data, max(1, x.Stride), k) // x.Data is passed directly, so x is permuted in place
+}
+
+// Orglq generates an m×n matrix Q with orthonormal rows defined as the first m
+// rows of a product of k elementary reflectors of order n
+//
+//	Q = H_{k-1} * ... * H_0
+//
+// as returned by Dgelqf.
+//
+// k is determined by the length of tau.
+//
+// On entry, tau and the first k rows of A must contain the scalar factors and
+// the vectors, respectively, which define the elementary reflectors H_i,
+// i=0,...,k-1, as returned by Dgelqf. On return, A contains the matrix Q.
+//
+// work must have length at least lwork and lwork must be at least max(1,m). On
+// return, optimal value of lwork will be stored in work[0]. It must also hold
+// that 0 <= k <= m <= n, otherwise Orglq will panic.
+//
+// If lwork == -1, instead of performing Orglq, the function only calculates the
+// optimal value of lwork and stores it into work[0].
+func Orglq(a blas64.General, tau, work []float64, lwork int) {
+	lapack64.Dorglq(a.Rows, a.Cols, len(tau), a.Data, max(1, a.Stride), tau, work, lwork) // stride clamped to at least 1, as in the other wrappers
+}
+
+// Ormlq multiplies the matrix C by the orthogonal matrix Q defined by
+// A and tau. A and tau are as returned from Gelqf.
+//
+//	C = Q * C   if side == blas.Left and trans == blas.NoTrans
+//	C = Qᵀ * C  if side == blas.Left and trans == blas.Trans
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans
+//
+// If side == blas.Left, A is a matrix of size k×m, and if side == blas.Right
+// A is of size k×n. This uses a blocked algorithm.
+//
+// work is temporary storage, and lwork specifies the usable memory length.
+// At minimum, lwork >= n if side == blas.Left and lwork >= m if side == blas.Right,
+// and this function will panic otherwise.
+// Ormlq uses a block algorithm, but the block size is limited
+// by the temporary space available. If lwork == -1, instead of performing Ormlq,
+// the optimal work length will be stored into work[0].
+//
+// tau contains the Householder scales and must have length at least k, and
+// this function will panic otherwise.
+func Ormlq(side blas.Side, trans blas.Transpose, a blas64.General, tau []float64, c blas64.General, work []float64, lwork int) {
+	lapack64.Dormlq(side, trans, c.Rows, c.Cols, a.Rows, a.Data, max(1, a.Stride), tau, c.Data, max(1, c.Stride), work, lwork) // k is taken from a.Rows
+}
+
+// Orgqr generates an m×n matrix Q with orthonormal columns defined by the
+// product of elementary reflectors
+//
+//	Q = H_0 * H_1 * ... * H_{k-1}
+//
+// as computed by Geqrf.
+//
+// k is determined by the length of tau.
+//
+// The length of work must be at least n and it also must be that 0 <= k <= n
+// and 0 <= n <= m.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At
+// minimum, lwork >= n, and the amount of blocking is limited by the usable
+// length. If lwork == -1, instead of computing Orgqr the optimal work length
+// is stored into work[0].
+//
+// Orgqr will panic if the conditions on input values are not met.
+func Orgqr(a blas64.General, tau []float64, work []float64, lwork int) {
+	lapack64.Dorgqr(a.Rows, a.Cols, len(tau), a.Data, max(1, a.Stride), tau, work, lwork) // stride clamped to at least 1, as in the other wrappers
+}
+
+// Ormqr multiplies an m×n matrix C by an orthogonal matrix Q as
+//
+//	C = Q * C   if side == blas.Left  and trans == blas.NoTrans,
+//	C = Qᵀ * C  if side == blas.Left  and trans == blas.Trans,
+//	C = C * Q   if side == blas.Right and trans == blas.NoTrans,
+//	C = C * Qᵀ  if side == blas.Right and trans == blas.Trans,
+//
+// where Q is defined as the product of k elementary reflectors
+//
+//	Q = H_0 * H_1 * ... * H_{k-1}.
+//
+// k is determined by the length of tau.
+//
+// If side == blas.Left, A is an m×k matrix and 0 <= k <= m.
+// If side == blas.Right, A is an n×k matrix and 0 <= k <= n.
+// The ith column of A contains the vector which defines the elementary
+// reflector H_i and tau[i] contains its scalar factor. Geqrf returns A and tau
+// in the required form.
+//
+// work must have length at least max(1,lwork), and lwork must be at least n if
+// side == blas.Left and at least m if side == blas.Right, otherwise Ormqr will
+// panic.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At
+// minimum, lwork >= n if side == blas.Left and lwork >= m if side ==
+// blas.Right, and this function will panic otherwise. Larger values of lwork
+// will generally give better performance. On return, work[0] will contain the
+// optimal value of lwork.
+//
+// If lwork is -1, instead of performing Ormqr, the optimal workspace size will
+// be stored into work[0].
+func Ormqr(side blas.Side, trans blas.Transpose, a blas64.General, tau []float64, c blas64.General, work []float64, lwork int) {
+	lapack64.Dormqr(side, trans, c.Rows, c.Cols, len(tau), a.Data, max(1, a.Stride), tau, c.Data, max(1, c.Stride), work, lwork)
+}
+
+// Pocon estimates the reciprocal of the condition number of a positive-definite
+// matrix A given the Cholesky decomposition of A. The condition number computed
+// is based on the 1-norm and the ∞-norm.
+//
+// anorm is the 1-norm (which equals the ∞-norm for a symmetric matrix) of the original matrix A.
+//
+// work is a temporary data slice of length at least 3*n and Pocon will panic otherwise.
+//
+// iwork is a temporary data slice of length at least n and Pocon will panic otherwise.
+func Pocon(a blas64.Symmetric, anorm float64, work []float64, iwork []int) float64 {
+	return lapack64.Dpocon(a.Uplo, a.N, a.Data, max(1, a.Stride), anorm, work, iwork)
+}
+
+// Syev computes all eigenvalues and, optionally, the eigenvectors of a real
+// symmetric matrix A.
+//
+// w contains the eigenvalues in ascending order upon return. w must have length
+// at least n, and Syev will panic otherwise.
+//
+// On entry, a contains the elements of the symmetric matrix A in the triangular
+// portion specified by uplo. If jobz == lapack.EVCompute, a contains the
+// orthonormal eigenvectors of A on exit, otherwise jobz must be lapack.EVNone
+// and on exit the specified triangular region is overwritten.
+//
+// work is temporary storage, and lwork specifies the usable memory length. At minimum,
+// lwork >= 3*n-1, and Syev will panic otherwise. The amount of blocking is
+// limited by the usable length. If lwork == -1, instead of computing Syev the
+// optimal work length is stored into work[0].
+func Syev(jobz lapack.EVJob, a blas64.Symmetric, w, work []float64, lwork int) (ok bool) {
+	return lapack64.Dsyev(jobz, a.Uplo, a.N, a.Data, max(1, a.Stride), w, work, lwork) // uplo is taken from a.Uplo
+}
+
+// Tbtrs solves a triangular system of the form
+//
+//	A * X = B   if trans == blas.NoTrans
+//	Aᵀ * X = B  if trans == blas.Trans or blas.ConjTrans
+//
+// where A is an n×n triangular band matrix, and B is an n×nrhs matrix.
+//
+// Tbtrs returns whether A is non-singular. If A is singular, no solutions X
+// are computed.
+func Tbtrs(trans blas.Transpose, a blas64.TriangularBand, b blas64.General) (ok bool) {
+	return lapack64.Dtbtrs(a.Uplo, trans, a.Diag, a.N, a.K, b.Cols, a.Data, max(1, a.Stride), b.Data, max(1, b.Stride)) // sizes come from a.N, a.K and b.Cols
+}
+
+// Trcon estimates the reciprocal of the condition number of a triangular matrix A.
+// The condition number computed may be based on the 1-norm or the ∞-norm, as selected by norm.
+//
+// work is a temporary data slice of length at least 3*n and Trcon will panic otherwise.
+//
+// iwork is a temporary data slice of length at least n and Trcon will panic otherwise.
+func Trcon(norm lapack.MatrixNorm, a blas64.Triangular, work []float64, iwork []int) float64 {
+	return lapack64.Dtrcon(norm, a.Uplo, a.Diag, a.N, a.Data, max(1, a.Stride), work, iwork)
+}
+
+// Trtri computes the inverse of a triangular matrix, storing the result in place
+// in a.
+//
+// Trtri will not perform the inversion if the matrix is singular, and returns
+// a boolean indicating whether the inversion was successful.
+func Trtri(a blas64.Triangular) (ok bool) {
+	return lapack64.Dtrtri(a.Uplo, a.Diag, a.N, a.Data, max(1, a.Stride))
+}
+
+// Trtrs solves a triangular system of the form A * X = B or Aᵀ * X = B, as selected by trans. Trtrs
+// returns whether the solve completed successfully. If A is singular, no solve is performed.
+func Trtrs(trans blas.Transpose, a blas64.Triangular, b blas64.General) (ok bool) {
+	return lapack64.Dtrtrs(a.Uplo, trans, a.Diag, a.N, b.Cols, a.Data, max(1, a.Stride), b.Data, max(1, b.Stride)) // sizes come from a.N and b.Cols
+}
+
+// Geev computes the eigenvalues and, optionally, the left and/or right
+// eigenvectors for an n×n real nonsymmetric matrix A.
+//
+// The right eigenvector v_j of A corresponding to an eigenvalue λ_j
+// is defined by
+//
+//	A v_j = λ_j v_j,
+//
+// and the left eigenvector u_j corresponding to an eigenvalue λ_j is defined by
+//
+//	u_jᴴ A = λ_j u_jᴴ,
+//
+// where u_jᴴ is the conjugate transpose of u_j.
+//
+// On return, A will be overwritten and the left and right eigenvectors will be
+// stored, respectively, in the columns of the n×n matrices VL and VR in the
+// same order as their eigenvalues. If the j-th eigenvalue is real, then
+//
+//	u_j = VL[:,j],
+//	v_j = VR[:,j],
+//
+// and if it is not real, then j and j+1 form a complex conjugate pair and the
+// eigenvectors can be recovered as
+//
+//	u_j     = VL[:,j] + i*VL[:,j+1],
+//	u_{j+1} = VL[:,j] - i*VL[:,j+1],
+//	v_j     = VR[:,j] + i*VR[:,j+1],
+//	v_{j+1} = VR[:,j] - i*VR[:,j+1],
+//
+// where i is the imaginary unit. The computed eigenvectors are normalized to
+// have Euclidean norm equal to 1 and largest component real.
+//
+// Left eigenvectors will be computed only if jobvl == lapack.LeftEVCompute,
+// otherwise jobvl must be lapack.LeftEVNone.
+// Right eigenvectors will be computed only if jobvr == lapack.RightEVCompute,
+// otherwise jobvr must be lapack.RightEVNone.
+// For other values of jobvl and jobvr Geev will panic.
+//
+// On return, wr and wi will contain the real and imaginary parts, respectively,
+// of the computed eigenvalues. Complex conjugate pairs of eigenvalues appear
+// consecutively with the eigenvalue having the positive imaginary part first.
+// wr and wi must have length n, and Geev will panic otherwise.
+//
+// work must have length at least lwork and lwork must be at least max(1,4*n) if
+// the left or right eigenvectors are computed, and at least max(1,3*n) if no
+// eigenvectors are computed. For good performance, lwork must generally be
+// larger. On return, optimal value of lwork will be stored in work[0].
+//
+// If lwork == -1, instead of performing Geev, the function only calculates the
+// optimal value of lwork and stores it into work[0].
+//
+// On return, first will be the index of the first valid eigenvalue.
+// If first == 0, all eigenvalues and eigenvectors have been computed.
+// If first is positive, Geev failed to compute all the eigenvalues, no
+// eigenvectors have been computed and wr[first:] and wi[first:] contain those
+// eigenvalues which have converged.
+func Geev(jobvl lapack.LeftEVJob, jobvr lapack.RightEVJob, a blas64.General, wr, wi []float64, vl, vr blas64.General, work []float64, lwork int) (first int) {
+	n := a.Rows
+	// Shapes are validated here because Dgeev is handed only n, slices and strides.
+	switch {
+	case a.Cols != n:
+		panic("lapack64: matrix not square")
+	case jobvl == lapack.LeftEVCompute && (vl.Rows != n || vl.Cols != n):
+		panic("lapack64: bad size of VL")
+	case jobvr == lapack.RightEVCompute && (vr.Rows != n || vr.Cols != n):
+		panic("lapack64: bad size of VR")
+	}
+	return lapack64.Dgeev(jobvl, jobvr, n, a.Data, max(1, a.Stride), wr, wi, vl.Data, max(1, vl.Stride), vr.Data, max(1, vr.Stride), work, lwork)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/README.md b/vendor/gonum.org/v1/gonum/mat/README.md
new file mode 100644
index 00000000000..5e7be6b2340
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/README.md
@@ -0,0 +1,6 @@
+# Gonum matrix
+
+[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/mat)](https://pkg.go.dev/gonum.org/v1/gonum/mat)
+[![GoDoc](https://godocs.io/gonum.org/v1/gonum/mat?status.svg)](https://godocs.io/gonum.org/v1/gonum/mat)
+
+Package mat is a matrix package for the Go language.
diff --git a/vendor/gonum.org/v1/gonum/mat/band.go b/vendor/gonum.org/v1/gonum/mat/band.go
new file mode 100644
index 00000000000..7660cdaa8eb
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/band.go
@@ -0,0 +1,368 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+var ( // compile-time checks that *BandDense satisfies the listed interfaces
+	bandDense *BandDense
+	_         Matrix      = bandDense
+	_         allMatrix   = bandDense
+	_         denseMatrix = bandDense
+	_         Banded      = bandDense
+	_         RawBander   = bandDense
+
+	_ NonZeroDoer    = bandDense
+	_ RowNonZeroDoer = bandDense
+	_ ColNonZeroDoer = bandDense
+)
+
+// BandDense represents a band matrix in dense storage format.
+type BandDense struct {
+	mat blas64.Band // dimensions, bandwidths, stride and backing data
+}
+
+// Banded is the interface implemented by band matrix representations.
+type Banded interface {
+	Matrix
+	// Bandwidth returns the lower and upper bandwidth values for
+	// the matrix. The total bandwidth of the matrix is kl+ku+1.
+	Bandwidth() (kl, ku int)
+
+	// TBand is the equivalent of the T() method in the Matrix
+	// interface but guarantees the transpose is of banded type.
+	TBand() Banded
+}
+
+// A RawBander can return a blas64.Band representation of the receiver.
+// Changes to the blas64.Band.Data slice will be reflected in the original
+// matrix; changes to the Rows, Cols, KL, KU and Stride fields will not.
+type RawBander interface {
+	RawBand() blas64.Band
+}
+
+// A MutableBanded is a Banded whose elements can be set.
+type MutableBanded interface {
+	Banded
+
+	// SetBand sets the element at row i, column j to the value v.
+	// It panics if the location is outside the appropriate region of the matrix.
+	SetBand(i, j int, v float64)
+}
+
+var ( // compile-time checks that TransposeBand satisfies the listed interfaces
+	_ Matrix            = TransposeBand{}
+	_ Banded            = TransposeBand{}
+	_ UntransposeBander = TransposeBand{}
+)
+
+// TransposeBand is a type for performing an implicit transpose of a band
+// matrix. It implements the Banded interface, returning values from the
+// transpose of the matrix within.
+type TransposeBand struct {
+	Banded Banded // the wrapped, untransposed matrix
+}
+
+// At returns the value of the element at row i and column j of the transposed
+// matrix, that is, row j and column i of the Banded field.
+func (t TransposeBand) At(i, j int) float64 {
+	return t.Banded.At(j, i) // indices are swapped to read through the transpose
+}
+
+// Dims reports the row and column counts of the transposed matrix.
+func (t TransposeBand) Dims() (r, c int) {
+	rows, cols := t.Banded.Dims()
+	return cols, rows
+}
+
+// T performs an implicit transpose by returning the Banded field.
+func (t TransposeBand) T() Matrix {
+	return t.Banded // the wrapped matrix is returned unwrapped, not wrapped a second time
+}
+
+// Bandwidth reports the lower and upper bandwidths of the transposed
+// matrix, which are the wrapped matrix's upper and lower bandwidths.
+func (t TransposeBand) Bandwidth() (kl, ku int) {
+	lower, upper := t.Banded.Bandwidth()
+	return upper, lower
+}
+
+// TBand performs an implicit transpose by returning the Banded field.
+func (t TransposeBand) TBand() Banded {
+	return t.Banded // same value as T, but typed as Banded
+}
+
+// Untranspose returns the Banded field as a Matrix.
+func (t TransposeBand) Untranspose() Matrix {
+	return t.Banded
+}
+
+// UntransposeBand returns the Banded field, keeping its Banded type.
+func (t TransposeBand) UntransposeBand() Banded {
+	return t.Banded
+}
+
+// NewBandDense creates a new Band matrix with r rows and c columns. If data == nil,
+// a new slice is allocated for the backing slice. If len(data) == min(r, c+kl)*(kl+ku+1),
+// data is used as the backing slice, and changes to the elements of the returned
+// BandDense will be reflected in data; any other length causes a panic. kl must be
+// at least zero and less than r, and ku must be at least zero and less than c,
+// otherwise NewBandDense will panic. It also panics if either r or c is zero.
+//
+// The data must be arranged in row-major order constructed by removing the zeros
+// from the rows outside the band and aligning the diagonals. For example, the matrix
+//
+//	1  2  3  0  0  0
+//	4  5  6  7  0  0
+//	0  8  9 10 11  0
+//	0  0 12 13 14 15
+//	0  0  0 16 17 18
+//	0  0  0  0 19 20
+//
+// becomes (* entries are never accessed)
+//
+//	 *  1  2  3
+//	 4  5  6  7
+//	 8  9 10 11
+//	12 13 14 15
+//	16 17 18  *
+//	19 20  *  *
+//
+// which is passed to NewBandDense as []float64{*, 1, 2, 3, 4, ...} with kl=1 and ku=2.
+// Only the values in the band portion of the matrix are used.
+func NewBandDense(r, c, kl, ku int, data []float64) *BandDense {
+	if r <= 0 || c <= 0 || kl < 0 || ku < 0 {
+		if r == 0 || c == 0 {
+			panic(ErrZeroLength)
+		}
+		panic(ErrNegativeDimension)
+	}
+	if kl+1 > r || ku+1 > c {
+		panic(ErrBandwidth)
+	}
+	bc := kl + ku + 1
+	// n is the required backing length: one row of bc entries per stored row.
+	if n := min(r, c+kl) * bc; data == nil {
+		data = make([]float64, n)
+	} else if len(data) != n {
+		panic(ErrShape)
+	}
+	return &BandDense{
+		mat: blas64.Band{
+			Rows:   r,
+			Cols:   c,
+			KL:     kl,
+			KU:     ku,
+			Stride: bc,
+			Data:   data,
+		},
+	}
+}
+
+// NewDiagonalRect is a convenience function that returns a diagonal matrix represented by a
+// BandDense. data must be nil or have length min(r, c), otherwise NewDiagonalRect will panic.
+func NewDiagonalRect(r, c int, data []float64) *BandDense {
+	return NewBandDense(r, c, 0, 0, data)
+}
+
+// Dims returns the number of rows and columns of the band matrix.
+func (b *BandDense) Dims() (r, c int) {
+	return b.mat.Rows, b.mat.Cols
+}
+
+// Bandwidth returns the lower and upper bandwidths of the matrix.
+func (b *BandDense) Bandwidth() (kl, ku int) {
+	return b.mat.KL, b.mat.KU
+}
+
+// T performs an implicit transpose by returning the receiver inside a Transpose.
+func (b *BandDense) T() Matrix {
+	return Transpose{b} // no data is copied; the wrapper holds the receiver
+}
+
+// TBand performs an implicit transpose by returning the receiver inside a TransposeBand.
+func (b *BandDense) TBand() Banded {
+	return TransposeBand{b} // no data is copied; the wrapper holds the receiver
+}
+
+// RawBand returns the underlying blas64.Band used by the receiver.
+// Changes to elements in the receiver following the call will be reflected
+// in the returned blas64.Band.
+func (b *BandDense) RawBand() blas64.Band {
+	return b.mat
+}
+
+// SetRawBand sets the underlying blas64.Band used by the receiver.
+// Changes to elements in the receiver following the call will be reflected
+// in the input. mat is stored as given; SetRawBand performs no validation.
+func (b *BandDense) SetRawBand(mat blas64.Band) {
+	b.mat = mat
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using Reset.
+func (b *BandDense) IsEmpty() bool {
+	return b.mat.Stride == 0 // a zero stride is the marker for an empty matrix
+}
+
+// Reset empties the matrix so that it can be reused as the receiver of a
+// dimensionally restricted operation.
+//
+// All shape fields (rows, columns, both bandwidths and the stride) are set
+// to zero and the data slice is truncated to length zero.
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (b *BandDense) Reset() {
+	b.mat.Rows, b.mat.Cols = 0, 0
+	b.mat.KL, b.mat.KU = 0, 0
+	b.mat.Stride = 0
+	b.mat.Data = b.mat.Data[:0]
+}
+
+// DiagView returns the main diagonal of the receiver as a Diagonal that is
+// backed by the receiver's data rather than by a copy.
+func (b *BandDense) DiagView() Diagonal {
+	kl, stride := b.mat.KL, b.mat.Stride
+	n := min(b.mat.Rows, b.mat.Cols)
+	// The view starts at offset kl of the band storage, advances by stride
+	// from one diagonal element to the next, and ends just past the element
+	// at offset kl+(n-1)*stride.
+	diag := blas64.Vector{N: n, Inc: stride, Data: b.mat.Data[kl : (n-1)*stride+kl+1]}
+	return &DiagDense{mat: diag}
+}
+
+// DoNonZero walks the band of b row by row and calls fn for every element
+// whose value is not zero. fn receives the row index, the column index and
+// the element value of b at (i, j).
+func (b *BandDense) DoNonZero(fn func(i, j int, v float64)) {
+	for row := 0; row < min(b.mat.Rows, b.mat.Cols+b.mat.KL); row++ {
+		for col := max(0, row-b.mat.KL); col < min(b.mat.Cols, row+b.mat.KU+1); col++ {
+			if v := b.at(row, col); v != 0 {
+				fn(row, col, v)
+			}
+		}
+	}
+}
+
+// DoRowNonZero calls fn for every element of row i of b whose value is not
+// zero, visiting the in-band columns in increasing order. fn receives the row
+// index, the column index and the value. It panics with ErrRowAccess if i is out of range.
+func (b *BandDense) DoRowNonZero(i int, fn func(i, j int, v float64)) {
+	if i < 0 || i >= b.mat.Rows {
+		panic(ErrRowAccess)
+	}
+	for col := max(0, i-b.mat.KL); col < min(b.mat.Cols, i+b.mat.KU+1); col++ {
+		if v := b.at(i, col); v != 0 {
+			fn(i, col, v)
+		}
+	}
+}
+
+// DoColNonZero calls fn for every in-band element of column j of b whose value is not zero,
+// in increasing row order, passing (i, j, value). It panics with ErrColAccess if j is out of range.
+func (b *BandDense) DoColNonZero(j int, fn func(i, j int, v float64)) {
+	if j < 0 || j >= b.mat.Cols {
+		panic(ErrColAccess)
+	}
+	for row := 0; row < min(b.mat.Rows, b.mat.Cols+b.mat.KL); row++ {
+		if j < row-b.mat.KL || row+b.mat.KU < j {
+			continue // (row, j) lies outside the band
+		}
+		if v := b.at(row, j); v != 0 {
+			fn(row, j, v)
+		}
+	}
+}
+
+// Zero sets all of the matrix elements to zero. For each row it clears the
+// part of that row's storage between the computed lower and upper offsets.
+func (b *BandDense) Zero() {
+	rows, kl, stride := b.mat.Rows, b.mat.KL, b.mat.Stride
+	width := b.mat.KU + 1 + kl
+	for i := 0; i < rows; i++ {
+		off := i * stride
+		lo, hi := max(0, kl-i), min(width, rows+kl-i)
+		zero(b.mat.Data[off+lo : off+hi])
+	}
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// Norm panics with ErrZeroLength if the receiver is empty. The norm order is
+// translated by normLapack; presumably that is where ErrNormOrder is raised.
+func (b *BandDense) Norm(norm float64) float64 {
+	if b.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	lnorm := normLapack(norm, false)
+	if lnorm == lapack.MaxColumnSum || lnorm == lapack.MaxRowSum {
+		return lapack64.Langb(lnorm, b.mat) // NOTE(review): same call as the return below
+	}
+	return lapack64.Langb(lnorm, b.mat)
+}
+
+// Trace returns the trace of the matrix.
+//
+// Trace will panic with ErrSquare if the matrix is not square and with
+// ErrZeroLength if the matrix has zero size.
+func (b *BandDense) Trace() float64 {
+	r, c := b.Dims()
+	if r != c {
+		panic(ErrSquare)
+	}
+	if b.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	rb := b.RawBand()
+	var tr float64
+	for i := 0; i < r; i++ {
+		tr += rb.Data[rb.KL+i*rb.Stride]
+	}
+	return tr
+}
+
+// MulVecTo computes B⋅x (trans false) or Bᵀ⋅x (trans true), storing the result into dst.
+func (b *BandDense) MulVecTo(dst *VecDense, trans bool, x Vector) {
+	m, n := b.Dims()
+	if trans {
+		m, n = n, m // the transposed operand has swapped dimensions
+	}
+	if x.Len() != n {
+		panic(ErrShape)
+	}
+	dst.reuseAsNonZeroed(m)
+
+	t := blas.NoTrans
+	if trans {
+		t = blas.Trans
+	}
+
+	xMat, _ := untransposeExtract(x)
+	if xVec, ok := xMat.(*VecDense); ok {
+		if dst != xVec {
+			dst.checkOverlap(xVec.mat)
+			blas64.Gbmv(t, 1, b.mat, xVec.mat, 0, dst.mat)
+		} else {
+			xCopy := getVecDenseWorkspace(n, false) // dst is x itself: multiply from a copy
+			xCopy.CloneFromVec(xVec)
+			blas64.Gbmv(t, 1, b.mat, xCopy.mat, 0, dst.mat)
+			putVecDenseWorkspace(xCopy)
+		}
+	} else {
+		xCopy := getVecDenseWorkspace(n, false) // x is not a *VecDense: clone it into one first
+		xCopy.CloneFromVec(x)
+		blas64.Gbmv(t, 1, b.mat, xCopy.mat, 0, dst.mat)
+		putVecDenseWorkspace(xCopy)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/cdense.go b/vendor/gonum.org/v1/gonum/mat/cdense.go
new file mode 100644
index 00000000000..86f0423c582
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/cdense.go
@@ -0,0 +1,368 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math/cmplx"
+
+	"gonum.org/v1/gonum/blas/cblas128"
+)
+
+var (
+	cDense *CDense // typed nil pointer used by the interface assertions below
+
+	_ CMatrix   = cDense // compile-time check that *CDense satisfies CMatrix
+	_ allMatrix = cDense // compile-time check that *CDense satisfies allMatrix
+)
+
+// CDense is a dense matrix representation with complex data.
+type CDense struct {
+	mat cblas128.General // rows, columns, stride and complex128 data of the matrix
+
+	capRows, capCols int // dimensions of the backing matrix, as reported by Caps
+}
+
+// Dims returns the number of rows and columns currently visible in the matrix.
+func (m *CDense) Dims() (r, c int) {
+	return m.mat.Rows, m.mat.Cols
+}
+
+// Caps returns the number of rows and columns in the backing matrix (capRows, capCols).
+func (m *CDense) Caps() (r, c int) { return m.capRows, m.capCols }
+
+// H performs an implicit conjugate transpose by returning the receiver wrapped
+// in a ConjTranspose; no data is copied.
+func (m *CDense) H() CMatrix {
+	return ConjTranspose{m}
+}
+
+// T performs an implicit transpose by returning the receiver wrapped in a
+// CTranspose; no data is copied.
+func (m *CDense) T() CMatrix {
+	return CTranspose{m}
+}
+
+// Conj calculates the element-wise conjugate of a and stores the result in the
+// receiver. An empty receiver is sized to the dimensions of a.
+// Conj will panic if m and a do not have the same dimension unless m is empty.
+func (m *CDense) Conj(a CMatrix) {
+	ar, ac := a.Dims()
+	aU, aTrans, aConj := untransposeExtractCmplx(a)
+	m.reuseAsNonZeroed(ar, ac)
+	// Fast path: a is itself a *CDense, so its rows are conjugated directly.
+	if arm, ok := a.(*CDense); ok {
+		amat := arm.mat
+		if m != aU {
+			m.checkOverlap(amat)
+		}
+		for ja, jm := 0, 0; ja < ar*amat.Stride; ja, jm = ja+amat.Stride, jm+m.mat.Stride {
+			for i, v := range amat.Data[ja : ja+ac] {
+				m.mat.Data[i+jm] = cmplx.Conj(v)
+			}
+		}
+		return
+	}
+
+	m.checkOverlapMatrix(aU)
+	if aTrans != aConj && m == aU {
+		// Only make workspace if the destination is transposed
+		// with respect to the source and they are the same
+		// matrix.
+		var restore func()
+		m, restore = m.isolatedWorkspace(aU) // m now names the workspace; restore copies it back
+		defer restore()
+	}
+	// General path: elements are read through a.At and written one by one.
+	for r := 0; r < ar; r++ {
+		for c := 0; c < ac; c++ {
+			m.set(r, c, cmplx.Conj(a.At(r, c)))
+		}
+	}
+}
+
+// Slice returns a new CMatrix that shares backing data with the receiver.
+// The returned matrix starts at {i,j} of the receiver and extends k-i rows
+// and l-j columns. The final row in the resulting matrix is k-1 and the
+// final column is l-1. The work is done by the unexported slice method.
+// Slice panics if the slice is outside the capacity of the receiver: with
+// ErrZeroLength when i == k or j == l, otherwise with ErrIndexOutOfRange.
+func (m *CDense) Slice(i, k, j, l int) CMatrix {
+	return m.slice(i, k, j, l)
+}
+
+// slice returns a view of rows [i,k) and columns [j,l) that shares m's backing data.
+func (m *CDense) slice(i, k, j, l int) *CDense {
+	capR, capC := m.Caps()
+	if i < 0 || capR <= i || j < 0 || capC <= j || k < i || capR < k || l < j || capC < l {
+		if i == k || j == l {
+			panic(ErrZeroLength)
+		}
+		panic(ErrIndexOutOfRange)
+	}
+	view := *m
+	stride := view.mat.Stride
+	view.mat.Data = view.mat.Data[i*stride+j : (k-1)*stride+l]
+	view.mat.Rows, view.mat.Cols = k-i, l-j
+	view.capRows, view.capCols = view.capRows-i, view.capCols-j
+	return &view
+}
+
+// NewCDense creates a new complex Dense matrix with r rows and c columns.
+// If data == nil, a new slice is allocated for the backing slice.
+// If len(data) == r*c, data is used as the backing slice, and changes to the
+// elements of the returned CDense will be reflected in data.
+// If neither of these is true, NewCDense will panic with ErrShape. It panics with
+// ErrZeroLength if r or c is zero, otherwise with ErrNegativeDimension if one is negative.
+//
+// The data must be arranged in row-major order, i.e. the (i*c + j)-th
+// element in the data slice is the {i, j}-th element in the matrix.
+func NewCDense(r, c int, data []complex128) *CDense {
+	if r <= 0 || c <= 0 {
+		if r == 0 || c == 0 {
+			panic(ErrZeroLength)
+		}
+		panic(ErrNegativeDimension) // same error value as ReuseAs, not a bare string
+	}
+	if data != nil && r*c != len(data) {
+		panic(ErrShape)
+	}
+	if data == nil {
+		data = make([]complex128, r*c)
+	}
+	return &CDense{
+		mat: cblas128.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: c,
+			Data:   data,
+		},
+		capRows: r,
+		capCols: c,
+	}
+}
+
+// ReuseAs changes the receiver if it IsEmpty() to be of size r×c.
+//
+// ReuseAs re-uses the backing data slice if it has sufficient capacity,
+// otherwise a new slice is allocated. The backing data is zero on return.
+//
+// ReuseAs panics with ErrZeroLength if r or c is zero, with
+// ErrNegativeDimension if neither is zero but one is negative, and with
+// ErrReuseNonEmpty if the receiver is not empty. To empty the receiver for
+// re-use, Reset should be used.
+func (m *CDense) ReuseAs(r, c int) {
+	switch {
+	case r == 0 || c == 0:
+		panic(ErrZeroLength)
+	case r < 0 || c < 0:
+		panic(ErrNegativeDimension)
+	case !m.IsEmpty():
+		panic(ErrReuseNonEmpty)
+	}
+	m.reuseAsZeroed(r, c)
+}
+
+// reuseAsNonZeroed resizes an empty matrix to a r×c matrix, or checks that a
+// non-empty matrix is r×c. The backing slice comes from useC, not useZeroedC.
+//
+// reuseAsNonZeroed must be kept in sync with reuseAsZeroed.
+func (m *CDense) reuseAsNonZeroed(r, c int) {
+	if m.mat.Rows > m.capRows || m.mat.Cols > m.capCols {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	if m.IsEmpty() {
+		m.mat = cblas128.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: c,
+			Data:   useC(m.mat.Data, r*c),
+		}
+		m.capRows = r
+		m.capCols = c
+		return
+	}
+	if r != m.mat.Rows || c != m.mat.Cols {
+		panic(ErrShape)
+	}
+}
+
+func (m *CDense) reuseAsZeroed(r, c int) {
+	// This must be kept in sync with reuseAsNonZeroed.
+	if m.mat.Rows > m.capRows || m.mat.Cols > m.capCols {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	if m.IsEmpty() {
+		m.mat = cblas128.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: c,
+			Data:   useZeroedC(m.mat.Data, r*c),
+		}
+		m.capRows = r
+		m.capCols = c
+		return
+	}
+	if r != m.mat.Rows || c != m.mat.Cols {
+		panic(ErrShape)
+	}
+	m.Zero() // a non-empty receiver of the right shape is cleared in place
+}
+
+// isolatedWorkspace returns a workspace matrix w with the size of a and a callback to
+// defer: the callback copies w into the receiver and releases the workspace.
+// This should be used when a method receiver is the same pointer as an input argument.
+func (m *CDense) isolatedWorkspace(a CMatrix) (w *CDense, restore func()) {
+	r, c := a.Dims()
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	w = getCDenseWorkspace(r, c, false)
+	return w, func() {
+		m.Copy(w)
+		putCDenseWorkspace(w)
+	}
+}
+
+// Reset zeros the dimensions and capacities of the matrix so that it can be reused as
+// the receiver of a dimensionally restricted operation. The data slice is truncated.
+//
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (m *CDense) Reset() {
+	// Row, Cols and Stride must be zeroed in unison.
+	m.mat.Rows, m.mat.Cols, m.mat.Stride = 0, 0, 0
+	m.capRows, m.capCols = 0, 0
+	m.mat.Data = m.mat.Data[:0]
+}
+
+// IsEmpty reports whether the receiver is empty, detected by a zero Stride. Empty matrices
+// can be the receiver for size-restricted operations. Reset empties the receiver.
+func (m *CDense) IsEmpty() bool {
+	// It must be the case that m.Dims() returns
+	// zeros in this case. See comment in Reset().
+	return m.mat.Stride == 0
+}
+
+// Zero sets all of the matrix elements to zero, clearing Cols entries in each of the Rows rows.
+func (m *CDense) Zero() {
+	rows, cols, stride := m.mat.Rows, m.mat.Cols, m.mat.Stride
+	for i := 0; i < rows; i++ {
+		off := i * stride
+		zeroC(m.mat.Data[off : off+cols])
+	}
+}
+
+// Copy makes a copy of elements of a into the receiver. It is similar to the
+// built-in copy; it copies as much as the overlap between the two matrices and
+// returns the number of rows and columns it copied. When a is the receiver
+// itself nothing is copied and the dimensions of a are returned. Elements are
+// read with a.At and written one at a time; no check for overlapping backing
+// data is made yet (see the TODO below).
+//
+// See the Copier interface for more information.
+func (m *CDense) Copy(a CMatrix) (r, c int) {
+	r, c = a.Dims()
+	if a == m {
+		return r, c
+	}
+	r, c = min(r, m.mat.Rows), min(c, m.mat.Cols)
+	if r == 0 || c == 0 {
+		return 0, 0
+	}
+	// TODO(btracey): Check for overlap when complex version exists.
+	// TODO(btracey): Add fast-paths.
+	for row := 0; row < r; row++ {
+		for col := 0; col < c; col++ {
+			m.set(row, col, a.At(row, col))
+		}
+	}
+	return r, c
+}
+
+// SetRawCMatrix sets the underlying cblas128.General used by the receiver and sets
+// the capacities to b.Rows and b.Cols. Changes to elements in the receiver
+// following the call will be reflected in b.
+func (m *CDense) SetRawCMatrix(b cblas128.General) {
+	m.capRows, m.capCols = b.Rows, b.Cols
+	m.mat = b
+}
+
+// RawCMatrix returns a copy of the cblas128.General header used by the receiver.
+// The copy holds the same Data slice, so changes to elements in the receiver
+// following the call will be reflected in the returned cblas128.General.
+func (m *CDense) RawCMatrix() cblas128.General { return m.mat }
+
+// Grow returns the receiver expanded by r rows and c columns. If the dimensions
+// of the expanded matrix are outside the capacities of the receiver a new
+// allocation is made, otherwise not. The receiver itself is not modified, and is
+// returned unchanged when r and c are both zero. Negative r or c panics.
+func (m *CDense) Grow(r, c int) CMatrix {
+	if r < 0 || c < 0 {
+		panic(ErrIndexOutOfRange)
+	}
+	if r == 0 && c == 0 {
+		return m
+	}
+
+	r += m.mat.Rows // from here on r and c are the dimensions of the grown matrix
+	c += m.mat.Cols
+
+	var t CDense
+	switch {
+	case m.mat.Rows == 0 || m.mat.Cols == 0:
+		t.mat = cblas128.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: c,
+			// We zero because we don't know how the matrix will be used.
+			// In other places, the mat is immediately filled with a result;
+			// this is not the case here.
+			Data: useZeroedC(m.mat.Data, r*c),
+		}
+	case r > m.capRows || c > m.capCols:
+		cr := max(r, m.capRows)
+		cc := max(c, m.capCols)
+		t.mat = cblas128.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: cc,
+			Data:   make([]complex128, cr*cc),
+		}
+		t.capRows = cr
+		t.capCols = cc
+		// Copy the complete matrix over to the new matrix.
+		// Including elements not currently visible. Use a temporary structure
+		// to avoid modifying the receiver.
+		var tmp CDense
+		tmp.mat = cblas128.General{
+			Rows:   m.mat.Rows,
+			Cols:   m.mat.Cols,
+			Stride: m.mat.Stride,
+			Data:   m.mat.Data,
+		}
+		tmp.capRows = m.capRows
+		tmp.capCols = m.capCols
+		t.Copy(&tmp)
+		return &t
+	default:
+		t.mat = cblas128.General{
+			Data:   m.mat.Data[:(r-1)*m.mat.Stride+c],
+			Rows:   r,
+			Cols:   c,
+			Stride: m.mat.Stride,
+		}
+	}
+	t.capRows = r // reached by the first and the default case only
+	t.capCols = c
+	return &t
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/cholesky.go b/vendor/gonum.org/v1/gonum/mat/cholesky.go
new file mode 100644
index 00000000000..f11948d0f89
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/cholesky.go
@@ -0,0 +1,1203 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+const (
+	badTriangle = "mat: invalid triangle"
+	badCholesky = "mat: invalid Cholesky factorization" // panic value used when valid() fails
+)
+
+var (
+	_ Matrix    = (*Cholesky)(nil) // compile-time interface checks for *Cholesky
+	_ Symmetric = (*Cholesky)(nil)
+
+	_ Matrix    = (*BandCholesky)(nil) // compile-time interface checks for *BandCholesky
+	_ Symmetric = (*BandCholesky)(nil)
+	_ Banded    = (*BandCholesky)(nil)
+	_ SymBanded = (*BandCholesky)(nil)
+
+	_ Matrix    = (*PivotedCholesky)(nil) // compile-time interface checks for *PivotedCholesky
+	_ Symmetric = (*PivotedCholesky)(nil)
+)
+
+// Cholesky is a symmetric positive definite matrix represented by its
+// Cholesky decomposition.
+//
+// The decomposition can be constructed using the Factorize method. The
+// factorization itself can be extracted using the UTo or LTo methods, and the
+// original symmetric matrix can be recovered with ToSym.
+//
+// Note that this matrix representation is useful for certain operations, in
+// particular finding solutions to linear equations. It is very inefficient
+// at other operations, in particular At is slow.
+//
+// Cholesky methods may only be called on a value that has been successfully
+// initialized by a call to Factorize that has returned true. Calls to methods
+// of an unsuccessful Cholesky factorization will panic.
+type Cholesky struct {
+	// The chol pointer must never be retained as a pointer outside the Cholesky
+	// struct, either by returning chol outside the struct or by setting it to
+	// a pointer coming from outside. The same prohibition applies to the data
+	// slice within chol.
+	chol *TriDense
+	cond float64 // condition number reported by Cond; Reset sets it to +Inf
+}
+
+// updateCond updates the condition number of the Cholesky decomposition. If
+// norm >= 0, then that norm is used as the norm of the original matrix A, otherwise
+// (norm < 0) the norm is estimated from the decomposition.
+func (c *Cholesky) updateCond(norm float64) {
+	n := c.chol.mat.N
+	work := getFloat64s(3*n, false)
+	defer putFloat64s(work)
+	if norm < 0 {
+		// This is an approximation. By the definition of a norm,
+		//  |AB| <= |A| |B|.
+		// Since A = Uᵀ*U, we get for the condition number κ that
+		//  κ(A) := |A| |A^-1| = |Uᵀ*U| |A^-1| <= |Uᵀ| |U| |A^-1|,
+		// so this will overestimate the condition number somewhat.
+		// The norm of the original factorized matrix cannot be stored
+		// because of update possibilities.
+		unorm := lapack64.Lantr(CondNorm, c.chol.mat, work)
+		lnorm := lapack64.Lantr(CondNormTrans, c.chol.mat, work)
+		norm = unorm * lnorm
+	}
+	sym := c.chol.asSymBlas()
+	iwork := getInts(n, false)
+	v := lapack64.Pocon(sym, norm, work, iwork)
+	putInts(iwork)
+	c.cond = 1 / v // the stored condition number is the reciprocal of Pocon's result
+}
+
+// Dims returns the dimensions of the matrix, which are both SymmetricDim().
+func (ch *Cholesky) Dims() (r, c int) {
+	n := ch.SymmetricDim()
+	return n, n
+}
+
+// At returns the element at row i, column j.
+func (c *Cholesky) At(i, j int) float64 {
+	n := c.SymmetricDim()
+	if uint(i) >= uint(n) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	var val float64
+	for k := 0; k <= min(i, j); k++ {
+		val += c.chol.at(k, i) * c.chol.at(k, j)
+	}
+	return val
+}
+
+// T returns the receiver itself, since the transpose of a symmetric matrix is that matrix.
+func (c *Cholesky) T() Matrix {
+	return c
+}
+
+// SymmetricDim implements the Symmetric interface and returns the number of rows
+// in the matrix (also the number of columns). It is 0 when no factor is allocated.
+func (c *Cholesky) SymmetricDim() int {
+	if c.chol == nil {
+		return 0
+	}
+	n, _ := c.chol.Triangle()
+	return n
+}
+
+// Cond returns the stored condition number of the factorized matrix; it panics if !c.valid().
+func (c *Cholesky) Cond() float64 {
+	if !c.valid() {
+		panic(badCholesky)
+	}
+	return c.cond
+}
+
+// Factorize calculates the Cholesky decomposition of the matrix A and returns
+// whether the matrix is positive definite. If Factorize returns false, the
+// factorization must not be used; the receiver has been Reset in that case.
+// On success the condition number is updated from the norm of a.
+func (c *Cholesky) Factorize(a Symmetric) (ok bool) {
+	n := a.SymmetricDim()
+	if c.chol != nil {
+		c.chol.Reset()
+		c.chol.reuseAsNonZeroed(n, Upper)
+	} else {
+		c.chol = NewTriDense(n, Upper, nil)
+	}
+	copySymIntoTriangle(c.chol, a)
+
+	sym := c.chol.asSymBlas()
+	work := getFloat64s(c.chol.mat.N, false)
+	norm := lapack64.Lansy(CondNorm, sym, work)
+	putFloat64s(work)
+	if _, ok = lapack64.Potrf(sym); !ok {
+		c.Reset()
+		return false
+	}
+	c.updateCond(norm)
+	return true
+}
+
+// Reset resets the factorization so that it can be reused as the receiver of a dimensionally
+// restricted operation: the factor, if allocated, is Reset and the condition number becomes +Inf.
+func (c *Cholesky) Reset() {
+	if c.chol != nil {
+		c.chol.Reset()
+	}
+	c.cond = math.Inf(1)
+}
+
+// IsEmpty reports whether the receiver is empty: either no factor has been
+// allocated or the factor itself is empty. Empty matrices can be the receiver
+// for size-restricted operations. The receiver can be emptied using Reset.
+func (c *Cholesky) IsEmpty() bool {
+	return c.chol == nil || c.chol.IsEmpty()
+}
+
+// SetFromU sets the Cholesky decomposition from the given triangular matrix.
+// SetFromU panics if t is not upper triangular. If the receiver has no factor
+// yet, one of size n×n, the size of t, is allocated; otherwise the existing
+// factor is passed through reuseAsNonZeroed for size n. Note that t is copied
+// into, not stored inside, the receiver.
+func (c *Cholesky) SetFromU(t Triangular) {
+	n, kind := t.Triangle()
+	if kind != Upper {
+		panic("cholesky: matrix must be upper triangular")
+	}
+	if c.chol != nil {
+		c.chol.reuseAsNonZeroed(n, Upper)
+	} else {
+		c.chol = NewTriDense(n, Upper, nil)
+	}
+	c.chol.Copy(t)
+	c.updateCond(-1)
+}
+
+// Clone makes a copy of the input Cholesky into the receiver, overwriting the
+// previous value of the receiver. Clone does not place any restrictions on receiver
+// shape. Clone panics if the input Cholesky is not the result of a valid decomposition.
+func (c *Cholesky) Clone(chol *Cholesky) {
+	if !chol.valid() {
+		panic(badCholesky)
+	}
+	n := chol.SymmetricDim()
+	if c.chol == nil {
+		c.chol = NewTriDense(n, Upper, nil)
+	} else {
+		c.chol = NewTriDense(n, Upper, use(c.chol.mat.Data, n*n))
+	}
+	c.chol.Copy(chol.chol)
+	c.cond = chol.cond
+}
+
+// Det returns the determinant of the factorized matrix, computed as exp(LogDet()).
+func (c *Cholesky) Det() float64 {
+	if !c.valid() {
+		panic(badCholesky)
+	}
+	return math.Exp(c.LogDet())
+}
+
+// LogDet returns the log of the determinant of the matrix that has been factorized.
+func (c *Cholesky) LogDet() float64 {
+	if !c.valid() {
+		panic(badCholesky)
+	}
+	var sum float64
+	for i, u := 0, c.chol.mat; i < u.N; i++ {
+		sum += 2 * math.Log(u.Data[i*u.Stride+i])
+	}
+	return sum
+}
+
+// SolveTo finds the matrix X that solves A * X = B where A is represented
+// by the Cholesky decomposition. The result is stored in-place into dst.
+// SolveTo panics with ErrShape if b does not have as many rows as A.
+// If the Cholesky decomposition is singular or near-singular a Condition error
+// is returned. See the documentation for Condition for more information.
+func (c *Cholesky) SolveTo(dst *Dense, b Matrix) error {
+	if !c.valid() {
+		panic(badCholesky)
+	}
+	rows, cols := b.Dims()
+	if rows != c.chol.mat.N {
+		panic(ErrShape)
+	}
+
+	dst.reuseAsNonZeroed(rows, cols)
+	if dst != b {
+		dst.Copy(b)
+	}
+	lapack64.Potrs(c.chol.mat, dst.mat)
+	if c.cond > ConditionTolerance {
+		return Condition(c.cond)
+	}
+	return nil
+}
+
+// SolveCholTo finds the matrix X that solves A * X = B where A and B are represented
+// by their Cholesky decompositions a and b. The result is stored in-place into
+// dst, which is filled with b's factor transposed and then updated by two triangular
+// solves with a's factor and a triangular multiply with b's factor. If a's condition
+// number exceeds ConditionTolerance a Condition error is returned. See Condition.
+func (a *Cholesky) SolveCholTo(dst *Dense, b *Cholesky) error {
+	if !a.valid() || !b.valid() {
+		panic(badCholesky)
+	}
+	bn := b.chol.mat.N
+	if a.chol.mat.N != bn {
+		panic(ErrShape)
+	}
+
+	dst.reuseAsZeroed(bn, bn)
+	dst.Copy(b.chol.T())
+	blas64.Trsm(blas.Left, blas.Trans, 1, a.chol.mat, dst.mat)
+	blas64.Trsm(blas.Left, blas.NoTrans, 1, a.chol.mat, dst.mat)
+	blas64.Trmm(blas.Right, blas.NoTrans, 1, b.chol.mat, dst.mat)
+	if a.cond > ConditionTolerance {
+		return Condition(a.cond)
+	}
+	return nil
+}
+
+// SolveVecTo finds the vector x that solves A * x = b where A is represented
+// by the Cholesky decomposition. The result is stored in-place into
+// dst. SolveVecTo panics with ErrShape if b is not an n×1 vector.
+// If the Cholesky decomposition is singular or near-singular a Condition error
+// is returned. See the documentation for Condition for more information.
+func (c *Cholesky) SolveVecTo(dst *VecDense, b Vector) error {
+	if !c.valid() {
+		panic(badCholesky)
+	}
+	n := c.chol.mat.N
+	if br, bc := b.Dims(); br != n || bc != 1 {
+		panic(ErrShape)
+	}
+	switch rv := b.(type) {
+	default:
+		dst.reuseAsNonZeroed(n)
+		return c.SolveTo(dst.asDense(), b) // generic vectors go through the matrix solver
+	case RawVectorer:
+		bmat := rv.RawVector()
+		if dst != b {
+			dst.checkOverlap(bmat)
+		}
+		dst.reuseAsNonZeroed(n)
+		if dst != b {
+			dst.CopyVec(b)
+		}
+		lapack64.Potrs(c.chol.mat, dst.asGeneral())
+		if c.cond > ConditionTolerance {
+			return Condition(c.cond)
+		}
+		return nil
+	}
+}
+
+// RawU returns the Triangular matrix used to store the Cholesky factorization
+// of the original matrix A. The internal factor itself is returned, not a copy,
+// so if the returned matrix is modified, the factorization is invalid and should not be used.
+//
+// If the receiver does not hold a valid factorization, RawU will return nil.
+func (c *Cholesky) RawU() Triangular {
+	if !c.valid() {
+		return nil
+	}
+	return c.chol
+}
+
+// UTo stores into dst the n×n upper triangular matrix U from a Cholesky
+// decomposition
+//
+//	A = Uᵀ * U.
+//
+// If dst is empty, it is resized to be an n×n upper triangular matrix. When dst
+// is non-empty, UTo panics with ErrShape if dst is not n×n and with ErrTriangle
+// if it is not Upper. UTo will also panic if the receiver does not contain a
+// successful factorization.
+func (c *Cholesky) UTo(dst *TriDense) {
+	if !c.valid() {
+		panic(badCholesky)
+	}
+	n := c.chol.mat.N
+	if dst.IsEmpty() {
+		dst.ReuseAsTri(n, Upper)
+	} else {
+		switch dn, kind := dst.Triangle(); {
+		case dn != n:
+			panic(ErrShape)
+		case kind != Upper:
+			panic(ErrTriangle)
+		}
+	}
+	dst.Copy(c.chol)
+}
+
+// LTo stores into dst the n×n lower triangular matrix L from a Cholesky
+// decomposition
+//
+//	A = L * Lᵀ.
+//
+// If dst is empty, it is resized to be an n×n lower triangular matrix. When dst
+// is non-empty, LTo panics with ErrShape if dst is not n×n and with ErrTriangle
+// if it is not Lower. LTo will also panic if the receiver does not contain a
+// successful factorization.
+func (c *Cholesky) LTo(dst *TriDense) {
+	if !c.valid() {
+		panic(badCholesky)
+	}
+	n := c.chol.mat.N
+	if dst.IsEmpty() {
+		dst.ReuseAsTri(n, Lower)
+	} else {
+		switch dn, kind := dst.Triangle(); {
+		case dn != n:
+			panic(ErrShape)
+		case kind != Lower:
+			panic(ErrTriangle)
+		}
+	}
+	dst.Copy(c.chol.TTri())
+}
+
+// ToSym reconstructs the original positive definite matrix from its
+// Cholesky decomposition, storing the result into dst. If dst is
+// empty it is resized to be n×n. If dst is non-empty, ToSym panics
+// with ErrShape if dst is not of size n×n. ToSym will also panic if the
+// receiver does not contain a successful factorization.
+func (c *Cholesky) ToSym(dst *SymDense) {
+	if !c.valid() {
+		panic(badCholesky)
+	}
+	n := c.chol.mat.N
+	if dst.IsEmpty() {
+		dst.ReuseAsSym(n)
+	} else {
+		n2 := dst.SymmetricDim()
+		if n != n2 {
+			panic(ErrShape)
+		}
+	}
+	// Create a TriDense representing the Cholesky factor U with dst's
+	// backing slice.
+	// Operations on u are reflected in dst.
+	u := &TriDense{
+		mat: blas64.Triangular{
+			Uplo:   blas.Upper,
+			Diag:   blas.NonUnit,
+			N:      n,
+			Data:   dst.mat.Data,
+			Stride: dst.mat.Stride,
+		},
+		cap: n,
+	}
+	u.Copy(c.chol)
+	// Compute the product Uᵀ*U using the algorithm from LAPACK/TESTING/LIN/dpot01.f
+	a := u.mat.Data
+	lda := u.mat.Stride
+	bi := blas64.Implementation()
+	for k := n - 1; k >= 0; k-- {
+		a[k*lda+k] = bi.Ddot(k+1, a[k:], lda, a[k:], lda) // column k dotted with itself
+		if k > 0 {
+			bi.Dtrmv(blas.Upper, blas.Trans, blas.NonUnit, k, a, lda, a[k:], lda)
+		}
+	}
+}
+
+// InverseTo computes the inverse of the matrix represented by its Cholesky
+// factorization and stores the result into dst. If the factorized
+// matrix is ill-conditioned, a Condition error will be returned.
+// Note that matrix inversion is numerically unstable, and should generally be
+// avoided where possible, for example by using the Solve routines.
+func (c *Cholesky) InverseTo(dst *SymDense) error {
+	if !c.valid() {
+		panic(badCholesky)
+	}
+	dst.reuseAsNonZeroed(c.chol.mat.N)
+	// Create a TriDense representing the Cholesky factor U with the backing
+	// slice from dst.
+	// Operations on u are reflected in dst.
+	u := &TriDense{
+		mat: blas64.Triangular{
+			Uplo:   blas.Upper,
+			Diag:   blas.NonUnit,
+			N:      dst.mat.N,
+			Data:   dst.mat.Data,
+			Stride: dst.mat.Stride,
+		},
+		cap: dst.mat.N,
+	}
+	u.Copy(c.chol)
+
+	_, ok := lapack64.Potri(u.mat)
+	if !ok {
+		return Condition(math.Inf(1)) // Potri reported failure: report an infinite condition number
+	}
+	if c.cond > ConditionTolerance {
+		return Condition(c.cond)
+	}
+	return nil
+}
+
+// Scale multiplies the original matrix A by a positive constant using
+// its Cholesky decomposition, storing the result in-place into the receiver.
+// That is, if the original Cholesky factorization is
+//
+//	Uᵀ * U = A
+//
+// the updated factorization is
+//
+//	U'ᵀ * U' = f A = A'
+//
+// Scale panics if the constant is non-positive, or if the receiver is non-empty
+// and is of a different size from the input.
+func (c *Cholesky) Scale(f float64, orig *Cholesky) {
+	if !orig.valid() {
+		panic(badCholesky)
+	}
+	if f <= 0 {
+		panic("cholesky: scaling by a non-positive constant")
+	}
+	n := orig.SymmetricDim()
+	if c.chol == nil {
+		c.chol = NewTriDense(n, Upper, nil)
+	} else {
+		c.chol.reuseAsNonZeroed(n, Upper) // as in SetFromU, so a receiver emptied by Reset is accepted
+	}
+	c.chol.ScaleTri(math.Sqrt(f), orig.chol)
+	c.cond = orig.cond // Scaling by a positive constant does not change the condition number.
+}
+
+// ExtendVecSym computes the Cholesky decomposition of the original matrix A,
+// whose Cholesky decomposition is in a, extended by the vector v according to
+//
+//	[A  w]
+//	[w' k]
+//
+// where, with n = a.SymmetricDim(), k = v[n] and w = v[:n]. The result is stored
+// into the receiver. In order for the updated matrix to be positive definite, it
+// must be the case that k > w' A^-1 w. If this condition does not hold then
+// ExtendVecSym will return false and the receiver will not be updated.
+//
+// ExtendVecSym will panic if v.Len() != a.SymmetricDim()+1 or if a does not contain
+// a valid decomposition.
+func (c *Cholesky) ExtendVecSym(a *Cholesky, v Vector) (ok bool) {
+	n := a.SymmetricDim()
+
+	if v.Len() != n+1 {
+		panic(badSliceLength)
+	}
+	if !a.valid() {
+		panic(badCholesky)
+	}
+
+	// The algorithm is commented here, but see also
+	//  https://math.stackexchange.com/questions/955874/cholesky-factor-when-adding-a-row-and-column-to-already-factorized-matrix
+	// We have A and want to compute the Cholesky of
+	//  [A  w]
+	//  [w' k]
+	// We want
+	//  [U c]
+	//  [0 d]
+	// to be the updated Cholesky, and so it must be that
+	//  [A  w] = [U' 0] [U c]
+	//  [w' k]   [c' d] [0 d]
+	// Thus, we need
+	//  1) A = U'U (true by the original decomposition being valid),
+	//  2) U' * c = w  =>  c = U'^-1 w
+	//  3) c'*c + d'*d = k  =>  d = sqrt(k-c'*c)
+
+	// First, compute c = U'^-1 w (assumes CopyVec fills w from the leading n entries of v).
+	w := NewVecDense(n, nil)
+	w.CopyVec(v)
+	k := v.At(n, 0)
+
+	var t VecDense
+	_ = t.SolveVec(a.chol.T(), w)
+
+	dot := Dot(&t, &t)
+	if dot >= k {
+		return false
+	}
+	d := math.Sqrt(k - dot)
+
+	newU := NewTriDense(n+1, Upper, nil)
+	newU.Copy(a.chol)
+	for i := 0; i < n; i++ {
+		newU.SetTri(i, n, t.At(i, 0))
+	}
+	newU.SetTri(n, n, d)
+	c.chol = newU
+	c.updateCond(-1)
+	return true
+}
+
+// SymRankOne performs a rank-1 update of the original matrix A and refactorizes
+// its Cholesky factorization, storing the result into the receiver. That is, if
+// in the original Cholesky factorization
+//
+//	Uᵀ * U = A,
+//
+// in the updated factorization
+//
+//	U'ᵀ * U' = A + alpha * x * xᵀ = A'.
+//
+// Note that when alpha is negative, the updating problem may be ill-conditioned
+// and the results may be inaccurate, or the updated matrix A' may not be
+// positive definite and not have a Cholesky factorization. SymRankOne returns
+// whether the updated matrix A' is positive definite. If the update fails
+// the receiver is left unchanged.
+//
+// SymRankOne updates a Cholesky factorization in O(n²) time. The Cholesky
+// factorization computation from scratch is O(n³).
+func (c *Cholesky) SymRankOne(orig *Cholesky, alpha float64, x Vector) (ok bool) {
+	if !orig.valid() {
+		panic(badCholesky)
+	}
+	n := orig.SymmetricDim()
+	if r, c := x.Dims(); r != n || c != 1 {
+		panic(ErrShape)
+	}
+	if orig != c {
+		if c.chol == nil {
+			c.chol = NewTriDense(n, Upper, nil)
+		} else if c.chol.mat.N != n {
+			panic(ErrShape)
+		}
+		c.chol.Copy(orig.chol)
+	}
+
+	if alpha == 0 {
+		return true
+	}
+
+	// Algorithms for updating and downdating the Cholesky factorization are
+	// described, for example, in
+	// - J. J. Dongarra, J. R. Bunch, C. B. Moler, G. W. Stewart: LINPACK
+	//   Users' Guide. SIAM (1979), pages 10.10--10.14
+	// or
+	// - P. E. Gill, G. H. Golub, W. Murray, and M. A. Saunders: Methods for
+	//   modifying matrix factorizations. Mathematics of Computation 28(126)
+	//   (1974), Method C3 on page 521
+	//
+	// The implementation is based on LINPACK code
+	// http://www.netlib.org/linpack/dchud.f
+	// http://www.netlib.org/linpack/dchdd.f
+	// and
+	// https://icl.cs.utk.edu/lapack-forum/viewtopic.php?f=2&t=2646
+	//
+	// According to http://icl.cs.utk.edu/lapack-forum/archives/lapack/msg00301.html
+	// LINPACK is released under BSD license.
+	//
+	// See also:
+	// - M. A. Saunders: Large-scale Linear Programming Using the Cholesky
+	//   Factorization. Technical Report Stanford University (1972)
+	//   http://i.stanford.edu/pub/cstr/reports/cs/tr/72/252/CS-TR-72-252.pdf
+	// - Matthias Seeger: Low rank updates for the Cholesky decomposition.
+	//   EPFL Technical Report 161468 (2004)
+	//   http://infoscience.epfl.ch/record/161468
+
+	work := getFloat64s(n, false)
+	defer putFloat64s(work)
+	var xmat blas64.Vector
+	if rv, ok := x.(RawVectorer); ok {
+		xmat = rv.RawVector()
+	} else {
+		tmp := NewVecDense(n, nil) // CopyVec needs a sized, non-nil receiver.
+		tmp.CopyVec(x)
+		xmat = tmp.RawVector()
+	}
+	blas64.Copy(xmat, blas64.Vector{N: n, Data: work, Inc: 1})
+
+	if alpha > 0 {
+		// Compute rank-1 update.
+		if alpha != 1 {
+			blas64.Scal(math.Sqrt(alpha), blas64.Vector{N: n, Data: work, Inc: 1})
+		}
+		umat := c.chol.mat
+		stride := umat.Stride
+		for i := 0; i < n; i++ {
+			// Compute parameters of the Givens matrix that zeroes
+			// the i-th element of x.
+			c, s, r, _ := blas64.Rotg(umat.Data[i*stride+i], work[i])
+			if r < 0 {
+				// Multiply by -1 to have positive diagonal
+				// elements.
+				r *= -1
+				c *= -1
+				s *= -1
+			}
+			umat.Data[i*stride+i] = r
+			if i < n-1 {
+				// Multiply the extended factorization matrix by
+				// the Givens matrix from the left. Only
+				// the i-th row and x are modified.
+				blas64.Rot(
+					blas64.Vector{N: n - i - 1, Data: umat.Data[i*stride+i+1 : i*stride+n], Inc: 1},
+					blas64.Vector{N: n - i - 1, Data: work[i+1 : n], Inc: 1},
+					c, s)
+			}
+		}
+		c.updateCond(-1)
+		return true
+	}
+
+	// Compute rank-1 downdate.
+	alpha = math.Sqrt(-alpha)
+	if alpha != 1 {
+		blas64.Scal(alpha, blas64.Vector{N: n, Data: work, Inc: 1})
+	}
+	// Solve Uᵀ * p = x storing the result into work.
+	ok = lapack64.Trtrs(blas.Trans, c.chol.RawTriangular(), blas64.General{
+		Rows:   n,
+		Cols:   1,
+		Stride: 1,
+		Data:   work,
+	})
+	if !ok {
+		// The original matrix is singular. Should not happen, because
+		// the factorization is valid.
+		panic(badCholesky)
+	}
+	norm := blas64.Nrm2(blas64.Vector{N: n, Data: work, Inc: 1})
+	if norm >= 1 {
+		// The updated matrix is not positive definite.
+		return false
+	}
+	norm = math.Sqrt((1 + norm) * (1 - norm))
+	cos := getFloat64s(n, false)
+	defer putFloat64s(cos)
+	sin := getFloat64s(n, false)
+	defer putFloat64s(sin)
+	for i := n - 1; i >= 0; i-- {
+		// Compute parameters of Givens matrices that zero elements of p
+		// backwards.
+		cos[i], sin[i], norm, _ = blas64.Rotg(norm, work[i])
+		if norm < 0 {
+			norm *= -1
+			cos[i] *= -1
+			sin[i] *= -1
+		}
+	}
+	workMat := getTriDenseWorkspace(c.chol.mat.N, c.chol.triKind(), false)
+	defer putTriWorkspace(workMat)
+	workMat.Copy(c.chol)
+	umat := workMat.mat
+	stride := workMat.mat.Stride
+	for i := n - 1; i >= 0; i-- {
+		work[i] = 0
+		// Apply Givens matrices to U.
+		blas64.Rot(
+			blas64.Vector{N: n - i, Data: work[i:n], Inc: 1},
+			blas64.Vector{N: n - i, Data: umat.Data[i*stride+i : i*stride+n], Inc: 1},
+			cos[i], sin[i])
+		if umat.Data[i*stride+i] == 0 {
+			// The matrix is singular (may rarely happen due to
+			// floating-point effects?).
+			ok = false
+		} else if umat.Data[i*stride+i] < 0 {
+			// Diagonal elements should be positive. If it happens
+			// that on the i-th row the diagonal is negative,
+			// multiply U from the left by an identity matrix that
+			// has -1 on the i-th row.
+			blas64.Scal(-1, blas64.Vector{N: n - i, Data: umat.Data[i*stride+i : i*stride+n], Inc: 1})
+		}
+	}
+	if ok {
+		c.chol.Copy(workMat)
+		c.updateCond(-1)
+	}
+	return ok
+}
+
+func (c *Cholesky) valid() bool {
+	return c.chol != nil && !c.chol.IsEmpty()
+}
+
+// BandCholesky is a symmetric positive-definite band matrix represented by its
+// Cholesky decomposition.
+//
+// Note that this matrix representation is useful for certain operations, in
+// particular finding solutions to linear equations. It is very inefficient at
+// other operations, in particular At is slow.
+//
+// BandCholesky methods may only be called on a value that has been successfully
+// initialized by a call to Factorize that has returned true. Calls to methods
+// of an unsuccessful Cholesky factorization will panic.
+type BandCholesky struct {
+	// The chol pointer must never be retained as a pointer outside the Cholesky
+	// struct, either by returning chol outside the struct or by setting it to
+	// a pointer coming from outside. The same prohibition applies to the data
+	// slice within chol.
+	chol *TriBandDense
+	cond float64
+}
+
+// Factorize calculates the Cholesky decomposition of the matrix A and returns
+// whether the matrix is positive definite. If Factorize returns false, the
+// factorization must not be used.
+func (ch *BandCholesky) Factorize(a SymBanded) (ok bool) {
+	n, k := a.SymBand()
+	if ch.chol == nil {
+		ch.chol = NewTriBandDense(n, k, Upper, nil)
+	} else {
+		ch.chol.Reset()
+		ch.chol.ReuseAsTriBand(n, k, Upper)
+	}
+	copySymBandIntoTriBand(ch.chol, a)
+	cSym := blas64.SymmetricBand{
+		Uplo:   blas.Upper,
+		N:      n,
+		K:      k,
+		Data:   ch.chol.RawTriBand().Data,
+		Stride: ch.chol.RawTriBand().Stride,
+	}
+	_, ok = lapack64.Pbtrf(cSym)
+	if !ok {
+		ch.Reset()
+		return false
+	}
+	work := getFloat64s(3*n, false)
+	iwork := getInts(n, false)
+	aNorm := lapack64.Lansb(CondNorm, cSym, work)
+	ch.cond = 1 / lapack64.Pbcon(cSym, aNorm, work, iwork)
+	putInts(iwork)
+	putFloat64s(work)
+	return true
+}
+
+// SolveTo finds the matrix X that solves A * X = B where A is represented by
+// the Cholesky decomposition. The result is stored in-place into dst.
+// If the Cholesky decomposition is singular or near-singular a Condition error
+// is returned. See the documentation for Condition for more information.
+func (ch *BandCholesky) SolveTo(dst *Dense, b Matrix) error {
+	if !ch.valid() {
+		panic(badCholesky)
+	}
+	br, bc := b.Dims()
+	if br != ch.chol.mat.N {
+		panic(ErrShape)
+	}
+	dst.reuseAsNonZeroed(br, bc)
+	if b != dst {
+		dst.Copy(b)
+	}
+	lapack64.Pbtrs(ch.chol.mat, dst.mat)
+	if ch.cond > ConditionTolerance {
+		return Condition(ch.cond)
+	}
+	return nil
+}
+
+// SolveVecTo finds the vector x that solves A * x = b where A is represented by
+// the Cholesky decomposition. The result is stored in-place into dst.
+// If the Cholesky decomposition is singular or near-singular a Condition error
+// is returned. See the documentation for Condition for more information.
+func (ch *BandCholesky) SolveVecTo(dst *VecDense, b Vector) error {
+	if !ch.valid() {
+		panic(badCholesky)
+	}
+	n := ch.chol.mat.N
+	if br, bc := b.Dims(); br != n || bc != 1 {
+		panic(ErrShape)
+	}
+	if b, ok := b.(RawVectorer); ok && dst != b {
+		dst.checkOverlap(b.RawVector())
+	}
+	dst.reuseAsNonZeroed(n)
+	if dst != b {
+		dst.CopyVec(b)
+	}
+	lapack64.Pbtrs(ch.chol.mat, dst.asGeneral())
+	if ch.cond > ConditionTolerance {
+		return Condition(ch.cond)
+	}
+	return nil
+}
+
+// Cond returns the condition number of the factorized matrix.
+func (ch *BandCholesky) Cond() float64 {
+	if !ch.valid() {
+		panic(badCholesky)
+	}
+	return ch.cond
+}
+
+// Reset resets the factorization so that it can be reused as the receiver of
+// a dimensionally restricted operation.
+func (ch *BandCholesky) Reset() {
+	if ch.chol != nil {
+		ch.chol.Reset()
+	}
+	ch.cond = math.Inf(1)
+}
+
+// Dims returns the dimensions of the matrix.
+func (ch *BandCholesky) Dims() (r, c int) {
+	n := ch.SymmetricDim()
+	return n, n
+}
+
+// At returns the element at row i, column j.
+func (ch *BandCholesky) At(i, j int) float64 {
+	n, k := ch.SymBand()
+	if uint(i) >= uint(n) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	if i > j {
+		i, j = j, i
+	}
+	if j-i > k {
+		return 0
+	}
+	var aij float64
+	for k := max(0, j-k); k <= i; k++ {
+		aij += ch.chol.at(k, i) * ch.chol.at(k, j)
+	}
+	return aij
+}
+
+// T returns the receiver, the transpose of a symmetric matrix.
+func (ch *BandCholesky) T() Matrix {
+	return ch
+}
+
+// TBand returns the receiver, the transpose of a symmetric band matrix.
+func (ch *BandCholesky) TBand() Banded {
+	return ch
+}
+
+// SymmetricDim implements the Symmetric interface and returns the number of rows
+// in the matrix (this is also the number of columns).
+func (ch *BandCholesky) SymmetricDim() int {
+	if ch.chol == nil {
+		return 0
+	}
+	n, _ := ch.chol.Triangle()
+	return n
+}
+
+// Bandwidth returns the lower and upper bandwidth values for the matrix.
+// The total bandwidth of the matrix is kl+ku+1.
+func (ch *BandCholesky) Bandwidth() (kl, ku int) {
+	_, k, _ := ch.chol.TriBand()
+	return k, k
+}
+
+// SymBand returns the number of rows/columns in the matrix, and the size of the
+// bandwidth. The total bandwidth of the matrix is 2*k+1.
+func (ch *BandCholesky) SymBand() (n, k int) {
+	n, k, _ = ch.chol.TriBand()
+	return n, k
+}
+
+// IsEmpty returns whether the receiver is empty; a nil or never-factorized
+// receiver is empty. Empty matrices can be the receiver for dimensionally
+// restricted operations. The receiver can be emptied using Reset.
+func (ch *BandCholesky) IsEmpty() bool {
+	return ch == nil || ch.chol == nil || ch.chol.IsEmpty()
+}
+
+// Det returns the determinant of the matrix that has been factorized.
+func (ch *BandCholesky) Det() float64 {
+	if !ch.valid() {
+		panic(badCholesky)
+	}
+	return math.Exp(ch.LogDet())
+}
+
+// LogDet returns the log of the determinant of the matrix that has been factorized.
+func (ch *BandCholesky) LogDet() float64 {
+	if !ch.valid() {
+		panic(badCholesky)
+	}
+	var det float64
+	for i := 0; i < ch.chol.mat.N; i++ {
+		det += 2 * math.Log(ch.chol.mat.Data[i*ch.chol.mat.Stride])
+	}
+	return det
+}
+
+func (ch *BandCholesky) valid() bool {
+	return ch.chol != nil && !ch.chol.IsEmpty()
+}
+
+// PivotedCholesky is a symmetric positive semi-definite matrix represented by
+// its Cholesky factorization with complete pivoting.
+//
+// The factorization has the form
+//
+//	Pᵀ * A * P = Uᵀ * U
+//
+// where U is an upper triangular matrix and P is a permutation matrix.
+//
+// PivotedCholesky methods may only be called on a receiver that has been
+// initialized by a call to Factorize. SolveTo and SolveVecTo methods may only
+// be called if Factorize has returned true.
+//
+// If the matrix A is certainly positive definite, then the unpivoted Cholesky
+// could be more efficient, especially for smaller matrices.
+type PivotedCholesky struct {
+	chol          *TriDense // The factor U
+	piv, pivTrans []int     // The permutation matrices P and Pᵀ
+	rank          int       // The computed rank of A
+
+	ok   bool    // Indicates whether the factorization can be used for solving linear systems
+	cond float64 // The condition number when ok is true
+}
+
+// Factorize computes the Cholesky factorization of the symmetric positive
+// semi-definite matrix A and returns whether the matrix is positive definite.
+// If Factorize returns false, the SolveTo methods must not be used.
+//
+// tol is a tolerance used to determine the computed rank of A. If it is
+// negative, a default value will be used.
+func (c *PivotedCholesky) Factorize(a Symmetric, tol float64) (ok bool) {
+	n := a.SymmetricDim()
+	c.reset(n)
+	copySymIntoTriangle(c.chol, a)
+
+	work := getFloat64s(3*c.chol.mat.N, false)
+	defer putFloat64s(work)
+
+	sym := c.chol.asSymBlas()
+	aNorm := lapack64.Lansy(CondNorm, sym, work)
+	_, c.rank, c.ok = lapack64.Pstrf(sym, c.piv, tol, work)
+	if c.ok {
+		iwork := getInts(n, false)
+		defer putInts(iwork)
+		c.cond = 1 / lapack64.Pocon(sym, aNorm, work, iwork)
+	} else {
+		for i := c.rank; i < n; i++ {
+			zero(sym.Data[i*sym.Stride+i : i*sym.Stride+n])
+		}
+	}
+	for i, p := range c.piv {
+		c.pivTrans[p] = i
+	}
+
+	return c.ok
+}
+
+// reset prepares the receiver for factorization of matrices of size n.
+func (c *PivotedCholesky) reset(n int) {
+	if c.chol == nil {
+		c.chol = NewTriDense(n, Upper, nil)
+	} else {
+		c.chol.Reset()
+		c.chol.reuseAsNonZeroed(n, Upper)
+	}
+	c.piv = useInt(c.piv, n)
+	c.pivTrans = useInt(c.pivTrans, n)
+	c.rank = 0
+	c.ok = false
+	c.cond = math.Inf(1)
+}
+
+// Dims returns the dimensions of the matrix A.
+func (ch *PivotedCholesky) Dims() (r, c int) {
+	n := ch.SymmetricDim()
+	return n, n
+}
+
+// At returns the element of A at row i, column j.
+func (c *PivotedCholesky) At(i, j int) float64 {
+	n := c.SymmetricDim()
+	if uint(i) >= uint(n) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	i = c.pivTrans[i]
+	j = c.pivTrans[j]
+	minij := min(min(i+1, j+1), c.rank)
+	var val float64
+	for k := 0; k < minij; k++ {
+		val += c.chol.at(k, i) * c.chol.at(k, j)
+	}
+	return val
+}
+
+// T returns the receiver, the transpose of a symmetric matrix.
+func (c *PivotedCholesky) T() Matrix {
+	return c
+}
+
+// SymmetricDim implements the Symmetric interface and returns the number of
+// rows (or columns) in the matrix. It returns 0 if Factorize was never called.
+func (c *PivotedCholesky) SymmetricDim() int {
+	if c.chol == nil {
+		return 0
+	}
+	n, _ := c.chol.Triangle()
+	return n
+}
+
+// Rank returns the computed rank of the matrix A.
+func (c *PivotedCholesky) Rank() int {
+	if c.chol == nil {
+		panic(badCholesky)
+	}
+	return c.rank
+}
+
+// Cond returns the condition number of the factorized matrix.
+func (c *PivotedCholesky) Cond() float64 {
+	if c.chol == nil {
+		panic(badCholesky)
+	}
+	return c.cond
+}
+
+// RawU returns the Triangular matrix used to store the Cholesky factorization
+// of the original matrix A. If the returned matrix is modified, the
+// factorization is invalid and should not be used.
+//
+// If Factorize returned false, the rows of U from Rank to n will contain zeros
+// and so U will be upper trapezoidal.
+//
+// If Factorize has not been called, RawU will return nil.
+func (c *PivotedCholesky) RawU() Triangular {
+	if c.chol == nil {
+		return nil
+	}
+	return c.chol
+}
+
+// UTo stores the n×n upper triangular matrix U from the Cholesky factorization
+//
+//	Pᵀ * A * P = Uᵀ * U.
+//
+// into dst. If dst is empty, it is resized to be an n×n upper triangular
+// matrix. When dst is non-empty, UTo panics if dst is not n×n or not Upper.
+//
+// If Factorize returned false, the rows of U from Rank to n will contain zeros
+// and so U will be upper trapezoidal.
+func (c *PivotedCholesky) UTo(dst *TriDense) {
+	if c.chol == nil {
+		panic(badCholesky)
+	}
+	n := c.chol.mat.N
+	if dst.IsEmpty() {
+		dst.ReuseAsTri(n, Upper)
+	} else {
+		n2, kind := dst.Triangle()
+		if n != n2 {
+			panic(ErrShape)
+		}
+		if kind != Upper {
+			panic(ErrTriangle)
+		}
+	}
+	dst.Copy(c.chol)
+}
+
+// ColumnPivots returns the column permutation p that represents the permutation
+// matrix P from the Cholesky factorization
+//
+//	Pᵀ * A * P = Uᵀ * U
+//
+// such that the nonzero entries are P[p[k],k] = 1.
+func (c *PivotedCholesky) ColumnPivots(dst []int) []int {
+	if c.chol == nil {
+		panic(badCholesky)
+	}
+	n := c.chol.mat.N
+	if dst == nil {
+		dst = make([]int, n)
+	}
+	if len(dst) != n {
+		panic(badSliceLength)
+	}
+	copy(dst, c.piv)
+	return dst
+}
+
+// SolveTo finds the matrix X that solves A * X = B where A is represented by
+// the Cholesky decomposition. The result is stored in-place into dst. If the
+// Cholesky decomposition is singular or near-singular, a Condition error is
+// returned. See the documentation for Condition for more information.
+//
+// If Factorize returned false, SolveTo will panic.
+func (c *PivotedCholesky) SolveTo(dst *Dense, b Matrix) error {
+	if !c.ok {
+		panic(badCholesky)
+	}
+	n := c.chol.mat.N
+	bm, bn := b.Dims()
+	if n != bm {
+		panic(ErrShape)
+	}
+
+	dst.reuseAsNonZeroed(bm, bn)
+	if dst != b {
+		dst.Copy(b)
+	}
+
+	// Permute rows of B: D = Pᵀ * B.
+	lapack64.Lapmr(true, dst.mat, c.piv)
+	// Solve Uᵀ * U * Y = D.
+	lapack64.Potrs(c.chol.mat, dst.mat)
+	// Permute rows of Y to recover the solution: X = P * Y.
+	lapack64.Lapmr(false, dst.mat, c.piv)
+
+	if c.cond > ConditionTolerance {
+		return Condition(c.cond)
+	}
+	return nil
+}
+
+// SolveVecTo finds the vector x that solves A * x = b where A is represented by
+// the Cholesky decomposition. The result is stored in-place into dst. If the
+// Cholesky decomposition is singular or near-singular, a Condition error is
+// returned. See the documentation for Condition for more information.
+//
+// If Factorize returned false, SolveVecTo will panic.
+func (c *PivotedCholesky) SolveVecTo(dst *VecDense, b Vector) error {
+	if !c.ok {
+		panic(badCholesky)
+	}
+	n := c.chol.mat.N
+	if br, bc := b.Dims(); br != n || bc != 1 {
+		panic(ErrShape)
+	}
+	if b, ok := b.(RawVectorer); ok && dst != b {
+		dst.checkOverlap(b.RawVector())
+	}
+
+	dst.reuseAsNonZeroed(n)
+	if dst != b {
+		dst.CopyVec(b)
+	}
+
+	// Permute rows of B: D = Pᵀ * B.
+	lapack64.Lapmr(true, dst.asGeneral(), c.piv)
+	// Solve Uᵀ * U * Y = D.
+	lapack64.Potrs(c.chol.mat, dst.asGeneral())
+	// Permute rows of Y to recover the solution: X = P * Y.
+	lapack64.Lapmr(false, dst.asGeneral(), c.piv)
+
+	if c.cond > ConditionTolerance {
+		return Condition(c.cond)
+	}
+	return nil
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/cmatrix.go b/vendor/gonum.org/v1/gonum/mat/cmatrix.go
new file mode 100644
index 00000000000..336645751da
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/cmatrix.go
@@ -0,0 +1,314 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+	"math/cmplx"
+
+	"gonum.org/v1/gonum/blas/cblas128"
+	"gonum.org/v1/gonum/floats/scalar"
+)
+
+// CMatrix is the basic matrix interface type for complex matrices.
+type CMatrix interface {
+	// Dims returns the dimensions of a CMatrix.
+	Dims() (r, c int)
+
+	// At returns the value of a matrix element at row i, column j.
+	// It will panic if i or j are out of bounds for the matrix.
+	At(i, j int) complex128
+
+	// H returns the conjugate transpose of the CMatrix. Whether H
+	// returns a copy of the underlying data is implementation dependent.
+	// This method may be implemented using the ConjTranspose type, which
+	// provides an implicit matrix conjugate transpose.
+	H() CMatrix
+
+	// T returns the transpose of the CMatrix. Whether T returns a copy of the
+	// underlying data is implementation dependent.
+	// This method may be implemented using the CTranspose type, which
+	// provides an implicit matrix transpose.
+	T() CMatrix
+}
+
+// A RawCMatrixer can return a cblas128.General representation of the receiver. Changes to the cblas128.General.Data
+// slice will be reflected in the original matrix, changes to the Rows, Cols and Stride fields will not.
+type RawCMatrixer interface {
+	RawCMatrix() cblas128.General
+}
+
+var (
+	_ CMatrix          = ConjTranspose{}
+	_ UnConjTransposer = ConjTranspose{}
+)
+
+// ConjTranspose is a type for performing an implicit matrix conjugate transpose.
+// It implements the CMatrix interface, returning values from the conjugate
+// transpose of the matrix within.
+type ConjTranspose struct {
+	CMatrix CMatrix
+}
+
+// At returns the value of the element at row i and column j of the conjugate
+// transposed matrix, that is, row j and column i of the CMatrix field.
+func (t ConjTranspose) At(i, j int) complex128 {
+	z := t.CMatrix.At(j, i)
+	return cmplx.Conj(z)
+}
+
+// Dims returns the dimensions of the transposed matrix. The number of rows returned
+// is the number of columns in the CMatrix field, and the number of columns is
+// the number of rows in the CMatrix field.
+func (t ConjTranspose) Dims() (r, c int) {
+	c, r = t.CMatrix.Dims()
+	return r, c
+}
+
+// H performs an implicit conjugate transpose by returning the CMatrix field.
+func (t ConjTranspose) H() CMatrix {
+	return t.CMatrix
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// CTranspose.
+func (t ConjTranspose) T() CMatrix {
+	return CTranspose{t}
+}
+
+// UnConjTranspose returns the CMatrix field.
+func (t ConjTranspose) UnConjTranspose() CMatrix {
+	return t.CMatrix
+}
+
+// CTranspose is a type for performing an implicit matrix transpose.
+// It implements the CMatrix interface, returning values from the
+// transpose (without conjugation) of the matrix within.
+type CTranspose struct {
+	CMatrix CMatrix
+}
+
+// At returns the value of the element at row i and column j of the
+// transposed matrix, that is, row j and column i of the CMatrix field.
+func (t CTranspose) At(i, j int) complex128 {
+	return t.CMatrix.At(j, i)
+}
+
+// Dims returns the dimensions of the transposed matrix. The number of rows returned
+// is the number of columns in the CMatrix field, and the number of columns is
+// the number of rows in the CMatrix field.
+func (t CTranspose) Dims() (r, c int) {
+	c, r = t.CMatrix.Dims()
+	return r, c
+}
+
+// H performs an implicit conjugate transpose by returning the receiver
+// inside a ConjTranspose.
+func (t CTranspose) H() CMatrix {
+	return ConjTranspose{t}
+}
+
+// T performs an implicit transpose by returning the CMatrix field.
+func (t CTranspose) T() CMatrix {
+	return t.CMatrix
+}
+
+// Untranspose returns the CMatrix field.
+func (t CTranspose) Untranspose() CMatrix {
+	return t.CMatrix
+}
+
+// UnConjTransposer is a type that can undo an implicit conjugate transpose.
+type UnConjTransposer interface {
+	// UnConjTranspose returns the underlying CMatrix stored for the implicit
+	// conjugate transpose.
+	UnConjTranspose() CMatrix
+
+	// Note: This interface is needed to unify all of the Conjugate types. In
+	// the cmat128 methods, we need to test if the CMatrix has been implicitly
+	// transposed. If this is checked by testing for the specific Conjugate type
+	// then the behavior will be different if the user uses H() or HTri() for a
+	// triangular matrix.
+}
+
+// CUntransposer is a type that can undo an implicit transpose.
+type CUntransposer interface {
+	// Untranspose returns the underlying CMatrix stored for the implicit
+	// transpose.
+	Untranspose() CMatrix
+
+	// Note: This interface is needed to unify all of the CTranspose types. In
+	// the cmat128 methods, we need to test if the CMatrix has been implicitly
+	// transposed. If this is checked by testing for the specific CTranspose type
+	// then the behavior will be different if the user uses T() or TTri() for a
+	// triangular matrix.
+}
+
+// useC returns a complex128 slice with l elements, using c if it
+// has the necessary capacity, otherwise creating a new slice.
+func useC(c []complex128, l int) []complex128 {
+	if l <= cap(c) {
+		return c[:l]
+	}
+	return make([]complex128, l)
+}
+
+// useZeroedC returns a complex128 slice with l elements, using c if it
+// has the necessary capacity, otherwise creating a new slice. The
+// elements of the returned slice are guaranteed to be zero.
+func useZeroedC(c []complex128, l int) []complex128 {
+	if l <= cap(c) {
+		c = c[:l]
+		zeroC(c)
+		return c
+	}
+	return make([]complex128, l)
+}
+
+// zeroC zeros the given slice's elements.
+func zeroC(c []complex128) {
+	for i := range c {
+		c[i] = 0
+	}
+}
+
+// untransposeCmplx untransposes a matrix if applicable. If a is a CUntransposer
+// or an UnConjTransposer, then untransposeCmplx returns the underlying matrix and
+// true for the kind of transpose (potentially both, when one wraps the other).
+// If it is not, then it returns the input matrix and false for trans and conj.
+func untransposeCmplx(a CMatrix) (u CMatrix, trans, conj bool) {
+	switch ut := a.(type) {
+	case CUntransposer:
+		trans = true
+		u := ut.Untranspose()
+		if uc, ok := u.(UnConjTransposer); ok {
+			return uc.UnConjTranspose(), trans, true
+		}
+		return u, trans, false
+	case UnConjTransposer:
+		conj = true
+		u := ut.UnConjTranspose()
+		if ut, ok := u.(CUntransposer); ok {
+			return ut.Untranspose(), true, conj
+		}
+		return u, false, conj
+	default:
+		return a, false, false
+	}
+}
+
+// untransposeExtractCmplx returns an untransposed matrix in a built-in matrix type.
+//
+// The untransposed matrix is returned unaltered if it is a built-in matrix type.
+// Otherwise, if it implements a Raw method, an appropriate built-in type value
+// is returned holding the raw matrix value of the input. If neither of these
+// is possible, the untransposed matrix is returned.
+func untransposeExtractCmplx(a CMatrix) (u CMatrix, trans, conj bool) {
+	ut, trans, conj := untransposeCmplx(a)
+	switch m := ut.(type) {
+	case *CDense:
+		return m, trans, conj
+	case RawCMatrixer:
+		var d CDense
+		d.SetRawCMatrix(m.RawCMatrix())
+		return &d, trans, conj
+	default:
+		return ut, trans, conj
+	}
+}
+
+// CEqual returns whether the matrices a and b have the same size
+// and are element-wise equal.
+func CEqual(a, b CMatrix) bool {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		return false
+	}
+	// TODO(btracey): Add in fast-paths.
+	for i := 0; i < ar; i++ {
+		for j := 0; j < ac; j++ {
+			if a.At(i, j) != b.At(i, j) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// CEqualApprox returns whether the matrices a and b have the same size and contain all equal
+// elements with tolerance for element-wise equality specified by epsilon. Matrices
+// with non-equal shapes are not equal.
+func CEqualApprox(a, b CMatrix, epsilon float64) bool {
+	// TODO(btracey):
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		return false
+	}
+	for i := 0; i < ar; i++ {
+		for j := 0; j < ac; j++ {
+			if !cEqualWithinAbsOrRel(a.At(i, j), b.At(i, j), epsilon, epsilon) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// TODO(btracey): Move these into a cmplxs if/when we have one.
+
+func cEqualWithinAbsOrRel(a, b complex128, absTol, relTol float64) bool {
+	if cEqualWithinAbs(a, b, absTol) {
+		return true
+	}
+	return cEqualWithinRel(a, b, relTol)
+}
+
+// cEqualWithinAbs returns true if a and b are equal or have an absolute
+// difference not greater than tol.
+func cEqualWithinAbs(a, b complex128, tol float64) bool {
+	return a == b || cmplx.Abs(a-b) <= tol
+}
+
+const minNormalFloat64 = 2.2250738585072014e-308
+
+// cEqualWithinRel returns true if the difference between a and b
+// is not greater than tol times the greater value.
+func cEqualWithinRel(a, b complex128, tol float64) bool {
+	if a == b {
+		return true
+	}
+	if cmplx.IsNaN(a) || cmplx.IsNaN(b) {
+		return false
+	}
+	// Cannot play the same trick as in floats/scalar because there are multiple
+	// possible infinities.
+	if cmplx.IsInf(a) {
+		if !cmplx.IsInf(b) {
+			return false
+		}
+		ra := real(a)
+		if math.IsInf(ra, 0) {
+			if ra == real(b) {
+				return scalar.EqualWithinRel(imag(a), imag(b), tol)
+			}
+			return false
+		}
+		if imag(a) == imag(b) {
+			return scalar.EqualWithinRel(ra, real(b), tol)
+		}
+		return false
+	}
+	if cmplx.IsInf(b) {
+		return false
+	}
+
+	delta := cmplx.Abs(a - b)
+	if delta <= minNormalFloat64 {
+		return delta <= tol*minNormalFloat64
+	}
+	return delta/math.Max(cmplx.Abs(a), cmplx.Abs(b)) <= tol
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/consts.go b/vendor/gonum.org/v1/gonum/mat/consts.go
new file mode 100644
index 00000000000..3de3f5bf47d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/consts.go
@@ -0,0 +1,15 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+// TriKind represents the triangularity of the matrix.
+type TriKind bool
+
+const (
+	// Upper specifies an upper triangular matrix.
+	Upper TriKind = true
+	// Lower specifies a lower triangular matrix.
+	Lower TriKind = false
+)
diff --git a/vendor/gonum.org/v1/gonum/mat/dense.go b/vendor/gonum.org/v1/gonum/mat/dense.go
new file mode 100644
index 00000000000..b08360cc704
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/dense.go
@@ -0,0 +1,670 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+// Compile-time assertions that *Dense satisfies each of the interfaces
+// listed below. dense is a typed nil pointer used as the assigned value.
+var (
+	dense *Dense
+
+	_ Matrix      = dense
+	_ allMatrix   = dense
+	_ denseMatrix = dense
+	_ Mutable     = dense
+
+	_ ClonerFrom   = dense
+	_ RowViewer    = dense
+	_ ColViewer    = dense
+	_ RawRowViewer = dense
+	_ Grower       = dense
+
+	_ RawMatrixSetter = dense
+	_ RawMatrixer     = dense
+
+	_ Reseter = dense
+)
+
+// Dense is a dense matrix representation.
+type Dense struct {
+	// mat holds the dimensions, stride and backing data of the matrix
+	// as currently visible through Dims and RawMatrix.
+	mat blas64.General
+
+	// capRows and capCols are the row and column capacities reported by
+	// Caps; slicing and growing are bounded by them.
+	capRows, capCols int
+}
+
+// NewDense returns an r×c Dense matrix. When data is nil a new backing slice
+// of length r*c is allocated. Otherwise data must have length exactly r*c and
+// is used as the backing slice, so changes to the elements of the returned
+// Dense will be reflected in data.
+//
+// NewDense panics with ErrZeroLength if r or c is zero, with
+// ErrNegativeDimension if either is negative, and with ErrShape if a non-nil
+// data does not have length r*c.
+//
+// The data must be arranged in row-major order, i.e. the (i*c + j)-th
+// element in the data slice is the {i, j}-th element in the matrix.
+func NewDense(r, c int, data []float64) *Dense {
+	switch {
+	case r == 0 || c == 0:
+		panic(ErrZeroLength)
+	case r < 0 || c < 0:
+		panic(ErrNegativeDimension)
+	}
+	if data == nil {
+		data = make([]float64, r*c)
+	} else if len(data) != r*c {
+		panic(ErrShape)
+	}
+	backing := blas64.General{Rows: r, Cols: c, Stride: c, Data: data}
+	return &Dense{mat: backing, capRows: r, capCols: c}
+}
+
+// ReuseAs turns an empty receiver (see IsEmpty) into an r×c matrix.
+//
+// The existing backing data slice is re-used when its capacity is
+// sufficient, otherwise a new slice is allocated. The backing data is
+// zero on return.
+//
+// ReuseAs panics with ErrZeroLength if r or c is zero, with
+// ErrNegativeDimension if either is negative, and with ErrReuseNonEmpty
+// if the receiver is not empty. To empty the receiver for re-use,
+// Reset should be used.
+func (m *Dense) ReuseAs(r, c int) {
+	switch {
+	case r == 0 || c == 0:
+		panic(ErrZeroLength)
+	case r < 0 || c < 0:
+		panic(ErrNegativeDimension)
+	case !m.IsEmpty():
+		panic(ErrReuseNonEmpty)
+	}
+	m.reuseAsZeroed(r, c)
+}
+
+// reuseAsNonZeroed resizes an empty matrix to a r×c matrix,
+// or checks that a non-empty matrix is r×c. It does not zero
+// the data in the receiver.
+//
+// It panics with badCap if the receiver's dimensions exceed its
+// capacities, with ErrZeroLength if r or c is zero, and with ErrShape
+// if the receiver is non-empty and is not already r×c.
+func (m *Dense) reuseAsNonZeroed(r, c int) {
+	// reuseAsNonZeroed must be kept in sync with reuseAsZeroed.
+	if m.mat.Rows > m.capRows || m.mat.Cols > m.capCols {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	if m.IsEmpty() {
+		// An empty receiver adopts the requested shape with a
+		// contiguous layout (stride equal to the column count).
+		m.mat = blas64.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: c,
+			Data:   use(m.mat.Data, r*c),
+		}
+		m.capRows = r
+		m.capCols = c
+		return
+	}
+	// A non-empty receiver is never resized; its shape must already match.
+	if r != m.mat.Rows || c != m.mat.Cols {
+		panic(ErrShape)
+	}
+}
+
+// reuseAsZeroed resizes an empty matrix to a r×c matrix,
+// or checks that a non-empty matrix is r×c. It zeroes
+// all the elements of the matrix.
+//
+// It panics with badCap if the receiver's dimensions exceed its
+// capacities, with ErrZeroLength if r or c is zero, and with ErrShape
+// if the receiver is non-empty and is not already r×c.
+func (m *Dense) reuseAsZeroed(r, c int) {
+	// reuseAsZeroed must be kept in sync with reuseAsNonZeroed.
+	if m.mat.Rows > m.capRows || m.mat.Cols > m.capCols {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	if m.IsEmpty() {
+		// An empty receiver adopts the requested shape with a
+		// contiguous layout; useZeroed supplies the backing data.
+		m.mat = blas64.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: c,
+			Data:   useZeroed(m.mat.Data, r*c),
+		}
+		m.capRows = r
+		m.capCols = c
+		return
+	}
+	// A non-empty receiver is never resized; its shape must already match.
+	if r != m.mat.Rows || c != m.mat.Cols {
+		panic(ErrShape)
+	}
+	// Only the visible elements of the existing matrix are cleared.
+	m.Zero()
+}
+
+// Zero sets every element of the matrix to zero, clearing one row at a
+// time so that only the visible columns of each row are touched.
+func (m *Dense) Zero() {
+	stride, cols := m.mat.Stride, m.mat.Cols
+	for i, off := 0, 0; i < m.mat.Rows; i, off = i+1, off+stride {
+		zero(m.mat.Data[off : off+cols])
+	}
+}
+
+// isolatedWorkspace obtains a workspace matrix w with the dimensions of a,
+// together with a restore callback intended to be deferred by the caller.
+// When invoked, restore copies w into the receiver and hands w back to the
+// workspace pool. It is meant for methods whose receiver is the same pointer
+// as one of their input arguments.
+//
+// isolatedWorkspace panics with ErrZeroLength if a has a zero dimension.
+func (m *Dense) isolatedWorkspace(a Matrix) (w *Dense, restore func()) {
+	rows, cols := a.Dims()
+	if rows == 0 || cols == 0 {
+		panic(ErrZeroLength)
+	}
+	w = getDenseWorkspace(rows, cols, false)
+	restore = func() {
+		m.Copy(w)
+		putDenseWorkspace(w)
+	}
+	return w, restore
+}
+
+// Reset empties the matrix so that it can be reused as the
+// receiver of a dimensionally restricted operation. The backing
+// slice is truncated to zero length rather than discarded.
+//
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (m *Dense) Reset() {
+	m.mat.Data = m.mat.Data[:0]
+	m.capRows, m.capCols = 0, 0
+	// Rows, Cols and Stride must be zeroed in unison.
+	m.mat.Rows, m.mat.Cols, m.mat.Stride = 0, 0, 0
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using
+// Reset.
+//
+// Emptiness is decided solely by a zero stride.
+func (m *Dense) IsEmpty() bool {
+	// It must be the case that m.Dims() returns
+	// zeros in this case. See comment in Reset().
+	return m.mat.Stride == 0
+}
+
+// asTriDense views the receiver as an n×n TriDense with the given diag and
+// uplo settings. No data is copied: the result uses the receiver's stride
+// and backing slice, and its capacity is n.
+func (m *Dense) asTriDense(n int, diag blas.Diag, uplo blas.Uplo) *TriDense {
+	tri := blas64.Triangular{
+		Uplo:   uplo,
+		Diag:   diag,
+		N:      n,
+		Stride: m.mat.Stride,
+		Data:   m.mat.Data,
+	}
+	return &TriDense{mat: tri, cap: n}
+}
+
+// DenseCopyOf returns a newly allocated copy of the elements of a.
+func DenseCopyOf(a Matrix) *Dense {
+	d := &Dense{}
+	d.CloneFrom(a)
+	return d
+}
+
+// SetRawMatrix replaces the blas64.General underlying the receiver with b
+// and sets the receiver's capacities to b's dimensions. Changes to elements
+// in the receiver following the call will be reflected in b.
+func (m *Dense) SetRawMatrix(b blas64.General) {
+	m.mat = b
+	m.capRows = b.Rows
+	m.capCols = b.Cols
+}
+
+// RawMatrix returns the blas64.General underlying the receiver. The
+// returned value shares its data slice with the receiver, so changes to
+// elements in the receiver following the call will be reflected in it.
+func (m *Dense) RawMatrix() blas64.General {
+	return m.mat
+}
+
+// Dims returns the number of rows r and columns c in the matrix.
+func (m *Dense) Dims() (r, c int) {
+	r, c = m.mat.Rows, m.mat.Cols
+	return r, c
+}
+
+// Caps returns the number of rows r and columns c in the backing matrix,
+// i.e. the receiver's row and column capacities.
+func (m *Dense) Caps() (r, c int) {
+	r, c = m.capRows, m.capCols
+	return r, c
+}
+
+// T performs an implicit transpose by returning the receiver inside a Transpose.
+// The returned value wraps the receiver pointer itself; no elements are copied
+// here.
+func (m *Dense) T() Matrix {
+	return Transpose{m}
+}
+
+// ColView returns a Vector reflecting the column j, backed by the matrix data.
+//
+// See ColViewer for more information.
+func (m *Dense) ColView(j int) Vector {
+	var v VecDense
+	v.ColViewOf(m, j)
+	return &v
+}
+
+// SetCol overwrites column j of the matrix with the values in src.
+// It panics with ErrColAccess if j is not a valid column index and with
+// ErrColLength if len(src) differs from the number of rows in the receiver.
+func (m *Dense) SetCol(j int, src []float64) {
+	if j < 0 || j >= m.mat.Cols {
+		panic(ErrColAccess)
+	}
+	rows := m.mat.Rows
+	if len(src) != rows {
+		panic(ErrColLength)
+	}
+
+	// The destination is a strided vector starting at element j of the
+	// first row and stepping one full row at a time.
+	from := blas64.Vector{N: rows, Inc: 1, Data: src}
+	column := blas64.Vector{N: rows, Inc: m.mat.Stride, Data: m.mat.Data[j:]}
+	blas64.Copy(from, column)
+}
+
+// SetRow overwrites row i of the matrix with the values in src.
+// It panics with ErrRowAccess if i is not a valid row index and with
+// ErrRowLength if len(src) differs from the number of columns in the
+// receiver.
+func (m *Dense) SetRow(i int, src []float64) {
+	switch {
+	case i < 0 || i >= m.mat.Rows:
+		panic(ErrRowAccess)
+	case len(src) != m.mat.Cols:
+		panic(ErrRowLength)
+	}
+	dst := m.rawRowView(i)
+	copy(dst, src)
+}
+
+// RowView returns row i of the matrix data represented as a column vector,
+// backed by the matrix data.
+//
+// See RowViewer for more information.
+func (m *Dense) RowView(i int) Vector {
+	var v VecDense
+	v.RowViewOf(m, i)
+	return &v
+}
+
+// RawRowView returns the elements of row i as a slice backed by the same
+// array as backing the receiver. It panics with ErrRowAccess if i is not a
+// valid row index.
+func (m *Dense) RawRowView(i int) []float64 {
+	if i < 0 || i >= m.mat.Rows {
+		panic(ErrRowAccess)
+	}
+	return m.rawRowView(i)
+}
+
+// rawRowView returns the part of the backing data that holds row i. It does
+// not check i against the number of rows.
+func (m *Dense) rawRowView(i int) []float64 {
+	start := i * m.mat.Stride
+	return m.mat.Data[start : start+m.mat.Cols]
+}
+
+// DiagView returns the diagonal of the matrix as a Diagonal backed by the
+// original data. The diagonal has min(rows, cols) elements and is addressed
+// with an increment of one more than the row stride.
+func (m *Dense) DiagView() Diagonal {
+	n := min(m.mat.Rows, m.mat.Cols)
+	stride := m.mat.Stride
+	diag := blas64.Vector{
+		N:    n,
+		Inc:  stride + 1,
+		Data: m.mat.Data[:(n-1)*stride+n],
+	}
+	return &DiagDense{mat: diag}
+}
+
+// Slice returns a new Matrix that shares backing data with the receiver.
+// The returned matrix starts at {i,j} of the receiver and extends k-i rows
+// and l-j columns. The final row in the resulting matrix is k-1 and the
+// final column is l-1.
+// Slice panics with ErrIndexOutOfRange if the slice is outside the capacity
+// of the receiver.
+//
+// Slice is a thin wrapper around the unexported slice method, exposing its
+// *Dense result as a Matrix.
+func (m *Dense) Slice(i, k, j, l int) Matrix {
+	return m.slice(i, k, j, l)
+}
+
+// slice returns a *Dense sharing backing data with the receiver and covering
+// rows [i,k) and columns [j,l). Bounds are checked against the receiver's
+// capacities (Caps), not its current dimensions.
+//
+// When the bounds check fails, slice panics with ErrZeroLength if i == k or
+// j == l and with ErrIndexOutOfRange otherwise.
+func (m *Dense) slice(i, k, j, l int) *Dense {
+	mr, mc := m.Caps()
+	// NOTE(review): a zero-length request that is otherwise in range
+	// (i == k or j == l) does not satisfy this condition and so is not
+	// reported as ErrZeroLength here — confirm that is intended.
+	if i < 0 || mr <= i || j < 0 || mc <= j || k < i || mr < k || l < j || mc < l {
+		if i == k || j == l {
+			panic(ErrZeroLength)
+		}
+		panic(ErrIndexOutOfRange)
+	}
+	// Work on a shallow copy so the receiver itself is left unchanged.
+	t := *m
+	// The data window starts at element {i,j} and ends just past {k-1,l-1};
+	// the stride is inherited from the receiver.
+	t.mat.Data = t.mat.Data[i*t.mat.Stride+j : (k-1)*t.mat.Stride+l]
+	t.mat.Rows = k - i
+	t.mat.Cols = l - j
+	// Capacity shrinks by the rows and columns skipped before the origin.
+	t.capRows -= i
+	t.capCols -= j
+	return &t
+}
+
+// Grow returns the receiver expanded by r rows and c columns. If the dimensions
+// of the expanded matrix are outside the capacities of the receiver a new
+// allocation is made, otherwise not. Note the receiver itself is not modified
+// during the call to Grow.
+//
+// Grow panics with ErrIndexOutOfRange if r or c is negative. If both r and c
+// are zero the receiver itself is returned.
+func (m *Dense) Grow(r, c int) Matrix {
+	if r < 0 || c < 0 {
+		panic(ErrIndexOutOfRange)
+	}
+	if r == 0 && c == 0 {
+		return m
+	}
+
+	// From here on r and c are the dimensions of the grown matrix.
+	r += m.mat.Rows
+	c += m.mat.Cols
+
+	var t Dense
+	switch {
+	case m.mat.Rows == 0 || m.mat.Cols == 0:
+		// The receiver has a zero dimension: build a contiguous r×c matrix.
+		t.mat = blas64.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: c,
+			// We zero because we don't know how the matrix will be used.
+			// In other places, the mat is immediately filled with a result;
+			// this is not the case here.
+			Data: useZeroed(m.mat.Data, r*c),
+		}
+	case r > m.capRows || c > m.capCols:
+		// The grown matrix does not fit in the receiver's capacity:
+		// allocate storage large enough in both directions and copy.
+		cr := max(r, m.capRows)
+		cc := max(c, m.capCols)
+		t.mat = blas64.General{
+			Rows:   r,
+			Cols:   c,
+			Stride: cc,
+			Data:   make([]float64, cr*cc),
+		}
+		t.capRows = cr
+		t.capCols = cc
+		// Copy the complete matrix over to the new matrix.
+		// Including elements not currently visible. Use a temporary structure
+		// to avoid modifying the receiver.
+		var tmp Dense
+		tmp.mat = blas64.General{
+			Rows:   m.mat.Rows,
+			Cols:   m.mat.Cols,
+			Stride: m.mat.Stride,
+			Data:   m.mat.Data,
+		}
+		tmp.capRows = m.capRows
+		tmp.capCols = m.capCols
+		t.Copy(&tmp)
+		// This case returns early so that the capacities set above
+		// are not overwritten by the assignments after the switch.
+		return &t
+	default:
+		// The grown matrix fits within the receiver's capacity:
+		// extend the view over the existing backing data.
+		t.mat = blas64.General{
+			Data:   m.mat.Data[:(r-1)*m.mat.Stride+c],
+			Rows:   r,
+			Cols:   c,
+			Stride: m.mat.Stride,
+		}
+	}
+	t.capRows = r
+	t.capCols = c
+	return &t
+}
+
+// CloneFrom makes a copy of a into the receiver, overwriting the previous value of
+// the receiver. The clone from operation does not make any restriction on shape and
+// will not cause shadowing.
+//
+// The receiver always ends up with newly allocated, contiguous backing data
+// (stride equal to the number of columns of a) and capacities equal to the
+// dimensions of a.
+//
+// See the ClonerFrom interface for more information.
+func (m *Dense) CloneFrom(a Matrix) {
+	r, c := a.Dims()
+	mat := blas64.General{
+		Rows:   r,
+		Cols:   c,
+		Stride: c,
+	}
+	m.capRows, m.capCols = r, c
+
+	aU, trans := untransposeExtract(a)
+	switch aU := aU.(type) {
+	case *Dense:
+		amat := aU.mat
+		mat.Data = make([]float64, r*c)
+		if trans {
+			// Row i of the result is column i of the untransposed
+			// source, read with an increment of the source stride.
+			for i := 0; i < r; i++ {
+				blas64.Copy(blas64.Vector{N: c, Inc: amat.Stride, Data: amat.Data[i : i+(c-1)*amat.Stride+1]},
+					blas64.Vector{N: c, Inc: 1, Data: mat.Data[i*c : (i+1)*c]})
+			}
+		} else {
+			// Copy row by row, dropping any padding between source rows.
+			for i := 0; i < r; i++ {
+				copy(mat.Data[i*c:(i+1)*c], amat.Data[i*amat.Stride:i*amat.Stride+c])
+			}
+		}
+	case *VecDense:
+		// The vector's N elements are packed contiguously; the shape
+		// of the result was already fixed from a.Dims() above.
+		amat := aU.mat
+		mat.Data = make([]float64, aU.mat.N)
+		blas64.Copy(blas64.Vector{N: aU.mat.N, Inc: amat.Inc, Data: amat.Data},
+			blas64.Vector{N: aU.mat.N, Inc: 1, Data: mat.Data})
+	default:
+		// Generic fallback: fill a working copy element by element
+		// through a.At and then assign it to the receiver.
+		mat.Data = make([]float64, r*c)
+		w := *m
+		w.mat = mat
+		for i := 0; i < r; i++ {
+			for j := 0; j < c; j++ {
+				w.set(i, j, a.At(i, j))
+			}
+		}
+		*m = w
+		return
+	}
+	m.mat = mat
+}
+
+// Copy makes a copy of elements of a into the receiver. It is similar to the
+// built-in copy; it copies as much as the overlap between the two matrices and
+// returns the number of rows and columns it copied. If a aliases the receiver
+// and is a transposed Dense or VecDense, with a non-unitary increment, Copy will
+// panic.
+//
+// If a is the receiver itself, nothing is copied and the dimensions of a are
+// returned. If the overlap has a zero dimension, Copy returns 0, 0.
+//
+// See the Copier interface for more information.
+func (m *Dense) Copy(a Matrix) (r, c int) {
+	r, c = a.Dims()
+	if a == m {
+		return r, c
+	}
+	// Restrict the copy to the region common to both matrices.
+	r = min(r, m.mat.Rows)
+	c = min(c, m.mat.Cols)
+	if r == 0 || c == 0 {
+		return 0, 0
+	}
+
+	aU, trans := untransposeExtract(a)
+	switch aU := aU.(type) {
+	case *Dense:
+		amat := aU.mat
+		if trans {
+			if amat.Stride != 1 {
+				m.checkOverlap(amat)
+			}
+			// Row i of the destination is column i of the
+			// untransposed source.
+			for i := 0; i < r; i++ {
+				blas64.Copy(blas64.Vector{N: c, Inc: amat.Stride, Data: amat.Data[i : i+(c-1)*amat.Stride+1]},
+					blas64.Vector{N: c, Inc: 1, Data: m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+c]})
+			}
+		} else {
+			// The sign of offset selects the row order of the copy:
+			// last-to-first when negative, first-to-last when positive.
+			// NOTE(review): presumably this keeps overlapping copies
+			// safe, as with memmove — confirm against offset.
+			switch o := offset(m.mat.Data, amat.Data); {
+			case o < 0:
+				for i := r - 1; i >= 0; i-- {
+					copy(m.mat.Data[i*m.mat.Stride:i*m.mat.Stride+c], amat.Data[i*amat.Stride:i*amat.Stride+c])
+				}
+			case o > 0:
+				for i := 0; i < r; i++ {
+					copy(m.mat.Data[i*m.mat.Stride:i*m.mat.Stride+c], amat.Data[i*amat.Stride:i*amat.Stride+c])
+				}
+			default:
+				// Nothing to do.
+			}
+		}
+	case *VecDense:
+		var n, stride int
+		amat := aU.mat
+		if trans {
+			if amat.Inc != 1 {
+				m.checkOverlap(aU.asGeneral())
+			}
+			// A transposed vector fills the first row of the receiver.
+			n = c
+			stride = 1
+		} else {
+			// An untransposed vector fills the first column of the receiver.
+			n = r
+			stride = m.mat.Stride
+		}
+		if amat.Inc == 1 && stride == 1 {
+			// Both sides are contiguous, so the built-in copy suffices.
+			copy(m.mat.Data, amat.Data[:n])
+			break
+		}
+		// As above, the sign of offset selects the direction of the
+		// strided copy; negative increments traverse backwards.
+		switch o := offset(m.mat.Data, amat.Data); {
+		case o < 0:
+			blas64.Copy(blas64.Vector{N: n, Inc: -amat.Inc, Data: amat.Data},
+				blas64.Vector{N: n, Inc: -stride, Data: m.mat.Data})
+		case o > 0:
+			blas64.Copy(blas64.Vector{N: n, Inc: amat.Inc, Data: amat.Data},
+				blas64.Vector{N: n, Inc: stride, Data: m.mat.Data})
+		default:
+			// Nothing to do.
+		}
+	default:
+		// Generic fallback: check for overlap, then copy element by
+		// element through a.At.
+		m.checkOverlapMatrix(aU)
+		for i := 0; i < r; i++ {
+			for j := 0; j < c; j++ {
+				m.set(i, j, a.At(i, j))
+			}
+		}
+	}
+
+	return r, c
+}
+
+// Stack places the rows of a followed by the rows of b into the receiver,
+// so that b occupies the greater indexed rows. Stack panics with ErrShape if
+// a and b do not have the same number of columns or if the receiver is a or
+// b, and panics if the stacked matrix is not the same shape as a non-empty
+// receiver.
+func (m *Dense) Stack(a, b Matrix) {
+	aRows, aCols := a.Dims()
+	bRows, bCols := b.Dims()
+	if aCols != bCols || m == a || m == b {
+		panic(ErrShape)
+	}
+
+	m.reuseAsNonZeroed(aRows+bRows, aCols)
+
+	// a fills the top of the receiver; b is copied into a view of the
+	// remaining rows.
+	m.Copy(a)
+	m.slice(aRows, aRows+bRows, 0, bCols).Copy(b)
+}
+
+// Augment places the columns of a followed by the columns of b into the
+// receiver, so that b occupies the greater indexed columns. Augment panics
+// with ErrShape if a and b do not have the same number of rows or if the
+// receiver is a or b, and panics if the augmented matrix is not the same
+// shape as a non-empty receiver.
+func (m *Dense) Augment(a, b Matrix) {
+	aRows, aCols := a.Dims()
+	bRows, bCols := b.Dims()
+	if aRows != bRows || m == a || m == b {
+		panic(ErrShape)
+	}
+
+	m.reuseAsNonZeroed(aRows, aCols+bCols)
+
+	// a fills the left of the receiver; b is copied into a view of the
+	// remaining columns.
+	m.Copy(a)
+	m.slice(0, bRows, aCols, aCols+bCols).Copy(b)
+}
+
+// Trace returns the trace of the matrix, the sum of its diagonal elements.
+//
+// Trace will panic with ErrSquare if the matrix is not square and with
+// ErrZeroLength if the matrix has zero size.
+func (m *Dense) Trace() float64 {
+	if r, c := m.Dims(); r != c {
+		panic(ErrSquare)
+	}
+	if m.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	// TODO(btracey): could use internal asm sum routine.
+	var sum float64
+	// Consecutive diagonal elements are one row plus one column apart.
+	step := m.mat.Stride + 1
+	for i, at := 0, 0; i < m.mat.Rows; i, at = i+1, at+step {
+		sum += m.mat.Data[at]
+	}
+	return sum
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
+func (m *Dense) Norm(norm float64) float64 {
+	if m.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	lnorm := normLapack(norm, false)
+	if lnorm == lapack.MaxColumnSum {
+		// Only the maximum column sum is given a workspace, with one
+		// element per column; it is returned to the pool on exit.
+		work := getFloat64s(m.mat.Cols, false)
+		defer putFloat64s(work)
+		return lapack64.Lange(lnorm, m.mat, work)
+	}
+	return lapack64.Lange(lnorm, m.mat, nil)
+}
+
+// Permutation builds in the receiver the n×n permutation matrix P described
+// by the row permutation p, whose only nonzero entries are P[i,p[i]] = 1.
+//
+// Permutation panics with badSliceLength if len(p) != n and with
+// ErrRowAccess if any element of p lies outside [0,n).
+func (m *Dense) Permutation(n int, p []int) {
+	if len(p) != n {
+		panic(badSliceLength)
+	}
+	m.reuseAsZeroed(n, n)
+	for row, col := range p {
+		if col < 0 || n <= col {
+			panic(ErrRowAccess)
+		}
+		m.mat.Data[row*m.mat.Stride+col] = 1
+	}
+}
+
+// PermuteRows rearranges, in place, the rows of the m×n matrix A held in the
+// receiver according to the permutation p[0],p[1],...,p[m-1] of the integers
+// 0,...,m-1.
+//
+// If inverse is false, the given permutation is applied:
+//
+//	A[p[i],0:n] is moved to A[i,0:n] for i=0,1,...,m-1.
+//
+// If inverse is true, the inverse permutation is applied:
+//
+//	A[i,0:n] is moved to A[p[i],0:n] for i=0,1,...,m-1.
+//
+// p must have length m, otherwise PermuteRows will panic.
+func (m *Dense) PermuteRows(p []int, inverse bool) {
+	if rows, _ := m.Dims(); len(p) != rows {
+		panic(badSliceLength)
+	}
+	forward := !inverse
+	lapack64.Lapmr(forward, m.mat, p)
+}
+
+// PermuteCols rearranges, in place, the columns of the m×n matrix A held in
+// the receiver according to the permutation p[0],p[1],...,p[n-1] of the
+// integers 0,...,n-1.
+//
+// If inverse is false, the given permutation is applied:
+//
+//	A[0:m,p[j]] is moved to A[0:m,j] for j = 0, 1, ..., n-1.
+//
+// If inverse is true, the inverse permutation is applied:
+//
+//	A[0:m,j] is moved to A[0:m,p[j]] for j = 0, 1, ..., n-1.
+//
+// p must have length n, otherwise PermuteCols will panic.
+func (m *Dense) PermuteCols(p []int, inverse bool) {
+	if _, cols := m.Dims(); len(p) != cols {
+		panic(badSliceLength)
+	}
+	forward := !inverse
+	lapack64.Lapmt(forward, m.mat, p)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/dense_arithmetic.go b/vendor/gonum.org/v1/gonum/mat/dense_arithmetic.go
new file mode 100644
index 00000000000..259ee13d518
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/dense_arithmetic.go
@@ -0,0 +1,877 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+// Add adds a and b element-wise, placing the result in the receiver. Add
+// will panic if the two matrices do not have the same shape.
+//
+// The receiver may be one of the operands. An empty receiver is sized to
+// match a; a non-empty receiver must already have the shape of a.
+func (m *Dense) Add(a, b Matrix) {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		panic(ErrShape)
+	}
+
+	aU, aTrans := untransposeExtract(a)
+	bU, bTrans := untransposeExtract(b)
+	m.reuseAsNonZeroed(ar, ac)
+
+	// Fast path: both arguments are plain (untransposed) *Dense values,
+	// so their backing slices can be walked directly.
+	if arm, ok := a.(*Dense); ok {
+		if brm, ok := b.(*Dense); ok {
+			amat, bmat := arm.mat, brm.mat
+			// Overlap is only checked against operands that are not
+			// the receiver itself.
+			if m != aU {
+				m.checkOverlap(amat)
+			}
+			if m != bU {
+				m.checkOverlap(bmat)
+			}
+			// ja, jb and jm are the row start offsets in a, b and the
+			// receiver, each advanced by its own stride.
+			for ja, jb, jm := 0, 0, 0; ja < ar*amat.Stride; ja, jb, jm = ja+amat.Stride, jb+bmat.Stride, jm+m.mat.Stride {
+				for i, v := range amat.Data[ja : ja+ac] {
+					m.mat.Data[i+jm] = v + bmat.Data[i+jb]
+				}
+			}
+			return
+		}
+	}
+
+	m.checkOverlapMatrix(aU)
+	m.checkOverlapMatrix(bU)
+	// If the receiver is the matrix underlying a transposed operand, write
+	// into an isolated workspace instead; the deferred restore copies the
+	// workspace back into the receiver.
+	var restore func()
+	if aTrans && m == aU {
+		m, restore = m.isolatedWorkspace(aU)
+		defer restore()
+	} else if bTrans && m == bU {
+		m, restore = m.isolatedWorkspace(bU)
+		defer restore()
+	}
+
+	// Generic path: element access through the Matrix interface.
+	for r := 0; r < ar; r++ {
+		for c := 0; c < ac; c++ {
+			m.set(r, c, a.At(r, c)+b.At(r, c))
+		}
+	}
+}
+
+// Sub subtracts the matrix b from a, placing the result in the receiver. Sub
+// will panic if the two matrices do not have the same shape.
+//
+// The receiver may be one of the operands. An empty receiver is sized to
+// match a; a non-empty receiver must already have the shape of a.
+func (m *Dense) Sub(a, b Matrix) {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		panic(ErrShape)
+	}
+
+	aU, aTrans := untransposeExtract(a)
+	bU, bTrans := untransposeExtract(b)
+	m.reuseAsNonZeroed(ar, ac)
+
+	// Fast path: both arguments are plain (untransposed) *Dense values,
+	// so their backing slices can be walked directly.
+	if arm, ok := a.(*Dense); ok {
+		if brm, ok := b.(*Dense); ok {
+			amat, bmat := arm.mat, brm.mat
+			// Overlap is only checked against operands that are not
+			// the receiver itself.
+			if m != aU {
+				m.checkOverlap(amat)
+			}
+			if m != bU {
+				m.checkOverlap(bmat)
+			}
+			// ja, jb and jm are the row start offsets in a, b and the
+			// receiver, each advanced by its own stride.
+			for ja, jb, jm := 0, 0, 0; ja < ar*amat.Stride; ja, jb, jm = ja+amat.Stride, jb+bmat.Stride, jm+m.mat.Stride {
+				for i, v := range amat.Data[ja : ja+ac] {
+					m.mat.Data[i+jm] = v - bmat.Data[i+jb]
+				}
+			}
+			return
+		}
+	}
+
+	m.checkOverlapMatrix(aU)
+	m.checkOverlapMatrix(bU)
+	// If the receiver is the matrix underlying a transposed operand, write
+	// into an isolated workspace instead; the deferred restore copies the
+	// workspace back into the receiver.
+	var restore func()
+	if aTrans && m == aU {
+		m, restore = m.isolatedWorkspace(aU)
+		defer restore()
+	} else if bTrans && m == bU {
+		m, restore = m.isolatedWorkspace(bU)
+		defer restore()
+	}
+
+	// Generic path: element access through the Matrix interface.
+	for r := 0; r < ar; r++ {
+		for c := 0; c < ac; c++ {
+			m.set(r, c, a.At(r, c)-b.At(r, c))
+		}
+	}
+}
+
+// MulElem performs element-wise multiplication of a and b, placing the result
+// in the receiver. MulElem will panic if the two matrices do not have the same
+// shape.
+//
+// The receiver may be one of the operands. An empty receiver is sized to
+// match a; a non-empty receiver must already have the shape of a.
+func (m *Dense) MulElem(a, b Matrix) {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		panic(ErrShape)
+	}
+
+	aU, aTrans := untransposeExtract(a)
+	bU, bTrans := untransposeExtract(b)
+	m.reuseAsNonZeroed(ar, ac)
+
+	// Fast path: both arguments are plain (untransposed) *Dense values,
+	// so their backing slices can be walked directly.
+	if arm, ok := a.(*Dense); ok {
+		if brm, ok := b.(*Dense); ok {
+			amat, bmat := arm.mat, brm.mat
+			// Overlap is only checked against operands that are not
+			// the receiver itself.
+			if m != aU {
+				m.checkOverlap(amat)
+			}
+			if m != bU {
+				m.checkOverlap(bmat)
+			}
+			// ja, jb and jm are the row start offsets in a, b and the
+			// receiver, each advanced by its own stride.
+			for ja, jb, jm := 0, 0, 0; ja < ar*amat.Stride; ja, jb, jm = ja+amat.Stride, jb+bmat.Stride, jm+m.mat.Stride {
+				for i, v := range amat.Data[ja : ja+ac] {
+					m.mat.Data[i+jm] = v * bmat.Data[i+jb]
+				}
+			}
+			return
+		}
+	}
+
+	m.checkOverlapMatrix(aU)
+	m.checkOverlapMatrix(bU)
+	// If the receiver is the matrix underlying a transposed operand, write
+	// into an isolated workspace instead; the deferred restore copies the
+	// workspace back into the receiver.
+	var restore func()
+	if aTrans && m == aU {
+		m, restore = m.isolatedWorkspace(aU)
+		defer restore()
+	} else if bTrans && m == bU {
+		m, restore = m.isolatedWorkspace(bU)
+		defer restore()
+	}
+
+	// Generic path: element access through the Matrix interface.
+	for r := 0; r < ar; r++ {
+		for c := 0; c < ac; c++ {
+			m.set(r, c, a.At(r, c)*b.At(r, c))
+		}
+	}
+}
+
+// DivElem performs element-wise division of a by b, placing the result
+// in the receiver. DivElem will panic if the two matrices do not have the same
+// shape.
+//
+// The receiver may be one of the operands. An empty receiver is sized to
+// match a; a non-empty receiver must already have the shape of a. Zero
+// elements of b are not treated specially; each quotient is the plain
+// float64 division.
+func (m *Dense) DivElem(a, b Matrix) {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		panic(ErrShape)
+	}
+
+	aU, aTrans := untransposeExtract(a)
+	bU, bTrans := untransposeExtract(b)
+	m.reuseAsNonZeroed(ar, ac)
+
+	// Fast path: both arguments are plain (untransposed) *Dense values,
+	// so their backing slices can be walked directly.
+	if arm, ok := a.(*Dense); ok {
+		if brm, ok := b.(*Dense); ok {
+			amat, bmat := arm.mat, brm.mat
+			// Overlap is only checked against operands that are not
+			// the receiver itself.
+			if m != aU {
+				m.checkOverlap(amat)
+			}
+			if m != bU {
+				m.checkOverlap(bmat)
+			}
+			// ja, jb and jm are the row start offsets in a, b and the
+			// receiver, each advanced by its own stride.
+			for ja, jb, jm := 0, 0, 0; ja < ar*amat.Stride; ja, jb, jm = ja+amat.Stride, jb+bmat.Stride, jm+m.mat.Stride {
+				for i, v := range amat.Data[ja : ja+ac] {
+					m.mat.Data[i+jm] = v / bmat.Data[i+jb]
+				}
+			}
+			return
+		}
+	}
+
+	m.checkOverlapMatrix(aU)
+	m.checkOverlapMatrix(bU)
+	// If the receiver is the matrix underlying a transposed operand, write
+	// into an isolated workspace instead; the deferred restore copies the
+	// workspace back into the receiver.
+	var restore func()
+	if aTrans && m == aU {
+		m, restore = m.isolatedWorkspace(aU)
+		defer restore()
+	} else if bTrans && m == bU {
+		m, restore = m.isolatedWorkspace(bU)
+		defer restore()
+	}
+
+	// Generic path: element access through the Matrix interface.
+	for r := 0; r < ar; r++ {
+		for c := 0; c < ac; c++ {
+			m.set(r, c, a.At(r, c)/b.At(r, c))
+		}
+	}
+}
+
+// Inverse computes the inverse of the matrix a, storing the result into the
+// receiver. If a is ill-conditioned, a Condition error will be returned.
+// Note that matrix inversion is numerically unstable, and should generally
+// be avoided where possible, for example by using the Solve routines.
+//
+// Inverse panics with ErrSquare if a is not square. When a is exactly
+// singular the returned error is Condition(+Inf); when the estimated
+// condition number exceeds ConditionTolerance the returned error is that
+// condition number. In both cases the receiver has still been overwritten.
+func (m *Dense) Inverse(a Matrix) error {
+	// TODO(btracey): Special case for RawTriangular, etc.
+	r, c := a.Dims()
+	if r != c {
+		panic(ErrSquare)
+	}
+	m.reuseAsNonZeroed(a.Dims())
+	aU, aTrans := untransposeExtract(a)
+	// Bring the elements of a into the receiver, going through a temporary
+	// when the receiver aliases the data it is being copied from.
+	switch rm := aU.(type) {
+	case *Dense:
+		if m != aU || aTrans {
+			if m == aU || m.checkOverlap(rm.mat) {
+				tmp := getDenseWorkspace(r, c, false)
+				tmp.Copy(a)
+				m.Copy(tmp)
+				putDenseWorkspace(tmp)
+				break
+			}
+			m.Copy(a)
+		}
+	default:
+		m.Copy(a)
+	}
+	// Compute the norm of A.
+	work := getFloat64s(4*r, false) // Length must be at least 4*r for Gecon.
+	// Return work to the pool on every exit path, including the early
+	// return for a singular matrix below. The closure reads work when it
+	// runs, so it releases the replacement buffer if work is reallocated.
+	defer func() { putFloat64s(work) }()
+	norm := lapack64.Lange(CondNorm, m.mat, work)
+	// Compute the LU factorization of A.
+	ipiv := getInts(r, false)
+	defer putInts(ipiv)
+	ok := lapack64.Getrf(m.mat, ipiv)
+	if !ok {
+		// A is exactly singular.
+		return Condition(math.Inf(1))
+	}
+	// Compute the condition number of A using the LU factorization.
+	iwork := getInts(r, false)
+	defer putInts(iwork)
+	rcond := lapack64.Gecon(CondNorm, m.mat, norm, work, iwork)
+	// Compute A^{-1} from the LU factorization regardless of the value of rcond.
+	// The first call, with lwork == -1, is a workspace size query whose
+	// answer is stored in work[0].
+	lapack64.Getri(m.mat, ipiv, work, -1)
+	if int(work[0]) > len(work) {
+		l := int(work[0])
+		putFloat64s(work)
+		work = getFloat64s(l, false)
+	}
+	ok = lapack64.Getri(m.mat, ipiv, work, len(work))
+	if !ok || rcond == 0 {
+		// A is exactly singular.
+		return Condition(math.Inf(1))
+	}
+	// Check whether A is singular for computational purposes.
+	cond := 1 / rcond
+	if cond > ConditionTolerance {
+		return Condition(cond)
+	}
+	return nil
+}
+
+// Mul takes the matrix product of a and b, placing the result in the receiver.
+// If the number of columns in a does not equal the number of rows in b, Mul will panic.
+//
+// The receiver may be the matrix underlying a or b; in that case the product
+// is computed into a temporary workspace and copied back on return.
+// Combinations of Dense with Dense, SymDense, TriDense or VecDense operands
+// are dispatched to BLAS routines; everything else uses a generic
+// element-wise fallback.
+func (m *Dense) Mul(a, b Matrix) {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+
+	if ac != br {
+		panic(ErrShape)
+	}
+
+	aU, aTrans := untransposeExtract(a)
+	bU, bTrans := untransposeExtract(b)
+	m.reuseAsNonZeroed(ar, bc)
+	// When the receiver is also an operand, redirect all writes to an
+	// isolated workspace; the deferred restore copies the result back.
+	// A non-nil restore is used below as the marker for this situation.
+	var restore func()
+	if m == aU {
+		m, restore = m.isolatedWorkspace(aU)
+		defer restore()
+	} else if m == bU {
+		m, restore = m.isolatedWorkspace(bU)
+		defer restore()
+	}
+	aT := blas.NoTrans
+	if aTrans {
+		aT = blas.Trans
+	}
+	bT := blas.NoTrans
+	if bTrans {
+		bT = blas.Trans
+	}
+
+	// Some of the cases do not have a transpose option, so create
+	// temporary memory.
+	// C = Aᵀ * B = (Bᵀ * A)ᵀ
+	// Cᵀ = Bᵀ * A.
+	if aU, ok := aU.(*Dense); ok {
+		if restore == nil {
+			m.checkOverlap(aU.mat)
+		}
+		switch bU := bU.(type) {
+		case *Dense:
+			if restore == nil {
+				m.checkOverlap(bU.mat)
+			}
+			blas64.Gemm(aT, bT, 1, aU.mat, bU.mat, 0, m.mat)
+			return
+
+		case *SymDense:
+			if aTrans {
+				// Compute the transposed product into a workspace
+				// and copy its transpose into the receiver.
+				c := getDenseWorkspace(ac, ar, false)
+				blas64.Symm(blas.Left, 1, bU.mat, aU.mat, 0, c.mat)
+				strictCopy(m, c.T())
+				putDenseWorkspace(c)
+				return
+			}
+			blas64.Symm(blas.Right, 1, bU.mat, aU.mat, 0, m.mat)
+			return
+
+		case *TriDense:
+			// Trmm updates in place, so copy aU first.
+			if aTrans {
+				c := getDenseWorkspace(ac, ar, false)
+				var tmp Dense
+				tmp.SetRawMatrix(aU.mat)
+				c.Copy(&tmp)
+				// The product is formed transposed, so the
+				// transpose flag of b is inverted.
+				bT := blas.Trans
+				if bTrans {
+					bT = blas.NoTrans
+				}
+				blas64.Trmm(blas.Left, bT, 1, bU.mat, c.mat)
+				strictCopy(m, c.T())
+				putDenseWorkspace(c)
+				return
+			}
+			m.Copy(a)
+			blas64.Trmm(blas.Right, bT, 1, bU.mat, m.mat)
+			return
+
+		case *VecDense:
+			m.checkOverlap(bU.asGeneral())
+			bvec := bU.RawVector()
+			if bTrans {
+				// {ar,1} x {1,bc}, which is not a vector.
+				// Instead, construct B as a General.
+				bmat := blas64.General{
+					Rows:   bc,
+					Cols:   1,
+					Stride: bvec.Inc,
+					Data:   bvec.Data,
+				}
+				blas64.Gemm(aT, bT, 1, aU.mat, bmat, 0, m.mat)
+				return
+			}
+			// The result is a single column of the receiver, addressed
+			// as a vector with the receiver's stride as increment.
+			cvec := blas64.Vector{
+				Inc:  m.mat.Stride,
+				Data: m.mat.Data,
+			}
+			blas64.Gemv(aT, 1, aU.mat, bvec, 0, cvec)
+			return
+		}
+	}
+	if bU, ok := bU.(*Dense); ok {
+		if restore == nil {
+			m.checkOverlap(bU.mat)
+		}
+		switch aU := aU.(type) {
+		case *SymDense:
+			if bTrans {
+				// Compute the transposed product into a workspace
+				// and copy its transpose into the receiver.
+				c := getDenseWorkspace(bc, br, false)
+				blas64.Symm(blas.Right, 1, aU.mat, bU.mat, 0, c.mat)
+				strictCopy(m, c.T())
+				putDenseWorkspace(c)
+				return
+			}
+			blas64.Symm(blas.Left, 1, aU.mat, bU.mat, 0, m.mat)
+			return
+
+		case *TriDense:
+			// Trmm updates in place, so copy bU first.
+			if bTrans {
+				c := getDenseWorkspace(bc, br, false)
+				var tmp Dense
+				tmp.SetRawMatrix(bU.mat)
+				c.Copy(&tmp)
+				// The product is formed transposed, so the
+				// transpose flag of a is inverted.
+				aT := blas.Trans
+				if aTrans {
+					aT = blas.NoTrans
+				}
+				blas64.Trmm(blas.Right, aT, 1, aU.mat, c.mat)
+				strictCopy(m, c.T())
+				putDenseWorkspace(c)
+				return
+			}
+			m.Copy(b)
+			blas64.Trmm(blas.Left, aT, 1, aU.mat, m.mat)
+			return
+
+		case *VecDense:
+			m.checkOverlap(aU.asGeneral())
+			avec := aU.RawVector()
+			if aTrans {
+				// {1,ac} x {ac, bc}
+				// Transpose B so that the vector is on the right.
+				cvec := blas64.Vector{
+					Inc:  1,
+					Data: m.mat.Data,
+				}
+				bT := blas.Trans
+				if bTrans {
+					bT = blas.NoTrans
+				}
+				blas64.Gemv(bT, 1, bU.mat, avec, 0, cvec)
+				return
+			}
+			// {ar,1} x {1,bc} which is not a vector result.
+			// Instead, construct A as a General.
+			amat := blas64.General{
+				Rows:   ar,
+				Cols:   1,
+				Stride: avec.Inc,
+				Data:   avec.Data,
+			}
+			blas64.Gemm(aT, bT, 1, amat, bU.mat, 0, m.mat)
+			return
+		}
+	}
+
+	// Generic fallback for operand types without a BLAS route. Each row of
+	// a is first gathered into a pooled scratch slice so that a.At is
+	// called once per element of a rather than once per product term.
+	m.checkOverlapMatrix(aU)
+	m.checkOverlapMatrix(bU)
+	row := getFloat64s(ac, false)
+	defer putFloat64s(row)
+	for r := 0; r < ar; r++ {
+		for i := range row {
+			row[i] = a.At(r, i)
+		}
+		for c := 0; c < bc; c++ {
+			var v float64
+			for i, e := range row {
+				v += e * b.At(i, c)
+			}
+			m.mat.Data[r*m.mat.Stride+c] = v
+		}
+	}
+}
+
+// strictCopy copies a into m and panics unless the copy covered the whole
+// of m, that is, unless the copied region has the dimensions of m.
+func strictCopy(m *Dense, a Matrix) {
+	rows, cols := m.Copy(a)
+	if rows == m.mat.Rows && cols == m.mat.Cols {
+		return
+	}
+	// Panic with a string since this
+	// is not a user-facing panic.
+	panic(ErrShape.Error())
+}
+
+// Exp calculates the exponential of the matrix a, e^a, placing the result
+// in the receiver. Exp will panic with ErrShape if a is not square.
+func (m *Dense) Exp(a Matrix) {
+	// The implementation used here is from Functions of Matrices: Theory and Computation
+	// Chapter 10, Algorithm 10.20. https://doi.org/10.1137/1.9780898717778.ch10
+
+	r, c := a.Dims()
+	if r != c {
+		panic(ErrShape)
+	}
+
+	m.reuseAsNonZeroed(r, r)
+	if r == 1 {
+		m.mat.Data[0] = math.Exp(a.At(0, 0))
+		return
+	}
+
+	pade := []struct {
+		theta float64
+		b     []float64
+	}{
+		{theta: 0.015, b: []float64{
+			120, 60, 12, 1,
+		}},
+		{theta: 0.25, b: []float64{
+			30240, 15120, 3360, 420, 30, 1,
+		}},
+		{theta: 0.95, b: []float64{
+			17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1,
+		}},
+		{theta: 2.1, b: []float64{
+			17643225600, 8821612800, 2075673600, 302702400, 30270240, 2162160, 110880, 3960, 90, 1,
+		}},
+	}
+
+	a1 := m // a1 aliases the receiver, so the copy below fills m
+	a1.Copy(a)
+	v := getDenseWorkspace(r, r, true)
+	vraw := v.RawMatrix()
+	n := r * r
+	vvec := blas64.Vector{N: n, Inc: 1, Data: vraw.Data}
+	defer putDenseWorkspace(v)
+
+	u := getDenseWorkspace(r, r, true)
+	uraw := u.RawMatrix()
+	uvec := blas64.Vector{N: n, Inc: 1, Data: uraw.Data}
+	defer putDenseWorkspace(u)
+
+	a2 := getDenseWorkspace(r, r, false)
+	defer putDenseWorkspace(a2)
+
+	n1 := Norm(a, 1)
+	for i, t := range pade {
+		if n1 > t.theta {
+			continue
+		}
+
+		// The body below returns, so it runs for at most one
+		// table entry and the defer is not repeated.
+		p := getDenseWorkspace(r, r, true)
+		praw := p.RawMatrix()
+		pvec := blas64.Vector{N: n, Inc: 1, Data: praw.Data}
+		defer putDenseWorkspace(p)
+
+		for k := 0; k < r; k++ {
+			p.set(k, k, 1)
+			v.set(k, k, t.b[0])
+			u.set(k, k, t.b[1])
+		}
+
+		a2.Mul(a1, a1)
+		for j := 0; j <= i; j++ {
+			p.Mul(p, a2)
+			blas64.Axpy(t.b[2*j+2], pvec, vvec)
+			blas64.Axpy(t.b[2*j+3], pvec, uvec)
+		}
+		u.Mul(a1, u)
+
+		// Use p as a workspace here and
+		// rename u for the second call's
+		// receiver.
+		vmu, vpu := u, p
+		vpu.Add(v, u)
+		vmu.Sub(v, u)
+
+		_ = m.Solve(vmu, vpu)
+		return
+	}
+
+	// Remaining Padé table line.
+	const theta13 = 5.4
+	b := [...]float64{
+		64764752532480000, 32382376266240000, 7771770303897600, 1187353796428800,
+		129060195264000, 10559470521600, 670442572800, 33522128640,
+		1323241920, 40840800, 960960, 16380, 182, 1,
+	}
+
+	s := math.Log2(n1 / theta13)
+	if s >= 0 {
+		s = math.Ceil(s)
+		a1.Scale(1/math.Pow(2, s), a1) // scale a1 by 2^-s
+	}
+	a2.Mul(a1, a1)
+
+	i := getDenseWorkspace(r, r, true)
+	for j := 0; j < r; j++ {
+		i.set(j, j, 1)
+	}
+	iraw := i.RawMatrix()
+	ivec := blas64.Vector{N: n, Inc: 1, Data: iraw.Data}
+	defer putDenseWorkspace(i)
+
+	a2raw := a2.RawMatrix()
+	a2vec := blas64.Vector{N: n, Inc: 1, Data: a2raw.Data}
+
+	a4 := getDenseWorkspace(r, r, false)
+	a4raw := a4.RawMatrix()
+	a4vec := blas64.Vector{N: n, Inc: 1, Data: a4raw.Data}
+	defer putDenseWorkspace(a4)
+	a4.Mul(a2, a2)
+
+	a6 := getDenseWorkspace(r, r, false)
+	a6raw := a6.RawMatrix()
+	a6vec := blas64.Vector{N: n, Inc: 1, Data: a6raw.Data}
+	defer putDenseWorkspace(a6)
+	a6.Mul(a2, a4)
+
+	// V = A_6(b_12*A_6 + b_10*A_4 + b_8*A_2) + b_6*A_6 + b_4*A_4 + b_2*A_2 + b_0*I
+	blas64.Axpy(b[12], a6vec, vvec)
+	blas64.Axpy(b[10], a4vec, vvec)
+	blas64.Axpy(b[8], a2vec, vvec)
+	v.Mul(v, a6)
+	blas64.Axpy(b[6], a6vec, vvec)
+	blas64.Axpy(b[4], a4vec, vvec)
+	blas64.Axpy(b[2], a2vec, vvec)
+	blas64.Axpy(b[0], ivec, vvec)
+
+	// U = A(A_6(b_13*A_6 + b_11*A_4 + b_9*A_2) + b_7*A_6 + b_5*A_4 + b_3*A_2 + b_1*I)
+	blas64.Axpy(b[13], a6vec, uvec)
+	blas64.Axpy(b[11], a4vec, uvec)
+	blas64.Axpy(b[9], a2vec, uvec)
+	u.Mul(u, a6)
+	blas64.Axpy(b[7], a6vec, uvec)
+	blas64.Axpy(b[5], a4vec, uvec)
+	blas64.Axpy(b[3], a2vec, uvec)
+	blas64.Axpy(b[1], ivec, uvec)
+	u.Mul(u, a1)
+
+	// Use i as a workspace here and
+	// rename u for the second call's
+	// receiver.
+	vmu, vpu := u, i
+	vpu.Add(v, u)
+	vmu.Sub(v, u)
+
+	_ = m.Solve(vmu, vpu)
+
+	for ; s > 0; s-- { // square s times to undo the 2^-s scaling
+		m.Mul(m, m)
+	}
+}
+
+// Pow raises the square matrix a to the non-negative integer power n and
+// stores the result in the receiver. It panics if n < 0 or a is not square.
+func (m *Dense) Pow(a Matrix, n int) {
+	if n < 0 {
+		panic("mat: illegal power")
+	}
+	dim, cols := a.Dims()
+	if dim != cols {
+		panic(ErrShape)
+	}
+
+	m.reuseAsNonZeroed(dim, dim)
+
+	// Small exponents are handled directly.
+	switch n {
+	case 0:
+		for i, off := 0, 0; i < dim; i, off = i+1, off+m.mat.Stride {
+			zero(m.mat.Data[off : off+dim])
+			m.mat.Data[off+i] = 1
+		}
+		return
+	case 1:
+		m.Copy(a)
+		return
+	case 2:
+		m.Mul(a, a)
+		return
+	}
+
+	// Exponentiation by squaring: acc is the running product, sq the current square.
+	acc := getDenseWorkspace(dim, dim, false)
+	acc.Copy(a)
+	sq := getDenseWorkspace(dim, dim, false)
+	sq.Copy(a)
+	tmp := getDenseWorkspace(dim, dim, false)
+	for k := n - 1; k > 0; k >>= 1 {
+		if k&1 == 1 {
+			tmp.Mul(acc, sq)
+			acc, tmp = tmp, acc
+		}
+		if k > 1 {
+			tmp.Mul(sq, sq)
+			sq, tmp = tmp, sq
+		}
+	}
+	m.Copy(acc)
+	putDenseWorkspace(acc)
+	putDenseWorkspace(sq)
+	putDenseWorkspace(tmp)
+}
+
+// Kronecker stores the Kronecker product of a and b in the receiver: block
+// (i, j) of the result is b scaled by the element of a at (i, j).
+func (m *Dense) Kronecker(a, b Matrix) {
+	aRows, aCols := a.Dims()
+	bRows, bCols := b.Dims()
+
+	m.reuseAsNonZeroed(aRows*bRows, aCols*bCols)
+	for i, r0 := 0, 0; i < aRows; i, r0 = i+1, r0+bRows {
+		for j, c0 := 0, 0; j < aCols; j, c0 = j+1, c0+bCols {
+			m.slice(r0, r0+bRows, c0, c0+bCols).Scale(a.At(i, j), b)
+		}
+	}
+}
+
+// Scale stores f*a in the receiver, multiplying every element of a by f.
+//
+// See the Scaler interface for more information.
+func (m *Dense) Scale(f float64, a Matrix) {
+	rows, cols := a.Dims()
+
+	m.reuseAsNonZeroed(rows, cols)
+
+	src, trans := untransposeExtract(a)
+	if d, ok := src.(*Dense); ok {
+		raw := d.mat
+		if m == src || m.checkOverlap(raw) {
+			var restore func()
+			m, restore = m.isolatedWorkspace(a)
+			defer restore()
+		}
+		if trans {
+			for sa, col := 0, 0; sa < cols*raw.Stride; sa, col = sa+raw.Stride, col+1 {
+				for i, v := range raw.Data[sa : sa+rows] {
+					m.mat.Data[i*m.mat.Stride+col] = v * f
+				}
+			}
+		} else {
+			for sa, dst := 0, 0; sa < rows*raw.Stride; sa, dst = sa+raw.Stride, dst+m.mat.Stride {
+				for i, v := range raw.Data[sa : sa+cols] {
+					m.mat.Data[i+dst] = v * f
+				}
+			}
+		}
+		return
+	}
+
+	m.checkOverlapMatrix(a)
+	for i := 0; i < rows; i++ {
+		for j := 0; j < cols; j++ {
+			m.set(i, j, f*a.At(i, j))
+		}
+	}
+}
+
+// Apply evaluates fn for every element of a and stores the results in the
+// receiver. fn receives the row index, the column index and the value of the
+// element, in terms of a's own orientation, and returns the value to store.
+func (m *Dense) Apply(fn func(i, j int, v float64) float64, a Matrix) {
+	rows, cols := a.Dims()
+
+	m.reuseAsNonZeroed(rows, cols)
+
+	src, trans := untransposeExtract(a)
+	if d, ok := src.(*Dense); ok {
+		raw := d.mat
+		if m == src || m.checkOverlap(raw) {
+			var restore func()
+			m, restore = m.isolatedWorkspace(a)
+			defer restore()
+		}
+		if trans {
+			for c, sa := 0, 0; sa < cols*raw.Stride; c, sa = c+1, sa+raw.Stride {
+				for r, v := range raw.Data[sa : sa+rows] {
+					m.mat.Data[r*m.mat.Stride+c] = fn(r, c, v)
+				}
+			}
+		} else {
+			for r, sa, dst := 0, 0, 0; sa < rows*raw.Stride; r, sa, dst = r+1, sa+raw.Stride, dst+m.mat.Stride {
+				for c, v := range raw.Data[sa : sa+cols] {
+					m.mat.Data[c+dst] = fn(r, c, v)
+				}
+			}
+		}
+		return
+	}
+
+	m.checkOverlapMatrix(a)
+	for i := 0; i < rows; i++ {
+		for j := 0; j < cols; j++ {
+			m.set(i, j, fn(i, j, a.At(i, j)))
+		}
+	}
+}
+
+// RankOne stores in the receiver the rank-one update of a by the column
+// vectors x and y, panicking with ErrShape if x.Len() or y.Len() differs from
+// the rows or columns of a. Use Outer instead if a is not needed.
+//
+//	m = a + alpha * x * yᵀ
+func (m *Dense) RankOne(a Matrix, alpha float64, x, y Vector) {
+	ar, ac := a.Dims()
+	if x.Len() != ar {
+		panic(ErrShape)
+	}
+	if y.Len() != ac {
+		panic(ErrShape)
+	}
+
+	if a != m {
+		aU, _ := untransposeExtract(a)
+		if rm, ok := aU.(*Dense); ok {
+			m.checkOverlap(rm.RawMatrix())
+		}
+	}
+
+	var xmat, ymat blas64.Vector
+	fast := true // cleared below if x or y is not backed by a *VecDense
+	xU, _ := untransposeExtract(x)
+	if rv, ok := xU.(*VecDense); ok {
+		r, c := xU.Dims()
+		xmat = rv.mat
+		m.checkOverlap(generalFromVector(xmat, r, c))
+	} else {
+		fast = false
+	}
+	yU, _ := untransposeExtract(y)
+	if rv, ok := yU.(*VecDense); ok {
+		r, c := yU.Dims()
+		ymat = rv.mat
+		m.checkOverlap(generalFromVector(ymat, r, c))
+	} else {
+		fast = false
+	}
+
+	if fast {
+		if m != a {
+			m.reuseAsNonZeroed(ar, ac)
+			m.Copy(a)
+		}
+		blas64.Ger(alpha, xmat, ymat, m.mat) // m += alpha * x * yᵀ
+		return
+	}
+
+	m.reuseAsNonZeroed(ar, ac)
+	for i := 0; i < ar; i++ {
+		for j := 0; j < ac; j++ {
+			m.set(i, j, a.At(i, j)+alpha*x.AtVec(i)*y.AtVec(j))
+		}
+	}
+}
+
+// Outer stores in the receiver the outer product of x and y scaled by alpha,
+// treating both x and y as column vectors.
+//
+//	m = alpha * x * yᵀ
+//
+// In order to update an existing matrix, see RankOne.
+func (m *Dense) Outer(alpha float64, x, y Vector) {
+	r, c := x.Len(), y.Len()
+
+	m.reuseAsZeroed(r, c)
+
+	var xmat, ymat blas64.Vector
+	fast := true // cleared below if x or y is not backed by a *VecDense
+	xU, _ := untransposeExtract(x)
+	if rv, ok := xU.(*VecDense); ok {
+		r, c := xU.Dims() // shadows the outer r, c; used only for the overlap check
+		xmat = rv.mat
+		m.checkOverlap(generalFromVector(xmat, r, c))
+	} else {
+		fast = false
+	}
+	yU, _ := untransposeExtract(y)
+	if rv, ok := yU.(*VecDense); ok {
+		r, c := yU.Dims() // shadows the outer r, c; used only for the overlap check
+		ymat = rv.mat
+		m.checkOverlap(generalFromVector(ymat, r, c))
+	} else {
+		fast = false
+	}
+
+	if fast {
+		for i := 0; i < r; i++ { // clear the r×c region before Ger accumulates into it
+			zero(m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+c])
+		}
+		blas64.Ger(alpha, xmat, ymat, m.mat)
+		return
+	}
+
+	for i := 0; i < r; i++ {
+		for j := 0; j < c; j++ {
+			m.set(i, j, alpha*x.AtVec(i)*y.AtVec(j))
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/diagonal.go b/vendor/gonum.org/v1/gonum/mat/diagonal.go
new file mode 100644
index 00000000000..c42f70c831e
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/diagonal.go
@@ -0,0 +1,342 @@
+// Copyright ©2018 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+var (
+	diagDense *DiagDense
+	_         Matrix          = diagDense
+	_         allMatrix       = diagDense
+	_         denseMatrix     = diagDense
+	_         Diagonal        = diagDense
+	_         MutableDiagonal = diagDense
+	_         Triangular      = diagDense
+	_         TriBanded       = diagDense
+	_         Symmetric       = diagDense
+	_         SymBanded       = diagDense
+	_         Banded          = diagDense
+	_         RawBander       = diagDense
+	_         RawSymBander    = diagDense
+
+	diag Diagonal
+	_    Matrix     = diag
+	_    Diagonal   = diag
+	_    Triangular = diag
+	_    TriBanded  = diag
+	_    Symmetric  = diag
+	_    SymBanded  = diag
+	_    Banded     = diag
+)
+
+// Diagonal represents a diagonal matrix: a square matrix whose elements off
+// the main diagonal are all zero.
+type Diagonal interface {
+	Matrix
+	// Diag returns the number of rows/columns in the matrix.
+	Diag() int
+
+	// The interfaces below are embedded so that a Diagonal
+	// value can be passed directly to functions that operate
+	// on banded, symmetric or triangular matrices.
+	Banded
+	SymBanded
+	Symmetric
+	Triangular
+	TriBanded
+}
+
+// MutableDiagonal is a Diagonal whose elements can be set through SetDiag.
+type MutableDiagonal interface {
+	Diagonal
+	SetDiag(i int, v float64)
+}
+
+// DiagDense is a diagonal matrix that stores only its diagonal elements.
+type DiagDense struct {
+	mat blas64.Vector // the diagonal; element i is at Data[i*Inc]
+}
+
+// NewDiagDense creates a new n×n Diagonal matrix backed by data. If data is
+// nil a zeroed slice of length n is allocated; otherwise len(data) must be n.
+// NewDiagDense panics if n is zero or negative, or if len(data) != n.
+func NewDiagDense(n int, data []float64) *DiagDense {
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0:
+		panic("mat: negative dimension")
+	}
+	if data == nil {
+		data = make([]float64, n)
+	}
+	if len(data) != n {
+		panic(ErrShape)
+	}
+	return &DiagDense{
+		mat: blas64.Vector{N: n, Data: data, Inc: 1},
+	}
+}
+
+// Diag returns the number of rows (equivalently, columns) of the receiver.
+func (d *DiagDense) Diag() int {
+	return d.mat.N
+}
+
+// Dims returns the dimensions of the matrix, which is always square.
+func (d *DiagDense) Dims() (r, c int) {
+	return d.mat.N, d.mat.N
+}
+
+// T returns the transpose of the matrix, which is the receiver itself.
+func (d *DiagDense) T() Matrix {
+	return d
+}
+
+// TTri performs an implicit transpose by returning the receiver inside a
+// TransposeTri. Note that Diagonal matrices are Upper by default.
+func (d *DiagDense) TTri() Triangular {
+	return TransposeTri{d}
+}
+
+// TBand returns the transpose of the matrix as a Banded by wrapping the
+// receiver in a TransposeBand.
+func (d *DiagDense) TBand() Banded {
+	return TransposeBand{d}
+}
+
+// TTriBand returns the transpose of the matrix as a TriBanded by wrapping the
+// receiver in a TransposeTriBand. Diagonal matrices are Upper by default.
+func (d *DiagDense) TTriBand() TriBanded {
+	return TransposeTriBand{d}
+}
+
+// Bandwidth returns the lower and upper bandwidths of the matrix, in that
+// order. Both are always zero for diagonal matrices.
+func (d *DiagDense) Bandwidth() (kl, ku int) {
+	return 0, 0
+}
+
+// SymmetricDim implements the Symmetric interface, returning the matrix order.
+func (d *DiagDense) SymmetricDim() int {
+	return d.mat.N
+}
+
+// SymBand returns the number of rows/columns in the matrix, and the size of
+// the bandwidth, which is always zero for a diagonal matrix.
+func (d *DiagDense) SymBand() (n, k int) {
+	return d.mat.N, 0
+}
+
+// Triangle implements the Triangular interface, reporting the matrix as Upper.
+func (d *DiagDense) Triangle() (int, TriKind) {
+	return d.mat.N, Upper
+}
+
+// TriBand returns the number of rows/columns in the matrix, the size of the
+// bandwidth (always zero), and the orientation. Note that Diagonal matrices
+// are Upper by default.
+func (d *DiagDense) TriBand() (n, k int, kind TriKind) {
+	return d.mat.N, 0, Upper
+}
+
+// Reset empties the matrix so that it can be reused as the
+// receiver of a dimensionally restricted operation.
+//
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (d *DiagDense) Reset() {
+	// Inc and N must only ever be zeroed together:
+	// IsEmpty tests Inc == 0 and Dims reports N.
+	d.mat.Inc = 0
+	d.mat.N = 0
+	d.mat.Data = d.mat.Data[:0]
+}
+
+// Zero overwrites every diagonal element of the receiver with zero.
+func (d *DiagDense) Zero() {
+	for i, k := 0, 0; i < d.mat.N; i, k = i+1, k+d.mat.Inc {
+		d.mat.Data[k] = 0
+	}
+}
+
+// DiagView returns the diagonal as a matrix, which is the receiver itself.
+func (d *DiagDense) DiagView() Diagonal {
+	return d
+}
+
+// DiagFrom copies the diagonal of the r×c matrix m into the receiver. The
+// receiver must be min(r, c) long or empty, otherwise DiagFrom will panic.
+func (d *DiagDense) DiagFrom(m Matrix) {
+	n := min(m.Dims())
+	d.reuseAsNonZeroed(n)
+
+	var vec blas64.Vector // strided view of m's diagonal within its raw storage
+	switch r := m.(type) {
+	case *DiagDense:
+		vec = r.mat
+	case RawBander:
+		mat := r.RawBand()
+		vec = blas64.Vector{
+			N:    n,
+			Inc:  mat.Stride,
+			Data: mat.Data[mat.KL : (n-1)*mat.Stride+mat.KL+1],
+		}
+	case RawMatrixer:
+		mat := r.RawMatrix()
+		vec = blas64.Vector{
+			N:    n,
+			Inc:  mat.Stride + 1,
+			Data: mat.Data[:(n-1)*mat.Stride+n],
+		}
+	case RawSymBander:
+		mat := r.RawSymBand()
+		vec = blas64.Vector{
+			N:    n,
+			Inc:  mat.Stride,
+			Data: mat.Data[:(n-1)*mat.Stride+1],
+		}
+	case RawSymmetricer:
+		mat := r.RawSymmetric()
+		vec = blas64.Vector{
+			N:    n,
+			Inc:  mat.Stride + 1,
+			Data: mat.Data[:(n-1)*mat.Stride+n],
+		}
+	case RawTriBander:
+		mat := r.RawTriBand()
+		data := mat.Data
+		if mat.Uplo == blas.Lower {
+			data = data[mat.K:]
+		}
+		vec = blas64.Vector{
+			N:    n,
+			Inc:  mat.Stride,
+			Data: data[:(n-1)*mat.Stride+1],
+		}
+	case RawTriangular:
+		mat := r.RawTriangular()
+		if mat.Diag == blas.Unit { // diagonal is implicitly all ones
+			for i := 0; i < n; i++ {
+				d.mat.Data[i*d.mat.Inc] = 1 // honor the receiver's increment
+			}
+			return
+		}
+		vec = blas64.Vector{
+			N:    n,
+			Inc:  mat.Stride + 1,
+			Data: mat.Data[:(n-1)*mat.Stride+n],
+		}
+	case RawVectorer:
+		d.mat.Data[0] = r.RawVector().Data[0]
+		return
+	default:
+		for i := 0; i < n; i++ {
+			d.setDiag(i, m.At(i, i))
+		}
+		return
+	}
+	blas64.Copy(vec, d.mat)
+}
+
+// RawBand returns the underlying data used by the receiver represented
+// as a blas64.Band with zero lower and upper bandwidths.
+// Changes to elements in the receiver following the call will be reflected
+// in the returned blas64.Band.
+func (d *DiagDense) RawBand() blas64.Band {
+	return blas64.Band{
+		Rows:   d.mat.N,
+		Cols:   d.mat.N,
+		KL:     0,
+		KU:     0,
+		Stride: d.mat.Inc,
+		Data:   d.mat.Data,
+	}
+}
+
+// RawSymBand returns the underlying data used by the receiver represented
+// as a blas64.SymmetricBand with zero bandwidth and Upper storage.
+// Changes to elements in the receiver following the call will be reflected
+// in the returned blas64.SymmetricBand.
+func (d *DiagDense) RawSymBand() blas64.SymmetricBand {
+	return blas64.SymmetricBand{
+		N:      d.mat.N,
+		K:      0,
+		Stride: d.mat.Inc,
+		Uplo:   blas.Upper,
+		Data:   d.mat.Data,
+	}
+}
+
+// reuseAsNonZeroed gives an empty receiver r×r shape, or panics with ErrShape
+// if a non-empty receiver is not already r×r. It panics if r is zero.
+func (d *DiagDense) reuseAsNonZeroed(r int) {
+	if r == 0 {
+		panic(ErrZeroLength)
+	}
+	if !d.IsEmpty() {
+		if r != d.mat.N {
+			panic(ErrShape)
+		}
+		return
+	}
+	d.mat = blas64.Vector{
+		N:    r,
+		Inc:  1,
+		Data: use(d.mat.Data, r),
+	}
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using
+// Reset.
+func (d *DiagDense) IsEmpty() bool {
+	// A zero Inc marks the empty state; d.Dims() must also
+	// return zeros then. See the comment in Reset().
+	return d.mat.Inc == 0
+}
+
+// Trace returns the sum of the diagonal elements of the receiver.
+//
+// Trace panics with ErrZeroLength if the receiver is empty.
+func (d *DiagDense) Trace() float64 {
+	if d.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	var sum float64
+	vec := d.mat
+	for i := 0; i < vec.N; i++ {
+		sum += vec.Data[i*vec.Inc]
+	}
+	return sum
+}
+
+// Norm computes the requested norm of the receiver. The accepted values are:
+//
+//	1 or Inf - The maximum diagonal element magnitude
+//	2 - The Frobenius norm, the square root of the sum of the squares of
+//	    the diagonal elements
+//
+// Norm panics with ErrNormOrder for any other value of norm, and with
+// ErrZeroLength if the receiver is empty.
+func (d *DiagDense) Norm(norm float64) float64 {
+	if d.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	switch norm {
+	case 1, math.Inf(1):
+		idx := blas64.Iamax(d.mat)
+		return math.Abs(d.at(idx, idx))
+	case 2:
+		return blas64.Nrm2(d.mat)
+	default:
+		panic(ErrNormOrder)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/doc.go b/vendor/gonum.org/v1/gonum/mat/doc.go
new file mode 100644
index 00000000000..f8c078cfef3
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/doc.go
@@ -0,0 +1,200 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package mat provides implementations of float64 and complex128 matrix
+// structures and linear algebra operations on them.
+//
+// # Overview
+//
+// This section provides a quick overview of the mat package. The following
+// sections provide more in depth commentary.
+//
+// mat provides:
+//   - Interfaces for Matrix classes (Matrix, Symmetric, Triangular)
+//   - Concrete implementations (Dense, SymDense, TriDense, VecDense)
+//   - Methods and functions for using matrix data (Add, Trace, SymRankOne)
+//   - Types for constructing and using matrix factorizations (QR, LU, etc.)
+//   - The complementary types for complex matrices, CMatrix, CSymDense, etc.
+//
+// In the documentation below, we use "matrix" as a short-hand for all of
+// the FooDense types implemented in this package. We use "Matrix" to
+// refer to the Matrix interface.
+//
+// A matrix may be constructed through the corresponding New function. If no
+// backing array is provided the matrix will be initialized to all zeros.
+//
+//	// Allocate a zeroed real matrix of size 3×5
+//	zero := mat.NewDense(3, 5, nil)
+//
+// If a backing data slice is provided, the matrix will have those elements.
+// All matrices are stored in row-major format and users should consider
+// this when expressing matrix arithmetic to ensure optimal performance.
+//
+//	// Generate a 6×6 matrix of random values.
+//	data := make([]float64, 36)
+//	for i := range data {
+//		data[i] = rand.NormFloat64()
+//	}
+//	a := mat.NewDense(6, 6, data)
+//
+// Operations involving matrix data are implemented as functions when the values
+// of the matrix remain unchanged
+//
+//	tr := mat.Trace(a)
+//
+// and are implemented as methods when the operation modifies the receiver.
+//
+//	zero.Copy(a)
+//
+// Note that the input arguments to most functions and methods are interfaces
+// rather than concrete types, for example `func Trace(Matrix)` rather than
+// `func Trace(*Dense)`, allowing flexible use of internal and external
+// Matrix types.
+//
+// When a matrix is the destination or receiver for a function or method,
+// the operation will panic if the matrix is not the correct size.
+// An exception to this is when the destination is empty (see below).
+//
+// # Empty matrix
+//
+// An empty matrix is one that has zero size. Empty matrices are used to allow
+// the destination of a matrix operation to assume the correct size automatically.
+// This operation will re-use the backing data, if available, or will allocate
+// new data if necessary. The IsEmpty method returns whether the given matrix
+// is empty. The zero-value of a matrix is empty, and is useful for easily
+// getting the result of matrix operations.
+//
+//	var c mat.Dense // construct a new zero-value matrix
+//	c.Mul(a, a)     // c is automatically adjusted to be the right size
+//
+// The Reset method can be used to revert a matrix to an empty matrix.
+// Reset should not be used when multiple different matrices share the same backing
+// data slice. This can cause unexpected data modifications after being resized.
+// An empty matrix can not be sliced even if it does have an adequately sized
+// backing data slice, but can be expanded using its Grow method if it exists.
+//
+// # The Matrix Interfaces
+//
+// The Matrix interface is the common link between the concrete types of real
+// matrices. The Matrix interface is defined by three methods: Dims, which
+// returns the dimensions of the Matrix, At, which returns the element in the
+// specified location, and T for returning a Transpose (discussed later). All of
+// the matrix types can perform these behaviors and so implement the interface.
+// Methods and functions are designed to use this interface, so in particular the method
+//
+//	func (m *Dense) Mul(a, b Matrix)
+//
+// constructs a *Dense from the result of a multiplication with any Matrix types,
+// not just *Dense. Where more restrictive requirements must be met, there are also
+// additional interfaces like Symmetric and Triangular. For example, in
+//
+//	func (s *SymDense) AddSym(a, b Symmetric)
+//
+// the Symmetric interface guarantees a symmetric result.
+//
+// The CMatrix interface plays the same role for complex matrices. The difference
+// is that the CMatrix type has the H method instead of T, for returning the conjugate
+// transpose.
+//
+// # (Conjugate) Transposes
+//
+// The T method is used for transposition on real matrices, and H is used for
+// conjugate transposition on complex matrices. For example, c.Mul(a.T(), b) computes
+// c = aᵀ * b. The mat types implement this method implicitly —
+// see the Transpose and Conjugate types for more details. Note that some
+// operations have a transpose as part of their definition, as in *SymDense.SymOuterK.
+//
+// # Matrix Factorization
+//
+// Matrix factorizations, such as the LU decomposition, typically have their own
+// specific data storage, and so are each implemented as a specific type. The
+// factorization can be computed through a call to Factorize
+//
+//	var lu mat.LU
+//	lu.Factorize(a)
+//
+// The elements of the factorization can be extracted through methods on the
+// factorized type, for example *LU.UTo. The factorization types can also be used
+// directly, as in *Cholesky.SolveTo. Some factorizations can be updated directly,
+// without needing to update the original matrix and refactorize, for example with
+// *LU.RankOne.
+//
+// # BLAS and LAPACK
+//
+// BLAS and LAPACK are the standard APIs for linear algebra routines. Many
+// operations in mat are implemented using calls to the wrapper functions
+// in gonum/blas/blas64 and gonum/lapack/lapack64 and their complex equivalents.
+// By default, blas64 and lapack64 call the native Go implementations of the
+// routines. Alternatively, it is possible to use C-based implementations of the
+// APIs through the respective cgo packages and the wrapper packages' "Use"
+// functions. The Go implementation of LAPACK makes calls through blas64, so if
+// a cgo BLAS implementation is registered, the lapack64 calls will be partially
+// executed in Go and partially executed in C.
+//
+// # Type Switching
+//
+// The Matrix abstraction enables efficiency as well as interoperability. Go's
+// type reflection capabilities are used to choose the most efficient routine
+// given the specific concrete types. For example, in
+//
+//	c.Mul(a, b)
+//
+// if a and b both implement RawMatrixer, that is, they can be represented as a
+// blas64.General, blas64.Gemm (general matrix multiplication) is called, while
+// instead if b is a RawSymmetricer blas64.Symm is used (general-symmetric
+// multiplication), and if b is a *VecDense blas64.Gemv is used.
+//
+// There are many possible type combinations and special cases. No specific guarantees
+// are made about the performance of any method, and in particular, note that an
+// abstract matrix type may be copied into a concrete type of the corresponding
+// value. If there are specific special cases that are needed, please submit a
+// pull-request or file an issue.
+//
+// # Invariants
+//
+// Matrix input arguments to package functions are never directly modified. If an
+// operation changes Matrix data, the mutated matrix will be the receiver of a
+// method, or will be the first, dst, argument to a method named with a To suffix.
+//
+// For convenience, a matrix may be used as both a receiver and as an input, e.g.
+//
+//	a.Pow(a, 6)
+//	v.SolveVec(a.T(), v)
+//
+// though in many cases this will cause an allocation (see Element Aliasing).
+// An exception to this rule is Copy, which does not allow a.Copy(a.T()).
+//
+// # Element Aliasing
+//
+// Most methods in mat modify receiver data. It is forbidden for the modified
+// data region of the receiver to overlap the used data area of the input
+// arguments. The exception to this rule is when the method receiver is equal to one
+// of the input arguments, as in the a.Pow(a, 6) call above, or its implicit transpose.
+//
+// This prohibition is to help avoid subtle mistakes when the method needs to read
+// from and write to the same data region. There are ways to make mistakes using the
+// mat API, and mat functions will detect and complain about those.
+// There are many ways to make mistakes by excursion from the mat API via
+// interaction with raw matrix values.
+//
+// If you need to read the rest of this section to understand the behavior of
+// your program, you are being clever. Don't be clever. If you must be clever,
+// blas64 and lapack64 may be used to call the behavior directly.
+//
+// mat will use the following rules to detect overlap between the receiver and one
+// of the inputs:
+//   - the input implements one of the Raw methods, and
+//   - the address ranges of the backing data slices overlap, and
+//   - the strides differ or there is an overlap in the used data elements.
+//
+// If such an overlap is detected, the method will panic.
+//
+// The following cases will not panic:
+//   - the data slices do not overlap,
+//   - there is pointer identity between the receiver and input values after
+//     the value has been untransposed if necessary.
+//
+// mat will not attempt to detect element overlap if the input does not implement a
+// Raw method. Method behavior is undefined if there is undetected overlap.
+package mat // import "gonum.org/v1/gonum/mat"
diff --git a/vendor/gonum.org/v1/gonum/mat/eigen.go b/vendor/gonum.org/v1/gonum/mat/eigen.go
new file mode 100644
index 00000000000..859247d880d
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/eigen.go
@@ -0,0 +1,450 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+const (
+	badFact   = "mat: use without successful factorization" // panic value used when the receiver's succFact reports false
+	noVectors = "mat: eigenvectors not computed"            // panic value used when eigenvectors are needed but were not computed
+)
+
+// EigenSym is a type for computing all eigenvalues and, optionally,
+// eigenvectors of a symmetric matrix A.
+//
+// It is a Symmetric matrix represented by its spectral factorization. Once
+// computed, this representation is useful for extracting eigenvalues and
+// eigenvectors, but At is slow.
+type EigenSym struct {
+	vectorsComputed bool // true only after a successful Factorize that was asked for eigenvectors
+
+	values  []float64 // eigenvalues; empty when no successful factorization is stored
+	vectors *Dense    // n×n matrix wrapping the buffer Syev worked on; eigenvectors only if vectorsComputed
+}
+
+// Dims returns (n, n), where n is the order reported by SymmetricDim.
+func (e *EigenSym) Dims() (r, c int) {
+	r = e.SymmetricDim()
+	return r, r
+}
+
+// SymmetricDim implements the Symmetric interface; it is the number of stored eigenvalues.
+func (e *EigenSym) SymmetricDim() int {
+	return len(e.values)
+}
+
+// At returns the element at row i, column j of the matrix A, rebuilt from
+// the factorization as the sum over k of values[k]*Q[i,k]*Q[j,k].
+// At will panic if the eigenvectors have not been computed.
+func (e *EigenSym) At(i, j int) float64 {
+	if !e.vectorsComputed {
+		panic(noVectors)
+	}
+	dim := len(e.values)
+	switch {
+	case uint(i) >= uint(dim):
+		panic(ErrRowAccess)
+	case uint(j) >= uint(dim):
+		panic(ErrColAccess)
+	}
+
+	var sum float64
+	for k, lambda := range e.values {
+		sum += lambda * e.vectors.at(i, k) * e.vectors.at(j, k)
+	}
+	return sum
+}
+
+// T returns the receiver itself: a symmetric matrix is equal to its own transpose.
+func (e *EigenSym) T() Matrix {
+	return e
+}
+
+// Factorize computes the spectral factorization (eigendecomposition) of the
+// symmetric matrix A.
+//
+// The spectral factorization of A can be written as
+//
+//	A = Q * Λ * Qᵀ
+//
+// where Λ is a diagonal matrix whose entries are the eigenvalues, and Q is an
+// orthogonal matrix whose columns are the eigenvectors.
+//
+// If vectors is false, the eigenvectors are not computed and later calls to
+// VectorsTo and At will panic.
+//
+// Factorize returns whether the factorization succeeded. If it returns false,
+// methods that require a successful factorization will panic.
+func (e *EigenSym) Factorize(a Symmetric, vectors bool) (ok bool) {
+	// Kill the previous decomposition so succFact is false if a step below panics.
+	e.vectorsComputed = false
+	e.values = e.values[:0]
+
+	n := a.SymmetricDim()
+	sd := NewSymDense(n, nil)
+	sd.CopySym(a) // Work on a copy: its data is handed to Syev and becomes e.vectors.
+
+	jobz := lapack.EVNone
+	if vectors {
+		jobz = lapack.EVCompute
+	}
+	w := make([]float64, n)
+	work := []float64{0}
+	lapack64.Syev(jobz, sd.mat, w, work, -1) // Workspace query: the size to use is left in work[0].
+
+	work = getFloat64s(int(work[0]), false)
+	ok = lapack64.Syev(jobz, sd.mat, w, work, len(work))
+	putFloat64s(work)
+	if !ok {
+		e.vectorsComputed = false
+		e.values = nil
+		e.vectors = nil
+		return false
+	}
+	e.vectorsComputed = vectors
+	e.values = w
+	e.vectors = NewDense(n, n, sd.mat.Data)
+	return true
+}
+
+// succFact reports whether e holds a factorization, that is, whether values is non-empty.
+func (e *EigenSym) succFact() bool {
+	return len(e.values) != 0
+}
+
+// Values returns the eigenvalues of the factorized n×n matrix A in ascending
+// order.
+//
+// A nil dst causes a new slice of length n to be allocated and returned.
+// Otherwise dst must have length n; the eigenvalues are copied into it and
+// dst is returned.
+//
+// Values panics if the receiver does not contain a successful factorization,
+// or if a non-nil dst has the wrong length.
+func (e *EigenSym) Values(dst []float64) []float64 {
+	if !e.succFact() {
+		panic(badFact)
+	}
+	switch n := len(e.values); {
+	case dst == nil:
+		dst = make([]float64, n)
+	case len(dst) != n:
+		panic(ErrSliceLengthMismatch)
+	}
+	copy(dst, e.values)
+	return dst
+}
+
+// RawValues returns the receiver's own eigenvalue slice, holding the
+// eigenvalues of A in ascending order, without copying it.
+//
+// Modifying the returned slice invalidates the factorization, which should
+// then not be used.
+//
+// RawValues returns nil if there is no successful factorization.
+func (e *EigenSym) RawValues() []float64 {
+	if e.succFact() {
+		return e.values
+	}
+	return nil
+}
+
+// VectorsTo copies the orthonormal eigenvectors of the factorized n×n matrix A
+// into the columns of dst.
+//
+// An empty dst is resized to n×n; a non-empty dst must already be n×n or
+// VectorsTo panics. VectorsTo also panics if the receiver does not contain a
+// successful factorization, or if the eigenvectors were not computed during
+// the factorization.
+func (e *EigenSym) VectorsTo(dst *Dense) {
+	switch {
+	case !e.succFact():
+		panic(badFact)
+	case !e.vectorsComputed:
+		panic(noVectors)
+	}
+	rows, cols := e.vectors.Dims()
+	if !dst.IsEmpty() {
+		dr, dc := dst.Dims()
+		if dr != rows || dc != cols {
+			panic(ErrShape)
+		}
+	} else {
+		dst.ReuseAs(rows, cols)
+	}
+	dst.Copy(e.vectors)
+}
+
+// RawQ returns the orthogonal matrix Q from the spectral factorization of the
+// original matrix A
+//
+//	A = Q * Λ * Qᵀ
+//
+// The columns of Q contain the eigenvectors of A.
+//
+// The returned matrix is the receiver's own storage, not a copy. Modifying it
+// invalidates the factorization, which should then not be used.
+//
+// RawQ returns nil if the receiver does not contain a successful factorization
+// or if the eigenvectors were not computed.
+func (e *EigenSym) RawQ() Matrix {
+	if e.succFact() && e.vectorsComputed {
+		return e.vectors
+	}
+	return nil
+}
+
+// EigenKind specifies the computation of eigenvectors during factorization.
+type EigenKind int
+
+const (
+	// EigenNone specifies to not compute any eigenvectors.
+	EigenNone EigenKind = 0
+	// EigenLeft specifies to compute the left eigenvectors.
+	EigenLeft EigenKind = 1 << iota // iota is 1 on this line, so EigenLeft == 2
+	// EigenRight specifies to compute the right eigenvectors.
+	EigenRight // implicit 1 << iota with iota == 2, so EigenRight == 4
+	// EigenBoth is a convenience value for computing both eigenvectors.
+	EigenBoth EigenKind = EigenLeft | EigenRight
+)
+
+// Eigen is a type for creating and using the eigenvalue decomposition of a dense matrix.
+type Eigen struct {
+	n int // The size of the factorized matrix; zero means no successful factorization.
+
+	kind EigenKind // the kind passed to the last successful Factorize
+
+	values   []complex128 // eigenvalues assembled from Geev's real and imaginary parts
+	rVectors *CDense      // right eigenvectors; Factorize sets nil when EigenRight is not requested
+	lVectors *CDense      // left eigenvectors; Factorize sets nil when EigenLeft is not requested
+}
+
+// succFact reports whether e holds a successful factorization, that is, whether n is non-zero.
+func (e *Eigen) succFact() bool {
+	return e.n != 0
+}
+
+// Factorize computes the eigenvalues of the square matrix a, and optionally
+// the eigenvectors.
+//
+// A right eigenvalue/eigenvector combination is defined by
+//
+//	A * x_r = λ * x_r
+//
+// where x_r is the column vector called an eigenvector, and λ is the corresponding
+// eigenvalue.
+//
+// Similarly, a left eigenvalue/eigenvector combination is defined by
+//
+//	x_l * A = λ * x_l
+//
+// The eigenvalues, but not the eigenvectors, are the same for both decompositions.
+//
+// Typically eigenvectors refer to right eigenvectors.
+//
+// In all cases, Factorize computes the eigenvalues of the matrix. kind
+// specifies which of the eigenvectors, if any, to compute. See the EigenKind
+// documentation for more information.
+// Factorize panics with ErrShape if the input matrix is not square.
+//
+// Factorize returns whether the decomposition succeeded. If the decomposition
+// failed, methods that require a successful factorization will panic.
+func (e *Eigen) Factorize(a Matrix, kind EigenKind) (ok bool) {
+	// kill previous factorization.
+	e.n = 0
+	e.kind = 0
+	// Copy a because it is modified during the Lapack call.
+	r, c := a.Dims()
+	if r != c {
+		panic(ErrShape)
+	}
+	var sd Dense
+	sd.CloneFrom(a)
+
+	left := kind&EigenLeft != 0
+	right := kind&EigenRight != 0
+
+	var vl, vr Dense // left zero-valued unless the matching eigenvectors are requested
+	jobvl := lapack.LeftEVNone
+	jobvr := lapack.RightEVNone
+	if left {
+		vl = *NewDense(r, r, nil)
+		jobvl = lapack.LeftEVCompute
+	}
+	if right {
+		vr = *NewDense(c, c, nil)
+		jobvr = lapack.RightEVCompute
+	}
+
+	wr := getFloat64s(c, false) // real parts of the eigenvalues
+	defer putFloat64s(wr)
+	wi := getFloat64s(c, false) // imaginary parts of the eigenvalues
+	defer putFloat64s(wi)
+
+	work := []float64{0}
+	lapack64.Geev(jobvl, jobvr, sd.mat, wr, wi, vl.mat, vr.mat, work, -1) // Workspace query: the size to use is left in work[0].
+	work = getFloat64s(int(work[0]), false)
+	first := lapack64.Geev(jobvl, jobvr, sd.mat, wr, wi, vl.mat, vr.mat, work, len(work))
+	putFloat64s(work)
+
+	if first != 0 { // any non-zero result from Geev is treated as failure
+		e.values = nil
+		return false
+	}
+	e.n = r
+	e.kind = kind
+
+	// Construct complex eigenvalues from float64 data.
+	values := make([]complex128, r)
+	for i, v := range wr {
+		values[i] = complex(v, wi[i])
+	}
+	e.values = values // must be set before complexEigenTo, which reads e.values
+
+	// Construct complex eigenvectors from float64 data.
+	var cvl, cvr CDense
+	if left {
+		cvl = *NewCDense(r, r, nil)
+		e.complexEigenTo(&cvl, &vl)
+		e.lVectors = &cvl
+	} else {
+		e.lVectors = nil
+	}
+	if right {
+		cvr = *NewCDense(c, c, nil)
+		e.complexEigenTo(&cvr, &vr)
+		e.rVectors = &cvr
+	} else {
+		e.rVectors = nil
+	}
+	return true
+}
+
+// Kind returns the EigenKind of the decomposition, or -1 if the receiver
+// does not contain a successful factorization.
+func (e *Eigen) Kind() EigenKind {
+	if e.succFact() {
+		return e.kind
+	}
+	return -1
+}
+
+// Values returns the eigenvalues of the factorized matrix.
+//
+// If dst is nil, a new slice of length n is allocated, filled with the
+// eigenvalues and returned. Otherwise dst must have length n; the
+// eigenvalues are copied into it and dst is returned.
+//
+// Values panics if the factorization was not successful or len(dst) != n.
+func (e *Eigen) Values(dst []complex128) []complex128 {
+	if !e.succFact() {
+		panic(badFact)
+	}
+	switch {
+	case dst == nil:
+		dst = make([]complex128, e.n)
+	case len(dst) != e.n:
+		panic(ErrSliceLengthMismatch)
+	}
+	copy(dst, e.values)
+	return dst
+}
+
+// complexEigenTo converts the real eigenvector storage d into the complex
+// eigenvectors it encodes and stores them in dst.
+//
+// On return the columns of the n×n matrix dst hold the eigenvectors in the
+// same order as the eigenvalues in e.values.
+// If the j-th eigenvalue is real, then
+//
+//	dst[:,j] = d[:,j],
+//
+// and if it is not real, then the elements of the j-th and (j+1)-th columns of d
+// form complex conjugate pairs and the eigenvectors are recovered as
+//
+//	dst[:,j]   = d[:,j] + i*d[:,j+1],
+//	dst[:,j+1] = d[:,j] - i*d[:,j+1],
+//
+// where i is the imaginary unit. It panics if dst and d differ in shape.
+func (e *Eigen) complexEigenTo(dst *CDense, d *Dense) {
+	rows, cols := d.Dims()
+	// dst must have exactly the shape of d.
+	if dr, dc := dst.Dims(); dr != rows || dc != cols {
+		panic("size mismatch")
+	}
+
+	for j := 0; j < cols; j++ {
+		if imag(e.values[j]) != 0 {
+			// Columns j and j+1 of d hold the real and imaginary parts of
+			// a conjugate pair of eigenvectors.
+			for i := 0; i < rows; i++ {
+				re, im := d.at(i, j), d.at(i, j+1)
+				dst.set(i, j, complex(re, im))
+				dst.set(i, j+1, complex(re, -im))
+			}
+			j++ // The pair used up two columns.
+			continue
+		}
+		// Real eigenvalue: column j of d is the eigenvector itself.
+		for i := 0; i < rows; i++ {
+			dst.set(i, j, complex(d.at(i, j), 0))
+		}
+	}
+}
+
+// VectorsTo copies the right eigenvectors of the decomposition into the
+// columns of dst. The computed eigenvectors are normalized to have Euclidean
+// norm equal to 1 and largest component real.
+//
+// An empty dst is resized to n×n; a non-empty dst must already be n×n or
+// VectorsTo panics. VectorsTo also panics if the receiver does not contain a
+// successful factorization, or if the right eigenvectors were not computed
+// during the factorization.
+func (e *Eigen) VectorsTo(dst *CDense) {
+	switch {
+	case !e.succFact():
+		panic(badFact)
+	case e.kind&EigenRight == 0:
+		panic(noVectors)
+	}
+	n := e.n
+	if !dst.IsEmpty() {
+		if r, c := dst.Dims(); r != n || c != n {
+			panic(ErrShape)
+		}
+	} else {
+		dst.ReuseAs(n, n)
+	}
+	dst.Copy(e.rVectors)
+}
+
+// LeftVectorsTo copies the left eigenvectors of the decomposition into the
+// columns of dst. The computed eigenvectors are normalized to have Euclidean
+// norm equal to 1 and largest component real.
+//
+// An empty dst is resized to n×n; a non-empty dst must already be n×n or
+// LeftVectorsTo panics. LeftVectorsTo also panics if the receiver does not
+// contain a successful factorization, or if the left eigenvectors were not
+// computed during the factorization.
+func (e *Eigen) LeftVectorsTo(dst *CDense) {
+	switch {
+	case !e.succFact():
+		panic(badFact)
+	case e.kind&EigenLeft == 0:
+		panic(noVectors)
+	}
+	n := e.n
+	if !dst.IsEmpty() {
+		if r, c := dst.Dims(); r != n || c != n {
+			panic(ErrShape)
+		}
+	} else {
+		dst.ReuseAs(n, n)
+	}
+	dst.Copy(e.lVectors)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/errors.go b/vendor/gonum.org/v1/gonum/mat/errors.go
new file mode 100644
index 00000000000..641d816219f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/errors.go
@@ -0,0 +1,154 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"fmt"
+	"runtime"
+
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Condition is the condition number of a matrix. The condition number is defined
+// as |A| * |A^-1|.
+//
+// One important use of Condition is during linear solve routines (finding x such that
+// A * x = b). The condition number of A indicates the accuracy of the computed solution.
+// A Condition error will be returned if the condition number of A is sufficiently large.
+// If A is exactly singular to working precision, Condition == ∞, and the solve algorithm
+// may have completed early. If Condition is large and finite, the solve algorithm will be
+// performed, but the computed solution may be inaccurate. Due to the nature of finite
+// precision arithmetic, the value of Condition is only an approximate test of singularity.
+type Condition float64
+
+// Error implements the error interface, reporting the condition number in %.4e form.
+func (c Condition) Error() string {
+	return fmt.Sprintf("matrix singular or near-singular with condition number %.4e", c)
+}
+
+// ConditionTolerance is the tolerance limit of the condition number. If the
+// condition number is above this value, the matrix is considered singular.
+const ConditionTolerance = 1e16
+
+const (
+	// CondNorm is the matrix norm used for computing the condition number by routines
+	// in the matrix packages.
+	CondNorm = lapack.MaxRowSum
+
+	// CondNormTrans is the norm used to compute on Aᵀ to get the same result as
+	// computing CondNorm on A.
+	CondNormTrans = lapack.MaxColumnSum
+)
+
+const stackTraceBufferSize = 1 << 20 // 1 MiB buffer that the Maybe wrappers pass to runtime.Stack
+
+// Maybe will recover a panic with a type mat.Error from fn, and return this error
+// as the Err field of an ErrorStack. The stack trace for the panicking function will be
+// recovered and placed in the StackTrace field. Any other error is re-panicked.
+func Maybe(fn func()) (err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			if e, ok := r.(Error); ok {
+				if e.string == "" {
+					panic("mat: invalid error")
+				}
+				buf := make([]byte, stackTraceBufferSize)
+				n := runtime.Stack(buf, false)
+				err = ErrorStack{Err: e, StackTrace: string(buf[:n])}
+				return
+			}
+			panic(r)
+		}
+	}()
+	fn()
+	return
+}
+
+// MaybeFloat will recover a panic with a type mat.Error from fn, and return this error
+// as the Err field of an ErrorStack. The stack trace for the panicking function will be
+// recovered and placed in the StackTrace field. Any other error is re-panicked.
+func MaybeFloat(fn func() float64) (f float64, err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			if e, ok := r.(Error); ok {
+				if e.string == "" {
+					panic("mat: invalid error")
+				}
+				buf := make([]byte, stackTraceBufferSize)
+				n := runtime.Stack(buf, false)
+				err = ErrorStack{Err: e, StackTrace: string(buf[:n])}
+				return
+			}
+			panic(r)
+		}
+	}()
+	return fn(), nil
+}
+
+// MaybeComplex will recover a panic with a type mat.Error from fn, and return this error
+// as the Err field of an ErrorStack. The stack trace for the panicking function will be
+// recovered and placed in the StackTrace field. Any other error is re-panicked.
+func MaybeComplex(fn func() complex128) (f complex128, err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			if e, ok := r.(Error); ok {
+				if e.string == "" {
+					panic("mat: invalid error")
+				}
+				buf := make([]byte, stackTraceBufferSize)
+				n := runtime.Stack(buf, false)
+				err = ErrorStack{Err: e, StackTrace: string(buf[:n])}
+				return
+			}
+			panic(r)
+		}
+	}()
+	return fn(), nil
+}
+
+// Error represents matrix handling errors. Panics carrying an Error are recovered by the Maybe wrappers.
+type Error struct{ string }
+
+func (err Error) Error() string { return err.string } // Error implements the error interface.
+
+var (
+	ErrNegativeDimension   = Error{"mat: negative dimension"}
+	ErrIndexOutOfRange     = Error{"mat: index out of range"}
+	ErrReuseNonEmpty       = Error{"mat: reuse of non-empty matrix"}
+	ErrRowAccess           = Error{"mat: row index out of range"}    // e.g. EigenSym.At with a bad row index
+	ErrColAccess           = Error{"mat: column index out of range"} // e.g. EigenSym.At with a bad column index
+	ErrVectorAccess        = Error{"mat: vector index out of range"}
+	ErrZeroLength          = Error{"mat: zero length in matrix dimension"}
+	ErrRowLength           = Error{"mat: row length mismatch"}
+	ErrColLength           = Error{"mat: col length mismatch"}
+	ErrSquare              = Error{"mat: expect square matrix"}
+	ErrNormOrder           = Error{"mat: invalid norm order for matrix"}
+	ErrSingular            = Error{"mat: matrix is singular"}
+	ErrShape               = Error{"mat: dimension mismatch"} // e.g. Eigen.Factorize on a non-square input
+	ErrIllegalStride       = Error{"mat: illegal stride"}
+	ErrPivot               = Error{"mat: malformed pivot list"}
+	ErrTriangle            = Error{"mat: triangular storage mismatch"}
+	ErrTriangleSet         = Error{"mat: triangular set out of bounds"}
+	ErrBandwidth           = Error{"mat: bandwidth out of range"}
+	ErrBandSet             = Error{"mat: band set out of bounds"}
+	ErrDiagSet             = Error{"mat: diagonal set out of bounds"}
+	ErrSliceLengthMismatch = Error{"mat: input slice length mismatch"} // e.g. Values with a dst of the wrong length
+	ErrNotPSD              = Error{"mat: input not positive symmetric definite"}
+	ErrFailedEigen         = Error{"mat: eigendecomposition not successful"}
+)
+
+// ErrorStack represents matrix handling errors that have been recovered by Maybe wrappers.
+type ErrorStack struct {
+	Err error // the recovered error; the Maybe wrappers store the mat.Error panic value here
+
+	// StackTrace is the stack trace
+	// recovered by Maybe, MaybeFloat
+	// or MaybeComplex.
+	StackTrace string
+}
+
+func (err ErrorStack) Error() string { return err.Err.Error() } // Error returns the message of the wrapped Err.
+
+const badCap = "mat: bad capacity" // not used in this file; presumably a panic value elsewhere in the package — confirm
diff --git a/vendor/gonum.org/v1/gonum/mat/format.go b/vendor/gonum.org/v1/gonum/mat/format.go
new file mode 100644
index 00000000000..c239ddd3638
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/format.go
@@ -0,0 +1,516 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+// Formatted returns a fmt.Formatter for the matrix m using the given options.
+func Formatted(m Matrix, options ...FormatOption) fmt.Formatter {
+	f := formatter{
+		matrix: m,
+		dot:    '.',
+	}
+	for _, o := range options {
+		o(&f)
+	}
+	return f
+}
+
+type formatter struct {
+	matrix  Matrix // the matrix to print
+	prefix  string // prepended to output lines after the first (see Prefix)
+	margin  int    // rows/columns shown at each edge; <= 0 prints everything (see Excerpt)
+	dot     byte   // stands in for zero elements under the ' ' flag (see DotByte)
+	squeeze bool   // size each column independently (see Squeeze)
+
+	format func(m Matrix, prefix string, margin int, dot byte, squeeze bool, fs fmt.State, c rune) // nil selects the default format function
+}
+
+// FormatOption is a functional option that configures the formatter built by Formatted.
+type FormatOption func(*formatter)
+
+// Prefix sets the formatted prefix to the string p. The prefix is prepended to each line
+// of output after the first line.
+func Prefix(p string) FormatOption {
+	return func(f *formatter) { f.prefix = p }
+}
+
+// Excerpt sets the maximum number of rows and columns to print at the margins of the matrix
+// to m. If m is zero or less, all elements are printed.
+func Excerpt(m int) FormatOption {
+	return func(f *formatter) { f.margin = m }
+}
+
+// DotByte sets the dot character to b. The dot character replaces zero elements when
+// the result is printed with the fmt ' ' verb flag. Without a DotByte option, the default
+// dot character is '.'.
+func DotByte(b byte) FormatOption {
+	return func(f *formatter) { f.dot = b }
+}
+
+// Squeeze sets the printing behavior to size each column to its own widest element.
+func Squeeze() FormatOption {
+	return func(f *formatter) { f.squeeze = true }
+}
+
+// FormatMATLAB sets the printing behavior to output MATLAB syntax. If MATLAB syntax is
+// specified, the ' ' verb flag and Excerpt option are ignored. If the alternative syntax
+// verb flag, '#', is used, the matrix is formatted in rows and columns.
+func FormatMATLAB() FormatOption {
+	return func(f *formatter) { f.format = formatMATLAB }
+}
+
+// FormatPython sets the printing behavior to output Python syntax. If Python syntax is
+// specified, the ' ' verb flag and Excerpt option are ignored. If the alternative syntax
+// verb flag, '#', is used, the matrix is formatted in rows and columns.
+func FormatPython() FormatOption {
+	return func(f *formatter) { f.format = formatPython }
+}
+
+// Format satisfies the fmt.Formatter interface. A configured format function always
+// renders; without one, %#v prints the wrapped matrix itself and all else uses format.
+func (f formatter) Format(fs fmt.State, c rune) {
+	switch {
+	case f.format != nil:
+		f.format(f.matrix, f.prefix, f.margin, f.dot, f.squeeze, fs, c)
+	case c == 'v' && fs.Flag('#'):
+		fmt.Fprintf(fs, "%#v", f.matrix)
+	default:
+		format(f.matrix, f.prefix, f.margin, f.dot, f.squeeze, fs, c)
+	}
+}
+
+// format prints a pretty representation of m to the fs io.Writer. The format character c
+// specifies the numerical representation of elements; valid values are those for float64
+// specified in the fmt package, with their associated flags. In addition to this, a space
+// preceding a verb indicates that zero values should be represented by the dot character.
+// The printed range of the matrix can be limited by specifying a positive value for margin:
+// if margin is greater than zero, only the first and last margin rows/columns of the matrix
+// are output. If squeeze is true, column widths are determined on a per-column basis.
+//
+// format will not provide Go syntax output.
+func format(m Matrix, prefix string, margin int, dot byte, squeeze bool, fs fmt.State, c rune) {
+	rows, cols := m.Dims()
+
+	var printed int
+	if margin <= 0 {
+		printed = rows
+		if cols > printed {
+			printed = cols
+		}
+	} else {
+		printed = margin
+	}
+
+	prec, pOk := fs.Precision()
+	if !pOk {
+		prec = -1
+	}
+
+	var (
+		maxWidth int
+		widths   widther
+		buf, pad []byte
+	)
+	if squeeze {
+		widths = make(columnWidth, cols)
+	} else {
+		widths = new(uniformWidth)
+	}
+	switch c {
+	case 'v', 'e', 'E', 'f', 'F', 'g', 'G':
+		if c == 'v' {
+			buf, maxWidth = maxCellWidth(m, 'g', printed, prec, widths)
+		} else {
+			buf, maxWidth = maxCellWidth(m, c, printed, prec, widths)
+		}
+	default:
+		fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols) // unsupported verb: emit a %!verb(...) marker
+		return
+	}
+	width, _ := fs.Width()
+	width = max(width, maxWidth)
+	pad = make([]byte, max(width, 2)) // at least 2 so that pad[:2] can serve as the column separator
+	for i := range pad {
+		pad[i] = ' '
+	}
+
+	first := true
+	if rows > 2*printed || cols > 2*printed { // an excerpt is shown, so lead with the full dimensions
+		first = false
+		fmt.Fprintf(fs, "Dims(%d, %d)\n", rows, cols)
+	}
+
+	skipZero := fs.Flag(' ')
+	for i := 0; i < rows; i++ {
+		if !first {
+			fmt.Fprint(fs, prefix)
+		}
+		first = false
+		var el string
+		switch {
+		case rows == 1:
+			fmt.Fprint(fs, "[")
+			el = "]"
+		case i == 0:
+			fmt.Fprint(fs, "⎡")
+			el = "⎤\n"
+		case i < rows-1:
+			fmt.Fprint(fs, "⎢")
+			el = "⎥\n"
+		default:
+			fmt.Fprint(fs, "⎣")
+			el = "⎦"
+		}
+
+		for j := 0; j < cols; j++ {
+			if j >= printed && j < cols-printed {
+				j = cols - printed - 1 // jump past the elided middle columns
+				if i == 0 || i == rows-1 {
+					fmt.Fprint(fs, "...  ...  ")
+				} else {
+					fmt.Fprint(fs, "          ")
+				}
+				continue
+			}
+
+			v := m.At(i, j)
+			if v == 0 && skipZero {
+				buf = buf[:1] // zero under the ' ' flag: print the dot byte instead
+				buf[0] = dot
+			} else {
+				if c == 'v' {
+					buf = strconv.AppendFloat(buf[:0], v, 'g', prec, 64)
+				} else {
+					buf = strconv.AppendFloat(buf[:0], v, byte(c), prec, 64)
+				}
+			}
+			if fs.Flag('-') {
+				fs.Write(buf)
+				fs.Write(pad[:widths.width(j)-len(buf)]) // relies on maxCellWidth having measured this cell
+			} else {
+				fs.Write(pad[:widths.width(j)-len(buf)])
+				fs.Write(buf)
+			}
+
+			if j < cols-1 {
+				fs.Write(pad[:2])
+			}
+		}
+
+		fmt.Fprint(fs, el)
+
+		if i >= printed-1 && i < rows-printed && 2*printed < rows {
+			i = rows - printed - 1 // row printed-1 was just written; jump past the elided middle rows
+			fmt.Fprintf(fs, "%s .\n%[1]s .\n%[1]s .\n", prefix)
+			continue
+		}
+	}
+}
+
+// formatMATLAB prints a MATLAB representation of m to the fs io.Writer. The format character c
+// specifies the numerical representation of elements; valid values are those for float64
+// specified in the fmt package, with their associated flags.
+// The margin and dot arguments are ignored, so the whole matrix is always printed.
+// If squeeze is true, column widths in the '#' layout are determined on a per-column basis.
+//
+// formatMATLAB will not provide Go syntax output.
+func formatMATLAB(m Matrix, prefix string, _ int, _ byte, squeeze bool, fs fmt.State, c rune) {
+	rows, cols := m.Dims()
+
+	prec, pOk := fs.Precision()
+	width, _ := fs.Width()
+	if !fs.Flag('#') { // compact single-line form: [a b; c d]
+		switch c {
+		case 'v', 'e', 'E', 'f', 'F', 'g', 'G':
+		default:
+			fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols)
+			return
+		}
+		format := fmtString(fs, c, prec, width) // per-element verb rebuilt from the caller's flags
+		fs.Write([]byte{'['})
+		for i := 0; i < rows; i++ {
+			if i != 0 {
+				fs.Write([]byte("; "))
+			}
+			for j := 0; j < cols; j++ {
+				if j != 0 {
+					fs.Write([]byte{' '})
+				}
+				fmt.Fprintf(fs, format, m.At(i, j))
+			}
+		}
+		fs.Write([]byte{']'})
+		return
+	}
+
+	if !pOk {
+		prec = -1
+	}
+
+	printed := rows // max(rows, cols): nothing is elided
+	if cols > printed {
+		printed = cols
+	}
+
+	var (
+		maxWidth int
+		widths   widther
+		buf, pad []byte
+	)
+	if squeeze {
+		widths = make(columnWidth, cols)
+	} else {
+		widths = new(uniformWidth)
+	}
+	switch c {
+	case 'v', 'e', 'E', 'f', 'F', 'g', 'G':
+		if c == 'v' {
+			buf, maxWidth = maxCellWidth(m, 'g', printed, prec, widths)
+		} else {
+			buf, maxWidth = maxCellWidth(m, c, printed, prec, widths)
+		}
+	default:
+		fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols)
+		return
+	}
+	width = max(width, maxWidth)
+	pad = make([]byte, max(width, 1)) // at least 1 so that pad[:1] can serve as the column separator
+	for i := range pad {
+		pad[i] = ' '
+	}
+
+	for i := 0; i < rows; i++ {
+		var el string
+		switch {
+		case rows == 1:
+			fmt.Fprint(fs, "[")
+			el = "]"
+		case i == 0:
+			fmt.Fprint(fs, "[\n"+prefix+" ")
+			el = "\n"
+		case i < rows-1:
+			fmt.Fprint(fs, prefix+" ")
+			el = "\n"
+		default:
+			fmt.Fprint(fs, prefix+" ")
+			el = "\n" + prefix + "]"
+		}
+
+		for j := 0; j < cols; j++ {
+			v := m.At(i, j)
+			if c == 'v' {
+				buf = strconv.AppendFloat(buf[:0], v, 'g', prec, 64)
+			} else {
+				buf = strconv.AppendFloat(buf[:0], v, byte(c), prec, 64)
+			}
+			if fs.Flag('-') {
+				fs.Write(buf)
+				fs.Write(pad[:widths.width(j)-len(buf)])
+			} else {
+				fs.Write(pad[:widths.width(j)-len(buf)])
+				fs.Write(buf)
+			}
+
+			if j < cols-1 {
+				fs.Write(pad[:1])
+			}
+		}
+
+		fmt.Fprint(fs, el)
+	}
+}
+
+// formatPython prints a Python representation of m to the fs io.Writer. The format character c
+// specifies the numerical representation of elements; valid values are those for float64
+// specified in the fmt package, with their associated flags.
+// The margin and dot arguments are ignored, so the whole matrix is always printed.
+// If squeeze is true, column widths in the '#' layout are determined on a per-column basis.
+//
+// formatPython will not provide Go syntax output.
+func formatPython(m Matrix, prefix string, _ int, _ byte, squeeze bool, fs fmt.State, c rune) {
+	rows, cols := m.Dims()
+
+	prec, pOk := fs.Precision()
+	width, _ := fs.Width()
+	if !fs.Flag('#') { // compact single-line form: [[a, b], [c, d]]
+		switch c {
+		case 'v', 'e', 'E', 'f', 'F', 'g', 'G':
+		default:
+			fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols)
+			return
+		}
+		format := fmtString(fs, c, prec, width) // per-element verb rebuilt from the caller's flags
+		fs.Write([]byte{'['})
+		if rows > 1 { // a single row is printed as a flat list
+			fs.Write([]byte{'['})
+		}
+		for i := 0; i < rows; i++ {
+			if i != 0 {
+				fs.Write([]byte("], ["))
+			}
+			for j := 0; j < cols; j++ {
+				if j != 0 {
+					fs.Write([]byte(", "))
+				}
+				fmt.Fprintf(fs, format, m.At(i, j))
+			}
+		}
+		if rows > 1 {
+			fs.Write([]byte{']'})
+		}
+		fs.Write([]byte{']'})
+		return
+	}
+
+	if !pOk {
+		prec = -1
+	}
+
+	printed := rows // max(rows, cols): nothing is elided
+	if cols > printed {
+		printed = cols
+	}
+
+	var (
+		maxWidth int
+		widths   widther
+		buf, pad []byte
+	)
+	if squeeze {
+		widths = make(columnWidth, cols)
+	} else {
+		widths = new(uniformWidth)
+	}
+	switch c {
+	case 'v', 'e', 'E', 'f', 'F', 'g', 'G':
+		if c == 'v' {
+			buf, maxWidth = maxCellWidth(m, 'g', printed, prec, widths)
+		} else {
+			buf, maxWidth = maxCellWidth(m, c, printed, prec, widths)
+		}
+	default:
+		fmt.Fprintf(fs, "%%!%c(%T=Dims(%d, %d))", c, m, rows, cols)
+		return
+	}
+	width = max(width, maxWidth)
+	pad = make([]byte, max(width, 1)) // at least 1 so that pad[:1] can follow each comma
+	for i := range pad {
+		pad[i] = ' '
+	}
+
+	for i := 0; i < rows; i++ {
+		if i != 0 {
+			fmt.Fprint(fs, prefix)
+		}
+		var el string
+		switch {
+		case rows == 1:
+			fmt.Fprint(fs, "[")
+			el = "]"
+		case i == 0:
+			fmt.Fprint(fs, "[[")
+			el = "],\n"
+		case i < rows-1:
+			fmt.Fprint(fs, " [")
+			el = "],\n"
+		default:
+			fmt.Fprint(fs, " [")
+			el = "]]"
+		}
+
+		for j := 0; j < cols; j++ {
+			v := m.At(i, j)
+			if c == 'v' {
+				buf = strconv.AppendFloat(buf[:0], v, 'g', prec, 64)
+			} else {
+				buf = strconv.AppendFloat(buf[:0], v, byte(c), prec, 64)
+			}
+			if fs.Flag('-') {
+				fs.Write(buf)
+				fs.Write(pad[:widths.width(j)-len(buf)])
+			} else {
+				fs.Write(pad[:widths.width(j)-len(buf)])
+				fs.Write(buf)
+			}
+
+			if j < cols-1 {
+				fs.Write([]byte{','})
+				fs.Write(pad[:1])
+			}
+		}
+
+		fmt.Fprint(fs, el)
+	}
+}
+
+// fmtString rebuilds a printf verb string from the flags set on fs plus width, prec and c.
+func fmtString(fs fmt.State, c rune, prec, width int) string {
+	var verb strings.Builder
+	verb.WriteString("%")
+	for _, flag := range []byte("0+- ") {
+		if fs.Flag(int(flag)) {
+			verb.WriteByte(flag)
+		}
+	}
+	if width >= 0 {
+		fmt.Fprint(&verb, width)
+	}
+	switch {
+	case prec > 0:
+		fmt.Fprintf(&verb, ".%d", prec)
+	case prec == 0:
+		verb.WriteByte('.')
+	}
+	verb.WriteRune(c)
+	return verb.String()
+}
+
+func maxCellWidth(m Matrix, c rune, printed, prec int, w widther) ([]byte, int) {
+	var (
+		buf        = make([]byte, 0, 64)
+		rows, cols = m.Dims()
+		max        int
+	)
+	for i := 0; i < rows; i++ {
+		if i >= printed-1 && i < rows-printed && 2*printed < rows {
+			i = rows - printed - 1
+			continue
+		}
+		for j := 0; j < cols; j++ {
+			if j >= printed && j < cols-printed {
+				continue
+			}
+
+			buf = strconv.AppendFloat(buf, m.At(i, j), byte(c), prec, 64)
+			if len(buf) > max {
+				max = len(buf)
+			}
+			if len(buf) > w.width(j) {
+				w.setWidth(j, len(buf))
+			}
+			buf = buf[:0]
+		}
+	}
+	return buf, max
+}
+
+type widther interface {
+	width(i int) int   // width recorded for column i
+	setWidth(i, w int) // record w as the width of column i
+}
+
+type uniformWidth int // a single width shared by every column; the column index is ignored
+
+func (u *uniformWidth) width(_ int) int   { return int(*u) }
+func (u *uniformWidth) setWidth(_, w int) { *u = uniformWidth(w) }
+
+type columnWidth []int // one width per column, indexed by column number
+
+func (c columnWidth) width(i int) int   { return c[i] }
+func (c columnWidth) setWidth(i, w int) { c[i] = w }
diff --git a/vendor/gonum.org/v1/gonum/mat/gsvd.go b/vendor/gonum.org/v1/gonum/mat/gsvd.go
new file mode 100644
index 00000000000..02286207cf1
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/gsvd.go
@@ -0,0 +1,436 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/floats"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+// GSVDKind specifies the treatment of singular vectors during a GSVD
+// factorization.
+type GSVDKind int
+
+const (
+	// GSVDNone specifies that no singular vectors should be computed during
+	// the decomposition.
+	GSVDNone GSVDKind = 0
+
+	// GSVDU specifies that the U singular vectors should be computed during
+	// the decomposition.
+	GSVDU GSVDKind = 1 << iota
+	// GSVDV specifies that the V singular vectors should be computed during
+	// the decomposition.
+	GSVDV
+	// GSVDQ specifies that the Q singular vectors should be computed during
+	// the decomposition.
+	GSVDQ
+
+	// GSVDAll is a convenience value for computing all of the singular vectors.
+	GSVDAll = GSVDU | GSVDV | GSVDQ
+)
+
+// GSVD is a type for creating and using the Generalized Singular Value Decomposition
+// (GSVD) of a pair of matrices.
+//
+// The factorization is a linear transformation of the data sets from the given
+// variable×sample spaces to reduced and diagonalized "eigenvariable"×"eigensample"
+// spaces.
+type GSVD struct {
+	kind GSVDKind
+
+	r, p, c, k, l int
+	s1, s2        []float64
+	a, b, u, v, q blas64.General
+
+	work  []float64
+	iwork []int
+}
+
+// succFact returns whether the receiver contains a successful factorization.
+func (gsvd *GSVD) succFact() bool {
+	return gsvd.r != 0
+}
+
+// Factorize computes the generalized singular value decomposition (GSVD) of the input
+// matrices: the r×c matrix A and the p×c matrix B. The singular values of A and B are computed
+// in all cases, while the singular vectors are optionally computed depending on the
+// input kind.
+//
+// The full singular value decomposition (kind == GSVDAll) deconstructs A and B as
+//
+//	A = U * Σ₁ * [ 0 R ] * Qᵀ
+//
+//	B = V * Σ₂ * [ 0 R ] * Qᵀ
+//
+// where Σ₁ and Σ₂ are r×(k+l) and p×(k+l) diagonal matrices of singular values, and
+// U, V and Q are r×r, p×p and c×c orthogonal matrices of singular vectors. k+l is the
+// effective numerical rank of the matrix [ Aᵀ Bᵀ ]ᵀ.
+//
+// It is frequently not necessary to compute the full GSVD. Computation time and
+// storage costs can be reduced using the appropriate kind. Either only the singular
+// values can be computed (kind == GSVDNone), or in conjunction with specific singular
+// vectors (kind bit set according to GSVDU, GSVDV and GSVDQ).
+//
+// Factorize returns whether the decomposition succeeded. If the decomposition
+// failed, routines that require a successful factorization will panic.
+func (gsvd *GSVD) Factorize(a, b Matrix, kind GSVDKind) (ok bool) {
+	// kill the previous decomposition
+	gsvd.r = 0
+	gsvd.kind = 0
+
+	r, c := a.Dims()
+	gsvd.r, gsvd.c = r, c
+	p, c := b.Dims()
+	gsvd.p = p
+	if gsvd.c != c {
+		panic(ErrShape)
+	}
+	var jobU, jobV, jobQ lapack.GSVDJob
+	switch {
+	default:
+		panic("gsvd: bad input kind")
+	case kind == GSVDNone:
+		jobU = lapack.GSVDNone
+		jobV = lapack.GSVDNone
+		jobQ = lapack.GSVDNone
+	case GSVDAll&kind != 0:
+		if GSVDU&kind != 0 {
+			jobU = lapack.GSVDU
+			gsvd.u = blas64.General{
+				Rows:   r,
+				Cols:   r,
+				Stride: r,
+				Data:   use(gsvd.u.Data, r*r),
+			}
+		}
+		if GSVDV&kind != 0 {
+			jobV = lapack.GSVDV
+			gsvd.v = blas64.General{
+				Rows:   p,
+				Cols:   p,
+				Stride: p,
+				Data:   use(gsvd.v.Data, p*p),
+			}
+		}
+		if GSVDQ&kind != 0 {
+			jobQ = lapack.GSVDQ
+			gsvd.q = blas64.General{
+				Rows:   c,
+				Cols:   c,
+				Stride: c,
+				Data:   use(gsvd.q.Data, c*c),
+			}
+		}
+	}
+
+	// A and B are destroyed on call, so copy the matrices.
+	aCopy := DenseCopyOf(a)
+	bCopy := DenseCopyOf(b)
+
+	gsvd.s1 = use(gsvd.s1, c)
+	gsvd.s2 = use(gsvd.s2, c)
+
+	gsvd.iwork = useInt(gsvd.iwork, c)
+
+	gsvd.work = use(gsvd.work, 1)
+	lapack64.Ggsvd3(jobU, jobV, jobQ, aCopy.mat, bCopy.mat, gsvd.s1, gsvd.s2, gsvd.u, gsvd.v, gsvd.q, gsvd.work, -1, gsvd.iwork)
+	gsvd.work = use(gsvd.work, int(gsvd.work[0]))
+	gsvd.k, gsvd.l, ok = lapack64.Ggsvd3(jobU, jobV, jobQ, aCopy.mat, bCopy.mat, gsvd.s1, gsvd.s2, gsvd.u, gsvd.v, gsvd.q, gsvd.work, len(gsvd.work), gsvd.iwork)
+	if ok {
+		gsvd.a = aCopy.mat
+		gsvd.b = bCopy.mat
+		gsvd.kind = kind
+	}
+	return ok
+}
+
+// Kind returns the GSVDKind of the decomposition. If no decomposition has been
+// computed, Kind returns -1.
+func (gsvd *GSVD) Kind() GSVDKind {
+	if !gsvd.succFact() {
+		return -1
+	}
+	return gsvd.kind
+}
+
+// Rank returns the k and l terms of the rank of [ Aᵀ Bᵀ ]ᵀ.
+func (gsvd *GSVD) Rank() (k, l int) {
+	return gsvd.k, gsvd.l
+}
+
+// GeneralizedValues returns the generalized singular values of the factorized matrices.
+// If the input slice is non-nil, the values will be stored in-place into the slice.
+// In this case, the slice must have length min(r,c)-k, and GeneralizedValues will
+// panic with ErrSliceLengthMismatch otherwise. If the input slice is nil,
+// a new slice of the appropriate length will be allocated and returned.
+//
+// GeneralizedValues will panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) GeneralizedValues(v []float64) []float64 {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	c := gsvd.c
+	k := gsvd.k
+	d := min(r, c)
+	if v == nil {
+		v = make([]float64, d-k)
+	}
+	if len(v) != d-k {
+		panic(ErrSliceLengthMismatch)
+	}
+	floats.DivTo(v, gsvd.s1[k:d], gsvd.s2[k:d])
+	return v
+}
+
+// ValuesA returns the singular values of the factorized A matrix.
+// If the input slice is non-nil, the values will be stored in-place into the slice.
+// In this case, the slice must have length min(r,c)-k, and ValuesA will panic with
+// ErrSliceLengthMismatch otherwise. If the input slice is nil,
+// a new slice of the appropriate length will be allocated and returned.
+//
+// ValuesA will panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) ValuesA(s []float64) []float64 {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	c := gsvd.c
+	k := gsvd.k
+	d := min(r, c)
+	if s == nil {
+		s = make([]float64, d-k)
+	}
+	if len(s) != d-k {
+		panic(ErrSliceLengthMismatch)
+	}
+	copy(s, gsvd.s1[k:min(r, c)])
+	return s
+}
+
+// ValuesB returns the singular values of the factorized B matrix.
+// If the input slice is non-nil, the values will be stored in-place into the slice.
+// In this case, the slice must have length min(r,c)-k, and ValuesB will panic with
+// ErrSliceLengthMismatch otherwise. If the input slice is nil,
+// a new slice of the appropriate length will be allocated and returned.
+//
+// ValuesB will panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) ValuesB(s []float64) []float64 {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	c := gsvd.c
+	k := gsvd.k
+	d := min(r, c)
+	if s == nil {
+		s = make([]float64, d-k)
+	}
+	if len(s) != d-k {
+		panic(ErrSliceLengthMismatch)
+	}
+	copy(s, gsvd.s2[k:d])
+	return s
+}
+
+// ZeroRTo extracts the matrix [ 0 R ] from the singular value decomposition,
+// storing the result into dst. [ 0 R ] is of size (k+l)×c.
+//
+// If dst is empty, ZeroRTo will resize dst to be (k+l)×c. When dst is
+// non-empty, ZeroRTo will panic if dst is not (k+l)×c. ZeroRTo will also panic
+// if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) ZeroRTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	c := gsvd.c
+	k := gsvd.k
+	l := gsvd.l
+	h := min(k+l, r)
+	if dst.IsEmpty() {
+		dst.ReuseAs(k+l, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r2 != k+l || c != c2 {
+			panic(ErrShape)
+		}
+		dst.Zero()
+	}
+	a := Dense{
+		mat:     gsvd.a,
+		capRows: r,
+		capCols: c,
+	}
+	dst.slice(0, h, c-k-l, c).Copy(a.Slice(0, h, c-k-l, c))
+	if r < k+l {
+		b := Dense{
+			mat:     gsvd.b,
+			capRows: gsvd.p,
+			capCols: c,
+		}
+		dst.slice(r, k+l, c+r-k-l, c).Copy(b.Slice(r-k, l, c+r-k-l, c))
+	}
+}
+
+// SigmaATo extracts the matrix Σ₁ from the singular value decomposition, storing
+// the result into dst. Σ₁ is size r×(k+l).
+//
+// If dst is empty, SigmaATo will resize dst to be r×(k+l). When dst is
+// non-empty, SigmaATo will panic if dst is not r×(k+l). SigmaATo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) SigmaATo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	k := gsvd.k
+	l := gsvd.l
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, k+l)
+	} else {
+		r2, c := dst.Dims()
+		if r2 != r || c != k+l {
+			panic(ErrShape)
+		}
+		dst.Zero()
+	}
+	for i := 0; i < k; i++ {
+		dst.set(i, i, 1)
+	}
+	for i := k; i < min(r, k+l); i++ {
+		dst.set(i, i, gsvd.s1[i])
+	}
+}
+
+// SigmaBTo extracts the matrix Σ₂ from the singular value decomposition, storing
+// the result into dst. Σ₂ is size p×(k+l).
+//
+// If dst is empty, SigmaBTo will resize dst to be p×(k+l). When dst is
+// non-empty, SigmaBTo will panic if dst is not p×(k+l). SigmaBTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) SigmaBTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r := gsvd.r
+	p := gsvd.p
+	k := gsvd.k
+	l := gsvd.l
+	if dst.IsEmpty() {
+		dst.ReuseAs(p, k+l)
+	} else {
+		r, c := dst.Dims()
+		if r != p || c != k+l {
+			panic(ErrShape)
+		}
+		dst.Zero()
+	}
+	for i := 0; i < min(l, r-k); i++ {
+		dst.set(i, i+k, gsvd.s2[k+i])
+	}
+	for i := r - k; i < l; i++ {
+		dst.set(i, i+k, 1)
+	}
+}
+
+// UTo extracts the matrix U from the singular value decomposition, storing
+// the result into dst. U is size r×r.
+//
+// If dst is empty, UTo will resize dst to be r×r. When dst is
+// non-empty, UTo will panic if dst is not r×r. UTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) UTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	if gsvd.kind&GSVDU == 0 {
+		panic("mat: improper GSVD kind")
+	}
+	r := gsvd.u.Rows
+	c := gsvd.u.Cols
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	tmp := &Dense{
+		mat:     gsvd.u,
+		capRows: r,
+		capCols: c,
+	}
+	dst.Copy(tmp)
+}
+
+// VTo extracts the matrix V from the singular value decomposition, storing
+// the result into dst. V is size p×p.
+//
+// If dst is empty, VTo will resize dst to be p×p. When dst is
+// non-empty, VTo will panic if dst is not p×p. VTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) VTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	if gsvd.kind&GSVDV == 0 {
+		panic("mat: improper GSVD kind")
+	}
+	r := gsvd.v.Rows
+	c := gsvd.v.Cols
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	tmp := &Dense{
+		mat:     gsvd.v,
+		capRows: r,
+		capCols: c,
+	}
+	dst.Copy(tmp)
+}
+
+// QTo extracts the matrix Q from the singular value decomposition, storing
+// the result into dst. Q is size c×c.
+//
+// If dst is empty, QTo will resize dst to be c×c. When dst is
+// non-empty, QTo will panic if dst is not c×c. QTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *GSVD) QTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	if gsvd.kind&GSVDQ == 0 {
+		panic("mat: improper GSVD kind")
+	}
+	r := gsvd.q.Rows
+	c := gsvd.q.Cols
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	tmp := &Dense{
+		mat:     gsvd.q,
+		capRows: r,
+		capCols: c,
+	}
+	dst.Copy(tmp)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/hogsvd.go b/vendor/gonum.org/v1/gonum/mat/hogsvd.go
new file mode 100644
index 00000000000..40a03315b94
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/hogsvd.go
@@ -0,0 +1,239 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"errors"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+// HOGSVD is a type for creating and using the Higher Order Generalized Singular Value
+// Decomposition (HOGSVD) of a set of matrices.
+//
+// The factorization is a linear transformation of the data sets from the given
+// variable×sample spaces to reduced and diagonalized "eigenvariable"×"eigensample"
+// spaces.
+type HOGSVD struct {
+	n int
+	v *Dense
+	b []Dense
+
+	err error
+}
+
+// succFact returns whether the receiver contains a successful factorization.
+func (gsvd *HOGSVD) succFact() bool {
+	return gsvd.n != 0
+}
+
+// Factorize computes the higher order generalized singular value decomposition (HOGSVD)
+// of the n input r_i×c column tall matrices in m. HOGSVD extends the GSVD case from 2 to n
+// input matrices.
+//
+//	M_0 = U_0 * Σ_0 * Vᵀ
+//	M_1 = U_1 * Σ_1 * Vᵀ
+//	.
+//	.
+//	.
+//	M_{n-1} = U_{n-1} * Σ_{n-1} * Vᵀ
+//
+// where U_i are r_i×c matrices of singular vectors, Σ_i are c×c matrices of singular values, and V
+// is a c×c matrix of singular vectors.
+//
+// Factorize returns whether the decomposition succeeded. If the decomposition
+// failed, routines that require a successful factorization will panic.
+func (gsvd *HOGSVD) Factorize(m ...Matrix) (ok bool) {
+	// Factorize performs the HOGSVD factorisation
+	// essentially as described by Ponnapalli et al.
+	// https://doi.org/10.1371/journal.pone.0028072
+
+	if len(m) < 2 {
+		panic("hogsvd: too few matrices")
+	}
+	gsvd.n = 0
+
+	r, c := m[0].Dims()
+	a := make([]Cholesky, len(m))
+	var ts SymDense
+	for i, d := range m {
+		rd, cd := d.Dims()
+		if rd < cd {
+			gsvd.err = ErrShape
+			return false
+		}
+		if rd > r {
+			r = rd
+		}
+		if cd != c {
+			panic(ErrShape)
+		}
+		ts.Reset()
+		ts.SymOuterK(1, d.T())
+		ok = a[i].Factorize(&ts)
+		if !ok {
+			gsvd.err = errors.New("hogsvd: cholesky decomposition failed")
+			return false
+		}
+	}
+
+	s := getDenseWorkspace(c, c, true)
+	defer putDenseWorkspace(s)
+	sij := getDenseWorkspace(c, c, false)
+	defer putDenseWorkspace(sij)
+	for i, ai := range a {
+		for _, aj := range a[i+1:] {
+			gsvd.err = ai.SolveCholTo(sij, &aj)
+			if gsvd.err != nil {
+				return false
+			}
+			s.Add(s, sij)
+
+			gsvd.err = aj.SolveCholTo(sij, &ai)
+			if gsvd.err != nil {
+				return false
+			}
+			s.Add(s, sij)
+		}
+	}
+	s.Scale(1/float64(len(m)*(len(m)-1)), s)
+
+	var eig Eigen
+	ok = eig.Factorize(s.T(), EigenRight)
+	if !ok {
+		gsvd.err = errors.New("hogsvd: eigen decomposition failed")
+		return false
+	}
+	var vc CDense
+	eig.VectorsTo(&vc)
+	// vc is guaranteed to have real eigenvalues.
+	rc, cc := vc.Dims()
+	v := NewDense(rc, cc, nil)
+	for i := 0; i < rc; i++ {
+		for j := 0; j < cc; j++ {
+			a := vc.At(i, j)
+			v.set(i, j, real(a))
+		}
+	}
+	// Rescale the columns of v by their Frobenius norms.
+	// Work done in cv is reflected in v.
+	var cv VecDense
+	for j := 0; j < c; j++ {
+		cv.ColViewOf(v, j)
+		cv.ScaleVec(1/blas64.Nrm2(cv.mat), &cv)
+	}
+
+	b := make([]Dense, len(m))
+	biT := getDenseWorkspace(c, r, false)
+	defer putDenseWorkspace(biT)
+	for i, d := range m {
+		// All calls to reset will leave an emptied
+		// matrix with capacity to store the result
+		// without additional allocation.
+		biT.Reset()
+		gsvd.err = biT.Solve(v, d.T())
+		if gsvd.err != nil {
+			return false
+		}
+		b[i].CloneFrom(biT.T())
+	}
+
+	gsvd.n = len(m)
+	gsvd.v = v
+	gsvd.b = b
+	return true
+}
+
+// Err returns the reason for a factorization failure.
+func (gsvd *HOGSVD) Err() error {
+	return gsvd.err
+}
+
+// Len returns the number of matrices that have been factorized. If Len returns
+// zero, the factorization was not successful.
+func (gsvd *HOGSVD) Len() int {
+	return gsvd.n
+}
+
+// UTo extracts the matrix U_n from the singular value decomposition, storing
+// the result in-place into dst. U_n is size r×c.
+//
+// If dst is empty, UTo will resize dst to be r×c. When dst is
+// non-empty, UTo will panic if dst is not r×c. UTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *HOGSVD) UTo(dst *Dense, n int) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	if n < 0 || gsvd.n <= n {
+		panic("hogsvd: invalid index")
+	}
+	r, c := gsvd.b[n].Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+	dst.Copy(&gsvd.b[n])
+	var v VecDense
+	for j, f := range gsvd.Values(nil, n) {
+		v.ColViewOf(dst, j)
+		v.ScaleVec(1/f, &v)
+	}
+}
+
+// Values returns the nth set of singular values of the factorized system.
+// If the input slice is non-nil, the values will be stored in-place into the slice.
+// In this case, the slice must have length c, and Values will panic with
+// ErrSliceLengthMismatch otherwise. If the input slice is nil,
+// a new slice of the appropriate length will be allocated and returned.
+//
+// Values will panic if the receiver does not contain a successful factorization.
+func (gsvd *HOGSVD) Values(s []float64, n int) []float64 {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	if n < 0 || gsvd.n <= n {
+		panic("hogsvd: invalid index")
+	}
+
+	_, c := gsvd.b[n].Dims()
+	if s == nil {
+		s = make([]float64, c)
+	} else if len(s) != c {
+		panic(ErrSliceLengthMismatch)
+	}
+	var v VecDense
+	for j := 0; j < c; j++ {
+		v.ColViewOf(&gsvd.b[n], j)
+		s[j] = blas64.Nrm2(v.mat)
+	}
+	return s
+}
+
+// VTo extracts the matrix V from the singular value decomposition, storing
+// the result in-place into dst. V is size c×c.
+//
+// If dst is empty, VTo will resize dst to be c×c. When dst is
+// non-empty, VTo will panic if dst is not c×c. VTo will also
+// panic if the receiver does not contain a successful factorization.
+func (gsvd *HOGSVD) VTo(dst *Dense) {
+	if !gsvd.succFact() {
+		panic(badFact)
+	}
+	r, c := gsvd.v.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+	dst.Copy(gsvd.v)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/index_bound_checks.go b/vendor/gonum.org/v1/gonum/mat/index_bound_checks.go
new file mode 100644
index 00000000000..59a9e04788a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/index_bound_checks.go
@@ -0,0 +1,398 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file must be kept in sync with index_no_bound_checks.go.
+
+//go:build bounds
+// +build bounds
+
+package mat
+
+// At returns the element at row i, column j.
+func (m *Dense) At(i, j int) float64 {
+	return m.at(i, j)
+}
+
+func (m *Dense) at(i, j int) float64 {
+	if uint(i) >= uint(m.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(m.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	return m.mat.Data[i*m.mat.Stride+j]
+}
+
+// Set sets the element at row i, column j to the value v.
+func (m *Dense) Set(i, j int, v float64) {
+	m.set(i, j, v)
+}
+
+func (m *Dense) set(i, j int, v float64) {
+	if uint(i) >= uint(m.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(m.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	m.mat.Data[i*m.mat.Stride+j] = v
+}
+
+// At returns the element at row i, column j.
+func (m *CDense) At(i, j int) complex128 {
+	return m.at(i, j)
+}
+
+func (m *CDense) at(i, j int) complex128 {
+	if uint(i) >= uint(m.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(m.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	return m.mat.Data[i*m.mat.Stride+j]
+}
+
+// Set sets the element at row i, column j to the value v.
+func (m *CDense) Set(i, j int, v complex128) {
+	m.set(i, j, v)
+}
+
+func (m *CDense) set(i, j int, v complex128) {
+	if uint(i) >= uint(m.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(m.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	m.mat.Data[i*m.mat.Stride+j] = v
+}
+
+// At returns the element at row i.
+// It panics if i is out of bounds or if j is not zero.
+func (v *VecDense) At(i, j int) float64 {
+	if j != 0 {
+		panic(ErrColAccess)
+	}
+	return v.at(i)
+}
+
+// AtVec returns the element at row i.
+// It panics if i is out of bounds.
+func (v *VecDense) AtVec(i int) float64 {
+	return v.at(i)
+}
+
+func (v *VecDense) at(i int) float64 {
+	if uint(i) >= uint(v.mat.N) {
+		panic(ErrRowAccess)
+	}
+	return v.mat.Data[i*v.mat.Inc]
+}
+
+// SetVec sets the element at row i to the value val.
+// It panics if i is out of bounds.
+func (v *VecDense) SetVec(i int, val float64) {
+	v.setVec(i, val)
+}
+
+func (v *VecDense) setVec(i int, val float64) {
+	if uint(i) >= uint(v.mat.N) {
+		panic(ErrVectorAccess)
+	}
+	v.mat.Data[i*v.mat.Inc] = val
+}
+
+// At returns the element at row i and column j.
+func (t *SymDense) At(i, j int) float64 {
+	return t.at(i, j)
+}
+
+func (t *SymDense) at(i, j int) float64 {
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	if i > j {
+		i, j = j, i
+	}
+	return t.mat.Data[i*t.mat.Stride+j]
+}
+
+// SetSym sets the elements at (i,j) and (j,i) to the value v.
+func (t *SymDense) SetSym(i, j int, v float64) {
+	t.set(i, j, v)
+}
+
+func (t *SymDense) set(i, j int, v float64) {
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	if i > j {
+		i, j = j, i
+	}
+	t.mat.Data[i*t.mat.Stride+j] = v
+}
+
+// At returns the element at row i, column j.
+func (t *TriDense) At(i, j int) float64 {
+	return t.at(i, j)
+}
+
+func (t *TriDense) at(i, j int) float64 {
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	isUpper := t.isUpper()
+	if (isUpper && i > j) || (!isUpper && i < j) {
+		return 0
+	}
+	return t.mat.Data[i*t.mat.Stride+j]
+}
+
+// SetTri sets the element of the triangular matrix at row i, column j to the value v.
+// It panics if the location is outside the appropriate half of the matrix.
+func (t *TriDense) SetTri(i, j int, v float64) {
+	t.set(i, j, v)
+}
+
+func (t *TriDense) set(i, j int, v float64) {
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	isUpper := t.isUpper()
+	if (isUpper && i > j) || (!isUpper && i < j) {
+		panic(ErrTriangleSet)
+	}
+	t.mat.Data[i*t.mat.Stride+j] = v
+}
+
+// At returns the element at row i, column j.
+func (b *BandDense) At(i, j int) float64 {
+	return b.at(i, j)
+}
+
+func (b *BandDense) at(i, j int) float64 {
+	if uint(i) >= uint(b.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(b.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	pj := j + b.mat.KL - i
+	if pj < 0 || b.mat.KL+b.mat.KU+1 <= pj {
+		return 0
+	}
+	return b.mat.Data[i*b.mat.Stride+pj]
+}
+
+// SetBand sets the element at row i, column j to the value v.
+// It panics if the location is outside the appropriate region of the matrix.
+func (b *BandDense) SetBand(i, j int, v float64) {
+	b.set(i, j, v)
+}
+
+func (b *BandDense) set(i, j int, v float64) {
+	if uint(i) >= uint(b.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(b.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	pj := j + b.mat.KL - i
+	if pj < 0 || b.mat.KL+b.mat.KU+1 <= pj {
+		panic(ErrBandSet)
+	}
+	b.mat.Data[i*b.mat.Stride+pj] = v
+}
+
+// At returns the element at row i, column j.
+func (s *SymBandDense) At(i, j int) float64 {
+	return s.at(i, j)
+}
+
+func (s *SymBandDense) at(i, j int) float64 {
+	if uint(i) >= uint(s.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(s.mat.N) {
+		panic(ErrColAccess)
+	}
+	if i > j {
+		i, j = j, i
+	}
+	pj := j - i
+	if s.mat.K+1 <= pj {
+		return 0
+	}
+	return s.mat.Data[i*s.mat.Stride+pj]
+}
+
+// SetSymBand sets the element at row i, column j to the value v.
+// It panics if the location is outside the appropriate region of the matrix.
+func (s *SymBandDense) SetSymBand(i, j int, v float64) {
+	s.set(i, j, v)
+}
+
+func (s *SymBandDense) set(i, j int, v float64) {
+	if uint(i) >= uint(s.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(s.mat.N) {
+		panic(ErrColAccess)
+	}
+	if i > j {
+		i, j = j, i
+	}
+	pj := j - i
+	if s.mat.K+1 <= pj {
+		panic(ErrBandSet)
+	}
+	s.mat.Data[i*s.mat.Stride+pj] = v
+}
+
+func (t *TriBandDense) At(i, j int) float64 {
+	return t.at(i, j)
+}
+
+func (t *TriBandDense) at(i, j int) float64 {
+	// TODO(btracey): Support Diag field, see #692.
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	isUpper := t.isUpper()
+	if (isUpper && i > j) || (!isUpper && i < j) {
+		return 0
+	}
+	kl, ku := t.mat.K, 0
+	if isUpper {
+		kl, ku = 0, t.mat.K
+	}
+	pj := j + kl - i
+	if pj < 0 || kl+ku+1 <= pj {
+		return 0
+	}
+	return t.mat.Data[i*t.mat.Stride+pj]
+}
+
+func (t *TriBandDense) SetTriBand(i, j int, v float64) {
+	t.setTriBand(i, j, v)
+}
+
+func (t *TriBandDense) setTriBand(i, j int, v float64) {
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	isUpper := t.isUpper()
+	if (isUpper && i > j) || (!isUpper && i < j) {
+		panic(ErrTriangleSet)
+	}
+	kl, ku := t.mat.K, 0
+	if isUpper {
+		kl, ku = 0, t.mat.K
+	}
+	pj := j + kl - i
+	if pj < 0 || kl+ku+1 <= pj {
+		panic(ErrBandSet)
+	}
+	// TODO(btracey): Support Diag field, see #692.
+	t.mat.Data[i*t.mat.Stride+pj] = v
+}
+
+// At returns the element at row i, column j.
+func (d *DiagDense) At(i, j int) float64 {
+	return d.at(i, j)
+}
+
+func (d *DiagDense) at(i, j int) float64 {
+	if uint(i) >= uint(d.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(d.mat.N) {
+		panic(ErrColAccess)
+	}
+	if i != j {
+		return 0
+	}
+	return d.mat.Data[i*d.mat.Inc]
+}
+
+// SetDiag sets the element at row i, column i to the value v.
+// It panics if the location is outside the appropriate region of the matrix.
+func (d *DiagDense) SetDiag(i int, v float64) {
+	d.setDiag(i, v)
+}
+
+func (d *DiagDense) setDiag(i int, v float64) {
+	if uint(i) >= uint(d.mat.N) {
+		panic(ErrRowAccess)
+	}
+	d.mat.Data[i*d.mat.Inc] = v
+}
+
+// At returns the element at row i, column j.
+func (a *Tridiag) At(i, j int) float64 {
+	return a.at(i, j)
+}
+
+func (a *Tridiag) at(i, j int) float64 {
+	if uint(i) >= uint(a.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(a.mat.N) {
+		panic(ErrColAccess)
+	}
+	switch i - j {
+	case -1:
+		return a.mat.DU[i]
+	case 0:
+		return a.mat.D[i]
+	case 1:
+		return a.mat.DL[j]
+	default:
+		return 0
+	}
+}
+
+// SetBand sets the element at row i, column j to the value v.
+// It panics if the location is outside the appropriate region of the matrix.
+func (a *Tridiag) SetBand(i, j int, v float64) {
+	a.set(i, j, v)
+}
+
+func (a *Tridiag) set(i, j int, v float64) {
+	if uint(i) >= uint(a.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(a.mat.N) {
+		panic(ErrColAccess)
+	}
+	switch i - j {
+	case -1:
+		a.mat.DU[i] = v
+	case 0:
+		a.mat.D[i] = v
+	case 1:
+		a.mat.DL[j] = v
+	default:
+		panic(ErrBandSet)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/index_no_bound_checks.go b/vendor/gonum.org/v1/gonum/mat/index_no_bound_checks.go
new file mode 100644
index 00000000000..335128806fe
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/index_no_bound_checks.go
@@ -0,0 +1,400 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file must be kept in sync with index_bound_checks.go.
+
+//go:build !bounds
+// +build !bounds
+
+package mat
+
+// At returns the element at row i, column j.
+func (m *Dense) At(i, j int) float64 {
+	if uint(i) >= uint(m.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(m.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	return m.at(i, j)
+}
+
+func (m *Dense) at(i, j int) float64 {
+	return m.mat.Data[i*m.mat.Stride+j]
+}
+
+// Set sets the element at row i, column j to the value v.
+func (m *Dense) Set(i, j int, v float64) {
+	if uint(i) >= uint(m.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(m.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	m.set(i, j, v)
+}
+
+func (m *Dense) set(i, j int, v float64) {
+	m.mat.Data[i*m.mat.Stride+j] = v
+}
+
+// At returns the element at row i, column j.
+func (m *CDense) At(i, j int) complex128 {
+	if uint(i) >= uint(m.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(m.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	return m.at(i, j)
+}
+
+func (m *CDense) at(i, j int) complex128 {
+	return m.mat.Data[i*m.mat.Stride+j]
+}
+
+// Set sets the element at row i, column j to the value v.
+func (m *CDense) Set(i, j int, v complex128) {
+	if uint(i) >= uint(m.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(m.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	m.set(i, j, v)
+}
+
+func (m *CDense) set(i, j int, v complex128) {
+	m.mat.Data[i*m.mat.Stride+j] = v
+}
+
+// At returns the element at row i.
+// It panics if i is out of bounds or if j is not zero.
+func (v *VecDense) At(i, j int) float64 {
+	if uint(i) >= uint(v.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if j != 0 {
+		panic(ErrColAccess)
+	}
+	return v.at(i)
+}
+
+// AtVec returns the element at row i.
+// It panics if i is out of bounds.
+func (v *VecDense) AtVec(i int) float64 {
+	if uint(i) >= uint(v.mat.N) {
+		panic(ErrRowAccess)
+	}
+	return v.at(i)
+}
+
+func (v *VecDense) at(i int) float64 {
+	return v.mat.Data[i*v.mat.Inc]
+}
+
+// SetVec sets the element at row i to the value val.
+// It panics if i is out of bounds.
+func (v *VecDense) SetVec(i int, val float64) {
+	if uint(i) >= uint(v.mat.N) {
+		panic(ErrVectorAccess)
+	}
+	v.setVec(i, val)
+}
+
+func (v *VecDense) setVec(i int, val float64) {
+	v.mat.Data[i*v.mat.Inc] = val
+}
+
+// At returns the element at row i and column j.
+func (s *SymDense) At(i, j int) float64 {
+	if uint(i) >= uint(s.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(s.mat.N) {
+		panic(ErrColAccess)
+	}
+	return s.at(i, j)
+}
+
+func (s *SymDense) at(i, j int) float64 {
+	if i > j {
+		i, j = j, i
+	}
+	return s.mat.Data[i*s.mat.Stride+j]
+}
+
+// SetSym sets the elements at (i,j) and (j,i) to the value v.
+func (s *SymDense) SetSym(i, j int, v float64) {
+	if uint(i) >= uint(s.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(s.mat.N) {
+		panic(ErrColAccess)
+	}
+	s.set(i, j, v)
+}
+
+func (s *SymDense) set(i, j int, v float64) {
+	if i > j {
+		i, j = j, i
+	}
+	s.mat.Data[i*s.mat.Stride+j] = v
+}
+
+// At returns the element at row i, column j.
+func (t *TriDense) At(i, j int) float64 {
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	return t.at(i, j)
+}
+
+func (t *TriDense) at(i, j int) float64 {
+	isUpper := t.triKind()
+	if (isUpper && i > j) || (!isUpper && i < j) {
+		return 0
+	}
+	return t.mat.Data[i*t.mat.Stride+j]
+}
+
+// SetTri sets the element at row i, column j to the value v.
+// It panics if the location is outside the appropriate half of the matrix.
+func (t *TriDense) SetTri(i, j int, v float64) {
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	isUpper := t.isUpper()
+	if (isUpper && i > j) || (!isUpper && i < j) {
+		panic(ErrTriangleSet)
+	}
+	t.set(i, j, v)
+}
+
+func (t *TriDense) set(i, j int, v float64) {
+	t.mat.Data[i*t.mat.Stride+j] = v
+}
+
+// At returns the element at row i, column j.
+func (b *BandDense) At(i, j int) float64 {
+	if uint(i) >= uint(b.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(b.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	return b.at(i, j)
+}
+
+func (b *BandDense) at(i, j int) float64 {
+	pj := j + b.mat.KL - i
+	if pj < 0 || b.mat.KL+b.mat.KU+1 <= pj {
+		return 0
+	}
+	return b.mat.Data[i*b.mat.Stride+pj]
+}
+
+// SetBand sets the element at row i, column j to the value v.
+// It panics if the location is outside the appropriate region of the matrix.
+func (b *BandDense) SetBand(i, j int, v float64) {
+	if uint(i) >= uint(b.mat.Rows) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(b.mat.Cols) {
+		panic(ErrColAccess)
+	}
+	pj := j + b.mat.KL - i
+	if pj < 0 || b.mat.KL+b.mat.KU+1 <= pj {
+		panic(ErrBandSet)
+	}
+	b.set(i, j, v)
+}
+
+func (b *BandDense) set(i, j int, v float64) {
+	pj := j + b.mat.KL - i
+	b.mat.Data[i*b.mat.Stride+pj] = v
+}
+
+// At returns the element at row i, column j.
+func (s *SymBandDense) At(i, j int) float64 {
+	if uint(i) >= uint(s.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(s.mat.N) {
+		panic(ErrColAccess)
+	}
+	return s.at(i, j)
+}
+
+func (s *SymBandDense) at(i, j int) float64 {
+	if i > j {
+		i, j = j, i
+	}
+	pj := j - i
+	if s.mat.K+1 <= pj {
+		return 0
+	}
+	return s.mat.Data[i*s.mat.Stride+pj]
+}
+
+// SetSymBand sets the element at row i, column j to the value v.
+// It panics if the location is outside the appropriate region of the matrix.
+func (s *SymBandDense) SetSymBand(i, j int, v float64) {
+	if uint(i) >= uint(s.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(s.mat.N) {
+		panic(ErrColAccess)
+	}
+	s.set(i, j, v)
+}
+
+func (s *SymBandDense) set(i, j int, v float64) {
+	if i > j {
+		i, j = j, i
+	}
+	pj := j - i
+	if s.mat.K+1 <= pj {
+		panic(ErrBandSet)
+	}
+	s.mat.Data[i*s.mat.Stride+pj] = v
+}
+
+func (t *TriBandDense) At(i, j int) float64 {
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	return t.at(i, j)
+}
+
+func (t *TriBandDense) at(i, j int) float64 {
+	// TODO(btracey): Support Diag field, see #692.
+	isUpper := t.isUpper()
+	if (isUpper && i > j) || (!isUpper && i < j) {
+		return 0
+	}
+	kl := t.mat.K
+	ku := 0
+	if isUpper {
+		ku = t.mat.K
+		kl = 0
+	}
+	pj := j + kl - i
+	if pj < 0 || kl+ku+1 <= pj {
+		return 0
+	}
+	return t.mat.Data[i*t.mat.Stride+pj]
+}
+
+func (t *TriBandDense) SetTriBand(i, j int, v float64) {
+	if uint(i) >= uint(t.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(t.mat.N) {
+		panic(ErrColAccess)
+	}
+	isUpper := t.isUpper()
+	if (isUpper && i > j) || (!isUpper && i < j) {
+		panic(ErrTriangleSet)
+	}
+	kl, ku := t.mat.K, 0
+	if isUpper {
+		kl, ku = 0, t.mat.K
+	}
+	pj := j + kl - i
+	if pj < 0 || kl+ku+1 <= pj {
+		panic(ErrBandSet)
+	}
+	// TODO(btracey): Support Diag field, see #692.
+	t.mat.Data[i*t.mat.Stride+pj] = v
+}
+
+// At returns the element at row i, column j.
+func (d *DiagDense) At(i, j int) float64 {
+	if uint(i) >= uint(d.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(d.mat.N) {
+		panic(ErrColAccess)
+	}
+	return d.at(i, j)
+}
+
+func (d *DiagDense) at(i, j int) float64 {
+	if i != j {
+		return 0
+	}
+	return d.mat.Data[i*d.mat.Inc]
+}
+
+// SetDiag sets the element at row i, column i to the value v.
+// It panics if the location is outside the appropriate region of the matrix.
+func (d *DiagDense) SetDiag(i int, v float64) {
+	if uint(i) >= uint(d.mat.N) {
+		panic(ErrRowAccess)
+	}
+	d.setDiag(i, v)
+}
+
+func (d *DiagDense) setDiag(i int, v float64) {
+	d.mat.Data[i*d.mat.Inc] = v
+}
+
+// At returns the element at row i, column j.
+func (a *Tridiag) At(i, j int) float64 {
+	if uint(i) >= uint(a.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(a.mat.N) {
+		panic(ErrColAccess)
+	}
+	return a.at(i, j)
+}
+
+func (a *Tridiag) at(i, j int) float64 {
+	switch i - j {
+	case -1:
+		return a.mat.DU[i]
+	case 0:
+		return a.mat.D[i]
+	case 1:
+		return a.mat.DL[j]
+	default:
+		return 0
+	}
+}
+
+// SetBand sets the element at row i, column j to the value v.
+// It panics if the location is outside the appropriate region of the matrix.
+func (a *Tridiag) SetBand(i, j int, v float64) {
+	if uint(i) >= uint(a.mat.N) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(a.mat.N) {
+		panic(ErrColAccess)
+	}
+	a.set(i, j, v)
+}
+
+func (a *Tridiag) set(i, j int, v float64) {
+	switch i - j {
+	case -1:
+		a.mat.DU[i] = v
+	case 0:
+		a.mat.D[i] = v
+	case 1:
+		a.mat.DL[j] = v
+	default:
+		panic(ErrBandSet)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/inner.go b/vendor/gonum.org/v1/gonum/mat/inner.go
new file mode 100644
index 00000000000..4f94a96a6b8
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/inner.go
@@ -0,0 +1,126 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// Inner computes the generalized inner product
+//
+//	xᵀ A y
+//
+// between the vectors x and y with matrix A, where x and y are treated as
+// column vectors.
+//
+// This is only a true inner product if A is symmetric positive definite, though
+// the operation works for any matrix A.
+//
+// Inner panics if x.Len != m or y.Len != n when A is an m x n matrix.
+func Inner(x Vector, a Matrix, y Vector) float64 {
+	m, n := a.Dims()
+	if x.Len() != m {
+		panic(ErrShape)
+	}
+	if y.Len() != n {
+		panic(ErrShape)
+	}
+	if m == 0 || n == 0 {
+		return 0
+	}
+
+	var sum float64
+
+	switch a := a.(type) {
+	case RawSymmetricer:
+		amat := a.RawSymmetric()
+		if amat.Uplo != blas.Upper {
+			// Panic as a string not a mat.Error.
+			panic(badSymTriangle)
+		}
+		var xmat, ymat blas64.Vector
+		if xrv, ok := x.(RawVectorer); ok {
+			xmat = xrv.RawVector()
+		} else {
+			break
+		}
+		if yrv, ok := y.(RawVectorer); ok {
+			ymat = yrv.RawVector()
+		} else {
+			break
+		}
+		for i := 0; i < x.Len(); i++ {
+			xi := x.AtVec(i)
+			if xi != 0 {
+				if ymat.Inc == 1 {
+					sum += xi * f64.DotUnitary(
+						amat.Data[i*amat.Stride+i:i*amat.Stride+n],
+						ymat.Data[i:],
+					)
+				} else {
+					sum += xi * f64.DotInc(
+						amat.Data[i*amat.Stride+i:i*amat.Stride+n],
+						ymat.Data[i*ymat.Inc:], uintptr(n-i),
+						1, uintptr(ymat.Inc),
+						0, 0,
+					)
+				}
+			}
+			yi := y.AtVec(i)
+			if i != n-1 && yi != 0 {
+				if xmat.Inc == 1 {
+					sum += yi * f64.DotUnitary(
+						amat.Data[i*amat.Stride+i+1:i*amat.Stride+n],
+						xmat.Data[i+1:],
+					)
+				} else {
+					sum += yi * f64.DotInc(
+						amat.Data[i*amat.Stride+i+1:i*amat.Stride+n],
+						xmat.Data[(i+1)*xmat.Inc:], uintptr(n-i-1),
+						1, uintptr(xmat.Inc),
+						0, 0,
+					)
+				}
+			}
+		}
+		return sum
+	case RawMatrixer:
+		amat := a.RawMatrix()
+		var ymat blas64.Vector
+		if yrv, ok := y.(RawVectorer); ok {
+			ymat = yrv.RawVector()
+		} else {
+			break
+		}
+		for i := 0; i < x.Len(); i++ {
+			xi := x.AtVec(i)
+			if xi != 0 {
+				if ymat.Inc == 1 {
+					sum += xi * f64.DotUnitary(
+						amat.Data[i*amat.Stride:i*amat.Stride+n],
+						ymat.Data,
+					)
+				} else {
+					sum += xi * f64.DotInc(
+						amat.Data[i*amat.Stride:i*amat.Stride+n],
+						ymat.Data, uintptr(n),
+						1, uintptr(ymat.Inc),
+						0, 0,
+					)
+				}
+			}
+		}
+		return sum
+	}
+	for i := 0; i < x.Len(); i++ {
+		xi := x.AtVec(i)
+		for j := 0; j < y.Len(); j++ {
+			sum += xi * a.At(i, j) * y.AtVec(j)
+		}
+	}
+	return sum
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/io.go b/vendor/gonum.org/v1/gonum/mat/io.go
new file mode 100644
index 00000000000..0641fa28b69
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/io.go
@@ -0,0 +1,495 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"math"
+)
+
+// version is the current on-disk codec version.
+const version uint32 = 0x1
+
+// maxLen is the biggest slice/array len one can create on a 32/64b platform.
+const maxLen = int64(int(^uint(0) >> 1))
+
+var (
+	headerSize  = binary.Size(storage{})
+	sizeFloat64 = binary.Size(float64(0))
+
+	errWrongType = errors.New("mat: wrong data type")
+
+	errTooBig    = errors.New("mat: resulting data slice too big")
+	errTooSmall  = errors.New("mat: input slice too small")
+	errBadBuffer = errors.New("mat: data buffer size mismatch")
+	errBadSize   = errors.New("mat: invalid dimension")
+)
+
+// Type encoding scheme:
+//
+// Type 		Form 	Packing 	Uplo 		Unit 		Rows 	Columns kU 	kL
+// uint8 		[GST] 	uint8 [BPF] 	uint8 [AUL] 	bool 		int64 	int64 	int64 	int64
+// General 		'G' 	'F' 		'A' 		false 		r 	c 	0 	0
+// Band 		'G' 	'B' 		'A' 		false 		r 	c 	kU 	kL
+// Symmetric 		'S' 	'F' 		ul 		false 		n 	n 	0 	0
+// SymmetricBand 	'S' 	'B' 		ul 		false 		n 	n 	k 	k
+// SymmetricPacked 	'S' 	'P' 		ul 		false 		n 	n 	0 	0
+// Triangular 		'T' 	'F' 		ul 		Diag==Unit 	n 	n 	0 	0
+// TriangularBand 	'T' 	'B' 		ul 		Diag==Unit 	n 	n 	k 	k
+// TriangularPacked 	'T' 	'P' 		ul	 	Diag==Unit 	n 	n 	0 	0
+//
+// G - general, S - symmetric, T - triangular
+// F - full, B - band, P - packed
+// A - all, U - upper, L - lower
+
+// MarshalBinary encodes the receiver into a binary form and returns the result.
+//
+// Dense is little-endian encoded as follows:
+//
+//	 0 -  3  Version = 1          (uint32)
+//	 4       'G'                  (byte)
+//	 5       'F'                  (byte)
+//	 6       'A'                  (byte)
+//	 7       0                    (byte)
+//	 8 - 15  number of rows       (int64)
+//	16 - 23  number of columns    (int64)
+//	24 - 31  0                    (int64)
+//	32 - 39  0                    (int64)
+//	40 - ..  matrix data elements (float64)
+//	         [0,0] [0,1] ... [0,ncols-1]
+//	         [1,0] [1,1] ... [1,ncols-1]
+//	         ...
+//	         [nrows-1,0] ... [nrows-1,ncols-1]
+func (m Dense) MarshalBinary() ([]byte, error) {
+	bufLen := int64(headerSize) + int64(m.mat.Rows)*int64(m.mat.Cols)*int64(sizeFloat64)
+	if bufLen <= 0 {
+		// bufLen is too big and has wrapped around.
+		return nil, errTooBig
+	}
+
+	header := storage{
+		Form: 'G', Packing: 'F', Uplo: 'A',
+		Rows: int64(m.mat.Rows), Cols: int64(m.mat.Cols),
+		Version: version,
+	}
+	buf := make([]byte, bufLen)
+	n, err := header.marshalBinaryTo(bytes.NewBuffer(buf[:0]))
+	if err != nil {
+		return buf[:n], err
+	}
+
+	p := headerSize
+	r, c := m.Dims()
+	for i := 0; i < r; i++ {
+		for j := 0; j < c; j++ {
+			binary.LittleEndian.PutUint64(buf[p:p+sizeFloat64], math.Float64bits(m.at(i, j)))
+			p += sizeFloat64
+		}
+	}
+
+	return buf, nil
+}
+
+// MarshalBinaryTo encodes the receiver into a binary form and writes it into w.
+// MarshalBinaryTo returns the number of bytes written into w and an error, if any.
+//
+// See MarshalBinary for the on-disk layout.
+func (m Dense) MarshalBinaryTo(w io.Writer) (int, error) {
+	header := storage{
+		Form: 'G', Packing: 'F', Uplo: 'A',
+		Rows: int64(m.mat.Rows), Cols: int64(m.mat.Cols),
+		Version: version,
+	}
+	n, err := header.marshalBinaryTo(w)
+	if err != nil {
+		return n, err
+	}
+
+	r, c := m.Dims()
+	var b [8]byte
+	for i := 0; i < r; i++ {
+		for j := 0; j < c; j++ {
+			binary.LittleEndian.PutUint64(b[:], math.Float64bits(m.at(i, j)))
+			nn, err := w.Write(b[:])
+			n += nn
+			if err != nil {
+				return n, err
+			}
+		}
+	}
+
+	return n, nil
+}
+
+// UnmarshalBinary decodes the binary form into the receiver.
+// It panics if the receiver is a non-empty Dense matrix.
+//
+// See MarshalBinary for the on-disk layout.
+//
+// Limited checks on the validity of the binary input are performed:
+//   - an error is returned if the number of rows or columns is negative,
+//   - an error is returned if the resulting Dense matrix is too
+//     big for the current architecture (e.g. a 16GB matrix written by a
+//     64b application and read back from a 32b application.)
+//
+// UnmarshalBinary does not limit the size of the unmarshaled matrix, and so
+// it should not be used on untrusted data.
+func (m *Dense) UnmarshalBinary(data []byte) error {
+	if !m.IsEmpty() {
+		panic("mat: unmarshal into non-empty matrix")
+	}
+
+	if len(data) < headerSize {
+		return errTooSmall
+	}
+
+	var header storage
+	err := header.unmarshalBinary(data[:headerSize])
+	if err != nil {
+		return err
+	}
+	rows := header.Rows
+	cols := header.Cols
+	header.Version = 0
+	header.Rows = 0
+	header.Cols = 0
+	if (header != storage{Form: 'G', Packing: 'F', Uplo: 'A'}) {
+		return errWrongType
+	}
+	if rows < 0 || cols < 0 {
+		return errBadSize
+	}
+	size := rows * cols
+	if size == 0 {
+		return ErrZeroLength
+	}
+	if int(size) < 0 || size > maxLen {
+		return errTooBig
+	}
+	if len(data) != headerSize+int(rows*cols)*sizeFloat64 {
+		return errBadBuffer
+	}
+
+	p := headerSize
+	m.reuseAsNonZeroed(int(rows), int(cols))
+	for i := range m.mat.Data {
+		m.mat.Data[i] = math.Float64frombits(binary.LittleEndian.Uint64(data[p : p+sizeFloat64]))
+		p += sizeFloat64
+	}
+
+	return nil
+}
+
+// UnmarshalBinaryFrom decodes the binary form into the receiver and returns
+// the number of bytes read and an error if any.
+// It panics if the receiver is a non-empty Dense matrix.
+//
+// See MarshalBinary for the on-disk layout.
+//
+// Limited checks on the validity of the binary input are performed:
+//   - an error is returned if the number of rows or columns is negative,
+//   - an error is returned if the resulting Dense matrix is too
+//     big for the current architecture (e.g. a 16GB matrix written by a
+//     64b application and read back from a 32b application.)
+//
+// UnmarshalBinaryFrom does not limit the size of the unmarshaled matrix, and
+// so it should not be used on untrusted data.
+func (m *Dense) UnmarshalBinaryFrom(r io.Reader) (int, error) {
+	if !m.IsEmpty() {
+		panic("mat: unmarshal into non-empty matrix")
+	}
+
+	var header storage
+	n, err := header.unmarshalBinaryFrom(r)
+	if err != nil {
+		return n, err
+	}
+	rows := header.Rows
+	cols := header.Cols
+	header.Version = 0
+	header.Rows = 0
+	header.Cols = 0
+	if (header != storage{Form: 'G', Packing: 'F', Uplo: 'A'}) {
+		return n, errWrongType
+	}
+	if rows < 0 || cols < 0 {
+		return n, errBadSize
+	}
+	size := rows * cols
+	if size == 0 {
+		return n, ErrZeroLength
+	}
+	if int(size) < 0 || size > maxLen {
+		return n, errTooBig
+	}
+
+	m.reuseAsNonZeroed(int(rows), int(cols))
+	var b [8]byte
+	for i := range m.mat.Data {
+		nn, err := readFull(r, b[:])
+		n += nn
+		if err != nil {
+			if err == io.EOF {
+				return n, io.ErrUnexpectedEOF
+			}
+			return n, err
+		}
+		m.mat.Data[i] = math.Float64frombits(binary.LittleEndian.Uint64(b[:]))
+	}
+
+	return n, nil
+}
+
+// MarshalBinary encodes the receiver into a binary form and returns the result.
+//
+// VecDense is little-endian encoded as follows:
+//
+//	 0 -  3  Version = 1            (uint32)
+//	 4       'G'                    (byte)
+//	 5       'F'                    (byte)
+//	 6       'A'                    (byte)
+//	 7       0                      (byte)
+//	 8 - 15  number of elements     (int64)
+//	16 - 23  1                      (int64)
+//	24 - 31  0                      (int64)
+//	32 - 39  0                      (int64)
+//	40 - ..  vector's data elements (float64)
+func (v VecDense) MarshalBinary() ([]byte, error) {
+	bufLen := int64(headerSize) + int64(v.mat.N)*int64(sizeFloat64)
+	if bufLen <= 0 {
+		// bufLen is too big and has wrapped around.
+		return nil, errTooBig
+	}
+
+	header := storage{
+		Form: 'G', Packing: 'F', Uplo: 'A',
+		Rows: int64(v.mat.N), Cols: 1,
+		Version: version,
+	}
+	buf := make([]byte, bufLen)
+	n, err := header.marshalBinaryTo(bytes.NewBuffer(buf[:0]))
+	if err != nil {
+		return buf[:n], err
+	}
+
+	p := headerSize
+	for i := 0; i < v.mat.N; i++ {
+		binary.LittleEndian.PutUint64(buf[p:p+sizeFloat64], math.Float64bits(v.at(i)))
+		p += sizeFloat64
+	}
+
+	return buf, nil
+}
+
+// MarshalBinaryTo encodes the receiver into a binary form, writes it to w and
+// returns the number of bytes written and an error if any.
+//
+// See MarshalBinary for the on-disk format.
+func (v VecDense) MarshalBinaryTo(w io.Writer) (int, error) {
+	header := storage{
+		Form: 'G', Packing: 'F', Uplo: 'A',
+		Rows: int64(v.mat.N), Cols: 1,
+		Version: version,
+	}
+	n, err := header.marshalBinaryTo(w)
+	if err != nil {
+		return n, err
+	}
+
+	var buf [8]byte
+	for i := 0; i < v.mat.N; i++ {
+		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(v.at(i)))
+		nn, err := w.Write(buf[:])
+		n += nn
+		if err != nil {
+			return n, err
+		}
+	}
+
+	return n, nil
+}
+
+// UnmarshalBinary decodes the binary form into the receiver.
+// It panics if the receiver is a non-empty VecDense.
+//
+// See MarshalBinary for the on-disk layout.
+//
+// Limited checks on the validity of the binary input are performed:
+//   - ErrShape is returned if the number of columns is not one,
+//   - an error is returned if the number of rows is negative or if the
+//     resulting VecDense is too big for the current architecture (e.g. a
+//     16GB vector written by a 64b application and read back from a 32b one.)
+//
+// UnmarshalBinary does not limit the size of the unmarshaled vector, and so
+// it should not be used on untrusted data.
+func (v *VecDense) UnmarshalBinary(data []byte) error {
+	if !v.IsEmpty() {
+		panic("mat: unmarshal into non-empty vector")
+	}
+
+	if len(data) < headerSize {
+		return errTooSmall
+	}
+
+	var header storage
+	err := header.unmarshalBinary(data[:headerSize])
+	if err != nil {
+		return err
+	}
+	if header.Cols != 1 {
+		return ErrShape
+	}
+	n := header.Rows
+	header.Version = 0
+	header.Rows = 0
+	header.Cols = 0
+	if (header != storage{Form: 'G', Packing: 'F', Uplo: 'A'}) {
+		return errWrongType
+	}
+	if n == 0 {
+		return ErrZeroLength
+	}
+	if n < 0 {
+		return errBadSize
+	}
+	if int64(maxLen) < n {
+		return errTooBig
+	}
+	if len(data) != headerSize+int(n)*sizeFloat64 {
+		return errBadBuffer
+	}
+
+	p := headerSize
+	v.reuseAsNonZeroed(int(n))
+	for i := range v.mat.Data {
+		v.mat.Data[i] = math.Float64frombits(binary.LittleEndian.Uint64(data[p : p+sizeFloat64]))
+		p += sizeFloat64
+	}
+
+	return nil
+}
+
+// UnmarshalBinaryFrom decodes the binary form into the receiver, from the
+// io.Reader and returns the number of bytes read and an error if any.
+// It panics if the receiver is a non-empty VecDense.
+//
+// See MarshalBinary for the on-disk layout.
+// See UnmarshalBinary for the list of sanity checks performed on the input.
+func (v *VecDense) UnmarshalBinaryFrom(r io.Reader) (int, error) {
+	if !v.IsEmpty() {
+		panic("mat: unmarshal into non-empty vector")
+	}
+
+	var header storage
+	n, err := header.unmarshalBinaryFrom(r)
+	if err != nil {
+		return n, err
+	}
+	if header.Cols != 1 {
+		return n, ErrShape
+	}
+	l := header.Rows
+	header.Version = 0
+	header.Rows = 0
+	header.Cols = 0
+	if (header != storage{Form: 'G', Packing: 'F', Uplo: 'A'}) {
+		return n, errWrongType
+	}
+	if l == 0 {
+		return n, ErrZeroLength
+	}
+	if l < 0 {
+		return n, errBadSize
+	}
+	if int64(maxLen) < l {
+		return n, errTooBig
+	}
+
+	v.reuseAsNonZeroed(int(l))
+	var b [8]byte
+	for i := range v.mat.Data {
+		nn, err := readFull(r, b[:])
+		n += nn
+		if err != nil {
+			if err == io.EOF {
+				return n, io.ErrUnexpectedEOF
+			}
+			return n, err
+		}
+		v.mat.Data[i] = math.Float64frombits(binary.LittleEndian.Uint64(b[:]))
+	}
+
+	return n, nil
+}
+
+// storage is the internal representation of the storage format of a
+// serialised matrix.
+type storage struct {
+	Version uint32 // Keep this first.
+	Form    byte   // [GST]
+	Packing byte   // [BPF]
+	Uplo    byte   // [AUL]
+	Unit    bool
+	Rows    int64
+	Cols    int64
+	KU      int64
+	KL      int64
+}
+
+// TODO(kortschak): Consider replacing these with calls to direct
+// encoding/decoding of fields rather than to binary.Write/binary.Read.
+
+func (s storage) marshalBinaryTo(w io.Writer) (int, error) {
+	buf := bytes.NewBuffer(make([]byte, 0, headerSize))
+	err := binary.Write(buf, binary.LittleEndian, s)
+	if err != nil {
+		return 0, err
+	}
+	return w.Write(buf.Bytes())
+}
+
+func (s *storage) unmarshalBinary(buf []byte) error {
+	err := binary.Read(bytes.NewReader(buf), binary.LittleEndian, s)
+	if err != nil {
+		return err
+	}
+	if s.Version != version {
+		return fmt.Errorf("mat: incorrect version: %d", s.Version)
+	}
+	return nil
+}
+
+func (s *storage) unmarshalBinaryFrom(r io.Reader) (int, error) {
+	buf := make([]byte, headerSize)
+	n, err := readFull(r, buf)
+	if err != nil {
+		return n, err
+	}
+	return n, s.unmarshalBinary(buf[:n])
+}
+
+// readFull reads from r into buf until it has read len(buf).
+// It returns the number of bytes copied and an error if fewer bytes were read.
+// If an EOF happens after reading fewer than len(buf) bytes, io.ErrUnexpectedEOF is returned.
+func readFull(r io.Reader, buf []byte) (int, error) {
+	var n int
+	var err error
+	for n < len(buf) && err == nil {
+		var nn int
+		nn, err = r.Read(buf[n:])
+		n += nn
+	}
+	if n == len(buf) {
+		return n, nil
+	}
+	if err == io.EOF {
+		return n, io.ErrUnexpectedEOF
+	}
+	return n, err
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/lq.go b/vendor/gonum.org/v1/gonum/mat/lq.go
new file mode 100644
index 00000000000..a3b3543b086
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/lq.go
@@ -0,0 +1,305 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+const badLQ = "mat: invalid LQ factorization"
+
+// LQ is a type for creating and using the LQ factorization of a matrix.
+type LQ struct {
+	lq   *Dense
+	q    *Dense
+	tau  []float64
+	cond float64
+}
+
+// Dims returns the dimensions of the matrix.
+func (lq *LQ) Dims() (r, c int) {
+	if lq.lq == nil {
+		return 0, 0
+	}
+	return lq.lq.Dims()
+}
+
+// At returns the element at row i, column j.
+func (lq *LQ) At(i, j int) float64 {
+	m, n := lq.Dims()
+	if uint(i) >= uint(m) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	var val float64
+	for k := 0; k <= i; k++ {
+		val += lq.lq.at(i, k) * lq.q.at(k, j)
+	}
+	return val
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// Transpose.
+func (lq *LQ) T() Matrix {
+	return Transpose{lq}
+}
+
+func (lq *LQ) updateCond(norm lapack.MatrixNorm) {
+	// Since A = L*Q, and Q is orthogonal, we get for the condition number κ
+	//  κ(A) := |A| |A^-1| = |L*Q| |(L*Q)^-1| = |L| |Qᵀ * L^-1|
+	//        = |L| |L^-1| = κ(L),
+	// where we used the fact that Q^-1 = Qᵀ. However, this assumes that
+	// the matrix norm is invariant under orthogonal transformations which
+	// is not the case for CondNorm. Hopefully the error is negligible: κ
+	// is only a qualitative measure anyway.
+	m := lq.lq.mat.Rows
+	work := getFloat64s(3*m, false)
+	iwork := getInts(m, false)
+	l := lq.lq.asTriDense(m, blas.NonUnit, blas.Lower)
+	v := lapack64.Trcon(norm, l.mat, work, iwork)
+	lq.cond = 1 / v
+	putFloat64s(work)
+	putInts(iwork)
+}
+
+// Factorize computes the LQ factorization of an m×n matrix a where m <= n. The LQ
+// factorization always exists even if A is singular.
+//
+// The LQ decomposition is a factorization of the matrix A such that A = L * Q.
+// The matrix Q is an orthonormal n×n matrix, and L is an m×n lower triangular matrix.
+// L and Q can be extracted using the LTo and QTo methods.
+func (lq *LQ) Factorize(a Matrix) {
+	lq.factorize(a, CondNorm)
+}
+
+func (lq *LQ) factorize(a Matrix, norm lapack.MatrixNorm) {
+	m, n := a.Dims()
+	if m > n {
+		panic(ErrShape)
+	}
+	if lq.lq == nil {
+		lq.lq = &Dense{}
+	}
+	lq.lq.CloneFrom(a)
+	work := []float64{0}
+	lq.tau = make([]float64, m)
+	lapack64.Gelqf(lq.lq.mat, lq.tau, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	lapack64.Gelqf(lq.lq.mat, lq.tau, work, len(work))
+	putFloat64s(work)
+	lq.updateCond(norm)
+	lq.updateQ()
+}
+
+func (lq *LQ) updateQ() {
+	_, n := lq.Dims()
+	if lq.q == nil {
+		lq.q = NewDense(n, n, nil)
+	} else {
+		lq.q.reuseAsNonZeroed(n, n)
+	}
+	// Construct Q from the elementary reflectors.
+	lq.q.Copy(lq.lq)
+	work := []float64{0}
+	lapack64.Orglq(lq.q.mat, lq.tau, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	lapack64.Orglq(lq.q.mat, lq.tau, work, len(work))
+	putFloat64s(work)
+}
+
+// isValid returns whether the receiver contains a factorization.
+func (lq *LQ) isValid() bool {
+	return lq.lq != nil && !lq.lq.IsEmpty()
+}
+
+// Cond returns the condition number for the factorized matrix.
+// Cond will panic if the receiver does not contain a factorization.
+func (lq *LQ) Cond() float64 {
+	if !lq.isValid() {
+		panic(badLQ)
+	}
+	return lq.cond
+}
+
+// TODO(btracey): Add in the "Reduced" forms for extracting the m×m orthogonal
+// and upper triangular matrices.
+
+// LTo extracts the m×n lower trapezoidal matrix from a LQ decomposition.
+//
+// If dst is empty, LTo will resize dst to be m×n. When dst is
+// non-empty, LTo will panic if dst is not m×n. LTo will also panic
+// if the receiver does not contain a successful factorization.
+func (lq *LQ) LTo(dst *Dense) {
+	if !lq.isValid() {
+		panic(badLQ)
+	}
+
+	r, c := lq.lq.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	// Disguise the LQ as a lower triangular.
+	t := &TriDense{
+		mat: blas64.Triangular{
+			N:      r,
+			Stride: lq.lq.mat.Stride,
+			Data:   lq.lq.mat.Data,
+			Uplo:   blas.Lower,
+			Diag:   blas.NonUnit,
+		},
+		cap: lq.lq.capCols,
+	}
+	dst.Copy(t)
+
+	if r == c {
+		return
+	}
+	// Zero right of the triangular.
+	for i := 0; i < r; i++ {
+		zero(dst.mat.Data[i*dst.mat.Stride+r : i*dst.mat.Stride+c])
+	}
+}
+
+// QTo extracts the n×n orthonormal matrix Q from an LQ decomposition.
+//
+// If dst is empty, QTo will resize dst to be n×n. When dst is
+// non-empty, QTo will panic if dst is not n×n. QTo will also panic
+// if the receiver does not contain a successful factorization.
+func (lq *LQ) QTo(dst *Dense) {
+	if !lq.isValid() {
+		panic(badLQ)
+	}
+
+	_, n := lq.lq.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(n, n)
+	} else {
+		m2, n2 := dst.Dims()
+		if n != m2 || n != n2 {
+			panic(ErrShape)
+		}
+	}
+	dst.Copy(lq.q)
+}
+
+// SolveTo finds a minimum-norm solution to a system of linear equations defined
+// by the matrices A and b, where A is an m×n matrix represented in its LQ factorized
+// form. If A is singular or near-singular a Condition error is returned.
+// See the documentation for Condition for more information.
+//
+// The minimization problem solved depends on the input parameters.
+//
+//	If trans == false, find the minimum norm solution of A * X = B.
+//	If trans == true, find X such that ||Aᵀ*X - B||_2 is minimized.
+//
+// The solution matrix, X, is stored in place into dst.
+// SolveTo will panic if the receiver does not contain a factorization.
+func (lq *LQ) SolveTo(dst *Dense, trans bool, b Matrix) error {
+	if !lq.isValid() {
+		panic(badLQ)
+	}
+
+	r, c := lq.lq.Dims()
+	br, bc := b.Dims()
+
+	// The LQ solve algorithm stores the result in-place into the right hand side.
+	// The storage for the answer must be large enough to hold both b and x.
+	// However, dst must be exactly the size of x. Copy b into a workspace, and
+	// then copy the result into dst at the end.
+	if trans {
+		if c != br {
+			panic(ErrShape)
+		}
+		dst.reuseAsNonZeroed(r, bc)
+	} else {
+		if r != br {
+			panic(ErrShape)
+		}
+		dst.reuseAsNonZeroed(c, bc)
+	}
+	// Do not need to worry about overlap between dst and b because w has its
+	// own independent storage.
+	w := getDenseWorkspace(max(r, c), bc, false)
+	w.Copy(b)
+	t := lq.lq.asTriDense(lq.lq.mat.Rows, blas.NonUnit, blas.Lower).mat
+	if trans {
+		work := []float64{0}
+		lapack64.Ormlq(blas.Left, blas.NoTrans, lq.lq.mat, lq.tau, w.mat, work, -1)
+		work = getFloat64s(int(work[0]), false)
+		lapack64.Ormlq(blas.Left, blas.NoTrans, lq.lq.mat, lq.tau, w.mat, work, len(work))
+		putFloat64s(work)
+
+		ok := lapack64.Trtrs(blas.Trans, t, w.mat)
+		if !ok {
+			return Condition(math.Inf(1))
+		}
+	} else {
+		ok := lapack64.Trtrs(blas.NoTrans, t, w.mat)
+		if !ok {
+			return Condition(math.Inf(1))
+		}
+		for i := r; i < c; i++ {
+			zero(w.mat.Data[i*w.mat.Stride : i*w.mat.Stride+bc])
+		}
+		work := []float64{0}
+		lapack64.Ormlq(blas.Left, blas.Trans, lq.lq.mat, lq.tau, w.mat, work, -1)
+		work = getFloat64s(int(work[0]), false)
+		lapack64.Ormlq(blas.Left, blas.Trans, lq.lq.mat, lq.tau, w.mat, work, len(work))
+		putFloat64s(work)
+	}
+	// dst was set above to be the correct size for the result.
+	dst.Copy(w)
+	putDenseWorkspace(w)
+	if lq.cond > ConditionTolerance {
+		return Condition(lq.cond)
+	}
+	return nil
+}
+
+// SolveVecTo finds a minimum-norm solution to a system of linear equations.
+// See LQ.SolveTo for the full documentation.
+// SolveVecTo will panic if the receiver does not contain a factorization.
+func (lq *LQ) SolveVecTo(dst *VecDense, trans bool, b Vector) error {
+	if !lq.isValid() {
+		panic(badLQ)
+	}
+
+	r, c := lq.lq.Dims()
+	if _, bc := b.Dims(); bc != 1 {
+		panic(ErrShape)
+	}
+
+	// The Solve implementation is non-trivial, so rather than duplicate the code,
+	// instead recast the VecDenses as Dense and call the matrix code.
+	bm := Matrix(b)
+	if rv, ok := b.(RawVectorer); ok {
+		bmat := rv.RawVector()
+		if dst != b {
+			dst.checkOverlap(bmat)
+		}
+		b := VecDense{mat: bmat}
+		bm = b.asDense()
+	}
+	if trans {
+		dst.reuseAsNonZeroed(r)
+	} else {
+		dst.reuseAsNonZeroed(c)
+	}
+	return lq.SolveTo(dst.asDense(), trans, bm)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/lu.go b/vendor/gonum.org/v1/gonum/mat/lu.go
new file mode 100644
index 00000000000..18ed3dab636
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/lu.go
@@ -0,0 +1,485 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/floats"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+const (
+	badSliceLength = "mat: improper slice length"
+	badLU          = "mat: invalid LU factorization"
+)
+
+// LU is a square n×n matrix represented by its LU factorization with partial
+// pivoting.
+//
+// The factorization has the form
+//
+//	A = P * L * U
+//
+// where P is a permutation matrix, L is lower triangular with unit diagonal
+// elements, and U is upper triangular.
+//
+// Note that this matrix representation is useful for certain operations, in
+// particular for solving linear systems of equations. It is very inefficient at
+// other operations, in particular At is slow.
+type LU struct {
+	lu    *Dense
+	swaps []int
+	piv   []int
+	cond  float64
+	ok    bool // Whether A is nonsingular
+}
+
+var _ Matrix = (*LU)(nil)
+
+// Dims returns the dimensions of the matrix A.
+func (lu *LU) Dims() (r, c int) {
+	if lu.lu == nil {
+		return 0, 0
+	}
+	return lu.lu.Dims()
+}
+
+// At returns the element of A at row i, column j, recomputed from the stored factors. It panics with ErrRowAccess or ErrColAccess if i or j is out of range.
+func (lu *LU) At(i, j int) float64 {
+	n, _ := lu.Dims()
+	if uint(i) >= uint(n) { // the unsigned comparison also rejects a negative i
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	i = lu.piv[i] // continue with the permuted row index recorded in piv
+	var val float64
+	for k := 0; k < min(i, j+1); k++ { // k < i reads below the diagonal (L), k <= j reads on or above it (U)
+		val += lu.lu.at(i, k) * lu.lu.at(k, j)
+	}
+	if i <= j { // the diagonal of L is an implicit 1, so the stored (i, j) entry is added unscaled
+		val += lu.lu.at(i, j)
+	}
+	return val
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// Transpose.
+func (lu *LU) T() Matrix {
+	return Transpose{lu}
+}
+
+// updateCond updates the stored condition number of the matrix. anorm is the
+// norm of the original matrix. If anorm is negative it will be estimated.
+func (lu *LU) updateCond(anorm float64, norm lapack.MatrixNorm) {
+	n := lu.lu.mat.Cols
+	work := getFloat64s(4*n, false)
+	defer putFloat64s(work)
+	iwork := getInts(n, false)
+	defer putInts(iwork)
+	if anorm < 0 {
+		// This is an approximation. By the definition of a norm,
+		//  |AB| <= |A| |B|.
+		// Since A = L*U, we get for the condition number κ that
+		//  κ(A) := |A| |A^-1| = |L*U| |A^-1| <= |L| |U| |A^-1|,
+		// so this will overestimate the condition number somewhat.
+		// The norm of the original factorized matrix cannot be stored
+		// because of update possibilities.
+		u := lu.lu.asTriDense(n, blas.NonUnit, blas.Upper)
+		l := lu.lu.asTriDense(n, blas.Unit, blas.Lower)
+		unorm := lapack64.Lantr(norm, u.mat, work)
+		lnorm := lapack64.Lantr(norm, l.mat, work)
+		anorm = unorm * lnorm
+	}
+	v := lapack64.Gecon(norm, lu.lu.mat, anorm, work, iwork)
+	lu.cond = 1 / v
+}
+
+// Factorize computes the LU factorization of the square matrix A and stores the
+// result in the receiver. The LU decomposition will complete regardless of the
+// singularity of a.
+//
+// The L and U matrix factors can be extracted from the factorization using the
+// LTo and UTo methods. The matrix P can be extracted as a row permutation using
+// the RowPivots method and applied using Dense.PermuteRows.
+func (lu *LU) Factorize(a Matrix) {
+	lu.factorize(a, CondNorm)
+}
+
+func (lu *LU) factorize(a Matrix, norm lapack.MatrixNorm) {
+	m, n := a.Dims()
+	if m != n {
+		panic(ErrSquare)
+	}
+	if lu.lu == nil {
+		lu.lu = NewDense(n, n, nil)
+	} else {
+		lu.lu.Reset()
+		lu.lu.reuseAsNonZeroed(n, n)
+	}
+	lu.lu.Copy(a)
+	lu.swaps = useInt(lu.swaps, n)
+	lu.piv = useInt(lu.piv, n)
+	work := getFloat64s(n, false)
+	anorm := lapack64.Lange(norm, lu.lu.mat, work)
+	putFloat64s(work)
+	lu.ok = lapack64.Getrf(lu.lu.mat, lu.swaps)
+	lu.updatePivots(lu.swaps)
+	lu.updateCond(anorm, norm)
+}
+
+func (lu *LU) updatePivots(swaps []int) {
+	// Replay the sequence of row swaps in order to find the row permutation.
+	for i := range lu.piv {
+		lu.piv[i] = i
+	}
+	n, _ := lu.Dims()
+	for i := n - 1; i >= 0; i-- {
+		v := swaps[i]
+		lu.piv[i], lu.piv[v] = lu.piv[v], lu.piv[i]
+	}
+}
+
+// isValid returns whether the receiver contains a factorization.
+func (lu *LU) isValid() bool {
+	return lu.lu != nil && !lu.lu.IsEmpty()
+}
+
+// Cond returns the condition number for the factorized matrix.
+// Cond will panic if the receiver does not contain a factorization.
+func (lu *LU) Cond() float64 {
+	if !lu.isValid() {
+		panic(badLU)
+	}
+	return lu.cond
+}
+
+// Reset resets the factorization so that it can be reused as the receiver of a
+// dimensionally restricted operation. The cond and ok fields are left unchanged.
+func (lu *LU) Reset() {
+	if lu.lu != nil {
+		lu.lu.Reset()
+	}
+	lu.swaps = lu.swaps[:0] // truncate to zero length, keeping the backing array
+	lu.piv = lu.piv[:0]
+}
+
+func (lu *LU) isZero() bool { // isZero reports whether the receiver holds no row swaps
+	return len(lu.swaps) == 0 // swaps is empty for a zero-value LU and after Reset
+}
+
+// Det returns the determinant of the matrix that has been factorized. In many
+// expressions, using LogDet will be more numerically stable. Det returns 0,
+// without panicking, whenever the receiver's ok (nonsingular) flag is not set.
+func (lu *LU) Det() float64 {
+	if !lu.ok {
+		return 0
+	}
+	det, sign := lu.LogDet() // det is the log of |determinant|; LogDet panics if no factorization is held
+	return math.Exp(det) * sign
+}
+
+// LogDet returns the log of the absolute value of the determinant and the sign
+// of the determinant for the matrix that has been factorized. Stability in product
+// and division expressions is generally improved by working in log space.
+// LogDet will panic if the receiver does not contain a factorization.
+func (lu *LU) LogDet() (det float64, sign float64) {
+	if !lu.isValid() {
+		panic(badLU)
+	}
+
+	_, n := lu.lu.Dims()
+	logDiag := getFloat64s(n, false)
+	defer putFloat64s(logDiag)
+	sign = 1.0
+	for i := 0; i < n; i++ {
+		v := lu.lu.at(i, i)
+		if v < 0 {
+			sign *= -1
+		}
+		if lu.swaps[i] != i { // a recorded row interchange flips the sign once more
+			sign *= -1
+		}
+		logDiag[i] = math.Log(math.Abs(v))
+	}
+	return floats.Sum(logDiag), sign
+}
+
+// RowPivots returns the row permutation that represents the permutation matrix
+// P from the LU factorization
+//
+//	A = P * L * U.
+//
+// If dst is nil, a new slice is allocated and returned. If dst is not nil and
+// the length of dst does not equal the size of the factorized matrix, RowPivots
+// will panic. RowPivots will panic if the receiver does not contain a
+// factorization.
+func (lu *LU) RowPivots(dst []int) []int {
+	if !lu.isValid() {
+		panic(badLU)
+	}
+	_, n := lu.lu.Dims()
+	if dst == nil {
+		dst = make([]int, n)
+	}
+	if len(dst) != n {
+		panic(badSliceLength)
+	}
+	copy(dst, lu.piv)
+	return dst
+}
+
+// Deprecated: Use RowPivots instead.
+func (lu *LU) Pivot(dst []int) []int {
+	return lu.RowPivots(dst)
+}
+
+// RankOne updates an LU factorization as if a rank-one update had been applied to
+// the original matrix A, storing the result into the receiver. That is, if in
+// the original LU decomposition P * L * U = A, in the updated decomposition
+// P * L' * U' = A + alpha * x * yᵀ. The receiver takes over orig's nonsingularity flag.
+// RankOne panics if orig holds no factorization, or if x, y or a non-empty receiver do not match orig's size.
+func (lu *LU) RankOne(orig *LU, alpha float64, x, y Vector) {
+	if !orig.isValid() {
+		panic(badLU)
+	}
+
+	// RankOne uses algorithm a1 on page 28 of "Multiple-Rank Updates to Matrix
+	// Factorizations for Nonlinear Analysis and Circuit Design" by Linzhong Deng.
+	// http://web.stanford.edu/group/SOL/dissertations/Linzhong-Deng-thesis.pdf
+	_, n := orig.lu.Dims()
+	if r, c := x.Dims(); r != n || c != 1 {
+		panic(ErrShape)
+	}
+	if r, c := y.Dims(); r != n || c != 1 {
+		panic(ErrShape)
+	}
+	if orig != lu {
+		if lu.isZero() {
+			lu.swaps = useInt(lu.swaps, n)
+			lu.piv = useInt(lu.piv, n)
+			if lu.lu == nil {
+				lu.lu = NewDense(n, n, nil)
+			} else {
+				lu.lu.reuseAsNonZeroed(n, n)
+			}
+		} else if len(lu.swaps) != n {
+			panic(ErrShape)
+		}
+		copy(lu.swaps, orig.swaps)
+		lu.updatePivots(lu.swaps)
+		lu.lu.Copy(orig.lu)
+		lu.ok = orig.ok // without this a fresh receiver keeps ok == false and SolveTo always fails
+	}
+
+	xs := getFloat64s(n, false)
+	defer putFloat64s(xs)
+	ys := getFloat64s(n, false)
+	defer putFloat64s(ys)
+	for i := 0; i < n; i++ {
+		xs[i] = x.AtVec(i)
+		ys[i] = y.AtVec(i)
+	}
+	// Adjust for the pivoting in the LU factorization
+	for i, v := range lu.swaps {
+		xs[i], xs[v] = xs[v], xs[i]
+	}
+
+	lum := lu.lu.mat
+	omega := alpha
+	for j := 0; j < n; j++ {
+		ujj := lum.Data[j*lum.Stride+j]
+		ys[j] /= ujj
+		theta := 1 + xs[j]*ys[j]*omega
+		beta := omega * ys[j] / theta
+		gamma := omega * xs[j]
+		omega -= beta * gamma
+		lum.Data[j*lum.Stride+j] *= theta
+		for i := j + 1; i < n; i++ {
+			xs[i] -= lum.Data[i*lum.Stride+j] * xs[j]
+			tmp := ys[i]
+			ys[i] -= lum.Data[j*lum.Stride+i] * ys[j]
+			lum.Data[i*lum.Stride+j] += beta * xs[i]
+			lum.Data[j*lum.Stride+i] += gamma * tmp
+		}
+	}
+	lu.updateCond(-1, CondNorm)
+}
+
+// LTo extracts the lower triangular matrix from an LU factorization.
+//
+// If dst is empty, LTo will resize dst to be a lower-triangular n×n matrix.
+// When dst is non-empty, LTo will panic if dst is not n×n or not Lower.
+// LTo will also panic if the receiver does not contain a successful
+// factorization.
+func (lu *LU) LTo(dst *TriDense) *TriDense {
+	if !lu.isValid() {
+		panic(badLU)
+	}
+
+	_, n := lu.lu.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAsTri(n, Lower)
+	} else {
+		n2, kind := dst.Triangle()
+		if n != n2 {
+			panic(ErrShape)
+		}
+		if kind != Lower {
+			panic(ErrTriangle)
+		}
+	}
+	// Extract the lower triangular elements.
+	for i := 1; i < n; i++ {
+		copy(dst.mat.Data[i*dst.mat.Stride:i*dst.mat.Stride+i], lu.lu.mat.Data[i*lu.lu.mat.Stride:i*lu.lu.mat.Stride+i])
+	}
+	// Set ones on the diagonal.
+	for i := 0; i < n; i++ {
+		dst.mat.Data[i*dst.mat.Stride+i] = 1
+	}
+	return dst
+}
+
+// UTo extracts the upper triangular matrix from an LU factorization.
+//
+// If dst is empty, UTo will resize dst to be an upper-triangular n×n matrix.
+// When dst is non-empty, UTo will panic if dst is not n×n or not Upper.
+// UTo will also panic if the receiver does not contain a successful
+// factorization.
+func (lu *LU) UTo(dst *TriDense) {
+	if !lu.isValid() {
+		panic(badLU)
+	}
+
+	_, n := lu.lu.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAsTri(n, Upper)
+	} else {
+		n2, kind := dst.Triangle()
+		if n != n2 {
+			panic(ErrShape)
+		}
+		if kind != Upper {
+			panic(ErrTriangle)
+		}
+	}
+	// Extract the upper triangular elements.
+	for i := 0; i < n; i++ {
+		copy(dst.mat.Data[i*dst.mat.Stride+i:i*dst.mat.Stride+n], lu.lu.mat.Data[i*lu.lu.mat.Stride+i:i*lu.lu.mat.Stride+n])
+	}
+}
+
+// SolveTo solves a system of linear equations
+//
+//	A * X = B   if trans == false
+//	Aᵀ * X = B  if trans == true
+//
+// using the LU factorization of A stored in the receiver. The solution matrix X
+// is stored into dst.
+//
+// If A is singular or near-singular a Condition error is returned. See the
+// documentation for Condition for more information. SolveTo will panic if the
+// receiver does not contain a factorization.
+func (lu *LU) SolveTo(dst *Dense, trans bool, b Matrix) error {
+	if !lu.isValid() {
+		panic(badLU)
+	}
+
+	_, n := lu.lu.Dims()
+	br, bc := b.Dims()
+	if br != n {
+		panic(ErrShape)
+	}
+
+	if !lu.ok {
+		return Condition(math.Inf(1))
+	}
+
+	dst.reuseAsNonZeroed(n, bc)
+	bU, _ := untranspose(b)
+	if dst == bU {
+		var restore func()
+		dst, restore = dst.isolatedWorkspace(bU)
+		defer restore()
+	} else if rm, ok := bU.(RawMatrixer); ok {
+		dst.checkOverlap(rm.RawMatrix())
+	}
+
+	dst.Copy(b)
+	t := blas.NoTrans
+	if trans {
+		t = blas.Trans
+	}
+	lapack64.Getrs(t, lu.lu.mat, dst.mat, lu.swaps)
+	if lu.cond > ConditionTolerance {
+		return Condition(lu.cond)
+	}
+	return nil
+}
+
+// SolveVecTo solves a system of linear equations
+//
+//	A * x = b   if trans == false
+//	Aᵀ * x = b  if trans == true
+//
+// using the LU factorization of A stored in the receiver. The solution vector x
+// is stored into dst.
+//
+// If A is singular or near-singular a Condition error is returned. See the
+// documentation for Condition for more information. SolveVecTo will panic if the
+// receiver does not contain a factorization.
+func (lu *LU) SolveVecTo(dst *VecDense, trans bool, b Vector) error {
+	if !lu.isValid() {
+		panic(badLU)
+	}
+
+	_, n := lu.lu.Dims()
+	if br, bc := b.Dims(); br != n || bc != 1 {
+		panic(ErrShape)
+	}
+
+	switch rv := b.(type) {
+	default:
+		dst.reuseAsNonZeroed(n)
+		return lu.SolveTo(dst.asDense(), trans, b)
+	case RawVectorer:
+		if dst != b {
+			dst.checkOverlap(rv.RawVector())
+		}
+
+		if !lu.ok {
+			return Condition(math.Inf(1))
+		}
+
+		dst.reuseAsNonZeroed(n)
+		var restore func()
+		if dst == b {
+			dst, restore = dst.isolatedWorkspace(b)
+			defer restore()
+		}
+		dst.CopyVec(b)
+		vMat := blas64.General{
+			Rows:   n,
+			Cols:   1,
+			Stride: dst.mat.Inc,
+			Data:   dst.mat.Data,
+		}
+		t := blas.NoTrans
+		if trans {
+			t = blas.Trans
+		}
+		lapack64.Getrs(t, lu.lu.mat, vMat, lu.swaps)
+		if lu.cond > ConditionTolerance {
+			return Condition(lu.cond)
+		}
+		return nil
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/matrix.go b/vendor/gonum.org/v1/gonum/mat/matrix.go
new file mode 100644
index 00000000000..9fc372c71ed
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/matrix.go
@@ -0,0 +1,1014 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/floats/scalar"
+	"gonum.org/v1/gonum/lapack"
+)
+
+// Matrix is the basic matrix interface type.
+type Matrix interface {
+	// Dims returns the dimensions of a Matrix.
+	Dims() (r, c int)
+
+	// At returns the value of a matrix element at row i, column j.
+	// It will panic if i or j are out of bounds for the matrix.
+	At(i, j int) float64
+
+	// T returns the transpose of the Matrix. Whether T returns a copy of the
+	// underlying data is implementation dependent.
+	// This method may be implemented using the Transpose type, which
+	// provides an implicit matrix transpose.
+	T() Matrix
+}
+
+// allMatrix represents the extra set of methods that all mat Matrix types
+// should satisfy. This is used to enforce compile-time consistency between the
+// Dense types, especially helpful when adding new features.
+type allMatrix interface {
+	Reseter
+	IsEmpty() bool
+	Zero()
+}
+
+// denseMatrix represents the extra set of methods that all Dense Matrix types
+// should satisfy. This is used to enforce compile-time consistency between the
+// Dense types, especially helpful when adding new features.
+type denseMatrix interface {
+	DiagView() Diagonal
+	Tracer
+	Normer
+}
+
+var (
+	_ Matrix       = Transpose{}
+	_ Untransposer = Transpose{}
+)
+
+// Transpose is a type for performing an implicit matrix transpose. It implements
+// the Matrix interface, returning values from the transpose of the matrix within.
+type Transpose struct {
+	Matrix Matrix
+}
+
+// At returns the value of the element at row i and column j of the transposed
+// matrix, that is, row j and column i of the Matrix field.
+func (t Transpose) At(i, j int) float64 {
+	return t.Matrix.At(j, i)
+}
+
+// Dims returns the dimensions of the transposed matrix. The number of rows returned
+// is the number of columns in the Matrix field, and the number of columns is
+// the number of rows in the Matrix field.
+func (t Transpose) Dims() (r, c int) {
+	c, r = t.Matrix.Dims()
+	return r, c
+}
+
+// T performs an implicit transpose by returning the Matrix field.
+func (t Transpose) T() Matrix {
+	return t.Matrix
+}
+
+// Untranspose returns the Matrix field.
+func (t Transpose) Untranspose() Matrix {
+	return t.Matrix
+}
+
+// Untransposer is a type that can undo an implicit transpose.
+type Untransposer interface {
+	// Note: This interface is needed to unify all of the Transpose types. In
+	// the mat methods, we need to test if the Matrix has been implicitly
+	// transposed. If this is checked by testing for the specific Transpose type
+	// then the behavior will be different if the user uses T() or TTri() for a
+	// triangular matrix.
+
+	// Untranspose returns the underlying Matrix stored for the implicit transpose.
+	Untranspose() Matrix
+}
+
+// UntransposeBander is a type that can undo an implicit band transpose.
+type UntransposeBander interface {
+	// UntransposeBand returns the underlying Banded stored for the implicit transpose.
+	UntransposeBand() Banded
+}
+
+// UntransposeTrier is a type that can undo an implicit triangular transpose.
+type UntransposeTrier interface {
+	// UntransposeTri returns the underlying Triangular stored for the implicit transpose.
+	UntransposeTri() Triangular
+}
+
+// UntransposeTriBander is a type that can undo an implicit triangular banded
+// transpose.
+type UntransposeTriBander interface {
+	// UntransposeTriBand returns the underlying TriBanded stored for the implicit transpose.
+	UntransposeTriBand() TriBanded
+}
+
+// Mutable is a matrix interface type that allows elements to be altered.
+type Mutable interface {
+	// Set alters the matrix element at row i, column j to v.
+	// It will panic if i or j are out of bounds for the matrix.
+	Set(i, j int, v float64)
+
+	Matrix
+}
+
+// A RowViewer can return a Vector reflecting a row that is backed by the matrix
+// data. The Vector returned will have length equal to the number of columns.
+type RowViewer interface {
+	RowView(i int) Vector
+}
+
+// A RawRowViewer can return a slice of float64 reflecting a row that is backed by the matrix
+// data.
+type RawRowViewer interface {
+	RawRowView(i int) []float64
+}
+
+// A ColViewer can return a Vector reflecting a column that is backed by the matrix
+// data. The Vector returned will have length equal to the number of rows.
+type ColViewer interface {
+	ColView(j int) Vector
+}
+
+// A RawColViewer can return a slice of float64 reflecting a column that is backed by the matrix
+// data.
+type RawColViewer interface {
+	RawColView(j int) []float64
+}
+
+// A ClonerFrom can make a copy of a into the receiver, overwriting the previous value of the
+// receiver. The clone operation does not make any restriction on shape and will not cause
+// shadowing.
+type ClonerFrom interface {
+	CloneFrom(a Matrix)
+}
+
+// A Reseter can reset the matrix so that it can be reused as the receiver of a dimensionally
+// restricted operation. This is commonly used when the matrix is being used as a workspace
+// or temporary matrix.
+//
+// If the matrix is a view, using Reset may result in data corruption in elements outside
+// the view. Similarly, if the matrix shares backing data with another variable, using
+// Reset may lead to unexpected changes in data values.
+type Reseter interface {
+	Reset()
+}
+
+// A Copier can make a copy of elements of a into the receiver. The submatrix copied
+// starts at row and column 0 and has dimensions equal to the minimum dimensions of
+// the two matrices. The number of row and columns copied is returned.
+// Copy will copy from a source that aliases the receiver unless the source is transposed;
+// an aliasing transpose copy will panic with the exception for a special case when
+// the source data has a unitary increment or stride.
+type Copier interface {
+	Copy(a Matrix) (r, c int)
+}
+
+// A Grower can grow the size of the represented matrix by the given number of rows and columns.
+// Growing beyond the size given by the Caps method will result in the allocation of a new
+// matrix and copying of the elements. If Grow is called with negative increments it will
+// panic with ErrIndexOutOfRange.
+type Grower interface {
+	Caps() (r, c int)
+	Grow(r, c int) Matrix
+}
+
+// A RawMatrixSetter can set the underlying blas64.General used by the receiver. There is no restriction
+// on the shape of the receiver. Changes to the receiver's elements will be reflected in the blas64.General.Data.
+type RawMatrixSetter interface {
+	SetRawMatrix(a blas64.General)
+}
+
+// A RawMatrixer can return a blas64.General representation of the receiver. Changes to the blas64.General.Data
+// slice will be reflected in the original matrix, changes to the Rows, Cols and Stride fields will not.
+type RawMatrixer interface {
+	RawMatrix() blas64.General
+}
+
+// A RawVectorer can return a blas64.Vector representation of the receiver. Changes to the blas64.Vector.Data
+// slice will be reflected in the original matrix, changes to the Inc field will not.
+type RawVectorer interface {
+	RawVector() blas64.Vector
+}
+
+// A NonZeroDoer can call a function for each non-zero element of the receiver.
+// The parameters of the function are the element indices and its value.
+type NonZeroDoer interface {
+	DoNonZero(func(i, j int, v float64))
+}
+
+// A RowNonZeroDoer can call a function for each non-zero element of a row of the receiver.
+// The parameters of the function are the element indices and its value.
+type RowNonZeroDoer interface {
+	DoRowNonZero(i int, fn func(i, j int, v float64))
+}
+
+// A ColNonZeroDoer can call a function for each non-zero element of a column of the receiver.
+// The parameters of the function are the element indices and its value.
+type ColNonZeroDoer interface {
+	DoColNonZero(j int, fn func(i, j int, v float64))
+}
+
+// A SolveToer can solve a linear system A⋅X = B or Aᵀ⋅X = B where A is a matrix
+// represented by the receiver and B is a given matrix, storing the result into
+// dst.
+//
+// If dst is empty, SolveTo will resize it to the correct size, otherwise it
+// must have the correct size. Individual implementations may impose other
+// restrictions on the input parameters, for example that A is a square matrix.
+type SolveToer interface {
+	SolveTo(dst *Dense, trans bool, b Matrix) error
+}
+
+// untranspose untransposes a matrix if applicable. If a is an Untransposer, then
+// untranspose returns the underlying matrix and true. If it is not, then it returns
+// the input matrix and false.
+func untranspose(a Matrix) (Matrix, bool) {
+	if ut, ok := a.(Untransposer); ok {
+		return ut.Untranspose(), true
+	}
+	return a, false
+}
+
+// untransposeExtract returns an untransposed matrix in a built-in matrix type.
+//
+// The untransposed matrix is returned unaltered if it is a built-in matrix type.
+// Otherwise, if it implements a Raw method, an appropriate built-in type value
+// is returned holding the raw matrix value of the input. If neither of these
+// is possible, the untransposed matrix is returned.
+func untransposeExtract(a Matrix) (Matrix, bool) {
+	ut, trans := untranspose(a)
+	switch m := ut.(type) {
+	case *DiagDense, *SymBandDense, *TriBandDense, *BandDense, *TriDense, *SymDense, *Dense, *VecDense, *Tridiag:
+		return m, trans
+	// TODO(btracey): Add here if we ever have an equivalent of RawDiagDense.
+	case RawSymBander:
+		rsb := m.RawSymBand()
+		if rsb.Uplo != blas.Upper {
+			return ut, trans
+		}
+		var sb SymBandDense
+		sb.SetRawSymBand(rsb)
+		return &sb, trans
+	case RawTriBander:
+		rtb := m.RawTriBand()
+		if rtb.Diag == blas.Unit {
+			return ut, trans
+		}
+		var tb TriBandDense
+		tb.SetRawTriBand(rtb)
+		return &tb, trans
+	case RawBander:
+		var b BandDense
+		b.SetRawBand(m.RawBand())
+		return &b, trans
+	case RawTriangular:
+		rt := m.RawTriangular()
+		if rt.Diag == blas.Unit {
+			return ut, trans
+		}
+		var t TriDense
+		t.SetRawTriangular(rt)
+		return &t, trans
+	case RawSymmetricer:
+		rs := m.RawSymmetric()
+		if rs.Uplo != blas.Upper {
+			return ut, trans
+		}
+		var s SymDense
+		s.SetRawSymmetric(rs)
+		return &s, trans
+	case RawMatrixer:
+		var d Dense
+		d.SetRawMatrix(m.RawMatrix())
+		return &d, trans
+	case RawVectorer:
+		var v VecDense
+		v.SetRawVector(m.RawVector())
+		return &v, trans
+	case RawTridiagonaler:
+		var d Tridiag
+		d.SetRawTridiagonal(m.RawTridiagonal())
+		return &d, trans
+	default:
+		return ut, trans
+	}
+}
+
+// TODO(btracey): Consider adding CopyCol/CopyRow if the behavior seems useful.
+// TODO(btracey): Add in fast paths to Row/Col for the other concrete types
+// (TriDense, etc.) as well as relevant interfaces (RowColer, RawRowViewer, etc.)
+
+// Col copies the elements in the jth column of the matrix into the slice dst.
+// The length of the provided slice must equal the number of rows, unless the
+// slice is nil in which case a new slice is first allocated.
+func Col(dst []float64, j int, a Matrix) []float64 {
+	r, c := a.Dims()
+	if j < 0 || j >= c {
+		panic(ErrColAccess)
+	}
+	if dst == nil {
+		dst = make([]float64, r)
+	} else {
+		if len(dst) != r {
+			panic(ErrColLength)
+		}
+	}
+	aU, aTrans := untranspose(a)
+	if rm, ok := aU.(RawMatrixer); ok {
+		m := rm.RawMatrix()
+		if aTrans {
+			copy(dst, m.Data[j*m.Stride:j*m.Stride+m.Cols])
+			return dst
+		}
+		blas64.Copy(blas64.Vector{N: r, Inc: m.Stride, Data: m.Data[j:]},
+			blas64.Vector{N: r, Inc: 1, Data: dst},
+		)
+		return dst
+	}
+	for i := 0; i < r; i++ {
+		dst[i] = a.At(i, j)
+	}
+	return dst
+}
+
+// Row copies the elements in the ith row of the matrix into the slice dst and
+// returns it. A nil dst is allocated; otherwise len(dst) must equal the number of
+// columns (ErrRowLength). Row panics with ErrRowAccess if i is out of range.
+func Row(dst []float64, i int, a Matrix) []float64 {
+	r, c := a.Dims()
+	if i < 0 || i >= r {
+		panic(ErrRowAccess)
+	}
+	if dst == nil {
+		dst = make([]float64, c)
+	} else {
+		if len(dst) != c {
+			panic(ErrRowLength)
+		}
+	}
+	aU, aTrans := untranspose(a)
+	if rm, ok := aU.(RawMatrixer); ok {
+		m := rm.RawMatrix()
+		if aTrans { // row i of a is column i of the untransposed matrix, read with a stride
+			blas64.Copy(blas64.Vector{N: c, Inc: m.Stride, Data: m.Data[i:]},
+				blas64.Vector{N: c, Inc: 1, Data: dst},
+			)
+			return dst
+		}
+		copy(dst, m.Data[i*m.Stride:i*m.Stride+m.Cols])
+		return dst
+	}
+	for j := 0; j < c; j++ {
+		dst[j] = a.At(i, j)
+	}
+	return dst
+}
+
+// Cond returns the condition number of the given matrix under the given norm.
+// The condition number must be based on the 1-norm, 2-norm or ∞-norm.
+// Cond will panic with ErrZeroLength if the matrix has zero size.
+//
+// BUG(btracey): The computation of the 1-norm and ∞-norm for non-square matrices
+// is inaccurate, although is typically the right order of magnitude. See
+// https://github.com/xianyi/OpenBLAS/issues/636. While the value returned will
+// change with the resolution of this bug, the result from Cond will match the
+// condition number used internally.
+func Cond(a Matrix, norm float64) float64 {
+	m, n := a.Dims()
+	if m == 0 || n == 0 {
+		panic(ErrZeroLength)
+	}
+	var lnorm lapack.MatrixNorm
+	switch norm {
+	default:
+		panic("mat: bad norm value")
+	case 1:
+		lnorm = lapack.MaxColumnSum
+	case 2:
+		var svd SVD
+		ok := svd.Factorize(a, SVDNone)
+		if !ok {
+			return math.Inf(1)
+		}
+		return svd.Cond()
+	case math.Inf(1):
+		lnorm = lapack.MaxRowSum
+	}
+
+	if m == n {
+		// Use the LU decomposition to compute the condition number.
+		var lu LU
+		lu.factorize(a, lnorm)
+		return lu.Cond()
+	}
+	if m > n {
+		// Use the QR factorization to compute the condition number.
+		var qr QR
+		qr.factorize(a, lnorm)
+		return qr.Cond()
+	}
+	// Use the LQ factorization to compute the condition number.
+	var lq LQ
+	lq.factorize(a, lnorm)
+	return lq.Cond()
+}
+
+// Det returns the determinant of the square matrix a. In many expressions using
+// LogDet will be more numerically stable.
+//
+// Det panics with ErrSquare if a is not square and with ErrZeroLength if a has
+// zero size.
+func Det(a Matrix) float64 {
+	det, sign := LogDet(a)
+	return math.Exp(det) * sign
+}
+
+// Dot returns the sum of the element-wise product of a and b.
+//
+// Dot panics with ErrShape if the vector sizes are unequal and with
+// ErrZeroLength if the sizes are zero.
+func Dot(a, b Vector) float64 {
+	la := a.Len()
+	lb := b.Len()
+	if la != lb {
+		panic(ErrShape)
+	}
+	if la == 0 {
+		panic(ErrZeroLength)
+	}
+	if arv, ok := a.(RawVectorer); ok {
+		if brv, ok := b.(RawVectorer); ok {
+			return blas64.Dot(arv.RawVector(), brv.RawVector())
+		}
+	}
+	var sum float64
+	for i := 0; i < la; i++ {
+		sum += a.At(i, 0) * b.At(i, 0)
+	}
+	return sum
+}
+
+// Equal returns whether the matrices a and b have the same size
+// and are element-wise equal.
+func Equal(a, b Matrix) bool {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		return false
+	}
+	aU, aTrans := untranspose(a)
+	bU, bTrans := untranspose(b)
+	if rma, ok := aU.(RawMatrixer); ok {
+		if rmb, ok := bU.(RawMatrixer); ok {
+			ra := rma.RawMatrix()
+			rb := rmb.RawMatrix()
+			if aTrans == bTrans {
+				for i := 0; i < ra.Rows; i++ {
+					for j := 0; j < ra.Cols; j++ {
+						if ra.Data[i*ra.Stride+j] != rb.Data[i*rb.Stride+j] {
+							return false
+						}
+					}
+				}
+				return true
+			}
+			for i := 0; i < ra.Rows; i++ {
+				for j := 0; j < ra.Cols; j++ {
+					if ra.Data[i*ra.Stride+j] != rb.Data[j*rb.Stride+i] {
+						return false
+					}
+				}
+			}
+			return true
+		}
+	}
+	if rma, ok := aU.(RawSymmetricer); ok {
+		if rmb, ok := bU.(RawSymmetricer); ok {
+			ra := rma.RawSymmetric()
+			rb := rmb.RawSymmetric()
+			// Symmetric matrices are always upper and equal to their transpose.
+			for i := 0; i < ra.N; i++ {
+				for j := i; j < ra.N; j++ {
+					if ra.Data[i*ra.Stride+j] != rb.Data[i*rb.Stride+j] {
+						return false
+					}
+				}
+			}
+			return true
+		}
+	}
+	if ra, ok := aU.(*VecDense); ok {
+		if rb, ok := bU.(*VecDense); ok {
+			// If the raw vectors are the same length they must either both be
+			// transposed or both not transposed (or have length 1).
+			for i := 0; i < ra.mat.N; i++ {
+				if ra.mat.Data[i*ra.mat.Inc] != rb.mat.Data[i*rb.mat.Inc] {
+					return false
+				}
+			}
+			return true
+		}
+	}
+	for i := 0; i < ar; i++ {
+		for j := 0; j < ac; j++ {
+			if a.At(i, j) != b.At(i, j) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// EqualApprox returns whether the matrices a and b have the same size and contain all equal
+// elements with tolerance for element-wise equality specified by epsilon. Matrices
+// with non-equal shapes are not equal.
+func EqualApprox(a, b Matrix, epsilon float64) bool {
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ar != br || ac != bc {
+		return false
+	}
+	aU, aTrans := untranspose(a)
+	bU, bTrans := untranspose(b)
+	if rma, ok := aU.(RawMatrixer); ok {
+		if rmb, ok := bU.(RawMatrixer); ok {
+			ra := rma.RawMatrix()
+			rb := rmb.RawMatrix()
+			if aTrans == bTrans {
+				for i := 0; i < ra.Rows; i++ {
+					for j := 0; j < ra.Cols; j++ {
+						if !scalar.EqualWithinAbsOrRel(ra.Data[i*ra.Stride+j], rb.Data[i*rb.Stride+j], epsilon, epsilon) {
+							return false
+						}
+					}
+				}
+				return true
+			}
+			for i := 0; i < ra.Rows; i++ {
+				for j := 0; j < ra.Cols; j++ {
+					if !scalar.EqualWithinAbsOrRel(ra.Data[i*ra.Stride+j], rb.Data[j*rb.Stride+i], epsilon, epsilon) {
+						return false
+					}
+				}
+			}
+			return true
+		}
+	}
+	if rma, ok := aU.(RawSymmetricer); ok {
+		if rmb, ok := bU.(RawSymmetricer); ok {
+			ra := rma.RawSymmetric()
+			rb := rmb.RawSymmetric()
+			// Symmetric matrices are always upper and equal to their transpose.
+			for i := 0; i < ra.N; i++ {
+				for j := i; j < ra.N; j++ {
+					if !scalar.EqualWithinAbsOrRel(ra.Data[i*ra.Stride+j], rb.Data[i*rb.Stride+j], epsilon, epsilon) {
+						return false
+					}
+				}
+			}
+			return true
+		}
+	}
+	if ra, ok := aU.(*VecDense); ok {
+		if rb, ok := bU.(*VecDense); ok {
+			// If the raw vectors are the same length they must either both be
+			// transposed or both not transposed (or have length 1).
+			for i := 0; i < ra.mat.N; i++ {
+				if !scalar.EqualWithinAbsOrRel(ra.mat.Data[i*ra.mat.Inc], rb.mat.Data[i*rb.mat.Inc], epsilon, epsilon) {
+					return false
+				}
+			}
+			return true
+		}
+	}
+	for i := 0; i < ar; i++ {
+		for j := 0; j < ac; j++ {
+			if !scalar.EqualWithinAbsOrRel(a.At(i, j), b.At(i, j), epsilon, epsilon) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// LogDet returns the log of the absolute value of the determinant and the sign
+// of the determinant of the matrix a, using an LU factorization of a. Stability in
+// product and division expressions is generally improved by working in log space.
+//
+// LogDet panics with ErrSquare if a is not square and with ErrZeroLength if a
+// has zero size.
+func LogDet(a Matrix) (det float64, sign float64) {
+	// TODO(btracey): Add specialized routines for TriDense, etc.
+	var lu LU
+	lu.Factorize(a)
+	return lu.LogDet()
+}
+
+// Max returns the largest element value of the matrix A.
+//
+// Max will panic with ErrZeroLength if the matrix has zero size.
+func Max(a Matrix) float64 {
+	r, c := a.Dims()
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	// Max(A) = Max(Aᵀ)
+	aU, _ := untranspose(a)
+	switch m := aU.(type) {
+	case RawMatrixer:
+		rm := m.RawMatrix()
+		max := math.Inf(-1)
+		for i := 0; i < rm.Rows; i++ {
+			for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+rm.Cols] {
+				if v > max {
+					max = v
+				}
+			}
+		}
+		return max
+	case RawTriangular:
+		rm := m.RawTriangular()
+		// The max of a triangular is at least 0 unless the size is 1.
+		if rm.N == 1 {
+			return rm.Data[0]
+		}
+		max := 0.0
+		if rm.Uplo == blas.Upper {
+			for i := 0; i < rm.N; i++ {
+				for _, v := range rm.Data[i*rm.Stride+i : i*rm.Stride+rm.N] {
+					if v > max {
+						max = v
+					}
+				}
+			}
+			return max
+		}
+		for i := 0; i < rm.N; i++ {
+			for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+i+1] {
+				if v > max {
+					max = v
+				}
+			}
+		}
+		return max
+	case RawSymmetricer:
+		rm := m.RawSymmetric()
+		if rm.Uplo != blas.Upper {
+			panic(badSymTriangle)
+		}
+		max := math.Inf(-1)
+		for i := 0; i < rm.N; i++ {
+			for _, v := range rm.Data[i*rm.Stride+i : i*rm.Stride+rm.N] {
+				if v > max {
+					max = v
+				}
+			}
+		}
+		return max
+	default:
+		r, c := aU.Dims()
+		max := math.Inf(-1)
+		for i := 0; i < r; i++ {
+			for j := 0; j < c; j++ {
+				v := aU.At(i, j)
+				if v > max {
+					max = v
+				}
+			}
+		}
+		return max
+	}
+}
+
+// Min returns the smallest element value of the matrix A.
+//
+// Min will panic with ErrZeroLength if the matrix has zero size.
+func Min(a Matrix) float64 {
+	r, c := a.Dims()
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	// Min(A) = Min(Aᵀ)
+	aU, _ := untranspose(a)
+	switch m := aU.(type) {
+	case RawMatrixer:
+		rm := m.RawMatrix()
+		min := math.Inf(1)
+		for i := 0; i < rm.Rows; i++ {
+			for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+rm.Cols] {
+				if v < min {
+					min = v
+				}
+			}
+		}
+		return min
+	case RawTriangular:
+		rm := m.RawTriangular()
+		// The min of a triangular is at most 0 unless the size is 1.
+		if rm.N == 1 {
+			return rm.Data[0]
+		}
+		min := 0.0
+		if rm.Uplo == blas.Upper {
+			for i := 0; i < rm.N; i++ {
+				for _, v := range rm.Data[i*rm.Stride+i : i*rm.Stride+rm.N] {
+					if v < min {
+						min = v
+					}
+				}
+			}
+			return min
+		}
+		for i := 0; i < rm.N; i++ {
+			for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+i+1] {
+				if v < min {
+					min = v
+				}
+			}
+		}
+		return min
+	case RawSymmetricer:
+		rm := m.RawSymmetric()
+		if rm.Uplo != blas.Upper {
+			panic(badSymTriangle)
+		}
+		min := math.Inf(1)
+		for i := 0; i < rm.N; i++ {
+			for _, v := range rm.Data[i*rm.Stride+i : i*rm.Stride+rm.N] {
+				if v < min {
+					min = v
+				}
+			}
+		}
+		return min
+	default:
+		r, c := aU.Dims()
+		min := math.Inf(1)
+		for i := 0; i < r; i++ {
+			for j := 0; j < c; j++ {
+				v := aU.At(i, j)
+				if v < min {
+					min = v
+				}
+			}
+		}
+		return min
+	}
+}
+
+// A Normer can compute a norm of the matrix. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+type Normer interface {
+	Norm(norm float64) float64
+}
+
+// Norm returns the specified norm of the matrix A. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// If a is a Normer, its Norm method will be used to calculate the norm.
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
+func Norm(a Matrix, norm float64) float64 {
+	r, c := a.Dims()
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	m, trans := untransposeExtract(a)
+	if m, ok := m.(Normer); ok {
+		if trans {
+			switch norm {
+			case 1:
+				norm = math.Inf(1)
+			case math.Inf(1):
+				norm = 1
+			}
+		}
+		return m.Norm(norm)
+	}
+	switch norm {
+	default:
+		panic(ErrNormOrder)
+	case 1:
+		var max float64
+		for j := 0; j < c; j++ {
+			var sum float64
+			for i := 0; i < r; i++ {
+				sum += math.Abs(a.At(i, j))
+			}
+			if sum > max {
+				max = sum
+			}
+		}
+		return max
+	case 2:
+		var sum float64
+		for i := 0; i < r; i++ {
+			for j := 0; j < c; j++ {
+				v := a.At(i, j)
+				sum += v * v
+			}
+		}
+		return math.Sqrt(sum)
+	case math.Inf(1):
+		var max float64
+		for i := 0; i < r; i++ {
+			var sum float64
+			for j := 0; j < c; j++ {
+				sum += math.Abs(a.At(i, j))
+			}
+			if sum > max {
+				max = sum
+			}
+		}
+		return max
+	}
+}
+
+// normLapack converts the float64 norm input in Norm to a lapack.MatrixNorm.
+func normLapack(norm float64, aTrans bool) lapack.MatrixNorm {
+	switch norm {
+	case 1:
+		n := lapack.MaxColumnSum
+		if aTrans {
+			n = lapack.MaxRowSum
+		}
+		return n
+	case 2:
+		return lapack.Frobenius
+	case math.Inf(1):
+		n := lapack.MaxRowSum
+		if aTrans {
+			n = lapack.MaxColumnSum
+		}
+		return n
+	default:
+		panic(ErrNormOrder)
+	}
+}
+
+// Sum returns the sum of the elements of the matrix.
+//
+// Sum will panic with ErrZeroLength if the matrix has zero size.
+func Sum(a Matrix) float64 {
+	r, c := a.Dims()
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	var sum float64
+	aU, _ := untranspose(a)
+	switch rma := aU.(type) {
+	case RawSymmetricer:
+		rm := rma.RawSymmetric()
+		for i := 0; i < rm.N; i++ {
+			// Diagonals count once while off-diagonals count twice.
+			sum += rm.Data[i*rm.Stride+i]
+			var s float64
+			for _, v := range rm.Data[i*rm.Stride+i+1 : i*rm.Stride+rm.N] {
+				s += v
+			}
+			sum += 2 * s
+		}
+		return sum
+	case RawTriangular:
+		rm := rma.RawTriangular()
+		var startIdx, endIdx int
+		for i := 0; i < rm.N; i++ {
+			// Start and end index for this triangle-row.
+			switch rm.Uplo {
+			case blas.Upper:
+				startIdx = i
+				endIdx = rm.N
+			case blas.Lower:
+				startIdx = 0
+				endIdx = i + 1
+			default:
+				panic(badTriangle)
+			}
+			for _, v := range rm.Data[i*rm.Stride+startIdx : i*rm.Stride+endIdx] {
+				sum += v
+			}
+		}
+		return sum
+	case RawMatrixer:
+		rm := rma.RawMatrix()
+		for i := 0; i < rm.Rows; i++ {
+			for _, v := range rm.Data[i*rm.Stride : i*rm.Stride+rm.Cols] {
+				sum += v
+			}
+		}
+		return sum
+	case *VecDense:
+		rm := rma.RawVector()
+		for i := 0; i < rm.N; i++ {
+			sum += rm.Data[i*rm.Inc]
+		}
+		return sum
+	default:
+		r, c := a.Dims()
+		for i := 0; i < r; i++ {
+			for j := 0; j < c; j++ {
+				sum += a.At(i, j)
+			}
+		}
+		return sum
+	}
+}
+
+// A Tracer can compute the trace of the matrix. Trace must panic with ErrSquare
+// if the matrix is not square.
+type Tracer interface {
+	Trace() float64
+}
+
+// Trace returns the trace of the matrix. If a is a Tracer, its Trace method
+// will be used to calculate the matrix trace.
+//
+// Trace will panic with ErrSquare if the matrix is not square and with
+// ErrZeroLength if the matrix has zero size.
+func Trace(a Matrix) float64 {
+	r, c := a.Dims()
+	if r == 0 || c == 0 {
+		panic(ErrZeroLength)
+	}
+	m, _ := untransposeExtract(a)
+	if t, ok := m.(Tracer); ok {
+		return t.Trace()
+	}
+	if r != c {
+		panic(ErrSquare)
+	}
+	var v float64
+	for i := 0; i < r; i++ {
+		v += a.At(i, i)
+	}
+	return v
+}
+
+func min(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}
+
+func max(a, b int) int {
+	if b >= a {
+		return b
+	}
+	return a
+}
+
+// use returns a length-l float64 slice. The backing array of f is reused
+// when cap(f) is at least l; otherwise a fresh zeroed slice is allocated.
+func use(f []float64, l int) []float64 {
+	if cap(f) < l {
+		return make([]float64, l)
+	}
+	return f[:l]
+}
+
+// useZeroed returns a float64 slice with l elements, using f if it
+// has the necessary capacity, otherwise creating a new slice. The
+// elements of the returned slice are guaranteed to be zero.
+func useZeroed(f []float64, l int) []float64 {
+	if l <= cap(f) {
+		f = f[:l]
+		zero(f)
+		return f
+	}
+	return make([]float64, l)
+}
+
+// zero zeros the given slice's elements.
+func zero(f []float64) {
+	for i := range f {
+		f[i] = 0
+	}
+}
+
+// useInt returns a length-l int slice. The backing array of i is reused
+// when cap(i) is at least l; otherwise a fresh zeroed slice is allocated.
+func useInt(i []int, l int) []int {
+	if cap(i) < l {
+		return make([]int, l)
+	}
+	return i[:l]
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/offset.go b/vendor/gonum.org/v1/gonum/mat/offset.go
new file mode 100644
index 00000000000..26c80a4c8f8
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/offset.go
@@ -0,0 +1,32 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !safe
+// +build !safe
+
+package mat
+
+import "unsafe"
+
+// offset returns the number of float64 values b[0] is after a[0].
+func offset(a, b []float64) int {
+	if &a[0] == &b[0] {
+		return 0
+	}
+	// This expression must be atomic with respect to GC moves.
+	// At this stage this is true, because the GC does not
+	// move. See https://golang.org/issue/12445.
+	return int(uintptr(unsafe.Pointer(&b[0]))-uintptr(unsafe.Pointer(&a[0]))) / int(unsafe.Sizeof(float64(0)))
+}
+
+// offsetComplex returns the number of complex128 values b[0] is after a[0].
+func offsetComplex(a, b []complex128) int {
+	if &a[0] == &b[0] {
+		return 0
+	}
+	// This expression must be atomic with respect to GC moves.
+	// At this stage this is true, because the GC does not
+	// move. See https://golang.org/issue/12445.
+	return int(uintptr(unsafe.Pointer(&b[0]))-uintptr(unsafe.Pointer(&a[0]))) / int(unsafe.Sizeof(complex128(0)))
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/offset_appengine.go b/vendor/gonum.org/v1/gonum/mat/offset_appengine.go
new file mode 100644
index 00000000000..be2ca78cba8
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/offset_appengine.go
@@ -0,0 +1,40 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build safe
+// +build safe
+
+package mat
+
+import "reflect"
+
+var sizeOfFloat64 = int(reflect.TypeOf(float64(0)).Size())
+
+// offset returns the number of float64 values b[0] is after a[0].
+func offset(a, b []float64) int {
+	va0 := reflect.ValueOf(a).Index(0)
+	vb0 := reflect.ValueOf(b).Index(0)
+	if va0.Addr() == vb0.Addr() {
+		return 0
+	}
+	// This expression must be atomic with respect to GC moves.
+	// At this stage this is true, because the GC does not
+	// move. See https://golang.org/issue/12445.
+	return int(vb0.UnsafeAddr()-va0.UnsafeAddr()) / sizeOfFloat64
+}
+
+var sizeOfComplex128 = int(reflect.TypeOf(complex128(0)).Size())
+
+// offsetComplex returns the number of complex128 values b[0] is after a[0].
+func offsetComplex(a, b []complex128) int {
+	va0 := reflect.ValueOf(a).Index(0)
+	vb0 := reflect.ValueOf(b).Index(0)
+	if va0.Addr() == vb0.Addr() {
+		return 0
+	}
+	// This expression must be atomic with respect to GC moves.
+	// At this stage this is true, because the GC does not
+	// move. See https://golang.org/issue/12445.
+	return int(vb0.UnsafeAddr()-va0.UnsafeAddr()) / sizeOfComplex128
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/pool.go b/vendor/gonum.org/v1/gonum/mat/pool.go
new file mode 100644
index 00000000000..b9dce1c45ba
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/pool.go
@@ -0,0 +1,260 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math/bits"
+	"sync"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/blas/cblas128"
+)
+
+// poolFor returns ceil(log2(size)), treating a size of zero as pool 0.
+// The result indexes the size-stratified pool arrays, selecting the
+// sync.Pool whose fresh values have room for size elements.
+func poolFor(size uint) int {
+	if size > 0 {
+		return bits.Len(size - 1)
+	}
+	return 0
+}
+
+var (
+	// poolDense contains size stratified workspace Dense pools.
+	// Each poolDense element i returns sized matrices with a data
+	// slice capped at 1<<i.
+	poolDense [63]sync.Pool
+
+	// poolSymDense is the SymDense equivalent of poolDense.
+	poolSymDense [63]sync.Pool
+
+	// poolTriDense is the TriDense equivalent of poolDense.
+	poolTriDense [63]sync.Pool
+
+	// poolVecDense is the VecDense equivalent of poolDense.
+	poolVecDense [63]sync.Pool
+
+	// poolCDense is the CDense equivalent of poolDense.
+	poolCDense [63]sync.Pool
+
+	// poolFloat64s is the []float64 equivalent of poolDense.
+	poolFloat64s [63]sync.Pool
+
+	// poolInts is the []int equivalent of poolDense.
+	poolInts [63]sync.Pool
+)
+
+func init() {
+	for i := range poolDense {
+		l := 1 << uint(i)
+		// Real matrix pools.
+		poolDense[i].New = func() interface{} {
+			return &Dense{mat: blas64.General{
+				Data: make([]float64, l),
+			}}
+		}
+		poolSymDense[i].New = func() interface{} {
+			return &SymDense{mat: blas64.Symmetric{
+				Uplo: blas.Upper,
+				Data: make([]float64, l),
+			}}
+		}
+		poolTriDense[i].New = func() interface{} {
+			return &TriDense{mat: blas64.Triangular{
+				Data: make([]float64, l),
+			}}
+		}
+		poolVecDense[i].New = func() interface{} {
+			return &VecDense{mat: blas64.Vector{
+				Inc:  1,
+				Data: make([]float64, l),
+			}}
+		}
+
+		// Complex matrix pools.
+		poolCDense[i].New = func() interface{} {
+			return &CDense{mat: cblas128.General{
+				Data: make([]complex128, l),
+			}}
+		}
+
+		// Helper pools.
+		poolFloat64s[i].New = func() interface{} {
+			s := make([]float64, l)
+			return &s
+		}
+		poolInts[i].New = func() interface{} {
+			s := make([]int, l)
+			return &s
+		}
+	}
+}
+
+// getDenseWorkspace returns a *Dense of size r×c and a data slice
+// with a cap that is less than 2*r*c. If clear is true, the
+// data slice visible through the Matrix interface is zeroed.
+func getDenseWorkspace(r, c int, clear bool) *Dense {
+	l := uint(r * c)
+	w := poolDense[poolFor(l)].Get().(*Dense)
+	w.mat.Data = w.mat.Data[:l]
+	if clear {
+		zero(w.mat.Data)
+	}
+	w.mat.Rows = r
+	w.mat.Cols = c
+	w.mat.Stride = c
+	w.capRows = r
+	w.capCols = c
+	return w
+}
+
+// putDenseWorkspace replaces a used *Dense into the appropriate size
+// workspace pool. putDenseWorkspace must not be called with a matrix
+// where references to the underlying data slice have been kept.
+func putDenseWorkspace(w *Dense) {
+	poolDense[poolFor(uint(cap(w.mat.Data)))].Put(w)
+}
+
+// getSymDenseWorkspace returns a *SymDense of size n backed by a data
+// slice with a cap that is less than 2*n*n. If clear is true, the data
+// slice visible through the Matrix interface is zeroed.
+func getSymDenseWorkspace(n int, clear bool) *SymDense {
+	l := uint(n)
+	l *= l
+	s := poolSymDense[poolFor(l)].Get().(*SymDense)
+	s.mat.Data = s.mat.Data[:l]
+	if clear {
+		zero(s.mat.Data)
+	}
+	s.mat.N = n
+	s.mat.Stride = n
+	s.cap = n
+	return s
+}
+
+// putSymDenseWorkspace replaces a used *SymDense into the appropriate size
+// workspace pool. putSymDenseWorkspace must not be called with a matrix
+// where references to the underlying data slice have been kept.
+func putSymDenseWorkspace(s *SymDense) {
+	poolSymDense[poolFor(uint(cap(s.mat.Data)))].Put(s)
+}
+
+// getTriDenseWorkspace returns a *TriDense of size n backed by a data
+// slice with a cap that is less than 2*n*n. If clear is true, the data
+// slice visible through the Matrix interface is zeroed.
+func getTriDenseWorkspace(n int, kind TriKind, clear bool) *TriDense {
+	l := uint(n)
+	l *= l
+	t := poolTriDense[poolFor(l)].Get().(*TriDense)
+	t.mat.Data = t.mat.Data[:l]
+	if clear {
+		zero(t.mat.Data)
+	}
+	t.mat.N = n
+	t.mat.Stride = n
+	if kind == Upper {
+		t.mat.Uplo = blas.Upper
+	} else if kind == Lower {
+		t.mat.Uplo = blas.Lower
+	} else {
+		panic(ErrTriangle)
+	}
+	t.mat.Diag = blas.NonUnit
+	t.cap = n
+	return t
+}
+
+// putTriWorkspace replaces a used *TriDense into the appropriate size
+// workspace pool. putTriWorkspace must not be called with a matrix
+// where references to the underlying data slice have been kept.
+func putTriWorkspace(t *TriDense) {
+	poolTriDense[poolFor(uint(cap(t.mat.Data)))].Put(t)
+}
+
+// getVecDenseWorkspace returns a *VecDense of length n and a cap that
+// is less than 2*n. If clear is true, the data slice visible
+// through the Matrix interface is zeroed.
+func getVecDenseWorkspace(n int, clear bool) *VecDense {
+	l := uint(n)
+	v := poolVecDense[poolFor(l)].Get().(*VecDense)
+	v.mat.Data = v.mat.Data[:l]
+	if clear {
+		zero(v.mat.Data)
+	}
+	v.mat.N = n
+	return v
+}
+
+// putVecDenseWorkspace replaces a used *VecDense into the appropriate size
+// workspace pool. putVecDenseWorkspace must not be called with a matrix
+// where references to the underlying data slice have been kept.
+func putVecDenseWorkspace(v *VecDense) {
+	poolVecDense[poolFor(uint(cap(v.mat.Data)))].Put(v)
+}
+
+// getCDenseWorkspace returns a *CDense of size r×c and a data slice
+// with a cap that is less than 2*r*c. If clear is true, the
+// data slice visible through the CMatrix interface is zeroed.
+func getCDenseWorkspace(r, c int, clear bool) *CDense {
+	l := uint(r * c)
+	w := poolCDense[poolFor(l)].Get().(*CDense)
+	w.mat.Data = w.mat.Data[:l]
+	if clear {
+		zeroC(w.mat.Data)
+	}
+	w.mat.Rows = r
+	w.mat.Cols = c
+	w.mat.Stride = c
+	w.capRows = r
+	w.capCols = c
+	return w
+}
+
+// putCDenseWorkspace replaces a used *CDense into the appropriate size
+// workspace pool. putCDenseWorkspace must not be called with a matrix
+// where references to the underlying data slice have been kept.
+func putCDenseWorkspace(w *CDense) {
+	poolCDense[poolFor(uint(cap(w.mat.Data)))].Put(w)
+}
+
+// getFloat64s returns a []float64 of length l and a cap that is
+// less than 2*l. If clear is true, the slice visible is zeroed.
+func getFloat64s(l int, clear bool) []float64 {
+	w := *poolFloat64s[poolFor(uint(l))].Get().(*[]float64)
+	w = w[:l]
+	if clear {
+		zero(w)
+	}
+	return w
+}
+
+// putFloat64s replaces a used []float64 into the appropriate size
+// workspace pool. putFloat64s must not be called with a slice
+// where references to the underlying data have been kept.
+func putFloat64s(w []float64) {
+	poolFloat64s[poolFor(uint(cap(w)))].Put(&w)
+}
+
+// getInts returns a []int of length l and a cap that is
+// less than 2*l. If clear is true, the slice visible is zeroed.
+func getInts(l int, clear bool) []int {
+	w := *poolInts[poolFor(uint(l))].Get().(*[]int)
+	w = w[:l]
+	if clear {
+		for i := range w {
+			w[i] = 0
+		}
+	}
+	return w
+}
+
+// putInts replaces a used []int into the appropriate size
+// workspace pool. putInts must not be called with a slice
+// where references to the underlying data have been kept.
+func putInts(w []int) {
+	poolInts[poolFor(uint(cap(w)))].Put(&w)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/product.go b/vendor/gonum.org/v1/gonum/mat/product.go
new file mode 100644
index 00000000000..43e46a2c7f9
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/product.go
@@ -0,0 +1,193 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import "fmt"
+
+// Product calculates the product of the given factors and places the result in
+// the receiver. The order of multiplication operations is optimized to minimize
+// the number of floating point operations on the basis that all matrix
+// multiplications are general.
+func (m *Dense) Product(factors ...Matrix) {
+	// The operation order optimisation is the naive O(n^3) dynamic
+	// programming approach and does not take into consideration
+	// finer-grained optimisations that might be available.
+	//
+	// TODO(kortschak) Consider using the O(nlogn) or O(mlogn)
+	// algorithms that are available. e.g.
+	//
+	// e.g. http://www.jofcis.com/publishedpapers/2014_10_10_4299_4306.pdf
+	//
+	// In the case that this is replaced, retain this code in
+	// tests to compare against.
+
+	r, c := m.Dims()
+	switch len(factors) {
+	case 0:
+		if r != 0 || c != 0 {
+			panic(ErrShape)
+		}
+		return
+	case 1:
+		m.reuseAsNonZeroed(factors[0].Dims())
+		m.Copy(factors[0])
+		return
+	case 2:
+		// Don't do work that we know the answer to.
+		m.Mul(factors[0], factors[1])
+		return
+	}
+
+	p := newMultiplier(m, factors)
+	p.optimize()
+	result := p.multiply()
+	m.reuseAsNonZeroed(result.Dims())
+	m.Copy(result)
+	putDenseWorkspace(result)
+}
+
+// debugProductWalk enables debugging output for Product.
+const debugProductWalk = false
+
+// multiplier performs operation order optimisation and tree traversal.
+type multiplier struct {
+	// factors is the ordered set of
+	// factors to multiply.
+	factors []Matrix
+	// dims is the chain of factor
+	// dimensions.
+	dims []int
+
+	// table contains the dynamic
+	// programming costs and subchain
+	// division indices.
+	table table
+}
+
+func newMultiplier(m *Dense, factors []Matrix) *multiplier {
+	// Check size early, but don't yet
+	// allocate data for m.
+	r, c := m.Dims()
+	fr, fc := factors[0].Dims() // newMultiplier is only called with len(factors) > 2.
+	if !m.IsEmpty() {
+		if fr != r {
+			panic(ErrShape)
+		}
+		if _, lc := factors[len(factors)-1].Dims(); lc != c {
+			panic(ErrShape)
+		}
+	}
+	// Seed the chain ends from the factors; r and c are unchecked when m is empty.
+	dims := make([]int, len(factors)+1)
+	dims[0] = fr
+	pc := fc
+	for i, f := range factors[1:] {
+		cr, cc := f.Dims()
+		dims[i+1] = cr
+		if pc != cr {
+			panic(ErrShape)
+		}
+		pc = cc
+	}
+	dims[len(dims)-1] = pc
+
+	return &multiplier{
+		factors: factors,
+		dims:    dims,
+		table:   newTable(len(factors)),
+	}
+}
+
+// optimize determines an optimal matrix multiply operation order.
+func (p *multiplier) optimize() {
+	if debugProductWalk {
+		fmt.Printf("chain dims: %v\n", p.dims)
+	}
+	const maxInt = int(^uint(0) >> 1)
+	for f := 1; f < len(p.factors); f++ {
+		for i := 0; i < len(p.factors)-f; i++ {
+			j := i + f
+			p.table.set(i, j, entry{cost: maxInt})
+			for k := i; k < j; k++ {
+				cost := p.table.at(i, k).cost + p.table.at(k+1, j).cost + p.dims[i]*p.dims[k+1]*p.dims[j+1]
+				if cost < p.table.at(i, j).cost {
+					p.table.set(i, j, entry{cost: cost, k: k})
+				}
+			}
+		}
+	}
+}
+
+// multiply walks the optimal operation tree found by optimize and
+// returns the product of the full chain. The returned matrix is a
+// pooled workspace: it may be copied from, but should be returned
+// to the workspace pool with putDenseWorkspace.
+func (p *multiplier) multiply() *Dense {
+	result, _ := p.multiplySubchain(0, len(p.factors)-1)
+	if debugProductWalk {
+		r, c := result.Dims()
+		fmt.Printf("\tpop result (%d×%d) cost=%d\n", r, c, p.table.at(0, len(p.factors)-1).cost)
+	}
+	return result.(*Dense)
+}
+
+func (p *multiplier) multiplySubchain(i, j int) (m Matrix, intermediate bool) {
+	if i == j {
+		return p.factors[i], false
+	}
+
+	a, aTmp := p.multiplySubchain(i, p.table.at(i, j).k)
+	b, bTmp := p.multiplySubchain(p.table.at(i, j).k+1, j)
+
+	ar, ac := a.Dims()
+	br, bc := b.Dims()
+	if ac != br {
+		// Panic with a string since this
+		// is not a user-facing panic.
+		panic(ErrShape.Error())
+	}
+
+	if debugProductWalk {
+		fmt.Printf("\tpush f[%d] (%d×%d)%s * f[%d] (%d×%d)%s\n",
+			i, ar, ac, result(aTmp), j, br, bc, result(bTmp))
+	}
+
+	r := getDenseWorkspace(ar, bc, false)
+	r.Mul(a, b)
+	if aTmp {
+		putDenseWorkspace(a.(*Dense))
+	}
+	if bTmp {
+		putDenseWorkspace(b.(*Dense))
+	}
+	return r, true
+}
+
+type entry struct {
+	k    int // k is the chain subdivision index.
+	cost int // cost is the cost of the operation.
+}
+
+// table is a row major n×n dynamic programming table.
+type table struct {
+	n       int
+	entries []entry
+}
+
+func newTable(n int) table {
+	return table{n: n, entries: make([]entry, n*n)}
+}
+
+func (t table) at(i, j int) entry     { return t.entries[i*t.n+j] }
+func (t table) set(i, j int, e entry) { t.entries[i*t.n+j] = e }
+
+type result bool
+
+func (r result) String() string {
+	if r {
+		return " (popped result)"
+	}
+	return ""
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/qr.go b/vendor/gonum.org/v1/gonum/mat/qr.go
new file mode 100644
index 00000000000..af99dbcaa15
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/qr.go
@@ -0,0 +1,305 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+const badQR = "mat: invalid QR factorization"
+
+// QR is a type for creating and using the QR factorization of a matrix.
+type QR struct {
+	qr   *Dense
+	q    *Dense
+	tau  []float64
+	cond float64
+}
+
+// Dims returns the dimensions of the matrix.
+func (qr *QR) Dims() (r, c int) {
+	if qr.qr == nil {
+		return 0, 0
+	}
+	return qr.qr.Dims()
+}
+
+// At returns the element at row i, column j.
+func (qr *QR) At(i, j int) float64 {
+	m, n := qr.Dims()
+	if uint(i) >= uint(m) {
+		panic(ErrRowAccess)
+	}
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+
+	var val float64
+	for k := 0; k <= j; k++ {
+		val += qr.q.at(i, k) * qr.qr.at(k, j)
+	}
+	return val
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// Transpose.
+func (qr *QR) T() Matrix {
+	return Transpose{qr}
+}
+
+func (qr *QR) updateCond(norm lapack.MatrixNorm) {
+	// Since A = Q*R, and Q is orthogonal, we get for the condition number κ
+	//  κ(A) := |A| |A^-1| = |Q*R| |(Q*R)^-1| = |R| |R^-1 * Qᵀ|
+	//        = |R| |R^-1| = κ(R),
+	// where we used the fact that Q^-1 = Qᵀ. However, this assumes that
+	// the matrix norm is invariant under orthogonal transformations which
+	// is not the case for CondNorm. Hopefully the error is negligible: κ
+	// is only a qualitative measure anyway.
+	n := qr.qr.mat.Cols
+	work := getFloat64s(3*n, false)
+	iwork := getInts(n, false)
+	r := qr.qr.asTriDense(n, blas.NonUnit, blas.Upper)
+	v := lapack64.Trcon(norm, r.mat, work, iwork)
+	putFloat64s(work)
+	putInts(iwork)
+	qr.cond = 1 / v
+}
+
+// Factorize computes the QR factorization of an m×n matrix a where m >= n. The QR
+// factorization always exists even if A is singular.
+//
+// The QR decomposition is a factorization of the matrix A such that A = Q * R.
+// The matrix Q is an orthonormal m×m matrix, and R is an m×n upper triangular matrix.
+// Q and R can be extracted using the QTo and RTo methods.
+func (qr *QR) Factorize(a Matrix) {
+	qr.factorize(a, CondNorm)
+}
+
+func (qr *QR) factorize(a Matrix, norm lapack.MatrixNorm) {
+	m, n := a.Dims()
+	if m < n {
+		panic(ErrShape)
+	}
+	if qr.qr == nil {
+		qr.qr = &Dense{}
+	}
+	qr.qr.CloneFrom(a)
+	work := []float64{0}
+	qr.tau = make([]float64, n)
+	lapack64.Geqrf(qr.qr.mat, qr.tau, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	lapack64.Geqrf(qr.qr.mat, qr.tau, work, len(work))
+	putFloat64s(work)
+	qr.updateCond(norm)
+	qr.updateQ()
+}
+
+func (qr *QR) updateQ() {
+	m, _ := qr.Dims()
+	if qr.q == nil {
+		qr.q = NewDense(m, m, nil)
+	} else {
+		qr.q.reuseAsNonZeroed(m, m)
+	}
+	// Construct Q from the elementary reflectors.
+	qr.q.Copy(qr.qr)
+	work := []float64{0}
+	lapack64.Orgqr(qr.q.mat, qr.tau, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	lapack64.Orgqr(qr.q.mat, qr.tau, work, len(work))
+	putFloat64s(work)
+}
+
+// isValid returns whether the receiver contains a factorization.
+func (qr *QR) isValid() bool {
+	return qr.qr != nil && !qr.qr.IsEmpty()
+}
+
+// Cond returns the condition number for the factorized matrix.
+// Cond will panic if the receiver does not contain a factorization.
+func (qr *QR) Cond() float64 {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+	return qr.cond
+}
+
+// TODO(btracey): Add in the "Reduced" forms for extracting the n×n orthogonal
+// and upper triangular matrices.
+
+// RTo extracts the m×n upper trapezoidal matrix from a QR decomposition.
+//
+// If dst is empty, RTo will resize dst to be r×c. When dst is non-empty,
+// RTo will panic if dst is not c×c (both dimensions are checked against c).
+// RTo will also panic if the receiver does not contain a factorization.
+func (qr *QR) RTo(dst *Dense) {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+
+	r, c := qr.qr.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, c)
+	} else {
+		r2, c2 := dst.Dims()
+		if c != r2 || c != c2 {
+			panic(ErrShape)
+		}
+	}
+
+	// Disguise the QR as an upper triangular
+	t := &TriDense{
+		mat: blas64.Triangular{
+			N:      c,
+			Stride: qr.qr.mat.Stride,
+			Data:   qr.qr.mat.Data,
+			Uplo:   blas.Upper,
+			Diag:   blas.NonUnit,
+		},
+		cap: qr.qr.capCols,
+	}
+	dst.Copy(t)
+
+	// Zero below the triangular.
+	for i := r; i < c; i++ {
+		zero(dst.mat.Data[i*dst.mat.Stride : i*dst.mat.Stride+c])
+	}
+}
+
+// QTo extracts the r×r orthonormal matrix Q from a QR decomposition.
+//
+// If dst is empty, QTo will resize dst to be r×r. When dst is non-empty,
+// QTo will panic if dst is not r×r. QTo will also panic if the receiver
+// does not contain a successful factorization.
+func (qr *QR) QTo(dst *Dense) {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+
+	r, _ := qr.qr.Dims()
+	if dst.IsEmpty() {
+		dst.ReuseAs(r, r)
+	} else {
+		r2, c2 := dst.Dims()
+		if r != r2 || r != c2 {
+			panic(ErrShape)
+		}
+	}
+	dst.Copy(qr.q)
+}
+
+// SolveTo finds a minimum-norm solution to a system of linear equations defined
+// by the matrices A and b, where A is an m×n matrix represented in its QR factorized
+// form. If A is singular or near-singular a Condition error is returned.
+// See the documentation for Condition for more information.
+//
+// The minimization problem solved depends on the input parameters.
+//
+//	If trans == false, find X such that ||A*X - B||_2 is minimized.
+//	If trans == true, find the minimum norm solution of Aᵀ * X = B.
+//
+// The solution matrix, X, is stored in place into dst.
+// SolveTo will panic if the receiver does not contain a factorization.
+func (qr *QR) SolveTo(dst *Dense, trans bool, b Matrix) error {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+
+	r, c := qr.qr.Dims()
+	br, bc := b.Dims()
+
+	// The QR solve algorithm stores the result in-place into the right hand side.
+	// The storage for the answer must be large enough to hold both b and x.
+	// However, dst must be the size of x. Copy b into a workspace, and then
+	// copy the result into dst at the end.
+	if trans {
+		if c != br {
+			panic(ErrShape)
+		}
+		dst.reuseAsNonZeroed(r, bc)
+	} else {
+		if r != br {
+			panic(ErrShape)
+		}
+		dst.reuseAsNonZeroed(c, bc)
+	}
+	// Do not need to worry about overlap between dst and b because the
+	// workspace w has its own independent storage.
+	w := getDenseWorkspace(max(r, c), bc, false)
+	w.Copy(b)
+	t := qr.qr.asTriDense(qr.qr.mat.Cols, blas.NonUnit, blas.Upper).mat
+	if trans {
+		ok := lapack64.Trtrs(blas.Trans, t, w.mat)
+		if !ok {
+			return Condition(math.Inf(1))
+		}
+		for i := c; i < r; i++ {
+			zero(w.mat.Data[i*w.mat.Stride : i*w.mat.Stride+bc])
+		}
+		work := []float64{0}
+		lapack64.Ormqr(blas.Left, blas.NoTrans, qr.qr.mat, qr.tau, w.mat, work, -1)
+		work = getFloat64s(int(work[0]), false)
+		lapack64.Ormqr(blas.Left, blas.NoTrans, qr.qr.mat, qr.tau, w.mat, work, len(work))
+		putFloat64s(work)
+	} else {
+		work := []float64{0}
+		lapack64.Ormqr(blas.Left, blas.Trans, qr.qr.mat, qr.tau, w.mat, work, -1)
+		work = getFloat64s(int(work[0]), false)
+		lapack64.Ormqr(blas.Left, blas.Trans, qr.qr.mat, qr.tau, w.mat, work, len(work))
+		putFloat64s(work)
+
+		ok := lapack64.Trtrs(blas.NoTrans, t, w.mat)
+		if !ok {
+			return Condition(math.Inf(1))
+		}
+	}
+	// dst was sized above to be the correct size for the result.
+	dst.Copy(w)
+	putDenseWorkspace(w)
+	if qr.cond > ConditionTolerance {
+		return Condition(qr.cond)
+	}
+	return nil
+}
+
+// SolveVecTo finds a minimum-norm solution to a system of linear equations,
+//
+//	Ax = b.
+//
+// See QR.SolveTo for the full documentation.
+// SolveVecTo will panic if the receiver does not contain a factorization.
+func (qr *QR) SolveVecTo(dst *VecDense, trans bool, b Vector) error {
+	if !qr.isValid() {
+		panic(badQR)
+	}
+
+	r, c := qr.qr.Dims()
+	if _, bc := b.Dims(); bc != 1 {
+		panic(ErrShape)
+	}
+
+	// The Solve implementation is non-trivial, so rather than duplicate the code,
+	// instead recast the VecDenses as Dense and call the matrix code.
+	bm := Matrix(b)
+	if rv, ok := b.(RawVectorer); ok {
+		bmat := rv.RawVector()
+		if dst != b {
+			dst.checkOverlap(bmat)
+		}
+		b := VecDense{mat: bmat}
+		bm = b.asDense()
+	}
+	if trans {
+		dst.reuseAsNonZeroed(r)
+	} else {
+		dst.reuseAsNonZeroed(c)
+	}
+	return qr.SolveTo(dst.asDense(), trans, bm)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/shadow.go b/vendor/gonum.org/v1/gonum/mat/shadow.go
new file mode 100644
index 00000000000..4fc24c3466a
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/shadow.go
@@ -0,0 +1,243 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import "gonum.org/v1/gonum/blas/blas64"
+
+// checkOverlap panics if the data elements referenced by a and b overlap
+// and returns false when they do not.
+//
+// The boolean result exists only so that a call can be placed inside a
+// boolean expression and combined with short-circuit operators; it is
+// never true.
+func checkOverlap(a, b blas64.General) bool {
+	if cap(a.Data) == 0 || cap(b.Data) == 0 {
+		return false
+	}
+
+	delta := offset(a.Data[:1], b.Data[:1])
+
+	switch {
+	case delta == 0:
+		// At least one element is shared.
+		if a.Rows == b.Rows && a.Cols == b.Cols && a.Stride == b.Stride {
+			panic(regionIdentity)
+		}
+		panic(regionOverlap)
+	case delta > 0 && len(a.Data) <= delta:
+		// a lies completely before b.
+		return false
+	case delta < 0 && len(b.Data) <= -delta:
+		// a lies completely after b.
+		return false
+	}
+
+	if a.Stride != b.Stride && a.Stride != 1 && b.Stride != 1 {
+		// Unequal non-unit strides are too hard to analyse, so assume
+		// the worst; a unit stride is caught by rectanglesOverlap.
+		panic(mismatchedStrides)
+	}
+
+	first, second := a.Cols, b.Cols
+	if delta < 0 {
+		// Make the offset positive and swap the column counts to match.
+		delta, first, second = -delta, second, first
+	}
+	if rectanglesOverlap(delta, first, second, min(a.Stride, b.Stride)) {
+		panic(regionOverlap)
+	}
+	return false
+}
+
+// checkOverlap passes the receiver's raw matrix and a to the package-level
+// checkOverlap and returns that call's result.
+func (m *Dense) checkOverlap(a blas64.General) bool {
+	self := m.RawMatrix()
+	return checkOverlap(self, a)
+}
+
+// checkOverlapMatrix checks the receiver against a for overlapping data.
+// It returns false without checking when a is the receiver itself or when a
+// exposes none of the raw representations handled below; otherwise the raw
+// data of a is converted to a blas64.General and passed to m.checkOverlap.
+func (m *Dense) checkOverlapMatrix(a Matrix) bool {
+	if m == a {
+		return false
+	}
+	// A type implementing several of these interfaces is handled by
+	// the first matching case.
+	switch raw := a.(type) {
+	case RawMatrixer:
+		return m.checkOverlap(raw.RawMatrix())
+	case RawSymmetricer:
+		return m.checkOverlap(generalFromSymmetric(raw.RawSymmetric()))
+	case RawSymBander:
+		return m.checkOverlap(generalFromSymmetricBand(raw.RawSymBand()))
+	case RawTriangular:
+		return m.checkOverlap(generalFromTriangular(raw.RawTriangular()))
+	case RawVectorer:
+		rows, cols := a.Dims()
+		return m.checkOverlap(generalFromVector(raw.RawVector(), rows, cols))
+	default:
+		return false
+	}
+}
+
+// checkOverlap converts the receiver's raw symmetric matrix to a
+// blas64.General and checks it against a with the package-level checkOverlap.
+func (s *SymDense) checkOverlap(a blas64.General) bool {
+	self := generalFromSymmetric(s.RawSymmetric())
+	return checkOverlap(self, a)
+}
+
+// checkOverlapMatrix checks the receiver against a for overlapping data.
+// It returns false without checking when a is the receiver itself or when a
+// exposes none of the raw representations handled below; otherwise the raw
+// data of a is converted to a blas64.General and passed to s.checkOverlap.
+func (s *SymDense) checkOverlapMatrix(a Matrix) bool {
+	if s == a {
+		return false
+	}
+	// A type implementing several of these interfaces is handled by
+	// the first matching case.
+	switch raw := a.(type) {
+	case RawMatrixer:
+		return s.checkOverlap(raw.RawMatrix())
+	case RawSymmetricer:
+		return s.checkOverlap(generalFromSymmetric(raw.RawSymmetric()))
+	case RawSymBander:
+		return s.checkOverlap(generalFromSymmetricBand(raw.RawSymBand()))
+	case RawTriangular:
+		return s.checkOverlap(generalFromTriangular(raw.RawTriangular()))
+	case RawVectorer:
+		rows, cols := a.Dims()
+		return s.checkOverlap(generalFromVector(raw.RawVector(), rows, cols))
+	default:
+		return false
+	}
+}
+
+// generalFromSymmetric returns a blas64.General with the backing
+// data and dimensions of a.
+func generalFromSymmetric(a blas64.Symmetric) blas64.General {
+	return blas64.General{
+		Rows:   a.N,
+		Cols:   a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+}
+
+// checkOverlap converts the receiver's raw triangular matrix to a
+// blas64.General and checks it against a with the package-level checkOverlap.
+func (t *TriDense) checkOverlap(a blas64.General) bool {
+	self := generalFromTriangular(t.RawTriangular())
+	return checkOverlap(self, a)
+}
+
+// checkOverlapMatrix checks the receiver against a for overlapping data.
+// It returns false without checking when a is the receiver itself or when a
+// exposes none of the raw representations handled below; otherwise the raw
+// data of a is converted to a blas64.General and passed to t.checkOverlap.
+func (t *TriDense) checkOverlapMatrix(a Matrix) bool {
+	if t == a {
+		return false
+	}
+	// A type implementing several of these interfaces is handled by
+	// the first matching case.
+	switch raw := a.(type) {
+	case RawMatrixer:
+		return t.checkOverlap(raw.RawMatrix())
+	case RawSymmetricer:
+		return t.checkOverlap(generalFromSymmetric(raw.RawSymmetric()))
+	case RawSymBander:
+		return t.checkOverlap(generalFromSymmetricBand(raw.RawSymBand()))
+	case RawTriangular:
+		return t.checkOverlap(generalFromTriangular(raw.RawTriangular()))
+	case RawVectorer:
+		rows, cols := a.Dims()
+		return t.checkOverlap(generalFromVector(raw.RawVector(), rows, cols))
+	default:
+		return false
+	}
+}
+
+// generalFromTriangular returns a blas64.General with the backing
+// data and dimensions of a.
+func generalFromTriangular(a blas64.Triangular) blas64.General {
+	return blas64.General{
+		Rows:   a.N,
+		Cols:   a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+}
+
+// checkOverlap panics if the receiver's vector data overlaps the data of a
+// and returns false when it does not.
+//
+// The boolean result allows the call to be used inside a boolean
+// expression with short-circuit operators; it is never true.
+func (v *VecDense) checkOverlap(a blas64.Vector) bool {
+	mat := v.mat
+	if cap(mat.Data) == 0 || cap(a.Data) == 0 {
+		return false
+	}
+
+	off := offset(mat.Data[:1], a.Data[:1])
+
+	if off == 0 {
+		// At least one element overlaps.
+		if mat.Inc == a.Inc && len(mat.Data) == len(a.Data) {
+			panic(regionIdentity)
+		}
+		panic(regionOverlap)
+	}
+
+	if off > 0 && len(mat.Data) <= off {
+		// We know v is completely before a.
+		return false
+	}
+	if off < 0 && len(a.Data) <= -off {
+		// We know v is completely after a.
+		return false
+	}
+
+	if mat.Inc != a.Inc && mat.Inc != 1 && a.Inc != 1 {
+		// Too hard, so assume the worst; if either
+		// increment is one it will be caught below.
+		panic(mismatchedStrides)
+	}
+	inc := min(mat.Inc, a.Inc)
+
+	// The data slices intersect at this point. With a unit increment every
+	// slot of the intersection is in use. Otherwise both vectors step by
+	// inc and share elements exactly when their starts differ by a
+	// multiple of inc: off=1, inc=2 is interleaved and safe, while
+	// off=2, inc=2 overlaps. A zero increment is treated as overlapping
+	// so that the modulo cannot divide by zero.
+	if inc == 0 || inc == 1 || off%inc == 0 {
+		panic(regionOverlap)
+	}
+	return false
+}
+
+// generalFromVector returns a blas64.General with the backing
+// data and dimensions of a.
+func generalFromVector(a blas64.Vector, r, c int) blas64.General {
+	return blas64.General{
+		Rows:   r,
+		Cols:   c,
+		Stride: a.Inc,
+		Data:   a.Data,
+	}
+}
+
+// checkOverlap converts the receiver's raw symmetric band matrix to a
+// blas64.General and checks it against a with the package-level checkOverlap.
+func (s *SymBandDense) checkOverlap(a blas64.General) bool {
+	self := generalFromSymmetricBand(s.RawSymBand())
+	return checkOverlap(self, a)
+}
+
+// checkOverlapMatrix checks the receiver against a for overlapping data.
+// It returns false without checking when a is the receiver itself or when a
+// exposes none of the raw representations handled below; otherwise the raw
+// data of a is converted to a blas64.General and passed to s.checkOverlap.
+//
+//lint:ignore U1000 This will be used when we do shadow checks for banded matrices.
+func (s *SymBandDense) checkOverlapMatrix(a Matrix) bool {
+	if s == a {
+		return false
+	}
+	// A type implementing several of these interfaces is handled by
+	// the first matching case.
+	switch raw := a.(type) {
+	case RawMatrixer:
+		return s.checkOverlap(raw.RawMatrix())
+	case RawSymmetricer:
+		return s.checkOverlap(generalFromSymmetric(raw.RawSymmetric()))
+	case RawSymBander:
+		return s.checkOverlap(generalFromSymmetricBand(raw.RawSymBand()))
+	case RawTriangular:
+		return s.checkOverlap(generalFromTriangular(raw.RawTriangular()))
+	case RawVectorer:
+		rows, cols := a.Dims()
+		return s.checkOverlap(generalFromVector(raw.RawVector(), rows, cols))
+	default:
+		return false
+	}
+}
+
+// generalFromSymmetricBand returns a blas64.General with the backing
+// data and dimensions of a.
+func generalFromSymmetricBand(a blas64.SymmetricBand) blas64.General {
+	return blas64.General{
+		Rows:   a.N,
+		Cols:   a.K + 1,
+		Data:   a.Data,
+		Stride: a.Stride,
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/shadow_common.go b/vendor/gonum.org/v1/gonum/mat/shadow_common.go
new file mode 100644
index 00000000000..e4cdf4ddee1
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/shadow_common.go
@@ -0,0 +1,54 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+// Panic strings raised by the overlap checks.
+const (
+	// regionOverlap is the panic string used for the general case
+	// of a matrix region overlap between a source and destination.
+	regionOverlap = "mat: bad region: overlap"
+
+	// regionIdentity is the panic string used for the specific
+	// case of complete agreement between a source and a destination,
+	// that is the same starting element, dimensions and stride.
+	regionIdentity = "mat: bad region: identical"
+
+	// mismatchedStrides is the panic string used for overlapping
+	// data slices whose strides differ and are both non-unit.
+	mismatchedStrides = "mat: bad region: different strides"
+)
+
+// rectanglesOverlap reports whether the strided rectangles a and b overlap,
+// given that b starts off elements after a and has at least one element
+// before the end of a. off must be positive. aCols and bCols are the column
+// counts of a and b, and stride is their common stride.
+//
+// Both matrices are shifted left so that the first column of a sits at 0,
+// and the first and one-past-last columns of b are then reduced modulo the
+// stride. For data slices that are known to overlap this makes the column
+// ranges of a and b directly comparable.
+func rectanglesOverlap(off, aCols, bCols, stride int) bool {
+	if stride == 1 {
+		// With unit stride, overlapping data slices
+		// always overlap as matrices.
+		return true
+	}
+
+	// Column range of b relative to a, modulo the common stride;
+	// a itself occupies [0, aCols).
+	start := off % stride
+	end := (start + bCols) % stride
+
+	// b wraps past the stride boundary when its reduced end falls
+	// at or before its start without landing exactly on the boundary.
+	// A wrapped b must overlap a; otherwise b overlaps a only when it
+	// starts inside the columns of a.
+	wraps := end != 0 && end <= start
+	return wraps || start < aCols
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/shadow_complex.go b/vendor/gonum.org/v1/gonum/mat/shadow_complex.go
new file mode 100644
index 00000000000..1a3f3fc2313
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/shadow_complex.go
@@ -0,0 +1,72 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// TODO(kortschak): Generate this file from shadow.go when all complex type are available.
+
+package mat
+
+import "gonum.org/v1/gonum/blas/cblas128"
+
+// checkOverlapComplex panics if the data elements referenced by a and b
+// overlap and returns false when they do not.
+//
+// The boolean result exists only so that a call can be placed inside a
+// boolean expression and combined with short-circuit operators; it is
+// never true.
+func checkOverlapComplex(a, b cblas128.General) bool {
+	if cap(a.Data) == 0 || cap(b.Data) == 0 {
+		return false
+	}
+
+	delta := offsetComplex(a.Data[:1], b.Data[:1])
+
+	switch {
+	case delta == 0:
+		// At least one element is shared.
+		if a.Rows == b.Rows && a.Cols == b.Cols && a.Stride == b.Stride {
+			panic(regionIdentity)
+		}
+		panic(regionOverlap)
+	case delta > 0 && len(a.Data) <= delta:
+		// a lies completely before b.
+		return false
+	case delta < 0 && len(b.Data) <= -delta:
+		// a lies completely after b.
+		return false
+	}
+
+	if a.Stride != b.Stride && a.Stride != 1 && b.Stride != 1 {
+		// Unequal non-unit strides are too hard to analyse, so assume
+		// the worst; a unit stride is caught by rectanglesOverlap.
+		panic(mismatchedStrides)
+	}
+
+	first, second := a.Cols, b.Cols
+	if delta < 0 {
+		// Make the offset positive and swap the column counts to match.
+		delta, first, second = -delta, second, first
+	}
+	if rectanglesOverlap(delta, first, second, min(a.Stride, b.Stride)) {
+		panic(regionOverlap)
+	}
+	return false
+}
+
+// checkOverlap passes the receiver's raw complex matrix and a to
+// checkOverlapComplex and returns that call's result.
+func (m *CDense) checkOverlap(a cblas128.General) bool {
+	self := m.RawCMatrix()
+	return checkOverlapComplex(self, a)
+}
+
+func (m *CDense) checkOverlapMatrix(a CMatrix) bool {
+	if m == a {
+		return false
+	}
+	var amat cblas128.General
+	switch ar := a.(type) {
+	default:
+		return false
+	case RawCMatrixer:
+		amat = ar.RawCMatrix()
+	}
+	return m.checkOverlap(amat)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/solve.go b/vendor/gonum.org/v1/gonum/mat/solve.go
new file mode 100644
index 00000000000..ffccce8c45c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/solve.go
@@ -0,0 +1,124 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+// Solve computes the solution of the linear least squares problem
+//
+//	minimize over x |b - A*x|_2
+//
+// for an m×n matrix A, a given m element vector b and an n element unknown
+// x, under the assumption that A has full rank, that is
+//
+//	rank(A) = min(m,n)
+//
+// When m >= n the system is overdetermined and Solve finds its unique least
+// squares solution.
+//
+// When m < n the system is underdetermined and infinitely many x satisfy
+// b-A*x=0; Solve finds the unique one that minimizes |x|_2.
+//
+// Multiple right-hand sides can be solved in one call by storing the
+// vectors b as the columns of the m×k matrix B. The solution vectors x are
+// stored in-place into the n×k receiver.
+//
+// If the matrix underlying a is a SolveToer, its SolveTo method is used.
+// Otherwise a is factorized with LU when it is square, with QR when m > n
+// and with LQ when m < n. Solve panics with ErrShape if a and b have
+// different numbers of rows.
+//
+// If A does not have full rank, a Condition error is returned. See the
+// documentation for Condition for more information.
+func (m *Dense) Solve(a, b Matrix) error {
+	base, transposed := untransposeExtract(a)
+	if solver, ok := base.(SolveToer); ok {
+		return solver.SolveTo(m, transposed, b)
+	}
+
+	rows, cols := a.Dims()
+	bRows, bCols := b.Dims()
+	if rows != bRows {
+		panic(ErrShape)
+	}
+	m.reuseAsNonZeroed(cols, bCols)
+
+	if rows > cols {
+		var qr QR
+		qr.Factorize(a)
+		return qr.SolveTo(m, false, b)
+	}
+	if rows < cols {
+		var lq LQ
+		lq.Factorize(a)
+		return lq.SolveTo(m, false, b)
+	}
+
+	// a is square from here on.
+	if a != b {
+		var lu LU
+		lu.Factorize(a)
+		return lu.SolveTo(m, false, b)
+	}
+
+	// a and b are the same matrix, so x = I.
+	if rows == 1 {
+		m.mat.Data[0] = 1
+		return nil
+	}
+	for i := 0; i < rows; i++ {
+		row := m.mat.Data[i*m.mat.Stride : i*m.mat.Stride+cols]
+		zero(row)
+		row[i] = 1
+	}
+	return nil
+}
+
+// SolveVec computes the solution of the linear least squares problem
+//
+//	minimize over x |b - A*x|_2
+//
+// for an m×n matrix A, a given m element vector b and an n element unknown
+// x, under the assumption that A has full rank, that is
+//
+//	rank(A) = min(m,n)
+//
+// When m >= n the system is overdetermined and SolveVec finds its unique
+// least squares solution.
+//
+// When m < n the system is underdetermined and infinitely many x satisfy
+// b-A*x=0; SolveVec finds the unique one that minimizes |x|_2.
+//
+// The solution vector x is stored in-place into the receiver. SolveVec
+// panics with ErrShape if b does not have exactly one column.
+//
+// If A does not have full rank, a Condition error is returned. See the
+// documentation for Condition for more information.
+func (v *VecDense) SolveVec(a Matrix, b Vector) error {
+	if _, bCols := b.Dims(); bCols != 1 {
+		panic(ErrShape)
+	}
+	_, n := a.Dims()
+
+	// Dense.Solve is non-trivial, so the vectors are recast as Dense
+	// and handed to it instead of duplicating its code here.
+
+	raw, isRaw := b.(RawVectorer)
+	if !isRaw {
+		v.reuseAsNonZeroed(n)
+		return v.asDense().Solve(a, b)
+	}
+
+	bmat := raw.RawVector()
+	same := v == b
+	if !same {
+		v.checkOverlap(bmat)
+	}
+	v.reuseAsNonZeroed(n)
+	dst := v.asDense()
+	if same {
+		// When b is the receiver, pass the same Dense for both so that
+		// the overlap detection does not see two distinct matrices
+		// that overlap without being identical.
+		return dst.Solve(a, dst)
+	}
+	rhs := VecDense{mat: bmat}
+	return dst.Solve(a, rhs.asDense())
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/svd.go b/vendor/gonum.org/v1/gonum/mat/svd.go
new file mode 100644
index 00000000000..5244d9f67da
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/svd.go
@@ -0,0 +1,425 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+// badRcond is the panic message used by SVD.Rank when rcond is negative.
+const badRcond = "mat: invalid rcond value"
+
+// SVD is a type for creating and using the Singular Value Decomposition
+// of a matrix.
+type SVD struct {
+	kind SVDKind // kind passed to the most recent Factorize call
+
+	// s holds the singular values, and u and vt are the matrices handed
+	// to lapack64.Gesvd to receive the singular vectors. succFact treats
+	// a non-empty s as meaning that a factorization is held.
+	s  []float64
+	u  blas64.General
+	vt blas64.General
+}
+
+// SVDKind specifies the treatment of singular vectors during an SVD
+// factorization. The U and V options are bit flags that can be combined
+// with bitwise OR, as SVDThin and SVDFull do.
+type SVDKind int
+
+const (
+	// SVDNone specifies that no singular vectors should be computed during
+	// the decomposition.
+	SVDNone SVDKind = 0
+
+	// SVDThinU specifies the thin decomposition for U should be computed.
+	// iota is 1 on this line, so the four flags below take the values
+	// 1, 2, 4 and 8.
+	SVDThinU SVDKind = 1 << (iota - 1)
+	// SVDFullU specifies the full decomposition for U should be computed.
+	SVDFullU
+	// SVDThinV specifies the thin decomposition for V should be computed.
+	SVDThinV
+	// SVDFullV specifies the full decomposition for V should be computed.
+	SVDFullV
+
+	// SVDThin is a convenience value for computing both thin vectors.
+	SVDThin SVDKind = SVDThinU | SVDThinV
+	// SVDFull is a convenience value for computing both full vectors.
+	SVDFull SVDKind = SVDFullU | SVDFullV
+)
+
+// succFact reports whether the receiver holds a factorization, which it
+// judges by the singular value slice being non-empty.
+func (svd *SVD) succFact() bool {
+	return len(svd.s) > 0
+}
+
+// Factorize computes the singular value decomposition (SVD) of the input matrix A.
+// The singular values of A are computed in all cases, while the singular
+// vectors are optionally computed depending on the input kind.
+//
+// The full singular value decomposition (kind == SVDFull) is a factorization
+// of an m×n matrix A of the form
+//
+//	A = U * Σ * Vᵀ
+//
+// where Σ is an m×n diagonal matrix, U is an m×m orthogonal matrix, and V is an
+// n×n orthogonal matrix. The diagonal elements of Σ are the singular values of A.
+// The first min(m,n) columns of U and V are, respectively, the left and right
+// singular vectors of A.
+//
+// Significant storage space can be saved by using the thin representation of
+// the SVD (kind == SVDThin) instead of the full SVD, especially if
+// m >> n or m << n. The thin SVD finds
+//
+//	A = U~ * Σ * V~ᵀ
+//
+// where U~ is of size m×min(m,n), Σ is a diagonal matrix of size min(m,n)×min(m,n)
+// and V~ is of size n×min(m,n).
+//
+// When kind requests both the thin and the full form of U (or of V), the
+// full form is computed.
+//
+// Factorize returns whether the decomposition succeeded. If the decomposition
+// failed, the receiver is left without a factorization, so routines that
+// require a successful factorization will panic and Kind returns -1.
+func (svd *SVD) Factorize(a Matrix, kind SVDKind) (ok bool) {
+	// Discard any previous factorization.
+	svd.s = svd.s[:0]
+	svd.kind = kind
+
+	m, n := a.Dims()
+	var jobU, jobVT lapack.SVDJob
+
+	// TODO(btracey): This code should be modified to have the smaller
+	// matrix written in-place into aCopy when the lapack/native/dgesvd
+	// implementation is complete.
+	switch {
+	case kind&SVDFullU != 0:
+		jobU = lapack.SVDAll
+		svd.u = blas64.General{
+			Rows:   m,
+			Cols:   m,
+			Stride: m,
+			Data:   use(svd.u.Data, m*m),
+		}
+	case kind&SVDThinU != 0:
+		jobU = lapack.SVDStore
+		svd.u = blas64.General{
+			Rows:   m,
+			Cols:   min(m, n),
+			Stride: min(m, n),
+			Data:   use(svd.u.Data, m*min(m, n)),
+		}
+	default:
+		jobU = lapack.SVDNone
+	}
+	switch {
+	case kind&SVDFullV != 0:
+		svd.vt = blas64.General{
+			Rows:   n,
+			Cols:   n,
+			Stride: n,
+			Data:   use(svd.vt.Data, n*n),
+		}
+		jobVT = lapack.SVDAll
+	case kind&SVDThinV != 0:
+		svd.vt = blas64.General{
+			Rows:   min(m, n),
+			Cols:   n,
+			Stride: n,
+			Data:   use(svd.vt.Data, min(m, n)*n),
+		}
+		jobVT = lapack.SVDStore
+	default:
+		jobVT = lapack.SVDNone
+	}
+
+	// A is destroyed on call, so copy the matrix.
+	aCopy := DenseCopyOf(a)
+	svd.s = use(svd.s, min(m, n))
+
+	// The first call, with lwork == -1, is a workspace query that stores
+	// the required length in work[0]; the second call does the work.
+	work := []float64{0}
+	lapack64.Gesvd(jobU, jobVT, aCopy.mat, svd.u, svd.vt, svd.s, work, -1)
+	work = getFloat64s(int(work[0]), false)
+	ok = lapack64.Gesvd(jobU, jobVT, aCopy.mat, svd.u, svd.vt, svd.s, work, len(work))
+	putFloat64s(work)
+	if !ok {
+		// succFact judges success by s being non-empty, so s has to be
+		// emptied along with kind. Otherwise Values, Rank and Cond
+		// would hand out the contents of a failed decomposition.
+		svd.kind = 0
+		svd.s = svd.s[:0]
+	}
+	return ok
+}
+
+// Kind returns the SVDKind of the decomposition, or -1 if the receiver
+// does not hold a decomposition.
+func (svd *SVD) Kind() SVDKind {
+	if svd.succFact() {
+		return svd.kind
+	}
+	return -1
+}
+
+// Rank returns the rank of A, taken as the number of leading singular
+// values that are greater than rcond times the first singular value.
+// Rank will panic if rcond is negative or if the receiver does not contain
+// a successful factorization.
+func (svd *SVD) Rank(rcond float64) int {
+	if rcond < 0 {
+		panic(badRcond)
+	}
+	if !svd.succFact() {
+		panic(badFact)
+	}
+	cutoff := rcond * svd.s[0]
+	for i := range svd.s {
+		// Stop at the first singular value at or below the cutoff.
+		if svd.s[i] <= cutoff {
+			return i
+		}
+	}
+	return len(svd.s)
+}
+
+// Cond returns the 2-norm condition number for the factorized matrix,
+// computed as the first singular value divided by the last. Cond will
+// panic if the receiver does not contain a successful factorization.
+func (svd *SVD) Cond() float64 {
+	if !svd.succFact() {
+		panic(badFact)
+	}
+	first, last := svd.s[0], svd.s[len(svd.s)-1]
+	return first / last
+}
+
+// Values returns the singular values of the factorized matrix in descending order.
+//
+// A non-nil s is filled in-place and returned; it must have length min(m,n),
+// otherwise Values panics with ErrSliceLengthMismatch. A nil s causes a new
+// slice of the appropriate length to be allocated and returned.
+//
+// Values will panic if the receiver does not contain a successful factorization.
+func (svd *SVD) Values(s []float64) []float64 {
+	if !svd.succFact() {
+		panic(badFact)
+	}
+	switch {
+	case s == nil:
+		s = make([]float64, len(svd.s))
+	case len(s) != len(svd.s):
+		panic(ErrSliceLengthMismatch)
+	}
+	copy(s, svd.s)
+	return s
+}
+
+// UTo copies the matrix U of the singular value decomposition into dst. The
+// first min(m,n) columns are the left singular vectors and correspond to the
+// singular values as returned from SVD.Values.
+//
+// An empty dst is resized to m×m if the full U was computed and to
+// m×min(m,n) if the thin U was computed. A non-empty dst must already have
+// that size, otherwise UTo panics. UTo also panics if the receiver does not
+// contain a successful factorization, or if U was not computed during
+// factorization.
+func (svd *SVD) UTo(dst *Dense) {
+	if !svd.succFact() {
+		panic(badFact)
+	}
+	if svd.kind&(SVDThinU|SVDFullU) == 0 {
+		panic("svd: u not computed during factorization")
+	}
+
+	rows, cols := svd.u.Rows, svd.u.Cols
+	if dst.IsEmpty() {
+		dst.ReuseAs(rows, cols)
+	} else if dr, dc := dst.Dims(); dr != rows || dc != cols {
+		panic(ErrShape)
+	}
+
+	// Wrap the stored U in a Dense so that it can be copied out.
+	u := Dense{
+		mat:     svd.u,
+		capRows: rows,
+		capCols: cols,
+	}
+	dst.Copy(&u)
+}
+
+// VTo copies the matrix V of the singular value decomposition into dst. The
+// first min(m,n) columns are the right singular vectors and correspond to the
+// singular values as returned from SVD.Values.
+//
+// An empty dst is resized to n×n if the full V was computed and to
+// n×min(m,n) if the thin V was computed. A non-empty dst must already have
+// that size, otherwise VTo panics. VTo also panics if the receiver does not
+// contain a successful factorization, or if V was not computed during
+// factorization.
+func (svd *SVD) VTo(dst *Dense) {
+	if !svd.succFact() {
+		panic(badFact)
+	}
+	if svd.kind&(SVDThinV|SVDFullV) == 0 {
+		panic("svd: v not computed during factorization")
+	}
+
+	// The receiver stores Vᵀ, so dst has the transposed dimensions.
+	rows, cols := svd.vt.Rows, svd.vt.Cols
+	if dst.IsEmpty() {
+		dst.ReuseAs(cols, rows)
+	} else if dr, dc := dst.Dims(); dr != cols || dc != rows {
+		panic(ErrShape)
+	}
+
+	vt := Dense{
+		mat:     svd.vt,
+		capRows: rows,
+		capCols: cols,
+	}
+	dst.Copy(vt.T())
+}
+
+// SolveTo calculates the minimum-norm solution to a linear least squares problem
+//
+//	minimize over n-element vectors x: |b - A*x|_2 and |x|_2
+//
+// where b is a given m-element vector, using the SVD of m×n matrix A stored in
+// the receiver. A may be rank-deficient, that is, the given effective rank can be
+//
+//	rank ≤ min(m,n)
+//
+// The rank can be computed using SVD.Rank. SolveTo panics if rank is less
+// than one or greater than the number of singular values.
+//
+// Multiple right-hand sides can be solved in one call. The vectors b are the
+// columns of the m×k matrix B and the resulting vectors x are stored in the
+// columns of dst. dst must be either empty or have the size equal to n×k.
+//
+// The decomposition must have been factorized computing both the U and V
+// singular vectors; SolveTo panics otherwise, and also panics if the
+// receiver does not contain a successful factorization.
+//
+// SolveTo returns the residuals calculated from the complete SVD, one per
+// column of B. For this value to be valid the factorization must have been
+// performed with at least SVDFullU.
+func (svd *SVD) SolveTo(dst *Dense, b Matrix, rank int) []float64 {
+	if !svd.succFact() {
+		panic(badFact)
+	}
+	if rank < 1 || len(svd.s) < rank {
+		panic("svd: rank out of range")
+	}
+	if svd.kind&(SVDThinU|SVDFullU) == 0 {
+		panic("svd: u not computed during factorization")
+	}
+	if svd.kind&(SVDThinV|SVDFullV) == 0 {
+		panic("svd: v not computed during factorization")
+	}
+
+	u := Dense{
+		mat:     svd.u,
+		capRows: svd.u.Rows,
+		capCols: svd.u.Cols,
+	}
+	vt := Dense{
+		mat:     svd.vt,
+		capRows: svd.vt.Rows,
+		capCols: svd.vt.Cols,
+	}
+	sigma := svd.s[:rank]
+
+	// utb = Uᵀ * B.
+	_, nrhs := b.Dims()
+	utb := getDenseWorkspace(svd.u.Cols, nrhs, false)
+	defer putDenseWorkspace(utb)
+	utb.Mul(u.T(), b)
+
+	// y is the first rank rows of utb, each divided element-wise by the
+	// matching singular value; dst is then the transpose of the first
+	// rank rows of Vᵀ times y.
+	y := getDenseWorkspace(rank, nrhs, false)
+	defer putDenseWorkspace(y)
+	y.DivElem(utb.slice(0, rank, 0, nrhs), repVector{vec: sigma, cols: nrhs})
+	dst.Mul(vt.slice(0, rank, 0, svd.vt.Cols).T(), y)
+
+	residuals := make([]float64, nrhs)
+	if rank >= svd.u.Cols {
+		return residuals
+	}
+	// Each residual is the squared norm of the rows of utb beyond rank.
+	tail := utb.slice(len(sigma), svd.u.Cols, 0, nrhs)
+	for j := range residuals {
+		col := tail.ColView(j)
+		residuals[j] = Dot(col, col)
+	}
+	return residuals
+}
+
+// repVector is a len(vec)×cols Matrix in which every column is vec.
+type repVector struct {
+	vec  []float64
+	cols int
+}
+
+// Dims returns len(vec) rows and cols columns.
+func (m repVector) Dims() (r, c int) { return len(m.vec), m.cols }
+
+// At returns vec[i] for every valid column j and panics on an index that is
+// out of range.
+func (m repVector) At(i, j int) float64 {
+	rowOK := 0 <= i && i < len(m.vec)
+	colOK := 0 <= j && j < m.cols
+	if !rowOK || !colOK {
+		panic(ErrIndexOutOfRange.string) // Panic with string to prevent mat.Error recovery.
+	}
+	return m.vec[i]
+}
+
+// T returns the receiver wrapped in a Transpose.
+func (m repVector) T() Matrix { return Transpose{m} }
+
+// SolveVecTo calculates the minimum-norm solution to a linear least squares problem
+//
+//	minimize over n-element vectors x: |b - A*x|_2 and |x|_2
+//
+// where b is a given m-element vector, using the SVD of m×n matrix A stored in
+// the receiver. A may be rank-deficient, that is, the given effective rank can be
+//
+//	rank ≤ min(m,n)
+//
+// The rank can be computed using SVD.Rank.
+//
+// The resulting vector x will be stored in dst. dst must be either empty or
+// have length equal to n.
+//
+// The decomposition must have been factorized computing both the U and V
+// singular vectors.
+//
+// SolveVecTo returns the residuals calculated from the complete SVD. For this
+// value to be valid the factorization must have been performed with at least
+// SVDFullU.
+func (svd *SVD) SolveVecTo(dst *VecDense, b Vector, rank int) float64 {
+	if !svd.succFact() {
+		panic(badFact)
+	}
+	if rank < 1 || len(svd.s) < rank {
+		panic("svd: rank out of range")
+	}
+	kind := svd.kind
+	if kind&SVDThinU == 0 && kind&SVDFullU == 0 {
+		panic("svd: u not computed during factorization")
+	}
+	if kind&SVDThinV == 0 && kind&SVDFullV == 0 {
+		panic("svd: v not computed during factorization")
+	}
+
+	u := Dense{
+		mat:     svd.u,
+		capRows: svd.u.Rows,
+		capCols: svd.u.Cols,
+	}
+	vt := Dense{
+		mat:     svd.vt,
+		capRows: svd.vt.Rows,
+		capCols: svd.vt.Cols,
+	}
+	s := svd.s[:rank]
+
+	c := getVecDenseWorkspace(svd.u.Cols, false)
+	defer putVecDenseWorkspace(c)
+	c.MulVec(u.T(), b)
+
+	y := getVecDenseWorkspace(rank, false)
+	defer putVecDenseWorkspace(y)
+	y.DivElemVec(c.sliceVec(0, rank), NewVecDense(rank, s))
+	dst.MulVec(vt.slice(0, rank, 0, svd.vt.Cols).T(), y)
+
+	var res float64
+	if rank < c.Len() {
+		c = c.sliceVec(rank, c.Len())
+		res = Dot(c, c)
+	}
+	return res
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/symband.go b/vendor/gonum.org/v1/gonum/mat/symband.go
new file mode 100644
index 00000000000..63638ea9129
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/symband.go
@@ -0,0 +1,312 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+// Compile-time checks that *SymBandDense satisfies the interfaces listed
+// below. symBandDense is a nil pointer used only for these assignments.
+var (
+	symBandDense *SymBandDense
+	_            Matrix           = symBandDense
+	_            allMatrix        = symBandDense
+	_            denseMatrix      = symBandDense
+	_            Symmetric        = symBandDense
+	_            Banded           = symBandDense
+	_            SymBanded        = symBandDense
+	_            RawSymBander     = symBandDense
+	_            MutableSymBanded = symBandDense
+
+	_ NonZeroDoer    = symBandDense
+	_ RowNonZeroDoer = symBandDense
+	_ ColNonZeroDoer = symBandDense
+)
+
+// SymBandDense represents a symmetric band matrix in dense storage format.
+type SymBandDense struct {
+	// mat is the raw band storage. NewSymBandDense creates it with
+	// blas.Upper storage and SetRawSymBand rejects any other Uplo.
+	mat blas64.SymmetricBand
+}
+
+// SymBanded is a symmetric band matrix interface type. It extends Banded
+// with accessors for the symmetric dimension and bandwidth.
+type SymBanded interface {
+	Banded
+
+	// SymmetricDim returns the number of rows/columns in the matrix.
+	SymmetricDim() int
+
+	// SymBand returns the number of rows/columns in the matrix, n, and
+	// the size of the bandwidth, k.
+	SymBand() (n, k int)
+}
+
+// MutableSymBanded is a symmetric band matrix interface type that allows elements
+// to be altered.
+type MutableSymBanded interface {
+	SymBanded
+
+	// SetSymBand sets the element at row i, column j to v.
+	// NOTE(review): behaviour for (i, j) outside the band is not
+	// specified here; check the implementation being used.
+	SetSymBand(i, j int, v float64)
+}
+
+// A RawSymBander can return a blas64.SymmetricBand representation of the receiver.
+// Changes to the blas64.SymmetricBand.Data slice will be reflected in the original
+// matrix, changes to the N, K, Stride and Uplo fields will not.
+type RawSymBander interface {
+	// RawSymBand returns the blas64.SymmetricBand representation.
+	RawSymBand() blas64.SymmetricBand
+}
+
+// NewSymBandDense creates a new SymBand matrix with n rows and columns and
+// bandwidth k. A nil data causes a new backing slice to be allocated. A
+// non-nil data must have len(data) == n*(k+1); it is then used as the backing
+// slice, and changes to the elements of the returned SymBandDense will be
+// reflected in data. NewSymBandDense panics if data has any other length, if
+// n is not positive, or if k is negative or not less than n.
+//
+// The data must be arranged in row-major order constructed by removing the zeros
+// from the rows outside the band and aligning the diagonals. SymBandDense matrices
+// are stored in the upper triangle. For example, the matrix
+//
+//	1  2  3  0  0  0
+//	2  4  5  6  0  0
+//	3  5  7  8  9  0
+//	0  6  8 10 11 12
+//	0  0  9 11 13 14
+//	0  0  0 12 14 15
+//
+// becomes (* entries are never accessed)
+//
+//	 1  2  3
+//	 4  5  6
+//	 7  8  9
+//	10 11 12
+//	13 14  *
+//	15  *  *
+//
+// which is passed to NewSymBandDense as []float64{1, 2, ..., 15, *, *, *} with k=2.
+// Only the values in the band portion of the matrix are used.
+func NewSymBandDense(n, k int, data []float64) *SymBandDense {
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0 || k < 0:
+		panic("mat: negative dimension")
+	case k+1 > n:
+		panic("mat: band out of range")
+	}
+
+	// Each stored row holds the diagonal element and the k elements
+	// to its right.
+	width := k + 1
+	if data == nil {
+		data = make([]float64, n*width)
+	} else if len(data) != n*width {
+		panic(ErrShape)
+	}
+
+	band := blas64.SymmetricBand{
+		N:      n,
+		K:      k,
+		Stride: width,
+		Uplo:   blas.Upper,
+		Data:   data,
+	}
+	return &SymBandDense{mat: band}
+}
+
+// Dims returns the number of rows and columns in the matrix, which are
+// equal for a symmetric matrix.
+func (s *SymBandDense) Dims() (r, c int) {
+	n := s.mat.N
+	return n, n
+}
+
+// SymmetricDim returns the size of the receiver, that is its number of
+// rows, which equals its number of columns.
+func (s *SymBandDense) SymmetricDim() int {
+	return s.mat.N
+}
+
+// Bandwidth returns the lower and upper bandwidths of the matrix. Both are
+// the stored bandwidth of the receiver.
+func (s *SymBandDense) Bandwidth() (kl, ku int) {
+	k := s.mat.K
+	return k, k
+}
+
+// SymBand returns n, the number of rows/columns in the matrix, and k, the
+// size of the bandwidth.
+func (s *SymBandDense) SymBand() (n, k int) {
+	n, k = s.mat.N, s.mat.K
+	return n, k
+}
+
+// T implements the Matrix interface. Symmetric matrices, by definition, are
+// equal to their transpose, so T returns the receiver itself and not a copy.
+func (s *SymBandDense) T() Matrix {
+	return s
+}
+
+// TBand implements the Banded interface. Like T, it returns the receiver
+// itself.
+func (s *SymBandDense) TBand() Banded {
+	return s
+}
+
+// RawSymBand returns the underlying blas64.SymmetricBand used by the receiver.
+// Changes to elements in the receiver following the call will be reflected
+// in the returned blas64.SymmetricBand.
+func (s *SymBandDense) RawSymBand() blas64.SymmetricBand {
+	return s.mat
+}
+
+// SetRawSymBand replaces the underlying blas64.SymmetricBand used by the
+// receiver with mat. Changes to elements in the receiver following the call
+// will be reflected in the input.
+//
+// The supplied SymmetricBand must use blas.Upper storage format;
+// SetRawSymBand panics otherwise.
+func (s *SymBandDense) SetRawSymBand(mat blas64.SymmetricBand) {
+	if mat.Uplo == blas.Upper {
+		s.mat = mat
+		return
+	}
+	panic("mat: blas64.SymmetricBand does not have blas.Upper storage")
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using
+// Reset.
+func (s *SymBandDense) IsEmpty() bool {
+	return s.mat.Stride == 0 // Reset zeroes Stride, which marks the receiver empty
+}
+
+// Reset returns the receiver to the empty state so that it may be reused as
+// the destination of a dimensionally restricted operation.
+//
+// Reset must not be called on a matrix that shares its backing data.
+// See the Reseter interface for more information.
+func (s *SymBandDense) Reset() {
+	// Clear the shape and the storage-format fields.
+	s.mat.N, s.mat.K, s.mat.Stride = 0, 0, 0
+	s.mat.Uplo = 0
+	// Truncate the slice instead of dropping it, keeping its capacity.
+	s.mat.Data = s.mat.Data[:0]
+}
+
+// Zero sets every stored element of the band to zero.
+func (s *SymBandDense) Zero() {
+	n, width := s.mat.N, s.mat.K+1
+	for i, off := 0, 0; i < n; i, off = i+1, off+s.mat.Stride {
+		zero(s.mat.Data[off : off+min(width, n-i)])
+	}
+}
+
+// DiagView returns the diagonal as a matrix sharing the receiver's data.
+func (s *SymBandDense) DiagView() Diagonal {
+	stride := s.mat.Stride
+	// Diagonal elements sit at the start of each stored row.
+	diag := blas64.Vector{
+		N:    s.mat.N,
+		Inc:  stride,
+		Data: s.mat.Data[:(s.mat.N-1)*stride+1],
+	}
+	return &DiagDense{mat: diag}
+}
+
+// DoNonZero calls fn for every non-zero element within the band of s, visiting
+// rows in increasing order and, within a row, columns in increasing order.
+// fn receives the row index i, the column index j and the value of s at (i, j).
+func (s *SymBandDense) DoNonZero(fn func(i, j int, v float64)) {
+	for i := 0; i < s.mat.N; i++ {
+		// Only columns within K of the diagonal lie inside the band.
+		for j := max(0, i-s.mat.K); j < min(s.mat.N, i+s.mat.K+1); j++ {
+			if v := s.at(i, j); v != 0 {
+				fn(i, j, v)
+			}
+		}
+	}
+}
+
+// DoRowNonZero calls fn for every non-zero element of row i that lies within
+// the band of s, in order of increasing column. fn receives the row index i,
+// the column index j and the value of s at (i, j).
+//
+// DoRowNonZero panics with ErrRowAccess if i is out of range.
+func (s *SymBandDense) DoRowNonZero(i int, fn func(i, j int, v float64)) {
+	if i < 0 || s.mat.N <= i {
+		panic(ErrRowAccess)
+	}
+	// Only columns within K of the diagonal lie inside the band.
+	for j := max(0, i-s.mat.K); j < min(s.mat.N, i+s.mat.K+1); j++ {
+		if v := s.at(i, j); v != 0 {
+			fn(i, j, v)
+		}
+	}
+}
+
+// DoColNonZero calls fn for each non-zero element of column j of s, by
+// increasing row. fn takes the row/column index and the value of s at (i, j).
+// DoColNonZero panics with ErrColAccess if j is out of range.
+func (s *SymBandDense) DoColNonZero(j int, fn func(i, j int, v float64)) {
+	if j < 0 || s.mat.N <= j {
+		panic(ErrColAccess)
+	}
+	// Only rows within K of the diagonal can hold an element of column j
+	// inside the band, so visit just that range rather than every row.
+	for i := max(0, j-s.mat.K); i < min(s.mat.N, j+s.mat.K+1); i++ {
+		if v := s.at(i, j); v != 0 {
+			fn(i, j, v)
+		}
+	}
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
+func (s *SymBandDense) Norm(norm float64) float64 {
+	if s.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	lnorm := normLapack(norm, false)
+	if lnorm == lapack.MaxColumnSum || lnorm == lapack.MaxRowSum {
+		work := getFloat64s(s.mat.N, false)
+		defer putFloat64s(work)
+		return lapack64.Lansb(lnorm, s.mat, work)
+	}
+	return lapack64.Lansb(lnorm, s.mat, nil)
+}
+
+// Trace returns the sum of the diagonal elements of the matrix.
+//
+// Trace panics with ErrZeroLength if the receiver is empty.
+func (s *SymBandDense) Trace() float64 {
+	if s.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	var sum float64
+	data, stride := s.mat.Data, s.mat.Stride
+	for i := 0; i < s.mat.N; i++ {
+		sum += data[i*stride] // the first entry of each stored row is the diagonal
+	}
+	return sum
+}
+
+// MulVecTo computes S⋅x, storing the result into dst. The bool argument is ignored.
+func (s *SymBandDense) MulVecTo(dst *VecDense, _ bool, x Vector) {
+	n := s.mat.N
+	if x.Len() != n {
+		panic(ErrShape)
+	}
+	dst.reuseAsNonZeroed(n)
+
+	// Unless x is a dense vector distinct from dst, multiply from a copy.
+	src := x
+	xMat, _ := untransposeExtract(x)
+	if xVec, ok := xMat.(*VecDense); ok {
+		if dst != xVec {
+			// Distinct dense operand: use its data directly.
+			dst.checkOverlap(xVec.mat)
+			blas64.Sbmv(1, s.mat, xVec.mat, 0, dst.mat)
+			return
+		}
+		src = xVec
+	}
+
+	xCopy := getVecDenseWorkspace(n, false)
+	xCopy.CloneFromVec(src)
+	blas64.Sbmv(1, s.mat, xCopy.mat, 0, dst.mat)
+	putVecDenseWorkspace(xCopy)
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/symmetric.go b/vendor/gonum.org/v1/gonum/mat/symmetric.go
new file mode 100644
index 00000000000..e38e4c7b6fd
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/symmetric.go
@@ -0,0 +1,698 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+var (
+	symDense *SymDense // nil; used only for the compile-time interface checks below
+
+	_ Matrix           = symDense
+	_ allMatrix        = symDense
+	_ denseMatrix      = symDense
+	_ Symmetric        = symDense
+	_ RawSymmetricer   = symDense
+	_ MutableSymmetric = symDense
+)
+
+const badSymTriangle = "mat: blas64.Symmetric not upper" // panic message used when Uplo is not blas.Upper
+
+// SymDense is a symmetric matrix that uses dense storage. SymDense
+// matrices are stored in the upper triangle.
+type SymDense struct {
+	mat blas64.Symmetric // dimension, stride, storage triangle and backing data
+	cap int              // rows/columns of the backing matrix, as reported by Caps
+}
+
+// Symmetric represents a symmetric matrix (where the element at {i, j} equals
+// the element at {j, i}). Symmetric matrices are always square.
+type Symmetric interface {
+	Matrix
+	// SymmetricDim returns the order of the matrix: its number of rows and of columns.
+	SymmetricDim() int
+}
+
+// A RawSymmetricer can return a view of itself as a BLAS Symmetric matrix.
+type RawSymmetricer interface {
+	RawSymmetric() blas64.Symmetric // the BLAS representation of the matrix
+}
+
+// A MutableSymmetric is a Symmetric matrix whose elements can be set.
+type MutableSymmetric interface {
+	Symmetric
+	SetSym(i, j int, v float64) // sets the element at row i, column j to v
+}
+
+// NewSymDense creates a new symmetric matrix with n rows and columns. If
+// data == nil, a new backing slice is allocated. If len(data) == n*n, data is
+// used as the backing slice, and changes to the elements of the returned
+// SymDense will be reflected in data. Otherwise NewSymDense panics.
+// NewSymDense also panics if n is not positive.
+//
+// The data must be arranged in row-major order, i.e. the (i*n + j)-th
+// element in the data slice is the {i, j}-th element in the matrix.
+// Only the values in the upper triangular portion of the matrix are used.
+func NewSymDense(n int, data []float64) *SymDense {
+	// Reject empty and negative sizes first.
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0:
+		panic("mat: negative dimension")
+	}
+	// Allocate storage, or check that the caller's slice has the right size.
+	switch {
+	case data == nil:
+		data = make([]float64, n*n)
+	case len(data) != n*n:
+		panic(ErrShape)
+	}
+	sym := blas64.Symmetric{
+		N:      n,
+		Stride: n,
+		Data:   data,
+		Uplo:   blas.Upper,
+	}
+	return &SymDense{mat: sym, cap: n}
+}
+
+// Dims returns the number of rows and columns in the matrix, both equal to N.
+func (s *SymDense) Dims() (r, c int) {
+	return s.mat.N, s.mat.N
+}
+
+// Caps returns the number of rows and columns in the backing matrix, which are equal.
+func (s *SymDense) Caps() (r, c int) {
+	return s.cap, s.cap
+}
+
+// T returns the receiver, since a symmetric matrix is its own transpose.
+func (s *SymDense) T() Matrix {
+	return s
+}
+
+// SymmetricDim implements the Symmetric interface and returns the number of
+// rows, which equals the number of columns, of the matrix.
+func (s *SymDense) SymmetricDim() int {
+	return s.mat.N
+}
+
+// RawSymmetric returns the matrix as a blas64.Symmetric that shares the
+// receiver's data. SymDense matrices are stored in upper triangular format.
+func (s *SymDense) RawSymmetric() blas64.Symmetric {
+	return s.mat
+}
+
+// SetRawSymmetric sets the underlying blas64.Symmetric used by the receiver.
+// Changes to elements in the receiver following the call will be reflected
+// in the input.
+//
+// The supplied Symmetric must use blas.Upper storage; SetRawSymmetric panics otherwise.
+func (s *SymDense) SetRawSymmetric(mat blas64.Symmetric) {
+	if mat.Uplo != blas.Upper {
+		panic(badSymTriangle)
+	}
+	s.cap = mat.N // the capacity is taken to be the size of the supplied matrix
+	s.mat = mat
+}
+
+// Reset empties the matrix so that it can be reused as the
+// receiver of a dimensionally restricted operation.
+//
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (s *SymDense) Reset() {
+	// N and Stride must be zeroed in unison.
+	s.mat.N, s.mat.Stride = 0, 0
+	s.mat.Data = s.mat.Data[:0] // truncate to length zero, keeping the capacity
+}
+
+// ReuseAsSym resizes an empty receiver to be an n×n matrix.
+//
+// ReuseAsSym re-uses the backing data slice if it has sufficient capacity,
+// otherwise a new slice is allocated. The backing data is zero on return.
+//
+// ReuseAsSym panics if the receiver is not empty, and panics if
+// the input size is less than one. To empty the receiver for re-use,
+// Reset should be used.
+func (s *SymDense) ReuseAsSym(n int) {
+	// The size is validated before the receiver's state is inspected.
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0:
+		panic(ErrNegativeDimension)
+	case !s.IsEmpty():
+		panic(ErrReuseNonEmpty)
+	}
+	s.reuseAsZeroed(n)
+}
+
+// Zero sets every element of the stored upper triangle to zero.
+func (s *SymDense) Zero() {
+	for i, row := 0, 0; i < s.mat.N; i, row = i+1, row+s.mat.Stride {
+		zero(s.mat.Data[row+i : row+s.mat.N])
+	}
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using
+// Reset.
+func (s *SymDense) IsEmpty() bool {
+	// It must be the case that s.Dims() returns
+	// zeros in this case. See comment in Reset().
+	return s.mat.N == 0
+}
+
+// reuseAsNonZeroed resizes an empty receiver to an n×n matrix, or checks that
+// a non-empty receiver is n×n with upper storage. It panics on any mismatch.
+func (s *SymDense) reuseAsNonZeroed(n int) {
+	// reuseAsNonZeroed must be kept in sync with reuseAsZeroed.
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	if s.mat.N > s.cap {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if s.IsEmpty() {
+		s.mat = blas64.Symmetric{
+			N:      n,
+			Stride: n,
+			Data:   use(s.mat.Data, n*n), // presumably reuses the slice when it is large enough — see use
+			Uplo:   blas.Upper,
+		}
+		s.cap = n
+		return
+	}
+	if s.mat.Uplo != blas.Upper {
+		panic(badSymTriangle)
+	}
+	if s.mat.N != n {
+		panic(ErrShape)
+	}
+}
+
+// reuseAsZeroed resizes an empty matrix to a n×n matrix,
+// or checks that a non-empty matrix is n×n. It then zeros the
+// elements of the matrix.
+func (s *SymDense) reuseAsZeroed(n int) {
+	// reuseAsZeroed must be kept in sync with reuseAsNonZeroed.
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	if s.mat.N > s.cap {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if s.IsEmpty() {
+		s.mat = blas64.Symmetric{
+			N:      n,
+			Stride: n,
+			Data:   useZeroed(s.mat.Data, n*n), // presumably returns zeroed storage — see useZeroed
+			Uplo:   blas.Upper,
+		}
+		s.cap = n
+		return
+	}
+	if s.mat.Uplo != blas.Upper {
+		panic(badSymTriangle)
+	}
+	if s.mat.N != n {
+		panic(ErrShape)
+	}
+	s.Zero() // a non-empty receiver of the right shape is cleared in place
+}
+
+// isolatedWorkspace returns a workspace matrix w with the size of a, together
+// with a restore callback that copies w into the receiver and releases the
+// workspace. It panics with ErrZeroLength if a has zero size.
+func (s *SymDense) isolatedWorkspace(a Symmetric) (w *SymDense, restore func()) {
+	n := a.SymmetricDim()
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	w = getSymDenseWorkspace(n, false)
+	return w, func() { s.CopySym(w); putSymDenseWorkspace(w) }
+}
+
+// DiagView returns the diagonal as a matrix sharing the receiver's data.
+func (s *SymDense) DiagView() Diagonal {
+	n, stride := s.mat.N, s.mat.Stride
+	// Consecutive diagonal elements are one row plus one column apart.
+	diag := blas64.Vector{
+		N:    n,
+		Inc:  stride + 1,
+		Data: s.mat.Data[:(n-1)*stride+n],
+	}
+	return &DiagDense{mat: diag}
+}
+
+// AddSym adds the symmetric matrices a and b element-wise and stores the
+// result in the receiver. Only the upper triangle is computed.
+//
+// AddSym panics with ErrShape if a and b do not have the same dimension.
+func (s *SymDense) AddSym(a, b Symmetric) {
+	n := a.SymmetricDim()
+	if n != b.SymmetricDim() {
+		panic(ErrShape)
+	}
+	s.reuseAsNonZeroed(n)
+
+	ra, aIsRaw := a.(RawSymmetricer)
+	rb, bIsRaw := b.(RawSymmetricer)
+	if !aIsRaw || !bIsRaw {
+		// Generic path: read both operands through At.
+		s.checkOverlapMatrix(a)
+		s.checkOverlapMatrix(b)
+		for i := 0; i < n; i++ {
+			stmp := s.mat.Data[i*s.mat.Stride : i*s.mat.Stride+n]
+			for j := i; j < n; j++ {
+				stmp[j] = a.At(i, j) + b.At(i, j)
+			}
+		}
+		return
+	}
+
+	// Fast path: operate directly on the raw upper triangles.
+	amat, bmat := ra.RawSymmetric(), rb.RawSymmetric()
+	if s != ra {
+		s.checkOverlap(generalFromSymmetric(amat))
+	}
+	if s != rb {
+		s.checkOverlap(generalFromSymmetric(bmat))
+	}
+	for i := 0; i < n; i++ {
+		brow := bmat.Data[i*bmat.Stride+i : i*bmat.Stride+n]
+		srow := s.mat.Data[i*s.mat.Stride+i : i*s.mat.Stride+n]
+		arow := amat.Data[i*amat.Stride+i : i*amat.Stride+n]
+		for j, v := range arow {
+			srow[j] = v + brow[j]
+		}
+	}
+}
+
+func (s *SymDense) CopySym(a Symmetric) int {
+	n := a.SymmetricDim()
+	n = min(n, s.mat.N)
+	if n == 0 {
+		return 0
+	}
+	switch a := a.(type) {
+	case RawSymmetricer:
+		amat := a.RawSymmetric()
+		if amat.Uplo != blas.Upper {
+			panic(badSymTriangle)
+		}
+		for i := 0; i < n; i++ {
+			copy(s.mat.Data[i*s.mat.Stride+i:i*s.mat.Stride+n], amat.Data[i*amat.Stride+i:i*amat.Stride+n])
+		}
+	default:
+		for i := 0; i < n; i++ {
+			stmp := s.mat.Data[i*s.mat.Stride : i*s.mat.Stride+n]
+			for j := i; j < n; j++ {
+				stmp[j] = a.At(i, j)
+			}
+		}
+	}
+	return n
+}
+
+// SymRankOne performs a symmetric rank-one update to the matrix a with x,
+// which is treated as a column vector, and stores the result in the receiver
+//
+//	s = a + alpha * x * xᵀ
+func (s *SymDense) SymRankOne(a Symmetric, alpha float64, x Vector) {
+	n := x.Len()
+	if a.SymmetricDim() != n {
+		panic(ErrShape)
+	}
+	s.reuseAsNonZeroed(n)
+
+	if s != a { // seed the receiver with a unless the update is in place
+		if rs, ok := a.(RawSymmetricer); ok {
+			s.checkOverlap(generalFromSymmetric(rs.RawSymmetric()))
+		}
+		s.CopySym(a)
+	}
+
+	xU, _ := untransposeExtract(x)
+	if rv, ok := xU.(*VecDense); ok { // dense vector: hand the update to BLAS
+		r, c := xU.Dims()
+		xmat := rv.mat
+		s.checkOverlap(generalFromVector(xmat, r, c))
+		blas64.Syr(alpha, xmat, s.mat)
+		return
+	}
+
+	for i := 0; i < n; i++ { // generic vector: update the upper triangle element-wise
+		for j := i; j < n; j++ {
+			s.set(i, j, s.at(i, j)+alpha*x.AtVec(i)*x.AtVec(j))
+		}
+	}
+}
+
+// SymRankK performs a symmetric rank-k update to the matrix a and stores the
+// result into the receiver. If a is zero, see SymOuterK.
+//
+//	s = a + alpha * x * x'
+func (s *SymDense) SymRankK(a Symmetric, alpha float64, x Matrix) {
+	n := a.SymmetricDim()
+	r, _ := x.Dims()
+	if r != n {
+		panic(ErrShape)
+	}
+	xMat, aTrans := untransposeExtract(x)
+	var g blas64.General
+	if rm, ok := xMat.(*Dense); ok {
+		g = rm.mat
+	} else {
+		g = DenseCopyOf(x).mat // the copy is made from x itself, not from xMat
+		aTrans = false         // so no transpose flag is passed to Syrk for it
+	}
+	if a != s { // seed the receiver with a unless the update is in place
+		if rs, ok := a.(RawSymmetricer); ok {
+			s.checkOverlap(generalFromSymmetric(rs.RawSymmetric()))
+		}
+		s.reuseAsNonZeroed(n)
+		s.CopySym(a)
+	}
+	t := blas.NoTrans
+	if aTrans {
+		t = blas.Trans
+	}
+	blas64.Syrk(t, alpha, g, 1, s.mat)
+}
+
+// SymOuterK calculates the outer product of x with itself and stores
+// the result into the receiver. It is equivalent to the matrix
+// multiplication
+//
+//	s = alpha * x * x'.
+//
+// To update an existing matrix, see SymRankK (or SymRankOne for a vector x).
+func (s *SymDense) SymOuterK(alpha float64, x Matrix) {
+	n, _ := x.Dims()
+	switch {
+	case s.IsEmpty(): // size the receiver to n×n, then accumulate into it
+		s.mat = blas64.Symmetric{
+			N:      n,
+			Stride: n,
+			Data:   useZeroed(s.mat.Data, n*n),
+			Uplo:   blas.Upper,
+		}
+		s.cap = n
+		s.SymRankK(s, alpha, x)
+	case s.mat.Uplo != blas.Upper:
+		panic(badSymTriangle)
+	case s.mat.N == n:
+		if s == x { // x is the receiver: compute into a workspace, then copy back
+			w := getSymDenseWorkspace(n, true)
+			w.SymRankK(w, alpha, x)
+			s.CopySym(w)
+			putSymDenseWorkspace(w)
+		} else {
+			switch r := x.(type) {
+			case RawMatrixer:
+				s.checkOverlap(r.RawMatrix())
+			case RawSymmetricer:
+				s.checkOverlap(generalFromSymmetric(r.RawSymmetric()))
+			case RawTriangular:
+				s.checkOverlap(generalFromTriangular(r.RawTriangular()))
+			}
+			// Only zero the upper triangle.
+			for i := 0; i < n; i++ {
+				ri := i * s.mat.Stride
+				zero(s.mat.Data[ri+i : ri+n])
+			}
+			s.SymRankK(s, alpha, x)
+		}
+	default: // non-empty receiver whose size differs from the row count of x
+		panic(ErrShape)
+	}
+}
+
+// RankTwo performs a symmetric rank-two update to the matrix a with the
+// vectors x and y, which are treated as column vectors, and stores the
+// result in the receiver
+//
+//	m = a + alpha * (x * yᵀ + y * xᵀ)
+//
+// The receiver may be empty, in which case it is resized to match a. RankTwo
+// panics with ErrShape if a, x, y and a non-empty receiver do not all have
+// the same dimension, and with ErrZeroLength if a has zero size.
+func (s *SymDense) RankTwo(a Symmetric, alpha float64, x, y Vector) {
+	// Take the dimension from a rather than from the receiver so that an
+	// empty receiver can be used and a mismatched a is rejected.
+	n := a.SymmetricDim()
+	if x.Len() != n {
+		panic(ErrShape)
+	}
+	if y.Len() != n {
+		panic(ErrShape)
+	}
+	s.reuseAsNonZeroed(n)
+
+	// Extract the raw vectors needed by the BLAS fast path, checking each
+	// against the receiver's backing data for overlap.
+	var xmat, ymat blas64.Vector
+	fast := true
+	xU, _ := untransposeExtract(x)
+	if rv, ok := xU.(*VecDense); ok {
+		r, c := xU.Dims()
+		xmat = rv.mat
+		s.checkOverlap(generalFromVector(xmat, r, c))
+	} else {
+		fast = false
+	}
+	yU, _ := untransposeExtract(y)
+	if rv, ok := yU.(*VecDense); ok {
+		r, c := yU.Dims()
+		ymat = rv.mat
+		s.checkOverlap(generalFromVector(ymat, r, c))
+	} else {
+		fast = false
+	}
+
+	// Seed the receiver with a exactly once; when s is a the update is
+	// performed in place.
+	if s != a {
+		if rs, ok := a.(RawSymmetricer); ok {
+			s.checkOverlap(generalFromSymmetric(rs.RawSymmetric()))
+		}
+		s.CopySym(a)
+	}
+
+	if fast {
+		blas64.Syr2(alpha, xmat, ymat, s.mat)
+		return
+	}
+
+	// Generic vectors: update the upper triangle element by element.
+	for i := 0; i < n; i++ {
+		for j := i; j < n; j++ {
+			s.set(i, j, a.At(i, j)+alpha*(x.AtVec(i)*y.AtVec(j)+y.AtVec(i)*x.AtVec(j)))
+		}
+	}
+}
+
+// ScaleSym stores f times a in the receiver. Only the upper triangle is
+// computed.
+func (s *SymDense) ScaleSym(f float64, a Symmetric) {
+	n := a.SymmetricDim()
+	s.reuseAsNonZeroed(n)
+
+	raw, ok := a.(RawSymmetricer)
+	if !ok {
+		// Generic path: read a element by element.
+		for i := 0; i < n; i++ {
+			for j := i; j < n; j++ {
+				s.mat.Data[i*s.mat.Stride+j] = f * a.At(i, j)
+			}
+		}
+		return
+	}
+
+	amat := raw.RawSymmetric()
+	if s != raw {
+		s.checkOverlap(generalFromSymmetric(amat))
+	}
+	for i := 0; i < n; i++ {
+		srow, arow := i*s.mat.Stride, i*amat.Stride
+		for j := i; j < n; j++ {
+			s.mat.Data[srow+j] = f * amat.Data[arow+j]
+		}
+	}
+}
+
+// SubsetSym extracts a subset of the rows and columns of the matrix a and stores
+// the result in-place into the receiver. The resulting matrix size is
+// len(set)×len(set). Specifically, at the conclusion of SubsetSym,
+// s.At(i, j) equals a.At(set[i], set[j]). Note that the supplied set does not
+// have to be a strict subset, dimension repeats are allowed.
+func (s *SymDense) SubsetSym(a Symmetric, set []int) {
+	n := len(set)
+	na := a.SymmetricDim()
+	s.reuseAsNonZeroed(n)
+	var restore func()
+	if a == s { // in-place use: write to a workspace and copy back on return
+		s, restore = s.isolatedWorkspace(a)
+		defer restore()
+	}
+
+	if a, ok := a.(RawSymmetricer); ok {
+		raw := a.RawSymmetric()
+		if s != a {
+			s.checkOverlap(generalFromSymmetric(raw))
+		}
+		for i := 0; i < n; i++ {
+			ssub := s.mat.Data[i*s.mat.Stride : i*s.mat.Stride+n]
+			r := set[i]
+			rsub := raw.Data[r*raw.Stride : r*raw.Stride+na]
+			for j := i; j < n; j++ {
+				c := set[j]
+				if r <= c { // (r, c) is in the upper triangle: read it directly
+					ssub[j] = rsub[c]
+				} else { // otherwise read the mirrored element (c, r)
+					ssub[j] = raw.Data[c*raw.Stride+r]
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		for j := i; j < n; j++ {
+			s.mat.Data[i*s.mat.Stride+j] = a.At(set[i], set[j])
+		}
+	}
+}
+
+// SliceSym returns a new Matrix that shares backing data with the receiver.
+// The returned matrix starts at {i,i} of the receiver and extends k-i rows
+// and columns. The final row and column in the resulting matrix is k-1.
+// SliceSym panics with ErrIndexOutOfRange if the slice is outside the
+// capacity of the receiver.
+func (s *SymDense) SliceSym(i, k int) Symmetric {
+	return s.sliceSym(i, k) // sliceSym does the work and returns the concrete type
+}
+
+func (s *SymDense) sliceSym(i, k int) *SymDense {
+	sz := s.cap // bounds are checked against the capacity, not the current size
+	if i < 0 || sz < i || k < i || sz < k {
+		panic(ErrIndexOutOfRange)
+	}
+	v := *s // copy the header; the view shares the backing data
+	v.mat.Data = s.mat.Data[i*s.mat.Stride+i : (k-1)*s.mat.Stride+k]
+	v.mat.N = k - i
+	v.cap = s.cap - i // rows/columns remaining from index i onwards
+	return &v
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
+func (s *SymDense) Norm(norm float64) float64 {
+	if s.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	lnorm := normLapack(norm, false)
+	if lnorm == lapack.MaxColumnSum || lnorm == lapack.MaxRowSum {
+		work := getFloat64s(s.mat.N, false)
+		defer putFloat64s(work)
+		return lapack64.Lansy(lnorm, s.mat, work)
+	}
+	return lapack64.Lansy(lnorm, s.mat, nil)
+}
+
+// Trace returns the sum of the diagonal elements of the matrix.
+//
+// Trace panics with ErrZeroLength if the receiver is empty.
+func (s *SymDense) Trace() float64 {
+	if s.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	// TODO(btracey): could use internal asm sum routine.
+	var sum float64
+	// idx walks the diagonal: one full row plus one column per step.
+	for i, idx := 0, 0; i < s.mat.N; i, idx = i+1, idx+s.mat.Stride+1 {
+		sum += s.mat.Data[idx]
+	}
+	return sum
+}
+
+// GrowSym returns the receiver expanded by n rows and n columns. If the
+// dimensions of the expanded matrix are outside the capacity of the receiver
+// a new allocation is made, otherwise not. Note that the receiver itself is
+// not modified during the call to GrowSym.
+func (s *SymDense) GrowSym(n int) Symmetric {
+	if n < 0 {
+		panic(ErrIndexOutOfRange)
+	}
+	if n == 0 {
+		return s
+	}
+	var v SymDense
+	n += s.mat.N // from here on n is the size of the grown matrix
+	if s.IsEmpty() || n > s.cap {
+		v.mat = blas64.Symmetric{
+			N:      n,
+			Stride: n,
+			Uplo:   blas.Upper,
+			Data:   make([]float64, n*n),
+		}
+		v.cap = n
+		// Copy elements, including those not currently visible. Use a temporary
+		// structure to avoid modifying the receiver.
+		var tmp SymDense
+		tmp.mat = blas64.Symmetric{
+			N:      s.cap,
+			Stride: s.mat.Stride,
+			Data:   s.mat.Data,
+			Uplo:   s.mat.Uplo,
+		}
+		tmp.cap = s.cap
+		v.CopySym(&tmp)
+		return &v
+	}
+	v.mat = blas64.Symmetric{ // enough capacity: return a larger view of the same data
+		N:      n,
+		Stride: s.mat.Stride,
+		Uplo:   blas.Upper,
+		Data:   s.mat.Data[:(n-1)*s.mat.Stride+n],
+	}
+	v.cap = s.cap
+	return &v
+}
+
+// PowPSD computes a^pow where a is a symmetric positive definite matrix.
+//
+// PowPSD returns ErrFailedEigen if the Eigen decomposition is not successful
+// and ErrNotPSD if any eigenvalue of a is not strictly positive.
+func (s *SymDense) PowPSD(a Symmetric, pow float64) error {
+	dim := a.SymmetricDim()
+	s.reuseAsNonZeroed(dim)
+
+	var eigen EigenSym
+	ok := eigen.Factorize(a, true)
+	if !ok {
+		return ErrFailedEigen
+	}
+	values := eigen.Values(nil)
+	for i, v := range values {
+		if v <= 0 {
+			return ErrNotPSD
+		}
+		values[i] = math.Pow(v, pow) // raise each eigenvalue to the requested power
+	}
+	var u Dense
+	eigen.VectorsTo(&u)
+
+	s.SymOuterK(values[0], u.ColView(0)) // start from the first eigen-pair
+
+	var v VecDense
+	for i := 1; i < dim; i++ { // accumulate a rank-one term for each remaining pair
+		v.ColViewOf(&u, i)
+		s.SymRankOne(s, values[i], &v)
+	}
+	return nil
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/triangular.go b/vendor/gonum.org/v1/gonum/mat/triangular.go
new file mode 100644
index 00000000000..0e37fb01026
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/triangular.go
@@ -0,0 +1,832 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+var (
+	triDense *TriDense // nil; used only for the compile-time interface checks below
+	_        Matrix            = triDense
+	_        allMatrix         = triDense
+	_        denseMatrix       = triDense
+	_        Triangular        = triDense
+	_        RawTriangular     = triDense
+	_        MutableTriangular = triDense
+
+	_ NonZeroDoer    = triDense
+	_ RowNonZeroDoer = triDense
+	_ ColNonZeroDoer = triDense
+)
+
+// TriDense represents an upper or lower triangular matrix in dense storage
+// format.
+type TriDense struct {
+	mat blas64.Triangular // dimension, stride, orientation, diagonal kind and data
+	cap int               // set alongside mat.N by the constructor and SetRawTriangular
+}
+
+// Triangular represents a triangular matrix. Triangular matrices are always square.
+type Triangular interface {
+	Matrix
+	// Triangle returns the number of rows/columns in the matrix, n, and its
+	// orientation, kind.
+	Triangle() (n int, kind TriKind)
+
+	// TTri is the equivalent of the T() method in the Matrix interface but
+	// guarantees the transpose is of triangular type.
+	TTri() Triangular
+}
+
+// A RawTriangular can return a blas64.Triangular representation of the receiver.
+// Changes to the blas64.Triangular.Data slice will be reflected in the original
+// matrix, changes to the N, Stride, Uplo and Diag fields will not.
+type RawTriangular interface {
+	RawTriangular() blas64.Triangular // returned by value; only Data is shared
+}
+
+// A MutableTriangular is a Triangular matrix whose elements can be set.
+type MutableTriangular interface {
+	Triangular
+	SetTri(i, j int, v float64) // sets the element at row i, column j to v
+}
+
+var (
+	_ Matrix           = TransposeTri{} // compile-time interface checks for TransposeTri
+	_ Triangular       = TransposeTri{}
+	_ UntransposeTrier = TransposeTri{}
+)
+
+// TransposeTri is a type for performing an implicit transpose of a Triangular
+// matrix. It implements the Triangular interface, returning values from the
+// transpose of the matrix within.
+type TransposeTri struct {
+	Triangular Triangular // the wrapped, untransposed matrix
+}
+
+// At returns the value of the element at row i and column j of the transposed
+// matrix, that is, row j and column i of the Triangular field.
+func (t TransposeTri) At(i, j int) float64 {
+	return t.Triangular.At(j, i) // indices are swapped to realise the transpose
+}
+
+// Dims returns the dimensions of the transposed matrix. Triangular matrices are
+// square and thus this is the same size as the original Triangular.
+func (t TransposeTri) Dims() (r, c int) {
+	c, r = t.Triangular.Dims() // rows and columns are swapped by the transpose
+	return r, c
+}
+
+// T performs an implicit transpose by returning the wrapped Triangular field.
+func (t TransposeTri) T() Matrix {
+	return t.Triangular
+}
+
+// Triangle returns the number of rows/columns in the matrix and its orientation.
+func (t TransposeTri) Triangle() (int, TriKind) {
+	n, upper := t.Triangular.Triangle()
+	return n, !upper // transposing flips the orientation of the wrapped matrix
+}
+
+// TTri performs an implicit transpose by returning the wrapped Triangular field.
+func (t TransposeTri) TTri() Triangular {
+	return t.Triangular
+}
+
+// Untranspose returns the Triangular field as a Matrix.
+func (t TransposeTri) Untranspose() Matrix {
+	return t.Triangular
+}
+
+func (t TransposeTri) UntransposeTri() Triangular {
+	return t.Triangular // the wrapped matrix, with its Triangular type preserved
+}
+
+// NewTriDense creates a new triangular matrix with n rows and columns. If
+// data == nil, a new backing slice is allocated. If len(data) == n*n, data is
+// used as the backing slice, and changes to the elements of the returned
+// TriDense will be reflected in data. Otherwise NewTriDense panics.
+// NewTriDense also panics if n is not positive.
+//
+// The data must be arranged in row-major order, i.e. the (i*n + j)-th
+// element in the data slice is the {i, j}-th element in the matrix.
+// Only the values in the triangular portion corresponding to kind are used.
+func NewTriDense(n int, kind TriKind, data []float64) *TriDense {
+	// Reject empty and negative sizes first.
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0:
+		panic("mat: negative dimension")
+	}
+	// Allocate storage, or check that the caller's slice has the right size.
+	switch {
+	case data == nil:
+		data = make([]float64, n*n)
+	case len(data) != n*n:
+		panic(ErrShape)
+	}
+	tri := blas64.Triangular{
+		N:      n,
+		Stride: n,
+		Data:   data,
+		Uplo:   blas.Lower,
+		Diag:   blas.NonUnit,
+	}
+	if kind == Upper {
+		tri.Uplo = blas.Upper
+	}
+	return &TriDense{mat: tri, cap: n}
+}
+
+func (t *TriDense) Dims() (r, c int) {
+	return t.mat.N, t.mat.N // the matrix is square: rows and columns both equal N
+}
+
+// Triangle returns the dimension of t and its orientation. The returned
+// orientation is only meaningful when t is not empty.
+func (t *TriDense) Triangle() (n int, kind TriKind) {
+	return t.mat.N, t.triKind() // triKind panics if Uplo is neither Upper nor Lower
+}
+
+func (t *TriDense) isUpper() bool {
+	return isUpperUplo(t.mat.Uplo) // panics if Uplo is neither Upper nor Lower
+}
+
+func (t *TriDense) triKind() TriKind {
+	return TriKind(isUpperUplo(t.mat.Uplo)) // the TriKind is true for an upper triangle
+}
+
+// isUpperUplo reports whether u is blas.Upper. It panics with badTriangle if
+// u is neither blas.Upper nor blas.Lower.
+func isUpperUplo(u blas.Uplo) bool {
+	if u != blas.Upper && u != blas.Lower {
+		panic(badTriangle)
+	}
+	return u == blas.Upper
+}
+
+// asSymBlas returns the receiver restructured as a blas64.Symmetric with the
+// same backing memory. Panics if the receiver is unit.
+// This returns a blas64.Symmetric and not a *SymDense because SymDense can only
+// be upper triangular.
+func (t *TriDense) asSymBlas() blas64.Symmetric {
+	if t.mat.Diag == blas.Unit {
+		panic("mat: cannot convert unit TriDense into blas64.Symmetric")
+	}
+	return blas64.Symmetric{
+		N:      t.mat.N,
+		Stride: t.mat.Stride,
+		Data:   t.mat.Data, // shared with the receiver, not copied
+		Uplo:   t.mat.Uplo, // the receiver's orientation is kept as is
+	}
+}
+
+// T performs an implicit transpose by returning the receiver wrapped in a Transpose.
+func (t *TriDense) T() Matrix {
+	return Transpose{t}
+}
+
+// TTri performs an implicit transpose by returning the receiver wrapped in a TransposeTri.
+func (t *TriDense) TTri() Triangular {
+	return TransposeTri{t}
+}
+
+func (t *TriDense) RawTriangular() blas64.Triangular {
+	return t.mat // returned by value; the Data slice is shared with the receiver
+}
+
+// SetRawTriangular sets the underlying blas64.Triangular used by the receiver.
+// Changes to elements in the receiver following the call will be reflected
+// in the input.
+//
+// The supplied Triangular must not use blas.Unit storage; SetRawTriangular panics if it does.
+func (t *TriDense) SetRawTriangular(mat blas64.Triangular) {
+	if mat.Diag == blas.Unit {
+		panic("mat: cannot set TriDense with Unit storage format")
+	}
+	t.cap = mat.N // the capacity is taken to be the size of the supplied matrix
+	t.mat = mat
+}
+
+// Reset empties the matrix so that it can be reused as the
+// receiver of a dimensionally restricted operation.
+//
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (t *TriDense) Reset() {
+	// N and Stride must be zeroed in unison.
+	t.mat.N, t.mat.Stride = 0, 0
+	// Defensively zero Uplo to ensure
+	// it is set correctly later.
+	t.mat.Uplo = 0
+	t.mat.Data = t.mat.Data[:0] // truncate to length zero, keeping the capacity
+}
+
+// Zero sets every element in the stored triangle of the matrix to zero.
+func (t *TriDense) Zero() {
+	upper := t.isUpper()
+	for i := 0; i < t.mat.N; i++ {
+		row := i * t.mat.Stride
+		if upper {
+			// Upper triangle: columns i through N-1 of row i.
+			zero(t.mat.Data[row+i : row+t.mat.N])
+		} else {
+			// Lower triangle: columns 0 through i of row i.
+			zero(t.mat.Data[row : row+i+1])
+		}
+	}
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using
+// Reset.
+func (t *TriDense) IsEmpty() bool {
+	// It must be the case that t.Dims() returns
+	// zeros in this case. See comment in Reset().
+	return t.mat.Stride == 0 // Reset zeroes Stride together with N
+}
+
+// untransposeTri unwraps a transposed triangular matrix. If a implements
+// UntransposeTrier it returns the underlying matrix and true; otherwise it
+// returns a unchanged and false.
+func untransposeTri(a Triangular) (Triangular, bool) {
+	ut, ok := a.(UntransposeTrier)
+	if !ok {
+		return a, false
+	}
+	return ut.UntransposeTri(), true
+}
+
+// ReuseAsTri resizes an empty receiver to be an n×n matrix of the given kind.
+//
+// ReuseAsTri re-uses the backing data slice if it has sufficient capacity,
+// otherwise a new slice is allocated. The backing data is zero on return.
+//
+// ReuseAsTri panics if the receiver is not empty, and panics if
+// the input size is less than one. To empty the receiver for re-use,
+// Reset should be used.
+func (t *TriDense) ReuseAsTri(n int, kind TriKind) {
+	// The size is validated before the receiver's state is inspected.
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0:
+		panic(ErrNegativeDimension)
+	case !t.IsEmpty():
+		panic(ErrReuseNonEmpty)
+	}
+	t.reuseAsZeroed(n, kind)
+}
+
+// reuseAsNonZeroed resizes an empty receiver to an n×n triangular matrix with the given
+// orientation. If the receiver is not empty, reuseAsNonZeroed checks that the receiver
+// is the correct size and orientation.
+func (t *TriDense) reuseAsNonZeroed(n int, kind TriKind) {
+	// reuseAsNonZeroed must be kept in sync with reuseAsZeroed.
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	ul := blas.Lower // translate kind into the matching BLAS orientation
+	if kind == Upper {
+		ul = blas.Upper
+	}
+	if t.mat.N > t.cap {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if t.IsEmpty() {
+		t.mat = blas64.Triangular{
+			N:      n,
+			Stride: n,
+			Diag:   blas.NonUnit,
+			Data:   use(t.mat.Data, n*n), // presumably reuses the slice when it is large enough — see use
+			Uplo:   ul,
+		}
+		t.cap = n
+		return
+	}
+	if t.mat.N != n {
+		panic(ErrShape)
+	}
+	if t.mat.Uplo != ul {
+		panic(ErrTriangle)
+	}
+}
+
+// reuseAsZeroed resizes an empty receiver to an n×n triangular matrix with the given
+// orientation. If the receiver is not empty, reuseAsZeroed checks that the receiver
+// is the correct size and orientation. It then zeros out the matrix data.
+func (t *TriDense) reuseAsZeroed(n int, kind TriKind) {
+	// reuseAsZeroed must be kept in sync with reuseAsNonZeroed.
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	ul := blas.Lower // translate kind into the matching BLAS orientation
+	if kind == Upper {
+		ul = blas.Upper
+	}
+	if t.mat.N > t.cap {
+		// Panic as a string, not a mat.Error.
+		panic(badCap)
+	}
+	if t.IsEmpty() {
+		t.mat = blas64.Triangular{
+			N:      n,
+			Stride: n,
+			Diag:   blas.NonUnit,
+			Data:   useZeroed(t.mat.Data, n*n), // presumably returns zeroed storage — see useZeroed
+			Uplo:   ul,
+		}
+		t.cap = n
+		return
+	}
+	if t.mat.N != n {
+		panic(ErrShape)
+	}
+	if t.mat.Uplo != ul {
+		panic(ErrTriangle)
+	}
+	t.Zero() // a non-empty receiver of the right shape is cleared in place
+}
+
+// isolatedWorkspace returns a workspace w with the size and kind of a, and a callback to defer that
+// copies w into the receiver and releases w. Use it when the receiver is also an input argument.
+func (t *TriDense) isolatedWorkspace(a Triangular) (w *TriDense, restore func()) {
+	n, kind := a.Triangle()
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	w = getTriDenseWorkspace(n, kind, false)
+	restore = func() {
+		t.Copy(w)
+		putTriWorkspace(w)
+	}
+	return w, restore
+}
+
+// DiagView returns the diagonal as a matrix backed by the original data.
+// It panics if the receiver uses blas.Unit diagonal storage.
+func (t *TriDense) DiagView() Diagonal {
+	if t.mat.Diag == blas.Unit {
+		panic("mat: cannot take view of Unit diagonal")
+	}
+	n, stride := t.mat.N, t.mat.Stride
+	diag := blas64.Vector{
+		N:    n,
+		Inc:  stride + 1,
+		Data: t.mat.Data[:(n-1)*stride+n],
+	}
+	return &DiagDense{mat: diag}
+}
+
+// Copy makes a copy of elements of a into the receiver. It is similar to the
+// built-in copy; it copies as much as the overlap between the two matrices and
+// returns the number of rows and columns it copied. Only elements within the
+// receiver's non-zero triangle, and inside the bounds of a, are set.
+//
+// See the Copier interface for more information.
+func (t *TriDense) Copy(a Matrix) (r, c int) {
+	r, c = a.Dims()
+	r = min(r, t.mat.N)
+	c = min(c, t.mat.N)
+	if r == 0 || c == 0 {
+		return 0, 0
+	}
+	switch a := a.(type) {
+	case RawMatrixer:
+		amat := a.RawMatrix()
+		if t.isUpper() {
+			// Rows at or beyond column c hold no overlapping upper elements.
+			for i := 0; i < min(r, c); i++ {
+				copy(t.mat.Data[i*t.mat.Stride+i:i*t.mat.Stride+c], amat.Data[i*amat.Stride+i:i*amat.Stride+c])
+			}
+		} else {
+			for i := 0; i < r; i++ {
+				// Stop at column c so a narrow a is not read past the end of its row.
+				copy(t.mat.Data[i*t.mat.Stride:i*t.mat.Stride+min(i+1, c)], amat.Data[i*amat.Stride:i*amat.Stride+min(i+1, c)])
+			}
+		}
+	case RawTriangular:
+		amat := a.RawTriangular()
+		aIsUpper := isUpperUplo(amat.Uplo)
+		tIsUpper := t.isUpper()
+		switch {
+		case tIsUpper && aIsUpper:
+			for i := 0; i < r; i++ {
+				copy(t.mat.Data[i*t.mat.Stride+i:i*t.mat.Stride+c], amat.Data[i*amat.Stride+i:i*amat.Stride+c])
+			}
+		case !tIsUpper && !aIsUpper:
+			for i := 0; i < r; i++ {
+				copy(t.mat.Data[i*t.mat.Stride:i*t.mat.Stride+i+1], amat.Data[i*amat.Stride:i*amat.Stride+i+1])
+			}
+		default:
+			for i := 0; i < r; i++ {
+				t.set(i, i, amat.Data[i*amat.Stride+i])
+			}
+		}
+	default:
+		isUpper := t.isUpper()
+		for i := 0; i < r; i++ {
+			if isUpper {
+				for j := i; j < c; j++ {
+					t.set(i, j, a.At(i, j))
+				}
+			} else {
+				for j := 0; j < min(i+1, c); j++ {
+					t.set(i, j, a.At(i, j))
+				}
+			}
+		}
+	}
+	return r, c
+}
+
+// InverseTri stores the inverse of the triangular matrix a in the receiver.
+// A Condition error is returned if the inversion fails or a is ill-conditioned.
+// Matrix inversion is numerically unstable and should generally be avoided
+// where possible, for example by using the Solve routines.
+func (t *TriDense) InverseTri(a Triangular) error {
+	t.checkOverlapMatrix(a)
+	n, _ := a.Triangle()
+	t.reuseAsNonZeroed(a.Triangle())
+	t.Copy(a)
+
+	// Estimate the condition number of the copied matrix before inverting it.
+	scratch, iscratch := getFloat64s(3*n, false), getInts(n, false)
+	cond := lapack64.Trcon(CondNorm, t.mat, scratch, iscratch)
+	putFloat64s(scratch)
+	putInts(iscratch)
+
+	switch {
+	case math.IsInf(cond, 1):
+		return Condition(cond)
+	case !lapack64.Trtri(t.mat):
+		return Condition(math.Inf(1))
+	case cond > ConditionTolerance:
+		return Condition(cond)
+	}
+	return nil
+}
+
+// MulTri takes the product of triangular matrices a and b and places the result
+// in the receiver. The size of a and b must match, and they both must have the
+// same TriKind, or MulTri will panic with ErrShape or ErrTriangle respectively.
+func (t *TriDense) MulTri(a, b Triangular) {
+	n, kind := a.Triangle()
+	nb, kindb := b.Triangle()
+	if n != nb {
+		panic(ErrShape)
+	}
+	if kind != kindb {
+		panic(ErrTriangle)
+	}
+
+	aU, _ := untransposeTri(a)
+	bU, _ := untransposeTri(b)
+	t.checkOverlapMatrix(bU)
+	t.checkOverlapMatrix(aU)
+	t.reuseAsNonZeroed(n, kind)
+	var restore func()
+	if t == aU {
+		t, restore = t.isolatedWorkspace(aU)
+		defer restore() // copies the workspace result back into the original receiver
+	} else if t == bU {
+		t, restore = t.isolatedWorkspace(bU)
+		defer restore() // copies the workspace result back into the original receiver
+	}
+
+	// Inspect the types once here so the loops below stay clean(er).
+	_, aDiag := aU.(Diagonal)
+	_, bDiag := bU.(Diagonal)
+	// If both are diagonal the product is diagonal and a single loop suffices.
+	// All diagonal matrices are Upper.
+	// TODO: Add fast paths for DiagDense.
+	if aDiag && bDiag {
+		t.Zero()
+		for i := 0; i < n; i++ {
+			t.SetTri(i, i, a.At(i, i)*b.At(i, i))
+		}
+		return
+	}
+
+	// Now we know at least one matrix is non-diagonal.
+	// Since all diagonal matrices are Upper, they are only handled in the Upper loop.
+	// The both-diagonal case is handled above.
+	// TODO: Add fast paths for Dense variants.
+	if kind == Upper {
+		for i := 0; i < n; i++ {
+			for j := i; j < n; j++ {
+				switch {
+				case aDiag:
+					t.SetTri(i, j, a.At(i, i)*b.At(i, j))
+				case bDiag:
+					t.SetTri(i, j, a.At(i, j)*b.At(j, j))
+				default:
+					var v float64
+					for k := i; k <= j; k++ {
+						v += a.At(i, k) * b.At(k, j)
+					}
+					t.SetTri(i, j, v)
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		for j := 0; j <= i; j++ {
+			var v float64
+			for k := j; k <= i; k++ {
+				v += a.At(i, k) * b.At(k, j)
+			}
+			t.SetTri(i, j, v)
+		}
+	}
+}
+
+// ScaleTri multiplies the elements of a by f, placing the result in the receiver.
+// If the receiver is non-zero, the size and kind of the receiver must match
+// the input, or ScaleTri will panic.
+func (t *TriDense) ScaleTri(f float64, a Triangular) {
+	n, kind := a.Triangle()
+	t.reuseAsNonZeroed(n, kind)
+
+	// TODO(btracey): Improve the set of fast-paths.
+	switch a := a.(type) {
+	case RawTriangular:
+		amat := a.RawTriangular()
+		if t != a {
+			t.checkOverlap(generalFromTriangular(amat))
+		}
+		if kind == Upper {
+			for i := 0; i < n; i++ {
+				ts := t.mat.Data[i*t.mat.Stride+i : i*t.mat.Stride+n]
+				as := amat.Data[i*amat.Stride+i : i*amat.Stride+n]
+				for i, v := range as {
+					ts[i] = v * f
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ts := t.mat.Data[i*t.mat.Stride : i*t.mat.Stride+i+1]
+			as := amat.Data[i*amat.Stride : i*amat.Stride+i+1]
+			for i, v := range as {
+				ts[i] = v * f
+			}
+		}
+		return
+	default:
+		t.checkOverlapMatrix(a)
+		isUpper := kind == Upper
+		for i := 0; i < n; i++ {
+			if isUpper {
+				for j := i; j < n; j++ {
+					t.set(i, j, f*a.At(i, j))
+				}
+			} else {
+				for j := 0; j <= i; j++ {
+					t.set(i, j, f*a.At(i, j))
+				}
+			}
+		}
+	}
+}
+
+// SliceTri returns a new Triangular that shares backing data with the receiver.
+// The returned matrix starts at {i,i} of the receiver and extends k-i rows and
+// columns, so its final row and column is index k-1 of the receiver.
+// SliceTri panics with ErrIndexOutOfRange if i is negative, k is less than i,
+// or either index exceeds the capacity of the receiver.
+func (t *TriDense) SliceTri(i, k int) Triangular {
+	return t.sliceTri(i, k)
+}
+
+func (t *TriDense) sliceTri(i, k int) *TriDense {
+	if i < 0 || k < i || t.cap < i || t.cap < k {
+		panic(ErrIndexOutOfRange)
+	}
+	view := *t
+	view.cap = t.cap - i
+	view.mat.N = k - i
+	view.mat.Data = t.mat.Data[i*t.mat.Stride+i : (k-1)*t.mat.Stride+k]
+	return &view
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
+func (t *TriDense) Norm(norm float64) float64 {
+	if t.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	lnorm := normLapack(norm, false)
+	if lnorm == lapack.MaxColumnSum {
+		work := getFloat64s(t.mat.N, false)
+		defer putFloat64s(work)
+		return lapack64.Lantr(lnorm, t.mat, work)
+	}
+	return lapack64.Lantr(lnorm, t.mat, nil)
+}
+
+// Trace returns the trace of the matrix, the sum of its diagonal elements.
+// It panics with ErrZeroLength if the matrix has zero size.
+func (t *TriDense) Trace() float64 {
+	if t.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	// TODO(btracey): could use internal asm sum routine.
+	var sum float64
+	step := t.mat.Stride + 1
+	for i, idx := 0, 0; i < t.mat.N; i, idx = i+1, idx+step {
+		sum += t.mat.Data[idx]
+	}
+	return sum
+}
+
+// copySymIntoTriangle copies the triangle of s matching the kind of t into t; it panics on a size mismatch.
+func copySymIntoTriangle(t *TriDense, s Symmetric) {
+	n, upper := t.Triangle()
+	ns := s.SymmetricDim()
+	if n != ns {
+		panic("mat: triangle size mismatch")
+	}
+	ts := t.mat.Stride
+	if rs, ok := s.(RawSymmetricer); ok {
+		sd := rs.RawSymmetric()
+		ss := sd.Stride
+		if upper {
+			if sd.Uplo == blas.Upper {
+				for i := 0; i < n; i++ {
+					copy(t.mat.Data[i*ts+i:i*ts+n], sd.Data[i*ss+i:i*ss+n]) // same triangle: copy row segments
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				for j := i; j < n; j++ {
+					t.mat.Data[i*ts+j] = sd.Data[j*ss+i] // read the mirrored element (j, i) of s
+				}
+			}
+			return
+		}
+		if sd.Uplo == blas.Upper {
+			for i := 0; i < n; i++ {
+				for j := 0; j <= i; j++ {
+					t.mat.Data[i*ts+j] = sd.Data[j*ss+i] // read the mirrored element (j, i) of s
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			copy(t.mat.Data[i*ts:i*ts+i+1], sd.Data[i*ss:i*ss+i+1]) // same triangle: copy row segments
+		}
+		return
+	}
+	if upper {
+		for i := 0; i < n; i++ {
+			for j := i; j < n; j++ {
+				t.mat.Data[i*ts+j] = s.At(i, j)
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		for j := 0; j <= i; j++ {
+			t.mat.Data[i*ts+j] = s.At(i, j)
+		}
+	}
+}
+
+// DoNonZero calls fn for every non-zero element in the stored triangle of t, row by row.
+// fn receives the row index i, the column index j and the value of t at (i, j).
+func (t *TriDense) DoNonZero(fn func(i, j int, v float64)) {
+	if !t.isUpper() {
+		// Lower triangular: visit columns 0 through i of each row i.
+		for i := 0; i < t.mat.N; i++ {
+			for j := 0; j <= i; j++ {
+				if v := t.at(i, j); v != 0 {
+					fn(i, j, v)
+				}
+			}
+		}
+		return
+	}
+	// Upper triangular: visit columns i through n-1 of each row i.
+	for i := 0; i < t.mat.N; i++ {
+		for j := i; j < t.mat.N; j++ {
+			if v := t.at(i, j); v != 0 {
+				fn(i, j, v)
+			}
+		}
+	}
+}
+
+// DoRowNonZero calls fn for every non-zero element in row i of the stored triangle of t.
+// fn receives the row/column index and the value of t at (i, j). It panics with ErrRowAccess if i is out of range.
+func (t *TriDense) DoRowNonZero(i int, fn func(i, j int, v float64)) {
+	if i < 0 || t.mat.N <= i {
+		panic(ErrRowAccess)
+	}
+	if !t.isUpper() {
+		// Lower triangular: row i is stored in columns 0 through i.
+		for j := 0; j <= i; j++ {
+			if v := t.at(i, j); v != 0 {
+				fn(i, j, v)
+			}
+		}
+		return
+	}
+	// Upper triangular: row i is stored in columns i through n-1.
+	for j := i; j < t.mat.N; j++ {
+		if v := t.at(i, j); v != 0 {
+			fn(i, j, v)
+		}
+	}
+}
+
+// DoColNonZero calls fn for every non-zero element in column j of the stored triangle of t.
+// fn receives the row/column index and the value of t at (i, j). It panics with ErrColAccess if j is out of range.
+func (t *TriDense) DoColNonZero(j int, fn func(i, j int, v float64)) {
+	if j < 0 || t.mat.N <= j {
+		panic(ErrColAccess)
+	}
+	if !t.isUpper() {
+		// Lower triangular: column j is stored in rows j through n-1.
+		for i := j; i < t.mat.N; i++ {
+			if v := t.at(i, j); v != 0 {
+				fn(i, j, v)
+			}
+		}
+		return
+	}
+	// Upper triangular: column j is stored in rows 0 through j.
+	for i := 0; i <= j; i++ {
+		if v := t.at(i, j); v != 0 {
+			fn(i, j, v)
+		}
+	}
+}
+
+// SolveTo solves a triangular system T * X = B or Tᵀ * X = B where T is an n×n
+// triangular matrix represented by the receiver and B is a given n×nrhs matrix.
+// On success the solution is stored into dst and nil is returned. If the solve
+// fails or T is ill-conditioned, a Condition error is returned; after a failed
+// solve the contents of dst are undefined.
+//
+// If dst is empty, SolveTo will resize it to n×nrhs. If dst is not empty,
+// SolveTo will panic if dst is not n×nrhs.
+func (t *TriDense) SolveTo(dst *Dense, trans bool, b Matrix) error {
+	n, nrhs := b.Dims()
+	if n != t.mat.N {
+		panic(ErrShape)
+	}
+
+	dst.reuseAsNonZeroed(n, nrhs)
+	bU, bTrans := untranspose(b)
+	switch {
+	case dst != bU:
+		if rm, ok := bU.(RawMatrixer); ok {
+			dst.checkOverlap(rm.RawMatrix())
+		}
+		dst.Copy(b)
+	case bTrans:
+		// b is a transposed view of dst, so go through a temporary copy.
+		work := getDenseWorkspace(n, nrhs, false)
+		defer putDenseWorkspace(work)
+		work.Copy(b)
+		dst.Copy(work)
+	}
+
+	op := blas.NoTrans
+	if trans {
+		op = blas.Trans
+	}
+	if !lapack64.Trtrs(op, t.mat, dst.mat) {
+		return Condition(math.Inf(1))
+	}
+
+	// Estimate the condition number so ill-conditioned systems are reported.
+	scratch := getFloat64s(3*n, false)
+	iscratch := getInts(n, false)
+	cond := lapack64.Trcon(CondNorm, t.mat, scratch, iscratch)
+	putFloat64s(scratch)
+	putInts(iscratch)
+	if cond > ConditionTolerance {
+		return Condition(cond)
+	}
+
+	return nil
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/triband.go b/vendor/gonum.org/v1/gonum/mat/triband.go
new file mode 100644
index 00000000000..aa0b51d6f73
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/triband.go
@@ -0,0 +1,694 @@
+// Copyright ©2018 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/lapack"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+var ( // Compile-time assertions that the types below satisfy the listed interfaces.
+	triBand TriBanded
+	_       Banded     = triBand
+	_       Triangular = triBand
+
+	triBandDense *TriBandDense
+	_            Matrix           = triBandDense
+	_            allMatrix        = triBandDense
+	_            denseMatrix      = triBandDense
+	_            Triangular       = triBandDense
+	_            Banded           = triBandDense
+	_            TriBanded        = triBandDense
+	_            RawTriBander     = triBandDense
+	_            MutableTriBanded = triBandDense
+)
+
+// TriBanded is a triangular band matrix interface type. It extends Banded with triangular accessors.
+type TriBanded interface {
+	Banded
+
+	// Triangle returns the number of rows/columns in the matrix and its
+	// orientation.
+	Triangle() (n int, kind TriKind)
+
+	// TTri is the equivalent of the T() method in the Matrix interface but
+	// guarantees the transpose is of triangular type.
+	TTri() Triangular
+
+	// TriBand returns the number of rows/columns n in the matrix, the
+	// bandwidth k, and the orientation.
+	TriBand() (n, k int, kind TriKind)
+
+	// TTriBand is the equivalent of the T() method in the Matrix interface but
+	// guarantees the transpose is of banded triangular type.
+	TTriBand() TriBanded
+}
+
+// A RawTriBander can return a blas64.TriangularBand representation of the receiver.
+// Changes to the elements of the blas64.TriangularBand.Data slice will be reflected
+// in the original matrix; changes to the N, K, Stride, Uplo and Diag fields will not.
+type RawTriBander interface {
+	RawTriBand() blas64.TriangularBand
+}
+
+// MutableTriBanded is a triangular band matrix interface type that additionally
+// allows the element at row i and column j to be set to v with SetTriBand.
+type MutableTriBanded interface {
+	TriBanded
+	SetTriBand(i, j int, v float64)
+}
+
+var ( // Compile-time assertions that TransposeTriBand satisfies the listed interfaces.
+	tTriBand TransposeTriBand
+	_        Matrix               = tTriBand
+	_        TriBanded            = tTriBand
+	_        Untransposer         = tTriBand
+	_        UntransposeTrier     = tTriBand
+	_        UntransposeBander    = tTriBand
+	_        UntransposeTriBander = tTriBand
+)
+
+// TransposeTriBand is a type for performing an implicit transpose of a TriBanded
+// matrix. It implements the TriBanded interface by wrapping the matrix in its
+// TriBanded field and answering every query for the transpose of that matrix.
+type TransposeTriBand struct {
+	TriBanded TriBanded
+}
+
+// At returns the value of the element at row i and column j of the transposed
+// matrix, that is, the element at row j and column i of the TriBanded field.
+func (t TransposeTriBand) At(i, j int) float64 {
+	return t.TriBanded.At(j, i)
+}
+
+// Dims returns the dimensions of the transposed matrix, which are the
+// dimensions of the TriBanded field with rows and columns exchanged.
+func (t TransposeTriBand) Dims() (r, c int) {
+	c, r = t.TriBanded.Dims()
+	return r, c
+}
+
+// T performs an implicit transpose by returning the TriBanded field.
+func (t TransposeTriBand) T() Matrix {
+	return t.TriBanded
+}
+
+// Triangle returns the size of the matrix and the negation of the orientation reported by the TriBanded field.
+func (t TransposeTriBand) Triangle() (int, TriKind) {
+	n, upper := t.TriBanded.Triangle()
+	return n, !upper
+}
+
+// TTri performs an implicit transpose by returning the TriBanded field as a Triangular.
+func (t TransposeTriBand) TTri() Triangular {
+	return t.TriBanded
+}
+
+// Bandwidth returns the lower and upper bandwidths of the matrix, which are those of the TriBanded field exchanged.
+func (t TransposeTriBand) Bandwidth() (kl, ku int) {
+	kl, ku = t.TriBanded.Bandwidth()
+	return ku, kl
+}
+
+// TBand performs an implicit transpose by returning the TriBanded field as a Banded.
+func (t TransposeTriBand) TBand() Banded {
+	return t.TriBanded
+}
+
+// TriBand returns the number of rows/columns in the matrix, the size of the
+// bandwidth, and the orientation, which is negated relative to the TriBanded field.
+func (t TransposeTriBand) TriBand() (n, k int, kind TriKind) {
+	n, k, kind = t.TriBanded.TriBand()
+	return n, k, !kind
+}
+
+// TTriBand performs an implicit transpose by returning the TriBanded field.
+func (t TransposeTriBand) TTriBand() TriBanded {
+	return t.TriBanded
+}
+
+// Untranspose returns the TriBanded field as a Matrix.
+func (t TransposeTriBand) Untranspose() Matrix {
+	return t.TriBanded
+}
+
+// UntransposeTri returns the TriBanded field as a Triangular.
+func (t TransposeTriBand) UntransposeTri() Triangular {
+	return t.TriBanded
+}
+
+// UntransposeBand returns the TriBanded field as a Banded.
+func (t TransposeTriBand) UntransposeBand() Banded {
+	return t.TriBanded
+}
+
+// UntransposeTriBand returns the TriBanded field, the matrix before transposition.
+func (t TransposeTriBand) UntransposeTriBand() TriBanded {
+	return t.TriBanded
+}
+
+// TriBandDense represents a triangular band matrix in dense storage format.
+type TriBandDense struct {
+	mat blas64.TriangularBand // size, bandwidth, orientation and backing data
+}
+
+// NewTriBandDense creates a new triangular banded matrix with n rows and columns,
+// k bands in the direction of the specified kind. If data == nil,
+// a new slice is allocated for the backing slice. If len(data) == n*(k+1),
+// data is used as the backing slice, and changes to the elements of the returned
+// TriBandDense will be reflected in data. If neither of these is true, NewTriBandDense
+// will panic. k must be at least zero and less than n, otherwise NewTriBandDense will panic.
+//
+// The data must be arranged in row-major order constructed by removing the zeros
+// from the rows outside the band and aligning the diagonals. For example,
+// the upper-triangular banded matrix
+//
+//	1  2  3  0  0  0
+//	0  4  5  6  0  0
+//	0  0  7  8  9  0
+//	0  0  0 10 11 12
+//	0  0  0  0 13 14
+//	0  0  0  0  0 15
+//
+// becomes (* entries are never accessed)
+//
+//	 1  2  3
+//	 4  5  6
+//	 7  8  9
+//	10 11 12
+//	13 14  *
+//	15  *  *
+//
+// which is passed to NewTriBandDense as []float64{1, 2, ..., 14, *, 15, *, *}
+// with k=2 and kind = mat.Upper.
+// The lower triangular banded matrix
+//
+//	1  0  0  0  0  0
+//	2  3  0  0  0  0
+//	4  5  6  0  0  0
+//	0  7  8  9  0  0
+//	0  0 10 11 12  0
+//	0  0  0 13 14 15
+//
+// becomes (* entries are never accessed)
+//
+//	 *  *  1
+//	 *  2  3
+//	 4  5  6
+//	 7  8  9
+//	10 11 12
+//	13 14 15
+//
+// which is passed to NewTriBandDense as []float64{*, *, 1, *, 2, 3, ..., 15}
+// with k=2 and kind = mat.Lower.
+// Only the values in the band portion of the matrix are used.
+func NewTriBandDense(n, k int, kind TriKind, data []float64) *TriBandDense {
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0 || k < 0:
+		panic(ErrNegativeDimension)
+	case k+1 > n:
+		panic(ErrBandwidth)
+	}
+	// Each of the n stored rows holds the diagonal plus k off-diagonals.
+	width := k + 1
+	switch {
+	case data == nil:
+		data = make([]float64, n*width)
+	case len(data) != n*width:
+		panic(ErrShape)
+	}
+	uplo := blas.Lower
+	if kind {
+		uplo = blas.Upper
+	}
+	band := blas64.TriangularBand{
+		Uplo:   uplo,
+		Diag:   blas.NonUnit,
+		N:      n,
+		K:      k,
+		Data:   data,
+		Stride: width,
+	}
+	return &TriBandDense{mat: band}
+}
+
+// Dims returns the number of rows and columns in the matrix; the two are always equal.
+func (t *TriBandDense) Dims() (r, c int) {
+	return t.mat.N, t.mat.N
+}
+
+// T performs an implicit transpose by returning the receiver wrapped in a Transpose; no data is copied.
+func (t *TriBandDense) T() Matrix {
+	return Transpose{t}
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using
+// Reset.
+func (t *TriBandDense) IsEmpty() bool {
+	// A zero stride marks the receiver as empty. Reset zeroes N together
+	// with Stride, so t.Dims() returns zeros in this case.
+	return t.mat.Stride == 0
+}
+
+// Reset empties the matrix so that it can be reused as the receiver of a
+// dimensionally restricted operation. The backing slice is truncated to zero length.
+//
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (t *TriBandDense) Reset() {
+	t.mat.N = 0
+	t.mat.Stride = 0
+	t.mat.K = 0
+	t.mat.Data = t.mat.Data[:0]
+}
+
+// ReuseAsTriBand changes the receiver to be of size n×n, bandwidth k+1 and of
+// the given kind, re-using the backing data slice if it has sufficient capacity
+// and allocating a new slice otherwise. The backing data is zero on return.
+//
+// ReuseAsTriBand panics with ErrZeroLength if n is zero, with
+// ErrNegativeDimension if n or k is negative, with ErrBandwidth if k is not
+// less than n, and with ErrReuseNonEmpty if the receiver is not empty.
+//
+// To empty the receiver for re-use, Reset should be used.
+func (t *TriBandDense) ReuseAsTriBand(n, k int, kind TriKind) {
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0 || k < 0:
+		panic(ErrNegativeDimension)
+	case k+1 > n:
+		panic(ErrBandwidth)
+	case !t.IsEmpty():
+		panic(ErrReuseNonEmpty)
+	}
+	t.reuseAsZeroed(n, k, kind)
+}
+
+// reuseAsZeroed resizes an empty receiver to an n×n triangular band matrix with
+// the given bandwidth and orientation. If the receiver is not empty,
+// reuseAsZeroed checks that the receiver has the correct size, bandwidth and
+// orientation and then zeros it with Zero.
+func (t *TriBandDense) reuseAsZeroed(n, k int, kind TriKind) {
+	// reuseAsZeroed must be kept in sync with reuseAsNonZeroed.
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	uplo := blas.Lower
+	if kind == Upper {
+		uplo = blas.Upper
+	}
+	if !t.IsEmpty() {
+		switch {
+		case t.mat.N != n || t.mat.K != k:
+			panic(ErrShape)
+		case t.mat.Uplo != uplo:
+			panic(ErrTriangle)
+		}
+		t.Zero()
+		return
+	}
+	t.mat = blas64.TriangularBand{
+		Uplo:   uplo,
+		Diag:   blas.NonUnit,
+		N:      n,
+		K:      k,
+		Data:   useZeroed(t.mat.Data, n*(k+1)),
+		Stride: k + 1,
+	}
+}
+
+// reuseAsNonZeroed resizes an empty receiver to an n×n triangular band matrix
+// with the given bandwidth and orientation. If the receiver is not empty,
+// reuseAsNonZeroed only checks that the receiver has the correct size,
+// bandwidth and orientation; its existing data is left untouched.
+//
+//lint:ignore U1000 This will be used later.
+func (t *TriBandDense) reuseAsNonZeroed(n, k int, kind TriKind) {
+	// reuseAsNonZeroed must be kept in sync with reuseAsZeroed.
+	if n == 0 {
+		panic(ErrZeroLength)
+	}
+	uplo := blas.Lower
+	if kind == Upper {
+		uplo = blas.Upper
+	}
+	if !t.IsEmpty() {
+		switch {
+		case t.mat.N != n || t.mat.K != k:
+			panic(ErrShape)
+		case t.mat.Uplo != uplo:
+			panic(ErrTriangle)
+		}
+		return
+	}
+	t.mat = blas64.TriangularBand{
+		Uplo:   uplo,
+		Diag:   blas.NonUnit,
+		N:      n,
+		K:      k,
+		Data:   use(t.mat.Data, n*(k+1)),
+		Stride: k + 1,
+	}
+}
+
+// DoNonZero calls fn for every non-zero element inside the band of t, row by row.
+// fn receives the row index i, the column index j and the value of t at (i, j).
+func (t *TriBandDense) DoNonZero(fn func(i, j int, v float64)) {
+	if !t.isUpper() {
+		// Lower band: row i spans columns i-K through i, clipped at column 0.
+		for i := 0; i < t.mat.N; i++ {
+			for j := max(0, i-t.mat.K); j <= i; j++ {
+				if v := t.at(i, j); v != 0 {
+					fn(i, j, v)
+				}
+			}
+		}
+		return
+	}
+	// Upper band: row i spans columns i through i+K, clipped at column n-1.
+	for i := 0; i < t.mat.N; i++ {
+		for j := i; j < min(i+t.mat.K+1, t.mat.N); j++ {
+			if v := t.at(i, j); v != 0 {
+				fn(i, j, v)
+			}
+		}
+	}
+}
+
+// DoRowNonZero calls fn for every non-zero element of row i inside the band of t.
+// fn receives the row/column index and the value of t at (i, j). It panics with ErrRowAccess if i is out of range.
+func (t *TriBandDense) DoRowNonZero(i int, fn func(i, j int, v float64)) {
+	if i < 0 || t.mat.N <= i {
+		panic(ErrRowAccess)
+	}
+	if !t.isUpper() {
+		// Lower band: row i spans columns i-K through i, clipped at column 0.
+		for j := max(0, i-t.mat.K); j <= i; j++ {
+			if v := t.at(i, j); v != 0 {
+				fn(i, j, v)
+			}
+		}
+		return
+	}
+	// Upper band: row i spans columns i through i+K, clipped at column n-1.
+	for j := i; j < min(i+t.mat.K+1, t.mat.N); j++ {
+		if v := t.at(i, j); v != 0 {
+			fn(i, j, v)
+		}
+	}
+}
+
+// DoColNonZero calls fn for every non-zero element of column j inside the band of t.
+// fn receives the row/column index and the value of t at (i, j). It panics with ErrColAccess if j is out of range.
+func (t *TriBandDense) DoColNonZero(j int, fn func(i, j int, v float64)) {
+	if j < 0 || t.mat.N <= j {
+		panic(ErrColAccess)
+	}
+	if t.isUpper() {
+		// Upper band: column j is non-zero only in rows j-K through j.
+		for i := max(0, j-t.mat.K); i <= j; i++ {
+			if v := t.at(i, j); v != 0 {
+				fn(i, j, v)
+			}
+		}
+		return
+	}
+	// Lower band: column j is non-zero only in rows j through j+K.
+	for i := j; i < min(j+t.mat.K+1, t.mat.N); i++ {
+		if v := t.at(i, j); v != 0 {
+			fn(i, j, v)
+		}
+	}
+}
+
+// Zero sets all of the matrix elements stored inside the band to zero.
+func (t *TriBandDense) Zero() {
+	if !t.isUpper() {
+		for i := 0; i < t.mat.N; i++ {
+			first := max(0, t.mat.K-i)
+			zero(t.mat.Data[i*t.mat.Stride+first : i*t.mat.Stride+t.mat.K+1])
+		}
+		return
+	}
+	for i := 0; i < t.mat.N; i++ {
+		width := min(1+t.mat.K, t.mat.N-i)
+		zero(t.mat.Data[i*t.mat.Stride : i*t.mat.Stride+width])
+	}
+}
+
+func (t *TriBandDense) isUpper() bool {
+	return isUpperUplo(t.mat.Uplo) // the orientation is derived from the stored Uplo field
+}
+
+func (t *TriBandDense) triKind() TriKind {
+	return TriKind(isUpperUplo(t.mat.Uplo)) // same test as isUpper, converted to a TriKind
+}
+
+// Triangle returns the dimension n of t and its orientation. The returned
+// orientation is only meaningful when n is not zero.
+func (t *TriBandDense) Triangle() (n int, kind TriKind) {
+	return t.mat.N, t.triKind()
+}
+
+// TTri performs an implicit transpose by returning the receiver wrapped in a TransposeTri.
+func (t *TriBandDense) TTri() Triangular {
+	return TransposeTri{t}
+}
+
+// Bandwidth returns the lower and upper bandwidths of the matrix; one of them is always zero.
+func (t *TriBandDense) Bandwidth() (kl, ku int) {
+	if !t.isUpper() {
+		return t.mat.K, 0
+	}
+	return 0, t.mat.K
+}
+
+// TBand performs an implicit transpose by returning the receiver wrapped in a TransposeBand.
+func (t *TriBandDense) TBand() Banded {
+	return TransposeBand{t}
+}
+
+// TriBand returns the number of rows/columns in the matrix, the size of the
+// bandwidth, and the orientation. The kind is reported as false when t is empty.
+func (t *TriBandDense) TriBand() (n, k int, kind TriKind) {
+	return t.mat.N, t.mat.K, TriKind(!t.IsEmpty()) && t.triKind()
+}
+
+// TTriBand performs an implicit transpose by returning the receiver wrapped in a TransposeTriBand.
+func (t *TriBandDense) TTriBand() TriBanded {
+	return TransposeTriBand{t}
+}
+
+// RawTriBand returns a copy of the blas64.TriangularBand used by the receiver.
+// Changes to the elements of its Data slice will be reflected in the original
+// matrix; changes to the N, K, Stride, Uplo and Diag fields will not.
+func (t *TriBandDense) RawTriBand() blas64.TriangularBand {
+	return t.mat
+}
+
+// SetRawTriBand sets the underlying blas64.TriangularBand used by the receiver.
+// Changes to elements in the receiver following the call will be reflected
+// in the input.
+//
+// The supplied TriangularBand must not use blas.Unit storage format, otherwise SetRawTriBand panics.
+func (t *TriBandDense) SetRawTriBand(mat blas64.TriangularBand) {
+	if mat.Diag == blas.Unit {
+		panic("mat: cannot set TriBand with Unit storage")
+	}
+	t.mat = mat
+}
+
+// DiagView returns the diagonal as a matrix backed by the original data. It panics for blas.Unit storage.
+func (t *TriBandDense) DiagView() Diagonal {
+	if t.mat.Diag == blas.Unit {
+		panic("mat: cannot take view of Unit diagonal")
+	}
+	n, stride := t.mat.N, t.mat.Stride
+	data := t.mat.Data
+	if !t.isUpper() {
+		// In lower storage the diagonal sits K elements into each stored row.
+		data = data[t.mat.K:]
+	}
+	diag := blas64.Vector{
+		N:    n,
+		Inc:  stride,
+		Data: data[:(n-1)*stride+1],
+	}
+	return &DiagDense{mat: diag}
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
+func (t *TriBandDense) Norm(norm float64) float64 {
+	if t.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	lnorm := normLapack(norm, false)
+	if lnorm == lapack.MaxColumnSum {
+		work := getFloat64s(t.mat.N, false)
+		defer putFloat64s(work)
+		return lapack64.Lantb(lnorm, t.mat, work)
+	}
+	return lapack64.Lantb(lnorm, t.mat, nil)
+}
+
+// Trace returns the trace of the matrix, the sum of its diagonal elements.
+//
+// Trace will panic with ErrZeroLength if the matrix has zero size.
+func (t *TriBandDense) Trace() float64 {
+	if t.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	rb := t.RawTriBand()
+	idx := 0
+	if rb.Uplo == blas.Lower {
+		idx = rb.K
+	}
+	var sum float64
+	for i := 0; i < rb.N; i, idx = i+1, idx+rb.Stride {
+		sum += rb.Data[idx]
+	}
+	return sum
+}
+
+// SolveTo solves a triangular system T * X = B  or  Tᵀ * X = B where T is an
+// n×n triangular band matrix represented by the receiver and B is a given
+// n×nrhs matrix. If T is non-singular, the result will be stored into dst and
+// nil will be returned. If T is singular, the contents of dst will be undefined
+// and a Condition error will be returned.
+func (t *TriBandDense) SolveTo(dst *Dense, trans bool, b Matrix) error {
+	n, nrhs := b.Dims()
+	if n != t.mat.N {
+		panic(ErrShape)
+	}
+
+	dst.reuseAsNonZeroed(n, nrhs)
+	bU, bTrans := untranspose(b)
+	if dst == bU {
+		if bTrans {
+			work := getDenseWorkspace(n, nrhs, false)
+			defer putDenseWorkspace(work)
+			work.Copy(b)
+			dst.Copy(work)
+		}
+	} else {
+		if rm, ok := bU.(RawMatrixer); ok {
+			dst.checkOverlap(rm.RawMatrix())
+		}
+		dst.Copy(b)
+	}
+
+	var ok bool
+	if trans {
+		ok = lapack64.Tbtrs(blas.Trans, t.mat, dst.mat)
+	} else {
+		ok = lapack64.Tbtrs(blas.NoTrans, t.mat, dst.mat)
+	}
+	if !ok {
+		return Condition(math.Inf(1))
+	}
+	return nil
+}
+
+// SolveVecTo solves a triangular system T * x = b  or  Tᵀ * x = b where T is an
+// n×n triangular band matrix represented by the receiver and b is a given
+// n-vector. If T is non-singular, the result will be stored into dst and nil
+// will be returned. If T is singular, the contents of dst will be undefined and
+// a Condition error will be returned.
+func (t *TriBandDense) SolveVecTo(dst *VecDense, trans bool, b Vector) error {
+	n, nrhs := b.Dims()
+	if n != t.mat.N || nrhs != 1 {
+		panic(ErrShape)
+	}
+	if b, ok := b.(RawVectorer); ok && dst != b {
+		dst.checkOverlap(b.RawVector())
+	}
+	dst.reuseAsNonZeroed(n)
+	if dst != b {
+		dst.CopyVec(b)
+	}
+	var ok bool
+	if trans {
+		ok = lapack64.Tbtrs(blas.Trans, t.mat, dst.asGeneral())
+	} else {
+		ok = lapack64.Tbtrs(blas.NoTrans, t.mat, dst.asGeneral())
+	}
+	if !ok {
+		return Condition(math.Inf(1))
+	}
+	return nil
+}
+
+func copySymBandIntoTriBand(dst *TriBandDense, s SymBanded) {
+	n, k, upper := dst.TriBand()
+	ns, ks := s.SymBand()
+	if n != ns {
+		panic("mat: triangle size mismatch")
+	}
+	if k != ks {
+		panic("mat: triangle bandwidth mismatch")
+	}
+	// Copy the band of s into dst. Only an upper dst is handled; see the panics below.
+	// TODO(vladimir-ch): implement the missing cases below as needed.
+	t := dst.mat
+	sU, _ := untransposeExtract(s)
+	if sbd, ok := sU.(*SymBandDense); ok {
+		s := sbd.RawSymBand()
+		if upper {
+			if s.Uplo == blas.Upper {
+				// dst is upper triangular, s is stored in upper triangle.
+				for i := 0; i < n; i++ {
+					ilen := min(k+1, n-i)
+					copy(t.Data[i*t.Stride:i*t.Stride+ilen], s.Data[i*s.Stride:i*s.Stride+ilen])
+				}
+			} else {
+				// dst is upper triangular, s is stored in lower triangle.
+				//
+				// The following is a possible implementation for this case but
+				// is commented out due to lack of test coverage.
+				// for i := 0; i < n; i++ {
+				//  ilen := min(k+1, n-i)
+				//  for j := 0; j < ilen; j++ {
+				//      t.Data[i*t.Stride+j] = s.Data[(i+j)*s.Stride+k-j]
+				//  }
+				// }
+				panic("not implemented")
+			}
+		} else {
+			if s.Uplo == blas.Upper {
+				// dst is lower triangular, s is stored in upper triangle.
+				panic("not implemented")
+			} else {
+				// dst is lower triangular, s is stored in lower triangle.
+				panic("not implemented")
+			}
+		}
+		return
+	}
+	if upper {
+		for i := 0; i < n; i++ {
+			ilen := min(k+1, n-i)
+			for j := 0; j < ilen; j++ {
+				t.Data[i*t.Stride+j] = s.At(i, i+j)
+			}
+		}
+	} else {
+		panic("not implemented")
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/tridiag.go b/vendor/gonum.org/v1/gonum/mat/tridiag.go
new file mode 100644
index 00000000000..c001d486314
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/tridiag.go
@@ -0,0 +1,417 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/internal/asm/f64"
+	"gonum.org/v1/gonum/lapack/lapack64"
+)
+
+// Compile-time assertions that *Tridiag satisfies the interfaces listed
+// below. tridiagDense is a nil pointer used only for these checks.
+var (
+	tridiagDense *Tridiag
+	_            Matrix           = tridiagDense
+	_            allMatrix        = tridiagDense
+	_            denseMatrix      = tridiagDense
+	_            Banded           = tridiagDense
+	_            MutableBanded    = tridiagDense
+	_            RawTridiagonaler = tridiagDense
+)
+
+// A RawTridiagonaler can return a lapack64.Tridiagonal representation of the
+// receiver. Changes to the elements of the DL, D and DU slices of the returned
+// lapack64.Tridiagonal will be reflected in the original matrix; changes to
+// its N field will not.
+type RawTridiagonaler interface {
+	RawTridiagonal() lapack64.Tridiagonal
+}
+
+// Tridiag represents a tridiagonal matrix by its three diagonals.
+type Tridiag struct {
+	// mat holds the order N of the matrix together with its sub-diagonal DL,
+	// main diagonal D and super-diagonal DU.
+	mat lapack64.Tridiagonal
+}
+
+// NewTridiag returns an n×n tridiagonal matrix whose first sub-diagonal is dl,
+// whose main diagonal is d and whose first super-diagonal is du.
+//
+// When dl, d and du are all nil, new backing slices are allocated. Otherwise d
+// must have length n and dl and du must have length n-1; they are then used
+// directly as backing slices, so changes to the elements of the returned
+// Tridiag are reflected in dl, d and du. Any other combination of lengths
+// panics with ErrShape. NewTridiag panics with ErrZeroLength if n is zero and
+// with ErrNegativeDimension if n is negative.
+func NewTridiag(n int, dl, d, du []float64) *Tridiag {
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0:
+		panic(ErrNegativeDimension)
+	}
+	if dl == nil && d == nil && du == nil {
+		// Nothing was supplied: allocate the diagonals. A 1×1 matrix has no
+		// off-diagonal elements, so dl and du stay nil in that case.
+		d = make([]float64, n)
+		if n > 1 {
+			dl = make([]float64, n-1)
+			du = make([]float64, n-1)
+		}
+	} else if len(dl) != n-1 || len(d) != n || len(du) != n-1 {
+		panic(ErrShape)
+	}
+	return &Tridiag{mat: lapack64.Tridiagonal{N: n, DL: dl, D: d, DU: du}}
+}
+
+// Dims returns the number of rows and columns in the matrix. Both are equal
+// to the order of the matrix.
+func (a *Tridiag) Dims() (r, c int) {
+	return a.mat.N, a.mat.N
+}
+
+// Bandwidth returns the lower and upper bandwidths of the matrix, kl and ku.
+// Both are always 1.
+func (a *Tridiag) Bandwidth() (kl, ku int) {
+	return 1, 1
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// Transpose. No data is copied.
+func (a *Tridiag) T() Matrix {
+	// An alternative would be to return the receiver with DL,DU swapped; the
+	// untranspose function would then always return false. With Transpose the
+	// diagonal swapping will be done in tridiagonal routines in lapack like
+	// lapack64.Gtsv or gonum.Dlagtm based on the trans parameter.
+	return Transpose{a}
+}
+
+// TBand performs an implicit transpose by returning the receiver inside a
+// TransposeBand. No data is copied.
+func (a *Tridiag) TBand() Banded {
+	// An alternative would be to return the receiver with DL,DU swapped; see
+	// explanation in T above.
+	return TransposeBand{a}
+}
+
+// RawTridiagonal returns the underlying lapack64.Tridiagonal used by the
+// receiver. The struct is returned by value but its DL, D and DU slices are
+// the receiver's own, so changes to elements in the receiver following the
+// call will be reflected in the returned matrix.
+func (a *Tridiag) RawTridiagonal() lapack64.Tridiagonal {
+	return a.mat
+}
+
+// SetRawTridiagonal sets the underlying lapack64.Tridiagonal used by the
+// receiver. Changes to elements in the receiver following the call will be
+// reflected in the input. The input is stored as given; no validation of N
+// or of the slice lengths is performed here.
+func (a *Tridiag) SetRawTridiagonal(mat lapack64.Tridiagonal) {
+	a.mat = mat
+}
+
+// IsEmpty returns whether the receiver is empty, that is, whether its order
+// is zero. Empty matrices can be the receiver for size-restricted operations.
+// The receiver can be emptied using Reset.
+func (a *Tridiag) IsEmpty() bool {
+	return a.mat.N == 0
+}
+
+// Reset empties the matrix so that it can be reused as the receiver of a
+// dimensionally restricted operation. The order is set to zero and the three
+// diagonals are truncated to zero length, keeping their backing arrays.
+//
+// Reset should not be used when the matrix shares backing data. See the Reseter
+// interface for more information.
+func (a *Tridiag) Reset() {
+	m := &a.mat
+	m.N = 0
+	m.DL, m.D, m.DU = m.DL[:0], m.D[:0], m.DU[:0]
+}
+
+// CloneFromTridiag makes a copy of the input Tridiag into the receiver,
+// overwriting the previous value of the receiver. CloneFromTridiag does not
+// place any restrictions on receiver shape.
+//
+// CloneFromTridiag panics with ErrZeroLength if from is empty.
+func (a *Tridiag) CloneFromTridiag(from *Tridiag) {
+	n := from.mat.N
+	switch n {
+	case 0:
+		panic(ErrZeroLength)
+	case 1:
+		// A 1×1 matrix has only a main diagonal; DL and DU are sized to zero.
+		a.mat = lapack64.Tridiagonal{
+			N:  1,
+			DL: use(a.mat.DL, 0),
+			D:  use(a.mat.D, 1),
+			DU: use(a.mat.DU, 0),
+		}
+		a.mat.D[0] = from.mat.D[0]
+	default:
+		// The receiver's existing slices are handed to use to obtain slices
+		// of the required lengths, then the diagonals are copied over.
+		a.mat = lapack64.Tridiagonal{
+			N:  n,
+			DL: use(a.mat.DL, n-1),
+			D:  use(a.mat.D, n),
+			DU: use(a.mat.DU, n-1),
+		}
+		copy(a.mat.DL, from.mat.DL)
+		copy(a.mat.D, from.mat.D)
+		copy(a.mat.DU, from.mat.DU)
+	}
+}
+
+// DiagView returns the main diagonal as a Diagonal matrix that shares its
+// backing data with the receiver.
+func (a *Tridiag) DiagView() Diagonal {
+	n := a.mat.N
+	diag := blas64.Vector{N: n, Data: a.mat.D[:n], Inc: 1}
+	return &DiagDense{mat: diag}
+}
+
+// Zero sets all of the matrix elements to zero by zeroing the sub-diagonal,
+// the main diagonal and the super-diagonal.
+func (a *Tridiag) Zero() {
+	zero(a.mat.DL)
+	zero(a.mat.D)
+	zero(a.mat.DU)
+}
+
+// Trace returns the trace of the matrix, the sum of the elements on its main
+// diagonal.
+//
+// Trace will panic with ErrZeroLength if the matrix has zero size.
+func (a *Tridiag) Trace() float64 {
+	if a.mat.N == 0 {
+		panic(ErrZeroLength)
+	}
+	diag := a.mat.D
+	return f64.Sum(diag)
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The maximum absolute column sum
+//	2 - The Frobenius norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum absolute row sum
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the matrix has zero size.
+func (a *Tridiag) Norm(norm float64) float64 {
+	if a.mat.N == 0 {
+		panic(ErrZeroLength)
+	}
+	// Translate the numeric norm order into the lapack norm kind.
+	kind := normLapack(norm, false)
+	return lapack64.Langt(kind, a.mat)
+}
+
+// MulVecTo computes A⋅x or Aᵀ⋅x storing the result into dst. A⋅x is computed
+// if trans is false and Aᵀ⋅x otherwise.
+//
+// MulVecTo panics with ErrShape if the length of x is not equal to the order
+// of the receiver. dst may be the same vector as x.
+func (a *Tridiag) MulVecTo(dst *VecDense, trans bool, x Vector) {
+	n := a.mat.N
+	if x.Len() != n {
+		panic(ErrShape)
+	}
+	dst.reuseAsNonZeroed(n)
+	t := blas.NoTrans
+	if trans {
+		t = blas.Trans
+	}
+	xMat, _ := untransposeExtract(x)
+	if xVec, ok := xMat.(*VecDense); ok && dst != xVec {
+		// x is a distinct *VecDense: use its storage directly.
+		dst.checkOverlap(xVec.mat)
+		lapack64.Lagtm(t, 1, a.mat, xVec.asGeneral(), 0, dst.asGeneral())
+	} else {
+		// x is either dst itself or not a *VecDense: multiply from a
+		// temporary copy so that the input is not read from dst's storage.
+		xCopy := getVecDenseWorkspace(n, false)
+		xCopy.CloneFromVec(x)
+		lapack64.Lagtm(t, 1, a.mat, xCopy.asGeneral(), 0, dst.asGeneral())
+		putVecDenseWorkspace(xCopy)
+	}
+}
+
+// SolveTo solves a tridiagonal system A⋅X = B  or  Aᵀ⋅X = B where A is an
+// n×n tridiagonal matrix represented by the receiver and B is a given n×nrhs
+// matrix. If A is non-singular, the result will be stored into dst and nil will
+// be returned. If A is singular, the contents of dst will be undefined and a
+// Condition error will be returned.
+//
+// The transposed system is solved if trans is true. SolveTo panics with
+// ErrShape if the number of rows of b is not equal to the order of the
+// receiver.
+func (a *Tridiag) SolveTo(dst *Dense, trans bool, b Matrix) error {
+	n, nrhs := b.Dims()
+	if n != a.mat.N {
+		panic(ErrShape)
+	}
+
+	// Place B into dst; the solver is then called on dst's storage.
+	dst.reuseAsNonZeroed(n, nrhs)
+	bU, bTrans := untranspose(b)
+	if dst == bU {
+		// b is dst or a transposed view of dst. Only the transposed view
+		// needs work: it is materialized through a temporary.
+		if bTrans {
+			work := getDenseWorkspace(n, nrhs, false)
+			defer putDenseWorkspace(work)
+			work.Copy(b)
+			dst.Copy(work)
+		}
+	} else {
+		if rm, ok := bU.(RawMatrixer); ok {
+			dst.checkOverlap(rm.RawMatrix())
+		}
+		dst.Copy(b)
+	}
+
+	// The solver is given a clone of the receiver rather than the receiver
+	// itself — presumably because Gtsv overwrites the diagonals it is given;
+	// confirm against the lapack64.Gtsv documentation.
+	var aCopy Tridiag
+	aCopy.CloneFromTridiag(a)
+	var ok bool
+	if trans {
+		ok = lapack64.Gtsv(blas.Trans, aCopy.mat, dst.mat)
+	} else {
+		ok = lapack64.Gtsv(blas.NoTrans, aCopy.mat, dst.mat)
+	}
+	if !ok {
+		return Condition(math.Inf(1))
+	}
+	return nil
+}
+
+// SolveVecTo solves a tridiagonal system A⋅x = b  or  Aᵀ⋅x = b where A is an
+// n×n tridiagonal matrix represented by the receiver and b is a given n-vector.
+// If A is non-singular, the result will be stored into dst and nil will be
+// returned. If A is singular, the contents of dst will be undefined and a
+// Condition error will be returned.
+//
+// The transposed system is solved if trans is true. SolveVecTo panics with
+// ErrShape if b does not have dimensions n×1.
+func (a *Tridiag) SolveVecTo(dst *VecDense, trans bool, b Vector) error {
+	n, nrhs := b.Dims()
+	if n != a.mat.N || nrhs != 1 {
+		panic(ErrShape)
+	}
+	if b, ok := b.(RawVectorer); ok && dst != b {
+		dst.checkOverlap(b.RawVector())
+	}
+	dst.reuseAsNonZeroed(n)
+	// Place b into dst unless it is already there.
+	if dst != b {
+		dst.CopyVec(b)
+	}
+	// The solver is given a clone of the receiver rather than the receiver
+	// itself — presumably because Gtsv overwrites the diagonals it is given;
+	// confirm against the lapack64.Gtsv documentation.
+	var aCopy Tridiag
+	aCopy.CloneFromTridiag(a)
+	var ok bool
+	if trans {
+		ok = lapack64.Gtsv(blas.Trans, aCopy.mat, dst.asGeneral())
+	} else {
+		ok = lapack64.Gtsv(blas.NoTrans, aCopy.mat, dst.asGeneral())
+	}
+	if !ok {
+		return Condition(math.Inf(1))
+	}
+	return nil
+}
+
+// DoNonZero calls the function fn for each of the non-zero elements of A. The
+// function fn takes a row/column index and the element value of A at (i,j).
+//
+// The super-diagonal is visited first, then the main diagonal, then the
+// sub-diagonal.
+func (a *Tridiag) DoNonZero(fn func(i, j int, v float64)) {
+	// Super-diagonal: element k lies at (k, k+1).
+	for k, v := range a.mat.DU {
+		if v == 0 {
+			continue
+		}
+		fn(k, k+1, v)
+	}
+	// Main diagonal: element k lies at (k, k).
+	for k, v := range a.mat.D {
+		if v == 0 {
+			continue
+		}
+		fn(k, k, v)
+	}
+	// Sub-diagonal: element k lies at (k+1, k).
+	for k, v := range a.mat.DL {
+		if v == 0 {
+			continue
+		}
+		fn(k+1, k, v)
+	}
+}
+
+// DoRowNonZero calls the function fn for each of the non-zero elements of row i
+// of A, in order of increasing column. The function fn takes a row/column index
+// and the element value of A at (i,j).
+//
+// DoRowNonZero panics with ErrRowAccess if i is out of range.
+func (a *Tridiag) DoRowNonZero(i int, fn func(i, j int, v float64)) {
+	n := a.mat.N
+	if uint(i) >= uint(n) {
+		panic(ErrRowAccess)
+	}
+	// Every row except the first has an element on the sub-diagonal.
+	if i > 0 {
+		if v := a.mat.DL[i-1]; v != 0 {
+			fn(i, i-1, v)
+		}
+	}
+	if v := a.mat.D[i]; v != 0 {
+		fn(i, i, v)
+	}
+	// Every row except the last has an element on the super-diagonal.
+	if i < n-1 {
+		if v := a.mat.DU[i]; v != 0 {
+			fn(i, i+1, v)
+		}
+	}
+}
+
+// DoColNonZero calls the function fn for each of the non-zero elements of
+// column j of A, in order of increasing row. The function fn takes a row/column
+// index and the element value of A at (i, j).
+//
+// DoColNonZero panics with ErrColAccess if j is out of range.
+func (a *Tridiag) DoColNonZero(j int, fn func(i, j int, v float64)) {
+	n := a.mat.N
+	if uint(j) >= uint(n) {
+		panic(ErrColAccess)
+	}
+	// Every column except the first has an element on the super-diagonal.
+	if j > 0 {
+		if v := a.mat.DU[j-1]; v != 0 {
+			fn(j-1, j, v)
+		}
+	}
+	if v := a.mat.D[j]; v != 0 {
+		fn(j, j, v)
+	}
+	// Every column except the last has an element on the sub-diagonal.
+	if j < n-1 {
+		if v := a.mat.DL[j]; v != 0 {
+			fn(j+1, j, v)
+		}
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/mat/vector.go b/vendor/gonum.org/v1/gonum/mat/vector.go
new file mode 100644
index 00000000000..5c5d3ff7496
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/mat/vector.go
@@ -0,0 +1,855 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/blas64"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// Compile-time assertions that *VecDense satisfies the interfaces listed
+// below. vector is a nil pointer used only for these checks.
+var (
+	vector *VecDense
+
+	_ Matrix        = vector
+	_ allMatrix     = vector
+	_ Vector        = vector
+	_ Reseter       = vector
+	_ MutableVector = vector
+)
+
+// Vector is a Matrix that can additionally be addressed with a single index:
+// AtVec returns the element at a given position and Len returns the number of
+// elements.
+type Vector interface {
+	Matrix
+	AtVec(int) float64
+	Len() int
+}
+
+// A MutableVector is a Vector whose elements can be set: SetVec stores v at
+// position i.
+type MutableVector interface {
+	Vector
+	SetVec(i int, v float64)
+}
+
+// TransposeVec is a type for performing an implicit transpose of a Vector.
+// It implements the Vector interface, returning values from the transpose
+// of the vector within.
+type TransposeVec struct {
+	// Vector is the vector being transposed.
+	Vector Vector
+}
+
+// At returns the value of the element at row i and column j of the transposed
+// matrix, that is, row j and column i of the Vector field.
+func (t TransposeVec) At(i, j int) float64 {
+	return t.Vector.At(j, i)
+}
+
+// AtVec returns the element at position i of the Vector field; transposition
+// does not change the position of an element. It panics if i is out of bounds.
+func (t TransposeVec) AtVec(i int) float64 {
+	return t.Vector.AtVec(i)
+}
+
+// Dims returns the dimensions of the transposed vector, that is, the
+// dimensions of the Vector field with rows and columns exchanged.
+func (t TransposeVec) Dims() (r, c int) {
+	c, r = t.Vector.Dims()
+	return r, c
+}
+
+// T performs an implicit transpose by returning the Vector field.
+func (t TransposeVec) T() Matrix {
+	return t.Vector
+}
+
+// Len returns the number of columns in the vector, which is the length of
+// the Vector field.
+func (t TransposeVec) Len() int {
+	return t.Vector.Len()
+}
+
+// TVec performs an implicit transpose by returning the Vector field.
+func (t TransposeVec) TVec() Vector {
+	return t.Vector
+}
+
+// Untranspose returns the Vector field.
+func (t TransposeVec) Untranspose() Matrix {
+	return t.Vector
+}
+
+// UntransposeVec returns the Vector field as a Vector.
+func (t TransposeVec) UntransposeVec() Vector {
+	return t.Vector
+}
+
+// VecDense represents a column vector.
+type VecDense struct {
+	// mat holds the length N, the increment Inc between consecutive elements
+	// and the backing Data slice.
+	mat blas64.Vector
+	// A BLAS vector can have a negative increment, but allowing this
+	// in the mat type complicates a lot of code, and doesn't gain anything.
+	// VecDense must have positive increment in this package.
+}
+
+// NewVecDense returns a VecDense of length n with unit increment.
+//
+// If data is nil a new backing slice is allocated. If len(data) == n, data is
+// used as the backing slice, so changes to the elements of the returned
+// VecDense are reflected in data. Any other length panics with ErrShape.
+// NewVecDense panics with ErrZeroLength if n is zero and panics if n is
+// negative.
+func NewVecDense(n int, data []float64) *VecDense {
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0:
+		panic("mat: negative dimension")
+	}
+	if data == nil {
+		data = make([]float64, n)
+	} else if len(data) != n {
+		panic(ErrShape)
+	}
+	return &VecDense{mat: blas64.Vector{N: n, Inc: 1, Data: data}}
+}
+
+// SliceVec returns a new Vector that shares backing data with the receiver.
+// The returned vector starts at element i of the receiver and extends k-i
+// elements. SliceVec panics with ErrIndexOutOfRange if i is negative, if k is
+// not greater than i, or if the slice is outside the capacity of the receiver.
+func (v *VecDense) SliceVec(i, k int) Vector {
+	return v.sliceVec(i, k)
+}
+
+// sliceVec is the implementation of SliceVec returning the concrete type. It
+// returns a view of elements i up to, but not including, k of the receiver and
+// panics with ErrIndexOutOfRange if i < 0, k <= i or k exceeds the capacity of
+// the receiver.
+func (v *VecDense) sliceVec(i, k int) *VecDense {
+	if i < 0 || k <= i || v.Cap() < k {
+		panic(ErrIndexOutOfRange)
+	}
+	return &VecDense{
+		mat: blas64.Vector{
+			N:   k - i,
+			Inc: v.mat.Inc,
+			// Element j of the receiver lives at Data[j*Inc]; the view ends
+			// just after its last element, at index (k-1)*Inc.
+			Data: v.mat.Data[i*v.mat.Inc : (k-1)*v.mat.Inc+1],
+		},
+	}
+}
+
+// Dims returns the number of rows and columns in the matrix. Columns is always 1
+// for a non-Reset vector; an empty vector reports 0, 0.
+func (v *VecDense) Dims() (r, c int) {
+	if v.IsEmpty() {
+		return 0, 0
+	}
+	return v.mat.N, 1
+}
+
+// Caps returns the number of rows and columns in the backing matrix. Columns is always 1
+// for a non-Reset vector; an empty vector reports 0, 0.
+func (v *VecDense) Caps() (r, c int) {
+	if v.IsEmpty() {
+		return 0, 0
+	}
+	return v.Cap(), 1
+}
+
+// Len returns the length of the vector, the number of elements it holds.
+func (v *VecDense) Len() int {
+	return v.mat.N
+}
+
+// Cap returns the capacity of the vector, the number of elements that fit in
+// the backing slice at the vector's increment. It is zero for an empty vector.
+func (v *VecDense) Cap() int {
+	if v.IsEmpty() {
+		return 0
+	}
+	// Elements sit Inc apart starting at index 0, so the last usable index
+	// cap-1 bounds the element count.
+	return (cap(v.mat.Data)-1)/v.mat.Inc + 1
+}
+
+// T performs an implicit transpose by returning the receiver inside a
+// Transpose. No data is copied.
+func (v *VecDense) T() Matrix {
+	return Transpose{v}
+}
+
+// TVec performs an implicit transpose by returning the receiver inside a
+// TransposeVec. No data is copied.
+func (v *VecDense) TVec() Vector {
+	return TransposeVec{v}
+}
+
+// Reset empties the matrix so that it can be reused as the
+// receiver of a dimensionally restricted operation. The backing
+// slice is truncated to zero length, keeping its backing array.
+//
+// Reset should not be used when the matrix shares backing data.
+// See the Reseter interface for more information.
+func (v *VecDense) Reset() {
+	// No change of Inc or N to 0 may be
+	// made unless both are set to 0.
+	// IsEmpty tests Inc alone, so the two must stay consistent.
+	v.mat.Inc = 0
+	v.mat.N = 0
+	v.mat.Data = v.mat.Data[:0]
+}
+
+// Zero sets all of the vector elements to zero. Only the elements of the
+// vector are written; with a non-unit increment the slots of the backing
+// slice lying between them are left untouched.
+func (v *VecDense) Zero() {
+	for k, idx := 0, 0; k < v.mat.N; k, idx = k+1, idx+v.mat.Inc {
+		v.mat.Data[idx] = 0
+	}
+}
+
+// CloneFromVec makes a copy of a into the receiver, overwriting the previous value
+// of the receiver. The receiver takes the length of a and a unit increment.
+// If a is the receiver itself, CloneFromVec does nothing.
+func (v *VecDense) CloneFromVec(a Vector) {
+	if v == a {
+		return
+	}
+	n := a.Len()
+	v.mat = blas64.Vector{
+		N:    n,
+		Inc:  1,
+		Data: use(v.mat.Data, n),
+	}
+	// Copy through BLAS when the raw representation of a is available,
+	// otherwise element by element.
+	if r, ok := a.(RawVectorer); ok {
+		blas64.Copy(r.RawVector(), v.mat)
+		return
+	}
+	for i := 0; i < a.Len(); i++ {
+		v.setVec(i, a.AtVec(i))
+	}
+}
+
+// VecDenseCopyOf returns a newly allocated copy of the elements of a.
+func VecDenseCopyOf(a Vector) *VecDense {
+	v := &VecDense{}
+	v.CloneFromVec(a)
+	return v
+}
+
+// RawVector returns the underlying blas64.Vector used by the receiver.
+// The struct is returned by value but its Data slice is the receiver's own,
+// so changes to elements in the receiver following the call will be
+// reflected in the returned blas64.Vector.
+func (v *VecDense) RawVector() blas64.Vector {
+	return v.mat
+}
+
+// SetRawVector sets the underlying blas64.Vector used by the receiver.
+// Changes to elements in the receiver following the call will be reflected
+// in the input. The input is stored as given; no validation of its fields
+// is performed here.
+func (v *VecDense) SetRawVector(a blas64.Vector) {
+	v.mat = a
+}
+
+// CopyVec makes a copy of elements of a into the receiver. It is similar to the
+// built-in copy; it copies as much as the overlap between the two vectors and
+// returns the number of elements it copied. The receiver is never resized.
+func (v *VecDense) CopyVec(a Vector) int {
+	n := min(v.Len(), a.Len())
+	if v == a {
+		return n
+	}
+	if r, ok := a.(RawVectorer); ok {
+		// Restrict local copies of both headers to the common length so
+		// that exactly n elements are copied.
+		src := r.RawVector()
+		src.N = n
+		dst := v.mat
+		dst.N = n
+		blas64.Copy(src, dst)
+		return n
+	}
+	for i := 0; i < n; i++ {
+		v.setVec(i, a.AtVec(i))
+	}
+	return n
+}
+
+// Norm returns the specified norm of the receiver. Valid norms are:
+//
+//	1 - The sum of the element magnitudes
+//	2 - The Euclidean norm, the square root of the sum of the squares of the elements
+//	Inf - The maximum element magnitude
+//
+// Norm will panic with ErrNormOrder if an illegal norm is specified and with
+// ErrZeroLength if the vector has zero size.
+func (v *VecDense) Norm(norm float64) float64 {
+	if v.IsEmpty() {
+		panic(ErrZeroLength)
+	}
+	switch norm {
+	case 1:
+		return blas64.Asum(v.mat)
+	case 2:
+		return blas64.Nrm2(v.mat)
+	case math.Inf(1):
+		// Locate the element of largest magnitude and return its magnitude.
+		largest := v.at(blas64.Iamax(v.mat))
+		return math.Abs(largest)
+	default:
+		panic(ErrNormOrder)
+	}
+}
+
+// ScaleVec scales the vector a by alpha, placing the result in the receiver.
+// If a is the receiver the scaling is done in place.
+func (v *VecDense) ScaleVec(alpha float64, a Vector) {
+	n := a.Len()
+
+	if v == a {
+		// In-place scaling; no resizing or overlap check is needed.
+		if v.mat.Inc == 1 {
+			f64.ScalUnitary(alpha, v.mat.Data)
+			return
+		}
+		f64.ScalInc(alpha, v.mat.Data, uintptr(n), uintptr(v.mat.Inc))
+		return
+	}
+
+	v.reuseAsNonZeroed(n)
+
+	if rv, ok := a.(RawVectorer); ok {
+		// Raw data is available: use the assembly kernels, choosing the
+		// unit-increment variant when both vectors are contiguous.
+		mat := rv.RawVector()
+		v.checkOverlap(mat)
+		if v.mat.Inc == 1 && mat.Inc == 1 {
+			f64.ScalUnitaryTo(v.mat.Data, alpha, mat.Data)
+			return
+		}
+		f64.ScalIncTo(v.mat.Data, uintptr(v.mat.Inc),
+			alpha, mat.Data, uintptr(n), uintptr(mat.Inc))
+		return
+	}
+
+	// Generic fallback through the Vector interface.
+	for i := 0; i < n; i++ {
+		v.setVec(i, alpha*a.AtVec(i))
+	}
+}
+
+// AddScaledVec adds the vectors a and alpha*b, placing the result in the receiver.
+// It panics with ErrShape if a and b differ in length. The receiver may be
+// the same vector as a, b or both.
+func (v *VecDense) AddScaledVec(a Vector, alpha float64, b Vector) {
+	// alpha of ±1 reduces to a plain addition or subtraction.
+	if alpha == 1 {
+		v.AddVec(a, b)
+		return
+	}
+	if alpha == -1 {
+		v.SubVec(a, b)
+		return
+	}
+
+	ar := a.Len()
+	br := b.Len()
+
+	if ar != br {
+		panic(ErrShape)
+	}
+
+	// fast records whether both operands expose *VecDense storage, which
+	// the assembly kernels below require.
+	var amat, bmat blas64.Vector
+	fast := true
+	aU, _ := untransposeExtract(a)
+	if rv, ok := aU.(*VecDense); ok {
+		amat = rv.mat
+		if v != a {
+			v.checkOverlap(amat)
+		}
+	} else {
+		fast = false
+	}
+	bU, _ := untransposeExtract(b)
+	if rv, ok := bU.(*VecDense); ok {
+		bmat = rv.mat
+		if v != b {
+			v.checkOverlap(bmat)
+		}
+	} else {
+		fast = false
+	}
+
+	v.reuseAsNonZeroed(ar)
+
+	switch {
+	case alpha == 0: // v <- a
+		if v == a {
+			return
+		}
+		v.CopyVec(a)
+	case v == a && v == b: // v <- v + alpha * v = (alpha + 1) * v
+		blas64.Scal(alpha+1, v.mat)
+	case !fast: // v <- a + alpha * b without blas64 support.
+		for i := 0; i < ar; i++ {
+			v.setVec(i, a.AtVec(i)+alpha*b.AtVec(i))
+		}
+	case v == a && v != b: // v <- v + alpha * b
+		if v.mat.Inc == 1 && bmat.Inc == 1 {
+			// Fast path for a common case.
+			f64.AxpyUnitaryTo(v.mat.Data, alpha, bmat.Data, amat.Data)
+		} else {
+			f64.AxpyInc(alpha, bmat.Data, v.mat.Data,
+				uintptr(ar), uintptr(bmat.Inc), uintptr(v.mat.Inc), 0, 0)
+		}
+	default: // v <- a + alpha * b or v <- a + alpha * v
+		if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 {
+			// Fast path for a common case.
+			f64.AxpyUnitaryTo(v.mat.Data, alpha, bmat.Data, amat.Data)
+		} else {
+			f64.AxpyIncTo(v.mat.Data, uintptr(v.mat.Inc), 0,
+				alpha, bmat.Data, amat.Data,
+				uintptr(ar), uintptr(bmat.Inc), uintptr(amat.Inc), 0, 0)
+		}
+	}
+}
+
+// AddVec adds the vectors a and b, placing the result in the receiver.
+// It panics with ErrShape if a and b differ in length. The receiver may be
+// the same vector as a or b.
+func (v *VecDense) AddVec(a, b Vector) {
+	ar := a.Len()
+	br := b.Len()
+
+	if ar != br {
+		panic(ErrShape)
+	}
+
+	v.reuseAsNonZeroed(ar)
+
+	aU, _ := untransposeExtract(a)
+	bU, _ := untransposeExtract(b)
+
+	// Use the assembly kernels when both operands are backed by *VecDense.
+	if arv, ok := aU.(*VecDense); ok {
+		if brv, ok := bU.(*VecDense); ok {
+			amat := arv.mat
+			bmat := brv.mat
+
+			if v != a {
+				v.checkOverlap(amat)
+			}
+			if v != b {
+				v.checkOverlap(bmat)
+			}
+
+			if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 {
+				// Fast path for a common case.
+				f64.AxpyUnitaryTo(v.mat.Data, 1, bmat.Data, amat.Data)
+				return
+			}
+			f64.AxpyIncTo(v.mat.Data, uintptr(v.mat.Inc), 0,
+				1, bmat.Data, amat.Data,
+				uintptr(ar), uintptr(bmat.Inc), uintptr(amat.Inc), 0, 0)
+			return
+		}
+	}
+
+	// Generic fallback through the Vector interface.
+	for i := 0; i < ar; i++ {
+		v.setVec(i, a.AtVec(i)+b.AtVec(i))
+	}
+}
+
+// SubVec subtracts the vector b from a, placing the result in the receiver.
+// It panics with ErrShape if a and b differ in length. The receiver may be
+// the same vector as a or b.
+func (v *VecDense) SubVec(a, b Vector) {
+	ar := a.Len()
+	br := b.Len()
+
+	if ar != br {
+		panic(ErrShape)
+	}
+
+	v.reuseAsNonZeroed(ar)
+
+	aU, _ := untransposeExtract(a)
+	bU, _ := untransposeExtract(b)
+
+	// Use the assembly kernels when both operands are backed by *VecDense;
+	// the subtraction is expressed as a + (-1)*b.
+	if arv, ok := aU.(*VecDense); ok {
+		if brv, ok := bU.(*VecDense); ok {
+			amat := arv.mat
+			bmat := brv.mat
+
+			if v != a {
+				v.checkOverlap(amat)
+			}
+			if v != b {
+				v.checkOverlap(bmat)
+			}
+
+			if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 {
+				// Fast path for a common case.
+				f64.AxpyUnitaryTo(v.mat.Data, -1, bmat.Data, amat.Data)
+				return
+			}
+			f64.AxpyIncTo(v.mat.Data, uintptr(v.mat.Inc), 0,
+				-1, bmat.Data, amat.Data,
+				uintptr(ar), uintptr(bmat.Inc), uintptr(amat.Inc), 0, 0)
+			return
+		}
+	}
+
+	// Generic fallback through the Vector interface.
+	for i := 0; i < ar; i++ {
+		v.setVec(i, a.AtVec(i)-b.AtVec(i))
+	}
+}
+
+// MulElemVec performs element-wise multiplication of a and b, placing the result
+// in the receiver. It panics with ErrShape if a and b differ in length. The
+// receiver may be the same vector as a or b.
+func (v *VecDense) MulElemVec(a, b Vector) {
+	ar := a.Len()
+	br := b.Len()
+
+	if ar != br {
+		panic(ErrShape)
+	}
+
+	v.reuseAsNonZeroed(ar)
+
+	aU, _ := untransposeExtract(a)
+	bU, _ := untransposeExtract(b)
+
+	// Work directly on the backing slices when both operands are backed by
+	// *VecDense.
+	if arv, ok := aU.(*VecDense); ok {
+		if brv, ok := bU.(*VecDense); ok {
+			amat := arv.mat
+			bmat := brv.mat
+
+			if v != a {
+				v.checkOverlap(amat)
+			}
+			if v != b {
+				v.checkOverlap(bmat)
+			}
+
+			if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 {
+				// Fast path for a common case.
+				for i, a := range amat.Data {
+					v.mat.Data[i] = a * bmat.Data[i]
+				}
+				return
+			}
+			// Strided operands: ia and ib track the positions of element i
+			// in the backing slices of a and b.
+			var ia, ib int
+			for i := 0; i < ar; i++ {
+				v.setVec(i, amat.Data[ia]*bmat.Data[ib])
+				ia += amat.Inc
+				ib += bmat.Inc
+			}
+			return
+		}
+	}
+
+	// Generic fallback through the Vector interface.
+	for i := 0; i < ar; i++ {
+		v.setVec(i, a.AtVec(i)*b.AtVec(i))
+	}
+}
+
+// DivElemVec performs element-wise division of a by b, placing the result
+// in the receiver. It panics with ErrShape if a and b differ in length. The
+// receiver may be the same vector as a or b, in which case the division is
+// done in place.
+func (v *VecDense) DivElemVec(a, b Vector) {
+	ar := a.Len()
+	br := b.Len()
+
+	if ar != br {
+		panic(ErrShape)
+	}
+
+	v.reuseAsNonZeroed(ar)
+
+	aU, _ := untransposeExtract(a)
+	bU, _ := untransposeExtract(b)
+
+	// Work directly on the backing slices when both operands are backed by
+	// *VecDense.
+	if arv, ok := aU.(*VecDense); ok {
+		if brv, ok := bU.(*VecDense); ok {
+			amat := arv.mat
+			bmat := brv.mat
+
+			if v != a {
+				v.checkOverlap(amat)
+			}
+			if v != b {
+				v.checkOverlap(bmat)
+			}
+
+			if v.mat.Inc == 1 && amat.Inc == 1 && bmat.Inc == 1 {
+				// Fast path for a common case.
+				for i, a := range amat.Data {
+					v.setVec(i, a/bmat.Data[i])
+				}
+				return
+			}
+			// Strided operands: ia and ib track the positions of element i
+			// in the backing slices of a and b.
+			var ia, ib int
+			for i := 0; i < ar; i++ {
+				v.setVec(i, amat.Data[ia]/bmat.Data[ib])
+				ia += amat.Inc
+				ib += bmat.Inc
+			}
+			// The result is complete. Falling through to the generic loop
+			// below would divide a second time, which corrupts the result
+			// when the receiver is also one of the operands.
+			return
+		}
+	}
+
+	// Generic fallback through the Vector interface.
+	for i := 0; i < ar; i++ {
+		v.setVec(i, a.AtVec(i)/b.AtVec(i))
+	}
+}
+
+// MulVec computes a * b. The result is stored into the receiver.
+// MulVec panics if the number of columns in a does not equal the number of rows in b
+// or if the number of columns in b does not equal 1.
+//
+// The receiver may be the same vector as a or b; the product is then computed
+// into a workspace and copied back into the receiver on return.
+func (v *VecDense) MulVec(a Matrix, b Vector) {
+	r, c := a.Dims()
+	br, bc := b.Dims()
+	if c != br || bc != 1 {
+		panic(ErrShape)
+	}
+
+	// fast records whether b exposes *VecDense storage, which the BLAS
+	// paths below require.
+	aU, trans := untransposeExtract(a)
+	var bmat blas64.Vector
+	fast := true
+	bU, _ := untransposeExtract(b)
+	if rv, ok := bU.(*VecDense); ok {
+		bmat = rv.mat
+		if v != b {
+			v.checkOverlap(bmat)
+		}
+	} else {
+		fast = false
+	}
+
+	v.reuseAsNonZeroed(r)
+	// When the receiver is one of the operands, redirect v to a workspace;
+	// the deferred restore copies the result back into the receiver.
+	var restore func()
+	if v == aU {
+		v, restore = v.isolatedWorkspace(aU.(*VecDense))
+		defer restore()
+	} else if v == b {
+		v, restore = v.isolatedWorkspace(b)
+		defer restore()
+	}
+
+	// TODO(kortschak): Improve the non-fast paths.
+	switch aU := aU.(type) {
+	case Vector:
+		if b.Len() == 1 {
+			// {n,1} x {1,1}
+			v.ScaleVec(b.AtVec(0), aU)
+			return
+		}
+
+		// {1,n} x {n,1}
+		if fast {
+			if rv, ok := aU.(*VecDense); ok {
+				amat := rv.mat
+				if v != aU {
+					v.checkOverlap(amat)
+				}
+
+				if amat.Inc == 1 && bmat.Inc == 1 {
+					// Fast path for a common case.
+					v.setVec(0, f64.DotUnitary(amat.Data, bmat.Data))
+					return
+				}
+				v.setVec(0, f64.DotInc(amat.Data, bmat.Data,
+					uintptr(c), uintptr(amat.Inc), uintptr(bmat.Inc), 0, 0))
+				return
+			}
+		}
+		var sum float64
+		for i := 0; i < c; i++ {
+			sum += aU.AtVec(i) * b.AtVec(i)
+		}
+		v.setVec(0, sum)
+		return
+	case *SymBandDense:
+		if fast {
+			aU.checkOverlap(v.asGeneral())
+			blas64.Sbmv(1, aU.mat, bmat, 0, v.mat)
+			return
+		}
+	case *SymDense:
+		if fast {
+			aU.checkOverlap(v.asGeneral())
+			blas64.Symv(1, aU.mat, bmat, 0, v.mat)
+			return
+		}
+	case *TriDense:
+		if fast {
+			// Trmv works in place, so b is first copied into v.
+			v.CopyVec(b)
+			aU.checkOverlap(v.asGeneral())
+			ta := blas.NoTrans
+			if trans {
+				ta = blas.Trans
+			}
+			blas64.Trmv(ta, aU.mat, v.mat)
+			return
+		}
+	case *Dense:
+		if fast {
+			aU.checkOverlap(v.asGeneral())
+			t := blas.NoTrans
+			if trans {
+				t = blas.Trans
+			}
+			blas64.Gemv(t, 1, aU.mat, bmat, 0, v.mat)
+			return
+		}
+	default:
+		if fast {
+			// a has no specialised path, but b's raw data can still be
+			// read directly.
+			for i := 0; i < r; i++ {
+				var f float64
+				for j := 0; j < c; j++ {
+					f += a.At(i, j) * bmat.Data[j*bmat.Inc]
+				}
+				v.setVec(i, f)
+			}
+			return
+		}
+	}
+
+	// Generic fallback through the Matrix and Vector interfaces.
+	for i := 0; i < r; i++ {
+		var f float64
+		for j := 0; j < c; j++ {
+			f += a.At(i, j) * b.AtVec(j)
+		}
+		v.setVec(i, f)
+	}
+}
+
+// ReuseAsVec changes the receiver, which must be empty, to be of size n×1.
+//
+// ReuseAsVec re-uses the backing data slice if it has sufficient capacity,
+// otherwise a new slice is allocated. The backing data is zero on return.
+//
+// ReuseAsVec panics with ErrZeroLength if n is zero, with
+// ErrNegativeDimension if n is negative and with ErrReuseNonEmpty if the
+// receiver is not empty. To empty the receiver for re-use, Reset should be
+// used.
+func (v *VecDense) ReuseAsVec(n int) {
+	switch {
+	case n == 0:
+		panic(ErrZeroLength)
+	case n < 0:
+		panic(ErrNegativeDimension)
+	case !v.IsEmpty():
+		panic(ErrReuseNonEmpty)
+	}
+	v.reuseAsZeroed(n)
+}
+
+// reuseAsNonZeroed resizes an empty vector to a r×1 vector,
+// or checks that a non-empty matrix is r×1. The elements are not
+// zeroed. It panics with ErrZeroLength if r is zero and with ErrShape
+// if a non-empty receiver does not have length r.
+func (v *VecDense) reuseAsNonZeroed(r int) {
+	// reuseAsNonZeroed must be kept in sync with reuseAsZeroed.
+	if r == 0 {
+		panic(ErrZeroLength)
+	}
+	if v.IsEmpty() {
+		v.mat = blas64.Vector{
+			N:    r,
+			Inc:  1,
+			Data: use(v.mat.Data, r),
+		}
+		return
+	}
+	if r != v.mat.N {
+		panic(ErrShape)
+	}
+}
+
+// reuseAsZeroed resizes an empty vector to a r×1 vector,
+// or checks that a non-empty matrix is r×1. In both cases the
+// elements are zero on return. It panics with ErrZeroLength if r is
+// zero and with ErrShape if a non-empty receiver does not have length r.
+func (v *VecDense) reuseAsZeroed(r int) {
+	// reuseAsZeroed must be kept in sync with reuseAsNonZeroed.
+	if r == 0 {
+		panic(ErrZeroLength)
+	}
+	if v.IsEmpty() {
+		v.mat = blas64.Vector{
+			N:    r,
+			Inc:  1,
+			Data: useZeroed(v.mat.Data, r),
+		}
+		return
+	}
+	if r != v.mat.N {
+		panic(ErrShape)
+	}
+	v.Zero()
+}
+
+// IsEmpty returns whether the receiver is empty. Empty matrices can be the
+// receiver for size-restricted operations. The receiver can be emptied using
+// Reset. A zero-value VecDense is empty.
+func (v *VecDense) IsEmpty() bool {
+	// It must be the case that v.Dims() returns
+	// zeros in this case. See comment in Reset().
+	return v.mat.Inc == 0
+}
+
+// isolatedWorkspace returns a workspace vector n with the length of a, and a
+// restore function that copies n into the receiver and returns n to the
+// workspace pool. It is used to compute into temporary storage when the
+// receiver is also an operand. It panics with ErrZeroLength if a has zero
+// length.
+func (v *VecDense) isolatedWorkspace(a Vector) (n *VecDense, restore func()) {
+	l := a.Len()
+	if l == 0 {
+		panic(ErrZeroLength)
+	}
+	n = getVecDenseWorkspace(l, false)
+	return n, func() {
+		v.CopyVec(n)
+		putVecDenseWorkspace(n)
+	}
+}
+
+// asDense returns a Dense representation of the receiver with the same
+// underlying data. The result has a single column and its row capacity is
+// set to the length of the receiver.
+func (v *VecDense) asDense() *Dense {
+	return &Dense{
+		mat:     v.asGeneral(),
+		capRows: v.mat.N,
+		capCols: 1,
+	}
+}
+
+// asGeneral returns a blas64.General representation of the receiver with the
+// same underlying data. The vector becomes a single column whose row stride
+// is the increment of the receiver.
+func (v *VecDense) asGeneral() blas64.General {
+	return blas64.General{
+		Rows:   v.mat.N,
+		Cols:   1,
+		Stride: v.mat.Inc,
+		Data:   v.mat.Data,
+	}
+}
+
+// ColViewOf reflects the column j of the RawMatrixer m, into the receiver
+// backed by the same underlying data. The receiver must either be empty or
+// have length equal to the number of rows of m.
+//
+// ColViewOf panics with ErrColAccess if j is out of range and with ErrShape
+// if a non-empty receiver has the wrong length.
+func (v *VecDense) ColViewOf(m RawMatrixer, j int) {
+	rm := m.RawMatrix()
+
+	if j >= rm.Cols || j < 0 {
+		panic(ErrColAccess)
+	}
+	if !v.IsEmpty() && v.mat.N != rm.Rows {
+		panic(ErrShape)
+	}
+
+	// Consecutive elements of a column are one row stride apart; the view
+	// runs from the column's element in the first row to just after its
+	// element in the last row.
+	v.mat.Inc = rm.Stride
+	v.mat.Data = rm.Data[j : (rm.Rows-1)*rm.Stride+j+1]
+	v.mat.N = rm.Rows
+}
+
+// RowViewOf reflects the row i of the RawMatrixer m, into the receiver
+// backed by the same underlying data. The receiver must either be
+// empty or have length equal to the number of columns of m.
+//
+// RowViewOf panics with ErrRowAccess if i is out of range and with ErrShape
+// if a non-empty receiver has the wrong length.
+func (v *VecDense) RowViewOf(m RawMatrixer, i int) {
+	rm := m.RawMatrix()
+
+	if i >= rm.Rows || i < 0 {
+		panic(ErrRowAccess)
+	}
+	if !v.IsEmpty() && v.mat.N != rm.Cols {
+		panic(ErrShape)
+	}
+
+	// The elements of a row are contiguous, starting at i*Stride.
+	v.mat.Inc = 1
+	v.mat.Data = rm.Data[i*rm.Stride : i*rm.Stride+rm.Cols]
+	v.mat.N = rm.Cols
+}
+
+// Permute rearranges the elements of the n-vector v in the receiver as
+// specified by the permutation p[0],p[1],...,p[n-1] of the integers 0,...,n-1.
+//
+// If inverse is false, the given permutation is applied:
+//
+//	v[p[i]] is moved to v[i] for i=0,1,...,n-1.
+//
+// If inverse is true, the inverse permutation is applied:
+//
+//	v[i] is moved to v[p[i]] for i=0,1,...,n-1.
+//
+// p must have length n, otherwise Permute will panic.
+func (v *VecDense) Permute(p []int, inverse bool) {
+	// The vector is viewed as an n×1 Dense sharing the same data, so
+	// permuting its rows permutes the elements of v in place.
+	v.asDense().PermuteRows(p, inverse)
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/README.md b/vendor/gonum.org/v1/gonum/stat/README.md
new file mode 100644
index 00000000000..7156dc50956
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/README.md
@@ -0,0 +1,6 @@
+# Gonum stat
+
+[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/stat)](https://pkg.go.dev/gonum.org/v1/gonum/stat)
+[![GoDoc](https://godocs.io/gonum.org/v1/gonum/stat?status.svg)](https://godocs.io/gonum.org/v1/gonum/stat)
+
+Package stat is a statistics package for the Go language.
diff --git a/vendor/gonum.org/v1/gonum/stat/doc.go b/vendor/gonum.org/v1/gonum/stat/doc.go
new file mode 100644
index 00000000000..d6916cb252f
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package stat provides generalized statistical functions.
+package stat // import "gonum.org/v1/gonum/stat"
diff --git a/vendor/gonum.org/v1/gonum/stat/pca_cca.go b/vendor/gonum.org/v1/gonum/stat/pca_cca.go
new file mode 100644
index 00000000000..25158ceaf3b
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/pca_cca.go
@@ -0,0 +1,324 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat
+
+import (
+	"errors"
+	"math"
+
+	"gonum.org/v1/gonum/floats"
+	"gonum.org/v1/gonum/mat"
+)
+
+// PC is a type for computing and extracting the principal components of a
+// matrix. The results of the principal components analysis are only valid
+// if the call to PrincipalComponents was successful.
+type PC struct {
+	// n and d are the dimensions of the matrix passed to the most recent
+	// call to PrincipalComponents (rows and columns respectively).
+	n, d    int
+	// weights holds a copy of the observation weights, stored only when
+	// the factorization succeeded.
+	weights []float64
+	// svd is the factorization of the centered input data. It is kept
+	// between calls and handed back to the factorization helper as its
+	// work argument.
+	svd     *mat.SVD
+	// ok records whether the most recent factorization succeeded.
+	ok      bool
+}
+
+// PrincipalComponents runs a weighted principal components analysis on the
+// n×d input matrix a, in which every row is an observation and every column
+// is a variable.
+//
+// The variables are centered, but their variance is not scaled.
+//
+// Observations are weighted by the weights slice. A nil weights slice gives
+// every observation a weight of one; otherwise len(weights) must equal the
+// number of observations or PrincipalComponents will panic.
+//
+// The returned value reports whether the analysis was successful. The
+// stored weights are only updated on success.
+func (c *PC) PrincipalComponents(a mat.Matrix, weights []float64) (ok bool) {
+	c.n, c.d = a.Dims()
+	if weights != nil && c.n != len(weights) {
+		panic("stat: len(weights) != observations")
+	}
+
+	c.svd, c.ok = svdFactorizeCentered(c.svd, a, weights)
+	if !c.ok {
+		return false
+	}
+	// Copy into the existing buffer so the caller's slice is not retained.
+	c.weights = append(c.weights[:0], weights...)
+	return true
+}
+
+// VectorsTo stores the component direction vectors of a principal components
+// analysis in dst, one vector per column of a d×min(n, d) matrix.
+//
+// An empty dst is resized to d×min(n, d). A non-empty dst must already be
+// d×min(n, d) or VectorsTo will panic with mat.ErrShape. VectorsTo also
+// panics if the receiver does not contain a successful PC.
+func (c *PC) VectorsTo(dst *mat.Dense) {
+	if !c.ok {
+		panic("stat: use of unsuccessful principal components analysis")
+	}
+
+	switch {
+	case dst.IsEmpty():
+		dst.ReuseAs(c.d, min(c.n, c.d))
+	default:
+		rows, cols := dst.Dims()
+		if rows != c.d || cols != min(c.n, c.d) {
+			panic(mat.ErrShape)
+		}
+	}
+	c.svd.VTo(dst)
+}
+
+// VarsTo returns the column variances of the principal component scores,
+// b * vecs, where b is a matrix with centered columns. Variances are returned
+// in descending order.
+//
+// If dst is not nil it is used to store the variances and returned.
+// VarsTo will panic if the receiver has not successfully performed a principal
+// components analysis or dst is not nil and the length of dst is not min(n, d).
+func (c *PC) VarsTo(dst []float64) []float64 {
+	if !c.ok {
+		panic("stat: use of unsuccessful principal components analysis")
+	}
+	if dst != nil && len(dst) != min(c.n, c.d) {
+		panic("stat: length of slice does not match analysis")
+	}
+
+	dst = c.svd.Values(dst)
+
+	// The weights buffer is reused between analyses, so after a weighted
+	// analysis followed by an unweighted one c.weights is empty but not
+	// nil. Test the length rather than nil-ness so that the unweighted
+	// normalization 1/(n-1) is still chosen in that case instead of
+	// 1/(0-1), which would flip the sign of every variance.
+	var f float64
+	if len(c.weights) == 0 {
+		f = 1 / float64(c.n-1)
+	} else {
+		f = 1 / (floats.Sum(c.weights) - 1)
+	}
+	// Each variance is the squared singular value scaled by f.
+	for i, v := range dst {
+		dst[i] = f * v * v
+	}
+	return dst
+}
+
+// min returns the smaller of the two integers a and b.
+func min(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}
+
+// CC is a type for computing the canonical correlations of a pair of matrices.
+// The results of the canonical correlation analysis are only valid
+// if the call to CanonicalCorrelations was successful.
+type CC struct {
+	// n is the number of observations used to
+	// construct the canonical correlations.
+	n int
+
+	// xd and yd are the numbers of columns of the x and y
+	// inputs respectively and are used for size checks.
+	xd, yd int
+
+	// x and y hold the factorizations of the centered x and y inputs;
+	// c holds the factorization of the canonical correlation matrix.
+	x, y, c *mat.SVD
+	// ok records whether the most recent factorization step succeeded.
+	ok      bool
+}
+
+// CanonicalCorrelations performs a canonical correlation analysis of the
+// input data x and y, columns of which should be interpretable as two sets
+// of measurements on the same observations (rows). These observations are
+// optionally weighted by weights. The result of the analysis is stored in
+// the receiver if the analysis is successful.
+//
+// Canonical correlation analysis finds associations between two sets of
+// variables on the same observations by finding linear combinations of the two
+// sphered datasets that maximize the correlation between them.
+//
+// Some notation: let Xc and Yc denote the centered input data matrices x
+// and y (column means subtracted from each column), let Sx and Sy denote the
+// sample covariance matrices within x and y respectively, and let Sxy denote
+// the covariance matrix between x and y. The sphered data can then be expressed
+// as Xc * Sx^{-1/2} and Yc * Sy^{-1/2} respectively, and the correlation matrix
+// between the sphered data is called the canonical correlation matrix,
+// Sx^{-1/2} * Sxy * Sy^{-1/2}. In cases where S^{-1/2} is ambiguous for some
+// covariance matrix S, S^{-1/2} is taken to be E * D^{-1/2} * Eᵀ where S can
+// be eigendecomposed as S = E * D * Eᵀ.
+//
+// The canonical correlations are the correlations between the corresponding
+// pairs of canonical variables and can be obtained with c.CorrsTo(nil).
+// Canonical variables can be obtained by projecting the sphered data into the
+// left and right eigenvectors of the canonical correlation matrix, and these
+// eigenvectors can be obtained with c.LeftTo(m, true) and c.RightTo(m, true)
+// respectively. The canonical variables can also be obtained directly from the
+// centered raw data by using the back-transformed eigenvectors which can be
+// obtained with c.LeftTo(m, false) and c.RightTo(m, false) respectively.
+//
+// The first pair of left and right eigenvectors of the canonical correlation
+// matrix can be interpreted as directions into which the respective sphered
+// data can be projected such that the correlation between the two projections
+// is maximized. The second pair and onwards solve the same optimization but
+// under the constraint that they are uncorrelated (orthogonal in sphered space)
+// to previous projections.
+//
+// CanonicalCorrelations will panic if the inputs x and y do not have the same
+// number of rows.
+//
+// The slice weights is used to weight the observations. If weights is nil, each
+// weight is considered to have a value of one, otherwise the length of weights
+// must match the number of observations (rows of both x and y) or
+// CanonicalCorrelations will panic.
+//
+// A non-nil error is returned when any of the three factorizations (of x,
+// of y, or of the canonical correlation matrix) fails.
+//
+// More details can be found at
+// https://en.wikipedia.org/wiki/Canonical_correlation
+// or in Chapter 3 of
+// Koch, Inge. Analysis of multivariate and high-dimensional data.
+// Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939
+func (c *CC) CanonicalCorrelations(x, y mat.Matrix, weights []float64) error {
+	var ny int
+	c.n, c.xd = x.Dims()
+	ny, c.yd = y.Dims()
+	if ny != c.n {
+		panic("stat: unequal number of observations")
+	}
+	if weights != nil && c.n != len(weights) {
+		panic("stat: len(weights) != observations")
+	}
+
+	// Center and factorize each input in turn, stopping at the first failure.
+	if c.x, c.ok = svdFactorizeCentered(c.x, x, weights); !c.ok {
+		return errors.New("stat: failed to factorize x")
+	}
+	if c.y, c.ok = svdFactorizeCentered(c.y, y, weights); !c.ok {
+		return errors.New("stat: failed to factorize y")
+	}
+	var xu, xv, yu, yv mat.Dense
+	c.x.UTo(&xu)
+	c.x.VTo(&xv)
+	c.y.UTo(&yu)
+	c.y.VTo(&yv)
+
+	// Build the canonical correlation matrix and factorize it, reusing
+	// the receiver's SVD when one is already present.
+	var ccor mat.Dense
+	ccor.Product(&xv, xu.T(), &yu, yv.T())
+	if c.c == nil {
+		c.c = new(mat.SVD)
+	}
+	if c.ok = c.c.Factorize(&ccor, mat.SVDThin); !c.ok {
+		return errors.New("stat: failed to factorize ccor")
+	}
+	return nil
+}
+
+// CorrsTo returns the canonical correlations, storing them in dst when dst
+// is not nil. CorrsTo will panic if dst is not nil and len(dst) does not
+// match the number of columns in the y input matrix, or if the receiver
+// does not contain a successful CC.
+func (c *CC) CorrsTo(dst []float64) []float64 {
+	switch {
+	case !c.ok:
+		panic("stat: canonical correlations missing or invalid")
+	case dst != nil && len(dst) != c.yd:
+		panic("stat: length of destination does not match input dimension")
+	}
+	return c.c.Values(dst)
+}
+
+// LeftTo stores in dst the left eigenvectors of the canonical correlation
+// matrix when spheredSpace is true. When spheredSpace is false the
+// eigenvectors are back-transformed to the original data space first.
+//
+// An empty dst is resized to xd×yd. A non-empty dst must already be xd×yd
+// or LeftTo will panic with mat.ErrShape. LeftTo also panics if the receiver
+// does not contain a successful CC or fewer than two observations were used.
+func (c *CC) LeftTo(dst *mat.Dense, spheredSpace bool) {
+	if c.n < 2 || !c.ok {
+		panic("stat: canonical correlations missing or invalid")
+	}
+
+	switch {
+	case dst.IsEmpty():
+		dst.ReuseAs(c.xd, c.yd)
+	default:
+		rows, cols := dst.Dims()
+		if rows != c.xd || cols != c.yd {
+			panic(mat.ErrShape)
+		}
+	}
+	c.c.UTo(dst)
+	if !spheredSpace {
+		// Back-transform using the right singular vectors of x with
+		// their columns scaled by the reciprocal square root of the
+		// singular values.
+		sigma := c.x.Values(nil)
+		vecs := &mat.Dense{}
+		c.x.VTo(vecs)
+
+		scaleColsReciSqrt(vecs, sigma)
+
+		dst.Product(vecs, vecs.T(), dst)
+		dst.Scale(math.Sqrt(float64(c.n-1)), dst)
+	}
+}
+
+// RightTo stores in dst the right eigenvectors of the canonical correlation
+// matrix when spheredSpace is true. When spheredSpace is false the
+// eigenvectors are back-transformed to the original data space first.
+//
+// An empty dst is resized to yd×yd. A non-empty dst must already be yd×yd
+// or RightTo will panic with mat.ErrShape. RightTo also panics if the receiver
+// does not contain a successful CC or fewer than two observations were used.
+func (c *CC) RightTo(dst *mat.Dense, spheredSpace bool) {
+	if c.n < 2 || !c.ok {
+		panic("stat: canonical correlations missing or invalid")
+	}
+
+	switch {
+	case dst.IsEmpty():
+		dst.ReuseAs(c.yd, c.yd)
+	default:
+		rows, cols := dst.Dims()
+		if rows != c.yd || cols != c.yd {
+			panic(mat.ErrShape)
+		}
+	}
+	c.c.VTo(dst)
+	if !spheredSpace {
+		// Back-transform using the right singular vectors of y with
+		// their columns scaled by the reciprocal square root of the
+		// singular values.
+		sigma := c.y.Values(nil)
+		vecs := &mat.Dense{}
+		c.y.VTo(vecs)
+
+		scaleColsReciSqrt(vecs, sigma)
+
+		dst.Product(vecs, vecs.T(), dst)
+		dst.Scale(math.Sqrt(float64(c.n-1)), dst)
+	}
+}
+
+// svdFactorizeCentered computes a thin SVD of m after subtracting the
+// weighted mean from each column and scaling row i by sqrt(weights[i]).
+// A nil weights slice leaves the rows unscaled. The factorization is stored
+// in work, which is allocated when nil, and returned together with the
+// success flag reported by Factorize.
+func svdFactorizeCentered(work *mat.SVD, m mat.Matrix, weights []float64) (svd *mat.SVD, ok bool) {
+	rows, cols := m.Dims()
+	centered := mat.NewDense(rows, cols, nil)
+	buf := make([]float64, rows)
+	for j := 0; j < cols; j++ {
+		mat.Col(buf, j, m)
+		mean := Mean(buf, weights)
+		floats.AddConst(-mean, buf)
+		centered.SetCol(j, buf)
+	}
+	for i := range weights {
+		floats.Scale(math.Sqrt(weights[i]), centered.RawRowView(i))
+	}
+	if work == nil {
+		work = new(mat.SVD)
+	}
+	return work, work.Factorize(centered, mat.SVDThin)
+}
+
+// scaleColsReciSqrt multiplies column j of cols by sqrt(1/vals[j]) in place.
+// It panics if cols is nil or if len(vals) differs from the number of
+// columns of cols.
+func scaleColsReciSqrt(cols *mat.Dense, vals []float64) {
+	if cols == nil {
+		panic("stat: input nil")
+	}
+	rows, nCols := cols.Dims()
+	if nCols != len(vals) {
+		panic("stat: input length mismatch")
+	}
+	buf := make([]float64, rows)
+	for j, val := range vals {
+		factor := math.Sqrt(1 / val)
+		mat.Col(buf, j, cols)
+		floats.Scale(factor, buf)
+		cols.SetCol(j, buf)
+	}
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/roc.go b/vendor/gonum.org/v1/gonum/stat/roc.go
new file mode 100644
index 00000000000..05c6b44d385
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/roc.go
@@ -0,0 +1,201 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat
+
+import (
+	"math"
+	"sort"
+)
+
+// ROC returns paired false positive rate (FPR) and true positive rate
+// (TPR) values corresponding to cutoff points on the receiver operator
+// characteristic (ROC) curve obtained when y is treated as a binary
+// classifier for classes with weights. The cutoff thresholds used to
+// calculate the ROC are returned in thresh such that tpr[i] and fpr[i]
+// are the true and false positive rates for y >= thresh[i]. The returned
+// slices are ordered by descending threshold.
+//
+// The input y and cutoffs must be sorted in ascending order, and values
+// in y must correspond to values in classes and weights, otherwise ROC
+// will panic. SortWeightedLabeled can be used to sort y together with
+// classes and weights.
+//
+// For a given cutoff value, observations corresponding to entries in y
+// greater than or equal to the cutoff value are classified as true, while
+// those less than the cutoff value are classified as false. These
+// assigned class labels are compared with the true values in the classes
+// slice and used to calculate the FPR and TPR.
+//
+// If weights is nil, all weights are treated as 1. If weights is not nil
+// it must have the same length as y and classes, otherwise ROC will panic.
+//
+// If cutoffs is nil or empty, all possible cutoffs are calculated,
+// resulting in fpr and tpr having length one greater than the number of
+// unique values in y. Otherwise fpr and tpr will be returned with the
+// same length as cutoffs. floats.Span can be used to generate equally
+// spaced cutoffs. A non-empty cutoffs slice is copied and not modified;
+// an empty, non-nil cutoffs with capacity of at least len(y)+1 has its
+// backing array reused for the returned thresholds.
+//
+// If y is empty, ROC returns nil for all three results.
+//
+// More details about ROC curves are available at
+// https://en.wikipedia.org/wiki/Receiver_operating_characteristic
+func ROC(cutoffs, y []float64, classes []bool, weights []float64) (tpr, fpr, thresh []float64) {
+	if len(y) != len(classes) {
+		panic("stat: slice length mismatch")
+	}
+	if weights != nil && len(y) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	if !sort.Float64sAreSorted(y) {
+		panic("stat: input must be sorted ascending")
+	}
+	if !sort.Float64sAreSorted(cutoffs) {
+		panic("stat: cutoff values must be sorted ascending")
+	}
+	if len(y) == 0 {
+		return nil, nil, nil
+	}
+	if len(cutoffs) == 0 {
+		// Make room for one cutoff per element of y plus a final +Inf,
+		// reusing the caller's backing array when it is large enough.
+		if cutoffs == nil || cap(cutoffs) < len(y)+1 {
+			cutoffs = make([]float64, len(y)+1)
+		} else {
+			cutoffs = cutoffs[:len(y)+1]
+		}
+		// Choose all possible cutoffs for unique values in y.
+		bin := 0
+		cutoffs[bin] = y[0]
+		for i, u := range y[1:] {
+			// u is y[i+1]; skip it when it repeats the previous value.
+			if u == y[i] {
+				continue
+			}
+			bin++
+			cutoffs[bin] = u
+		}
+		// The trailing +Inf cutoff classifies every finite observation
+		// as false.
+		cutoffs[bin+1] = math.Inf(1)
+		cutoffs = cutoffs[:bin+2]
+	} else {
+		// Don't mutate the provided cutoffs.
+		tmp := cutoffs
+		cutoffs = make([]float64, len(cutoffs))
+		copy(cutoffs, tmp)
+	}
+
+	tpr = make([]float64, len(cutoffs))
+	fpr = make([]float64, len(cutoffs))
+	var bin int
+	var nPos, nNeg float64
+	for i, u := range classes {
+		// Update the bin until it matches the next y value
+		// skipping empty bins. Each new bin starts from the running
+		// totals of the previous one, so the counts are cumulative.
+		for bin < len(cutoffs)-1 && y[i] >= cutoffs[bin] {
+			bin++
+			tpr[bin] = tpr[bin-1]
+			fpr[bin] = fpr[bin-1]
+		}
+		// Attribute the observation's weight to the positive or the
+		// negative class according to its true label.
+		posWeight, negWeight := 1.0, 0.0
+		if weights != nil {
+			posWeight = weights[i]
+		}
+		if !u {
+			posWeight, negWeight = negWeight, posWeight
+		}
+		nPos += posWeight
+		nNeg += negWeight
+		// Count false negatives (in tpr) and true negatives (in fpr).
+		if y[i] < cutoffs[bin] {
+			tpr[bin] += posWeight
+			fpr[bin] += negWeight
+		}
+	}
+
+	invNeg := 1 / nNeg
+	invPos := 1 / nPos
+	// Convert negative counts to TPR and FPR.
+	// Bins beyond the maximum value in y are skipped
+	// leaving these fpr and tpr elements as zero.
+	for i := range tpr[:bin+1] {
+		// Prevent fused float operations by
+		// making explicit float64 conversions.
+		tpr[i] = 1 - float64(tpr[i]*invPos)
+		fpr[i] = 1 - float64(fpr[i]*invNeg)
+	}
+	// Reverse all three slices so that they run from the highest
+	// threshold to the lowest.
+	for i, j := 0, len(tpr)-1; i < j; i, j = i+1, j-1 {
+		tpr[i], tpr[j] = tpr[j], tpr[i]
+		fpr[i], fpr[j] = fpr[j], fpr[i]
+	}
+	for i, j := 0, len(cutoffs)-1; i < j; i, j = i+1, j-1 {
+		cutoffs[i], cutoffs[j] = cutoffs[j], cutoffs[i]
+	}
+
+	return tpr, fpr, cutoffs
+}
+
+// TOC returns the Total Operating Characteristic for the classes provided
+// and the minimum and maximum bounds for the TOC.
+//
+// The input y values that correspond to classes and weights must be sorted
+// in ascending order. classes[i] is the class of value y[i] and weights[i]
+// is the weight of y[i]. SortWeightedLabeled can be used to sort classes
+// together with weights by the rank variable, i+1.
+//
+// The returned ntp values can be interpreted as the number of true positives
+// where values above the given rank are assigned class true for each given
+// rank from 1 to len(classes).
+//
+//	ntp_i = sum_{j ≥ len(ntp)-1 - i} [ classes_j ] * weights_j, where [x] = 1 if x else 0.
+//
+// The values of min and max provide the minimum and maximum possible values
+// of ntp at each rank for the set of classes. The first element of ntp, min
+// and max are always zero as this corresponds to assigning all data class
+// false and the last elements are always weighted sum of classes as this
+// corresponds to assigning every data class true. For len(classes) != 0, the
+// lengths of min, ntp and max are len(classes)+1. For len(classes) == 0, all
+// three results are nil.
+//
+// If weights is nil, all weights are treated as 1. When weights are not nil,
+// the calculation of min and max allows for partial assignment of single data
+// points. If weights is not nil it must have the same length as classes,
+// otherwise TOC will panic.
+//
+// More details about TOC curves are available at
+// https://en.wikipedia.org/wiki/Total_operating_characteristic
+func TOC(classes []bool, weights []float64) (min, ntp, max []float64) {
+	if weights != nil && len(classes) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	if len(classes) == 0 {
+		return nil, nil, nil
+	}
+
+	ntp = make([]float64, len(classes)+1)
+	min = make([]float64, len(ntp))
+	max = make([]float64, len(ntp))
+	if weights == nil {
+		// Walk classes from the last element to the first, keeping a
+		// running count of the true labels seen so far.
+		for i := range ntp[1:] {
+			ntp[i+1] = ntp[i]
+			if classes[len(classes)-i-1] {
+				ntp[i+1]++
+			}
+		}
+		totalPositive := ntp[len(ntp)-1]
+		for i := range ntp {
+			// With i items assigned true, at most i and at most
+			// totalPositive of them can be true positives; at least
+			// totalPositive minus the number of unassigned items must be.
+			min[i] = math.Max(0, totalPositive-float64(len(classes)-i))
+			max[i] = math.Min(totalPositive, float64(i))
+		}
+		return min, ntp, max
+	}
+
+	cumw := max // Reuse max for cumulative weight. Update its elements last.
+	for i := range ntp[1:] {
+		ntp[i+1] = ntp[i]
+		w := weights[len(weights)-i-1]
+		cumw[i+1] = cumw[i] + w
+		if classes[len(classes)-i-1] {
+			ntp[i+1] += w
+		}
+	}
+	totw := cumw[len(cumw)-1]
+	totalPositive := ntp[len(ntp)-1]
+	for i := range ntp {
+		// min[i] must be computed before max[i] is written because
+		// cumw aliases max and cumw[i] is read by both expressions.
+		min[i] = math.Max(0, totalPositive-(totw-cumw[i]))
+		max[i] = math.Min(totalPositive, cumw[i])
+	}
+	return min, ntp, max
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/stat.go b/vendor/gonum.org/v1/gonum/stat/stat.go
new file mode 100644
index 00000000000..f7d43726f77
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/stat.go
@@ -0,0 +1,1400 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat
+
+import (
+	"math"
+	"sort"
+
+	"gonum.org/v1/gonum/floats"
+)
+
+// CumulantKind specifies the behavior for calculating the empirical CDF or
+// Quantile.
+type CumulantKind int
+
+// List of supported CumulantKind values for the Quantile function.
+// Constant values should match the R nomenclature. See
+// https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population
+//
+// Note that CDF only handles Empirical and panics for any other kind.
+const (
+	// Empirical treats the distribution as the actual empirical distribution.
+	Empirical CumulantKind = 1
+	// LinInterp linearly interpolates the empirical distribution between sample values, with a flat extrapolation.
+	LinInterp CumulantKind = 4
+)
+
+// bhattacharyyaCoeff computes the Bhattacharyya Coefficient for probability distributions given by:
+//
+//	\sum_i \sqrt{p_i q_i}
+//
+// It is assumed that p and q have equal length.
+func bhattacharyyaCoeff(p, q []float64) float64 {
+	var bc float64
+	for i, a := range p {
+		bc += math.Sqrt(a * q[i])
+	}
+	return bc
+}
+
+// Bhattacharyya returns the Bhattacharyya distance between the probability
+// distributions p and q:
+//
+//	-\ln ( \sum_i \sqrt{p_i q_i} )
+//
+// Bhattacharyya panics if the lengths of p and q differ. It is assumed that
+// p and q each sum to 1.
+func Bhattacharyya(p, q []float64) float64 {
+	if len(q) != len(p) {
+		panic("stat: slice length mismatch")
+	}
+	return -math.Log(bhattacharyyaCoeff(p, q))
+}
+
+// CDF returns the empirical cumulative distribution function value of q, that
+// is the fraction of the samples in x less than or equal to q. The
+// exact behavior is determined by the CumulantKind. CDF is theoretically
+// the inverse of the Quantile function, though it may not be the actual inverse
+// for all values q and CumulantKinds.
+//
+// The x data must be sorted in increasing order. If weights is nil then all
+// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
+// CDF will panic if the length of x is zero. If x contains a NaN, CDF
+// returns NaN.
+//
+// CumulantKind behaviors:
+//   - Empirical: Returns the lowest fraction for which q is greater than or equal
+//     to that fraction of samples
+//
+// Any other CumulantKind causes a panic, unless q lies outside the range of
+// x, in which case 0 or 1 is returned before the kind is inspected.
+func CDF(q float64, c CumulantKind, x, weights []float64) float64 {
+	if weights != nil && len(weights) != len(x) {
+		panic("stat: slice length mismatch")
+	}
+	if floats.HasNaN(x) {
+		return math.NaN()
+	}
+	if len(x) == 0 {
+		panic("stat: zero length slice")
+	}
+	if !sort.Float64sAreSorted(x) {
+		panic("x data are not sorted")
+	}
+
+	// Values outside the sample range need no counting.
+	switch last := x[len(x)-1]; {
+	case q < x[0]:
+		return 0
+	case q >= last:
+		return 1
+	}
+
+	total := float64(len(x))
+	if weights != nil {
+		total = floats.Sum(weights)
+	}
+
+	if c != Empirical {
+		panic("stat: bad cumulant kind")
+	}
+
+	// Accumulate the weight of every sample not exceeding q and stop at
+	// the first sample that is greater than q.
+	var acc float64
+	for i := range x {
+		if x[i] > q {
+			return acc / total
+		}
+		if weights != nil {
+			acc += weights[i]
+		} else {
+			acc++
+		}
+	}
+	panic("impossible")
+}
+
+// ChiSquare computes the chi-square distance between the observed frequencies 'obs' and
+// expected frequencies 'exp' given by:
+//
+//	\sum_i (obs_i-exp_i)^2 / exp_i
+//
+// The lengths of obs and exp must be equal.
+func ChiSquare(obs, exp []float64) float64 {
+	if len(obs) != len(exp) {
+		panic("stat: slice length mismatch")
+	}
+	var result float64
+	for i, a := range obs {
+		b := exp[i]
+		if a == 0 && b == 0 {
+			continue
+		}
+		result += (a - b) * (a - b) / b
+	}
+	return result
+}
+
+// CircularMean returns the circular mean of the angles in x:
+//
+//	atan2(\sum_i w_i * sin(alpha_i), \sum_i w_i * cos(alpha_i))
+//
+// If weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights), otherwise CircularMean panics.
+func CircularMean(x, weights []float64) float64 {
+	if weights != nil && len(weights) != len(x) {
+		panic("stat: slice length mismatch")
+	}
+
+	var sumCos, sumSin float64
+	if weights == nil {
+		for _, angle := range x {
+			sumCos += math.Cos(angle)
+			sumSin += math.Sin(angle)
+		}
+		return math.Atan2(sumSin, sumCos)
+	}
+	for i, angle := range x {
+		sumCos += weights[i] * math.Cos(angle)
+		sumSin += weights[i] * math.Sin(angle)
+	}
+	return math.Atan2(sumSin, sumCos)
+}
+
+// Correlation returns the weighted correlation between the samples of x and y.
+// The means of x and y are computed internally with Mean.
+//
+//	sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (stdX * stdY)
+//
+// The lengths of x and y must be equal, otherwise Correlation panics. If
+// weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights).
+func Correlation(x, y, weights []float64) float64 {
+	// This is a two-pass corrected implementation. It is an adaptation of the
+	// algorithm used in the MeanVariance function, which applies a correction
+	// to the typical two pass approach.
+
+	if len(x) != len(y) {
+		panic("stat: slice length mismatch")
+	}
+	xu := Mean(x, weights)
+	yu := Mean(y, weights)
+	var (
+		sxx           float64
+		syy           float64
+		sxy           float64
+		xcompensation float64
+		ycompensation float64
+	)
+	if weights == nil {
+		// Accumulate the sums of squared and cross deviations together
+		// with the plain sums of deviations used for the correction.
+		for i, xv := range x {
+			yv := y[i]
+			xd := xv - xu
+			yd := yv - yu
+			sxx += xd * xd
+			syy += yd * yd
+			sxy += xd * yd
+			xcompensation += xd
+			ycompensation += yd
+		}
+		// xcompensation and ycompensation are from Chan, et. al.
+		// referenced in the MeanVariance function. They are analogous
+		// to the second term in (1.7) in that paper.
+		sxx -= xcompensation * xcompensation / float64(len(x))
+		syy -= ycompensation * ycompensation / float64(len(x))
+
+		return (sxy - xcompensation*ycompensation/float64(len(x))) / math.Sqrt(sxx*syy)
+
+	}
+
+	// Weighted variant of the loop above: every deviation product is
+	// multiplied by the observation weight.
+	var sumWeights float64
+	for i, xv := range x {
+		w := weights[i]
+		yv := y[i]
+		xd := xv - xu
+		wxd := w * xd
+		yd := yv - yu
+		wyd := w * yd
+		sxx += wxd * xd
+		syy += wyd * yd
+		sxy += wxd * yd
+		xcompensation += wxd
+		ycompensation += wyd
+		sumWeights += w
+	}
+	// xcompensation and ycompensation are from Chan, et. al.
+	// referenced in the MeanVariance function. They are analogous
+	// to the second term in (1.7) in that paper, except they use
+	// the sumWeights instead of the sample count.
+	sxx -= xcompensation * xcompensation / sumWeights
+	syy -= ycompensation * ycompensation / sumWeights
+
+	return (sxy - xcompensation*ycompensation/sumWeights) / math.Sqrt(sxx*syy)
+}
+
+// Kendall returns the weighted Tau-a Kendall correlation between the
+// samples of x and y. The Kendall correlation measures the quantity of
+// concordant and discordant pairs of numbers. If weights are specified then
+// each pair is weighted by weights[i] * weights[j] and the final sum is
+// normalized to stay between -1 and 1.
+//
+// A pair (i, j) is counted as concordant when x[j]-x[i] and y[j]-y[i] have
+// the same sign bit, and as discordant otherwise.
+//
+// The lengths of x and y must be equal. If weights is nil then all of the
+// weights are 1. If weights is not nil, then len(x) must equal len(weights).
+// Kendall panics if either length requirement is violated.
+func Kendall(x, y, weights []float64) float64 {
+	if len(x) != len(y) {
+		panic("stat: slice length mismatch")
+	}
+	// Validate weights up front so that a short slice does not surface as a
+	// bare index-out-of-range panic and a long one is not silently accepted.
+	if weights != nil && len(x) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+
+	var (
+		cc float64 // number of concordant pairs
+		dc float64 // number of discordant pairs
+		n  = len(x)
+	)
+
+	if weights == nil {
+		// Visit every unordered pair i < j exactly once.
+		for i := 0; i < n; i++ {
+			for j := i + 1; j < n; j++ {
+				if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
+					cc++
+				} else {
+					dc++
+				}
+			}
+		}
+		// Normalize by the number of pairs, n choose 2.
+		return (cc - dc) / float64(n*(n-1)/2)
+	}
+
+	var sumWeights float64
+	for i := 0; i < n; i++ {
+		for j := i + 1; j < n; j++ {
+			weight := weights[i] * weights[j]
+			if math.Signbit(x[j]-x[i]) == math.Signbit(y[j]-y[i]) {
+				cc += weight
+			} else {
+				dc += weight
+			}
+			sumWeights += weight
+		}
+	}
+	// Normalize by the total pair weight.
+	return (cc - dc) / sumWeights
+}
+
+// Covariance returns the weighted covariance between the samples of x and y.
+//
+//	sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1)
+//
+// The lengths of x and y must be equal, otherwise Covariance panics. If
+// weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights).
+func Covariance(x, y, weights []float64) float64 {
+	// The means are computed here and the corrected two-pass summation,
+	// an adaptation of the algorithm used in the MeanVariance function,
+	// is carried out by covarianceMeans.
+	if len(y) != len(x) {
+		panic("stat: slice length mismatch")
+	}
+	return covarianceMeans(x, y, weights, Mean(x, weights), Mean(y, weights))
+}
+
+// covarianceMeans returns the weighted covariance between x and y given the
+// already computed means xu of x and yu of y. See the documentation of
+// Covariance for more information.
+func covarianceMeans(x, y, weights []float64, xu, yu float64) float64 {
+	var (
+		cross float64 // sum of (weighted) products of deviations
+		xSum  float64 // sum of (weighted) deviations of x
+		ySum  float64 // sum of (weighted) deviations of y
+	)
+	if weights != nil {
+		var total float64
+		for i, xv := range x {
+			w := weights[i]
+			yv := y[i]
+			wxd := w * (xv - xu)
+			yd := (yv - yu)
+			cross += wxd * yd
+			xSum += wxd
+			ySum += w * yd
+			total += w
+		}
+		// xSum and ySum are the compensation terms from Chan, et. al.
+		// referenced in the MeanVariance function. They are analogous
+		// to the second term in (1.7) in that paper, except that the
+		// total weight takes the place of the sample count.
+		return (cross - xSum*ySum/total) / (total - 1)
+	}
+
+	for i, xv := range x {
+		yv := y[i]
+		xd := xv - xu
+		yd := yv - yu
+		cross += xd * yd
+		xSum += xd
+		ySum += yd
+	}
+	// xSum and ySum are the compensation terms from Chan, et. al.
+	// referenced in the MeanVariance function. They are analogous
+	// to the second term in (1.7) in that paper.
+	return (cross - xSum*ySum/float64(len(x))) / float64(len(x)-1)
+}
+
+// CrossEntropy returns the cross-entropy between the two distributions
+// specified in p and q, using the natural logarithm. Terms with p_i equal
+// to zero contribute nothing. CrossEntropy panics if the lengths of p and q
+// differ.
+func CrossEntropy(p, q []float64) float64 {
+	if len(q) != len(p) {
+		panic("stat: slice length mismatch")
+	}
+	var sum float64
+	for i := range p {
+		if p[i] == 0 {
+			continue
+		}
+		sum -= p[i] * math.Log(q[i])
+	}
+	return sum
+}
+
+// Entropy returns the Shannon entropy of the distribution p. The natural
+// logarithm is used.
+//
+//	-sum_i (p_i * log_e(p_i))
+func Entropy(p []float64) float64 {
+	var sum float64
+	for i := range p {
+		// Entropy needs 0 * log(0) == 0, so zero entries are skipped.
+		if p[i] == 0 {
+			continue
+		}
+		sum -= p[i] * math.Log(p[i])
+	}
+	return sum
+}
+
+// ExKurtosis returns the population excess kurtosis of the sample.
+// The kurtosis is defined by the 4th moment of the mean divided by the squared
+// variance. The excess kurtosis subtracts 3.0 so that the excess kurtosis of
+// the normal distribution is zero.
+// If weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights).
+func ExKurtosis(x, weights []float64) float64 {
+	mean, std := MeanStdDev(x, weights)
+
+	// fourth accumulates the (weighted) fourth powers of the standardized
+	// samples; count is the sample count or the total weight.
+	var fourth, count float64
+	if weights != nil {
+		for i, v := range x {
+			z := (v - mean) / std
+			fourth += weights[i] * z * z * z * z
+			count += weights[i]
+		}
+	} else {
+		for _, v := range x {
+			z := (v - mean) / std
+			fourth += z * z * z * z
+		}
+		count = float64(len(x))
+	}
+	mul, offset := kurtosisCorrection(count)
+	return fourth*mul - offset
+}
+
+// kurtosisCorrection returns the multiplier and offset applied to the summed
+// fourth powers of the standardized samples when computing the excess
+// kurtosis, where n is the number of samples (or the total weight).
+// See https://en.wikipedia.org/wiki/Kurtosis
+func kurtosisCorrection(n float64) (mul, offset float64) {
+	mul = ((n + 1) / (n - 1)) * (n / (n - 2)) * (1 / (n - 3))
+	offset = 3 * ((n - 1) / (n - 2)) * ((n - 1) / (n - 3))
+	return mul, offset
+}
+
+// GeometricMean returns the weighted geometric mean of the dataset
+//
+//	\prod_i {x_i ^ w_i}
+//
+// computed as the exponential of the weighted mean of the logarithms of x,
+// with the weights normalized by their sum.
+//
+// This only applies with positive x and positive weights. If weights is nil
+// then all of the weights are 1. If weights is not nil, then len(x) must equal
+// len(weights), otherwise GeometricMean panics.
+func GeometricMean(x, weights []float64) float64 {
+	if weights == nil {
+		logSum := 0.0
+		for i := range x {
+			logSum += math.Log(x[i])
+		}
+		return math.Exp(logSum / float64(len(x)))
+	}
+	if len(weights) != len(x) {
+		panic("stat: slice length mismatch")
+	}
+	var logSum, total float64
+	for i := range x {
+		w := weights[i]
+		logSum += w * math.Log(x[i])
+		total += w
+	}
+	return math.Exp(logSum / total)
+}
+
+// HarmonicMean returns the weighted harmonic mean of the dataset
+//
+//	\sum_i {w_i} / ( sum_i {w_i / x_i} )
+//
+// This only applies with positive x and positive weights.
+// If weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights), otherwise HarmonicMean panics.
+func HarmonicMean(x, weights []float64) float64 {
+	if weights != nil && len(weights) != len(x) {
+		panic("stat: slice length mismatch")
+	}
+	// TODO(btracey): Fix this to make it more efficient and avoid allocation.
+
+	// The direct form can be numerically unstable (for example if x is very
+	// small), so the mean is evaluated in log space:
+	// W = \sum_i {w_i}
+	// hm = exp(log(W) - log(\sum_i w_i / x_i))
+
+	terms := make([]float64, len(x))
+	var W float64
+	if weights == nil {
+		for i, v := range x {
+			terms[i] = -math.Log(v)
+			W++
+		}
+	} else {
+		for i, v := range x {
+			terms[i] = math.Log(weights[i]) - math.Log(v)
+			W += weights[i]
+		}
+	}
+
+	// logDenom is log(\sum_i { w_i / x_i}).
+	logDenom := floats.LogSumExp(terms)
+	return math.Exp(math.Log(W) - logDenom)
+}
+
+// Hellinger returns the Hellinger distance between the probability
+// distributions p and q, which is
+//
+//	\sqrt{ 1 - \sum_i \sqrt{p_i q_i} }
+//
+// p and q must have equal lengths and are each assumed to sum to 1.
+func Hellinger(p, q []float64) float64 {
+	if len(p) != len(q) {
+		panic("stat: slice length mismatch")
+	}
+	return math.Sqrt(1 - bhattacharyyaCoeff(p, q))
+}
+
+// Histogram sums up the weighted number of data points in each bin.
+// The weight of data point x[i] will be placed into count[j] if
+// dividers[j] <= x < dividers[j+1]. The "span" function in the floats package can assist
+// with bin creation.
+//
+// The following conditions on the inputs apply, and Histogram panics if they are violated:
+//   - The count variable must either be nil or have length of one less than dividers.
+//   - dividers must hold at least two values and be sorted (use the sort package).
+//   - The x values must be sorted.
+//   - If weights is nil then all of the weights are 1.
+//   - If weights is not nil, then len(x) must equal len(weights).
+func Histogram(count, dividers, x, weights []float64) []float64 {
+	if weights != nil && len(x) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	if len(dividers) < 2 {
+		panic("histogram: fewer than two dividers")
+	}
+	if count == nil {
+		count = make([]float64, len(dividers)-1)
+	}
+	if len(count) != len(dividers)-1 {
+		panic("histogram: bin count mismatch")
+	}
+	if !sort.Float64sAreSorted(dividers) {
+		panic("histogram: dividers are not sorted")
+	}
+	if !sort.Float64sAreSorted(x) {
+		panic("histogram: x data are not sorted")
+	}
+	for i := range count {
+		count[i] = 0
+	}
+	if len(x) == 0 {
+		return count
+	}
+	if x[0] < dividers[0] {
+		panic("histogram: minimum x value is less than lowest divider")
+	}
+	if dividers[len(dividers)-1] <= x[len(x)-1] {
+		panic("histogram: maximum x value is greater than or equal to highest divider")
+	}
+
+	idx := 0
+	comp := dividers[idx+1]
+	if weights == nil {
+		for _, v := range x {
+			if v < comp {
+				// Still in the current bucket.
+				count[idx]++
+				continue
+			}
+			// Find the next bin whose upper divider exceeds v; j stays below len(count).
+			for j := idx + 1; j < len(count); j++ {
+				if v < dividers[j+1] {
+					idx = j
+					comp = dividers[j+1]
+					break
+				}
+			}
+			count[idx]++
+		}
+		return count
+	}
+
+	for i, v := range x {
+		if v < comp {
+			// Still in the current bucket.
+			count[idx] += weights[i]
+			continue
+		}
+		// Need to find the next divider where v is less than the divider.
+		for j := idx + 1; j < len(count); j++ {
+			if v < dividers[j+1] {
+				idx = j
+				comp = dividers[j+1]
+				break
+			}
+		}
+		count[idx] += weights[i]
+	}
+	return count
+}
+
+// JensenShannon returns the Jensen-Shannon divergence between the
+// distributions p and q, which is defined through the mixture m as
+//
+//	m = 0.5 * (p + q)
+//	JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) )
+//
+// In contrast to Kullback-Leibler, the result is symmetric in p and q and lies
+// between 0 and ln(2). JensenShannon panics if len(p) != len(q).
+func JensenShannon(p, q []float64) float64 {
+	if len(p) != len(q) {
+		panic("stat: slice length mismatch")
+	}
+	var div float64
+	for i := range p {
+		pi, qi := p[i], q[i]
+		mid := 0.5 * (pi + qi)
+		if pi != 0 {
+			// Contribution of KL(p, m); zero entries contribute nothing.
+			div += 0.5 * pi * (math.Log(pi) - math.Log(mid))
+		}
+		if qi != 0 {
+			// Contribution of KL(q, m).
+			div += 0.5 * qi * (math.Log(qi) - math.Log(mid))
+		}
+	}
+	return div
+}
+
+// KolmogorovSmirnov computes the largest distance between two empirical CDFs.
+// Each dataset x and y consists of sample locations and counts, xWeights and
+// yWeights, respectively. A nil weights slice means every weight is 1.
+//
+// x and y may have different lengths, but non-nil weights must match the length
+// of their data. Both x and y must be sorted, else KolmogorovSmirnov panics.
+//
+// Special cases are:
+//
+//	= 0 if len(x) == len(y) == 0
+//	= 1 if len(x) == 0, len(y) != 0 or len(x) != 0 and len(y) == 0
+func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
+	if xWeights != nil && len(x) != len(xWeights) {
+		panic("stat: slice length mismatch")
+	}
+	if yWeights != nil && len(y) != len(yWeights) {
+		panic("stat: slice length mismatch")
+	}
+	if len(x) == 0 || len(y) == 0 {
+		if len(x) == 0 && len(y) == 0 {
+			return 0
+		}
+		return 1
+	}
+
+	if floats.HasNaN(x) {
+		return math.NaN() // A NaN sample in x makes the result NaN.
+	}
+	if floats.HasNaN(y) {
+		return math.NaN() // A NaN sample in y makes the result NaN.
+	}
+
+	if !sort.Float64sAreSorted(x) {
+		panic("x data are not sorted")
+	}
+	if !sort.Float64sAreSorted(y) {
+		panic("y data are not sorted")
+	}
+
+	xWeightsNil := xWeights == nil
+	yWeightsNil := yWeights == nil
+
+	var (
+		maxDist    float64
+		xSum, ySum float64
+		xCdf, yCdf float64
+		xIdx, yIdx int
+	)
+
+	if xWeightsNil {
+		xSum = float64(len(x))
+	} else {
+		xSum = floats.Sum(xWeights)
+	}
+
+	if yWeightsNil {
+		ySum = float64(len(y))
+	} else {
+		ySum = floats.Sum(yWeights)
+	}
+
+	xVal := x[0]
+	yVal := y[0]
+
+	// Algorithm description:
+	// The goal is to find the maximum difference in the empirical CDFs for the
+	// two datasets. The CDFs are piecewise-constant, and thus the distance
+	// between the CDFs will only change at the values themselves.
+	//
+	// To find the maximum distance, step through the data in ascending order
+	// of value between the two datasets. At each step, advance the empirical CDF
+	// of the lagging dataset and compare the local distance with the maximum so far.
+	// Due to some corner cases, equal data entries must be tallied simultaneously.
+	for {
+		switch {
+		case xVal < yVal:
+			xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
+		case yVal < xVal:
+			yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
+		case xVal == yVal:
+			newX := x[xIdx]
+			newY := y[yIdx]
+			if newX < newY {
+				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
+			} else if newY < newX {
+				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
+			} else {
+				// The pending samples are equal too, so advance both datasets;
+				// their values stay equal and the tie is handled again next time.
+				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
+				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
+			}
+		default:
+			panic("unreachable")
+		}
+
+		dist := math.Abs(xCdf - yCdf)
+		if dist > maxDist {
+			maxDist = dist
+		}
+
+		// Both xCdf and yCdf will equal 1 at the end, so if we have reached the
+		// end of either sample list, the distance is as large as it can be.
+		if xIdx == len(x) || yIdx == len(y) {
+			return maxDist
+		}
+	}
+}
+
+// updateKS advances through one of the data sets to its next distinct value,
+// folding the weight of every sample equal to the current one into the CDF.
+// It returns the new current value val, the CDF accumulated so far newCdf,
+// and newIdx, the index of the next sample of that set to examine. When
+// isNil is true the weights slice is ignored and every sample weighs 1.
+func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) {
+	// step is the share of the CDF carried by the sample at index i.
+	step := func(i int) float64 {
+		if isNil {
+			return 1 / sum
+		}
+		return weights[i] / sum
+	}
+
+	newCdf = cdf + step(idx)
+	for newIdx = idx + 1; newIdx < len(values); newIdx++ {
+		if values[newIdx-1] != values[newIdx] {
+			// A different value follows, so the run of equal samples is over.
+			return values[newIdx], newCdf, newIdx
+		}
+		// Same value again: absorb its weight and keep scanning.
+		newCdf += step(newIdx)
+	}
+
+	// The data set is exhausted; report its last value and newIdx == len(values).
+	return values[newIdx-1], newCdf, newIdx
+}
+
+// KullbackLeibler returns the Kullback-Leibler divergence of p from q,
+// computed with the natural logarithm as
+//
+//	sum_i(p_i * log(p_i / q_i))
+//
+// The divergence is not symmetric, so in general
+// KullbackLeibler(p,q) != KullbackLeibler(q,p). It panics if len(p) != len(q).
+func KullbackLeibler(p, q []float64) float64 {
+	if len(p) != len(q) {
+		panic("stat: slice length mismatch")
+	}
+	var div float64
+	for i := range p {
+		if pi := p[i]; pi != 0 { // Zero entries are skipped: 0 * log(0) is taken as 0.
+			div += pi * (math.Log(pi) - math.Log(q[i]))
+		}
+	}
+	return div
+}
+
+// LinearRegression fits the line
+//
+//	y = alpha + beta*x
+//
+// to the data in x and y with the given weights. When origin is true the
+// fitted line is forced through the origin.
+//
+// More precisely, LinearRegression finds the alpha and beta that minimize
+// the weighted sum of squared residuals
+//
+//	\sum_i w[i]*(y[i] - alpha - beta*x[i])^2
+//
+// with alpha fixed at zero when origin is true.
+//
+// x and y must have equal lengths. A nil weights slice means every weight
+// is 1; a non-nil weights slice must have the same length as x. Mismatched
+// lengths cause a panic.
+func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) {
+	if len(x) != len(y) {
+		panic("stat: slice length mismatch")
+	}
+	if weights != nil && len(weights) != len(x) {
+		panic("stat: slice length mismatch")
+	}
+
+	if !origin {
+		// Ordinary fit: beta = cov(x, y) / var(x), alpha follows from the means.
+		xMean, xVar := MeanVariance(x, weights)
+		yMean := Mean(y, weights)
+		cov := covarianceMeans(x, y, weights, xMean, yMean)
+		beta = cov / xVar
+		alpha = yMean - beta*xMean
+		return alpha, beta
+	}
+
+	// Fit through the origin: beta = \sum w*x*y / \sum w*x*x and alpha = 0.
+	var sumXY, sumXX float64
+	for i, xi := range x {
+		w := 1.0
+		if weights != nil {
+			w = weights[i]
+		}
+		sumXY += w * xi * y[i]
+		sumXX += w * xi * xi
+	}
+	return 0, sumXY / sumXX
+}
+
+// RSquared computes the coefficient of determination
+//
+//	R^2 = 1 - \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 / \sum_i w[i]*(y[i] - mean(y))^2
+//
+// of the line
+//
+//	y = alpha + beta*x
+//
+// for the data in x and y with the given weights.
+//
+// x and y must have equal lengths. A nil weights slice means every weight is 1;
+// a non-nil weights slice must have the same length as x.
+func RSquared(x, y, weights []float64, alpha, beta float64) float64 {
+	if len(x) != len(y) {
+		panic("stat: slice length mismatch")
+	}
+	if weights != nil && len(weights) != len(x) {
+		panic("stat: slice length mismatch")
+	}
+
+	mean := Mean(y, weights)
+	var ssRes, ssTot float64
+	// ssRes gathers squared residuals of the line, ssTot squared deviations from the mean.
+	for i, yi := range y {
+		w := 1.0
+		if weights != nil {
+			w = weights[i]
+		}
+		fit := alpha + beta*x[i]
+		resid := yi - fit
+		ssRes += w * resid * resid
+		dev := yi - mean
+		ssTot += w * dev * dev
+	}
+	return 1 - ssRes/ssTot
+}
+
+// RSquaredFrom computes the coefficient of determination
+//
+//	R^2 = 1 - \sum_i w[i]*(estimate[i] - value[i])^2 / \sum_i w[i]*(value[i] - mean(values))^2
+//
+// for the data in estimates and values with the given weights.
+//
+// estimates and values must have equal lengths. A nil weights slice means
+// every weight is 1; a non-nil weights slice must have the same length as
+// values.
+func RSquaredFrom(estimates, values, weights []float64) float64 {
+	if len(estimates) != len(values) {
+		panic("stat: slice length mismatch")
+	}
+	if weights != nil && len(weights) != len(values) {
+		panic("stat: slice length mismatch")
+	}
+
+	centre := Mean(values, weights)
+	var ssRes, ssTot float64
+	for i, v := range values {
+		w := 1.0
+		if weights != nil {
+			w = weights[i]
+		}
+		resid := v - estimates[i]
+		ssRes += w * resid * resid
+		dev := v - centre
+		ssTot += w * dev * dev
+	}
+	return 1 - ssRes/ssTot
+}
+
+// RNoughtSquared computes the coefficient of determination
+//
+//	R₀^2 = \sum_i w[i]*(beta*x[i])^2 / \sum_i w[i]*y[i]^2
+//
+// of the line
+//
+//	y = beta*x
+//
+// for the data in x and y with the given weights. It is intended only for
+// best-fit lines that were regressed through the origin.
+//
+// x and y must have equal lengths. A nil weights slice means every weight is 1;
+// a non-nil weights slice must have the same length as x.
+func RNoughtSquared(x, y, weights []float64, beta float64) float64 {
+	if len(x) != len(y) {
+		panic("stat: slice length mismatch")
+	}
+	if weights != nil && len(weights) != len(x) {
+		panic("stat: slice length mismatch")
+	}
+
+	var fitted, observed float64
+	// fitted sums the squared predictions, observed the squared y values.
+	for i, yi := range y {
+		w := 1.0
+		if weights != nil {
+			w = weights[i]
+		}
+		fit := beta * x[i]
+		fitted += w * fit * fit
+		observed += w * yi * yi
+	}
+	return fitted / observed
+}
+
+// Mean returns the weighted mean of the data set, which is
+//
+//	sum_i {w_i * x_i} / sum_i {w_i}
+//
+// A nil weights slice means every weight is 1; a non-nil weights slice must
+// have the same length as x, otherwise Mean panics.
+func Mean(x, weights []float64) float64 {
+	if weights == nil {
+		return floats.Sum(x) / float64(len(x))
+	}
+	if len(x) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+
+	// Accumulate the weighted values and the total weight in one pass.
+	var weightedSum, totalWeight float64
+	for i, xi := range x {
+		w := weights[i]
+		weightedSum += w * xi
+		totalWeight += w
+	}
+	return weightedSum / totalWeight
+}
+
+// Mode reports the value of x carrying the largest total weight, together
+// with that weight. Values are compared with strict float64 equality, so use
+// with care. When several values tie for the mode, any one of them may be returned.
+func Mode(x, weights []float64) (val float64, count float64) {
+	if weights != nil && len(x) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	if len(x) == 0 {
+		return 0, 0
+	}
+
+	// Accumulate the total weight observed for each distinct value.
+	tally := make(map[float64]float64)
+	for i, v := range x {
+		if weights == nil {
+			tally[v]++
+			continue
+		}
+		tally[v] += weights[i]
+	}
+
+	// Scan for the heaviest entry; map iteration order is unspecified, which
+	// is why ties may resolve to any of the tied values.
+	var best, bestCount float64
+	for v, total := range tally {
+		if total > bestCount {
+			best, bestCount = v, total
+		}
+	}
+
+	return best, bestCount
+}
+
+// BivariateMoment returns the weighted mixed moment of the samples x and y,
+//
+//	E[(x - μ_x)^r*(y - μ_y)^s]
+//
+// No degrees of freedom correction is applied.
+// x and y must have equal lengths. A nil weights slice means every weight
+// is 1; a non-nil weights slice must have the same length as x.
+func BivariateMoment(r, s float64, x, y, weights []float64) float64 {
+	xMean := Mean(x, weights)
+	yMean := Mean(y, weights)
+	if len(x) != len(y) {
+		panic("stat: slice length mismatch")
+	}
+
+	if weights != nil {
+		if len(weights) != len(x) {
+			panic("stat: slice length mismatch")
+		}
+		var (
+			acc   float64
+			total float64
+		)
+		for i, w := range weights {
+			acc += w * math.Pow(x[i]-xMean, r) * math.Pow(y[i]-yMean, s)
+			total += w
+		}
+		return acc / total
+	}
+
+	// Unweighted: every sample counts once.
+	var acc float64
+	for i := range x {
+		acc += math.Pow(x[i]-xMean, r) * math.Pow(y[i]-yMean, s)
+	}
+	return acc / float64(len(x))
+}
+
+// Moment returns the weighted n^th moment of the samples about their mean,
+//
+//	E[(x - μ)^N]
+//
+// No degrees of freedom correction is applied.
+// A nil weights slice means every weight is 1; a non-nil weights slice must
+// have the same length as x.
+func Moment(moment float64, x, weights []float64) float64 {
+	// Mean also verifies that x and weights have matching lengths.
+	mu := Mean(x, weights)
+	if weights != nil {
+		// Weighted average of the powered deviations.
+		var acc, total float64
+		for i, v := range x {
+			w := weights[i]
+			acc += w * math.Pow(v-mu, moment)
+			total += w
+		}
+		return acc / total
+	}
+
+	// Unweighted: plain average of the powered deviations.
+	var acc float64
+	for _, v := range x {
+		acc += math.Pow(v-mu, moment)
+	}
+	return acc / float64(len(x))
+}
+
+// MomentAbout returns the weighted n^th moment of the samples about the
+// supplied mean \mu,
+//
+//	E[(x - μ)^N]
+//
+// No degrees of freedom correction is applied.
+// A nil weights slice means every weight is 1; a non-nil weights slice must
+// have the same length as x, otherwise MomentAbout panics.
+func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 {
+	if weights != nil {
+		if len(weights) != len(x) {
+			panic("stat: slice length mismatch")
+		}
+
+		// Weighted average of the powered deviations from mean.
+		var acc, total float64
+		for i, v := range x {
+			acc += weights[i] * math.Pow(v-mean, moment)
+			total += weights[i]
+		}
+		return acc / total
+	}
+
+	// Unweighted: every sample counts once.
+	var acc float64
+	for _, v := range x {
+		acc += math.Pow(v-mean, moment)
+	}
+	return acc / float64(len(x))
+}
+
+// Quantile returns the sample of x such that x is greater than or
+// equal to the fraction p of samples. The exact behavior depends on the
+// CumulantKind c, and p must lie between 0 and 1. Quantile is theoretically
+// the inverse of the CDF function, though it may not be the actual inverse
+// for all values p and CumulantKinds.
+//
+// x must be sorted in increasing order. A nil weights slice means every
+// weight is 1; a non-nil weights slice must have the same length as x.
+// Quantile panics if x is empty, and returns NaN if x contains a NaN.
+//
+// CumulantKind behaviors:
+//   - Empirical: Returns the lowest value q for which q is greater than or equal
+//     to the fraction p of samples
+//   - LinInterp: Returns the linearly interpolated value
+func Quantile(p float64, c CumulantKind, x, weights []float64) float64 {
+	if !(p >= 0 && p <= 1) {
+		panic("stat: percentile out of bounds")
+	}
+
+	if weights != nil && len(x) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	if len(x) == 0 {
+		panic("stat: zero length slice")
+	}
+	if floats.HasNaN(x) {
+		return math.NaN() // Bail out early: the algorithm breaks on NaN data.
+	}
+	if !sort.Float64sAreSorted(x) {
+		panic("x data are not sorted")
+	}
+
+	// The total weight is the sample count unless explicit weights are given.
+	total := float64(len(x))
+	if weights != nil {
+		total = floats.Sum(weights)
+	}
+	// Dispatch on the requested cumulant kind.
+	switch c {
+	case LinInterp:
+		return linInterpQuantile(p, x, weights, total)
+	case Empirical:
+		return empiricalQuantile(p, x, weights, total)
+	default:
+		panic("stat: bad cumulant kind")
+	}
+}
+
+// empiricalQuantile returns the first x[i] whose cumulative weight reaches p*sumWeights; nil weights count as 1 each.
+func empiricalQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
+	seen, target := 0.0, p*sumWeights
+	for i, xi := range x {
+		if weights != nil {
+			seen += weights[i]
+		} else {
+			seen++
+		}
+		if seen >= target {
+			return xi
+		}
+	}
+	panic("impossible")
+}
+
+// linInterpQuantile interpolates linearly between x[i-1] and x[i] at the first i whose cumulative weight reaches p*sumWeights.
+func linInterpQuantile(p float64, x, weights []float64, sumWeights float64) float64 {
+	seen, target := 0.0, p*sumWeights
+	for i := range x {
+		w := 1.0
+		if weights != nil {
+			w = weights[i]
+		}
+		seen += w
+		if seen >= target {
+			if i == 0 {
+				return x[0]
+			}
+			frac := seen - target
+			if weights != nil {
+				frac /= weights[i]
+			}
+			return frac*x[i-1] + (1-frac)*x[i]
+		}
+	}
+	panic("impossible")
+}
+
+// Skew returns the skewness of the sample data x.
+// A nil weights slice means every weight is 1; a non-nil weights slice must
+// have the same length as x.
+// When weights sum to 1 or less, a biased variance estimator should be used.
+func Skew(x, weights []float64) float64 {
+	mu, sigma := MeanStdDev(x, weights)
+
+	if weights != nil {
+		// Weighted sum of cubed standard scores, corrected by the total weight.
+		var cubes, total float64
+		for i, v := range x {
+			z := (v - mu) / sigma
+			cubes += weights[i] * z * z * z
+			total += weights[i]
+		}
+		return cubes * skewCorrection(total)
+	}
+
+	// Unweighted: the sample count plays the role of the total weight.
+	var cubes float64
+	for _, v := range x {
+		z := (v - mu) / sigma
+		cubes += z * z * z
+	}
+	return cubes * skewCorrection(float64(len(x)))
+}
+
+// skewCorrection returns n/((n-1)(n-2)) for weight total n. From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7
+func skewCorrection(n float64) float64 {
+	return (n / (n - 1)) * (1 / (n - 2))
+}
+
+// SortWeighted sorts x in place into increasing order and applies the same
+// rearrangement to weights, so each weight stays with its data point.
+// weights may be nil; a non-nil weights slice must have the same length
+// as x.
+func SortWeighted(x, weights []float64) {
+	if weights != nil {
+		if len(x) != len(weights) {
+			panic("stat: slice length mismatch")
+		}
+		sort.Sort(weightSorter{x: x, w: weights})
+		return
+	}
+	// No weights to carry along, so a plain float sort suffices.
+	sort.Float64s(x)
+}
+
+// weightSorter implements sort.Interface, ordering by x and swapping the
+// matching entries of w in lockstep.
+type weightSorter struct {
+	x []float64
+	w []float64
+}
+
+func (s weightSorter) Len() int           { return len(s.x) }
+func (s weightSorter) Less(i, j int) bool { return s.x[i] < s.x[j] }
+func (s weightSorter) Swap(i, j int) {
+	s.x[i], s.x[j] = s.x[j], s.x[i]
+	s.w[i], s.w[j] = s.w[j], s.w[i]
+}
+
+// SortWeightedLabeled sorts x in place into increasing order and applies
+// the same rearrangement to the boolean labels and the weights, so each
+// label and weight stays with its data point. labels and weights may each
+// be nil; whichever is non-nil must have the same length as x.
+func SortWeightedLabeled(x []float64, labels []bool, weights []float64) {
+	switch {
+	case labels == nil:
+		// No labels to carry along; SortWeighted takes care of the weights.
+		SortWeighted(x, weights)
+	case weights == nil:
+		if len(x) != len(labels) {
+			panic("stat: slice length mismatch")
+		}
+		sort.Sort(labelSorter{
+			x: x,
+			l: labels,
+		})
+	default:
+		if len(x) != len(labels) || len(x) != len(weights) {
+			panic("stat: slice length mismatch")
+		}
+		sort.Sort(weightLabelSorter{
+			x: x,
+			l: labels,
+			w: weights,
+		})
+	}
+}
+
+// labelSorter implements sort.Interface, ordering by x and moving the labels l in lockstep.
+type labelSorter struct {
+	x []float64
+	l []bool
+}
+
+func (s labelSorter) Len() int           { return len(s.x) }
+func (s labelSorter) Less(i, j int) bool { return s.x[i] < s.x[j] }
+func (s labelSorter) Swap(i, j int) {
+	s.x[i], s.x[j], s.l[i], s.l[j] = s.x[j], s.x[i], s.l[j], s.l[i]
+}
+
+// weightLabelSorter implements sort.Interface, ordering by x and moving labels l and weights w in lockstep.
+type weightLabelSorter struct {
+	x []float64
+	l []bool
+	w []float64
+}
+
+func (s weightLabelSorter) Len() int           { return len(s.x) }
+func (s weightLabelSorter) Less(i, j int) bool { return s.x[i] < s.x[j] }
+func (s weightLabelSorter) Swap(i, j int) {
+	s.x[i], s.x[j], s.l[i], s.l[j] = s.x[j], s.x[i], s.l[j], s.l[i]
+	s.w[i], s.w[j] = s.w[j], s.w[i]
+}
+
+// StdDev computes the sample standard deviation of x with the given weights.
+func StdDev(x, weights []float64) float64 {
+	_, sd := MeanStdDev(x, weights)
+	return sd
+}
+
+// MeanStdDev computes the sample mean and the unbiased standard deviation.
+// When weights sum to 1 or less, a biased variance estimator should be used.
+func MeanStdDev(x, weights []float64) (mean, std float64) {
+	mean, std = MeanVariance(x, weights)
+	return mean, math.Sqrt(std)
+}
+
+// StdErr returns the standard error in the mean, std / sqrt(sampleSize), for the given values.
+func StdErr(std, sampleSize float64) float64 {
+	return std / math.Sqrt(sampleSize)
+}
+
+// StdScore returns the standard score (a.k.a. z-score, z-value) of the value x
+// for the given mean and standard deviation, which is
+//
+//	(x - mean) / std
+func StdScore(x, mean, std float64) float64 {
+	return (x - mean) / std
+}
+
+// Variance returns the unbiased weighted sample variance of x, which is
+//
+//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
+//
+// A nil weights slice means every weight is 1; a non-nil weights slice must
+// have the same length as x.
+// When weights sum to 1 or less, a biased variance estimator should be used.
+func Variance(x, weights []float64) float64 {
+	_, v := MeanVariance(x, weights)
+	return v
+}
+
+// MeanVariance returns the sample mean and the unbiased variance of x, given by
+//
+//	\sum_i w_i * x_i / (sum_i w_i)
+//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
+//
+// respectively.
+// A nil weights slice means every weight is 1; a non-nil weights slice must
+// have the same length as x.
+// When weights sum to 1 or less, a biased variance estimator should be used.
+func MeanVariance(x, weights []float64) (mean, variance float64) {
+	// The helper yields the mean, the weighted sum of squared deviations and
+	// the total weight in one go.
+	mean, sumSq, total := meanUnnormalisedVarianceSumWeights(x, weights)
+	// Unbiased estimate: divide by the total weight minus one.
+	variance = sumSq / (total - 1)
+	return mean, variance
+}
+
+// PopMeanVariance returns the sample mean and the biased variance (also known
+// as "population variance") of x, given by
+//
+//	\sum_i w_i * x_i / (sum_i w_i)
+//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i)
+//
+// respectively.
+// A nil weights slice means every weight is 1; a non-nil weights slice must
+// have the same length as x.
+func PopMeanVariance(x, weights []float64) (mean, variance float64) {
+	// The helper yields the mean, the weighted sum of squared deviations and
+	// the total weight in one go.
+	mean, sumSq, total := meanUnnormalisedVarianceSumWeights(x, weights)
+	// Biased estimate: divide by the total weight itself.
+	variance = sumSq / total
+	return mean, variance
+}
+
+// PopMeanStdDev computes the sample mean and the biased standard deviation
+// (also known as "population standard deviation").
+func PopMeanStdDev(x, weights []float64) (mean, std float64) {
+	mean, std = PopMeanVariance(x, weights)
+	return mean, math.Sqrt(std)
+}
+
+// PopStdDev computes the population standard deviation, that is, the square
+// root of the biased variance estimate.
+func PopStdDev(x, weights []float64) float64 {
+	_, sd := PopMeanStdDev(x, weights)
+	return sd
+}
+
+// PopVariance computes the biased weighted variance (the "population variance"):
+//
+//	\sum_i w_i (x_i - mean)^2 / (sum_i w_i)
+//
+// If weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights).
+func PopVariance(x, weights []float64) float64 {
+	_, variance := PopMeanVariance(x, weights)
+	return variance
+}
+
+func meanUnnormalisedVarianceSumWeights(x, weights []float64) (mean, unnormalisedVariance, sumWeights float64) {
+	// Corrected two-pass algorithm (1.7) of "Algorithms for computing the sample
+	// variance: Analysis and recommendations" by Chan, Tony F., Gene H. Golub,
+	// and Randall J. LeVeque.
+
+	// Mean panics on mismatched slice lengths, so no separate check is needed.
+	mean = Mean(x, weights)
+	var sqDev, devSum float64
+	if weights != nil {
+		for i, w := range weights {
+			dev := x[i] - mean
+			wDev := w * dev
+			sqDev += wDev * dev
+			devSum += wDev
+			sumWeights += w
+		}
+		// Subtract the compensation term built from the summed deviations.
+		unnormalisedVariance = (sqDev - devSum*devSum/sumWeights)
+		return mean, unnormalisedVariance, sumWeights
+	}
+
+	// Unweighted: every sample has weight 1, so the total weight is len(x).
+	for _, v := range x {
+		dev := v - mean
+		sqDev += dev * dev
+		devSum += dev
+	}
+	sumWeights = float64(len(x))
+	// Subtract the compensation term built from the summed deviations.
+	unnormalisedVariance = (sqDev - devSum*devSum/float64(len(x)))
+	return mean, unnormalisedVariance, sumWeights
+}
diff --git a/vendor/gonum.org/v1/gonum/stat/statmat.go b/vendor/gonum.org/v1/gonum/stat/statmat.go
new file mode 100644
index 00000000000..4f05f30645c
--- /dev/null
+++ b/vendor/gonum.org/v1/gonum/stat/statmat.go
@@ -0,0 +1,142 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/floats"
+	"gonum.org/v1/gonum/mat"
+)
+
+// CovarianceMatrix computes the covariance matrix (also called the
+// variance-covariance matrix) of the data matrix x with a two-pass
+// algorithm and stores the result in dst.
+//
+// A non-nil weights slice selects the weighted covariance of x. weights
+// must then have one entry per row of the input data matrix, none of
+// them negative.
+// The dst matrix must either be empty or have the same number of
+// columns as the input data matrix.
+func CovarianceMatrix(dst *mat.SymDense, x mat.Matrix, weights []float64) {
+	// Matrix form of the two-pass algorithm. Unlike the Covariance function, it
+	// does not apply the additional floating point error correction that
+	// reduces the impact of rounding during centering.
+
+	rows, cols := x.Dims()
+
+	if dst.IsEmpty() {
+		*dst = *(dst.GrowSym(cols).(*mat.SymDense))
+	} else if n := dst.SymmetricDim(); n != cols {
+		panic(mat.ErrShape)
+	}
+
+	var centered mat.Dense
+	centered.CloneFrom(x.T())
+	// Each row of the transposed copy is one column of x; remove its mean.
+	for j := 0; j < cols; j++ {
+		col := centered.RawRowView(j)
+		// Mean panics when a non-nil weights has a different length than
+		// col, so the size needs no further check below.
+		mu := Mean(col, weights)
+		floats.AddConst(-mu, col)
+	}
+
+	if weights == nil {
+		// Unweighted: normalize the outer product by the sample
+		// size minus one.
+		dst.SymOuterK(1/(float64(rows)-1), &centered)
+		return
+	}
+
+	// Scale by the square roots of the weights so that the product stays symmetric.
+	rootW := make([]float64, rows)
+	for i, w := range weights {
+		if w < 0 {
+			panic("stat: negative covariance matrix weights")
+		}
+		rootW[i] = math.Sqrt(w)
+	}
+	// Apply the weights to every centered column.
+	for j := 0; j < cols; j++ {
+		col := centered.RawRowView(j)
+		floats.Mul(col, rootW)
+	}
+
+	// Weighted: normalize the outer product by the total
+	// weight minus one.
+	dst.SymOuterK(1/(floats.Sum(weights)-1), &centered)
+}
+
+// CorrelationMatrix computes the correlation matrix of the data matrix x
+// with a two-pass algorithm and stores the result in dst.
+//
+// A non-nil weights slice selects the weighted correlation of x. weights
+// must then have one entry per row of the input data matrix, none of
+// them negative.
+// The dst matrix must either be empty or have the same number of
+// columns as the input data matrix.
+func CorrelationMatrix(dst *mat.SymDense, x mat.Matrix, weights []float64) {
+	// CovarianceMatrix performs all shape and weight validation and panics on a mismatch.
+	CovarianceMatrix(dst, x, weights)
+	covToCorr(dst)
+}
+
+// covToCorr rescales the covariance matrix c in place into a correlation matrix.
+func covToCorr(c *mat.SymDense) {
+	n := c.SymmetricDim()
+
+	invStd := make([]float64, n)
+	for i := range invStd {
+		invStd[i] = 1 / math.Sqrt(c.At(i, i))
+	}
+	for i, si := range invStd {
+		// Set the diagonal directly so that it holds exactly one.
+		c.SetSym(i, i, 1)
+		for j := i + 1; j < n; j++ {
+			cov := c.At(i, j)
+			c.SetSym(i, j, cov*si*invStd[j])
+		}
+	}
+}
+
+// corrToCov rescales the correlation matrix c in place into a covariance
+// matrix. sigma holds the standard deviations that belong to the
+// covariance. corrToCov panics if len(sigma) differs from the number of
+// rows of the correlation matrix.
+func corrToCov(c *mat.SymDense, sigma []float64) {
+	n, _ := c.Dims()
+
+	if n != len(sigma) {
+		panic(mat.ErrShape)
+	}
+	for i, si := range sigma {
+		// Set the diagonal directly so that it holds exactly sigma squared.
+		c.SetSym(i, i, si*si)
+		for j := i + 1; j < n; j++ {
+			corr := c.At(i, j)
+			c.SetSym(i, j, corr*si*sigma[j])
+		}
+	}
+}
+
+// Mahalanobis returns the Mahalanobis distance
+//
+//	D = sqrt((x-y)ᵀ * Σ^-1 * (x-y))
+//
+// between the column vectors x and y, given the cholesky decomposition of Σ.
+// It returns NaN if the linear solve fails.
+//
+// See https://en.wikipedia.org/wiki/Mahalanobis_distance for more information.
+func Mahalanobis(x, y mat.Vector, chol *mat.Cholesky) float64 {
+	var delta, solved mat.VecDense
+	delta.SubVec(x, y)
+	// Solve Σ * solved = delta through the factorization rather than inverting Σ.
+	if err := chol.SolveVecTo(&solved, &delta); err != nil {
+		return math.NaN()
+	}
+	// The distance is the square root of deltaᵀ * Σ^-1 * delta.
+	return math.Sqrt(mat.Dot(&solved, &delta))
+}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 125fa187d08..a9636205a1d 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -999,7 +999,7 @@ github.com/thanos-io/objstore/providers/gcs
 github.com/thanos-io/objstore/providers/s3
 github.com/thanos-io/objstore/providers/swift
 github.com/thanos-io/objstore/tracing/opentracing
-# github.com/thanos-io/promql-engine v0.0.0-20250220213456-fab1185f8c6c
+# github.com/thanos-io/promql-engine v0.0.0-20250302135832-accbf0891a16
 ## explicit; go 1.22.7
 github.com/thanos-io/promql-engine/api
 github.com/thanos-io/promql-engine/engine
@@ -1455,9 +1455,23 @@ golang.org/x/tools/internal/typesinternal
 golang.org/x/tools/internal/versions
 # gonum.org/v1/gonum v0.15.0
 ## explicit; go 1.21
+gonum.org/v1/gonum/blas
+gonum.org/v1/gonum/blas/blas64
+gonum.org/v1/gonum/blas/cblas128
+gonum.org/v1/gonum/blas/gonum
 gonum.org/v1/gonum/floats
 gonum.org/v1/gonum/floats/scalar
+gonum.org/v1/gonum/internal/asm/c128
+gonum.org/v1/gonum/internal/asm/c64
+gonum.org/v1/gonum/internal/asm/f32
 gonum.org/v1/gonum/internal/asm/f64
+gonum.org/v1/gonum/internal/cmplx64
+gonum.org/v1/gonum/internal/math32
+gonum.org/v1/gonum/lapack
+gonum.org/v1/gonum/lapack/gonum
+gonum.org/v1/gonum/lapack/lapack64
+gonum.org/v1/gonum/mat
+gonum.org/v1/gonum/stat
 # google.golang.org/api v0.218.0
 ## explicit; go 1.22
 google.golang.org/api/googleapi