/*
 * Decompiled with CFR 0.152.
 */
package dev.ludovic.netlib.blas;

import dev.ludovic.netlib.JavaBLAS;

public class Java11BLAS
extends dev.ludovic.netlib.blas.JavaBLAS {
    /** Shared instance, created once during class initialization and handed out by {@code getInstance()}. */
    private static final Java11BLAS instance = new Java11BLAS();

    /**
     * Creates a new instance.
     *
     * <p>The constructor is {@code protected}, so it can only be invoked from this package or
     * from a subclass constructor.
     */
    protected Java11BLAS() {
        super();
    }

    /**
     * Returns the shared instance held in the static {@code instance} field.
     *
     * @return the same object on every call
     */
    public static JavaBLAS getInstance() {
        return Java11BLAS.instance;
    }

    /**
     * Accumulates {@code alpha}-scaled dot products into a tile of {@code c}.
     *
     * <p>For each visited {@code (row, col)} pair the sum over {@code i} in {@code [is, ie)} of
     * {@code a[offseta + i + row * lda] * b[offsetb + i + col * ldb]} is built with
     * {@link Math#fma(double, double, double)}, and {@code c[offsetc + row + col * ldc]} is
     * replaced by {@code fma(alpha, sum, c[offsetc + row + col * ldc])}. {@code beta} is not
     * applied by this method.
     *
     * <p>Columns are walked one at a time up to {@code loopAlign(cols, cole, 3)}, three at a time
     * up to {@code loopBound(cole, 3)}, and one at a time again up to {@code cole}. Rows are
     * walked the same way with {@code rows}/{@code rowe}. A step that covers three rows and three
     * columns is delegated to {@code dgepdotTN}.
     *
     * @param m not read here; forwarded to {@code dgepdotTN}
     * @param rows first row index to visit (inclusive)
     * @param rowe row index at which to stop (exclusive)
     * @param n not read here; forwarded to {@code dgepdotTN}
     * @param cols first column index to visit (inclusive)
     * @param cole column index at which to stop (exclusive)
     * @param k not read here; forwarded to {@code dgepdotTN}
     * @param is first summation index (inclusive)
     * @param ie summation index at which to stop (exclusive)
     * @param alpha factor applied to every dot product before it is added to {@code c}
     * @param a first input array
     * @param offseta base offset into {@code a}
     * @param lda stride between consecutive rows' data in {@code a}
     * @param b second input array
     * @param offsetb base offset into {@code b}
     * @param ldb stride between consecutive columns' data in {@code b}
     * @param beta not read here; forwarded to {@code dgepdotTN}
     * @param c array updated in place
     * @param offsetc base offset into {@code c}
     * @param ldc stride between consecutive columns in {@code c}
     */
    @Override
    protected void dgebpTN(int m, int rows, int rowe, int n, int cols, int cole, int k, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        // The arguments are never reassigned, so every bound is computed once instead of on
        // each loop-condition check.
        // NOTE(review): assumes loopAlign/loopBound are side-effect free — confirm in the superclass.
        final int colAlignEnd = this.loopAlign(cols, cole, 3);
        final int colBlockEnd = this.loopBound(cole, 3);
        final int rowAlignEnd = this.loopAlign(rows, rowe, 3);
        final int rowBlockEnd = this.loopBound(rowe, 3);

        int col = cols;

        // Leading columns, one per step.
        for (; col < colAlignEnd; ++col) {
            dgebpTNSingleColumn(rows, rowe, rowAlignEnd, rowBlockEnd, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
        }

        // Three columns per step.
        for (; col < colBlockEnd; col += 3) {
            int row = rows;
            for (; row < rowAlignEnd; ++row) {
                dgebpTNRowByThreeColumns(row, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
            }
            for (; row < rowBlockEnd; row += 3) {
                // Virtual call on purpose: a subclass may supply its own 3x3 kernel.
                this.dgepdotTN(m, row, row + 3, n, col, col + 3, k, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, beta, c, offsetc, ldc);
            }
            for (; row < rowe; ++row) {
                dgebpTNRowByThreeColumns(row, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
            }
        }

        // Trailing columns, one per step.
        for (; col < cole; ++col) {
            dgebpTNSingleColumn(rows, rowe, rowAlignEnd, rowBlockEnd, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
        }
    }

    /** Updates column {@code col} of {@code c} for all rows: singly, then three at a time, then singly. */
    private static void dgebpTNSingleColumn(int rows, int rowe, int rowAlignEnd, int rowBlockEnd, int col, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double[] c, int offsetc, int ldc) {
        final int bCol = offsetb + col * ldb;
        final int cCol = offsetc + col * ldc;

        int row = rows;
        for (; row < rowAlignEnd; ++row) {
            dgebpTNSingleCell(row, is, ie, alpha, a, offseta, lda, b, bCol, c, cCol);
        }

        for (; row < rowBlockEnd; row += 3) {
            final int aRow0 = offseta + row * lda;
            final int aRow1 = offseta + (row + 1) * lda;
            final int aRow2 = offseta + (row + 2) * lda;
            double acc0 = 0.0;
            double acc1 = 0.0;
            double acc2 = 0.0;
            for (int i = is; i < ie; ++i) {
                final double x0 = a[aRow0 + i];
                final double x1 = a[aRow1 + i];
                final double x2 = a[aRow2 + i];
                final double y = b[bCol + i];
                acc0 = Math.fma(x0, y, acc0);
                acc1 = Math.fma(x1, y, acc1);
                acc2 = Math.fma(x2, y, acc2);
            }
            final int cIdx = cCol + row;
            c[cIdx] = Math.fma(alpha, acc0, c[cIdx]);
            c[cIdx + 1] = Math.fma(alpha, acc1, c[cIdx + 1]);
            c[cIdx + 2] = Math.fma(alpha, acc2, c[cIdx + 2]);
        }

        for (; row < rowe; ++row) {
            dgebpTNSingleCell(row, is, ie, alpha, a, offseta, lda, b, bCol, c, cCol);
        }
    }

    /** Updates the single element {@code c[cCol + row]} from one dot product over {@code [is, ie)}. */
    private static void dgebpTNSingleCell(int row, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int bCol, double[] c, int cCol) {
        final int aRow = offseta + row * lda;
        double acc = 0.0;
        for (int i = is; i < ie; ++i) {
            final double x = a[aRow + i];
            final double y = b[bCol + i];
            acc = Math.fma(x, y, acc);
        }
        final int cIdx = cCol + row;
        c[cIdx] = Math.fma(alpha, acc, c[cIdx]);
    }

    /** Updates row {@code row} of {@code c} in columns {@code col}, {@code col + 1} and {@code col + 2}. */
    private static void dgebpTNRowByThreeColumns(int row, int col, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double[] c, int offsetc, int ldc) {
        final int aRow = offseta + row * lda;
        final int bCol0 = offsetb + col * ldb;
        final int bCol1 = offsetb + (col + 1) * ldb;
        final int bCol2 = offsetb + (col + 2) * ldb;
        double acc0 = 0.0;
        double acc1 = 0.0;
        double acc2 = 0.0;
        for (int i = is; i < ie; ++i) {
            final double x = a[aRow + i];
            final double y0 = b[bCol0 + i];
            final double y1 = b[bCol1 + i];
            final double y2 = b[bCol2 + i];
            acc0 = Math.fma(x, y0, acc0);
            acc1 = Math.fma(x, y1, acc1);
            acc2 = Math.fma(x, y2, acc2);
        }
        final int cIdx0 = offsetc + row + col * ldc;
        final int cIdx1 = offsetc + row + (col + 1) * ldc;
        final int cIdx2 = offsetc + row + (col + 2) * ldc;
        c[cIdx0] = Math.fma(alpha, acc0, c[cIdx0]);
        c[cIdx1] = Math.fma(alpha, acc1, c[cIdx1]);
        c[cIdx2] = Math.fma(alpha, acc2, c[cIdx2]);
    }

    /**
     * Updates one three-row by three-column tile of {@code c} with {@code alpha}-scaled dot
     * products.
     *
     * <p>For {@code r} and {@code s} in {@code 0..2}, the sum over the visited {@code i} of
     * {@code a[offseta + i + (rows + r) * lda] * b[offsetb + i + (cols + s) * ldb]} is built with
     * {@link Math#fma(double, double, double)}, and
     * {@code c[offsetc + (rows + r) + (cols + s) * ldc]} is replaced by
     * {@code fma(alpha, sum, <old value>)}. The index {@code i} starts at {@code is}, advances by
     * one up to {@code loopAlign(is, ie, 2)}, by two up to {@code loopBound(ie, 2)}, and by one
     * again up to {@code ie}.
     *
     * @param m not read
     * @param rows first row of the tile
     * @param rowe asserted to equal {@code rows + 3}; not otherwise read
     * @param n not read
     * @param cols first column of the tile
     * @param cole asserted to equal {@code cols + 3}; not otherwise read
     * @param k not read
     * @param is first summation index (inclusive)
     * @param ie summation index at which to stop (exclusive)
     * @param alpha factor applied to every dot product before it is added to {@code c}
     * @param a first input array
     * @param offseta base offset into {@code a}
     * @param lda stride between consecutive rows' data in {@code a}
     * @param b second input array
     * @param offsetb base offset into {@code b}
     * @param ldb stride between consecutive columns' data in {@code b}
     * @param beta not read
     * @param c array updated in place
     * @param offsetc base offset into {@code c}
     * @param ldc stride between consecutive columns in {@code c}
     */
    @Override
    protected void dgepdotTN(int m, int rows, int rowe, int n, int cols, int cole, int k, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        assert (rowe - rows == 3);
        assert (cole - cols == 3);

        // Computed once; the arguments are never reassigned.
        // NOTE(review): assumes loopAlign/loopBound are side-effect free — confirm in the superclass.
        final int pairStart = this.loopAlign(is, ie, 2);
        final int pairEnd = this.loopBound(ie, 2);

        // Base offsets of the three rows of a and the three columns of b used by this tile.
        final int aRow0 = offseta + rows * lda;
        final int aRow1 = offseta + (rows + 1) * lda;
        final int aRow2 = offseta + (rows + 2) * lda;
        final int bCol0 = offsetb + cols * ldb;
        final int bCol1 = offsetb + (cols + 1) * ldb;
        final int bCol2 = offsetb + (cols + 2) * ldb;

        // sRC accumulates the dot product for row offset R and column offset C.
        double s00 = 0.0;
        double s01 = 0.0;
        double s02 = 0.0;
        double s10 = 0.0;
        double s11 = 0.0;
        double s12 = 0.0;
        double s20 = 0.0;
        double s21 = 0.0;
        double s22 = 0.0;

        int i = is;

        // Leading single steps.
        for (; i < pairStart; ++i) {
            final double x0 = a[aRow0 + i];
            final double x1 = a[aRow1 + i];
            final double x2 = a[aRow2 + i];
            final double y0 = b[bCol0 + i];
            s00 = Math.fma(x0, y0, s00);
            s10 = Math.fma(x1, y0, s10);
            s20 = Math.fma(x2, y0, s20);
            final double y1 = b[bCol1 + i];
            s01 = Math.fma(x0, y1, s01);
            s11 = Math.fma(x1, y1, s11);
            s21 = Math.fma(x2, y1, s21);
            final double y2 = b[bCol2 + i];
            s02 = Math.fma(x0, y2, s02);
            s12 = Math.fma(x1, y2, s12);
            s22 = Math.fma(x2, y2, s22);
        }

        // Two summation indices per step (manually unrolled).
        for (; i < pairEnd; i += 2) {
            final double x0 = a[aRow0 + i];
            final double x1 = a[aRow1 + i];
            final double x2 = a[aRow2 + i];
            final double y0 = b[bCol0 + i];
            s00 = Math.fma(x0, y0, s00);
            s10 = Math.fma(x1, y0, s10);
            s20 = Math.fma(x2, y0, s20);
            final double y1 = b[bCol1 + i];
            s01 = Math.fma(x0, y1, s01);
            s11 = Math.fma(x1, y1, s11);
            s21 = Math.fma(x2, y1, s21);
            final double y2 = b[bCol2 + i];
            s02 = Math.fma(x0, y2, s02);
            s12 = Math.fma(x1, y2, s12);
            s22 = Math.fma(x2, y2, s22);

            final double u0 = a[aRow0 + i + 1];
            final double u1 = a[aRow1 + i + 1];
            final double u2 = a[aRow2 + i + 1];
            final double v0 = b[bCol0 + i + 1];
            s00 = Math.fma(u0, v0, s00);
            s10 = Math.fma(u1, v0, s10);
            s20 = Math.fma(u2, v0, s20);
            final double v1 = b[bCol1 + i + 1];
            s01 = Math.fma(u0, v1, s01);
            s11 = Math.fma(u1, v1, s11);
            s21 = Math.fma(u2, v1, s21);
            final double v2 = b[bCol2 + i + 1];
            s02 = Math.fma(u0, v2, s02);
            s12 = Math.fma(u1, v2, s12);
            s22 = Math.fma(u2, v2, s22);
        }

        // Trailing single steps.
        for (; i < ie; ++i) {
            final double x0 = a[aRow0 + i];
            final double x1 = a[aRow1 + i];
            final double x2 = a[aRow2 + i];
            final double y0 = b[bCol0 + i];
            s00 = Math.fma(x0, y0, s00);
            s10 = Math.fma(x1, y0, s10);
            s20 = Math.fma(x2, y0, s20);
            final double y1 = b[bCol1 + i];
            s01 = Math.fma(x0, y1, s01);
            s11 = Math.fma(x1, y1, s11);
            s21 = Math.fma(x2, y1, s21);
            final double y2 = b[bCol2 + i];
            s02 = Math.fma(x0, y2, s02);
            s12 = Math.fma(x1, y2, s12);
            s22 = Math.fma(x2, y2, s22);
        }

        // Index of the tile's first row in each of its three columns of c.
        final int cCol0 = offsetc + rows + cols * ldc;
        final int cCol1 = offsetc + rows + (cols + 1) * ldc;
        final int cCol2 = offsetc + rows + (cols + 2) * ldc;

        c[cCol0] = Math.fma(alpha, s00, c[cCol0]);
        c[cCol1] = Math.fma(alpha, s01, c[cCol1]);
        c[cCol2] = Math.fma(alpha, s02, c[cCol2]);
        c[cCol0 + 1] = Math.fma(alpha, s10, c[cCol0 + 1]);
        c[cCol1 + 1] = Math.fma(alpha, s11, c[cCol1 + 1]);
        c[cCol2 + 1] = Math.fma(alpha, s12, c[cCol2 + 1]);
        c[cCol0 + 2] = Math.fma(alpha, s20, c[cCol0 + 2]);
        c[cCol1 + 2] = Math.fma(alpha, s21, c[cCol1 + 2]);
        c[cCol2 + 2] = Math.fma(alpha, s22, c[cCol2 + 2]);
    }

    /**
     * Updates {@code c} from products of elements of {@code a} and {@code b}, scaled by
     * {@code alpha} and combined with {@code beta * c}.
     *
     * <p>For each visited {@code (row, col)} pair the sum over the visited {@code i} of
     * {@code a[offseta + row + i * lda] * b[offsetb + i + col * ldb]} is built with
     * {@link Math#fma(double, double, double)}. The element {@code c[offsetc + row + col * ldc]}
     * is then set to {@code fma(alpha, sum, beta * <old value>)} when {@code beta != 0.0}, or to
     * {@code alpha * sum} without reading the old value otherwise.
     *
     * <p>Columns advance by three up to {@code loopBound(n, 3)} and then by one up to {@code n}.
     * Rows advance by three up to {@code loopBound(m, 3)} and then by one up to {@code m}. The
     * summation index advances by two up to {@code loopBound(k, 2)} and then by one up to
     * {@code k}.
     *
     * @param m row index at which to stop (exclusive); rows start at 0
     * @param n column index at which to stop (exclusive); columns start at 0
     * @param k summation index at which to stop (exclusive); summation starts at 0
     * @param alpha factor applied to every accumulated sum
     * @param a first input array
     * @param offseta base offset into {@code a}
     * @param lda stride between consecutive summation indices in {@code a}
     * @param b second input array
     * @param offsetb base offset into {@code b}
     * @param ldb stride between consecutive columns in {@code b}
     * @param beta factor applied to the previous content of {@code c}; {@code 0.0} means overwrite
     * @param c array updated in place
     * @param offsetc base offset into {@code c}
     * @param ldc stride between consecutive columns in {@code c}
     */
    @Override
    protected void dgemmNN(int m, int n, int k, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        // Computed once; m, n and k are never reassigned.
        // NOTE(review): assumes loopBound is side-effect free — confirm in the superclass.
        final int colBlockEnd = this.loopBound(n, 3);
        final int rowBlockEnd = this.loopBound(m, 3);
        final int pairEnd = this.loopBound(k, 2);

        int col = 0;

        // Three columns per step.
        for (; col < colBlockEnd; col += 3) {
            int row = 0;
            for (; row < rowBlockEnd; row += 3) {
                dgemmNNTile3x3(row, col, k, pairEnd, alpha, a, offseta, lda, b, offsetb, ldb, beta, c, offsetc, ldc);
            }
            for (; row < m; ++row) {
                dgemmNNTile1x3(row, col, k, pairEnd, alpha, a, offseta, lda, b, offsetb, ldb, beta, c, offsetc, ldc);
            }
        }

        // Remaining columns, one per step.
        for (; col < n; ++col) {
            int row = 0;
            for (; row < rowBlockEnd; row += 3) {
                dgemmNNTile3x1(row, col, k, pairEnd, alpha, a, offseta, lda, b, offsetb, ldb, beta, c, offsetc, ldc);
            }
            for (; row < m; ++row) {
                dgemmNNTile1x1(row, col, k, pairEnd, alpha, a, offseta, lda, b, offsetb, ldb, beta, c, offsetc, ldc);
            }
        }
    }

    /** Stores one result: {@code fma(alpha, sum, beta * c[idx])} if {@code beta != 0.0}, else {@code alpha * sum}. */
    private static void dgemmNNWriteBack(double[] c, int idx, double alpha, double sum, double beta) {
        if (beta != 0.0) {
            c[idx] = Math.fma(alpha, sum, beta * c[idx]);
        } else {
            // beta == 0.0: the previous content of c is deliberately not read.
            c[idx] = alpha * sum;
        }
    }

    /** Computes and stores the three-row by three-column tile whose first element is {@code (row, col)}. */
    private static void dgemmNNTile3x3(int row, int col, int k, int pairEnd, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        final int aRow = offseta + row;
        final int bCol0 = offsetb + col * ldb;
        final int bCol1 = offsetb + (col + 1) * ldb;
        final int bCol2 = offsetb + (col + 2) * ldb;

        // sRC accumulates the result for row offset R and column offset C.
        double s00 = 0.0;
        double s01 = 0.0;
        double s02 = 0.0;
        double s10 = 0.0;
        double s11 = 0.0;
        double s12 = 0.0;
        double s20 = 0.0;
        double s21 = 0.0;
        double s22 = 0.0;

        int i = 0;
        for (; i < pairEnd; i += 2) {
            final int aEven = aRow + i * lda;
            final double x0 = a[aEven];
            final double x1 = a[aEven + 1];
            final double x2 = a[aEven + 2];
            final double y0 = b[bCol0 + i];
            final double y1 = b[bCol1 + i];
            final double y2 = b[bCol2 + i];
            s00 = Math.fma(x0, y0, s00);
            s01 = Math.fma(x0, y1, s01);
            s02 = Math.fma(x0, y2, s02);
            s10 = Math.fma(x1, y0, s10);
            s11 = Math.fma(x1, y1, s11);
            s12 = Math.fma(x1, y2, s12);
            s20 = Math.fma(x2, y0, s20);
            s21 = Math.fma(x2, y1, s21);
            s22 = Math.fma(x2, y2, s22);

            final int aOdd = aRow + (i + 1) * lda;
            final double u0 = a[aOdd];
            final double u1 = a[aOdd + 1];
            final double u2 = a[aOdd + 2];
            final double v0 = b[bCol0 + i + 1];
            final double v1 = b[bCol1 + i + 1];
            final double v2 = b[bCol2 + i + 1];
            s00 = Math.fma(u0, v0, s00);
            s01 = Math.fma(u0, v1, s01);
            s02 = Math.fma(u0, v2, s02);
            s10 = Math.fma(u1, v0, s10);
            s11 = Math.fma(u1, v1, s11);
            s12 = Math.fma(u1, v2, s12);
            s20 = Math.fma(u2, v0, s20);
            s21 = Math.fma(u2, v1, s21);
            s22 = Math.fma(u2, v2, s22);
        }
        for (; i < k; ++i) {
            final int aCur = aRow + i * lda;
            final double x0 = a[aCur];
            final double x1 = a[aCur + 1];
            final double x2 = a[aCur + 2];
            final double y0 = b[bCol0 + i];
            final double y1 = b[bCol1 + i];
            final double y2 = b[bCol2 + i];
            s00 = Math.fma(x0, y0, s00);
            s01 = Math.fma(x0, y1, s01);
            s02 = Math.fma(x0, y2, s02);
            s10 = Math.fma(x1, y0, s10);
            s11 = Math.fma(x1, y1, s11);
            s12 = Math.fma(x1, y2, s12);
            s20 = Math.fma(x2, y0, s20);
            s21 = Math.fma(x2, y1, s21);
            s22 = Math.fma(x2, y2, s22);
        }

        final int cCol0 = offsetc + row + col * ldc;
        final int cCol1 = offsetc + row + (col + 1) * ldc;
        final int cCol2 = offsetc + row + (col + 2) * ldc;
        dgemmNNWriteBack(c, cCol0, alpha, s00, beta);
        dgemmNNWriteBack(c, cCol1, alpha, s01, beta);
        dgemmNNWriteBack(c, cCol2, alpha, s02, beta);
        dgemmNNWriteBack(c, cCol0 + 1, alpha, s10, beta);
        dgemmNNWriteBack(c, cCol1 + 1, alpha, s11, beta);
        dgemmNNWriteBack(c, cCol2 + 1, alpha, s12, beta);
        dgemmNNWriteBack(c, cCol0 + 2, alpha, s20, beta);
        dgemmNNWriteBack(c, cCol1 + 2, alpha, s21, beta);
        dgemmNNWriteBack(c, cCol2 + 2, alpha, s22, beta);
    }

    /** Computes and stores row {@code row} of {@code c} in columns {@code col} to {@code col + 2}. */
    private static void dgemmNNTile1x3(int row, int col, int k, int pairEnd, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        final int aRow = offseta + row;
        final int bCol0 = offsetb + col * ldb;
        final int bCol1 = offsetb + (col + 1) * ldb;
        final int bCol2 = offsetb + (col + 2) * ldb;

        double s0 = 0.0;
        double s1 = 0.0;
        double s2 = 0.0;

        int i = 0;
        for (; i < pairEnd; i += 2) {
            final double x = a[aRow + i * lda];
            final double y0 = b[bCol0 + i];
            final double y1 = b[bCol1 + i];
            final double y2 = b[bCol2 + i];
            s0 = Math.fma(x, y0, s0);
            s1 = Math.fma(x, y1, s1);
            s2 = Math.fma(x, y2, s2);

            final double u = a[aRow + (i + 1) * lda];
            final double v0 = b[bCol0 + i + 1];
            final double v1 = b[bCol1 + i + 1];
            final double v2 = b[bCol2 + i + 1];
            s0 = Math.fma(u, v0, s0);
            s1 = Math.fma(u, v1, s1);
            s2 = Math.fma(u, v2, s2);
        }
        for (; i < k; ++i) {
            final double x = a[aRow + i * lda];
            final double y0 = b[bCol0 + i];
            final double y1 = b[bCol1 + i];
            final double y2 = b[bCol2 + i];
            s0 = Math.fma(x, y0, s0);
            s1 = Math.fma(x, y1, s1);
            s2 = Math.fma(x, y2, s2);
        }

        dgemmNNWriteBack(c, offsetc + row + col * ldc, alpha, s0, beta);
        dgemmNNWriteBack(c, offsetc + row + (col + 1) * ldc, alpha, s1, beta);
        dgemmNNWriteBack(c, offsetc + row + (col + 2) * ldc, alpha, s2, beta);
    }

    /** Computes and stores rows {@code row} to {@code row + 2} of {@code c} in column {@code col}. */
    private static void dgemmNNTile3x1(int row, int col, int k, int pairEnd, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        final int aRow = offseta + row;
        final int bCol = offsetb + col * ldb;

        double s0 = 0.0;
        double s1 = 0.0;
        double s2 = 0.0;

        int i = 0;
        for (; i < pairEnd; i += 2) {
            final int aEven = aRow + i * lda;
            final double x0 = a[aEven];
            final double x1 = a[aEven + 1];
            final double x2 = a[aEven + 2];
            final double y = b[bCol + i];
            s0 = Math.fma(x0, y, s0);
            s1 = Math.fma(x1, y, s1);
            s2 = Math.fma(x2, y, s2);

            final int aOdd = aRow + (i + 1) * lda;
            final double u0 = a[aOdd];
            final double u1 = a[aOdd + 1];
            final double u2 = a[aOdd + 2];
            final double v = b[bCol + i + 1];
            s0 = Math.fma(u0, v, s0);
            s1 = Math.fma(u1, v, s1);
            s2 = Math.fma(u2, v, s2);
        }
        for (; i < k; ++i) {
            final int aCur = aRow + i * lda;
            final double x0 = a[aCur];
            final double x1 = a[aCur + 1];
            final double x2 = a[aCur + 2];
            final double y = b[bCol + i];
            s0 = Math.fma(x0, y, s0);
            s1 = Math.fma(x1, y, s1);
            s2 = Math.fma(x2, y, s2);
        }

        final int cIdx = offsetc + row + col * ldc;
        dgemmNNWriteBack(c, cIdx, alpha, s0, beta);
        dgemmNNWriteBack(c, cIdx + 1, alpha, s1, beta);
        dgemmNNWriteBack(c, cIdx + 2, alpha, s2, beta);
    }

    /** Computes and stores the single element of {@code c} at {@code (row, col)}. */
    private static void dgemmNNTile1x1(int row, int col, int k, int pairEnd, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        final int aRow = offseta + row;
        final int bCol = offsetb + col * ldb;

        double sum = 0.0;

        int i = 0;
        for (; i < pairEnd; i += 2) {
            final double x = a[aRow + i * lda];
            final double y = b[bCol + i];
            sum = Math.fma(x, y, sum);

            final double u = a[aRow + (i + 1) * lda];
            final double v = b[bCol + i + 1];
            sum = Math.fma(u, v, sum);
        }
        for (; i < k; ++i) {
            final double x = a[aRow + i * lda];
            final double y = b[bCol + i];
            sum = Math.fma(x, y, sum);
        }

        dgemmNNWriteBack(c, offsetc + row + col * ldc, alpha, sum, beta);
    }

    @Override
    protected void dgemmNT(int m, int n, int k, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        double a01;
        double a00;
        int i;
        double sum00;
        int row;
        int col;
        int Trow = 3;
        int Tcol = 3;
        int Ti = 2;
        for (col = 0; col < this.loopBound(n, 3); col += 3) {
            double sum02;
            double sum01;
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                double b02;
                double b01;
                double b00;
                double a20;
                double a10;
                double a002;
                sum00 = 0.0;
                sum01 = 0.0;
                sum02 = 0.0;
                double sum10 = 0.0;
                double sum11 = 0.0;
                double sum12 = 0.0;
                double sum20 = 0.0;
                double sum21 = 0.0;
                double sum22 = 0.0;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    a002 = a[offseta + (row + 0) + (i + 0) * lda];
                    a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a002, b00, sum00);
                    sum01 = Math.fma(a002, b01, sum01);
                    sum02 = Math.fma(a002, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    double a012 = a[offseta + (row + 0) + (i + 1) * lda];
                    double a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    double a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    double b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    double b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    double b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a012, b10, sum00);
                    sum01 = Math.fma(a012, b11, sum01);
                    sum02 = Math.fma(a012, b12, sum02);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum11 = Math.fma(a11, b11, sum11);
                    sum12 = Math.fma(a11, b12, sum12);
                    sum20 = Math.fma(a21, b10, sum20);
                    sum21 = Math.fma(a21, b11, sum21);
                    sum22 = Math.fma(a21, b12, sum22);
                }
                while (i < k) {
                    a002 = a[offseta + (row + 0) + (i + 0) * lda];
                    a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a002, b00, sum00);
                    sum01 = Math.fma(a002, b01, sum01);
                    sum02 = Math.fma(a002, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    ++i;
                }
                if (beta != 0.0) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 1) * ldc] = Math.fma(alpha, sum11, beta * c[offsetc + (row + 1) + (col + 1) * ldc]);
                    c[offsetc + (row + 1) + (col + 2) * ldc] = Math.fma(alpha, sum12, beta * c[offsetc + (row + 1) + (col + 2) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 1) * ldc] = Math.fma(alpha, sum21, beta * c[offsetc + (row + 2) + (col + 1) * ldc]);
                    c[offsetc + (row + 2) + (col + 2) * ldc] = Math.fma(alpha, sum22, beta * c[offsetc + (row + 2) + (col + 2) * ldc]);
                    continue;
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                c[offsetc + (row + 1) + (col + 1) * ldc] = alpha * sum11;
                c[offsetc + (row + 1) + (col + 2) * ldc] = alpha * sum12;
                c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                c[offsetc + (row + 2) + (col + 1) * ldc] = alpha * sum21;
                c[offsetc + (row + 2) + (col + 2) * ldc] = alpha * sum22;
            }
            while (row < m) {
                double b02;
                double b01;
                double b00;
                sum00 = 0.0;
                sum01 = 0.0;
                sum02 = 0.0;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    double b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    double b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    double b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                }
                while (i < k) {
                    a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    ++i;
                }
                if (beta != 0.0) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                }
                ++row;
            }
        }
        while (col < n) {
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                double b00;
                double a20;
                double a10;
                sum00 = 0.0;
                double sum10 = 0.0;
                double sum20 = 0.0;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    double a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    double a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    double b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum20 = Math.fma(a21, b10, sum20);
                }
                while (i < k) {
                    a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    ++i;
                }
                if (beta != 0.0) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    continue;
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
            }
            while (row < m) {
                double b00;
                double a003;
                sum00 = 0.0;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    a003 = a[offseta + (row + 0) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a003, b00, sum00);
                    double a013 = a[offseta + (row + 0) + (i + 1) * lda];
                    double b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a013, b10, sum00);
                }
                while (i < k) {
                    a003 = a[offseta + (row + 0) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a003, b00, sum00);
                    ++i;
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = beta != 0.0 ? Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]) : alpha * sum00;
                ++row;
            }
            ++col;
        }
    }

    /**
     * Computes, for {@code 0 <= row < m} and {@code 0 <= col < n},
     * {@code C(row, col) = alpha * sum(A(i, row) * B(i, col), 0 <= i < k) + beta * C(row, col)}
     * where {@code A(i, row)} is {@code a[offseta + i + row * lda]}, {@code B(i, col)} is
     * {@code b[offsetb + i + col * ldb]} and {@code C(row, col)} is
     * {@code c[offsetc + row + col * ldc]}.
     *
     * <p>The output is produced in 3x3 tiles with the summation loop unrolled by two; leftover
     * rows, columns and summation steps are handled by narrower kernels. Each accumulator adds
     * its products in increasing {@code i} order using {@link Math#fma(double, double, double)}.
     * When {@code beta} compares equal to zero the previous contents of {@code c} are never
     * read, only overwritten with {@code alpha * sum}.
     *
     * @param m       number of rows of {@code c} to compute
     * @param n       number of columns of {@code c} to compute
     * @param k       number of products summed for each element of {@code c}
     * @param alpha   factor applied to every accumulated sum
     * @param a       array holding the first operand
     * @param offseta index in {@code a} of the operand's first element
     * @param lda     distance in {@code a} between consecutive {@code row} indices
     * @param b       array holding the second operand
     * @param offsetb index in {@code b} of the operand's first element
     * @param ldb     distance in {@code b} between consecutive {@code col} indices
     * @param beta    factor applied to the existing value of {@code c}; zero means overwrite
     * @param c       array updated in place with the result
     * @param offsetc index in {@code c} of the result's first element
     * @param ldc     distance in {@code c} between consecutive {@code col} indices
     */
    @Override
    protected void dgemmTN(int m, int n, int k, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        // Blocking factors of the kernels below: 3 rows x 3 columns of c, summation unrolled by 2.
        final int rowTile = 3;
        final int colTile = 3;
        final int depthUnroll = 2;
        // The loop limits never change, so they are evaluated once rather than on every loop test.
        // NOTE(review): relies on loopBound (declared outside this block) being side-effect free.
        final int colLimit = this.loopBound(n, colTile);
        final int rowLimit = this.loopBound(m, rowTile);
        final int depthLimit = this.loopBound(k, depthUnroll);
        final boolean scaleExisting = beta != 0.0;

        int col = 0;
        // Column panels that are a full three columns wide.
        for (; col < colLimit; col += colTile) {
            final int b0 = offsetb + col * ldb;
            final int b1 = offsetb + (col + 1) * ldb;
            final int b2 = offsetb + (col + 2) * ldb;
            final int c0 = offsetc + col * ldc;
            final int c1 = offsetc + (col + 1) * ldc;
            final int c2 = offsetc + (col + 2) * ldc;

            int row = 0;
            // Full 3x3 tiles: nine independent accumulators named s<rowOffset><colOffset>.
            for (; row < rowLimit; row += rowTile) {
                final int a0 = offseta + row * lda;
                final int a1 = offseta + (row + 1) * lda;
                final int a2 = offseta + (row + 2) * lda;
                double s00 = 0.0;
                double s01 = 0.0;
                double s02 = 0.0;
                double s10 = 0.0;
                double s11 = 0.0;
                double s12 = 0.0;
                double s20 = 0.0;
                double s21 = 0.0;
                double s22 = 0.0;
                int i = 0;
                for (; i < depthLimit; i += depthUnroll) {
                    double x0 = a[a0 + i];
                    double x1 = a[a1 + i];
                    double x2 = a[a2 + i];
                    double y0 = b[b0 + i];
                    double y1 = b[b1 + i];
                    double y2 = b[b2 + i];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                    // Second unrolled step (i + 1).
                    x0 = a[a0 + i + 1];
                    x1 = a[a1 + i + 1];
                    x2 = a[a2 + i + 1];
                    y0 = b[b0 + i + 1];
                    y1 = b[b1 + i + 1];
                    y2 = b[b2 + i + 1];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                // Summation step left over when k is not covered by the unrolled loop.
                for (; i < k; ++i) {
                    final double x0 = a[a0 + i];
                    final double x1 = a[a1 + i];
                    final double x2 = a[a2 + i];
                    final double y0 = b[b0 + i];
                    final double y1 = b[b1 + i];
                    final double y2 = b[b2 + i];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                final int r0 = row;
                final int r1 = row + 1;
                final int r2 = row + 2;
                if (scaleExisting) {
                    c[c0 + r0] = Math.fma(alpha, s00, beta * c[c0 + r0]);
                    c[c1 + r0] = Math.fma(alpha, s01, beta * c[c1 + r0]);
                    c[c2 + r0] = Math.fma(alpha, s02, beta * c[c2 + r0]);
                    c[c0 + r1] = Math.fma(alpha, s10, beta * c[c0 + r1]);
                    c[c1 + r1] = Math.fma(alpha, s11, beta * c[c1 + r1]);
                    c[c2 + r1] = Math.fma(alpha, s12, beta * c[c2 + r1]);
                    c[c0 + r2] = Math.fma(alpha, s20, beta * c[c0 + r2]);
                    c[c1 + r2] = Math.fma(alpha, s21, beta * c[c1 + r2]);
                    c[c2 + r2] = Math.fma(alpha, s22, beta * c[c2 + r2]);
                } else {
                    // beta == 0: overwrite without reading the old contents of c.
                    c[c0 + r0] = alpha * s00;
                    c[c1 + r0] = alpha * s01;
                    c[c2 + r0] = alpha * s02;
                    c[c0 + r1] = alpha * s10;
                    c[c1 + r1] = alpha * s11;
                    c[c2 + r1] = alpha * s12;
                    c[c0 + r2] = alpha * s20;
                    c[c1 + r2] = alpha * s21;
                    c[c2 + r2] = alpha * s22;
                }
            }
            // Leftover rows of this panel: one row against three columns.
            for (; row < m; ++row) {
                final int a0 = offseta + row * lda;
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < depthLimit; i += depthUnroll) {
                    double x = a[a0 + i];
                    double y0 = b[b0 + i];
                    double y1 = b[b1 + i];
                    double y2 = b[b2 + i];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                    x = a[a0 + i + 1];
                    y0 = b[b0 + i + 1];
                    y1 = b[b1 + i + 1];
                    y2 = b[b2 + i + 1];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                for (; i < k; ++i) {
                    final double x = a[a0 + i];
                    final double y0 = b[b0 + i];
                    final double y1 = b[b1 + i];
                    final double y2 = b[b2 + i];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                if (scaleExisting) {
                    c[c0 + row] = Math.fma(alpha, s0, beta * c[c0 + row]);
                    c[c1 + row] = Math.fma(alpha, s1, beta * c[c1 + row]);
                    c[c2 + row] = Math.fma(alpha, s2, beta * c[c2 + row]);
                } else {
                    c[c0 + row] = alpha * s0;
                    c[c1 + row] = alpha * s1;
                    c[c2 + row] = alpha * s2;
                }
            }
        }
        // Leftover columns, processed one at a time.
        for (; col < n; ++col) {
            final int b0 = offsetb + col * ldb;
            final int c0 = offsetc + col * ldc;

            int row = 0;
            // Three rows against a single column.
            for (; row < rowLimit; row += rowTile) {
                final int a0 = offseta + row * lda;
                final int a1 = offseta + (row + 1) * lda;
                final int a2 = offseta + (row + 2) * lda;
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < depthLimit; i += depthUnroll) {
                    double x0 = a[a0 + i];
                    double x1 = a[a1 + i];
                    double x2 = a[a2 + i];
                    double y = b[b0 + i];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                    x0 = a[a0 + i + 1];
                    x1 = a[a1 + i + 1];
                    x2 = a[a2 + i + 1];
                    y = b[b0 + i + 1];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                for (; i < k; ++i) {
                    final double x0 = a[a0 + i];
                    final double x1 = a[a1 + i];
                    final double x2 = a[a2 + i];
                    final double y = b[b0 + i];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                if (scaleExisting) {
                    c[c0 + row] = Math.fma(alpha, s0, beta * c[c0 + row]);
                    c[c0 + row + 1] = Math.fma(alpha, s1, beta * c[c0 + row + 1]);
                    c[c0 + row + 2] = Math.fma(alpha, s2, beta * c[c0 + row + 2]);
                } else {
                    c[c0 + row] = alpha * s0;
                    c[c0 + row + 1] = alpha * s1;
                    c[c0 + row + 2] = alpha * s2;
                }
            }
            // Single leftover elements.
            for (; row < m; ++row) {
                final int a0 = offseta + row * lda;
                double s = 0.0;
                int i = 0;
                for (; i < depthLimit; i += depthUnroll) {
                    s = Math.fma(a[a0 + i], b[b0 + i], s);
                    s = Math.fma(a[a0 + i + 1], b[b0 + i + 1], s);
                }
                for (; i < k; ++i) {
                    s = Math.fma(a[a0 + i], b[b0 + i], s);
                }
                if (scaleExisting) {
                    c[c0 + row] = Math.fma(alpha, s, beta * c[c0 + row]);
                } else {
                    c[c0 + row] = alpha * s;
                }
            }
        }
    }

    /**
     * Computes, for {@code 0 <= row < m} and {@code 0 <= col < n},
     * {@code C(row, col) = alpha * sum(A(i, row) * B(col, i), 0 <= i < k) + beta * C(row, col)}
     * where {@code A(i, row)} is {@code a[offseta + i + row * lda]}, {@code B(col, i)} is
     * {@code b[offsetb + col + i * ldb]} and {@code C(row, col)} is
     * {@code c[offsetc + row + col * ldc]}.
     *
     * <p>The output is produced in 3x3 tiles with the summation loop unrolled by two; leftover
     * rows, columns and summation steps are handled by narrower kernels. Each accumulator adds
     * its products in increasing {@code i} order using {@link Math#fma(double, double, double)}.
     * When {@code beta} compares equal to zero the previous contents of {@code c} are never
     * read, only overwritten with {@code alpha * sum}.
     *
     * @param m       number of rows of {@code c} to compute
     * @param n       number of columns of {@code c} to compute
     * @param k       number of products summed for each element of {@code c}
     * @param alpha   factor applied to every accumulated sum
     * @param a       array holding the first operand
     * @param offseta index in {@code a} of the operand's first element
     * @param lda     distance in {@code a} between consecutive {@code row} indices
     * @param b       array holding the second operand
     * @param offsetb index in {@code b} of the operand's first element
     * @param ldb     distance in {@code b} between consecutive summation indices {@code i}
     * @param beta    factor applied to the existing value of {@code c}; zero means overwrite
     * @param c       array updated in place with the result
     * @param offsetc index in {@code c} of the result's first element
     * @param ldc     distance in {@code c} between consecutive {@code col} indices
     */
    @Override
    protected void dgemmTT(int m, int n, int k, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        // Blocking factors of the kernels below: 3 rows x 3 columns of c, summation unrolled by 2.
        final int rowTile = 3;
        final int colTile = 3;
        final int depthUnroll = 2;
        // The loop limits never change, so they are evaluated once rather than on every loop test.
        // NOTE(review): relies on loopBound (declared outside this block) being side-effect free.
        final int colLimit = this.loopBound(n, colTile);
        final int rowLimit = this.loopBound(m, rowTile);
        final int depthLimit = this.loopBound(k, depthUnroll);
        final boolean scaleExisting = beta != 0.0;

        int col = 0;
        // Column panels that are a full three columns wide.
        for (; col < colLimit; col += colTile) {
            // The three columns of this panel sit next to each other in b for a fixed i.
            final int bBase = offsetb + col;
            final int c0 = offsetc + col * ldc;
            final int c1 = offsetc + (col + 1) * ldc;
            final int c2 = offsetc + (col + 2) * ldc;

            int row = 0;
            // Full 3x3 tiles: nine independent accumulators named s<rowOffset><colOffset>.
            for (; row < rowLimit; row += rowTile) {
                final int a0 = offseta + row * lda;
                final int a1 = offseta + (row + 1) * lda;
                final int a2 = offseta + (row + 2) * lda;
                double s00 = 0.0;
                double s01 = 0.0;
                double s02 = 0.0;
                double s10 = 0.0;
                double s11 = 0.0;
                double s12 = 0.0;
                double s20 = 0.0;
                double s21 = 0.0;
                double s22 = 0.0;
                int i = 0;
                for (; i < depthLimit; i += depthUnroll) {
                    final int bi = bBase + i * ldb;
                    double x0 = a[a0 + i];
                    double x1 = a[a1 + i];
                    double x2 = a[a2 + i];
                    double y0 = b[bi];
                    double y1 = b[bi + 1];
                    double y2 = b[bi + 2];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                    // Second unrolled step (i + 1).
                    final int bj = bi + ldb;
                    x0 = a[a0 + i + 1];
                    x1 = a[a1 + i + 1];
                    x2 = a[a2 + i + 1];
                    y0 = b[bj];
                    y1 = b[bj + 1];
                    y2 = b[bj + 2];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                // Summation step left over when k is not covered by the unrolled loop.
                for (; i < k; ++i) {
                    final int bi = bBase + i * ldb;
                    final double x0 = a[a0 + i];
                    final double x1 = a[a1 + i];
                    final double x2 = a[a2 + i];
                    final double y0 = b[bi];
                    final double y1 = b[bi + 1];
                    final double y2 = b[bi + 2];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                final int r0 = row;
                final int r1 = row + 1;
                final int r2 = row + 2;
                if (scaleExisting) {
                    c[c0 + r0] = Math.fma(alpha, s00, beta * c[c0 + r0]);
                    c[c1 + r0] = Math.fma(alpha, s01, beta * c[c1 + r0]);
                    c[c2 + r0] = Math.fma(alpha, s02, beta * c[c2 + r0]);
                    c[c0 + r1] = Math.fma(alpha, s10, beta * c[c0 + r1]);
                    c[c1 + r1] = Math.fma(alpha, s11, beta * c[c1 + r1]);
                    c[c2 + r1] = Math.fma(alpha, s12, beta * c[c2 + r1]);
                    c[c0 + r2] = Math.fma(alpha, s20, beta * c[c0 + r2]);
                    c[c1 + r2] = Math.fma(alpha, s21, beta * c[c1 + r2]);
                    c[c2 + r2] = Math.fma(alpha, s22, beta * c[c2 + r2]);
                } else {
                    // beta == 0: overwrite without reading the old contents of c.
                    c[c0 + r0] = alpha * s00;
                    c[c1 + r0] = alpha * s01;
                    c[c2 + r0] = alpha * s02;
                    c[c0 + r1] = alpha * s10;
                    c[c1 + r1] = alpha * s11;
                    c[c2 + r1] = alpha * s12;
                    c[c0 + r2] = alpha * s20;
                    c[c1 + r2] = alpha * s21;
                    c[c2 + r2] = alpha * s22;
                }
            }
            // Leftover rows of this panel: one row against three columns.
            for (; row < m; ++row) {
                final int a0 = offseta + row * lda;
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < depthLimit; i += depthUnroll) {
                    final int bi = bBase + i * ldb;
                    double x = a[a0 + i];
                    double y0 = b[bi];
                    double y1 = b[bi + 1];
                    double y2 = b[bi + 2];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                    final int bj = bi + ldb;
                    x = a[a0 + i + 1];
                    y0 = b[bj];
                    y1 = b[bj + 1];
                    y2 = b[bj + 2];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                for (; i < k; ++i) {
                    final int bi = bBase + i * ldb;
                    final double x = a[a0 + i];
                    final double y0 = b[bi];
                    final double y1 = b[bi + 1];
                    final double y2 = b[bi + 2];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                if (scaleExisting) {
                    c[c0 + row] = Math.fma(alpha, s0, beta * c[c0 + row]);
                    c[c1 + row] = Math.fma(alpha, s1, beta * c[c1 + row]);
                    c[c2 + row] = Math.fma(alpha, s2, beta * c[c2 + row]);
                } else {
                    c[c0 + row] = alpha * s0;
                    c[c1 + row] = alpha * s1;
                    c[c2 + row] = alpha * s2;
                }
            }
        }
        // Leftover columns, processed one at a time.
        for (; col < n; ++col) {
            final int bBase = offsetb + col;
            final int c0 = offsetc + col * ldc;

            int row = 0;
            // Three rows against a single column.
            for (; row < rowLimit; row += rowTile) {
                final int a0 = offseta + row * lda;
                final int a1 = offseta + (row + 1) * lda;
                final int a2 = offseta + (row + 2) * lda;
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < depthLimit; i += depthUnroll) {
                    final int bi = bBase + i * ldb;
                    double x0 = a[a0 + i];
                    double x1 = a[a1 + i];
                    double x2 = a[a2 + i];
                    double y = b[bi];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                    x0 = a[a0 + i + 1];
                    x1 = a[a1 + i + 1];
                    x2 = a[a2 + i + 1];
                    y = b[bi + ldb];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                for (; i < k; ++i) {
                    final double x0 = a[a0 + i];
                    final double x1 = a[a1 + i];
                    final double x2 = a[a2 + i];
                    final double y = b[bBase + i * ldb];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                if (scaleExisting) {
                    c[c0 + row] = Math.fma(alpha, s0, beta * c[c0 + row]);
                    c[c0 + row + 1] = Math.fma(alpha, s1, beta * c[c0 + row + 1]);
                    c[c0 + row + 2] = Math.fma(alpha, s2, beta * c[c0 + row + 2]);
                } else {
                    c[c0 + row] = alpha * s0;
                    c[c0 + row + 1] = alpha * s1;
                    c[c0 + row + 2] = alpha * s2;
                }
            }
            // Single leftover elements.
            for (; row < m; ++row) {
                final int a0 = offseta + row * lda;
                double s = 0.0;
                int i = 0;
                for (; i < depthLimit; i += depthUnroll) {
                    final int bi = bBase + i * ldb;
                    s = Math.fma(a[a0 + i], b[bi], s);
                    s = Math.fma(a[a0 + i + 1], b[bi + ldb], s);
                }
                for (; i < k; ++i) {
                    s = Math.fma(a[a0 + i], b[bBase + i * ldb], s);
                }
                if (scaleExisting) {
                    c[c0 + row] = Math.fma(alpha, s, beta * c[c0 + row]);
                } else {
                    c[c0 + row] = alpha * s;
                }
            }
        }
    }

    @Override
    protected void sgebpTN(int m, int rows, int rowe, int n, int cols, int cole, int k, int is, int ie, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        float b0;
        float a2;
        float a1;
        float sum20;
        float a0;
        float sum00;
        int row;
        int col;
        int Tcol = 3;
        int Trow = 3;
        int Ti = 2;
        for (col = cols; col < this.loopAlign(cols, cole, 3); ++col) {
            for (row = rows; row < this.loopAlign(rows, rowe, 3); ++row) {
                sum00 = 0.0f;
                for (int i = is; i < ie; ++i) {
                    a0 = a[offseta + i + (row + 0) * lda];
                    float b02 = b[offsetb + i + (col + 0) * ldb];
                    sum00 = Math.fma(a0, b02, sum00);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, c[offsetc + (row + 0) + (col + 0) * ldc]);
            }
            while (row < this.loopBound(rowe, 3)) {
                sum00 = 0.0f;
                float sum10 = 0.0f;
                sum20 = 0.0f;
                for (int i = is; i < ie; ++i) {
                    float a02 = a[offseta + i + (row + 0) * lda];
                    a1 = a[offseta + i + (row + 1) * lda];
                    a2 = a[offseta + i + (row + 2) * lda];
                    b0 = b[offsetb + i + (col + 0) * ldb];
                    sum00 = Math.fma(a02, b0, sum00);
                    sum10 = Math.fma(a1, b0, sum10);
                    sum20 = Math.fma(a2, b0, sum20);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, c[offsetc + (row + 0) + (col + 0) * ldc]);
                c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, c[offsetc + (row + 1) + (col + 0) * ldc]);
                c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, c[offsetc + (row + 2) + (col + 0) * ldc]);
                row += 3;
            }
            while (row < rowe) {
                sum00 = 0.0f;
                for (int i = is; i < ie; ++i) {
                    a0 = a[offseta + i + (row + 0) * lda];
                    float b03 = b[offsetb + i + (col + 0) * ldb];
                    sum00 = Math.fma(a0, b03, sum00);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, c[offsetc + (row + 0) + (col + 0) * ldc]);
                ++row;
            }
        }
        while (col < this.loopBound(cole, 3)) {
            float sum02;
            float sum01;
            for (row = rows; row < this.loopAlign(rows, rowe, 3); ++row) {
                sum00 = 0.0f;
                sum01 = 0.0f;
                sum02 = 0.0f;
                float sum03 = 0.0f;
                for (int i = is; i < ie; ++i) {
                    float a03 = a[offseta + i + (row + 0) * lda];
                    float b04 = b[offsetb + i + (col + 0) * ldb];
                    float b1 = b[offsetb + i + (col + 1) * ldb];
                    float b2 = b[offsetb + i + (col + 2) * ldb];
                    sum00 = Math.fma(a03, b04, sum00);
                    sum01 = Math.fma(a03, b1, sum01);
                    sum02 = Math.fma(a03, b2, sum02);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, c[offsetc + (row + 0) + (col + 0) * ldc]);
                c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, c[offsetc + (row + 0) + (col + 1) * ldc]);
                c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, c[offsetc + (row + 0) + (col + 2) * ldc]);
            }
            while (row < this.loopBound(rowe, 3)) {
                this.sgepdotTN(m, row, row + 3, n, col, col + 3, k, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, beta, c, offsetc, ldc);
                row += 3;
            }
            while (row < rowe) {
                sum00 = 0.0f;
                sum01 = 0.0f;
                sum02 = 0.0f;
                for (int i = is; i < ie; ++i) {
                    float a04 = a[offseta + i + (row + 0) * lda];
                    float b05 = b[offsetb + i + (col + 0) * ldb];
                    float b1 = b[offsetb + i + (col + 1) * ldb];
                    float b2 = b[offsetb + i + (col + 2) * ldb];
                    sum00 = Math.fma(a04, b05, sum00);
                    sum01 = Math.fma(a04, b1, sum01);
                    sum02 = Math.fma(a04, b2, sum02);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, c[offsetc + (row + 0) + (col + 0) * ldc]);
                c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, c[offsetc + (row + 0) + (col + 1) * ldc]);
                c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, c[offsetc + (row + 0) + (col + 2) * ldc]);
                ++row;
            }
            col += 3;
        }
        while (col < cole) {
            for (row = rows; row < this.loopAlign(rows, rowe, 3); ++row) {
                sum00 = 0.0f;
                for (int i = is; i < ie; ++i) {
                    a0 = a[offseta + i + (row + 0) * lda];
                    float b06 = b[offsetb + i + (col + 0) * ldb];
                    sum00 = Math.fma(a0, b06, sum00);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, c[offsetc + (row + 0) + (col + 0) * ldc]);
            }
            while (row < this.loopBound(rowe, 3)) {
                sum00 = 0.0f;
                float sum10 = 0.0f;
                sum20 = 0.0f;
                for (int i = is; i < ie; ++i) {
                    float a05 = a[offseta + i + (row + 0) * lda];
                    a1 = a[offseta + i + (row + 1) * lda];
                    a2 = a[offseta + i + (row + 2) * lda];
                    b0 = b[offsetb + i + (col + 0) * ldb];
                    sum00 = Math.fma(a05, b0, sum00);
                    sum10 = Math.fma(a1, b0, sum10);
                    sum20 = Math.fma(a2, b0, sum20);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, c[offsetc + (row + 0) + (col + 0) * ldc]);
                c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, c[offsetc + (row + 1) + (col + 0) * ldc]);
                c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, c[offsetc + (row + 2) + (col + 0) * ldc]);
                row += 3;
            }
            while (row < rowe) {
                sum00 = 0.0f;
                for (int i = is; i < ie; ++i) {
                    a0 = a[offseta + i + (row + 0) * lda];
                    float b07 = b[offsetb + i + (col + 0) * ldb];
                    sum00 = Math.fma(a0, b07, sum00);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, c[offsetc + (row + 0) + (col + 0) * ldc]);
                ++row;
            }
            ++col;
        }
    }

    /**
     * Accumulates a 3x3 tile of dot products into {@code c}.
     *
     * <p>For each {@code r, q} in {@code 0..2} this computes the sum over {@code i} in
     * {@code [is, ie)} of {@code a[offseta + i + (rows + r) * lda] * b[offsetb + i + (cols + q) * ldb]}
     * using fused multiply-add, and then stores
     * {@code fma(alpha, sum, c[idx])} back to {@code c[idx]} with
     * {@code idx = offsetc + (rows + r) + (cols + q) * ldc}.
     *
     * <p>The parameters {@code m}, {@code n}, {@code k} and {@code beta} are not read by this
     * method; the existing contents of {@code c} are added to, not scaled.
     *
     * @param rows first row of the tile; {@code rowe - rows} is asserted to be 3
     * @param rowe one past the last row of the tile
     * @param cols first column of the tile; {@code cole - cols} is asserted to be 3
     * @param cole one past the last column of the tile
     * @param is first index (inclusive) of the summation range
     * @param ie last index (exclusive) of the summation range
     * @param alpha scale applied to each accumulated dot product
     */
    @Override
    protected void sgepdotTN(int m, int rows, int rowe, int n, int cols, int cole, int k, int is, int ie, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        assert (rowe - rows == 3);
        assert (cole - cols == 3);

        // Base positions of the three strips of a, b and c touched by this tile.
        final int aOff0 = offseta + (rows + 0) * lda;
        final int aOff1 = offseta + (rows + 1) * lda;
        final int aOff2 = offseta + (rows + 2) * lda;
        final int bOff0 = offsetb + (cols + 0) * ldb;
        final int bOff1 = offsetb + (cols + 1) * ldb;
        final int bOff2 = offsetb + (cols + 2) * ldb;
        final int cOff0 = offsetc + rows + (cols + 0) * ldc;
        final int cOff1 = offsetc + rows + (cols + 1) * ldc;
        final int cOff2 = offsetc + rows + (cols + 2) * ldc;

        // One independent accumulator per tile entry (first digit: row, second digit: column).
        float acc00 = 0.0f;
        float acc01 = 0.0f;
        float acc02 = 0.0f;
        float acc10 = 0.0f;
        float acc11 = 0.0f;
        float acc12 = 0.0f;
        float acc20 = 0.0f;
        float acc21 = 0.0f;
        float acc22 = 0.0f;

        int p = is;

        // Leading part of the range, one index at a time, up to the inherited loopAlign bound.
        for (; p < this.loopAlign(is, ie, 2); ++p) {
            final float x0 = a[aOff0 + p];
            final float x1 = a[aOff1 + p];
            final float x2 = a[aOff2 + p];
            final float y0 = b[bOff0 + p];
            final float y1 = b[bOff1 + p];
            final float y2 = b[bOff2 + p];
            acc00 = Math.fma(x0, y0, acc00);
            acc01 = Math.fma(x0, y1, acc01);
            acc02 = Math.fma(x0, y2, acc02);
            acc10 = Math.fma(x1, y0, acc10);
            acc11 = Math.fma(x1, y1, acc11);
            acc12 = Math.fma(x1, y2, acc12);
            acc20 = Math.fma(x2, y0, acc20);
            acc21 = Math.fma(x2, y1, acc21);
            acc22 = Math.fma(x2, y2, acc22);
        }

        // Main part of the range, two indices per iteration, up to the inherited loopBound bound.
        for (; p < this.loopBound(ie, 2); p += 2) {
            final float x0 = a[aOff0 + p];
            final float x1 = a[aOff1 + p];
            final float x2 = a[aOff2 + p];
            final float y0 = b[bOff0 + p];
            final float y1 = b[bOff1 + p];
            final float y2 = b[bOff2 + p];
            acc00 = Math.fma(x0, y0, acc00);
            acc01 = Math.fma(x0, y1, acc01);
            acc02 = Math.fma(x0, y2, acc02);
            acc10 = Math.fma(x1, y0, acc10);
            acc11 = Math.fma(x1, y1, acc11);
            acc12 = Math.fma(x1, y2, acc12);
            acc20 = Math.fma(x2, y0, acc20);
            acc21 = Math.fma(x2, y1, acc21);
            acc22 = Math.fma(x2, y2, acc22);

            final int q = p + 1;
            final float u0 = a[aOff0 + q];
            final float u1 = a[aOff1 + q];
            final float u2 = a[aOff2 + q];
            final float v0 = b[bOff0 + q];
            final float v1 = b[bOff1 + q];
            final float v2 = b[bOff2 + q];
            acc00 = Math.fma(u0, v0, acc00);
            acc01 = Math.fma(u0, v1, acc01);
            acc02 = Math.fma(u0, v2, acc02);
            acc10 = Math.fma(u1, v0, acc10);
            acc11 = Math.fma(u1, v1, acc11);
            acc12 = Math.fma(u1, v2, acc12);
            acc20 = Math.fma(u2, v0, acc20);
            acc21 = Math.fma(u2, v1, acc21);
            acc22 = Math.fma(u2, v2, acc22);
        }

        // Whatever is left of [is, ie), one index at a time.
        for (; p < ie; ++p) {
            final float x0 = a[aOff0 + p];
            final float x1 = a[aOff1 + p];
            final float x2 = a[aOff2 + p];
            final float y0 = b[bOff0 + p];
            final float y1 = b[bOff1 + p];
            final float y2 = b[bOff2 + p];
            acc00 = Math.fma(x0, y0, acc00);
            acc01 = Math.fma(x0, y1, acc01);
            acc02 = Math.fma(x0, y2, acc02);
            acc10 = Math.fma(x1, y0, acc10);
            acc11 = Math.fma(x1, y1, acc11);
            acc12 = Math.fma(x1, y2, acc12);
            acc20 = Math.fma(x2, y0, acc20);
            acc21 = Math.fma(x2, y1, acc21);
            acc22 = Math.fma(x2, y2, acc22);
        }

        // c += alpha * acc for every entry of the tile, row by row.
        c[cOff0] = Math.fma(alpha, acc00, c[cOff0]);
        c[cOff1] = Math.fma(alpha, acc01, c[cOff1]);
        c[cOff2] = Math.fma(alpha, acc02, c[cOff2]);
        c[cOff0 + 1] = Math.fma(alpha, acc10, c[cOff0 + 1]);
        c[cOff1 + 1] = Math.fma(alpha, acc11, c[cOff1 + 1]);
        c[cOff2 + 1] = Math.fma(alpha, acc12, c[cOff2 + 1]);
        c[cOff0 + 2] = Math.fma(alpha, acc20, c[cOff0 + 2]);
        c[cOff1 + 2] = Math.fma(alpha, acc21, c[cOff1 + 2]);
        c[cOff2 + 2] = Math.fma(alpha, acc22, c[cOff2 + 2]);
    }

    /**
     * Single-precision matrix multiply-accumulate with neither operand transposed.
     *
     * <p>For every {@code row} in {@code [0, m)} and {@code col} in {@code [0, n)} this computes
     * {@code sum = Σ_i a[offseta + row + i * lda] * b[offsetb + i + col * ldb]} for
     * {@code i} in {@code [0, k)} using fused multiply-add, and stores into
     * {@code c[offsetc + row + col * ldc]}:
     * <ul>
     *   <li>{@code fma(alpha, sum, beta * c[...])} when {@code beta != 0};</li>
     *   <li>{@code alpha * sum} when {@code beta == 0} (the previous content of {@code c} is
     *       not read in that case).</li>
     * </ul>
     *
     * <p>The work is split into 3x3 tiles of {@code c}, with 1x3, 3x1 and 1x1 remainder kernels,
     * and the summation is unrolled by two. Tile and unroll limits come from the inherited
     * {@code loopBound}. NOTE(review): presumably {@code loopBound(x, s)} rounds {@code x} down
     * to a multiple of {@code s} — confirm against the superclass.
     *
     * @param m number of rows of {@code c} to produce
     * @param n number of columns of {@code c} to produce
     * @param k length of the summation
     * @param alpha scale applied to each accumulated product sum
     * @param a left operand, read at {@code offseta + row + i * lda}
     * @param offseta start offset into {@code a}
     * @param lda stride between consecutive {@code i} in {@code a}
     * @param b right operand, read at {@code offsetb + i + col * ldb}
     * @param offsetb start offset into {@code b}
     * @param ldb stride between consecutive {@code col} in {@code b}
     * @param beta scale applied to the existing content of {@code c}; zero means overwrite
     * @param c result array, written at {@code offsetc + row + col * ldc}
     * @param offsetc start offset into {@code c}
     * @param ldc stride between consecutive {@code col} in {@code c}
     */
    @Override
    protected void sgemmNN(int m, int n, int k, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        int col;
        // Panels of three columns of c.
        for (col = 0; col < this.loopBound(n, 3); col += 3) {
            int row;
            // 3x3 tiles: nine independent accumulators, sumRQ belongs to (row + R, col + Q).
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                float sum10 = 0.0f;
                float sum11 = 0.0f;
                float sum12 = 0.0f;
                float sum20 = 0.0f;
                float sum21 = 0.0f;
                float sum22 = 0.0f;
                int i;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    float a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    // Must accumulate into sum12, not sum11, or entry (row + 1, col + 2) is lost.
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    float a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    float a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    float b11 = b[offsetb + (i + 1) + (col + 1) * ldb];
                    float b12 = b[offsetb + (i + 1) + (col + 2) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum11 = Math.fma(a11, b11, sum11);
                    sum12 = Math.fma(a11, b12, sum12);
                    sum20 = Math.fma(a21, b10, sum20);
                    sum21 = Math.fma(a21, b11, sum21);
                    sum22 = Math.fma(a21, b12, sum22);
                }
                // Remaining summation indices not covered by the unrolled loop.
                for (; i < k; ++i) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    float a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 1) * ldc] = Math.fma(alpha, sum11, beta * c[offsetc + (row + 1) + (col + 1) * ldc]);
                    c[offsetc + (row + 1) + (col + 2) * ldc] = Math.fma(alpha, sum12, beta * c[offsetc + (row + 1) + (col + 2) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 1) * ldc] = Math.fma(alpha, sum21, beta * c[offsetc + (row + 2) + (col + 1) * ldc]);
                    c[offsetc + (row + 2) + (col + 2) * ldc] = Math.fma(alpha, sum22, beta * c[offsetc + (row + 2) + (col + 2) * ldc]);
                } else {
                    // beta == 0: overwrite without reading c.
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 1) + (col + 1) * ldc] = alpha * sum11;
                    c[offsetc + (row + 1) + (col + 2) * ldc] = alpha * sum12;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                    c[offsetc + (row + 2) + (col + 1) * ldc] = alpha * sum21;
                    c[offsetc + (row + 2) + (col + 2) * ldc] = alpha * sum22;
                }
            }
            // Leftover rows of this column panel: 1x3 strips.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                int i;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    float a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    float b11 = b[offsetb + (i + 1) + (col + 1) * ldb];
                    float b12 = b[offsetb + (i + 1) + (col + 2) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            int row;
            // 3x1 strips.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum10 = 0.0f;
                float sum20 = 0.0f;
                int i;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    float a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    float a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    float a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum20 = Math.fma(a21, b10, sum20);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    float a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                }
            }
            // Single remaining entries.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                int i;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    float a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = beta != 0.0f ? Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]) : alpha * sum00;
            }
        }
    }

    /**
     * Single-precision matrix multiply-accumulate with the right operand read transposed.
     *
     * <p>For every {@code row} in {@code [0, m)} and {@code col} in {@code [0, n)} this computes
     * {@code sum = Σ_i a[offseta + row + i * lda] * b[offsetb + col + i * ldb]} for
     * {@code i} in {@code [0, k)} using fused multiply-add, and stores into
     * {@code c[offsetc + row + col * ldc]}:
     * <ul>
     *   <li>{@code fma(alpha, sum, beta * c[...])} when {@code beta != 0};</li>
     *   <li>{@code alpha * sum} when {@code beta == 0} (the previous content of {@code c} is
     *       not read in that case).</li>
     * </ul>
     *
     * <p>The work is split into 3x3 tiles of {@code c}, with 1x3, 3x1 and 1x1 remainder kernels,
     * and the summation is unrolled by two. Tile and unroll limits come from the inherited
     * {@code loopBound}. NOTE(review): presumably {@code loopBound(x, s)} rounds {@code x} down
     * to a multiple of {@code s} — confirm against the superclass.
     *
     * @param m number of rows of {@code c} to produce
     * @param n number of columns of {@code c} to produce
     * @param k length of the summation
     * @param alpha scale applied to each accumulated product sum
     * @param a left operand, read at {@code offseta + row + i * lda}
     * @param offseta start offset into {@code a}
     * @param lda stride between consecutive {@code i} in {@code a}
     * @param b right operand, read at {@code offsetb + col + i * ldb}
     * @param offsetb start offset into {@code b}
     * @param ldb stride between consecutive {@code i} in {@code b}
     * @param beta scale applied to the existing content of {@code c}; zero means overwrite
     * @param c result array, written at {@code offsetc + row + col * ldc}
     * @param offsetc start offset into {@code c}
     * @param ldc stride between consecutive {@code col} in {@code c}
     */
    @Override
    protected void sgemmNT(int m, int n, int k, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        int col;
        // Panels of three columns of c.
        for (col = 0; col < this.loopBound(n, 3); col += 3) {
            int row;
            // 3x3 tiles: nine independent accumulators, sumRQ belongs to (row + R, col + Q).
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                float sum10 = 0.0f;
                float sum11 = 0.0f;
                float sum12 = 0.0f;
                float sum20 = 0.0f;
                float sum21 = 0.0f;
                float sum22 = 0.0f;
                int i;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    float a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    // Must accumulate into sum12, not sum11, or entry (row + 1, col + 2) is lost.
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    float a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    float a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    float b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    float b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum11 = Math.fma(a11, b11, sum11);
                    sum12 = Math.fma(a11, b12, sum12);
                    sum20 = Math.fma(a21, b10, sum20);
                    sum21 = Math.fma(a21, b11, sum21);
                    sum22 = Math.fma(a21, b12, sum22);
                }
                // Remaining summation indices not covered by the unrolled loop.
                for (; i < k; ++i) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    float a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 1) * ldc] = Math.fma(alpha, sum11, beta * c[offsetc + (row + 1) + (col + 1) * ldc]);
                    c[offsetc + (row + 1) + (col + 2) * ldc] = Math.fma(alpha, sum12, beta * c[offsetc + (row + 1) + (col + 2) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 1) * ldc] = Math.fma(alpha, sum21, beta * c[offsetc + (row + 2) + (col + 1) * ldc]);
                    c[offsetc + (row + 2) + (col + 2) * ldc] = Math.fma(alpha, sum22, beta * c[offsetc + (row + 2) + (col + 2) * ldc]);
                } else {
                    // beta == 0: overwrite without reading c.
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 1) + (col + 1) * ldc] = alpha * sum11;
                    c[offsetc + (row + 1) + (col + 2) * ldc] = alpha * sum12;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                    c[offsetc + (row + 2) + (col + 1) * ldc] = alpha * sum21;
                    c[offsetc + (row + 2) + (col + 2) * ldc] = alpha * sum22;
                }
            }
            // Leftover rows of this column panel: 1x3 strips.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                int i;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    float a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    float b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    float b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            int row;
            // 3x1 strips.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum10 = 0.0f;
                float sum20 = 0.0f;
                int i;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    float a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    float a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    float a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum20 = Math.fma(a21, b10, sum20);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    float a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                }
            }
            // Single remaining entries.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                int i;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    float a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = beta != 0.0f ? Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]) : alpha * sum00;
            }
        }
    }

    /**
     * Single-precision matrix multiply-accumulate where {@code a} is read transposed:
     * for every {@code row} in {@code [0, m)} and {@code col} in {@code [0, n)},
     *
     * <pre>
     *   dot = sum over i in [0, k) of a[offseta + i + row * lda] * b[offsetb + i + col * ldb]
     *   c[offsetc + row + col * ldc] = alpha * dot + beta * c[offsetc + row + col * ldc]
     * </pre>
     *
     * When {@code beta} is exactly zero the previous contents of {@code c} are not read;
     * the element is overwritten with {@code alpha * dot}.
     *
     * <p>The result is produced in 3x3 tiles of {@code c} with the inner dimension unrolled
     * by two; leftover rows, columns and inner steps are handled by narrower kernels. Every
     * output element is accumulated with {@link Math#fma(float, float, float)} in increasing
     * {@code i} order, whichever kernel produces it.
     *
     * <p>NOTE(review): assumes {@code loopBound(x, s)} (inherited, not visible here) returns
     * the largest multiple of {@code s} that does not exceed {@code x} - confirm against the
     * superclass.
     *
     * @param m       number of rows of {@code c} that are written
     * @param n       number of columns of {@code c} that are written
     * @param k       length of the inner (summed) dimension
     * @param alpha   scale applied to each dot product
     * @param a       left operand; element {@code (i, row)} is at {@code offseta + i + row * lda}
     * @param offseta index of the first element of {@code a}
     * @param lda     stride between consecutive {@code row} values in {@code a}
     * @param b       right operand; element {@code (i, col)} is at {@code offsetb + i + col * ldb}
     * @param offsetb index of the first element of {@code b}
     * @param ldb     stride between consecutive {@code col} values in {@code b}
     * @param beta    scale applied to the existing value of {@code c}; zero means overwrite
     * @param c       output; element {@code (row, col)} is at {@code offsetc + row + col * ldc}
     * @param offsetc index of the first element of {@code c}
     * @param ldc     stride between consecutive columns of {@code c}
     */
    @Override
    protected void sgemmTN(int m, int n, int k, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        int i;
        int row;
        int col;
        // Panels of three full columns of c.
        for (col = 0; col < this.loopBound(n, 3); col += 3) {
            // 3x3 tiles: nine independent accumulators, sumRC for tile row R and tile column C.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                float sum10 = 0.0f;
                float sum11 = 0.0f;
                float sum12 = 0.0f;
                float sum20 = 0.0f;
                float sum21 = 0.0f;
                float sum22 = 0.0f;
                // Inner dimension, two steps per iteration.
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    // Element (row + 1, col + 2) must accumulate into its own sum12.
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float a11 = a[offseta + (i + 1) + (row + 1) * lda];
                    float a21 = a[offseta + (i + 1) + (row + 2) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    float b11 = b[offsetb + (i + 1) + (col + 1) * ldb];
                    float b12 = b[offsetb + (i + 1) + (col + 2) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum11 = Math.fma(a11, b11, sum11);
                    sum12 = Math.fma(a11, b12, sum12);
                    sum20 = Math.fma(a21, b10, sum20);
                    sum21 = Math.fma(a21, b11, sum21);
                    sum22 = Math.fma(a21, b12, sum22);
                }
                // Leftover inner step when k is not covered by the unrolled loop.
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 1) * ldc] = Math.fma(alpha, sum11, beta * c[offsetc + (row + 1) + (col + 1) * ldc]);
                    c[offsetc + (row + 1) + (col + 2) * ldc] = Math.fma(alpha, sum12, beta * c[offsetc + (row + 1) + (col + 2) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 1) * ldc] = Math.fma(alpha, sum21, beta * c[offsetc + (row + 2) + (col + 1) * ldc]);
                    c[offsetc + (row + 2) + (col + 2) * ldc] = Math.fma(alpha, sum22, beta * c[offsetc + (row + 2) + (col + 2) * ldc]);
                } else {
                    // beta == 0: overwrite without reading the old contents of c.
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 1) + (col + 1) * ldc] = alpha * sum11;
                    c[offsetc + (row + 1) + (col + 2) * ldc] = alpha * sum12;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                    c[offsetc + (row + 2) + (col + 1) * ldc] = alpha * sum21;
                    c[offsetc + (row + 2) + (col + 2) * ldc] = alpha * sum22;
                }
            }
            // Leftover rows of this column panel: 1x3 strips.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    float b11 = b[offsetb + (i + 1) + (col + 1) * ldb];
                    float b12 = b[offsetb + (i + 1) + (col + 2) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            // 3x1 strips.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum10 = 0.0f;
                float sum20 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float a11 = a[offseta + (i + 1) + (row + 1) * lda];
                    float a21 = a[offseta + (i + 1) + (row + 2) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum20 = Math.fma(a21, b10, sum20);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                }
            }
            // Bottom-right corner: single elements.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                }
            }
        }
    }

    /**
     * Single-precision matrix multiply-accumulate where both {@code a} and {@code b} are read
     * transposed: for every {@code row} in {@code [0, m)} and {@code col} in {@code [0, n)},
     *
     * <pre>
     *   dot = sum over i in [0, k) of a[offseta + i + row * lda] * b[offsetb + col + i * ldb]
     *   c[offsetc + row + col * ldc] = alpha * dot + beta * c[offsetc + row + col * ldc]
     * </pre>
     *
     * When {@code beta} is exactly zero the previous contents of {@code c} are not read;
     * the element is overwritten with {@code alpha * dot}.
     *
     * <p>The result is produced in 3x3 tiles of {@code c} with the inner dimension unrolled
     * by two; leftover rows, columns and inner steps are handled by narrower kernels. Every
     * output element is accumulated with {@link Math#fma(float, float, float)} in increasing
     * {@code i} order, whichever kernel produces it.
     *
     * <p>NOTE(review): assumes {@code loopBound(x, s)} (inherited, not visible here) returns
     * the largest multiple of {@code s} that does not exceed {@code x} - confirm against the
     * superclass.
     *
     * @param m       number of rows of {@code c} that are written
     * @param n       number of columns of {@code c} that are written
     * @param k       length of the inner (summed) dimension
     * @param alpha   scale applied to each dot product
     * @param a       left operand; element {@code (i, row)} is at {@code offseta + i + row * lda}
     * @param offseta index of the first element of {@code a}
     * @param lda     stride between consecutive {@code row} values in {@code a}
     * @param b       right operand; element {@code (col, i)} is at {@code offsetb + col + i * ldb}
     * @param offsetb index of the first element of {@code b}
     * @param ldb     stride between consecutive {@code i} values in {@code b}
     * @param beta    scale applied to the existing value of {@code c}; zero means overwrite
     * @param c       output; element {@code (row, col)} is at {@code offsetc + row + col * ldc}
     * @param offsetc index of the first element of {@code c}
     * @param ldc     stride between consecutive columns of {@code c}
     */
    @Override
    protected void sgemmTT(int m, int n, int k, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        int i;
        int row;
        int col;
        // Panels of three full columns of c.
        for (col = 0; col < this.loopBound(n, 3); col += 3) {
            // 3x3 tiles: nine independent accumulators, sumRC for tile row R and tile column C.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                float sum10 = 0.0f;
                float sum11 = 0.0f;
                float sum12 = 0.0f;
                float sum20 = 0.0f;
                float sum21 = 0.0f;
                float sum22 = 0.0f;
                // Inner dimension, two steps per iteration.
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    // Element (row + 1, col + 2) must accumulate into its own sum12.
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float a11 = a[offseta + (i + 1) + (row + 1) * lda];
                    float a21 = a[offseta + (i + 1) + (row + 2) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    float b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    float b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum11 = Math.fma(a11, b11, sum11);
                    sum12 = Math.fma(a11, b12, sum12);
                    sum20 = Math.fma(a21, b10, sum20);
                    sum21 = Math.fma(a21, b11, sum21);
                    sum22 = Math.fma(a21, b12, sum22);
                }
                // Leftover inner step when k is not covered by the unrolled loop.
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 1) * ldc] = Math.fma(alpha, sum11, beta * c[offsetc + (row + 1) + (col + 1) * ldc]);
                    c[offsetc + (row + 1) + (col + 2) * ldc] = Math.fma(alpha, sum12, beta * c[offsetc + (row + 1) + (col + 2) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 1) * ldc] = Math.fma(alpha, sum21, beta * c[offsetc + (row + 2) + (col + 1) * ldc]);
                    c[offsetc + (row + 2) + (col + 2) * ldc] = Math.fma(alpha, sum22, beta * c[offsetc + (row + 2) + (col + 2) * ldc]);
                } else {
                    // beta == 0: overwrite without reading the old contents of c.
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 1) + (col + 1) * ldc] = alpha * sum11;
                    c[offsetc + (row + 1) + (col + 2) * ldc] = alpha * sum12;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                    c[offsetc + (row + 2) + (col + 1) * ldc] = alpha * sum21;
                    c[offsetc + (row + 2) + (col + 2) * ldc] = alpha * sum22;
                }
            }
            // Leftover rows of this column panel: 1x3 strips.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    float b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    float b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            // 3x1 strips.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum10 = 0.0f;
                float sum20 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float a11 = a[offseta + (i + 1) + (row + 1) * lda];
                    float a21 = a[offseta + (i + 1) + (row + 2) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum20 = Math.fma(a21, b10, sum20);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                }
            }
            // Bottom-right corner: single elements.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                }
            }
        }
    }
}

