/*
 * Decompiled with CFR 0.152.
 */
package org.graalvm.compiler.lir.amd64;

import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Objects;
import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.RegisterValue;
import jdk.vm.ci.code.ValueUtil;
import jdk.vm.ci.meta.AllocatableValue;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.PlatformKind;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.code.DataSection;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.Stride;
import org.graalvm.compiler.debug.GraalError;
import org.graalvm.compiler.lir.LIRInstruction;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.amd64.AMD64ComplexVectorOp;
import org.graalvm.compiler.lir.asm.ArrayDataPointerConstant;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

@Opcode(value="AMD64_CALC_STRING_ATTRIBUTES")
public final class AMD64CalcStringAttributesOp
extends AMD64ComplexVectorOp {
    /** LIR instruction class descriptor for this op; handed to the superclass constructor. */
    public static final LIRInstructionClass<AMD64CalcStringAttributesOp> TYPE = LIRInstructionClass.create(AMD64CalcStringAttributesOp.class);
    // Fixed registers into which movParamsAndCreate moves the array, byte offset and length
    // before the op is instantiated.
    private static final Register REG_ARRAY = AMD64.rsi;
    private static final Register REG_OFFSET = AMD64.rcx;
    private static final Register REG_LENGTH = AMD64.rdx;
    // Result codes. The emitters in this file pass the literal values 0, 1 and 2 to the exit
    // helpers for their "ascii", "latin1" and "BMP" labels, and OR the literal 6 into the result
    // when the UTF-8 error vector is non-zero; those literals match the constants below.
    // NOTE(review): the meaning of codes 3-5 is inferred from their names only - confirm.
    public static final int CR_7BIT = 0;
    public static final int CR_8BIT = 1;
    public static final int CR_16BIT = 2;
    public static final int CR_VALID_FIXED_WIDTH = 3;
    public static final int CR_BROKEN_FIXED_WIDTH = 4;
    public static final int CR_VALID_MULTIBYTE = 5;
    public static final int CR_BROKEN_MULTIBYTE = 6;
    /** Which emitter emitCode dispatches to (LATIN1, BMP, UTF_8, UTF_16 or UTF_32). */
    private final Op op;
    /** Array index scale of the op's element kind, as reported by the meta access provider. */
    private final Stride stride;
    /** Number of elements in one full vector: vector size in bytes / element size in bytes. */
    private final int vectorLength;
    /** Selects the cheaper code paths and smaller temp-register counts of the emitters. */
    private final boolean assumeValid;
    @LIRInstruction.Def(value={LIRInstruction.OperandFlag.REG})
    private Value result;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value array;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value offset;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value length;
    // The three *Tmp fields are assigned the very same values as array/offset/length in the
    // constructor. emitCode overwrites the array and offset registers, so the inputs are
    // additionally declared as temps.
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value arrayTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value offsetTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value lengthTmp;
    /** QWORD scratch variables; count is given by getNumberOfTempRegisters. */
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value[] temp;
    /** Vector scratch variables; count is given by getNumberOfRequiredVectorRegisters. */
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value[] vectorTemp;
    // Bit flags combined into the UTF-8 lookup tables below. The decompiled tables contain the
    // already-folded numeric values, e.g. 33 == TOO_SHORT | OVERLONG_2 and
    // 21 == TOO_SHORT | OVERLONG_3 | SURROGATE.
    // NOTE(review): the names suggest UTF-8 validation error classes in the style of the
    // simdjson lookup algorithm - confirm against the original source.
    private static final byte TOO_SHORT = 1;
    private static final byte TOO_LONG = 2;
    private static final byte OVERLONG_3 = 4;
    private static final byte SURROGATE = 16;
    private static final byte OVERLONG_2 = 32;
    private static final byte TWO_CONTS = -128;
    private static final byte TOO_LARGE = 8;
    // TOO_LARGE_1000 and OVERLONG_4 deliberately share the same bit (0x40).
    private static final byte TOO_LARGE_1000 = 64;
    private static final byte OVERLONG_4 = 64;
    // -125 == (byte) 0x83 == TOO_SHORT | TOO_LONG | TWO_CONTS.
    private static final byte CARRY = -125;
    // 16-entry nibble lookup tables used with pshufb in the validating UTF-8 path; the three
    // lookup results are ANDed together there.
    // Indexed by the high nibble of the bytes produced by prev(..., 1).
    // NOTE(review): prev(..., 1) presumably yields the preceding byte of the stream - confirm.
    private static final byte[] UTF8_BYTE_1_HIGH_TABLE = new byte[]{2, 2, 2, 2, 2, 2, 2, 2, -128, -128, -128, -128, 33, 1, 21, 73};
    // Indexed by the low nibble of the same prev(..., 1) bytes.
    private static final byte[] UTF8_BYTE_1_LOW_TABLE = new byte[]{-25, -93, -125, -125, -117, -53, -53, -53, -53, -53, -53, -53, -53, -37, -53, -53};
    // Indexed by the high nibble of the current byte.
    private static final byte[] UTF8_BYTE_2_HIGH_TABLE = new byte[]{1, 1, 1, 1, 1, 1, 1, 1, -26, -82, -70, -70, 1, 1, 1, 1};
    private static final byte[] UTF_8_STATE_MACHINE = new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};

    /**
     * Creates the op, checks the minimum CPU requirement and allocates all scratch variables.
     *
     * @param tool LIR generator used to query the target and to allocate new variables
     * @param op which string-attribute computation to emit
     * @param runtimeCheckedCPUFeatures CPU features forwarded to the superclass and to
     *            {@code supports}
     * @param array register value holding the array; also registered as a temp
     * @param offset register value holding the byte offset; also registered as a temp
     * @param length register value holding the length; also registered as a temp
     * @param result register value receiving the result code
     * @param assumeValid selects the smaller temp-register sets and the non-validating paths
     */
    private AMD64CalcStringAttributesOp(LIRGeneratorTool tool, Op op, EnumSet<AMD64.CPUFeature> runtimeCheckedCPUFeatures, Value array, Value offset, Value length, Value result, boolean assumeValid) {
        super(TYPE, tool, runtimeCheckedCPUFeatures, AVXKind.AVXSize.YMM);
        this.op = op;
        this.assumeValid = assumeValid;

        GraalError.guarantee(supports(tool.target(), runtimeCheckedCPUFeatures, AMD64.CPUFeature.SSE4_1), "needs at least SSE4.1 support");
        assert op.stride.isNumericInteger();
        final int indexScale = tool.getProviders().getMetaAccess().getArrayIndexScale(op.stride);
        this.stride = Objects.requireNonNull(Stride.fromInt(indexScale));
        this.vectorLength = vectorSize.getBytes() / op.stride.getByteCount();

        // Every input is stored twice: once as a use and once as a temp.
        this.array = array;
        this.arrayTmp = array;
        this.offset = offset;
        this.offsetTmp = offset;
        this.length = length;
        this.lengthTmp = length;
        this.result = result;

        // General purpose scratch registers.
        final Value[] scalarTemps = new Value[getNumberOfTempRegisters(op, assumeValid)];
        for (int idx = 0; idx < scalarTemps.length; idx++) {
            scalarTemps[idx] = tool.newVariable(LIRKind.value(AMD64Kind.QWORD));
        }
        this.temp = scalarTemps;

        // Vector scratch registers; how many are needed depends on AVX availability.
        final boolean hasAVX = supports(tool.target(), runtimeCheckedCPUFeatures, AMD64.CPUFeature.AVX);
        final Value[] vectorTemps = new Value[getNumberOfRequiredVectorRegisters(op, hasAVX, assumeValid)];
        for (int idx = 0; idx < vectorTemps.length; idx++) {
            vectorTemps[idx] = tool.newVariable(LIRKind.value((PlatformKind) getVectorKind(JavaKind.Byte)));
        }
        this.vectorTemp = vectorTemps;
    }

    /**
     * Returns how many general purpose scratch registers the emitter for {@code op} needs.
     *
     * @param op the operation to emit
     * @param assumeValid whether the non-validating variant is emitted
     * @return 1 or 3 for UTF_8, 1 or 2 for UTF_16 (the smaller count when {@code assumeValid}),
     *         0 for every other operation
     */
    private static int getNumberOfTempRegisters(Op op, boolean assumeValid) {
        final int count;
        switch (op) {
            case UTF_8:
                count = assumeValid ? 1 : 3;
                break;
            case UTF_16:
                count = assumeValid ? 1 : 2;
                break;
            default:
                count = 0;
                break;
        }
        return count;
    }

    /**
     * Returns how many vector scratch registers the emitter for {@code op} needs.
     *
     * @param op the operation to emit
     * @param isAVX whether AVX is available; LATIN1 and BMP need one register less with AVX
     * @param assumeValid whether the non-validating variant is emitted (only relevant for UTF_8)
     * @return the number of vector registers to allocate
     * @throws GraalError if {@code op} is not one of the five known operations
     */
    private static int getNumberOfRequiredVectorRegisters(Op op, boolean isAVX, boolean assumeValid) {
        switch (op) {
            case LATIN1:
                if (isAVX) {
                    return 1;
                }
                return 2;
            case BMP:
                if (isAVX) {
                    return 2;
                }
                return 3;
            case UTF_8:
                if (assumeValid) {
                    return 5;
                }
                return 10;
            case UTF_16:
                return 7;
            case UTF_32:
                return 8;
            default:
                throw GraalError.shouldNotReachHere();
        }
    }

    /**
     * Computes how many elements of the op's element kind fit into a vector of the given size.
     *
     * @param size the vector size
     * @return {@code size} in bytes divided by the element size in bytes
     */
    private int elementsPerVector(AVXKind.AVXSize size) {
        final int vectorBytes = size.getBytes();
        final int elementBytes = op.stride.getByteCount();
        return vectorBytes / elementBytes;
    }

    /**
     * Computes how many elements of the op's element kind fit into a scalar operand of the given
     * size.
     *
     * @param size the scalar operand size
     * @return {@code size} in bytes divided by the element size in bytes
     */
    private int elementsPerVector(AMD64BaseAssembler.OperandSize size) {
        final int operandBytes = size.getBytes();
        final int elementBytes = op.stride.getByteCount();
        return operandBytes / elementBytes;
    }

    /**
     * Moves the inputs into the fixed registers used by this op and creates the op on top of them.
     * The array goes through {@code emitConvertNullToZero}; offset and length are plain moves.
     *
     * @param tool LIR generator used to emit the moves
     * @param op which string-attribute computation to emit
     * @param runtimeCheckedCPUFeatures CPU features forwarded to the constructor
     * @param array the array value
     * @param byteOffset the byte offset into the array
     * @param length the length value
     * @param result the value receiving the result code
     * @param assumeValid whether the non-validating variant is emitted
     * @return the newly created op operating on the fixed registers
     */
    public static AMD64CalcStringAttributesOp movParamsAndCreate(LIRGeneratorTool tool, Op op, EnumSet<AMD64.CPUFeature> runtimeCheckedCPUFeatures, Value array, Value byteOffset, Value length, Value result, boolean assumeValid) {
        final RegisterValue fixedArray = REG_ARRAY.asValue(array.getValueKind());
        final RegisterValue fixedOffset = REG_OFFSET.asValue(byteOffset.getValueKind());
        final RegisterValue fixedLength = REG_LENGTH.asValue(length.getValueKind());

        tool.emitConvertNullToZero(fixedArray, array);
        tool.emitMove(fixedOffset, byteOffset);
        tool.emitMove(fixedLength, length);

        return new AMD64CalcStringAttributesOp(tool, op, runtimeCheckedCPUFeatures, fixedArray, fixedOffset, fixedLength, result, assumeValid);
    }

    /**
     * Emits the machine code for this op: folds the byte offset into the array pointer, copies the
     * length into the (now free) offset register and dispatches to the emitter selected by
     * {@code op}.
     *
     * @param crb compilation result builder, passed through to the emitters
     * @param asm assembler the code is emitted with
     * @throws GraalError if {@code op} is not one of the five known operations
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        final Register arr = ValueUtil.asRegister(array);
        final Register off = ValueUtil.asRegister(offset);
        final Register len = ValueUtil.asRegister(length);
        final Register ret = ValueUtil.asRegister(result);
        final Register vec1 = ValueUtil.asRegister(vectorTemp[0]);

        // arr := arr + off, so that arr addresses the first element directly
        final AMD64Address firstElement = new AMD64Address(arr, off, Stride.S1);
        asm.leaq(arr, firstElement);
        // the offset register is reused by the emitters as the "tail length" register
        asm.movl(off, len);

        switch (op) {
            case LATIN1:
                emitLatin1(crb, asm, arr, len, off, ret, vec1);
                break;
            case BMP:
                emitBMP(crb, asm, arr, len, off, ret, vec1);
                break;
            case UTF_8:
                emitUTF8(crb, asm, arr, len, off, ret, vec1);
                break;
            case UTF_16:
                emitUTF16(crb, asm, arr, len, off, ret, vec1);
                break;
            case UTF_32:
                emitUTF32(crb, asm, arr, len, off, ret, vec1);
                break;
            default:
                throw GraalError.shouldNotReachHere();
        }
    }

    /**
     * Emits the LATIN1 variant: tests whether any byte of the array has bit 0x80 set and exits
     * with {@link #CR_8BIT} if so, otherwise with {@link #CR_7BIT}.
     *
     * @param crb compilation result builder, used for data section references and loop alignment
     * @param asm assembler the code is emitted with
     * @param arr register pointing at the first element
     * @param len register holding the length; reused as scalar mask register in the short tails
     * @param lengthTail register holding a copy of the length on entry
     * @param ret register receiving the result code
     * @param vecMask vector register that is loaded with the 0x80 mask
     */
    private void emitLatin1(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register arr, Register len, Register lengthTail, Register ret, Register vecMask) {
        assert stride.log2 == 0;
        final Label tailLessThan32 = new Label();
        final Label tailLessThan16 = new Label();
        final Label tailLessThan8 = new Label();
        final Label tailLessThan4 = new Label();
        final Label tailLessThan2 = new Label();
        final Label returnLatin1 = new Label();
        final Label returnAscii = new Label();
        final Label end = new Label();
        // the extra vector register is only requested when AVX is not available
        final Register vecArray;
        if (asm.isAVX()) {
            vecArray = null;
        } else {
            vecArray = ValueUtil.asRegister(vectorTemp[1]);
        }

        final DataSection.Data mask = createMask(crb, 0x80);
        asm.movdqu(vectorSize, vecMask, (AMD64Address) crb.recordDataSectionReference(mask));

        vectorLoopPrologue(asm, arr, len, lengthTail, tailLessThan32, tailLessThan16, true);
        // full-vector loop, followed by one vector load that ends exactly at the array end
        emitPTestLoop(crb, asm, arr, len, vecArray, vecMask, returnLatin1);
        emitPTestTail(asm, vectorSize, arr, lengthTail, vecArray, vecMask, null, returnLatin1, returnAscii, false);

        if (supportsAVX2AndYMM()) {
            // fewer elements than one YMM vector: try a pair of XMM loads (head and end-aligned)
            asm.bind(tailLessThan32);
            asm.cmplAndJcc(lengthTail, elementsPerVector(AVXKind.AVXSize.XMM), AMD64Assembler.ConditionFlag.Less, tailLessThan16, true);
            emitPTestCurr(asm, AVXKind.AVXSize.XMM, arr, vecArray, vecMask, null, returnLatin1);
            emitPTestTail(asm, AVXKind.AVXSize.XMM, arr, lengthTail, vecArray, vecMask, null, returnLatin1, returnAscii);
        }
        emitExit(asm, ret, returnLatin1, end, CR_8BIT);

        asm.bind(tailLessThan16);
        // load the mask into a general purpose register for the scalar TEST based tails
        asm.movq(len, (AMD64Address) crb.recordDataSectionReference(mask));
        latin1Tail(asm, AMD64BaseAssembler.OperandSize.QWORD, arr, lengthTail, len, null, tailLessThan8, returnLatin1, returnAscii);
        latin1Tail(asm, AMD64BaseAssembler.OperandSize.DWORD, arr, lengthTail, len, tailLessThan8, tailLessThan4, returnLatin1, returnAscii);
        latin1Tail(asm, AMD64BaseAssembler.OperandSize.WORD, arr, lengthTail, len, tailLessThan4, tailLessThan2, returnLatin1, returnAscii);

        // zero or one element left
        asm.bind(tailLessThan2);
        asm.testlAndJcc(lengthTail, lengthTail, AMD64Assembler.ConditionFlag.Zero, returnAscii, true);
        asm.movzbq(len, new AMD64Address(arr));
        asm.testAndJcc(AMD64BaseAssembler.OperandSize.QWORD, len, 0x80, AMD64Assembler.ConditionFlag.NotZero, returnLatin1, true);
        asm.jmpb(returnAscii);
        emitExitAtEnd(asm, ret, returnAscii, end, CR_7BIT);
    }

    /**
     * Emits one scalar tail step of the LATIN1 check: if at least one operand of {@code size} fits
     * into the remaining elements, tests the first and the last {@code size} bytes of the tail
     * against {@code mask}.
     *
     * @param asm assembler the code is emitted with
     * @param size operand size of the two TEST instructions
     * @param arr register pointing at the first tail element
     * @param lengthTail register holding the number of tail elements
     * @param mask general purpose register holding the bit mask
     * @param entry label bound at the start of this step, or {@code null} for none
     * @param tooSmall jump target when fewer elements remain than fit into {@code size}
     * @param labelLatin1 jump target when a masked bit is set
     * @param labelAscii jump target when no masked bit is set
     */
    private void latin1Tail(AMD64MacroAssembler asm, AMD64BaseAssembler.OperandSize size, Register arr, Register lengthTail, Register mask, Label entry, Label tooSmall, Label labelLatin1, Label labelAscii) {
        bind(asm, entry);
        asm.cmplAndJcc(lengthTail, elementsPerVector(size), AMD64Assembler.ConditionFlag.Less, tooSmall, true);

        final AMD64Address head = new AMD64Address(arr);
        asm.testAndJcc(size, mask, head, AMD64Assembler.ConditionFlag.NotZero, labelLatin1, true);

        // second load ends exactly at the last element and may overlap the first one
        final AMD64Address endAligned = new AMD64Address(arr, lengthTail, stride, -size.getBytes());
        asm.testAndJcc(size, mask, endAligned, AMD64Assembler.ConditionFlag.NotZero, labelLatin1, true);

        asm.jmpb(labelAscii);
    }

    /**
     * Emits the common setup for a vector loop. Afterwards {@code lengthTail} holds the length
     * modulo the vector length, {@code arr} points behind the last full vector and {@code len}
     * holds the negated number of elements covered by full vectors.
     *
     * @param asm assembler the code is emitted with
     * @param arr register pointing at the first element
     * @param len register holding the length
     * @param lengthTail register holding a copy of the length
     * @param tailLessThan32 jump target for inputs shorter than one vector when AVX2/YMM is used
     * @param tailLessThan16 jump target for inputs shorter than one vector otherwise
     * @param isShortJump whether the jump to the tail label is emitted as a short jump
     */
    private void vectorLoopPrologue(AMD64MacroAssembler asm, Register arr, Register len, Register lengthTail, Label tailLessThan32, Label tailLessThan16, boolean isShortJump) {
        // lengthTail := length % vectorLength
        asm.andl(lengthTail, vectorLength - 1);

        // len := length rounded down to a multiple of vectorLength; no full vector -> tail
        final Label shortInput = supportsAVX2AndYMM() ? tailLessThan32 : tailLessThan16;
        asm.andlAndJcc(len, -vectorLength, AMD64Assembler.ConditionFlag.Zero, shortInput, isShortJump);

        // index the loop with a negative counter that runs up to zero
        final AMD64Address endOfFullVectors = new AMD64Address(arr, len, stride);
        asm.leaq(arr, endOfFullVectors);
        asm.negq(len);
    }

    /**
     * Emits an aligned loop that PTESTs one full vector per iteration against {@code vecMask} and
     * leaves through {@code labelBreak} as soon as a masked bit is set. Falls through once the
     * negative index in {@code len} has reached zero.
     *
     * @param crb compilation result builder, used for loop head alignment
     * @param asm assembler the code is emitted with
     * @param arr register pointing behind the last full vector
     * @param len register holding the negative loop index
     * @param vecArray vector register handed to {@code ptestU}; may be {@code null}
     * @param vecMask vector register holding the mask
     * @param labelBreak jump target when a masked bit is found
     */
    private void emitPTestLoop(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register arr, Register len, Register vecArray, Register vecMask, Label labelBreak) {
        final Label head = new Label();
        alignLoopHead(crb, asm);
        asm.bind(head);

        final AMD64Address current = new AMD64Address(arr, len, stride);
        asm.ptestU(vectorSize, vecMask, current, vecArray);
        asm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelBreak);

        asm.addqAndJcc(len, vectorLength, AMD64Assembler.ConditionFlag.NotZero, head, true);
    }

    /**
     * Emits a PTEST of the vector located directly at {@code arr} against {@code vecMask}, with a
     * short jump to {@code match} when a masked bit is set.
     *
     * @param asm assembler the code is emitted with
     * @param size vector size of the load
     * @param arr register holding the address to test
     * @param vecArray vector register handed to {@code ptestU}; may be {@code null}
     * @param vecMask vector register holding the mask
     * @param entry label bound at the start, or {@code null} for none
     * @param match jump target when a masked bit is set
     */
    private static void emitPTestCurr(AMD64MacroAssembler asm, AVXKind.AVXSize size, Register arr, Register vecArray, Register vecMask, Label entry, Label match) {
        bind(asm, entry);
        final AMD64Address current = new AMD64Address(arr);
        asm.ptestU(size, vecMask, current, vecArray);
        asm.jccb(AMD64Assembler.ConditionFlag.NotZero, match);
    }

    /**
     * Convenience overload of the ten-argument {@code emitPTestTail} that always emits the final
     * jump to {@code noMatch} as a short jump.
     */
    private void emitPTestTail(AMD64MacroAssembler asm, AVXKind.AVXSize size, Register arr, Register lengthTail, Register vecArray, Register vecMask, Label entry, Label match, Label noMatch) {
        final boolean shortJump = true;
        emitPTestTail(asm, size, arr, lengthTail, vecArray, vecMask, entry, match, noMatch, shortJump);
    }

    /**
     * Emits a PTEST of the vector that ends at {@code arr + lengthTail} elements, i.e. starts
     * {@code size} bytes before that position. Jumps to {@code match} when a masked bit is set and
     * to {@code noMatch} otherwise.
     *
     * @param asm assembler the code is emitted with
     * @param size vector size of the load
     * @param arr base register of the tail
     * @param lengthTail register holding the number of tail elements
     * @param vecArray vector register handed to {@code ptestU}; may be {@code null}
     * @param vecMask vector register holding the mask
     * @param entry label bound at the start, or {@code null} for none
     * @param match jump target when a masked bit is set (always a short jump)
     * @param noMatch jump target when no masked bit is set
     * @param isShortJmp whether the jump to {@code noMatch} is emitted as a short jump
     */
    private void emitPTestTail(AMD64MacroAssembler asm, AVXKind.AVXSize size, Register arr, Register lengthTail, Register vecArray, Register vecMask, Label entry, Label match, Label noMatch, boolean isShortJmp) {
        bind(asm, entry);
        final AMD64Address endAligned = new AMD64Address(arr, lengthTail, stride, -size.getBytes());
        asm.ptestU(size, vecMask, endAligned, vecArray);
        asm.jccb(AMD64Assembler.ConditionFlag.NotZero, match);
        asm.jmp(noMatch, isShortJmp);
    }

    /**
     * Binds {@code entry} at the current position; does nothing when {@code entry} is
     * {@code null}.
     *
     * @param asm assembler the label is bound with
     * @param entry the label to bind, or {@code null}
     */
    private static void bind(AMD64MacroAssembler asm, Label entry) {
        if (entry == null) {
            return;
        }
        asm.bind(entry);
    }

    /**
     * Emits the BMP variant for 16-bit elements: exits with {@link #CR_7BIT} when no element has a
     * bit of 0xFF80 set, with {@link #CR_8BIT} when no element has a bit of 0xFF00 set, and with
     * {@link #CR_16BIT} otherwise.
     *
     * @param crb compilation result builder, used for mask loading and loop alignment
     * @param asm assembler the code is emitted with
     * @param arr register pointing at the first element
     * @param len register holding the length; reused as scalar ascii mask in the short tails
     * @param lengthTail register holding a copy of the length on entry
     * @param ret register receiving the result code; reused as scalar BMP mask in the short tails
     * @param vecMaskAscii vector register that is loaded with the 0xFF80 mask
     */
    private void emitBMP(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register arr, Register len, Register lengthTail, Register ret, Register vecMaskAscii) {
        assert stride.log2 == 1;
        final Register vecMaskBMP = ValueUtil.asRegister(vectorTemp[1]);
        // the extra vector register is only requested when AVX is not available
        final Register vecArray;
        if (asm.isAVX()) {
            vecArray = null;
        } else {
            vecArray = ValueUtil.asRegister(vectorTemp[2]);
        }
        final Label latin1Entry = new Label();
        final Label latin1TailCmp = new Label();
        final Label tailLessThan32 = new Label();
        final Label tailLessThan16 = new Label();
        final Label tailLessThan8 = new Label();
        final Label tailLessThan4 = new Label();
        final Label returnBMP = new Label();
        final Label returnLatin1 = new Label();
        final Label returnAscii = new Label();
        final Label end = new Label();

        loadMask(crb, asm, vecMaskAscii, 0xFF80);
        // shifting each 16-bit lane left by one turns 0xFF80 into 0xFF00
        asm.psllw(vectorSize, vecMaskBMP, vecMaskAscii, 1);

        vectorLoopPrologue(asm, arr, len, lengthTail, tailLessThan32, tailLessThan16, true);

        // first loop: ascii mask; on a hit continue in the second loop at the same index
        emitPTestLoop(crb, asm, arr, len, vecArray, vecMaskAscii, latin1Entry);
        emitPTestTail(asm, vectorSize, arr, lengthTail, vecArray, vecMaskAscii, null, latin1TailCmp, returnAscii);

        // second loop: BMP mask
        asm.bind(latin1Entry);
        emitPTestLoop(crb, asm, arr, len, vecArray, vecMaskBMP, returnBMP);
        emitPTestTail(asm, vectorSize, arr, lengthTail, vecArray, vecMaskBMP, latin1TailCmp, returnBMP, returnLatin1);

        if (supportsAVX2AndYMM()) {
            bmpTail(asm, arr, lengthTail, vecArray, vecMaskAscii, vecMaskBMP, tailLessThan32, tailLessThan16, returnBMP, returnLatin1, returnAscii);
        }

        emitExit(asm, ret, returnAscii, end, CR_7BIT);
        emitExit(asm, ret, returnLatin1, end, CR_8BIT);
        emitExit(asm, ret, returnBMP, end, CR_16BIT);

        asm.bind(tailLessThan16);
        // copy both masks into general purpose registers for the scalar TEST based tails
        asm.movdq(len, vecMaskAscii);
        asm.movdq(ret, vecMaskBMP);
        bmpTail(asm, AMD64BaseAssembler.OperandSize.QWORD, arr, lengthTail, len, ret, null, tailLessThan8, returnBMP, returnLatin1, returnAscii);
        bmpTail(asm, AMD64BaseAssembler.OperandSize.DWORD, arr, lengthTail, len, ret, tailLessThan8, tailLessThan4, returnBMP, returnLatin1, returnAscii);

        // zero or one element left
        asm.bind(tailLessThan4);
        asm.testlAndJcc(lengthTail, lengthTail, AMD64Assembler.ConditionFlag.Zero, returnAscii, true);
        asm.movzwq(len, new AMD64Address(arr));
        asm.testAndJcc(AMD64BaseAssembler.OperandSize.QWORD, len, 0xFF80, AMD64Assembler.ConditionFlag.Zero, returnAscii, true);
        asm.testAndJcc(AMD64BaseAssembler.OperandSize.QWORD, len, 0xFF00, AMD64Assembler.ConditionFlag.Zero, returnLatin1, true);
        asm.jmpb(returnBMP);

        asm.bind(end);
    }

    /**
     * Emits the XMM-sized tail of the BMP check. Tests the first and the end-aligned XMM vector
     * with the ascii mask; a hit switches to the BMP mask at the same position.
     *
     * @param asm assembler the code is emitted with
     * @param arr register pointing at the first tail element
     * @param lengthTail register holding the number of tail elements
     * @param vecArray vector register handed to {@code ptestU}; may be {@code null}
     * @param vecMaskAscii vector register holding the ascii mask
     * @param vecMaskBMP vector register holding the BMP mask
     * @param entry label bound at the start; must not be {@code null}
     * @param tooSmall jump target when fewer elements remain than fit into an XMM vector
     * @param returnBMP jump target when a BMP-mask bit is set
     * @param returnLatin1 jump target when only ascii-mask bits are set
     * @param returnAscii jump target when no masked bit is set
     */
    private void bmpTail(AMD64MacroAssembler asm, Register arr, Register lengthTail, Register vecArray, Register vecMaskAscii, Register vecMaskBMP, Label entry, Label tooSmall, Label returnBMP, Label returnLatin1, Label returnAscii) {
        asm.bind(entry);
        final Label bmpCheckHead = new Label();
        final Label bmpCheckEnd = new Label();
        final AVXKind.AVXSize xmm = AVXKind.AVXSize.XMM;

        asm.cmplAndJcc(lengthTail, elementsPerVector(xmm), AMD64Assembler.ConditionFlag.Less, tooSmall, true);

        // ascii mask: head vector, then end-aligned vector
        emitPTestCurr(asm, xmm, arr, vecArray, vecMaskAscii, null, bmpCheckHead);
        emitPTestTail(asm, xmm, arr, lengthTail, vecArray, vecMaskAscii, null, bmpCheckEnd, returnAscii);

        // BMP mask: same two positions, entered from the matching ascii test
        emitPTestCurr(asm, xmm, arr, vecArray, vecMaskBMP, bmpCheckHead, returnBMP);
        emitPTestTail(asm, xmm, arr, lengthTail, vecArray, vecMaskBMP, bmpCheckEnd, returnBMP, returnLatin1);
    }

    /**
     * Emits one scalar tail step of the BMP check using TEST instructions of the given operand
     * size. Tests the first and the end-aligned operand with the ascii mask; a hit switches to the
     * BMP mask at the same position.
     *
     * @param asm assembler the code is emitted with
     * @param size operand size of the TEST instructions
     * @param arr register pointing at the first tail element
     * @param lengthTail register holding the number of tail elements
     * @param maskAscii general purpose register holding the ascii mask
     * @param maskBMP general purpose register holding the BMP mask
     * @param entry label bound at the start, or {@code null} for none
     * @param tooSmall jump target when fewer elements remain than fit into {@code size}
     * @param returnBMP jump target when a BMP-mask bit is set
     * @param returnLatin1 jump target when only ascii-mask bits are set
     * @param returnAscii jump target when no masked bit is set
     */
    private void bmpTail(AMD64MacroAssembler asm, AMD64BaseAssembler.OperandSize size, Register arr, Register lengthTail, Register maskAscii, Register maskBMP, Label entry, Label tooSmall, Label returnBMP, Label returnLatin1, Label returnAscii) {
        bind(asm, entry);
        final Label bmpCheckHead = new Label();
        final Label bmpCheckEnd = new Label();

        asm.cmplAndJcc(lengthTail, elementsPerVector(size), AMD64Assembler.ConditionFlag.Less, tooSmall, true);

        // ascii mask: head operand, then end-aligned operand
        emitTestCurr(asm, size, arr, maskAscii, null, bmpCheckHead);
        emitTestTail(asm, size, arr, lengthTail, maskAscii, null, bmpCheckEnd, returnAscii);

        // BMP mask: same two positions, entered from the matching ascii test
        emitTestCurr(asm, size, arr, maskBMP, bmpCheckHead, returnBMP);
        emitTestTail(asm, size, arr, lengthTail, maskBMP, bmpCheckEnd, returnBMP, returnLatin1);
    }

    /**
     * Emits a scalar TEST of the operand located directly at {@code arr} against {@code mask},
     * with a short jump to {@code match} when a masked bit is set.
     *
     * @param asm assembler the code is emitted with
     * @param size operand size of the TEST instruction
     * @param arr register holding the address to test
     * @param mask general purpose register holding the mask
     * @param entry label bound at the start, or {@code null} for none
     * @param match jump target when a masked bit is set
     */
    private static void emitTestCurr(AMD64MacroAssembler asm, AMD64BaseAssembler.OperandSize size, Register arr, Register mask, Label entry, Label match) {
        bind(asm, entry);
        final AMD64Address current = new AMD64Address(arr);
        asm.testAndJcc(size, mask, current, AMD64Assembler.ConditionFlag.NotZero, match, true);
    }

    /**
     * Emits a scalar TEST of the operand that ends at {@code arr + lengthTail} elements, i.e.
     * starts {@code size} bytes before that position. Jumps to {@code match} when a masked bit is
     * set and to {@code noMatch} otherwise; both jumps are short.
     *
     * @param asm assembler the code is emitted with
     * @param size operand size of the TEST instruction
     * @param arr base register of the tail
     * @param lengthTail register holding the number of tail elements
     * @param mask general purpose register holding the mask
     * @param entry label bound at the start, or {@code null} for none
     * @param match jump target when a masked bit is set
     * @param noMatch jump target when no masked bit is set
     */
    private void emitTestTail(AMD64MacroAssembler asm, AMD64BaseAssembler.OperandSize size, Register arr, Register lengthTail, Register mask, Label entry, Label match, Label noMatch) {
        bind(asm, entry);
        final AMD64Address endAligned = new AMD64Address(arr, lengthTail, stride, -size.getBytes());
        asm.testAndJcc(size, mask, endAligned, AMD64Assembler.ConditionFlag.NotZero, match, true);
        asm.jmpb(noMatch);
    }

    private void emitUTF8(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register arr, Register len, Register lengthTail, Register ret, Register vecArray) {
        DataSection.Data validMaskTail;
        assert (this.stride.log2 == 0);
        Register tmp = ValueUtil.asRegister((Value)this.temp[0]);
        Register vecMask = ValueUtil.asRegister((Value)this.vectorTemp[1]);
        Register vecMaskCB = ValueUtil.asRegister((Value)this.vectorTemp[2]);
        Register vecTmp1 = ValueUtil.asRegister((Value)this.vectorTemp[3]);
        Register vecTmp2 = ValueUtil.asRegister((Value)this.vectorTemp[4]);
        Register vecPrevArray = this.assumeValid ? null : ValueUtil.asRegister((Value)this.vectorTemp[5]);
        Register vecError = this.assumeValid ? null : ValueUtil.asRegister((Value)this.vectorTemp[6]);
        Register vecPrevIsIncomplete = this.assumeValid ? null : ValueUtil.asRegister((Value)this.vectorTemp[7]);
        Label asciiLoop = new Label();
        Label labelMultiByteEntry = new Label();
        Label labelMultiByteLoop = new Label();
        Label labelMultiByteTail = new Label();
        Label tailLessThan32 = new Label();
        Label tailLessThan16 = new Label();
        Label tailLessThan8 = new Label();
        Label tailSingleVector = new Label();
        Label labelScalarTail = new Label();
        Label labelScalarAsciiLoop = new Label();
        Label labelScalarMultiByteLoop = new Label();
        Label labelScalarMultiByteLoopEntry = new Label();
        Label labelScalarMultiByteLoopSkipDec = new Label();
        Label returnValid = new Label();
        Label returnAscii = new Label();
        Label end = new Label();
        asm.movl(ret, len);
        this.loadMask(crb, asm, vecMask, 128);
        this.loadMask(crb, asm, vecMaskCB, 192);
        DataSection.Data data = validMaskTail = this.assumeValid ? this.createTailMask(crb) : null;
        if (!this.assumeValid) {
            asm.pxor(this.vectorSize, vecPrevArray, vecPrevArray);
            asm.pxor(this.vectorSize, vecError, vecError);
            asm.pxor(this.vectorSize, vecPrevIsIncomplete, vecPrevIsIncomplete);
        }
        this.vectorLoopPrologue(asm, arr, len, lengthTail, tailLessThan32, !this.assumeValid ? tailLessThan32 : tailLessThan16, false);
        AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
        asm.bind(asciiLoop);
        asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, len, this.stride));
        asm.ptest(this.vectorSize, vecArray, vecMask);
        asm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelMultiByteEntry);
        asm.addqAndJcc(len, this.vectorLength, AMD64Assembler.ConditionFlag.NotZero, asciiLoop, true);
        asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, lengthTail, this.stride, -this.vectorSize.getBytes()));
        asm.ptest(this.vectorSize, vecArray, vecMask);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelMultiByteTail, this.assumeValid);
        asm.jmp(returnAscii);
        if (this.assumeValid) {
            AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
            asm.bind(labelMultiByteLoop);
            asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, len, this.stride));
            asm.bind(labelMultiByteEntry);
            this.utf8SubtractContinuationBytes(asm, ret, vecArray, tmp, vecMask, vecMaskCB);
            asm.addqAndJcc(len, this.vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelMultiByteLoop, true);
            asm.testlAndJcc(lengthTail, lengthTail, AMD64Assembler.ConditionFlag.Zero, returnValid, false);
            asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, lengthTail, this.stride, -this.vectorSize.getBytes()));
            asm.bind(labelMultiByteTail);
            asm.leaq(tmp, (AMD64Address)crb.recordDataSectionReference(validMaskTail));
            asm.pandU(this.vectorSize, vecArray, new AMD64Address(tmp, lengthTail, this.stride), vecTmp1);
            this.utf8SubtractContinuationBytes(asm, ret, vecArray, tmp, vecMask, vecMaskCB);
            asm.jmp(returnValid);
            if (this.supportsAVX2AndYMM()) {
                asm.bind(tailLessThan32);
                asm.cmplAndJcc(lengthTail, this.elementsPerVector(AVXKind.AVXSize.XMM), AMD64Assembler.ConditionFlag.Less, tailLessThan16, true);
                this.loadLessThan32IntoYMMUnordered(crb, asm, validMaskTail, arr, lengthTail, tmp, vecArray, vecTmp1, vecTmp2);
                asm.jmpb(tailSingleVector);
            }
            asm.bind(tailLessThan16);
            asm.cmplAndJcc(lengthTail, this.elementsPerVector(AVXKind.AVXSize.QWORD), AMD64Assembler.ConditionFlag.Less, tailLessThan8, true);
            this.loadLessThan16IntoXMMUnordered(crb, asm, validMaskTail, arr, lengthTail, tmp, vecArray, vecTmp1, vecTmp2);
            asm.jmpb(tailSingleVector);
            asm.bind(tailLessThan8);
            asm.cmplAndJcc(lengthTail, this.elementsPerVector(AVXKind.AVXSize.DWORD), AMD64Assembler.ConditionFlag.Less, labelScalarTail, true);
            this.loadLessThan8IntoXMMOrdered(asm, arr, lengthTail, vecArray, tmp, len);
            asm.bind(tailSingleVector);
            asm.ptest(this.vectorSize, vecArray, vecMask);
            asm.jcc(AMD64Assembler.ConditionFlag.Zero, returnAscii);
            this.utf8SubtractContinuationBytes(asm, ret, vecArray, tmp, vecMask, vecMaskCB);
            asm.jmp(returnValid);
        } else {
            Label labelMultiByteEnd = new Label();
            Label labelMultiByteTailLoopEntry = new Label();
            byte[] isIncompleteMaskBytes = new byte[this.vectorSize.getBytes()];
            Arrays.fill(isIncompleteMaskBytes, (byte)-1);
            isIncompleteMaskBytes[this.vectorSize.getBytes() - 3] = -17;
            isIncompleteMaskBytes[this.vectorSize.getBytes() - 2] = -33;
            isIncompleteMaskBytes[this.vectorSize.getBytes() - 1] = -65;
            DataSection.Data isIncompleteMask = AMD64CalcStringAttributesOp.writeToDataSection(crb, isIncompleteMaskBytes);
            DataSection.Data mask0x0F = this.createMask(crb, 15);
            DataSection.Data mask3ByteSeq = this.createMask(crb, -33);
            DataSection.Data mask4ByteSeq = this.createMask(crb, -17);
            DataSection.Data xmmTailShuffleMask = AMD64CalcStringAttributesOp.writeToDataSection(crb, AMD64CalcStringAttributesOp.createXMMTailShuffleMask(AVXKind.AVXSize.XMM.getBytes()));
            Register vecTmp3 = ValueUtil.asRegister((Value)this.vectorTemp[8]);
            Register vecTmp4 = ValueUtil.asRegister((Value)this.vectorTemp[9]);
            AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
            asm.bind(labelMultiByteLoop);
            asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, len, this.stride));
            asm.bind(labelMultiByteTailLoopEntry);
            asm.ptest(this.vectorSize, vecArray, vecMask);
            asm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelMultiByteEntry);
            asm.por(this.vectorSize, vecError, vecPrevIsIncomplete);
            asm.addqAndJcc(len, this.vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelMultiByteLoop, true);
            asm.jmp(labelMultiByteTail);
            asm.bind(labelMultiByteEntry);
            asm.pand(this.vectorSize, vecTmp1, vecArray, vecMaskCB);
            asm.pcmpeqb(this.vectorSize, vecTmp1, vecMask);
            asm.pmovmsk(this.vectorSize, tmp, vecTmp1);
            asm.popcntl(tmp, tmp);
            asm.subl(ret, tmp);
            AMD64CalcStringAttributesOp.prev(asm, this.vectorSize, vecTmp3, vecArray, vecPrevArray, 1);
            asm.psrlw(this.vectorSize, vecTmp4, vecTmp3, 4);
            AMD64CalcStringAttributesOp.pandData(crb, asm, this.vectorSize, vecTmp3, mask0x0F, vecTmp1);
            AMD64CalcStringAttributesOp.pandData(crb, asm, this.vectorSize, vecTmp4, mask0x0F, vecTmp1);
            asm.movdqu(this.vectorSize, vecTmp1, AMD64CalcStringAttributesOp.getMaskOnce(crb, this.getStaticLUT(UTF8_BYTE_1_LOW_TABLE)));
            asm.movdqu(this.vectorSize, vecTmp2, AMD64CalcStringAttributesOp.getMaskOnce(crb, this.getStaticLUT(UTF8_BYTE_1_HIGH_TABLE)));
            asm.pshufb(this.vectorSize, vecTmp1, vecTmp3);
            asm.pshufb(this.vectorSize, vecTmp2, vecTmp4);
            asm.pand(this.vectorSize, vecTmp1, vecTmp2);
            asm.movdqu(this.vectorSize, vecTmp2, AMD64CalcStringAttributesOp.getMaskOnce(crb, this.getStaticLUT(UTF8_BYTE_2_HIGH_TABLE)));
            asm.psrlw(this.vectorSize, vecTmp3, vecArray, 4);
            AMD64CalcStringAttributesOp.pandData(crb, asm, this.vectorSize, vecTmp3, mask0x0F, vecTmp4);
            asm.pshufb(this.vectorSize, vecTmp2, vecTmp3);
            asm.pand(this.vectorSize, vecTmp1, vecTmp2);
            AMD64CalcStringAttributesOp.prev(asm, this.vectorSize, vecTmp2, vecArray, vecPrevArray, 2);
            AMD64CalcStringAttributesOp.prev(asm, this.vectorSize, vecTmp3, vecArray, vecPrevArray, 3);
            AMD64CalcStringAttributesOp.psubusbData(crb, asm, this.vectorSize, vecTmp2, vecTmp2, mask3ByteSeq, vecTmp4);
            AMD64CalcStringAttributesOp.psubusbData(crb, asm, this.vectorSize, vecTmp3, vecTmp3, mask4ByteSeq, vecTmp4);
            asm.por(this.vectorSize, vecTmp2, vecTmp3);
            asm.pxor(this.vectorSize, vecTmp3, vecTmp3);
            asm.pcmpgtb(this.vectorSize, vecTmp2, vecTmp3);
            asm.pand(this.vectorSize, vecTmp2, vecMask);
            asm.pxor(this.vectorSize, vecTmp1, vecTmp2);
            asm.por(this.vectorSize, vecError, vecTmp1);
            AMD64CalcStringAttributesOp.psubusbData(crb, asm, this.vectorSize, vecPrevIsIncomplete, vecArray, isIncompleteMask, vecTmp1);
            asm.movdqu(this.vectorSize, vecPrevArray, vecArray);
            asm.addqAndJcc(len, this.vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelMultiByteLoop, false);
            asm.bind(labelMultiByteTail);
            asm.testqAndJcc(lengthTail, lengthTail, AMD64Assembler.ConditionFlag.Zero, labelMultiByteEnd, true);
            this.loadTailIntoYMMOrdered(crb, asm, xmmTailShuffleMask, arr, lengthTail, vecArray, tmp, vecTmp1, vecTmp2);
            asm.xorq(lengthTail, lengthTail);
            asm.subq(len, this.vectorLength);
            asm.jmp(labelMultiByteTailLoopEntry);
            asm.bind(labelMultiByteEnd);
            asm.por(this.vectorSize, vecError, vecPrevIsIncomplete);
            asm.ptest(this.vectorSize, vecError, vecError);
            asm.jcc(AMD64Assembler.ConditionFlag.Zero, returnValid);
            asm.shlq(ret, 32);
            asm.orq(ret, 6);
            asm.jmp(end);
            asm.bind(tailLessThan32);
            Register tmp2 = ValueUtil.asRegister((Value)this.temp[1]);
            if (this.supportsAVX2AndYMM()) {
                asm.cmplAndJcc(lengthTail, 16, AMD64Assembler.ConditionFlag.Less, tailLessThan16, true);
                this.loadLessThan32IntoYMMOrdered(crb, asm, xmmTailShuffleMask, arr, lengthTail, tmp, vecArray, vecTmp1, vecTmp2);
                asm.jmp(tailSingleVector);
                asm.bind(tailLessThan16);
            }
            asm.cmplAndJcc(lengthTail, 8, AMD64Assembler.ConditionFlag.Less, tailLessThan8, true);
            this.loadLessThan16IntoXMMOrdered(crb, asm, arr, lengthTail, tmp, vecArray, vecTmp1, vecTmp2);
            asm.jmpb(tailSingleVector);
            asm.bind(tailLessThan8);
            asm.cmplAndJcc(lengthTail, 4, AMD64Assembler.ConditionFlag.Less, labelScalarTail, true);
            this.loadLessThan8IntoXMMOrdered(asm, arr, lengthTail, vecArray, tmp, tmp2);
            asm.bind(tailSingleVector);
            asm.ptest(this.vectorSize, vecArray, vecMask);
            asm.jcc(AMD64Assembler.ConditionFlag.Zero, returnAscii);
            asm.subq(len, this.vectorLength);
            asm.xorq(lengthTail, lengthTail);
            asm.jmp(labelMultiByteEntry);
        }
        asm.bind(labelScalarTail);
        asm.leaq(arr, new AMD64Address(arr, lengthTail, this.stride));
        asm.testqAndJcc(lengthTail, lengthTail, AMD64Assembler.ConditionFlag.Zero, returnAscii, false);
        asm.negq(lengthTail);
        AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
        asm.bind(labelScalarAsciiLoop);
        asm.movzbl(tmp, new AMD64Address(arr, lengthTail, this.stride));
        asm.testlAndJcc(tmp, 128, AMD64Assembler.ConditionFlag.NotZero, labelScalarMultiByteLoopEntry, true);
        asm.incqAndJcc(lengthTail, AMD64Assembler.ConditionFlag.NotZero, labelScalarAsciiLoop, true);
        asm.jmpb(returnAscii);
        asm.bind(labelScalarMultiByteLoopEntry);
        if (this.assumeValid) {
            AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
            asm.bind(labelScalarMultiByteLoop);
            asm.movzbq(tmp, new AMD64Address(arr, lengthTail, this.stride));
            asm.andl(tmp, 192);
            asm.cmplAndJcc(tmp, 128, AMD64Assembler.ConditionFlag.NotEqual, labelScalarMultiByteLoopSkipDec, true);
            asm.decl(ret);
            asm.bind(labelScalarMultiByteLoopSkipDec);
            asm.incqAndJcc(lengthTail, AMD64Assembler.ConditionFlag.NotZero, labelScalarMultiByteLoop, true);
        } else {
            Register state = ValueUtil.asRegister((Value)this.temp[1]);
            Register type = ValueUtil.asRegister((Value)this.temp[2]);
            asm.leaq(len, AMD64CalcStringAttributesOp.getMaskOnce(crb, UTF_8_STATE_MACHINE));
            asm.xorq(state, state);
            AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
            asm.bind(labelScalarMultiByteLoop);
            asm.movzbq(tmp, new AMD64Address(arr, lengthTail, this.stride));
            asm.movzbq(type, new AMD64Address(len, tmp, this.stride));
            asm.andl(tmp, 192);
            asm.addq(type, state);
            asm.movzbq(state, new AMD64Address(len, type, this.stride, 256));
            asm.cmplAndJcc(tmp, 128, AMD64Assembler.ConditionFlag.NotEqual, labelScalarMultiByteLoopSkipDec, true);
            asm.decl(ret);
            asm.bind(labelScalarMultiByteLoopSkipDec);
            asm.incqAndJcc(lengthTail, AMD64Assembler.ConditionFlag.NotZero, labelScalarMultiByteLoop, true);
            asm.testqAndJcc(state, state, AMD64Assembler.ConditionFlag.Zero, returnValid, true);
            asm.shlq(ret, 32);
            asm.orq(ret, 6);
            asm.jmpb(end);
        }
        AMD64CalcStringAttributesOp.emitExitMultiByte(asm, ret, returnValid, end, 5);
        AMD64CalcStringAttributesOp.emitExitMultiByteAtEnd(asm, ret, returnAscii, end, 0);
    }

    /**
     * Emits code that counts the byte lanes of {@code vecArray} which, after masking with
     * {@code vecMaskCB}, are equal to the corresponding lane of {@code vecMask}, and subtracts that
     * count from the 32-bit value in {@code ret}.
     *
     * <p>This looks like the vector counterpart of the scalar {@code (b & 0xC0) == 0x80}
     * continuation-byte test; the actual mask contents are set up by the caller and are not visible
     * here, so confirm against the call site.
     *
     * @param asm assembler to emit into
     * @param ret running counter that is decremented by the number of matching lanes
     * @param vecArray input vector; overwritten with the comparison result
     * @param tmp general purpose scratch register; receives the lane mask, then its population count
     * @param vecMask vector every masked lane is compared against
     * @param vecMaskCB vector that is AND-ed onto {@code vecArray} before the comparison
     */
    private void utf8SubtractContinuationBytes(AMD64MacroAssembler asm, Register ret, Register vecArray, Register tmp, Register vecMask, Register vecMaskCB) {
        final AVXKind.AVXSize size = this.vectorSize;
        // isolate the bits of interest, then compare byte-wise against the expected pattern
        asm.pand(size, vecArray, vecMaskCB);
        asm.pcmpeqb(size, vecArray, vecMask);
        // collapse the comparison result into a bit mask and count the set bits
        asm.pmovmsk(size, tmp, vecArray);
        asm.popcntl(tmp, tmp);
        asm.subl(ret, tmp);
    }

    /**
     * Emits code that loads the last {@code lengthTail} elements in front of
     * {@code arr + lengthTail * stride} into {@code vecArray}, keeping them in memory order and
     * (presumably, via the 0xff entries of the shuffle mask) zeroing the unused lanes.
     *
     * <p>Without AVX2/YMM support a single 16-byte load ending at the tail end is rearranged with a
     * shuffle mask taken from {@code xmmTailShuffleMask}. With AVX2/YMM support the same is done for
     * tails shorter than one XMM vector; longer tails additionally load the first 16 bytes at
     * {@code arr} and merge both halves with {@code VPERM2I128}.
     *
     * <p>{@code lengthTail} is negated by the emitted code on every path.
     *
     * @param crb compilation result builder used to reference the shuffle mask in the data section
     * @param asm assembler to emit into
     * @param xmmTailShuffleMask data section entry holding the shuffle mask table
     * @param arr register holding the address of the tail's first element
     * @param lengthTail register holding the tail length in elements; negated by the emitted code
     * @param vecArray destination vector register
     * @param tmp general purpose scratch register; receives the address of the shuffle mask
     * @param vecTmp1 vector scratch register (only used on the AVX2 path)
     * @param vecTmp2 vector scratch register; receives the shuffle mask
     */
    private void loadTailIntoYMMOrdered(CompilationResultBuilder crb, AMD64MacroAssembler asm, DataSection.Data xmmTailShuffleMask, Register arr, Register lengthTail, Register vecArray, Register tmp, Register vecTmp1, Register vecTmp2) {
        final AVXKind.AVXSize xmm = AVXKind.AVXSize.XMM;
        final int xmmBytes = xmm.getBytes();

        // common prologue: address of the shuffle table and the 16 bytes ending at the tail end
        asm.leaq(tmp, (AMD64Address)crb.recordDataSectionReference(xmmTailShuffleMask));
        asm.movdqu(xmm, vecArray, new AMD64Address(arr, lengthTail, this.stride, -xmmBytes));

        Label done = null;
        if (this.supportsAVX2AndYMM()) {
            Label lessThan16 = new Label();
            done = new Label();
            asm.cmpqAndJcc(lengthTail, this.elementsPerVector(xmm), AMD64Assembler.ConditionFlag.Less, lessThan16, true);
            // tail spans more than one XMM vector: fetch the leading 16 bytes as well
            asm.movdqu(xmm, vecTmp1, new AMD64Address(arr));
            asm.negq(lengthTail);
            asm.movdqu(xmm, vecTmp2, new AMD64Address(tmp, lengthTail, this.stride, xmmBytes * 2));
            asm.pshufb(xmm, vecArray, vecTmp2);
            AMD64Assembler.VexRVMIOp.VPERM2I128.emit((AMD64Assembler)asm, this.vectorSize, vecArray, vecArray, vecTmp1, 2);
            asm.jmpb(done);
            asm.bind(lessThan16);
        }

        // short tail (and the only path without AVX2): one shuffle is sufficient
        asm.negq(lengthTail);
        asm.movdqu(xmm, vecTmp2, new AMD64Address(tmp, lengthTail, this.stride, xmmBytes));
        asm.pshufb(xmm, vecArray, vecTmp2);

        if (done != null) {
            asm.bind(done);
        }
    }

    /**
     * Emits code that loads an input of at least one XMM vector but less than one YMM vector into
     * {@code vecArray}, in memory order, using two overlapping 16-byte loads.
     *
     * <p>The first 16 bytes go to {@code vecTmp1}. The 16 bytes ending at the end of the input go to
     * {@code vecArray} and are shuffled with a mask taken from {@code xmmTailShuffleMask} so that,
     * presumably, only the bytes not already covered by the first load remain. Both halves are then
     * merged via {@code VPERM2I128} with immediate 2.
     *
     * @param crb compilation result builder used to reference the shuffle mask in the data section
     * @param asm assembler to emit into
     * @param xmmTailShuffleMask data section entry holding the shuffle mask table
     * @param arr register holding the address of the first element
     * @param lengthTail register holding the input length in elements; negated by the emitted code
     * @param tmp general purpose scratch register; receives the address of the shuffle mask
     * @param vecArray destination vector register
     * @param vecTmp1 vector scratch register; receives the first 16 bytes
     * @param vecTmp2 vector scratch register; receives the shuffle mask
     */
    private void loadLessThan32IntoYMMOrdered(CompilationResultBuilder crb, AMD64MacroAssembler asm, DataSection.Data xmmTailShuffleMask, Register arr, Register lengthTail, Register tmp, Register vecArray, Register vecTmp1, Register vecTmp2) {
        final AVXKind.AVXSize xmm = AVXKind.AVXSize.XMM;
        final int xmmBytes = xmm.getBytes();
        // two overlapping loads: head of the input and the 16 bytes ending at its end
        asm.movdqu(xmm, vecTmp1, new AMD64Address(arr));
        asm.movdqu(xmm, vecArray, new AMD64Address(arr, lengthTail, this.stride, -xmmBytes));
        asm.leaq(tmp, (AMD64Address)crb.recordDataSectionReference(xmmTailShuffleMask));
        // index the shuffle table with the negated length
        asm.negq(lengthTail);
        asm.movdqu(xmm, vecTmp2, new AMD64Address(tmp, lengthTail, this.stride, 2 * xmmBytes));
        asm.pshufb(xmm, vecArray, vecTmp2);
        // merge head and shuffled tail into the full-width destination
        AMD64Assembler.VexRVMIOp.VPERM2I128.emit((AMD64Assembler)asm, this.vectorSize, vecArray, vecArray, vecTmp1, 2);
    }

    /**
     * Emits code that loads an input of at least one XMM vector but less than one YMM vector into
     * {@code vecArray} without preserving element order.
     *
     * <p>The first 16 bytes are loaded into {@code vecArray}, the 16 bytes ending at the end of the
     * input into {@code vecTmp1}. The latter is AND-ed with an entry of {@code maskTail} selected by
     * {@code lengthTail} (presumably clearing the elements that overlap the first load) before both
     * halves are merged via {@code VPERM2I128} with immediate 2.
     *
     * @param crb compilation result builder used to reference the tail mask in the data section
     * @param asm assembler to emit into
     * @param maskTail data section entry holding the tail mask table
     * @param arr register holding the address of the first element
     * @param lengthTail register holding the input length in elements
     * @param tmp general purpose scratch register; receives the address of the tail mask
     * @param vecArray destination vector register
     * @param vecTmp1 vector scratch register; receives the trailing 16 bytes
     * @param vecTmp2 vector scratch register handed to {@code pandU}
     */
    private void loadLessThan32IntoYMMUnordered(CompilationResultBuilder crb, AMD64MacroAssembler asm, DataSection.Data maskTail, Register arr, Register lengthTail, Register tmp, Register vecArray, Register vecTmp1, Register vecTmp2) {
        final AVXKind.AVXSize xmm = AVXKind.AVXSize.XMM;
        final int tailDisplacement = -xmm.getBytes();
        asm.movdqu(xmm, vecArray, new AMD64Address(arr));
        asm.movdqu(xmm, vecTmp1, new AMD64Address(arr, lengthTail, this.stride, tailDisplacement));
        asm.leaq(tmp, (AMD64Address)crb.recordDataSectionReference(maskTail));
        // mask the trailing load with the table entry selected by the length
        asm.pandU(this.vectorSize, vecTmp1, new AMD64Address(tmp, lengthTail, this.stride), vecTmp2);
        AMD64Assembler.VexRVMIOp.VPERM2I128.emit((AMD64Assembler)asm, this.vectorSize, vecArray, vecArray, vecTmp1, 2);
    }

    /**
     * Emits code that loads an input of 8 to 15 bytes into {@code vecArray}, in memory order, using
     * two overlapping 8-byte loads.
     *
     * <p>The first 8 bytes go to {@code vecArray}. The 8 bytes ending at the end of the input go to
     * {@code vecTmp1} and are shuffled with an entry of an 8-element tail shuffle mask so that,
     * presumably, only the bytes not covered by the first load remain. {@code movlhps} then places
     * them into the upper half of {@code vecArray}.
     *
     * @param crb compilation result builder used to emit the shuffle mask as inline data reference
     * @param asm assembler to emit into
     * @param arr register holding the address of the first element
     * @param lengthTail register holding the input length in elements; negated by the emitted code
     * @param tmp general purpose scratch register; receives the address of the shuffle mask
     * @param vecArray destination vector register
     * @param vecTmp1 vector scratch register; receives the trailing 8 bytes
     * @param vecTmp2 vector scratch register; receives the shuffle mask
     */
    private void loadLessThan16IntoXMMOrdered(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register arr, Register lengthTail, Register tmp, Register vecArray, Register vecTmp1, Register vecTmp2) {
        final AVXKind.AVXSize xmm = AVXKind.AVXSize.XMM;
        final int xmmBytes = xmm.getBytes();
        final byte[] shuffleTable = AMD64CalcStringAttributesOp.createXMMTailShuffleMask(8);

        asm.movdq(vecArray, new AMD64Address(arr));
        asm.movdq(vecTmp1, new AMD64Address(arr, lengthTail, this.stride, -8));
        // NOTE(review): the data reference is created right here, as part of the leaq, because it is
        // presumably recorded against the current code position - do not hoist it.
        asm.leaq(tmp, AMD64CalcStringAttributesOp.getMaskOnce(crb, shuffleTable, xmmBytes * 2));
        asm.negq(lengthTail);
        asm.movdqu(xmm, vecTmp2, new AMD64Address(tmp, lengthTail, this.stride, xmmBytes));
        asm.pshufb(xmm, vecTmp1, vecTmp2);
        // low half: head of the input, high half: shuffled tail
        asm.movlhps(vecArray, vecTmp1);
    }

    /**
     * Emits code that loads an input of at least 8 but less than 16 bytes into {@code vecArray}
     * without preserving element order.
     *
     * <p>The first 8 bytes are loaded into {@code vecArray}, the 8 bytes ending at the end of the
     * input into {@code vecTmp1}. The latter is AND-ed with an entry of {@code maskTail} and moved
     * into the upper half of {@code vecArray}. The mask table is addressed with an additional
     * displacement of one XMM vector when AVX2/YMM is in use.
     *
     * @param crb compilation result builder used to reference the tail mask in the data section
     * @param asm assembler to emit into
     * @param maskTail data section entry holding the tail mask table
     * @param arr register holding the address of the first element
     * @param lengthTail register holding the input length in elements
     * @param tmp general purpose scratch register; receives the address of the tail mask
     * @param vecArray destination vector register
     * @param vecTmp1 vector scratch register; receives the trailing 8 bytes
     * @param vecTmp2 vector scratch register handed to {@code pandU}
     */
    private void loadLessThan16IntoXMMUnordered(CompilationResultBuilder crb, AMD64MacroAssembler asm, DataSection.Data maskTail, Register arr, Register lengthTail, Register tmp, Register vecArray, Register vecTmp1, Register vecTmp2) {
        // the mask table is laid out for the full vector size, hence the extra offset with YMM
        final int maskDisplacement;
        if (this.supportsAVX2AndYMM()) {
            maskDisplacement = AVXKind.AVXSize.XMM.getBytes();
        } else {
            maskDisplacement = 0;
        }
        asm.movdq(vecArray, new AMD64Address(arr));
        asm.movdq(vecTmp1, new AMD64Address(arr, lengthTail, this.stride, -8));
        asm.leaq(tmp, (AMD64Address)crb.recordDataSectionReference(maskTail));
        asm.pandU(this.vectorSize, vecTmp1, new AMD64Address(tmp, lengthTail, this.stride, maskDisplacement), vecTmp2);
        asm.movlhps(vecArray, vecTmp1);
    }

    /**
     * Emits code that loads an input of 4 to 7 bytes into {@code vecArray}, in memory order, using
     * two overlapping 4-byte loads that are combined in general purpose registers.
     *
     * <p>The first 4 bytes are loaded into {@code tmp}, the 4 bytes ending at the end of the input
     * into {@code tmp2}. {@code lengthTail} is turned into a bit count
     * ({@code (lengthTail & (3 >> log2)) << (3 + log2)}), i.e. the number of bits in the second load
     * that do not overlap the first one. Shifting {@code tmp2} left by that amount moves exactly
     * those bits above bit 31; the subsequent right/left shift by 32 clears everything below. The
     * result is OR-ed onto {@code tmp} and moved into {@code vecArray}.
     *
     * <p>Only valid for strides with {@code log2 < 2}. {@code lengthTail} must be {@code rcx} since
     * the variable shift takes its count from that register.
     *
     * @param asm assembler to emit into
     * @param arr register holding the address of the first element
     * @param lengthTail register holding the input length in elements; clobbered
     * @param vecArray destination vector register
     * @param tmp general purpose scratch register; receives the combined 64-bit value
     * @param tmp2 general purpose scratch register; receives the trailing 4 bytes
     */
    private void loadLessThan8IntoXMMOrdered(AMD64MacroAssembler asm, Register arr, Register lengthTail, Register vecArray, Register tmp, Register tmp2) {
        final int strideLog2 = this.stride.log2;
        assert (strideLog2 < 2);
        final AMD64Address headAddress = new AMD64Address(arr);
        final AMD64Address tailAddress = new AMD64Address(arr, lengthTail, this.stride, -4);

        asm.movl(tmp, headAddress);
        asm.movl(tmp2, tailAddress);
        // number of non-overlapping elements in the second load ...
        asm.andq(lengthTail, 3 >> strideLog2);
        // ... converted to a bit count
        asm.shlq(lengthTail, 3 + strideLog2);
        assert (lengthTail.equals((Object)AMD64.rcx));
        // variable shift by the count held in lengthTail
        asm.shlq(tmp2);
        // clear the lower 32 bits, keeping only the bytes that follow the first load
        asm.shrq(tmp2, 32);
        asm.shlq(tmp2, 32);
        asm.orq(tmp, tmp2);
        asm.movdq(vecArray, tmp);
    }

    /**
     * Emits code that loads an input of at least 4 but less than 8 bytes into {@code vecArray}
     * without preserving element order.
     *
     * <p>The 4 bytes ending at the end of the input are loaded into {@code tmp2} and AND-ed with an
     * entry of {@code maskTail}; the first 4 bytes are loaded into {@code tmp}. The masked tail is
     * shifted into the upper 32 bits, OR-ed onto {@code tmp} and the result is moved into
     * {@code vecArray}. The mask table is addressed with a displacement of 8, plus one XMM vector
     * when AVX2/YMM is in use.
     *
     * @param crb compilation result builder used to reference the tail mask in the data section
     * @param asm assembler to emit into
     * @param maskTail data section entry holding the tail mask table
     * @param arr register holding the address of the first element
     * @param lengthTail register holding the input length in elements
     * @param vecArray destination vector register
     * @param tmp general purpose scratch register; first the mask address, then the combined value
     * @param tmp2 general purpose scratch register; receives the trailing 4 bytes
     */
    private void loadLessThan8IntoXMMUnordered(CompilationResultBuilder crb, AMD64MacroAssembler asm, DataSection.Data maskTail, Register arr, Register lengthTail, Register vecArray, Register tmp, Register tmp2) {
        final int vectorOffset = this.supportsAVX2AndYMM() ? AVXKind.AVXSize.XMM.getBytes() : 0;
        final int maskDisplacement = vectorOffset + 8;

        asm.leaq(tmp, (AMD64Address)crb.recordDataSectionReference(maskTail));
        asm.movl(tmp2, new AMD64Address(arr, lengthTail, this.stride, -4));
        asm.andq(tmp2, new AMD64Address(tmp, lengthTail, this.stride, maskDisplacement));
        // tmp is free again once the mask has been applied; reuse it for the leading bytes
        asm.movl(tmp, new AMD64Address(arr));
        asm.shlq(tmp2, 32);
        asm.orq(tmp, tmp2);
        asm.movdq(vecArray, tmp);
    }

    /**
     * Returns the given XMM-sized lookup table in a form suitable for the active vector size.
     *
     * @param table lookup table with exactly as many bytes as an XMM register
     * @return {@code table} itself when AVX2/YMM is not in use; otherwise a new array of twice the
     *         length that contains {@code table} two times back to back
     */
    private byte[] getStaticLUT(byte[] table) {
        assert (table.length == AVXKind.AVXSize.XMM.getBytes());
        if (!this.supportsAVX2AndYMM()) {
            return table;
        }
        // replicate the 16-byte table into both halves of the wider table
        final int n = table.length;
        final byte[] doubled = new byte[n * 2];
        System.arraycopy(table, 0, doubled, 0, n);
        System.arraycopy(table, 0, doubled, n, n);
        return doubled;
    }

    /**
     * Emits an unsigned saturating byte subtraction {@code dst = src1 - src2} where {@code src2} is
     * a constant in the data section.
     *
     * <p>With AVX the memory operand is used directly. Otherwise {@code src1} is copied to
     * {@code dst} (unless they are the same register), the constant is loaded into {@code tmp} and
     * the two-operand form is emitted.
     *
     * @param crb compilation result builder used to reference {@code src2}
     * @param asm assembler to emit into
     * @param size vector size of the operation (only used on the AVX path)
     * @param dst destination register
     * @param src1 minuend register
     * @param src2 data section entry holding the subtrahend
     * @param tmp vector scratch register (only used on the non-AVX path)
     */
    private static void psubusbData(CompilationResultBuilder crb, AMD64MacroAssembler asm, AVXKind.AVXSize size, Register dst, Register src1, DataSection.Data src2, Register tmp) {
        if (asm.isAVX()) {
            AMD64Assembler.VexRVMOp.VPSUBUSB.emit((AMD64Assembler)asm, size, dst, src1, (AMD64Address)crb.recordDataSectionReference(src2));
            return;
        }
        // SSE fallback: destructive two-operand form with the constant in a register
        boolean sameRegister = dst.equals((Object)src1);
        if (!sameRegister) {
            asm.movdqu(dst, src1);
        }
        asm.movdqu(tmp, (AMD64Address)crb.recordDataSectionReference(src2));
        asm.psubusb(dst, tmp);
    }

    /**
     * Emits a bitwise AND of {@code vecDst} with a constant in the data section.
     *
     * <p>With AVX the memory operand is used directly; otherwise the constant is first loaded into
     * {@code vecTmp}.
     *
     * @param crb compilation result builder used to reference {@code mask}
     * @param asm assembler to emit into
     * @param avxSize vector size of the operation
     * @param vecDst register that is AND-ed in place
     * @param mask data section entry holding the mask
     * @param vecTmp vector scratch register (only used on the non-AVX path)
     */
    private static void pandData(CompilationResultBuilder crb, AMD64MacroAssembler asm, AVXKind.AVXSize avxSize, Register vecDst, DataSection.Data mask, Register vecTmp) {
        if (!asm.isAVX()) {
            // SSE fallback: load the constant into a register first
            asm.movdqu(vecTmp, (AMD64Address)crb.recordDataSectionReference(mask));
            asm.pand(avxSize, vecDst, vecTmp);
            return;
        }
        asm.pand(avxSize, vecDst, (AMD64Address)crb.recordDataSectionReference(mask));
    }

    /**
     * Emits code that combines the previous and the current input vector via {@code PALIGNR} with a
     * byte shift of {@code 16 - n}.
     *
     * <p>Presumably the result holds, for every byte position, the input byte {@code n} positions
     * earlier in the stream (the first {@code n} bytes coming from {@code prev}) - confirm against
     * the PALIGNR/VPERM2I128 operand semantics of the assembler. For YMM-sized vectors an
     * intermediate {@code VPERM2I128} with immediate 33 is emitted first because {@code PALIGNR}
     * operates within 128-bit lanes.
     *
     * @param asm assembler to emit into
     * @param size vector size of the operation
     * @param dst destination register
     * @param cur register holding the current vector
     * @param prev register holding the preceding vector
     * @param n look-back distance in bytes
     */
    private static void prev(AMD64MacroAssembler asm, AVXKind.AVXSize size, Register dst, Register cur, Register prev, int n) {
        final int shift = 16 - n;
        if (size != AVXKind.AVXSize.YMM) {
            asm.palignr(size, dst, cur, prev, shift);
            return;
        }
        // build the lane-crossing intermediate from prev and cur, then align per lane
        AMD64Assembler.VexRVMIOp.VPERM2I128.emit((AMD64Assembler)asm, size, dst, prev, cur, 33);
        asm.palignr(size, dst, cur, dst, shift);
    }

    /**
     * Emits the attribute calculation for 2-byte strides (the stride is asserted to have
     * {@code log2 == 1}).
     *
     * <p>The emitted code escalates through several stages, each testing the input against a
     * stricter mask: chars that fit the 0xff80 mask test ({@code returnAscii}, exit code 0), chars
     * that fit the 0xff00 mask test ({@code returnLatin1}, exit code 1), chars without surrogates
     * ({@code returnBMP}, exit code 2) and finally input containing surrogates
     * ({@code returnValidOrBroken}).
     *
     * <p>On the surrogate exit, {@code ret} - which starts out as a copy of {@code len} and is
     * decremented once per matched surrogate pair - is shifted left by 32 bits and OR-ed with a
     * code: 5 when {@code assumeValid} is set, otherwise the content of {@code retBroken}, which is 5
     * unless a validation step replaced it with 6. The codes 5 and 6 are presumably the
     * valid/broken multi-byte code range constants; they are not declared in the visible part of
     * the file.
     *
     * <p>NOTE(review): {@code vectorLoopPrologue}, {@code emitPTestLoop}, {@code emitPTestTail} and
     * {@code emitExitMultiByte} are defined outside of this chunk. The comments below assume that
     * after the prologue {@code len} is a negative element index that counts up to zero (all loops
     * here add {@code vectorLength} and continue while the result is non-zero) and that
     * {@code lengthTail} holds the number of elements left over after the last full vector.
     *
     * @param crb compilation result builder, used for data section references and alignment
     * @param asm assembler to emit into
     * @param arr register holding the array address
     * @param len register holding the length in elements; also used as scratch on some tail paths
     * @param lengthTail register holding the tail length
     * @param ret result register
     * @param vecArray vector register receiving the loaded input
     */
    private void emitUTF16(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register arr, Register len, Register lengthTail, Register ret, Register vecArray) {
        assert (this.stride.log2 == 1);
        Register vecArrayTail = ValueUtil.asRegister((Value)this.vectorTemp[1]);
        Register vecMaskAscii = ValueUtil.asRegister((Value)this.vectorTemp[2]);
        Register vecMaskLatin = ValueUtil.asRegister((Value)this.vectorTemp[3]);
        Register vecMaskSurrogate = ValueUtil.asRegister((Value)this.vectorTemp[4]);
        Register vecTmp = ValueUtil.asRegister((Value)this.vectorTemp[5]);
        Register vecResult = ValueUtil.asRegister((Value)this.vectorTemp[6]);
        Register tmp = ValueUtil.asRegister((Value)this.temp[0]);
        // only needed when the input has to be validated
        Register retBroken = this.assumeValid ? null : ValueUtil.asRegister((Value)this.temp[1]);
        Label latin1Entry = new Label();
        Label latin1Tail = new Label();
        Label bmpLoop = new Label();
        Label bmpTail = new Label();
        Label labelSurrogateEntry = new Label();
        Label labelSurrogateLoop = new Label();
        Label labelSurrogateTail = new Label();
        Label tailLessThan32 = new Label();
        Label tailLessThan16 = new Label();
        Label tailLessThan8 = new Label();
        Label tailLessThan4 = new Label();
        Label tailSingleVector = new Label();
        Label tailSingleVectorSurrogate = new Label();
        Label surrogateExactlyVectorSize = new Label();
        Label returnValidOrBroken = new Label();
        Label returnBMP = new Label();
        Label returnLatin1 = new Label();
        Label returnAscii = new Label();
        Label end = new Label();
        // ret starts as the number of chars; matched surrogate pairs are subtracted later on
        asm.movl(ret, len);
        if (!this.assumeValid) {
            // start out with code 5; validation replaces it with 6 on error.
            // vecResult accumulates surrogate mismatches and therefore starts at zero.
            asm.movl(retBroken, 5);
            asm.pxor(this.vectorSize, vecResult, vecResult);
        }
        // 65408 == 0xff80
        this.loadMask(crb, asm, vecMaskAscii, 65408);
        // 54 == 0xd800 >> 10 (high surrogate prefix), 27 == 0xd800 >> 11 (prefix shared by high and
        // low surrogates); the matching shift is chosen in utf16FindSurrogatesAndTest
        this.loadMask(crb, asm, vecMaskSurrogate, this.assumeValid ? 54 : 27);
        // 0xff80 << 1 == 0xff00 per 16-bit lane
        asm.psllw(this.vectorSize, vecMaskLatin, vecMaskAscii, 1);
        DataSection.Data maskTail = this.createTailMask(crb);
        // the ordered tail loads are only used when validating
        DataSection.Data xmmTailShuffleMask = this.assumeValid ? null : AMD64CalcStringAttributesOp.writeToDataSection(crb, AMD64CalcStringAttributesOp.createXMMTailShuffleMask(AVXKind.AVXSize.XMM.getBytes()));
        this.vectorLoopPrologue(asm, arr, len, lengthTail, tailLessThan32, tailLessThan16, false);
        // pre-load the last full vector of the input; it overlaps the last loop iteration unless the
        // length is a multiple of the vector length
        asm.movdqu(this.vectorSize, vecArrayTail, new AMD64Address(arr, lengthTail, this.stride, -this.vectorSize.getBytes()));
        // stage 1: test against the 0xff80 mask
        this.emitPTestLoop(crb, asm, arr, len, vecArray, vecMaskAscii, latin1Entry);
        this.emitPTestTail(asm, this.vectorSize, arr, lengthTail, vecArray, vecMaskAscii, null, latin1Tail, returnAscii, false);
        // stage 2: test against the 0xff00 mask
        asm.bind(latin1Entry);
        this.emitPTestLoop(crb, asm, arr, len, vecArray, vecMaskLatin, bmpLoop);
        this.emitPTestTail(asm, this.vectorSize, arr, lengthTail, vecArray, vecMaskLatin, latin1Tail, bmpTail, returnLatin1, false);
        // stage 3: look for surrogates
        AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
        asm.bind(bmpLoop);
        asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, len, this.stride));
        this.utf16FindSurrogatesAndTest(asm, vecArray, vecArray, vecMaskSurrogate);
        asm.jccb(AMD64Assembler.ConditionFlag.NotZero, this.assumeValid ? labelSurrogateLoop : labelSurrogateEntry);
        asm.addqAndJcc(len, this.vectorLength, AMD64Assembler.ConditionFlag.NotZero, bmpLoop, true);
        asm.bind(bmpTail);
        this.utf16FindSurrogatesAndTest(asm, vecArrayTail, vecArrayTail, vecMaskSurrogate);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, returnBMP);
        if (this.assumeValid) {
            // surrogates found in the tail vector only: subtract the matches and finish
            this.utf16SubtractMatchedChars(asm, ret, vecArrayTail, tmp);
            asm.jmp(returnValidOrBroken);
            // stage 4 (no validation): count chars with the high surrogate prefix
            AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
            asm.bind(labelSurrogateLoop);
            asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, len, this.stride));
            this.utf16MatchSurrogates(asm, vecArray, vecMaskSurrogate);
            this.utf16SubtractMatchedChars(asm, ret, vecArray, tmp);
            asm.addqAndJcc(len, this.vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelSurrogateLoop, true);
            asm.testlAndJcc(lengthTail, lengthTail, AMD64Assembler.ConditionFlag.Zero, returnValidOrBroken, false);
            // mask the tail vector with the tail mask entry selected by lengthTail (presumably to
            // drop the chars the loop has already counted) before matching
            asm.leaq(tmp, (AMD64Address)crb.recordDataSectionReference(maskTail));
            asm.pandU(this.vectorSize, vecArrayTail, new AMD64Address(tmp, lengthTail, this.stride), vecTmp);
            this.utf16MatchSurrogates(asm, vecArrayTail, vecMaskSurrogate);
            this.utf16SubtractMatchedChars(asm, ret, vecArrayTail, tmp);
            asm.jmp(returnValidOrBroken);
        } else {
            // stage 4 (with validation)
            asm.bind(labelSurrogateEntry);
            // ret still holds the original length here; an input of exactly one vector is handled
            // by a dedicated path
            asm.cmplAndJcc(ret, this.vectorLength, AMD64Assembler.ConditionFlag.Equal, surrogateExactlyVectorSize, false);
            // the char at the current position has no predecessor in the pairwise check below:
            // if it is a low surrogate (c >> 10 == 55 == 0xdc00 >> 10), mark the string with code 6
            asm.movzwl(tmp, new AMD64Address(arr, len, this.stride));
            asm.shrl(tmp, 10);
            asm.cmpl(tmp, 55);
            asm.movl(tmp, 6);
            asm.cmovl(AMD64Assembler.ConditionFlag.Equal, retBroken, tmp);
            // re-purpose the masks: vecMaskSurrogate 27 << 1 == 54 (high surrogate prefix),
            // vecMaskAscii (0xff80 >> 15) | 54 == 55 (low surrogate prefix)
            asm.psllw(this.vectorSize, vecMaskSurrogate, 1);
            asm.psrlw(this.vectorSize, vecMaskAscii, 15);
            asm.por(this.vectorSize, vecMaskAscii, vecMaskSurrogate);
            Label labelSurrogateCheckLoopCountZero = new Label();
            asm.testlAndJcc(lengthTail, lengthTail, AMD64Assembler.ConditionFlag.NotZero, labelSurrogateCheckLoopCountZero, true);
            // no tail: step back by one vector so that the tail code below processes the last
            // full vector instead of the loop
            asm.subq(arr, this.vectorSize.getBytes());
            asm.addq(lengthTail, this.vectorLength);
            asm.addq(len, this.vectorLength);
            asm.bind(labelSurrogateCheckLoopCountZero);
            asm.testqAndJcc(len, len, AMD64Assembler.ConditionFlag.Zero, labelSurrogateTail, true);
            AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
            asm.bind(labelSurrogateLoop);
            // load the current vector and the same vector shifted by one char (2 bytes), so that
            // every char can be checked together with its successor
            asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, len, this.stride));
            asm.movdqu(this.vectorSize, vecArrayTail, new AMD64Address(arr, len, this.stride, 2));
            this.utf16ValidateSurrogates(asm, ret, vecArray, vecArrayTail, vecMaskSurrogate, vecMaskAscii, vecTmp, vecResult, tmp);
            asm.addqAndJcc(len, this.vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelSurrogateLoop, true);
            asm.bind(labelSurrogateTail);
            asm.leaq(tmp, (AMD64Address)crb.recordDataSectionReference(maskTail));
            // tail: the last full vector and the same vector starting one char earlier, both
            // masked with the tail mask entry at an offset of one char
            asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, lengthTail, this.stride, -(this.vectorSize.getBytes() + 2)));
            asm.movdqu(this.vectorSize, vecArrayTail, new AMD64Address(arr, lengthTail, this.stride, -this.vectorSize.getBytes()));
            asm.pandU(this.vectorSize, vecArray, new AMD64Address(tmp, lengthTail, this.stride, -2), vecTmp);
            asm.pandU(this.vectorSize, vecArrayTail, new AMD64Address(tmp, lengthTail, this.stride, -2), vecTmp);
            this.utf16ValidateSurrogates(asm, ret, vecArray, vecArrayTail, vecMaskSurrogate, vecMaskAscii, vecTmp, vecResult, tmp);
            // the very last char has no successor: if it is a high surrogate
            // (c >> 10 == 54 == 0xd800 >> 10), mark the string with code 6
            asm.movzwl(tmp, new AMD64Address(arr, lengthTail, this.stride, -2));
            asm.shrl(tmp, 10);
            asm.cmpl(tmp, 54);
            asm.movl(tmp, 6);
            asm.cmovl(AMD64Assembler.ConditionFlag.Equal, retBroken, tmp);
            asm.jmp(returnValidOrBroken);
        }
        // inputs shorter than one vector: load everything into a single vector register. The
        // order-preserving loads are used when validating, since surrogate pairing depends on the
        // order of the chars; otherwise the cheaper unordered variants are sufficient.
        if (this.supportsAVX2AndYMM()) {
            asm.bind(tailLessThan32);
            asm.cmplAndJcc(lengthTail, this.elementsPerVector(AVXKind.AVXSize.XMM), AMD64Assembler.ConditionFlag.Less, tailLessThan16, true);
            if (this.assumeValid) {
                this.loadLessThan32IntoYMMUnordered(crb, asm, maskTail, arr, lengthTail, tmp, vecArray, vecTmp, vecArrayTail);
            } else {
                this.loadLessThan32IntoYMMOrdered(crb, asm, xmmTailShuffleMask, arr, lengthTail, tmp, vecArray, vecTmp, vecArrayTail);
            }
            asm.jmpb(tailSingleVector);
        }
        asm.bind(tailLessThan16);
        asm.cmplAndJcc(lengthTail, this.elementsPerVector(AVXKind.AVXSize.QWORD), AMD64Assembler.ConditionFlag.Less, tailLessThan8, true);
        if (this.assumeValid) {
            this.loadLessThan16IntoXMMUnordered(crb, asm, maskTail, arr, lengthTail, tmp, vecArray, vecTmp, vecArrayTail);
        } else {
            this.loadLessThan16IntoXMMOrdered(crb, asm, arr, lengthTail, tmp, vecArray, vecTmp, vecArrayTail);
        }
        asm.jmpb(tailSingleVector);
        asm.bind(tailLessThan8);
        asm.cmplAndJcc(lengthTail, this.elementsPerVector(AVXKind.AVXSize.DWORD), AMD64Assembler.ConditionFlag.Less, tailLessThan4, true);
        // len is passed as the second scratch register on this path
        if (this.assumeValid) {
            this.loadLessThan8IntoXMMUnordered(crb, asm, maskTail, arr, lengthTail, vecArray, tmp, len);
        } else {
            this.loadLessThan8IntoXMMOrdered(asm, arr, lengthTail, vecArray, tmp, len);
        }
        asm.jmpb(tailSingleVector);
        asm.bind(tailLessThan4);
        // empty input takes the returnAscii exit; otherwise load a single char
        asm.testlAndJcc(lengthTail, lengthTail, AMD64Assembler.ConditionFlag.Zero, returnAscii, false);
        asm.movzwq(tmp, new AMD64Address(arr));
        asm.movdq(vecArray, tmp);
        // the whole input is in vecArray: run all stages on this single vector
        asm.bind(tailSingleVector);
        asm.ptest(this.vectorSize, vecArray, vecMaskAscii);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, returnAscii);
        asm.ptest(this.vectorSize, vecArray, vecMaskLatin);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, returnLatin1);
        this.utf16FindSurrogatesAndTest(asm, vecTmp, vecArray, vecMaskSurrogate);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, returnBMP);
        if (this.assumeValid) {
            this.utf16SubtractMatchedChars(asm, ret, vecTmp, tmp);
            asm.jmp(returnValidOrBroken);
        } else {
            asm.jmpb(tailSingleVectorSurrogate);
            // input of exactly one vector (see the comparison at labelSurrogateEntry): reload it
            // from arr - vectorSize bytes and check the char at arr - 2 for a high surrogate prefix
            asm.bind(surrogateExactlyVectorSize);
            asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, -this.vectorSize.getBytes()));
            asm.movzwl(tmp, new AMD64Address(arr, -2));
            asm.shrl(tmp, 10);
            asm.cmpl(tmp, 54);
            asm.movl(tmp, 6);
            asm.cmovl(AMD64Assembler.ConditionFlag.Equal, retBroken, tmp);
            asm.bind(tailSingleVectorSurrogate);
            // same mask re-purposing as above: vecMaskSurrogate -> 54, vecMaskAscii -> 55
            asm.psllw(this.vectorSize, vecMaskSurrogate, 1);
            asm.psrlw(this.vectorSize, vecMaskAscii, 15);
            asm.por(this.vectorSize, vecMaskAscii, vecMaskSurrogate);
            // vecArrayTail = vecArray moved by one char (2 bytes) with a zeroed predecessor
            // vector, so that every char is compared with its neighbour
            asm.pxor(this.vectorSize, vecTmp, vecTmp);
            AMD64CalcStringAttributesOp.prev(asm, this.vectorSize, vecArrayTail, vecArray, vecTmp, 2);
            this.utf16ValidateSurrogates(asm, ret, vecArrayTail, vecArray, vecMaskSurrogate, vecMaskAscii, vecTmp, vecResult, tmp);
            asm.jmpb(returnValidOrBroken);
        }
        // exits for the stages without surrogates (codes 0, 1 and 2)
        AMD64CalcStringAttributesOp.emitExitMultiByte(asm, ret, returnAscii, end, 0);
        AMD64CalcStringAttributesOp.emitExitMultiByte(asm, ret, returnLatin1, end, 1);
        AMD64CalcStringAttributesOp.emitExitMultiByte(asm, ret, returnBMP, end, 2);
        asm.bind(returnValidOrBroken);
        // move the adjusted length into the upper 32 bits, the code goes into the lower bits
        asm.shlq(ret, 32);
        if (this.assumeValid) {
            asm.orq(ret, 5);
        } else {
            // any bit set in vecResult means a surrogate without a matching partner was seen
            asm.ptest(this.vectorSize, vecResult);
            asm.movl(tmp, 6);
            asm.cmovl(AMD64Assembler.ConditionFlag.NotZero, retBroken, tmp);
            asm.orq(ret, retBroken);
        }
        asm.bind(end);
    }

    /**
     * Emits the pairwise surrogate check for two vectors of 16-bit chars.
     *
     * <p>{@code vecArray0} is matched against {@code vecMaskHiSurrogate} and {@code vecArray1}
     * against {@code vecMaskLoSurrogate} (both after a right shift by 10, see
     * {@link #utf16MatchSurrogates}). Lanes where both match are counted and half of the resulting
     * mask bit count is subtracted from {@code ret}. Lanes where exactly one of them matches are
     * accumulated into {@code vecResult} via OR.
     *
     * <p>Callers in this file pass the same input at two positions one char apart, so that a lane
     * pairs a char with its successor.
     *
     * @param asm assembler to emit into
     * @param ret running counter that is decremented per matched pair
     * @param vecArray0 first input vector; overwritten
     * @param vecArray1 second input vector; overwritten
     * @param vecMaskHiSurrogate value every shifted lane of {@code vecArray0} is compared against
     * @param vecMaskLoSurrogate value every shifted lane of {@code vecArray1} is compared against
     * @param vecTmp vector scratch register; receives the lanes where both matched
     * @param vecResult accumulator for lanes where only one of the two matched
     * @param tmp general purpose scratch register
     */
    private void utf16ValidateSurrogates(AMD64MacroAssembler asm, Register ret, Register vecArray0, Register vecArray1, Register vecMaskHiSurrogate, Register vecMaskLoSurrogate, Register vecTmp, Register vecResult, Register tmp) {
        final AVXKind.AVXSize size = this.vectorSize;
        this.utf16MatchSurrogates(asm, vecArray0, vecMaskHiSurrogate);
        this.utf16MatchSurrogates(asm, vecArray1, vecMaskLoSurrogate);
        // AND: lanes where both matched; XOR: lanes where exactly one matched
        asm.pand(size, vecTmp, vecArray0, vecArray1);
        asm.pxor(size, vecArray0, vecArray1);
        asm.pmovmsk(size, tmp, vecTmp);
        asm.popcntl(tmp, tmp);
        asm.por(size, vecResult, vecArray0);
        // halve the count, as every 16-bit lane contributes two mask bits
        asm.shrl(tmp, 1);
        asm.subq(ret, tmp);
    }

    /**
     * Emits code that shifts every 16-bit lane of {@code vecArray} right by 10 bits and compares the
     * result for equality with the corresponding lane of {@code vecMaskSurrogate}.
     *
     * @param asm assembler to emit into
     * @param vecArray input vector; overwritten with the comparison result
     * @param vecMaskSurrogate vector holding the expected value of the upper 6 bits
     */
    private void utf16MatchSurrogates(AMD64MacroAssembler asm, Register vecArray, Register vecMaskSurrogate) {
        final AVXKind.AVXSize size = this.vectorSize;
        final int prefixShift = 10;
        asm.psrlw(size, vecArray, prefixShift);
        asm.pcmpeqw(size, vecArray, vecMaskSurrogate);
    }

    /**
     * Emits code that shifts every 16-bit lane of {@code vecArray} right (by 10 bits when
     * {@code assumeValid} is set, by 11 bits otherwise), compares the result with
     * {@code vecMaskSurrogate} and finally tests the comparison result against itself, so that the
     * zero flag is clear if at least one lane matched.
     *
     * @param asm assembler to emit into
     * @param vecDst destination vector; receives the comparison result
     * @param vecArray input vector (may be the same register as {@code vecDst})
     * @param vecMaskSurrogate vector holding the expected value of the shifted lanes
     */
    private void utf16FindSurrogatesAndTest(AMD64MacroAssembler asm, Register vecDst, Register vecArray, Register vecMaskSurrogate) {
        final AVXKind.AVXSize size = this.vectorSize;
        final int prefixShift;
        if (this.assumeValid) {
            prefixShift = 10;
        } else {
            prefixShift = 11;
        }
        asm.psrlw(size, vecDst, vecArray, prefixShift);
        asm.pcmpeqw(size, vecDst, vecMaskSurrogate);
        asm.ptest(size, vecDst, vecDst);
    }

    /**
     * Emits code that extracts the lane mask of {@code vecArrayTail}, counts its set bits, halves
     * the count and subtracts the result from {@code ret}.
     *
     * <p>The halving presumably accounts for every matching 16-bit lane contributing two mask bits.
     *
     * @param asm assembler to emit into
     * @param ret running counter that is decremented
     * @param vecArrayTail vector holding a comparison result
     * @param tmp general purpose scratch register
     */
    private void utf16SubtractMatchedChars(AMD64MacroAssembler asm, Register ret, Register vecArrayTail, Register tmp) {
        final AVXKind.AVXSize size = this.vectorSize;
        // mask bits -> bit count -> number of matching chars
        asm.pmovmsk(size, tmp, vecArrayTail);
        asm.popcntl(tmp, tmp);
        asm.shrl(tmp, 1);
        asm.subq(ret, tmp);
    }

    /**
     * Emits the attribute calculation for 4-byte strides (the stride is asserted to have
     * {@code log2 == 2}).
     *
     * <p>The emitted code escalates through several stages, each testing the input against a
     * stricter mask: values that fit the 0xffffff80 mask test ({@code returnAscii}, exit code 0),
     * values that fit the 0xffffff00 mask test ({@code returnLatin1}, exit code 1), values that fit
     * the 0xffff0000 mask test ({@code returnBMP}, exit code 2) and everything else
     * ({@code returnAstral}, exit code 3). Starting with the third stage, every vector is also
     * checked via {@link #utf32CheckInvalid}, which branches to {@code returnBroken} (exit code 4).
     * The exit codes 0 to 4 correspond to the {@code CR_*} constants declared at the top of the
     * class.
     *
     * <p>NOTE(review): {@code vectorLoopPrologue}, {@code emitPTestLoop}, {@code emitPTestTail},
     * {@code emitExit} and {@code emitExitAtEnd} are defined outside of this chunk. The comments
     * below assume that after the prologue {@code len} is a negative element index that counts up
     * to zero and that {@code lengthTail} holds the number of elements left over after the last
     * full vector.
     *
     * @param crb compilation result builder, used for data references and alignment
     * @param asm assembler to emit into
     * @param arr register holding the array address
     * @param len register holding the length in elements
     * @param lengthTail register holding the tail length
     * @param ret result register
     * @param vecArray vector register receiving the loaded input
     */
    private void emitUTF32(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register arr, Register len, Register lengthTail, Register ret, Register vecArray) {
        assert (this.stride.log2 == 2);
        Register vecMaskAscii = ValueUtil.asRegister((Value)this.vectorTemp[1]);
        Register vecMaskLatin1 = ValueUtil.asRegister((Value)this.vectorTemp[2]);
        Register vecMaskBMP = ValueUtil.asRegister((Value)this.vectorTemp[3]);
        Register vecMaskSurrogate = ValueUtil.asRegister((Value)this.vectorTemp[4]);
        Register vecMaskOutOfRange = ValueUtil.asRegister((Value)this.vectorTemp[5]);
        Register vecArrayTail = ValueUtil.asRegister((Value)this.vectorTemp[6]);
        Register vecArrayTmp = ValueUtil.asRegister((Value)this.vectorTemp[7]);
        Label labelLatin1Entry = new Label();
        Label labelLatin1Tail = new Label();
        Label labelBMPEntry = new Label();
        Label labelBMPLoop = new Label();
        Label labelBMPTail = new Label();
        Label labelAstralLoop = new Label();
        Label labelAstralTail = new Label();
        Label tailLessThan32 = new Label();
        Label tailLessThan16 = new Label();
        Label tailLessThan8 = new Label();
        Label tailSingleVector = new Label();
        Label returnBroken = new Label();
        Label returnAstral = new Label();
        Label returnBMP = new Label();
        Label returnLatin1 = new Label();
        Label returnAscii = new Label();
        Label end = new Label();
        // -128 == 0xffffff80
        this.loadMask(crb, asm, vecMaskAscii, -128);
        // 27 == 0xd800 >> 11, the prefix shared by all surrogates (see utf32CheckInvalid)
        this.loadMask(crb, asm, vecMaskSurrogate, 27);
        // 16 == 0x10: values whose upper 16 bits exceed this are above 0x10ffff
        this.loadMask(crb, asm, vecMaskOutOfRange, 16);
        // 0xffffff80 << 1 == 0xffffff00, 0xffffff80 << 9 == 0xffff0000 per 32-bit lane
        asm.pslld(this.vectorSize, vecMaskLatin1, vecMaskAscii, 1);
        asm.pslld(this.vectorSize, vecMaskBMP, vecMaskAscii, 9);
        this.vectorLoopPrologue(asm, arr, len, lengthTail, tailLessThan32, tailLessThan16, false);
        // pre-load the last full vector of the input
        asm.movdqu(this.vectorSize, vecArrayTail, new AMD64Address(arr, lengthTail, this.stride, -this.vectorSize.getBytes()));
        // stage 1: test against the 0xffffff80 mask
        this.emitPTestLoop(crb, asm, arr, len, vecArray, vecMaskAscii, labelLatin1Entry);
        this.emitPTestTail(asm, this.vectorSize, arr, lengthTail, vecArray, vecMaskAscii, null, labelLatin1Tail, returnAscii, false);
        // stage 2: test against the 0xffffff00 mask
        asm.bind(labelLatin1Entry);
        this.emitPTestLoop(crb, asm, arr, len, vecArray, vecMaskLatin1, labelBMPEntry);
        this.emitPTestTail(asm, this.vectorSize, arr, lengthTail, vecArray, vecMaskLatin1, labelLatin1Tail, labelBMPTail, returnLatin1, false);
        // stage 3: test against the 0xffff0000 mask, with validation
        asm.bind(labelBMPEntry);
        AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
        asm.bind(labelBMPLoop);
        asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, len, this.stride));
        asm.ptest(this.vectorSize, vecArray, vecMaskBMP);
        // a value above 0xffff was found: continue in the astral loop, which re-loads and
        // validates the current vector
        asm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelAstralLoop);
        this.utf32CheckInvalid(asm, vecArray, vecArray, vecArrayTmp, vecMaskSurrogate, vecMaskOutOfRange, returnBroken, true);
        asm.addqAndJcc(len, this.vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelBMPLoop, true);
        asm.bind(labelBMPTail);
        asm.ptest(this.vectorSize, vecArrayTail, vecMaskBMP);
        asm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelAstralTail);
        this.utf32CheckInvalid(asm, vecArrayTail, vecArrayTail, vecArrayTmp, vecMaskSurrogate, vecMaskOutOfRange, returnBroken, true);
        asm.jmpb(returnBMP);
        // these two exits are emitted in the middle of the method, presumably to keep them within
        // reach of the short jumps above
        AMD64CalcStringAttributesOp.emitExit(asm, ret, returnBroken, end, 4, false);
        AMD64CalcStringAttributesOp.emitExit(asm, ret, returnBMP, end, 2, false);
        // stage 4: only validation is left
        AMD64CalcStringAttributesOp.alignLoopHead(crb, asm);
        asm.bind(labelAstralLoop);
        asm.movdqu(this.vectorSize, vecArray, new AMD64Address(arr, len, this.stride));
        this.utf32CheckInvalid(asm, vecArray, vecArray, vecArrayTmp, vecMaskSurrogate, vecMaskOutOfRange, returnBroken, true);
        asm.addqAndJcc(len, this.vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelAstralLoop, true);
        asm.bind(labelAstralTail);
        this.utf32CheckInvalid(asm, vecArrayTail, vecArrayTail, vecArrayTmp, vecMaskSurrogate, vecMaskOutOfRange, returnBroken, true);
        asm.jmp(returnAstral);
        // inputs shorter than one vector: load everything into a single vector register using
        // overlapping loads. Overlapping elements are not masked out here.
        if (this.supportsAVX2AndYMM()) {
            asm.bind(tailLessThan32);
            asm.cmplAndJcc(lengthTail, this.elementsPerVector(AVXKind.AVXSize.XMM), AMD64Assembler.ConditionFlag.Less, tailLessThan16, true);
            asm.movdqu(AVXKind.AVXSize.XMM, vecArray, new AMD64Address(arr));
            asm.movdqu(AVXKind.AVXSize.XMM, vecArrayTail, new AMD64Address(arr, lengthTail, this.stride, -AVXKind.AVXSize.XMM.getBytes()));
            AMD64Assembler.VexRVMIOp.VPERM2I128.emit((AMD64Assembler)asm, this.vectorSize, vecArray, vecArray, vecArrayTail, 2);
            asm.jmpb(tailSingleVector);
        }
        asm.bind(tailLessThan16);
        asm.cmplAndJcc(lengthTail, this.elementsPerVector(AVXKind.AVXSize.QWORD), AMD64Assembler.ConditionFlag.Less, tailLessThan8, true);
        asm.movdq(vecArray, new AMD64Address(arr));
        asm.movdq(vecArrayTail, new AMD64Address(arr, lengthTail, this.stride, -AVXKind.AVXSize.QWORD.getBytes()));
        asm.movlhps(vecArray, vecArrayTail);
        asm.jmpb(tailSingleVector);
        asm.bind(tailLessThan8);
        // empty input takes the returnAscii exit; otherwise load a single 32-bit element
        asm.testlAndJcc(lengthTail, lengthTail, AMD64Assembler.ConditionFlag.Zero, returnAscii, true);
        asm.movdl(vecArray, new AMD64Address(arr));
        // the whole input is in vecArray: run all stages on this single vector
        asm.bind(tailSingleVector);
        asm.ptest(this.vectorSize, vecArray, vecMaskAscii);
        asm.jccb(AMD64Assembler.ConditionFlag.Zero, returnAscii);
        asm.ptest(this.vectorSize, vecArray, vecMaskLatin1);
        asm.jccb(AMD64Assembler.ConditionFlag.Zero, returnLatin1);
        // vecArrayTail serves as the destination here so that vecArray stays intact for the
        // following test
        this.utf32CheckInvalid(asm, vecArrayTail, vecArray, vecArrayTmp, vecMaskSurrogate, vecMaskOutOfRange, returnBroken, false);
        asm.ptest(this.vectorSize, vecArray, vecMaskBMP);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, returnBMP);
        // falls through into the returnAstral exit when the test above is non-zero
        AMD64CalcStringAttributesOp.emitExit(asm, ret, returnAstral, end, 3);
        AMD64CalcStringAttributesOp.emitExit(asm, ret, returnLatin1, end, 1);
        AMD64CalcStringAttributesOp.emitExitAtEnd(asm, ret, returnAscii, end, 0);
    }

    /**
     * Emits a validity check for a vector of 32-bit values and a branch to {@code returnBroken} if
     * any lane fails it.
     *
     * <p>A lane fails if its value shifted right by 16 is greater than the corresponding lane of
     * {@code vecMaskOutOfRange}, or if its value shifted right by 11 is equal to the corresponding
     * lane of {@code vecMaskBroken}.
     *
     * @param asm assembler to emit into
     * @param vecArrayDst vector scratch register; receives the combined check result (may be the
     *            same register as {@code vecArraySrc})
     * @param vecArraySrc input vector
     * @param vecArrayTmp vector scratch register; receives the range check result
     * @param vecMaskBroken value compared for equality after shifting by 11
     * @param vecMaskOutOfRange value compared via greater-than after shifting by 16
     * @param returnBroken branch target taken if any lane fails
     * @param isShortJmp forwarded to {@code jcc} to select the short jump encoding
     */
    private void utf32CheckInvalid(AMD64MacroAssembler asm, Register vecArrayDst, Register vecArraySrc, Register vecArrayTmp, Register vecMaskBroken, Register vecMaskOutOfRange, Label returnBroken, boolean isShortJmp) {
        final AVXKind.AVXSize size = this.vectorSize;
        // both shifts read the source before vecArrayDst (possibly the same register) is compared
        asm.psrld(size, vecArrayTmp, vecArraySrc, 16);
        asm.psrld(size, vecArrayDst, vecArraySrc, 11);
        asm.pcmpgtd(size, vecArrayTmp, vecMaskOutOfRange);
        asm.pcmpeqd(size, vecArrayDst, vecMaskBroken);
        // merge both results; any set bit means at least one invalid lane
        asm.por(size, vecArrayDst, vecArrayTmp);
        asm.ptest(size, vecArrayDst, vecArrayDst);
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, returnBroken, isShortJmp);
    }

    /**
     * Requests code alignment ahead of a loop head.
     *
     * <p>The alignment passed to the assembler is twice the target's word size.
     *
     * @param crb compilation result builder supplying the target description
     * @param asm assembler whose current position is aligned
     */
    private static void alignLoopHead(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        final int loopHeadAlignment = 2 * crb.target.wordSize;
        asm.align(loopHeadAlignment);
    }

    /**
     * Emits a load of a constant vector into {@code vecMask}.
     *
     * <p>The constant is the byte pattern produced by {@link #createMaskBytes(int)} for
     * {@code value}; it is registered as a data reference and loaded with {@code movdqu}.
     *
     * @param crb compilation result builder used to record the data reference
     * @param asm assembler the load is emitted to
     * @param vecMask destination vector register
     * @param value element value replicated across the mask
     */
    private void loadMask(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register vecMask, int value) {
        final byte[] maskBytes = this.createMaskBytes(value);
        asm.movdqu(this.vectorSize, vecMask, AMD64CalcStringAttributesOp.getMaskOnce(crb, maskBytes));
    }

    /**
     * Convenience overload that uses the mask's own length as the alignment request.
     *
     * @param crb compilation result builder used to record the data reference
     * @param mask constant bytes to reference from code
     * @return address of the recorded data reference
     */
    private static AMD64Address getMaskOnce(CompilationResultBuilder crb, byte[] mask) {
        final int alignLength = mask.length;
        return AMD64CalcStringAttributesOp.getMaskOnce(crb, mask, alignLength);
    }

    /**
     * Records {@code mask} as a data reference in the code being built and returns its address.
     *
     * @param crb compilation result builder used to record the data reference
     * @param mask constant bytes to reference from code
     * @param alignLength requested alignment; passed through
     *            {@code crb.dataBuilder.ensureValidDataAlignment} before use
     * @return address of the recorded data reference
     */
    private static AMD64Address getMaskOnce(CompilationResultBuilder crb, byte[] mask, int alignLength) {
        final int validAlignment = crb.dataBuilder.ensureValidDataAlignment(alignLength);
        final AMD64Address maskAddress = (AMD64Address)crb.recordDataReferenceInCode(mask, validAlignment);
        return maskAddress;
    }

    /**
     * Creates a data-section entry holding {@code value} replicated across one vector.
     *
     * @param crb compilation result builder owning the data builder
     * @param value element value replicated by {@link #createMaskBytes(int)}
     * @return the created data-section entry
     */
    private DataSection.Data createMask(CompilationResultBuilder crb, int value) {
        final byte[] maskBytes = this.createMaskBytes(value);
        return AMD64CalcStringAttributesOp.writeToDataSection(crb, maskBytes);
    }

    /**
     * Creates a data-section entry that is two vectors wide.
     *
     * <p>Elements {@code [vectorLength, 2 * vectorLength)} are written as {@code -1} (all bits
     * set); every other byte keeps the zero value of a freshly allocated array.
     *
     * @param crb compilation result builder owning the data builder
     * @return the created data-section entry
     */
    private DataSection.Data createTailMask(CompilationResultBuilder crb) {
        final byte[] tailMask = new byte[2 * this.vectorSize.getBytes()];
        final int firstSetElement = this.vectorLength;
        final int endSetElement = 2 * firstSetElement;
        int element = firstSetElement;
        while (element < endSetElement) {
            AMD64CalcStringAttributesOp.writeValue(tailMask, this.stride, element, -1);
            element++;
        }
        return AMD64CalcStringAttributesOp.writeToDataSection(crb, tailMask);
    }

    /**
     * Builds a byte table of {@code XMM.getBytes() + length} entries.
     *
     * <p>The first {@code length} entries are the identity indices {@code 0, 1, ..., length - 1};
     * the remaining {@code XMM.getBytes()} entries are {@code (byte) -1}.
     *
     * <p>NOTE(review): presumably consumed as a byte-shuffle control loaded at a variable offset -
     * confirm at the use site.
     *
     * @param length number of leading identity entries
     * @return the newly allocated table
     */
    private static byte[] createXMMTailShuffleMask(int length) {
        final int totalBytes = AVXKind.AVXSize.XMM.getBytes() + length;
        final byte[] shuffle = new byte[totalBytes];
        // The trailing region and the identity prefix are disjoint, so fill order does not matter.
        Arrays.fill(shuffle, length, totalBytes, (byte)-1);
        int position = 0;
        while (position < length) {
            shuffle[position] = (byte)position;
            position++;
        }
        return shuffle;
    }

    /**
     * Builds the byte image of one vector in which each of the {@code vectorLength} elements
     * (sized by this op's stride) is set to {@code value}.
     *
     * @param value element value to replicate
     * @return a new array of {@code vectorSize.getBytes()} bytes
     */
    private byte[] createMaskBytes(int value) {
        final byte[] image = new byte[this.vectorSize.getBytes()];
        final int elementCount = this.vectorLength;
        int element = 0;
        while (element < elementCount) {
            AMD64CalcStringAttributesOp.writeValue(image, this.stride, element, value);
            element++;
        }
        return image;
    }

    /**
     * Wraps {@code array} in an {@code ArrayDataPointerConstant} and creates serializable data for
     * it through the data builder.
     *
     * <p>The alignment is {@code ensureValidDataAlignment(array.length)} and is used both for the
     * constant and for the created data.
     *
     * @param crb compilation result builder owning the data builder
     * @param array bytes to place in the data section
     * @return the created data-section entry
     */
    private static DataSection.Data writeToDataSection(CompilationResultBuilder crb, byte[] array) {
        final int validAlignment = crb.dataBuilder.ensureValidDataAlignment(array.length);
        return crb.dataBuilder.createSerializableData(new ArrayDataPointerConstant(array, validAlignment), validAlignment);
    }

    /**
     * Emits an exit block that ends with a short jump to {@code labelDone}.
     *
     * <p>Equivalent to the six-argument overload with {@code isShortJmp == true}.
     *
     * @param asm assembler the block is emitted to
     * @param ret register receiving {@code returnValue}
     * @param entry label bound at the start of the block
     * @param labelDone jump target after the result is set
     * @param returnValue constant result placed in {@code ret}
     */
    private static void emitExit(AMD64MacroAssembler asm, Register ret, Label entry, Label labelDone, int returnValue) {
        final boolean useShortJump = true;
        AMD64CalcStringAttributesOp.emitExit(asm, ret, entry, labelDone, returnValue, useShortJump);
    }

    /**
     * Emits an exit block: binds {@code entry}, places {@code returnValue} in {@code ret} and jumps
     * to {@code labelDone}.
     *
     * @param asm assembler the block is emitted to
     * @param ret register receiving {@code returnValue}
     * @param entry label bound at the start of the block
     * @param labelDone jump target after the result is set
     * @param returnValue constant result; zero is materialized with {@code xorq ret, ret}, any
     *            other value with {@code movl}
     * @param isShortJmp forwarded to {@code asm.jmp} as its short-jump flag
     */
    private static void emitExit(AMD64MacroAssembler asm, Register ret, Label entry, Label labelDone, int returnValue, boolean isShortJmp) {
        asm.bind(entry);
        if (returnValue != 0) {
            asm.movl(ret, returnValue);
        } else {
            // zero result: clear the register instead of loading an immediate
            asm.xorq(ret, ret);
        }
        asm.jmp(labelDone, isShortJmp);
    }

    /**
     * Emits the last exit block: binds {@code entry}, places {@code returnValue} in {@code ret} and
     * then binds {@code end} directly behind it, so no jump is emitted.
     *
     * @param asm assembler the block is emitted to
     * @param ret register receiving {@code returnValue}
     * @param entry label bound at the start of the block
     * @param end label bound immediately after the result is set
     * @param returnValue constant result; zero is materialized with {@code xorq ret, ret}, any
     *            other value with {@code movl}
     */
    private static void emitExitAtEnd(AMD64MacroAssembler asm, Register ret, Label entry, Label end, int returnValue) {
        asm.bind(entry);
        if (returnValue != 0) {
            asm.movl(ret, returnValue);
        } else {
            // zero result: clear the register instead of loading an immediate
            asm.xorq(ret, ret);
        }
        asm.bind(end);
    }

    /**
     * Emits an exit block that packs a constant into the low part of {@code ret}.
     *
     * <p>After binding {@code entry}, {@code ret} is shifted left by 32 bits and
     * {@code returnValue} is OR-ed into it; a short jump to {@code end} follows. Whatever
     * {@code ret} held on entry therefore ends up in its upper half.
     *
     * @param asm assembler the block is emitted to
     * @param ret register combined with {@code returnValue}
     * @param entry label bound at the start of the block
     * @param end target of the trailing short jump
     * @param returnValue constant OR-ed into {@code ret} after the shift
     */
    private static void emitExitMultiByte(AMD64MacroAssembler asm, Register ret, Label entry, Label end, int returnValue) {
        final int upperHalfShift = 32;
        asm.bind(entry);
        asm.shlq(ret, upperHalfShift);
        asm.orq(ret, returnValue);
        asm.jmpb(end);
    }

    /**
     * Emits the last multi-byte exit block.
     *
     * <p>After binding {@code entry}, {@code ret} is shifted left by 32 bits and
     * {@code returnValue} is OR-ed into it; {@code end} is bound directly behind, so no jump is
     * emitted.
     *
     * @param asm assembler the block is emitted to
     * @param ret register combined with {@code returnValue}
     * @param entry label bound at the start of the block
     * @param end label bound immediately after the result is set
     * @param returnValue constant OR-ed into {@code ret} after the shift
     */
    private static void emitExitMultiByteAtEnd(AMD64MacroAssembler asm, Register ret, Label entry, Label end, int returnValue) {
        final int upperHalfShift = 32;
        asm.bind(entry);
        asm.shlq(ret, upperHalfShift);
        asm.orq(ret, returnValue);
        asm.bind(end);
    }

    /**
     * Stores {@code value} into {@code array} as the element at position {@code index}, where
     * elements are {@code 1 << stride.log2} bytes apart.
     *
     * <p>For {@code Stride.S1} one byte is written. For {@code Stride.S2} two bytes are written,
     * and for any other stride four bytes, in the platform's native byte order. Bytes are always
     * stored at ascending array offsets.
     *
     * @param array destination byte array
     * @param stride element stride; determines both the byte offset and the number of bytes written
     * @param index element index (not byte offset)
     * @param value value to store; only its low 8/16/32 bits are used depending on the stride
     */
    private static void writeValue(byte[] array, Stride stride, int index, int value) {
        final int base = index << stride.log2;
        if (stride == Stride.S1) {
            array[base] = (byte)value;
            return;
        }
        final int width = stride == Stride.S2 ? 2 : 4;
        final boolean littleEndian = ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN);
        for (int k = 0; k < width; ++k) {
            // little endian: least significant byte first; big endian: most significant first
            final int significance = littleEndian ? k : width - 1 - k;
            array[base + k] = (byte)(value >> (8 * significance));
        }
    }

    /**
     * Variants of the string-attributes operation.
     *
     * <p>Each constant carries a {@link JavaKind} ({@code Byte}, {@code Char} or {@code Int}).
     * NOTE(review): presumably this is the element kind of the array being scanned - confirm
     * against the code that reads {@code stride}.
     */
    public enum Op {
        LATIN1(JavaKind.Byte),
        BMP(JavaKind.Char),
        UTF_8(JavaKind.Byte),
        UTF_16(JavaKind.Char),
        UTF_32(JavaKind.Int);

        /** Kind supplied for this constant at construction. */
        private final JavaKind stride;

        Op(JavaKind elementKind) {
            this.stride = elementKind;
        }
    }
}

