/*
 * Decompiled with CFR 0.152.
 */
package uk.ac.manchester.tornado.examples.kernelcontext.matrices;

import java.util.Arrays;
import java.util.OptionalDouble;
import java.util.stream.IntStream;
import uk.ac.manchester.tornado.api.GridScheduler;
import uk.ac.manchester.tornado.api.ImmutableTaskGraph;
import uk.ac.manchester.tornado.api.KernelContext;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoBackend;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.WorkerGrid;
import uk.ac.manchester.tornado.api.WorkerGrid2D;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntimeProvider;
import uk.ac.manchester.tornado.api.types.arrays.FloatArray;

/**
 * Benchmark that runs a square single-precision matrix multiplication in five
 * ways and compares their timings and results:
 * <ol>
 * <li>the plain triple-loop method on the device returned by backend 0,</li>
 * <li>the same method on a same-named device of backend 1,</li>
 * <li>the tiled {@link KernelContext} kernel on the backend-1 device,</li>
 * <li>the tiled {@link KernelContext} kernel on the backend-0 device,</li>
 * <li>the plain triple-loop method called directly on the host.</li>
 * </ol>
 * Variable names and console output refer to backend 0 as CUDA/PTX and to
 * backend 1 as OpenCL; NOTE(review): that mapping depends on how the runtime
 * orders its backends - confirm for the installation in use.
 *
 * <p>Usage: {@code MatrixMul2DLocalMemory [N [localX localY]]}. With no
 * arguments N is 512 and the work-group size is {@code TS x TS}.
 */
public class MatrixMul2DLocalMemory {
    /** Number of untimed runs performed before each measurement. */
    public static final int WARMUP_ITERATIONS = 15;
    /** Number of timed runs averaged for each measurement. */
    public static final int EXECUTE_ITERATIONS = 15;
    /** Tile edge length used by the tiled kernel; its scratch arrays hold {@code TS * TS} floats. */
    public static final int TS = 32;
    /** When true, device results are compared against the host result. */
    private static final boolean CHECK_RESULT = true;
    /** Maximum absolute per-element difference accepted by the validation. */
    private static final float DELTA = 0.01f;

    /** A unit of work to be timed; may throw so that any callable can be adapted to it. */
    @FunctionalInterface
    private interface Workload {
        void run() throws Exception;
    }

    /**
     * Multiplies two {@code size x size} matrices stored row-major in flat arrays:
     * {@code C[i][j] = sum_k A[i][k] * B[k][j]}.
     *
     * @param A    left operand, {@code size * size} elements
     * @param B    right operand, {@code size * size} elements
     * @param C    output, {@code size * size} elements, fully overwritten
     * @param size matrix dimension
     */
    private static void matrixMultiplication(FloatArray A, FloatArray B, FloatArray C, int size) {
        for (int i = 0; i < size; ++i) {
            for (int j = 0; j < size; ++j) {
                float sum = 0.0f;
                for (int k = 0; k < size; ++k) {
                    sum += A.get(i * size + k) * B.get(k * size + j);
                }
                C.set(i * size + j, sum);
            }
        }
    }

    /**
     * Tiled matrix multiplication written against the {@link KernelContext} API.
     * Each invocation produces one output element, accumulating over
     * {@code size / TS} tiles that are staged in two {@code TS * TS} arrays
     * obtained from {@code context.allocateFloatLocalArray}.
     *
     * <p>Preconditions visible in the indexing: {@code size} must be a multiple
     * of {@link #TS} (the tile count is an integer division, so a remainder is
     * silently dropped), and the local work size must be {@code TS x TS}
     * (global indices are derived as {@code TS * groupId + localId}).
     *
     * <p>NOTE(review): the element written is
     * {@code C[col * size + row] = sum_k A[k * size + row] * B[col * size + k]},
     * which for row-major data is {@code B x A}, not the {@code A x B} computed
     * by {@link #matrixMultiplication}. The two only agree for inputs where the
     * products coincide, such as the constant-filled matrices used by
     * {@link #main}. Confirm whether a column-major layout is intended.
     *
     * @param context execution context supplying thread/group ids, local arrays and barriers
     * @param A       first input, {@code size * size} elements
     * @param B       second input, {@code size * size} elements
     * @param C       output, {@code size * size} elements
     * @param size    matrix dimension
     */
    public static void matrixMultiplicationLocalMemory(KernelContext context, FloatArray A, FloatArray B, FloatArray C, int size) {
        int row = context.localIdx;
        int col = context.localIdy;
        int globalRow = TS * context.groupIdx + row;
        int globalCol = TS * context.groupIdy + col;
        float[] aSub = context.allocateFloatLocalArray(TS * TS);
        float[] bSub = context.allocateFloatLocalArray(TS * TS);
        float sum = 0.0f;
        int numTiles = size / TS;
        for (int t = 0; t < numTiles; ++t) {
            int tiledRow = TS * t + row;
            int tiledCol = TS * t + col;
            aSub[col * TS + row] = A.get(tiledCol * size + globalRow);
            bSub[col * TS + row] = B.get(globalCol * size + tiledRow);
            // Every element of the tile must be stored before any of them is read.
            context.localBarrier();
            for (int k = 0; k < TS; ++k) {
                sum += aSub[k * TS + row] * bSub[col * TS + k];
            }
            // Nobody may overwrite the tile while others are still reading it.
            context.localBarrier();
        }
        C.set(globalCol * size + globalRow, sum);
    }

    /**
     * Builds the task graph that runs {@link #matrixMultiplication} as task {@code t0}.
     * Transfer modes are the raw ints 0 (inputs) and 1 (output) from the original;
     * presumably DataTransferMode.FIRST_EXECUTION / EVERY_EXECUTION - TODO confirm.
     */
    private static TaskGraph plainGraph(String graphName, FloatArray a, FloatArray b, FloatArray c, int n) {
        return new TaskGraph(graphName)
                .transferToDevice(0, a, b)
                .task("t0", MatrixMul2DLocalMemory::matrixMultiplication, a, b, c, n)
                .transferToHost(1, c);
    }

    /**
     * Builds the task graph that runs {@link #matrixMultiplicationLocalMemory} as task
     * {@code t0}, with a fresh {@link KernelContext}. Transfer modes as in {@link #plainGraph}.
     */
    private static TaskGraph tiledGraph(String graphName, FloatArray a, FloatArray b, FloatArray c, int n) {
        return new TaskGraph(graphName)
                .transferToDevice(0, a, b)
                .task("t0", MatrixMul2DLocalMemory::matrixMultiplicationLocalMemory, new KernelContext(), a, b, c, n)
                .transferToHost(1, c);
    }

    /**
     * Snapshots {@code graph} into an execution plan bound to {@code device}, with an
     * {@code n x n} global range and a {@code localX x localY} local range for task
     * {@code <graphName>.t0}.
     */
    private static TornadoExecutionPlan planFor(TaskGraph graph, String graphName, TornadoDevice device, int n, long localX, long localY) {
        WorkerGrid2D worker = new WorkerGrid2D(n, n);
        worker.setGlobalWork(n, n, 1);
        worker.setLocalWork(localX, localY, 1);
        GridScheduler scheduler = new GridScheduler(graphName + ".t0", worker);
        TornadoExecutionPlan plan = new TornadoExecutionPlan(graph.snapshot());
        // As in the original, the configured plan is the one later executed;
        // the value returned by the with* chain is not retained.
        plan.withGridScheduler(scheduler).withDevice(device);
        return plan;
    }

    /**
     * Runs {@code work} {@link #WARMUP_ITERATIONS} times untimed, then
     * {@link #EXECUTE_ITERATIONS} times timed, and returns the mean duration of a
     * timed run in milliseconds. Uses {@code System.nanoTime()} so that runs
     * shorter than a millisecond do not average to zero.
     */
    private static double averageMillis(Workload work) throws Exception {
        for (int i = 0; i < WARMUP_ITERATIONS; i++) {
            work.run();
        }
        long totalNanos = 0L;
        for (int i = 0; i < EXECUTE_ITERATIONS; i++) {
            long start = System.nanoTime();
            work.run();
            totalNanos += System.nanoTime() - start;
        }
        return totalNanos / 1.0e6 / EXECUTE_ITERATIONS;
    }

    /**
     * Returns the device of {@code backend} whose physical device name equals
     * {@code deviceName} ignoring case, or {@code null} if there is none. When
     * several devices match, the last one wins (same as the original loop).
     */
    private static TornadoDevice findDeviceByName(TornadoBackend backend, String deviceName) {
        TornadoDevice match = null;
        for (int i = 0; i < backend.getNumDevices(); i++) {
            TornadoDevice candidate = backend.getDevice(i);
            if (candidate.getPhysicalDevice().getDeviceName().equalsIgnoreCase(deviceName)) {
                match = candidate;
            }
        }
        return match;
    }

    /** True when {@code actual} is within {@link #DELTA} of {@code expected}; NaN never matches. */
    private static boolean withinDelta(float actual, float expected) {
        // Written as a negated <= so that a NaN difference counts as a mismatch.
        return Math.abs(actual - expected) <= DELTA;
    }

    /** Converts an operation count and a mean time in milliseconds into GFLOP/s. */
    private static double gigaFlops(double flops, double averageMillis) {
        return 1.0E-9 * flops / (averageMillis / 1000.0);
    }

    /**
     * Entry point: runs all five variants, validates the device results against
     * the host result and prints throughput and speedup figures.
     *
     * @param args optional {@code N}, or {@code N localX localY}; any other
     *             argument count is ignored and the defaults are used
     * @throws Exception if any timed workload throws
     */
    public static void main(String[] args) throws Exception {
        final boolean sizeGiven = args.length == 1 || args.length == 3;
        final int n = sizeGiven ? Integer.parseInt(args[0]) : 512;
        final long localX = args.length == 3 ? Long.parseLong(args[1]) : TS;
        final long localY = args.length == 3 ? Long.parseLong(args[2]) : TS;
        if (n % TS != 0 || localX != TS || localY != TS) {
            // Not fatal (the plain kernel has no such requirement), but the tiled
            // kernel's indexing is only valid under these conditions.
            System.err.println("Warning: the tiled kernel assumes N is a multiple of " + TS + " and a " + TS + "x" + TS
                    + " local work size; got N=" + n + ", local=" + localX + "x" + localY);
        }

        FloatArray matrixA = new FloatArray(n * n);
        FloatArray matrixB = new FloatArray(n * n);
        FloatArray matrixCSeq = new FloatArray(n * n);
        FloatArray matrixCCUDA = new FloatArray(n * n);
        FloatArray matrixCOCL = new FloatArray(n * n);
        FloatArray matrixCOCLNewApi = new FloatArray(n * n);
        FloatArray matrixCCUDANewApi = new FloatArray(n * n);
        IntStream.range(0, n * n).parallel().forEach(idx -> {
            matrixA.set(idx, 2.5f);
            matrixB.set(idx, 3.5f);
        });

        // 1. Plain kernel on the first device of backend 0.
        TornadoBackend cudaBackend = TornadoRuntimeProvider.getTornadoRuntime().getBackend(0);
        TornadoDevice cudaDevice = cudaBackend.getDevice(0);
        TornadoExecutionPlan cudaPlan = planFor(plainGraph("cuda_old_api", matrixA, matrixB, matrixCCUDA, n),
                "cuda_old_api", cudaDevice, n, localX, localY);
        double averageCUDA = averageMillis(cudaPlan::execute);

        // 2. Plain kernel on the backend-1 device with the same physical name.
        TornadoBackend oclBackend = TornadoRuntimeProvider.getTornadoRuntime().getBackend(1);
        TornadoDevice oclDevice = findDeviceByName(oclBackend, cudaDevice.getPhysicalDevice().getDeviceName());
        if (oclDevice == null) {
            System.err.println("There is no device with both OpenCL and CUDA-PTX support");
            System.exit(1);
        }
        TornadoExecutionPlan oclPlan = planFor(plainGraph("ocl_old_api", matrixA, matrixB, matrixCOCL, n),
                "ocl_old_api", oclDevice, n, localX, localY);
        // Fix: the original timed the backend-0 plan here, so the figures
        // reported as "OpenCL" were a second measurement of the other plan.
        double averageOpenCL = averageMillis(oclPlan::execute);

        // 3. Tiled kernel on the backend-1 device.
        TornadoExecutionPlan oclTiledPlan = planFor(tiledGraph("ocl_advanced_api", matrixA, matrixB, matrixCOCLNewApi, n),
                "ocl_advanced_api", oclDevice, n, localX, localY);
        double averageOpenCLNewApi = averageMillis(oclTiledPlan::execute);

        // 4. Tiled kernel on the backend-0 device.
        TornadoExecutionPlan cudaTiledPlan = planFor(tiledGraph("cuda_advanced_api", matrixA, matrixB, matrixCCUDANewApi, n),
                "cuda_advanced_api", cudaDevice, n, localX, localY);
        double averageCUDANewApi = averageMillis(cudaTiledPlan::execute);

        // 5. Host baseline.
        double averageSeq = averageMillis(() -> matrixMultiplication(matrixA, matrixB, matrixCSeq, n));

        if (CHECK_RESULT) {
            boolean correctResult = true;
            for (int i = 0; i < n * n; i++) {
                float expected = matrixCSeq.get(i);
                boolean cudaOk = withinDelta(matrixCCUDA.get(i), expected);
                boolean oclOk = withinDelta(matrixCOCL.get(i), expected);
                boolean oclTiledOk = withinDelta(matrixCOCLNewApi.get(i), expected);
                boolean cudaTiledOk = withinDelta(matrixCCUDANewApi.get(i), expected);
                if (!cudaOk) {
                    System.out.println("CUDA validation failed");
                }
                if (!oclOk) {
                    System.out.println("OpenCL validation failed");
                }
                if (!oclTiledOk) {
                    System.out.println("OpenCL new api validation failed");
                    System.out.println("Result is (" + matrixCOCLNewApi.get(i) + ") - while should be (" + expected + ")");
                }
                if (!cudaTiledOk) {
                    System.out.println("CUDA new api validation failed");
                    System.out.println("Result is (" + matrixCCUDANewApi.get(i) + ") - while should be (" + expected + ")");
                }
                // Stop at the first bad index so a wrong kernel does not print n*n lines.
                if (!(cudaOk && oclOk && oclTiledOk && cudaTiledOk)) {
                    correctResult = false;
                    break;
                }
            }
            if (correctResult) {
                System.out.println("[RESULT] correct");
            } else {
                System.out.println("[RESULT] wrong");
            }
        }

        // One multiply and one add per inner-loop step: 2 * n^3 operations.
        double flops = 2.0 * Math.pow(n, 3.0);
        String formatCUDAFGlops = String.format("%.2f", gigaFlops(flops, averageCUDA));
        String formatOpenCLFGlops = String.format("%.2f", gigaFlops(flops, averageOpenCL));
        String formatOpenCLNewApiFGlops = String.format("%.2f", gigaFlops(flops, averageOpenCLNewApi));
        String formatCUDANewApiFGlops = String.format("%.2f", gigaFlops(flops, averageCUDANewApi));
        double CUDAspeedup = averageSeq / averageCUDA;
        double OpenCLspeedup = averageSeq / averageOpenCL;
        double OpenCLNewApispeedup = averageSeq / averageOpenCLNewApi;
        double CUDANewApispeedup = averageSeq / averageCUDANewApi;
        System.out.println("\tOpenCL Execution: " + formatOpenCLFGlops + " GFlops, Total time = " + averageOpenCL + " ms");
        System.out.println("\tOpenCL Execution with Local Memory and Loop Tiling: " + formatOpenCLNewApiFGlops + " GFlops, Total time = " + averageOpenCLNewApi + " ms");
        System.out.println("\tPTX Execution: " + formatCUDAFGlops + " GFlops, Total Time = " + averageCUDA + " ms");
        System.out.println("\tPTX Execution with Local Memory and Loop Tiling: " + formatCUDANewApiFGlops + " GFlops, Total time = " + averageCUDANewApi + " ms");
        System.out.println("\tOpenCL Speedup: " + OpenCLspeedup + "x");
        System.out.println("\tOpenCL Speedup with Local Memory and Loop Tiling: " + OpenCLNewApispeedup + "x");
        System.out.println("\tPTX Speedup: " + CUDAspeedup + "x");
        System.out.println("\tPTX Speedup with Local Memory and Loop Tiling: " + CUDANewApispeedup + "x");
        System.out.println();
    }
}

