/*
 * Decompiled with CFR 0.152.
 */
package uk.ac.manchester.tornado.examples.matrices;

import java.util.Arrays;
import java.util.OptionalDouble;
import java.util.stream.IntStream;
import uk.ac.manchester.tornado.api.ImmutableTaskGraph;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoBackend;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntimeProvider;
import uk.ac.manchester.tornado.api.types.arrays.FloatArray;

/**
 * Benchmark that multiplies two square matrices stored as flat, row-major
 * {@link FloatArray}s and compares three executions of the same kernel:
 * a TornadoVM plan bound to device 0 of backend 0, a TornadoVM plan bound to
 * the backend-1 device with the same physical device name, and a plain
 * sequential Java run.
 *
 * <p>The variable names and the error message in this file refer to backend 0
 * as "CUDA/PTX" and to backend 1 as "OpenCL"; which backend actually sits at
 * each index depends on the runtime installation (NOTE(review): confirm
 * against the TornadoVM build in use).
 *
 * <p>The sequential result is only used for timing; results of the three
 * runs are not compared against each other.
 */
public class MatrixMul1D {
    /** Number of untimed executions performed before measuring. */
    public static final int WARMUP_ITERATIONS = 15;
    /** Number of timed executions whose durations are averaged. */
    public static final int EXECUTE_ITERATIONS = 100;

    /** Matrix dimension used when no command-line argument is given. */
    private static final int DEFAULT_SIZE = 512;

    // Raw transfer-mode codes as they appear in the decompiled call sites.
    // NOTE(review): presumably the inlined values of TornadoVM's
    // DataTransferMode.FIRST_EXECUTION (0) and EVERY_EXECUTION (1) -- confirm.
    private static final int TRANSFER_MODE_TO_DEVICE = 0;
    private static final int TRANSFER_MODE_TO_HOST = 1;

    /** A unit of work to be timed; allowed to throw so any workload can be wrapped. */
    @FunctionalInterface
    private interface Workload {
        void run() throws Exception;
    }

    /**
     * Computes {@code C = A x B} for {@code size x size} matrices stored in
     * row-major order in one-dimensional arrays.
     *
     * @param A    left operand, read at index {@code i * size + k}
     * @param B    right operand, read at index {@code k * size + j}
     * @param C    output, written at index {@code i * size + j}
     * @param size number of rows and columns of each matrix
     */
    private static void matrixMultiplication(FloatArray A, FloatArray B, FloatArray C, int size) {
        for (int i = 0; i < size; ++i) {
            for (int j = 0; j < size; ++j) {
                float sum = 0.0f;
                for (int k = 0; k < size; ++k) {
                    sum += A.get(i * size + k) * B.get(k * size + j);
                }
                C.set(i * size + j, sum);
            }
        }
    }

    /**
     * Runs {@code workload} {@link #WARMUP_ITERATIONS} times untimed, then
     * {@link #EXECUTE_ITERATIONS} times timed, and returns the mean duration.
     *
     * <p>Uses {@link System#nanoTime()} so that sub-millisecond executions do
     * not collapse to an average of zero.
     *
     * @param workload the work to measure
     * @return mean wall-clock time of one timed execution, in milliseconds
     * @throws Exception whatever {@code workload} throws
     */
    private static double averageMillis(Workload workload) throws Exception {
        for (int i = 0; i < WARMUP_ITERATIONS; ++i) {
            workload.run();
        }
        long totalNanos = 0L;
        for (int i = 0; i < EXECUTE_ITERATIONS; ++i) {
            long start = System.nanoTime();
            workload.run();
            totalNanos += System.nanoTime() - start;
        }
        return totalNanos / (double) EXECUTE_ITERATIONS / 1_000_000.0;
    }

    /**
     * Builds a single-task execution plan that transfers {@code a} and
     * {@code b} to the device, runs {@link #matrixMultiplication} as task
     * {@code "t0"}, transfers {@code c} back, and is bound to {@code device}.
     *
     * @param graphName name given to the task graph
     * @param a         left input matrix
     * @param b         right input matrix
     * @param c         output matrix
     * @param size      matrix dimension passed to the kernel
     * @param device    device the plan is bound to via {@code withDevice}
     * @return the configured execution plan
     */
    private static TornadoExecutionPlan buildPlan(String graphName, FloatArray a, FloatArray b, FloatArray c, int size, TornadoDevice device) {
        // Arguments are passed with their static types: casting them to Object
        // would make the generic task parameters incompatible with the
        // matrixMultiplication method reference.
        TaskGraph taskGraph = new TaskGraph(graphName)
                .transferToDevice(TRANSFER_MODE_TO_DEVICE, new Object[]{a, b})
                .task("t0", MatrixMul1D::matrixMultiplication, a, b, c, size)
                .transferToHost(TRANSFER_MODE_TO_HOST, new Object[]{c});
        ImmutableTaskGraph snapshot = taskGraph.snapshot();
        TornadoExecutionPlan plan = new TornadoExecutionPlan(new ImmutableTaskGraph[]{snapshot});
        plan.withDevice(device);
        return plan;
    }

    /**
     * Searches {@code backend} for a device whose physical device name equals
     * {@code deviceName}, ignoring case.
     *
     * @param backend    backend whose devices are scanned
     * @param deviceName name to look for
     * @return the last matching device, or {@code null} if none matches
     */
    private static TornadoDevice findDeviceByName(TornadoBackend backend, String deviceName) {
        TornadoDevice match = null;
        for (int i = 0; i < backend.getNumDevices(); ++i) {
            TornadoDevice candidate = backend.getDevice(i);
            if (candidate.getPhysicalDevice().getDeviceName().equalsIgnoreCase(deviceName)) {
                // Deliberately no break: when several devices share the name, the last one wins.
                match = candidate;
            }
        }
        return match;
    }

    /**
     * Entry point. Accepts an optional single argument giving the matrix
     * dimension (default {@value #DEFAULT_SIZE}); any other argument count is
     * ignored and the default is used.
     *
     * <p>Prints the achieved GFlops and average time for both device runs and
     * their speedup over the sequential run. Exits with status 1 if backend 1
     * exposes no device with the same name as device 0 of backend 0.
     *
     * @param args optional matrix dimension
     * @throws IllegalArgumentException if the dimension is not positive
     * @throws ArithmeticException      if {@code size * size} overflows an {@code int}
     * @throws NumberFormatException    if the argument is not an integer
     * @throws Exception                if an execution fails
     */
    public static void main(String[] args) throws Exception {
        final int size = args.length == 1 ? Integer.parseInt(args[0]) : DEFAULT_SIZE;
        if (size <= 0) {
            throw new IllegalArgumentException("Matrix size must be positive, got " + size);
        }
        // Fail loudly instead of silently allocating a wrapped-around element count.
        final int elements = Math.multiplyExact(size, size);

        FloatArray matrixA = new FloatArray(elements);
        FloatArray matrixB = new FloatArray(elements);
        FloatArray matrixCSeq = new FloatArray(elements);
        FloatArray matrixCCUDA = new FloatArray(elements);
        FloatArray matrixCOCL = new FloatArray(elements);
        IntStream.range(0, elements).parallel().forEach(idx -> {
            matrixA.set(idx, 2.5f);
            matrixB.set(idx, 3.5f);
        });

        // --- Backend 0, device 0 ---
        TornadoBackend cudaDriver = TornadoRuntimeProvider.getTornadoRuntime().getBackend(0);
        TornadoDevice cudaDevice = cudaDriver.getDevice(0);
        TornadoExecutionPlan executorCUDA = buildPlan("cuda_old_api", matrixA, matrixB, matrixCCUDA, size, cudaDevice);
        double averageCUDA = averageMillis(executorCUDA::execute);

        // --- Backend 1, device with the same physical name ---
        TornadoBackend oclDriver = TornadoRuntimeProvider.getTornadoRuntime().getBackend(1);
        TornadoDevice oclDevice = findDeviceByName(oclDriver, cudaDevice.getPhysicalDevice().getDeviceName());
        if (oclDevice == null) {
            System.err.println("There is no device with both OpenCL and CUDA-PTX support");
            System.exit(1);
        }
        TornadoExecutionPlan executorOCL = buildPlan("ocl_old_api", matrixA, matrixB, matrixCOCL, size, oclDevice);
        double averageOpenCL = averageMillis(executorOCL::execute);

        // --- Sequential Java baseline ---
        double averageSeq = averageMillis(() -> MatrixMul1D.matrixMultiplication(matrixA, matrixB, matrixCSeq, size));

        // One multiply and one add per inner-loop step: 2 * size^3 operations.
        double flops = 2.0 * Math.pow(size, 3.0);
        double CUDAGigaFlops = 1.0E-9 * flops / (averageCUDA / 1000.0);
        double OpenCLGigaFlops = 1.0E-9 * flops / (averageOpenCL / 1000.0);
        double CUDAspeedup = averageSeq / averageCUDA;
        double OpenCLspeedup = averageSeq / averageOpenCL;
        String formatCUDAFGlops = String.format("%.2f", CUDAGigaFlops);
        String formatOpenCLFGlops = String.format("%.2f", OpenCLGigaFlops);
        System.out.println("\tOpenCL Execution: " + formatOpenCLFGlops + " GFlops, Total time = " + averageOpenCL + " ms");
        System.out.println("\tPTX Execution: " + formatCUDAFGlops + " GFlops, Total Time = " + averageCUDA + " ms");
        System.out.println("\tOpenCL Speedup: " + OpenCLspeedup + "x");
        System.out.println("\tPTX Speedup: " + CUDAspeedup + "x");
        System.out.println();
    }
}

