"""Defines the possible routines for computing the gravitational forces in the
simulation.

All the methods in this file require a position (n, 3) vector, a mass (n, )
vector and an optional softening scale float."""

import ctypes
import numpy.ctypeslib as ctl
import numpy as np
from numba import jit


def bruteForce(r_vec, mass, soft=0.):
    """Calculates the acceleration generated by a set of masses on themselves.
       Complexity O(n*m) where n is the total number of masses and m is the
       number of massive particles.

    Parameters:
        r_vec (array): list of particles positions.
            Shape (n, 3) where n is the number of particles
        mass (array): list of particles masses.
            Shape (n,)
        soft (float): characteristic plummer softening length scale
    Returns:
        forces (array): list of forces acting on each particle.
            Shape (n, 3)
    """
    # Only calculate forces from massive particles
    mask = mass!=0
    massMassive = mass[mask]
    rMassive_vec = r_vec[mask]
    #  x m x 1 matrix (m = number of massive particles) for broadcasting
    mass_mat = massMassive.reshape(1, -1, 1)
    # Calculate displacements
    # r_ten is the direction of the pairwise displacements. Shape (n, m, 3)
    # r_mat is the absolute distance of the pairwise displacements. (n, m, 1)
    r_ten = rMassive_vec.reshape(1, -1, 3) - r_vec.reshape(-1, 1, 3)
    r_mat = np.linalg.norm(r_ten, axis=-1, keepdims=True)
    # Avoid division by zeros
    # $a = M / (r + \epsilon)^2$, where $\epsilon$ is the softening scale
    # r_ten/r_mat gives the direction unit vector
    accel = np.divide(r_ten * mass_mat/(r_mat+soft)**2, r_mat, 
        where=r_ten.astype(bool), out=r_ten) # Reuse memory from r_ten
    return accel.sum(axis=1) # Add all forces on each particle

@jit(nopython=True) # Numba annotation
def bruteForceNumba(r_vec, mass, soft=0.):
    """Calculates the acceleration generated by a set of masses on themselves.
    It is done in the same way as in bruteForce, but this
    method is ran through Numba"""
    mask = mass!=0
    massMassive = mass[mask]
    rMassive_vec = r_vec[mask]
    mass_mat = massMassive.reshape(1, -1, 1)
    r_ten = rMassive_vec.reshape(1, -1, 3) - r_vec.reshape(-1, 1, 3)
    # Avoid np.linalg.norm to allow Numba optimizations
    r_mat = np.sqrt(r_ten[:,:,0:1]**2 + r_ten[:,:,1:2]**2 + r_ten[:,:,2:3]**2)
    r_mat = np.where(r_mat == 0, np.ones_like(r_mat), r_mat)
    accel = r_ten/r_mat * mass_mat/(r_mat+soft)**2
    return accel.sum(axis=1) # Add all forces in each particle

@jit(nopython=True) # Numba annotation
def bruteForceNumbaOptimized(r_vec, mass, soft=0.):
    """Calculates the acceleration generated by a set of masses on themselves.
    This is optimized for high performance with Numba. All massive particles
    must appear first."""
    accel = np.zeros_like(r_vec) 
    # Use superposition to add all the contributions
    n = r_vec.shape[0] # Number of particles
    delta = np.zeros((3,)) # Only allocate this once
    for i in range(n):
        # Only consider pairs with at least one massive particle i
        if mass[i] == 0: break
        for j in range(i+1, n):
            # Explicitely separate components for high performance
            # i.e. do not do delta = r_vec[j] - r_vec[i]
            # (The effect of this is VERY relevant (x10) and has to do with
            # memory reallocation) Numba will vectorize the loops.
            for k in range(3): delta[k] = r_vec[j,k] - r_vec[i,k]
            r = np.sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2])
            tripler = (r+soft)**2 * r
            
            # Compute acceleration on first particle
            mr3inv = mass[i]/(tripler)
            # Again, do NOT do accel[j] -= mr3inv * delta
            for k in range(3): accel[j,k] -= mr3inv * delta[k]
            
            # Compute acceleration on second particle
            # For pairs with one massless particle, no reaction force
            if mass[j] == 0: break
            # Otherwise, opposite direction (+)
            mr3inv = mass[j]/(tripler)
            for k in range(3): accel[i,k] += mr3inv * delta[k]
    return accel

# C++ interface, load library
ACCLIB = None
def loadCPPLib():
    """Loads the C++ shared library to the global variable ACCLIB. Must be
    called before using the library."""
    global ACCLIB
    ACCLIB = ctypes.CDLL('cpp/acclib.so')
    # Define appropiate types for library functions
    doublepp = np.ctypeslib.ndpointer(dtype=np.uintp) # double**
    doublep = ctl.ndpointer(np.float64, flags='aligned, c_contiguous')#double*
    # Check cpp/acclib.cpp for function signatures
    ACCLIB.bruteForceCPP.argtypes = [doublepp, doublep, 
        ctypes.c_int, ctypes.c_double]
    ACCLIB.barnesHutCPP.argtypes = [doublepp, doublep, 
        ctypes.c_int, ctypes.c_double, ctypes.c_double, 
        ctypes.c_double, ctypes.c_double, ctypes.c_double]

def bruteForceCPP(r_vec, m_vec, soft=0.):
    """Calculates the acceleration generated by a set of masses on themselves.
    This is ran in a shared C++ library through Brute Force (pairwise sums)
    Massive particles must appear first."""
    # Convert array to data required by C++ library
    if ACCLIB is None: loadCPPLib() # Singleton pattern
    # Change type to be appropiate for calling library
    r_vec_c = (r_vec.ctypes.data + np.arange(r_vec.shape[0]) 
        * r_vec.strides[0]).astype(np.uintp)
    # Set return type as double*
    ACCLIB.bruteForceCPP.restype = np.ctypeslib.ndpointer(dtype=np.float64, 
        shape=(r_vec.shape[0]*3,))
    # Call the C++ function: double* bruteForceCPP
    accel =  ACCLIB.bruteForceCPP(r_vec_c, m_vec, r_vec.shape[0], soft)
    # Change shape to get the expected Numpy array (n, 3)
    accel.shape = (-1, 3)
    return accel

def barnesHutCPP(r_vec, m_vec, soft=0.):
    """Calculates the acceleration generated by a set of masses on themselves.
    This is ran in a shared C++ library using a BarnesHut tree"""
    # Convert array to data required by C++ library
    if ACCLIB is None: loadCPPLib() # Singleton pattern
    # Change type to be appropiate for calling library
    r_vec_c = (r_vec.ctypes.data + np.arange(r_vec.shape[0]) 
        * r_vec.strides[0]).astype(np.uintp)
    # Set return type as double*
    ACCLIB.barnesHutCPP.restype = np.ctypeslib.ndpointer(dtype=np.float64, 
        shape=(r_vec.shape[0]*3,))
    # Explicitely pass the corner and size of the box for the top node
    px, py, pz = np.min(r_vec, axis=0)
    size = np.max(np.max(r_vec, axis=0) - np.min(r_vec, axis=0))
    # Call the C++ function: double* barnesHutCPP
    accel =  ACCLIB.barnesHutCPP(r_vec_c, m_vec, r_vec.shape[0], 
        size, px, py, pz, soft)
    # Change shape to get the expected Numpy array (n, 3)
    accel.shape = (-1, 3)
    return accel