{-|
Module      : MachineLearning.Utils.Data
Description : TIR expression data structures
Copyright   : (c) Fabricio Olivetti de Franca, 2022
License     : GPL-3
Maintainer  : fabricio.olivetti@gmail.com
Stability   : experimental
Portability : POSIX

Dataset handling functions.

-}
module MachineLearning.Utils.Data (processData) where

import Data.List                             (transpose)
import Data.List.Split                       (splitOn)
import Data.Vector                           (Vector, fromList)
import Numeric.ModalInterval            (Kaucher, (<.<))
import Numeric.LinearAlgebra ((??))
import qualified Numeric.LinearAlgebra as LA
import Numeric.Morpheus.MatrixReduce         (columnPredicate)

-- import Constraints.Shape
import MachineLearning.Utils.Config
import MachineLearning.TIR

-- | A pair holding the feature columns (as a 'Dataset') and the target column.
type DataSplit = (Dataset Double, Column Double)

-- | Reads and loads the training and test files named in the configuration.
--
-- The components of the result are, in order:
--
--   * the training split and the validation split, both obtained by calling
--     'splitValidation' with a ratio of @0.9@ on the training file;
--   * the complete training file (every row) as a split;
--   * the test file as a split;
--   * the vector of variable domains computed by 'estimateDomains';
--   * the image of the target computed by 'estimateImage' on the training target;
--   * the number of variables, taken as the column count of the training
--     matrix minus one (the matrix returned by 'readAndParse' carries one
--     extra leading column).
processData :: Config -> IO (DataSplit, DataSplit, DataSplit, DataSplit, Vector (Kaucher Double), Kaucher Double, Int)
processData cfg = do
  (trainX, trainY) <- readAndParse (getTrainName cfg)
  (testX, testY)   <- readAndParse (getTestName cfg)
  let nVars    = LA.cols trainX - 1
      domains  = fromList $ estimateDomains (getDomains cfg) trainX
      image    = estimateImage (getImage cfg) trainY
      xss_all  = toVecOfColumns trainX
      xss_test = toVecOfColumns testX
      (xss_train, y_train, xss_val, y_val) = splitValidation 0.9 trainX trainY
  return ( (xss_train, y_train)
         , (xss_val, y_val)
         , (xss_all, trainY)
         , (xss_test, testY)
         , domains
         , image
         , nVars
         )

-- | Parse a numerical csv file into predictors and target variables.
--
-- The argument is the textual content of the file. Every line is split on
-- commas and every field is converted with 'read', so a field that is not a
-- valid number makes evaluation fail with an exception. The resulting matrix
-- is divided into predictors and target by 'splitToXY'.
parseFile :: String -> (LA.Matrix Double, Column Double)
parseFile css = splitToXY . LA.fromLists $ map (map read) dat
  where
    -- one list of raw text fields per line of the input
    dat = map (splitOn ",") $ lines css
    
-- | Read and parse the csv file stored at the given path.
--
-- The predictors produced by 'parseFile' get the constant @1.0@ joined on
-- their left with 'LA.|||'; hmatrix is expected to expand that scalar into a
-- full column of ones (NOTE(review): confirm against the hmatrix version in
-- use). The target vector is returned unchanged.
readAndParse :: FilePath -> IO (LA.Matrix Double, LA.Vector Double)
readAndParse f = do
  (xss, ys) <- parseFile <$> readFile f
  return (1.0 LA.||| xss, ys)

-- | Converts a matrix into a vector that holds one entry per matrix column,
-- in column order.
toVecOfColumns :: LA.Matrix Double -> Dataset Double
toVecOfColumns = fromList . LA.toColumns

-- | 'takeNRows' keeps the first @n@ rows of a matrix, 'dropNRows' discards the
-- first @n@ rows. Both keep every column.
takeNRows, dropNRows :: Int -> LA.Matrix Double -> LA.Matrix Double
takeNRows n xss = xss LA.?? (LA.Take n, LA.All)
dropNRows n xss = xss LA.?? (LA.Drop n, LA.All)

-- | Split the training data into training and validation sets.
--
-- The result is @(training X, training y, validation X, validation y)@.
--
-- When the data has 50 rows or fewer no split is made and the complete data
-- is returned as both the training and the validation set. Otherwise the
-- first @round (rows * ratio)@ rows form the training set and the remaining
-- rows form the validation set, so the two sets never share a row.
splitValidation :: Double -> LA.Matrix Double -> LA.Vector Double
                -> (Dataset Double, LA.Vector Double, Dataset Double, LA.Vector Double)
splitValidation ratio xss ys
  | nRows <= 50 = (toVecOfColumns xss, ys, toVecOfColumns xss, ys)
  | otherwise   = (xss_train, y_train, xss_val, y_val)
  where
    nRows      = LA.rows xss
    -- clamp so that a ratio outside [0, 1] cannot yield a negative or
    -- out-of-range row count
    nRowsTrain = max 0 . min nRows $ round (fromIntegral nRows * ratio)
    nRowsVal   = nRows - nRowsTrain
    xss_train  = toVecOfColumns $ takeNRows nRowsTrain xss
    -- validation starts right after the training rows: skip the nRowsTrain
    -- training rows, which leaves exactly nRowsVal rows
    xss_val    = toVecOfColumns $ dropNRows nRowsTrain xss
    y_train    = LA.subVector 0 nRowsTrain ys
    -- same row range as xss_val: offset nRowsTrain, length nRowsVal
    y_val      = LA.subVector nRowsTrain nRowsVal ys

-- | Estimates the target image of the function.
--
-- When a pair is supplied, the interval is built from that pair with '<.<'.
-- Otherwise the interval is built from the smallest and largest values found
-- in the target vector; in that case an empty target vector makes
-- 'minimum' / 'maximum' fail.
estimateImage :: Maybe (Double, Double) -> LA.Vector Double -> Kaucher Double
estimateImage image ys =
  case image of
    Nothing       -> minY <.< maxY
    Just (lo, hi) -> lo <.< hi
  where
    -- only evaluated in the 'Nothing' branch
    minY = minimum $ LA.toList ys
    maxY = maximum $ LA.toList ys

-- | A list of pairs of bounds; 'estimateDomains' turns each pair into an
-- interval with '<.<' (presumably lower bound first, then upper bound —
-- confirm against the configuration format).
type Domains = [(Double, Double)]

-- | Estimates the domains of the variables.
--
-- A non-empty list of domains is used as given: each pair becomes an interval
-- through '<.<'. When the list is empty, one interval per column is built
-- from the results of 'columnPredicate' applied with 'min' and with 'max'
-- (presumably the per-column minimum and maximum — confirm against
-- "Numeric.Morpheus.MatrixReduce"), skipping the first column.
estimateDomains :: Domains -> LA.Matrix Double -> [Kaucher Double]
estimateDomains domains xss =
  case domains of
    [] -> zipWith (<.<) minX maxX
    ds -> map (uncurry (<.<)) ds
  where
    -- 'drop 1' skips the entry of the first column without failing on an
    -- empty list (NOTE(review): the first column looks like the constant
    -- column added by 'readAndParse' — confirm)
    minX = drop 1 . LA.toList $ columnPredicate min xss
    maxX = drop 1 . LA.toList $ columnPredicate max xss
    
-- | Separates a matrix into its predictors and its target: the first
-- component holds every column except the last one, the second component is
-- the last column flattened into a vector.
splitToXY :: LA.Matrix Double -> (LA.Matrix Double, LA.Vector Double)
splitToXY mtx = (xss, ys)
  where
    xss = mtx ?? (LA.All, LA.DropLast 1)
    ys  = LA.flatten $ mtx ?? (LA.All, LA.TakeLast 1)