module MachineLearning.Utils.Data (processData) where
import Data.List (transpose)
import Data.List.Split (splitOn)
import Data.Vector (Vector, fromList)
import Numeric.ModalInterval (Kaucher, (<.<))
import Numeric.LinearAlgebra ((??))
import qualified Numeric.LinearAlgebra as LA
import Numeric.Morpheus.MatrixReduce (columnPredicate)
import MachineLearning.Utils.Config
import MachineLearning.TIR
type DataSplit = (Dataset Double, Column Double)
processData :: Config -> IO (DataSplit, DataSplit, DataSplit, DataSplit, Vector (Kaucher Double), Kaucher Double, Int)
processData :: Config
-> IO
(DataSplit, DataSplit, DataSplit, DataSplit, Vector (Kaucher R),
Kaucher R, Int)
processData Config
cfg =
do (Matrix R
trainX, Vector R
trainY) <- FilePath -> IO (Matrix R, Vector R)
readAndParse (Config -> FilePath
getTrainName Config
cfg)
(Matrix R
testX , Vector R
testY ) <- FilePath -> IO (Matrix R, Vector R)
readAndParse (Config -> FilePath
getTestName Config
cfg)
let nVars :: Int
nVars = Matrix R -> Int
forall t. Matrix t -> Int
LA.cols Matrix R
trainX Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
domains :: Vector (Kaucher R)
domains = [Kaucher R] -> Vector (Kaucher R)
forall a. [a] -> Vector a
fromList ([Kaucher R] -> Vector (Kaucher R))
-> [Kaucher R] -> Vector (Kaucher R)
forall a b. (a -> b) -> a -> b
$ Domains -> Matrix R -> [Kaucher R]
estimateDomains (Config -> Domains
getDomains Config
cfg) Matrix R
trainX
image :: Kaucher R
image = Maybe (R, R) -> Vector R -> Kaucher R
estimateImage (Config -> Maybe (R, R)
getImage Config
cfg) Vector R
trainY
xss_all :: Dataset R
xss_all = Matrix R -> Dataset R
toVecOfColumns Matrix R
trainX
xss_test :: Dataset R
xss_test = Matrix R -> Dataset R
toVecOfColumns Matrix R
testX
(Dataset R
xss_train, Vector R
y_train, Dataset R
xss_val, Vector R
y_val) = R
-> Matrix R
-> Vector R
-> (Dataset R, Vector R, Dataset R, Vector R)
splitValidation R
0.9 Matrix R
trainX Vector R
trainY
(DataSplit, DataSplit, DataSplit, DataSplit, Vector (Kaucher R),
Kaucher R, Int)
-> IO
(DataSplit, DataSplit, DataSplit, DataSplit, Vector (Kaucher R),
Kaucher R, Int)
forall (m :: * -> *) a. Monad m => a -> m a
return ((Dataset R
xss_train, Vector R
y_train), (Dataset R
xss_val, Vector R
y_val), (Dataset R
xss_all, Vector R
trainY), (Dataset R
xss_test, Vector R
testY), Vector (Kaucher R)
domains, Kaucher R
image, Int
nVars)
parseFile :: String -> (LA.Matrix Double, Column Double)
parseFile :: FilePath -> (Matrix R, Vector R)
parseFile FilePath
css = Matrix R -> (Matrix R, Vector R)
splitToXY (Matrix R -> (Matrix R, Vector R))
-> ([[R]] -> Matrix R) -> [[R]] -> (Matrix R, Vector R)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[R]] -> Matrix R
forall t. Element t => [[t]] -> Matrix t
LA.fromLists ([[R]] -> (Matrix R, Vector R)) -> [[R]] -> (Matrix R, Vector R)
forall a b. (a -> b) -> a -> b
$ ([FilePath] -> [R]) -> [[FilePath]] -> [[R]]
forall a b. (a -> b) -> [a] -> [b]
map ((FilePath -> R) -> [FilePath] -> [R]
forall a b. (a -> b) -> [a] -> [b]
map FilePath -> R
forall a. Read a => FilePath -> a
read) [[FilePath]]
dat
where
dat :: [[FilePath]]
dat = (FilePath -> [FilePath]) -> [FilePath] -> [[FilePath]]
forall a b. (a -> b) -> [a] -> [b]
map (FilePath -> FilePath -> [FilePath]
forall a. Eq a => [a] -> [a] -> [[a]]
splitOn FilePath
",") ([FilePath] -> [[FilePath]]) -> [FilePath] -> [[FilePath]]
forall a b. (a -> b) -> a -> b
$ FilePath -> [FilePath]
lines FilePath
css
readAndParse :: FilePath -> IO (LA.Matrix Double, LA.Vector Double)
readAndParse :: FilePath -> IO (Matrix R, Vector R)
readAndParse FilePath
f = do (Matrix R
xss, Vector R
ys) <- FilePath -> (Matrix R, Vector R)
parseFile (FilePath -> (Matrix R, Vector R))
-> IO FilePath -> IO (Matrix R, Vector R)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FilePath -> IO FilePath
readFile FilePath
f
(Matrix R, Vector R) -> IO (Matrix R, Vector R)
forall (m :: * -> *) a. Monad m => a -> m a
return (Matrix R
1.0 Matrix R -> Matrix R -> Matrix R
forall t. Element t => Matrix t -> Matrix t -> Matrix t
LA.||| Matrix R
xss, Vector R
ys)
toVecOfColumns :: LA.Matrix Double -> Dataset Double
toVecOfColumns :: Matrix R -> Dataset R
toVecOfColumns = [Vector R] -> Dataset R
forall a. [a] -> Vector a
fromList ([Vector R] -> Dataset R)
-> (Matrix R -> [Vector R]) -> Matrix R -> Dataset R
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Matrix R -> [Vector R]
forall t. Element t => Matrix t -> [Vector t]
LA.toColumns
takeNRows, dropNRows :: Int -> LA.Matrix Double -> LA.Matrix Double
takeNRows :: Int -> Matrix R -> Matrix R
takeNRows Int
n Matrix R
xss = Matrix R
xss Matrix R -> (Extractor, Extractor) -> Matrix R
forall t.
Element t =>
Matrix t -> (Extractor, Extractor) -> Matrix t
LA.?? (Int -> Extractor
LA.Take Int
n, Extractor
LA.All)
dropNRows :: Int -> Matrix R -> Matrix R
dropNRows Int
n Matrix R
xss = Matrix R
xss Matrix R -> (Extractor, Extractor) -> Matrix R
forall t.
Element t =>
Matrix t -> (Extractor, Extractor) -> Matrix t
LA.?? (Int -> Extractor
LA.Drop Int
n, Extractor
LA.All)
splitValidation :: Double -> LA.Matrix Double -> LA.Vector Double
-> (Dataset Double, LA.Vector Double, Dataset Double, LA.Vector Double)
splitValidation :: R
-> Matrix R
-> Vector R
-> (Dataset R, Vector R, Dataset R, Vector R)
splitValidation R
ratio Matrix R
xss Vector R
ys
| Int
nRows Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
50 = (Matrix R -> Dataset R
toVecOfColumns Matrix R
xss, Vector R
ys, Matrix R -> Dataset R
toVecOfColumns Matrix R
xss, Vector R
ys)
| Bool
otherwise = (Dataset R
xss_train, Vector R
y_train, Dataset R
xss_val, Vector R
y_val)
where
nRows :: Int
nRows = Matrix R -> Int
forall t. Matrix t -> Int
LA.rows Matrix R
xss
nRowsTrain :: Int
nRowsTrain = R -> Int
forall a b. (RealFrac a, Integral b) => a -> b
round (Int -> R
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nRows R -> R -> R
forall a. Num a => a -> a -> a
* R
ratio)
nRowsVal :: Int
nRowsVal = Int
nRows Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nRowsTrain
xss_train :: Dataset R
xss_train = Matrix R -> Dataset R
toVecOfColumns (Matrix R -> Dataset R) -> Matrix R -> Dataset R
forall a b. (a -> b) -> a -> b
$ Int -> Matrix R -> Matrix R
takeNRows Int
nRowsTrain Matrix R
xss
xss_val :: Dataset R
xss_val = Matrix R -> Dataset R
toVecOfColumns (Matrix R -> Dataset R) -> Matrix R -> Dataset R
forall a b. (a -> b) -> a -> b
$ Int -> Matrix R -> Matrix R
dropNRows Int
nRowsVal Matrix R
xss
y_train :: Vector R
y_train = Int -> Int -> Vector R -> Vector R
forall t. Storable t => Int -> Int -> Vector t -> Vector t
LA.subVector Int
0 Int
nRowsTrain Vector R
ys
y_val :: Vector R
y_val = Int -> Int -> Vector R -> Vector R
forall t. Storable t => Int -> Int -> Vector t -> Vector t
LA.subVector Int
nRowsVal Int
nRowsTrain Vector R
ys
estimateImage :: Maybe (Double, Double) -> LA.Vector Double -> Kaucher Double
estimateImage :: Maybe (R, R) -> Vector R -> Kaucher R
estimateImage Maybe (R, R)
image Vector R
ys =
case Maybe (R, R)
image of
Maybe (R, R)
Nothing -> R
minY R -> R -> Kaucher R
forall a. RealFloat a => a -> a -> Kaucher a
<.< R
maxY
Just (R
lo, R
hi) -> R
lo R -> R -> Kaucher R
forall a. RealFloat a => a -> a -> Kaucher a
<.< R
hi
where
minY :: R
minY = [R] -> R
forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
minimum ([R] -> R) -> [R] -> R
forall a b. (a -> b) -> a -> b
$ Vector R -> [R]
forall a. Storable a => Vector a -> [a]
LA.toList Vector R
ys
maxY :: R
maxY = [R] -> R
forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
maximum ([R] -> R) -> [R] -> R
forall a b. (a -> b) -> a -> b
$ Vector R -> [R]
forall a. Storable a => Vector a -> [a]
LA.toList Vector R
ys
type Domains = [(Double, Double)]
estimateDomains :: Domains -> LA.Matrix Double -> [Kaucher Double]
estimateDomains :: Domains -> Matrix R -> [Kaucher R]
estimateDomains Domains
domains Matrix R
xss =
case Domains
domains of
[] -> (R -> R -> Kaucher R) -> [R] -> [R] -> [Kaucher R]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith R -> R -> Kaucher R
forall a. RealFloat a => a -> a -> Kaucher a
(<.<) [R]
minX [R]
maxX
Domains
ds -> ((R, R) -> Kaucher R) -> Domains -> [Kaucher R]
forall a b. (a -> b) -> [a] -> [b]
map ((R -> R -> Kaucher R) -> (R, R) -> Kaucher R
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry R -> R -> Kaucher R
forall a. RealFloat a => a -> a -> Kaucher a
(<.<)) Domains
ds
where
minX :: [R]
minX = [R] -> [R]
forall a. [a] -> [a]
Prelude.tail ([R] -> [R]) -> [R] -> [R]
forall a b. (a -> b) -> a -> b
$ Vector R -> [R]
forall a. Storable a => Vector a -> [a]
LA.toList (Vector R -> [R]) -> Vector R -> [R]
forall a b. (a -> b) -> a -> b
$ (R -> R -> R) -> Matrix R -> Vector R
columnPredicate R -> R -> R
forall a. Ord a => a -> a -> a
min Matrix R
xss
maxX :: [R]
maxX = [R] -> [R]
forall a. [a] -> [a]
Prelude.tail ([R] -> [R]) -> [R] -> [R]
forall a b. (a -> b) -> a -> b
$ Vector R -> [R]
forall a. Storable a => Vector a -> [a]
LA.toList (Vector R -> [R]) -> Vector R -> [R]
forall a b. (a -> b) -> a -> b
$ (R -> R -> R) -> Matrix R -> Vector R
columnPredicate R -> R -> R
forall a. Ord a => a -> a -> a
max Matrix R
xss
splitToXY :: LA.Matrix Double -> (LA.Matrix Double, LA.Vector Double)
splitToXY :: Matrix R -> (Matrix R, Vector R)
splitToXY Matrix R
mtx = (Matrix R
xss, Vector R
ys)
where
xss :: Matrix R
xss = Matrix R
mtx Matrix R -> (Extractor, Extractor) -> Matrix R
forall t.
Element t =>
Matrix t -> (Extractor, Extractor) -> Matrix t
?? (Extractor
LA.All, Int -> Extractor
LA.DropLast Int
1)
ys :: Vector R
ys = Matrix R -> Vector R
forall t. Element t => Matrix t -> Vector t
LA.flatten (Matrix R -> Vector R) -> Matrix R -> Vector R
forall a b. (a -> b) -> a -> b
$ Matrix R
mtx Matrix R -> (Extractor, Extractor) -> Matrix R
forall t.
Element t =>
Matrix t -> (Extractor, Extractor) -> Matrix t
?? (Extractor
LA.All, Int -> Extractor
LA.TakeLast Int
1)