Adam Optimizer is apparently not converging

I am trying to write a neural network in Rust + arrayfire, and while gradient descent works, Adam does not.
fn back_propagate(
    &mut self,
    signals: &Vec<Array<f32>>,
    labels: &Array<u8>,
    learning_rate_alpha: f64,
    batch_size: i32,
) {
    let mut output = signals.last().unwrap();
    let mut error = output - labels;
    for layer_index in (0..self.num_layers - 1).rev() {
        let signal = Self::add_bias(&signals[layer_index]);
        let deriv = self.layer_activations[layer_index].apply_deriv(output);
        let delta = &(deriv * error).T();
        let matmul = matmul(&delta, &signal, MatProp::NONE, MatProp::NONE);
        let gradient_t = (matmul / batch_size).T();
        match self.optimizer {
            Optimizer::GradientDescent => {
                let weight_update = learning_rate_alpha * gradient_t;
                self.weights[layer_index] -= weight_update;
            }
            Optimizer::Adam => {
                let exponents = constant(2f32, gradient_t.dims());
                self.first_moment_vectors[layer_index] = (&self.beta1[layer_index]
                    * &self.first_moment_vectors[layer_index])
                    + (&self.one_minus_beta1[layer_index] * &gradient_t);
                self.second_moment_vectors[layer_index] = (&self.beta2[layer_index]
                    * &self.second_moment_vectors[layer_index])
                    + (&self.one_minus_beta2[layer_index]
                        * arrayfire::pow(&gradient_t, &exponents, true));
                let corrected_first_moment_vector = &self.first_moment_vectors[layer_index]
                    / &self.one_minus_beta1[layer_index];
                let corrected_second_moment_vector = &self.second_moment_vectors[layer_index]
                    / &self.one_minus_beta2[layer_index];
                let denominator = sqrt(&corrected_second_moment_vector) + 1e-8;
                let weight_update =
                    learning_rate_alpha * (corrected_first_moment_vector / denominator);
                self.weights[layer_index] -= weight_update;
            }
        }
        output = &signals[layer_index];
        let err = matmulTT(
            &delta,
            &self.weights[layer_index],
            MatProp::NONE,
            MatProp::NONE,
        );
        error = index(&err, &[seq!(), seq!(1, output.dims()[1] as i32, 1)]);
    }
}
I've stored beta1, beta2, 1-beta1, and 1-beta2 in constant arrays for every layer just to avoid recomputing them; it appears to have made no difference.
GradientDescent converges with a learning rate of alpha=2.0, but with Adam, if I use alpha > ~0.02, the network appears to get locked in. Funnily enough, if I remove all the hidden layers, it does work, which tells me something, but I'm not sure what.

I figured it out. For anyone else: my alpha=0.01 was still too high; once I reduced it to 0.001, it converged very quickly.
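For reference, the standard Adam update (Kingma & Ba) uses defaults of alpha=0.001, beta1=0.9, beta2=0.999, and its bias correction divides the moment estimates by 1 - beta^t, where t is the step count, rather than by a fixed 1 - beta. Below is a minimal NumPy sketch of that textbook update for comparison with the arrayfire code above; the variable names are mine, not from the post.
import numpy as np

def adam_step(w, grad, m, v, t, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # One textbook Adam update; t is the 1-based step counter.
    m = beta1 * m + (1 - beta1) * grad        # first-moment estimate
    v = beta2 * v + (1 - beta2) * grad**2     # second-moment estimate
    m_hat = m / (1 - beta1**t)                # bias correction uses beta**t, not a constant
    v_hat = v / (1 - beta2**t)
    w = w - alpha * m_hat / (np.sqrt(v_hat) + eps)
    return w, m, v
With these defaults, alpha=0.001 is the usual starting point, which matches the value that ended up working here.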

Related

Correct response and predictor parameters when using caret and roc()

I'm practicing logistic regression models and cross-validation. I would like to output an ROC curve to estimate performance, but I'm not sure which response and predictor I should use in the roc() function.
Here is an example I tried. Why are the plots different? What is my error? Thanks.
library(caret)
library(ggplot2)
library(lattice)
library(pROC)
data(mtcars)
mtcars$am = factor(ifelse(mtcars$am == 1, 'one', 'zero'))
ctrl = trainControl(method = "cv", number = 5,
                    summaryFunction = twoClassSummary, classProbs = T,
                    savePredictions = T)
m = train(am ~ qsec, data = mtcars, method = "glm",
          family = binomial, metric = "ROC",
          trControl = ctrl)
curve1 = roc(response = mtcars$am,
             predictor = as.numeric(predict(m)), plot = T, legacy.axes = T, percent = T,
             main = 'Test Curve1',
             xlab = 'False Positive Percentage (1 - Specificity)',
             ylab = 'True Positive Percentage (Sensitivity)', print.auc = T,
             print.auc.x = 100, print.auc.y = 100, col = '#20B2AA', lwd = 4)
curve2 = roc(response = m$pred$obs,
             predictor = as.numeric(m$pred$pred), plot = T, legacy.axes = T, percent = T,
             main = 'Test Curve2',
             xlab = 'False Positive Percentage (1 - Specificity)',
             ylab = 'True Positive Percentage (Sensitivity)', print.auc = T,
             print.auc.x = 100, print.auc.y = 100, col = '#20B2AA', lwd = 4)
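A general note that applies regardless of language: an ROC curve should be computed from a continuous score (a class probability or decision value), not from hard class predictions coerced to numeric, and a curve built from resubstitution predictions on the full training data will generally differ from one built from cross-validated hold-out predictions. Here is a small scikit-learn sketch of the probability-versus-label distinction; it is illustrative only, and the data and names are placeholders rather than anything taken from the R code above.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
clf = LogisticRegression().fit(X, y)

proba = clf.predict_proba(X)[:, 1]   # continuous scores: what an ROC curve expects
labels = clf.predict(X)              # hard 0/1 predictions: only a single operating point

print("AUC from probabilities:", roc_auc_score(y, proba))
print("AUC from hard labels:  ", roc_auc_score(y, labels))  # typically lower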

About pytorch reduction mean

I want to use L1Loss and BCELoss with reduction='mean' for the VAE reconstruction loss, but they produce the same result for every input (e.g., the landmark reconstruction comes out identical for different inputs).
When I use reduction='sum' instead, I get the expected behavior: different outputs for different inputs.
How can I use mean reduction?
L1Loss = nn.L1Loss(reduction='mean').to(device)
BCELoss = nn.BCELoss(reduction='mean').to(device)
kld_criterion = KLDLoss(reduction='mean').to(device)
in training
rec_m, (rec_f, mean_f, logvar_f), (rec_l, mean_l, logvar_l) = model(origin)
lm_loss = CELoss(rec_l, lm)
f_loss = L1Loss(rec_f, f)
m_loss = CELoss(rec_m, m)
lm_kld_loss = kld_criterion(mean_l, logvar_l)
f_kld_loss = kld_criterion(mean_f, logvar_f)
loss = 4000*(f_loss + m_loss) + 30 * (lm_kld_loss + f_kld_loss) + 2000 * lm_loss
and model code
class VAE_NET(nn.Module):
    def __init__(self, nc=3, ndf=32, nef=32, nz=128, isize=128, device=torch.device("cuda:0"), is_train=True):
        super(VAE_NET, self).__init__()
        self.nz = nz
        # Encoder
        self.l_encoder = Encoder(nc=nc, nef=nef, nz=nz, isize=isize, device=device)
        self.f_encoder = Encoder(nc=nc, nef=nef, nz=nz, isize=isize, device=device)
        # Decoder
        self.l_decoder = Decoder(nc=nc, ndf=ndf, nz=nz, isize=isize)
        self.m_decoder = Decoder(nc=nc, ndf=ndf, nz=nz * 2, isize=isize)
        self.f_decoder = Decoder(nc=nc, ndf=ndf, nz=nz * 2, isize=isize)
        if is_train == False:
            for param in self.encoder.parameters():
                param.requires_grad = False
            for param in self.decoder.parameters():
                param.requires_grad = False

    def forward(self, x):
        latent_l, mean_l, logvar_l = self.l_encoder(x)
        latent_f, mean_f, logvar_f = self.f_encoder(x)
        concat_latent = torch.cat((latent_l, latent_f), 1)
        rec_l = self.l_decoder(latent_l)
        rec_m = self.m_decoder(concat_latent)
        rec_f = self.f_decoder(concat_latent)
        return rec_m, (rec_f, mean_f, latent_f), (rec_l, mean_l, latent_l)
l is for face landmark
m is for face mask
f is for face part
reduction='sum' and reduction='mean' differ only by a scalar multiple. There is nothing wrong with your implementation from what I see. If your model only produces correct results with reduction='sum', it is likely that your learning rate is too low (and 'sum' makes up for that difference by amplifying the gradient).
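To make the scalar relationship concrete, here is a small PyTorch check (a sketch with made-up tensor shapes, not the shapes from the post): 'mean' is simply 'sum' divided by the number of elements, so a setup tuned for reduction='sum' can be reproduced with reduction='mean' by scaling the learning rate or the loss weights up by that factor.
import torch
import torch.nn as nn

pred = torch.rand(8, 3, 64, 64)    # hypothetical batch of reconstructions
target = torch.rand(8, 3, 64, 64)

sum_loss = nn.L1Loss(reduction='sum')(pred, target)
mean_loss = nn.L1Loss(reduction='mean')(pred, target)

n = pred.numel()
print(torch.allclose(sum_loss, mean_loss * n))  # True: they differ by a factor of numel()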

LinearRegressionWithSGD() returns NaN

I am trying to use LinearRegressionWithSGD on the Million Song Dataset, and my model returns NaNs as weights and 0.0 as the intercept. What might be causing this? I am using Spark 1.4.0 in standalone mode.
Sample data: http://www.filedropper.com/part-00000
Here is my full code:
// Import Dependencies
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.GeneralizedLinearAlgorithm
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// Define RDD
val data =
sc.textFile("/home/naveen/Projects/millionSong/YearPredictionMSD.txt")
// Convert to Labelled Point
def parsePoint(line: String): LabeledPoint = {
  val x = line.split(",")
  val head = x.head.toDouble
  val tail = Vectors.dense(x.tail.map(x => x.toDouble))
  return LabeledPoint(head, tail)
}
// Find Range
val parsedDataInit = data.map(x => parsePoint(x))
val onlyLabels = parsedDataInit.map(x => x.label)
val minYear = onlyLabels.min()
val maxYear = onlyLabels.max()
// Shift Labels
val parsedData = parsedDataInit.map(x => LabeledPoint(x.label-minYear
, x.features))
// Training, validation, and test sets
val splits = parsedData.randomSplit(Array(0.8, 0.1, 0.1), seed = 123)
val parsedTrainData = splits(0).cache()
val parsedValData = splits(1).cache()
val parsedTestData = splits(2).cache()
val nTrain = parsedTrainData.count()
val nVal = parsedValData.count()
val nTest = parsedTestData.count()
// RMSE
def squaredError(label: Double, prediction: Double): Double = {
  return scala.math.pow(label - prediction, 2)
}
def calcRMSE(labelsAndPreds: RDD[List[Double]]): Double = {
  return scala.math.sqrt(labelsAndPreds.map(x =>
    squaredError(x(0), x(1))).mean())
}
val numIterations = 100
val stepSize = 1.0
val regParam = 0.01
val regType = "L2"
val algorithm = new LinearRegressionWithSGD()
algorithm.optimizer
  .setNumIterations(numIterations)
  .setStepSize(stepSize)
  .setRegParam(regParam)
val model = algorithm.run(parsedTrainData)
I am not familiar with this specific implementation of SGD, but generally, if a gradient descent solver goes to NaN, that means the learning rate is too big (in this case I think it is the stepSize variable).
Try lowering it by an order of magnitude at a time until it starts to converge.
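As a quick illustration of that failure mode (plain NumPy, not Spark-specific), an overly large step size makes the gradient-descent iterates grow geometrically until they overflow to inf/NaN, while a smaller step converges:
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.normal(size=100)

def gd_linreg(step_size, n_iters=100):
    w = np.zeros(3)
    for _ in range(n_iters):
        grad = 2.0 / len(y) * X.T @ (X @ w - y)   # gradient of the mean squared error
        w -= step_size * grad
    return w

print(gd_linreg(step_size=5.0))   # diverges: weights overflow to inf/NaN
print(gd_linreg(step_size=0.1))   # converges near [1, -2, 0.5]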
I can think of two possibilities:
1. stepSize is too big. You should try something like 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, ...
2. Your training data contain NaN. If so, the result will likely be NaN.
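For the second possibility, a quick sanity check is to scan the parsed values for non-finite entries before training. A sketch in plain Python over the raw text file, using the path from the question and the same comma-split parsing as parsePoint:
import math

bad_lines = 0
with open("/home/naveen/Projects/millionSong/YearPredictionMSD.txt") as f:
    for i, line in enumerate(f):
        values = [float(v) for v in line.strip().split(",")]
        if not all(math.isfinite(v) for v in values):
            bad_lines += 1
            print("non-finite value on line", i)
print("lines with NaN/inf:", bad_lines)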

How to get a prediction using Torch7

I'm still familiarizing myself with Torch, and so far so good. However, I have hit a dead end that I'm not sure how to get around: how can I get Torch7 (or, more specifically, the dp library) to evaluate a single input and return the predicted output?
Here's my setup (basically the dp demo):
require 'dp'
--[[hyperparameters]]--
opt = {
   nHidden = 100, --number of hidden units
   learningRate = 0.1, --training learning rate
   momentum = 0.9, --momentum factor to use for training
   maxOutNorm = 1, --maximum norm allowed for output neuron weights
   batchSize = 128, --number of examples per mini-batch
   maxTries = 100, --maximum number of epochs without reduction in validation error.
   maxEpoch = 1000 --maximum number of epochs of training
}
--[[data]]--
datasource = dp.Mnist{input_preprocess = dp.Standardize()}
print("feature size: ", datasource:featureSize())
--[[Model]]--
model = dp.Sequential{
   models = {
      dp.Neural{
         input_size = datasource:featureSize(),
         output_size = opt.nHidden,
         transfer = nn.Tanh(),
         sparse_init = true
      },
      dp.Neural{
         input_size = opt.nHidden,
         output_size = #(datasource:classes()),
         transfer = nn.LogSoftMax(),
         sparse_init = true
      }
   }
}
--[[Propagators]]--
train = dp.Optimizer{
   loss = dp.NLL(),
   visitor = { -- the ordering here is important:
      dp.Momentum{momentum_factor = opt.momentum},
      dp.Learn{learning_rate = opt.learningRate},
      dp.MaxNorm{max_out_norm = opt.maxOutNorm}
   },
   feedback = dp.Confusion(),
   sampler = dp.ShuffleSampler{batch_size = opt.batchSize},
   progress = true
}
valid = dp.Evaluator{
   loss = dp.NLL(),
   feedback = dp.Confusion(),
   sampler = dp.Sampler{}
}
test = dp.Evaluator{
   loss = dp.NLL(),
   feedback = dp.Confusion(),
   sampler = dp.Sampler{}
}
--[[Experiment]]--
xp = dp.Experiment{
   model = model,
   optimizer = train,
   validator = valid,
   tester = test,
   observer = {
      dp.FileLogger(),
      dp.EarlyStopper{
         error_report = {'validator','feedback','confusion','accuracy'},
         maximize = true,
         max_epochs = opt.maxTries
      }
   },
   random_seed = os.time(),
   max_epoch = opt.maxEpoch
}
xp:run(datasource)
You have two options.
One. Use the encapsulated nn.Module to forward your torch.Tensor:
mlp = model:toModule(datasource:trainSet():sub(1,2))
mlp:float()
input = torch.FloatTensor(1, 1, 32, 32) -- replace this with your input
output = mlp:forward(input)
Two. Encapsulate your torch.Tensor into a dp.ImageView and forward that through your dp.Model :
input = torch.FloatTensor(1, 1, 32, 32) -- replace with your input
inputView = dp.ImageView('bchw', input)
outputView = model:forward(inputView, dp.Carry{nSample=1}) -- forward through the dp.Model (not the nn.Module from option one)
output = outputView:forward('b')

AVAudioRecorder through accelerate FFT into frequency - EXECUTION

My main goal: find the frequency of the noises being pulled in through AVAudioRecorder. I have followed this:
http://www.ehow.com/how_12224909_detect-blow-mic-xcode.html
I have read up on many questions on SO asking how to detect frequency. The majority of those answers say, "Use FFT!", and then the askers say, "Oh, great!".
My question is, how do you get from here:
- (void)levelTimerCallback {
    [recorder updateMeters];
    const double ALPHA = 0.05;
    double peakPowerForChannel = pow(10, (0.05 * [recorder peakPowerForChannel:0]));
    lowPassResults = ALPHA * peakPowerForChannel + (1.0 - ALPHA) * lowPassResults;
    if (lowPassResults > sensitivitySlider.value) {
        NSLog(@"Sound detected");
        // What goes here so I can spit out a frequency?
    }
}
Somehow magically use FFT (I will use accelerate.h), and wind up with "The frequency = 450.3"?
If somebody could show me the actual code I would use to plug the sound from the AVAudioRecorder into Accelerate, and how to turn the result into a frequency, that would be greatly appreciated.
Thanks in advance.
Nothing "goes there", as the AVRecorder API does not plug into the Accelerate framework. Instead, you have to use a completely different API, the Audio Queue or RemoteIO Audio Unit API, to capture audio input, a completely different code arrangement, such as waiting for callbacks to get your data, buffer size management to get data arrays of the appropriate size to feed an FFT, then know enough DSP to post-process the FFT results for the particular kind of frequency measure for which you are looking.
Well, it turns out that something CAN "go there". Instead of using Accelerate, I bought a book on Fourier analysis on Amazon and used it to build my own FFT, which spits out not a single frequency but the levels of each of many frequencies, which is basically what I wanted.
Here's my FFT-computing class:
class FFTComputer: NSObject {

    class func integerBitReverse(_ input:Int,binaryDigits:Int) -> Int {
        return integerForReversedBooleans(booleansForInt(input, binaryDigits: binaryDigits))
    }

    class func integerForReversedBooleans(_ booleans:[Bool]) -> Int {
        var integer = 0
        var digit = booleans.count - 1
        while digit >= 0 {
            if booleans[digit] == true {
                integer += Int(pow(Double(2), Double(digit)))
            }
            digit -= 1
        }
        return integer
    }

    class func Pnumber(_ k:Int,placesToMove:Int, gamma:Int) -> Int {
        var booleans = booleansForInt(k, binaryDigits: gamma)
        for _ in 0 ..< placesToMove {
            booleans.removeLast()
            booleans.insert(false, at: 0)
        }
        return integerForReversedBooleans(booleans)
    }

    class func booleansForInt(_ input:Int,binaryDigits:Int) -> [Bool] {
        var booleans = [Bool]()
        var remainingInput = input
        var digit = binaryDigits - 1
        while digit >= 0 {
            let potential = Int(pow(Double(2), Double(digit)))
            if potential > remainingInput {
                booleans.append(false)
            } else {
                booleans.append(true)
                remainingInput -= potential
            }
            digit += -1
        }
        return booleans
    }

    class func fftOfTwoRealFunctions(_ realX1:[CGFloat], realX2:[CGFloat], gamma:Int) -> (([CGFloat],[CGFloat]),([CGFloat],[CGFloat])) {
        let theFFT = fft(realX1, imaginaryXin: realX2, gamma: gamma)
        var R = theFFT.0
        var I = theFFT.1
        let N = Int(pow(2.0, Double(gamma)))
        var realOut1 = [CGFloat]()
        var imagOut1 = [CGFloat]()
        var realOut2 = [CGFloat]()
        var imagOut2 = [CGFloat]()
        for n in 0..<N {
            var Rback:CGFloat
            var Iback:CGFloat
            if n == 0 {
                Rback = R[0]
                Iback = I[0]
            } else {
                Rback = R[N-n]
                Iback = I[N-n]
            }
            realOut1.append(CGFloat(R[n]/2 + Rback/2))
            realOut2.append(CGFloat(I[n]/2 + Iback/2))
            imagOut1.append(CGFloat(I[n]/2 - Iback/2))
            imagOut2.append(-CGFloat(R[n]/2 - Rback/2))
        }
        return ((realOut1,imagOut1),(realOut2,imagOut2))
    }

    class func fft(_ realXin:[CGFloat], imaginaryXin:[CGFloat], gamma:Int) -> ([CGFloat],[CGFloat]) {
        var realX = realXin
        var imaginaryX = imaginaryXin
        let N = Int(pow(2.0, Double(gamma)))
        var N2 = N/2
        var NU1 = gamma - 1 // Always equals (gamma - l)
        var realWP:Double = 1
        var imaginaryWP:Double = 0
        var redoPCounter = 0

        func redoP(_ k:Int, places:Int) {
            let P = Pnumber(k, placesToMove:places, gamma: gamma)
            let inside = (-2*Double.pi*Double(P))/Double(N)
            realWP = cos(inside)
            imaginaryWP = sin(inside)
        }

        var l = 1
        while l <= gamma {
            var k = 0
            var I = 1
            while k < N - 1 {
                if redoPCounter == N2 {
                    redoP(k,places: NU1)
                    redoPCounter = 0
                }
                redoPCounter += 1
                // Swift.print(realX.count,imaginaryX.count,k+N2)
                let realT1 = (realWP*Double(realX[k + N2]))-(imaginaryWP*Double(imaginaryX[k + N2]))
                let imaginaryT1 = (realWP*Double(imaginaryX[k + N2]))+(imaginaryWP*Double(realX[k + N2]))
                realX[k+N2] = realX[k] - CGFloat(realT1)
                imaginaryX[k+N2] = imaginaryX[k] - CGFloat(imaginaryT1)
                realX[k] = realX[k] + CGFloat(realT1)
                imaginaryX[k] = imaginaryX[k] + CGFloat(imaginaryT1)
                k += 1
                if I == N2 {
                    k += N2
                    I = 1
                } else {
                    I += 1
                }
            }
            N2 = N2/2
            NU1 = NU1 - 1
            redoPCounter = 0
            realWP = 1
            imaginaryWP = 0
            l += 1
        }

        for k in 0 ..< N - 1 {
            let i = integerBitReverse(k, binaryDigits:gamma)
            if i > k {
                let placeholderReal = realX[k]
                let placeholderImaginary = imaginaryX[k]
                realX[k] = realX[i]
                imaginaryX[k] = imaginaryX[i]
                realX[i] = placeholderReal
                imaginaryX[i] = placeholderImaginary
            }
        }
        return (realX,imaginaryX)
    }

    class func magnitudeAndPhasePresentations(_ realX:[CGFloat], imaginaryX:[CGFloat]) -> ([CGFloat],[CGFloat]) {
        var magnitudes = [CGFloat]()
        var phases = [CGFloat]()
        var lastMagnitude:CGFloat = 0
        var lastPhase:CGFloat = 0
        for n in 0 ..< realX.count {
            let real = realX[n]
            let imaginary = imaginaryX[n]
            if real != 0 {
                lastMagnitude = sqrt(pow(real, 2)+pow(imaginary, 2))
                lastPhase = atan(imaginary/real)
            }
            magnitudes.append(lastMagnitude)
            phases.append(lastPhase)
        }
        return (magnitudes,phases)
    }

    class func magnitudePresentation(_ realX:[CGFloat], imaginaryX:[CGFloat]) -> [CGFloat] {
        var magnitudes = [CGFloat]()
        var lastMagnitude:CGFloat = 0
        for n in 0 ..< realX.count {
            let real = realX[n]
            let imaginary = imaginaryX[n]
            if real != 0 {
                lastMagnitude = sqrt(pow(real, 2)+pow(imaginary, 2))
            }
            magnitudes.append(lastMagnitude)
        }
        return magnitudes
    }
}
And to get the audio, I used Novocaine: https://github.com/alexbw/novocaine
I would recommend reading a bit about the Fourier Transform, but it really doesn't have to be that difficult to plug data from Novocaine (the mic) into an FFTComputer and get back some frequencies.
(2 to the gamma is the count of realXin. I could have just computed gamma, so if you want to change that, go ahead. Just turn the Novocaine data into an array of CGFloats, put that in realXin, pass an array of zeros of the same size as imaginaryXin, and supply the right gamma. Then, maybe graph the output to see the frequencies.)
