I was trying out the wandb library and I ran wandb.watch, but that doesn't seem to work in my code. It's not supposed to be anything too complicated, so I am puzzled why it's not working.
Code:
"""
https://docs.wandb.ai/guides/track/advanced/distributed-training
import wandb
# 1. Start a new run
wandb.init(project='playground', entity='brando')
# 2. Save model inputs and hyperparameters
config = wandb.config
config.learning_rate = 0.01
# 3. Log gradients and model parameters
wandb.watch(model)
for batch_idx, (data, target) in enumerate(train_loader):
...
if batch_idx % args.log_interval == 0:
# 4. Log metrics to visualize performance
wandb.log({"loss": loss})
Notes:
- call wandb.init and wandb.log only from the leader process
"""
from argparse import Namespace
from pathlib import Path
from typing import Union
import torch
from torch import nn
from torch.nn.functional import mse_loss
from torch.optim import Optimizer
import uutils
from uutils.torch_uu import r2_score_from_torch
from uutils.torch_uu.distributed import is_lead_worker
from uutils.torch_uu.models import get_simple_model
from uutils.torch_uu.tensorboard import log_2_tb_supervisedlearning
import wandb
def log_2_wandb_nice(it, loss, inputs, outputs, captions):
wandb.log({"loss": loss, "epoch": it,
"inputs": wandb.Image(inputs),
"logits": wandb.Histogram(outputs),
"captions": wandb.HTML(captions)})
def log_2_wandb(**metrics):
""" Log to wandb """
new_metrics: dict = {}
for key, value in metrics.items():
key = str(key).strip('_')
new_metrics[key] = value
wandb.log(new_metrics)
def log_train_val_stats(args: Namespace,
it: int,
train_loss: float,
train_acc: float,
valid,
log_freq: int = 10,
ckpt_freq: int = 50,
force_log: bool = False, # e.g. at the final it/epoch
save_val_ckpt: bool = False,
log_to_tb: bool = False,
log_to_wandb: bool = False
):
"""
log train and val stats.
Note: Unlike save ckpt, this one does need it to be passed explicitly (so it can save it in the stats collector).
"""
from uutils.torch_uu.tensorboard import log_2_tb
from matplotlib import pyplot as plt
# - is it epoch or iteration
it_or_epoch: str = 'epoch_num' if args.training_mode == 'epochs' else 'it'
# - total number of its/epochs depending on training mode
total_its: int = args.num_empochs if args.training_mode == 'epochs' else args.num_its
print(f'-- {it == total_its - 1}')
print(f'-- {it}')
print(f'-- {total_its}')
if (it % log_freq == 0 or is_lead_worker(args.rank) or it == total_its - 1 or force_log) and is_lead_worker(args.rank):
print('inside log')
# - get eval stats
val_loss, val_acc = valid(args, args.mdl, save_val_ckpt=save_val_ckpt)
# - print
args.logger.log('\n')
args.logger.log(f"{it_or_epoch}={it}: {train_loss=}, {train_acc=}")
args.logger.log(f"{it_or_epoch}={it}: {val_loss=}, {val_acc=}")
# - record into stats collector
args.logger.record_train_stats_stats_collector(it, train_loss, train_acc)
args.logger.record_val_stats_stats_collector(it, val_loss, val_acc)
args.logger.save_experiment_stats_to_json_file()
fig = args.logger.save_current_plots_and_stats()
# - log to wandb
if log_to_wandb:
# if it == 0:
# # -- todo why isn't this working?
# wandb.watch(args.mdl)
# print('watching model')
# log_2_wandb(train_loss=train_loss, train_acc=train_acc)
print('inside wandb log')
wandb.log(data={'train loss': train_loss, 'train acc': train_acc, 'val loss': val_loss, 'val acc': val_acc}, step=it)
wandb.log(data={'it': it}, step=it)
if it == total_its - 1:
print(f'logging fig at {it=}')
wandb.log(data={'fig': fig}, step=it)
plt.close('all')
# - log to tensorboard
if log_to_tb:
log_2_tb_supervisedlearning(args.tb, args, it, train_loss, train_acc, 'train')
log_2_tb_supervisedlearning(args.tb, args, it, train_loss, train_acc, 'val')
# log_2_tb(args, it, val_loss, val_acc, 'train')
# log_2_tb(args, it, val_loss, val_acc, 'val')
# - log ckpt
if (it % ckpt_freq == 0 or it == total_its - 1 or force_log) and is_lead_worker(args.rank):
save_ckpt(args, args.mdl, args.optimizer)
def save_ckpt(args: Namespace, mdl: nn.Module, optimizer: torch.optim.Optimizer,
dirname: Union[None, Path] = None, ckpt_name: str = 'ckpt.pt'):
"""
Saves checkpoint for any worker.
Intended use is to save by worker that got a val loss that improved.
"""
import dill
dirname = args.log_root if (dirname is None) else dirname
# - pickle ckpt
assert uutils.xor(args.training_mode == 'epochs', args.training_mode == 'iterations')
pickable_args = uutils.make_args_pickable(args)
torch.save({'state_dict': mdl.state_dict(),
'epoch_num': args.epoch_num,
'it': args.it,
'optimizer': optimizer.state_dict(),
'args': pickable_args,
'mdl': mdl},
pickle_module=dill,
f=dirname / ckpt_name) # f'mdl_{epoch_num:03}.pt'
def get_args() -> Namespace:
args = uutils.parse_args_synth_agent()
# we can place model here...
args = uutils.setup_args_for_experiment(args)
return args
def valid_for_test(args: Namespace, mdl: nn.Module, save_val_ckpt: bool = False):
import torch
for t in range(1):
x = torch.randn(args.batch_size, 5)
y = (x ** 2 + x + 1).sum(dim=1)
y_pred = mdl(x).squeeze(dim=1)
val_loss, val_acc = mse_loss(y_pred, y), r2_score_from_torch(y_true=y, y_pred=y_pred)
if val_loss.item() < args.best_val_loss and save_val_ckpt:
args.best_val_loss = val_loss.item()
save_ckpt(args, args.mdl, args.optimizer, ckpt_name='ckpt_best_val.pt')
return val_loss, val_acc
def train_for_test(args: Namespace, mdl: nn.Module, optimizer: Optimizer, scheduler=None):
# wandb.watch(args.mdl)
for it in range(args.num_its):
x = torch.randn(args.batch_size, 5)
y = (x ** 2 + x + 1).sum(dim=1)
y_pred = mdl(x).squeeze(dim=1)
train_loss, train_acc = mse_loss(y_pred, y), r2_score_from_torch(y_true=y, y_pred=y_pred)
optimizer.zero_grad()
train_loss.backward()  # each process synchronizes its gradients in the backward pass
optimizer.step() # the right update is done since all procs have the right synced grads
scheduler.step()
log_train_val_stats(args, it, train_loss, train_acc, valid_for_test,
log_freq=2, ckpt_freq=10,
save_val_ckpt=True, log_to_tb=True, log_to_wandb=True)
return train_loss, train_acc
def debug_test():
args: Namespace = get_args()
args.num_its = 12
# - get mdl, opt, scheduler, etc
args.mdl = get_simple_model(in_features=5, hidden_features=20, out_features=1, num_layer=2)
wandb.watch(args.mdl)
args.optimizer = torch.optim.Adam(args.mdl.parameters(), lr=1e-1)
args.scheduler = torch.optim.lr_scheduler.ExponentialLR(args.optimizer, gamma=0.999, verbose=False)
# - train
train_loss, train_acc = train_for_test(args, args.mdl, args.optimizer, args.scheduler)
print(f'{train_loss=}, {train_acc=}')
# - eval
val_loss, val_acc = valid_for_test(args, args.mdl)
print(f'{val_loss=}, {val_acc=}')
# - make sure wandb closes properly
if args.log_to_wandb:
wandb.finish()
if __name__ == '__main__':
import os
# print(os.environ['WANDB_API_KEY'])
import time
start = time.time()
debug_test()
duration_secs = time.time() - start
print(f"\nSuccess, time passed: hours:{duration_secs / (60 ** 2)}, minutes={duration_secs / 60}, seconds={duration_secs}")
print('Done!\a')
code in github: https://github.com/brando90/ultimate-utils/blob/master/tutorials_for_myself/my_wandb/my_wandb_basic1.py
sample run: https://wandb.ai/brando/playground/runs/wpupxvg1
cross posted: https://community.wandb.ai/t/when-is-one-supposed-to-run-wandb-watch-so-that-weights-and-biases-tracks-params-and-gradients-prope/518
Cross posting an answer by charlesfrye in the wandb community forum:
There are two things you might be running into here -- can't confirm because your code relies on the ultimate-utils package.
wandb.watch will only start working once you call wandb.log after a backwards pass that touches the watched Module (docs).
The frequency with which gradients/params are logged is controlled by the log_freq argument. If the number of logging calls is less than the value of log_freq, then no information will be logged. Here's a short colab reproducing this behavior.
Also, if you want params and gradients, you need to set the log kwarg to "all". By default, we log only gradients.
I don't know why but this line of code seems to work:
wandb.watch(args.mdl, mse_loss, log="all", log_freq=10)
perhaps it really needs the loss function and log="all", despite neither being mentioned in the intro/quick-start guide:
import wandb
# 1. Start a new run
wandb.init(project='playground', entity='brando')
# 2. Save model inputs and hyperparameters
config = wandb.config
config.learning_rate = 0.01
# 3. Log gradients and model parameters
wandb.watch(model)
for batch_idx, (data, target) in enumerate(train_loader):
...
if batch_idx % args.log_interval == 0:
# 4. Log metrics to visualize performance
wandb.log({"loss": loss})
I am wondering how I can save a self-trained word2vec model to a txt file in a format like 'word2vec-google-news' or 'glove.6b.50d', i.e. with each token followed by its matching vector.
When I export my self-trained vectors to a txt file, it contains only the vectors, with no tokens in front of them.
My code for training my own word2vec:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import random
import numpy as np
from six.moves import xrange
import zipfile
import tensorflow as tf
import pandas as pd
filename = ('data/data.zip')
# Step 1: Read the data into a list of strings.
def read_data(filename):
with zipfile.ZipFile(filename) as f:
data = tf.compat.as_str(f.read(f.namelist()[0])).split()
return data
words = read_data(filename)
#print('Data size', len(words))
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000
def build_dataset(words):
count = [['UNK', -1]]
count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
#print("count",len(count))
dictionary = dict()
for word, _ in count:
dictionary[word] = len(dictionary)
data = list()
unk_count = 0
for word in words:
if word in dictionary:
index = dictionary[word]
else:
index = 0
unk_count += 1
data.append(index)
count[0][1] = unk_count
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
#del words # Hint to reduce memory.
#print('Most common words (+UNK)', count[:5])
#print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
data_index = 0
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
global data_index
assert batch_size % num_skips == 0
assert num_skips <= 2 * skip_window
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1 # [ skip_window target skip_window ]
buffer = collections.deque(maxlen=span)
for _ in range(span):
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
for i in range(batch_size // num_skips):
target = skip_window # target label at the center of the buffer
targets_to_avoid = [skip_window]
for j in range(num_skips):
while target in targets_to_avoid:
target = random.randint(0, span - 1)
targets_to_avoid.append(target)
batch[i * num_skips + j] = buffer[skip_window]
labels[i * num_skips + j, 0] = buffer[target]
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
return batch, labels
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
#for i in range(8):
#print(batch[i], reverse_dictionary[batch[i]],'->', labels[i, 0], reverse_dictionary[labels[i, 0]])
# Step 4: Build and train a skip-gram model.
batch_size = 128
embedding_size = 128
skip_window = 2
num_skips = 2
valid_size = 9
valid_window = 100
num_sampled = 64 # Number of negative examples to sample.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
graph = tf.Graph()
with graph.as_default():
# Input data.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
# Ops and variables pinned to the CPU because of missing GPU implementation
with tf.device('/cpu:0'):
# Look up embeddings for inputs.
embeddings = tf.Variable(
tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
# Construct the variables for the NCE loss
nce_weights = tf.Variable(
tf.truncated_normal([vocabulary_size, embedding_size],
stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]),dtype=tf.float32)
# Compute the average NCE loss for the batch.
# tf.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
loss = tf.reduce_mean(
tf.nn.nce_loss(weights=nce_weights,biases=nce_biases, inputs=embed, labels=train_labels,
num_sampled=num_sampled, num_classes=vocabulary_size))
# Construct the SGD optimizer using a learning rate of 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
# Add variable initializer.
init = tf.global_variables_initializer()
# Step 5: Begin training.
num_steps = 20000
with tf.Session(graph=graph) as session:
# We must initialize all variables before we use them.
init.run()
#print("Initialized")
average_loss = 0
for step in xrange(num_steps):
batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
# We perform one update step by evaluating the optimizer op (including it
# in the list of returned values for session.run()
_, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
average_loss += loss_val
#if step % 2000 == 0:
# if step > 0:
# average_loss /= 2000
# The average loss is an estimate of the loss over the last 2000 batches.
# print("Average loss at step ", step, ": ", average_loss)
#average_loss = 0
final_embeddings = normalized_embeddings.eval()
np.savetxt('data/w2v.txt', final_embeddings)
You may want to look at the implementation of _save_word2vec_format() in gensim for an example of Python code which writes that format:
https://github.com/RaRe-Technologies/gensim/blob/e859c11f6f57bf3c883a718a9ab7067ac0c2d4cf/gensim/models/utils_any2vec.py#L104
def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None):
"""Store the input-hidden weight matrix in the same format used by the original
C word2vec-tool, for compatibility.
Parameters
----------
fname : str
The file path used to save the vectors in.
vocab : dict
The vocabulary of words.
vectors : numpy.array
The vectors to be stored.
fvocab : str, optional
File path used to save the vocabulary.
binary : bool, optional
If True, the data will be saved in binary word2vec format, else it will be saved in plain text.
total_vec : int, optional
Explicitly specify total number of vectors
(in case word vectors are appended with document vectors afterwards).
"""
if not (vocab or vectors):
raise RuntimeError("no input")
if total_vec is None:
total_vec = len(vocab)
vector_size = vectors.shape[1]
if fvocab is not None:
logger.info("storing vocabulary in %s", fvocab)
with utils.open(fvocab, 'wb') as vout:
for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
assert (len(vocab), vector_size) == vectors.shape
with utils.open(fname, 'wb') as fout:
fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
# store in sorted order: most frequent words at the top
for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
row = vectors[vocab_.index]
if binary:
row = row.astype(REAL)
fout.write(utils.to_utf8(word) + b" " + row.tostring())
else:
fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))
I have used DataParallel in my script for multiple GPUs. torch.nn.DataParallel is only using the GPU with id 0 and not utilizing the GPU with id 1, so utilization is very low. Here is the full code:
from __future__ import print_function
from miscc.utils import mkdir_p
from miscc.utils import build_super_images
from miscc.losses import sent_loss, words_loss
from miscc.config import cfg, cfg_from_file
from datasets import TextDataset
from datasets import prepare_data
from model import RNN_ENCODER, CNN_ENCODER
import os
import sys
import time
import random
import pprint
import datetime
import dateutil.tz
import argparse
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
dir_path = (os.path.abspath(os.path.join(os.path.realpath(__file__), './.')))
sys.path.append(dir_path)
UPDATE_INTERVAL = 200
def parse_args():
parser = argparse.ArgumentParser(description='Train a DAMSM network')
parser.add_argument('--cfg', dest='cfg_file',
help='optional config file',
default='cfg/bird.yml', type=str)
parser.add_argument('--gpu', dest='gpu_id', type=int, default=0)
parser.add_argument('--data_dir', dest='data_dir', type=str, default='')
parser.add_argument('--manualSeed', type=int, help='manual seed')
args = parser.parse_args()
return args
def train(dataloader, cnn_model, rnn_model, batch_size, labels, optimizer, epoch, ixtoword, image_dir):
torch.nn.DataParallel(cnn_model.train(),device_ids=[0,1],dim=1).cuda()
torch.nn.DataParallel(rnn_model.train(),device_ids=[0,1],dim=1).cuda()
s_total_loss0 = 0
s_total_loss1 = 0
w_total_loss0 = 0
w_total_loss1 = 0
count = (epoch + 1) * len(dataloader)
start_time = time.time()
for step, data in enumerate(dataloader, 0):
# print('step', step)
rnn_model.zero_grad()
cnn_model.zero_grad()
imgs, captions, cap_lens, \
class_ids, keys = prepare_data(data)
# words_features: batch_size x nef x 17 x 17
# sent_code: batch_size x nef
words_features, sent_code = cnn_model(imgs[-1])
# --> batch_size x nef x 17*17
nef, att_sze = words_features.size(1), words_features.size(2)
# words_features = words_features.view(batch_size, nef, -1)
hidden = rnn_model.init_hidden(batch_size)
# words_emb: batch_size x nef x seq_len
# sent_emb: batch_size x nef
words_emb, sent_emb = rnn_model(captions, cap_lens, hidden)
w_loss0, w_loss1, attn_maps = words_loss(words_features, words_emb, labels,
cap_lens, class_ids, batch_size)
w_total_loss0 += w_loss0.data
w_total_loss1 += w_loss1.data
loss = w_loss0 + w_loss1
s_loss0, s_loss1 = \
sent_loss(sent_code, sent_emb, labels, class_ids, batch_size)
loss += s_loss0 + s_loss1
s_total_loss0 += s_loss0.data
s_total_loss1 += s_loss1.data
#
loss.backward()
#
# `clip_grad_norm` helps prevent
# the exploding gradient problem in RNNs / LSTMs.
torch.nn.utils.clip_grad_norm(rnn_model.parameters(),
cfg.TRAIN.RNN_GRAD_CLIP)
optimizer.step()
if step % UPDATE_INTERVAL == 0:
count = epoch * len(dataloader) + step
s_cur_loss0 = s_total_loss0[0] / UPDATE_INTERVAL
s_cur_loss1 = s_total_loss1[0] / UPDATE_INTERVAL
w_cur_loss0 = w_total_loss0[0] / UPDATE_INTERVAL
w_cur_loss1 = w_total_loss1[0] / UPDATE_INTERVAL
elapsed = time.time() - start_time
print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
's_loss {:5.2f} {:5.2f} | '
'w_loss {:5.2f} {:5.2f}'
.format(epoch, step, len(dataloader),
elapsed * 1000. / UPDATE_INTERVAL,
s_cur_loss0, s_cur_loss1,
w_cur_loss0, w_cur_loss1))
s_total_loss0 = 0
s_total_loss1 = 0
w_total_loss0 = 0
w_total_loss1 = 0
start_time = time.time()
# attention Maps
img_set, _ = \
build_super_images(imgs[-1].cpu(), captions,
ixtoword, attn_maps, att_sze)
if img_set is not None:
im = Image.fromarray(img_set)
fullpath = '%s/attention_maps%d.png' % (image_dir, step)
im.save(fullpath)
return count
def evaluate(dataloader, cnn_model, rnn_model, batch_size):
cnn_model.eval().cuda()
rnn_model.eval().cuda()
s_total_loss = 0
w_total_loss = 0
for step, data in enumerate(dataloader, 0):
real_imgs, captions, cap_lens, \
class_ids, keys = prepare_data(data)
words_features, sent_code = cnn_model(real_imgs[-1])
# nef = words_features.size(1)
# words_features = words_features.view(batch_size, nef, -1)
hidden = rnn_model.init_hidden(batch_size)
words_emb, sent_emb = rnn_model(captions, cap_lens, hidden)
w_loss0, w_loss1, attn = words_loss(words_features, words_emb, labels,
cap_lens, class_ids, batch_size)
w_total_loss += (w_loss0 + w_loss1).data
s_loss0, s_loss1 = \
sent_loss(sent_code, sent_emb, labels, class_ids, batch_size)
s_total_loss += (s_loss0 + s_loss1).data
if step == 50:
break
s_cur_loss = s_total_loss[0] / step
w_cur_loss = w_total_loss[0] / step
return s_cur_loss, w_cur_loss
def build_models():
# build model ############################################################
text_encoder = RNN_ENCODER(dataset.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
image_encoder = CNN_ENCODER(cfg.TEXT.EMBEDDING_DIM)
labels = Variable(torch.LongTensor(range(batch_size)))
start_epoch = 0
if cfg.TRAIN.NET_E != '':
state_dict = torch.load(cfg.TRAIN.NET_E)
text_encoder.load_state_dict(state_dict)
print('Load ', cfg.TRAIN.NET_E)
#
name = cfg.TRAIN.NET_E.replace('text_encoder', 'image_encoder')
state_dict = torch.load(name)
image_encoder.load_state_dict(state_dict)
print('Load ', name)
istart = cfg.TRAIN.NET_E.rfind('_') + 8
iend = cfg.TRAIN.NET_E.rfind('.')
start_epoch = cfg.TRAIN.NET_E[istart:iend]
start_epoch = int(start_epoch) + 1
print('start_epoch', start_epoch)
if cfg.CUDA:
text_encoder = text_encoder.cuda()#torch.nn.DataParallel(text_encoder,device_ids=[0,1],dim=1)
image_encoder = image_encoder.cuda()#torch.nn.DataParallel(image_encoder,device_ids=[0,1],dim=1)
labels = labels.cuda()#torch.nn.DataParallel(labels,device_ids=[0,1],dim=1)
return text_encoder, image_encoder, labels, start_epoch
if __name__ == "__main__":
args = parse_args()
if args.cfg_file is not None:
cfg_from_file(args.cfg_file)
if args.gpu_id == -1:
cfg.CUDA = False
else:
cfg.GPU_ID =args.gpu_id
if args.data_dir != '':
cfg.DATA_DIR = args.data_dir
print('Using config:')
pprint.pprint(cfg)
if not cfg.TRAIN.FLAG:
args.manualSeed = 100
elif args.manualSeed is None:
args.manualSeed = random.randint(1, 10000)
random.seed(args.manualSeed)
np.random.seed(args.manualSeed)
torch.manual_seed(args.manualSeed)
if cfg.CUDA:
torch.cuda.manual_seed_all(args.manualSeed)
##########################################################################
now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
output_dir = '../output/%s_%s_%s' % \
(cfg.DATASET_NAME, cfg.CONFIG_NAME, timestamp)
model_dir = os.path.join(output_dir, 'Model')
image_dir = os.path.join(output_dir, 'Image')
mkdir_p(model_dir)
mkdir_p(image_dir)
#torch.cuda.set_device()
cudnn.benchmark = True
# Get data loader ##################################################
imsize = cfg.TREE.BASE_SIZE * (2 ** (cfg.TREE.BRANCH_NUM-1))
batch_size = cfg.TRAIN.BATCH_SIZE
image_transform = transforms.Compose([
transforms.Scale(int(imsize * 76 / 64)),
transforms.RandomCrop(imsize),
transforms.RandomHorizontalFlip()])
dataset = TextDataset(cfg.DATA_DIR, 'train',
base_size=cfg.TREE.BASE_SIZE,
transform=image_transform)
print(dataset.n_words, dataset.embeddings_num)
assert dataset
dataloader = torch.utils.data.DataLoader(
dataset, batch_size=batch_size, drop_last=True,
shuffle=True, num_workers=int(cfg.WORKERS))
# # validation data #
dataset_val = TextDataset(cfg.DATA_DIR, 'test',
base_size=cfg.TREE.BASE_SIZE,
transform=image_transform)
dataloader_val = torch.utils.data.DataLoader(
dataset_val, batch_size=batch_size, drop_last=True,
shuffle=True, num_workers=int(cfg.WORKERS))
# Train ##############################################################
text_encoder, image_encoder, labels, start_epoch = build_models()
para = list(text_encoder.parameters())
for v in image_encoder.parameters():
if v.requires_grad:
para.append(v)
# optimizer = optim.Adam(para, lr=cfg.TRAIN.ENCODER_LR, betas=(0.5, 0.999))
# At any point you can hit Ctrl + C to break out of training early.
try:
lr = cfg.TRAIN.ENCODER_LR
for epoch in range(start_epoch, cfg.TRAIN.MAX_EPOCH):
optimizer = optim.Adam(para, lr=lr, betas=(0.5, 0.999))
epoch_start_time = time.time()
count = torch.nn.DataParallel(train(dataloader, image_encoder, text_encoder,
batch_size, labels, optimizer, epoch,
dataset.ixtoword, image_dir),device_ids=[0,1],dim=1).cuda()
print('-' * 89)
if len(dataloader_val) > 0:
s_loss, w_loss = torch.nn.DataParallel(evaluate(dataloader_val, image_encoder,
text_encoder, batch_size)).cuda()
print('| end epoch {:3d} | valid loss '
'{:5.2f} {:5.2f} | lr {:.5f}|'
.format(epoch, s_loss, w_loss, lr))
print('-' * 89)
if lr > cfg.TRAIN.ENCODER_LR/10.:
lr *= 0.98
if (epoch % cfg.TRAIN.SNAPSHOT_INTERVAL == 0 or
epoch == cfg.TRAIN.MAX_EPOCH):
torch.save(image_encoder.state_dict(),
'%s/image_encoder%d.pth' % (model_dir, epoch))
torch.save(text_encoder.state_dict(),
'%s/text_encoder%d.pth' % (model_dir, epoch))
print('Save G/Ds models.')
except KeyboardInterrupt:
print('-' * 89)
print('Exiting from training early')
I have read various articles regarding DataParallel and according to them this should have worked. Can somebody help me find the solution to this problem?
You must use torch.nn.DataParallel only once, that is, immediately after creating the instances of the classes RNN_ENCODER and CNN_ENCODER. Your mistakes are as follows:
Every time the train method is called, the instance of RNN_ENCODER (rnn_model in the train method) and the instance of CNN_ENCODER (cnn_model in the train method) are wrapped by torch.nn.DataParallel. This is wrong. You have to do it only once, right after instantiating them in the build_models method. That is what replicates your model across multiple GPUs for parallel execution; wrapping the instances again and again on every epoch (whenever train is called) does not help PyTorch parallelize the computation.
The output of the train method is count, which is an integer variable. Wrapping the output of the train method in torch.nn.DataParallel will not give you data parallelism.
I suggest you visit the official PyTorch documentation at this link for a better understanding of how to use DataParallel; a sketch of a corrected build_models follows below.
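To make this concrete, here is a sketch of a corrected build_models, under the assumption that both GPUs are visible and cfg.CUDA is true; the state-dict loading block from the original is elided. Each encoder is wrapped exactly once, right after it is created (and after any weights are loaded), and train/evaluate then use the wrapped models without re-wrapping anything:
def build_models():
    # build the encoders exactly as before
    text_encoder = RNN_ENCODER(dataset.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
    image_encoder = CNN_ENCODER(cfg.TEXT.EMBEDDING_DIM)
    labels = Variable(torch.LongTensor(range(batch_size)))
    start_epoch = 0
    # ... load state dicts from cfg.TRAIN.NET_E here, exactly as in the original ...
    if cfg.CUDA:
        # wrap each nn.Module once; by default DataParallel scatters dim 0 (the batch dim),
        # so pass dim= only if a model's batch dimension is not the first one
        text_encoder = torch.nn.DataParallel(text_encoder, device_ids=[0, 1]).cuda()
        image_encoder = torch.nn.DataParallel(image_encoder, device_ids=[0, 1]).cuda()
        labels = labels.cuda()  # labels is a tensor, not a module: do not wrap it
    return text_encoder, image_encoder, labels, start_epoch
Inside train, just call rnn_model.train() and cnn_model.train() (and .eval() in evaluate) on the wrapped models; do not wrap the integer count returned by train or the tuple returned by evaluate. One caveat: once a module is wrapped, custom methods such as init_hidden have to be reached through the wrapper's .module attribute, e.g. rnn_model.module.init_hidden(batch_size).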
I started a very simple RNN project to solidify my knowledge of TF: basically a simple sequence generator using LSTMs and TF. The project is just many-to-one sequence generation; the input is a 4-integer window and the output is a single float for each window. The minimum input value is 1 and the maximum is 61, so I can predict from 61 onward. I just used one batch of all inputs, which has shape [58, 4, 1], with outputs of shape [58, 1]. For better visualization, the inputs and outputs are written below.
Inputs Labels
[[[ 1],[ 2],[ 3],[ 4]], -------> [[0.0493],
[[ 2],[ 3],[ 4],[ 5]], -------> [0.0634],
[[ 3],[ 4],[ 5],[ 6]], -------> [0.0773],
[[ 4],[ 5],[ 6],[ 7]], -------> [0.0909],
.. .. .. .. -------> ... ,
[[55],[56],[57],[58]], -------> [0.5503],
[[56],[57],[58],[59]], -------> [0.5567],
[[57],[58],[59],[60]], -------> [0.5630],
[[58],[59],[60],[61]]] -------> [0.5693]]
The training part went very well and I achieved an accuracy of around 0.991 after 500 epochs, but when I try to predict values from 61 to 118, the output has a fixed step down for all predicted values, although it somehow follows the right behavior.
Because the purpose of this project is just learning the basics, I decided to use the simplest functions in TF, so the seq2seq facilities were left out. The code for the RNN is written below.
def build_lstm(cell_lengh, cell_depth, batch_size, keep_prob):
def lstm_row(cell_length, keep_prob):
cell_row = tf.contrib.rnn.BasicLSTMCell(cell_lengh)
cell_row = tf.contrib.rnn.DropoutWrapper(cell_row, keep_prob)
return cell_row
cell = tf.contrib.rnn.MultiRNNCell([lstm_row(cell_lengh, keep_prob) for _ in range(cell_depth)])
initial_state = cell.zero_state(batch_size, tf.float32)
return cell, initial_state
tf.reset_default_graph()
inputs = tf.placeholder(tf.float32, [None, feature_length, 1], name='inputs')
labels = tf.placeholder(tf.float32, [None, output_length], name='labels')
keep_prob = tf.placeholder(tf.float32, name='kpprob')
lstm_cell, initial_state = build_lstm(40, 2, batch_size=batch_size, keep_prob=keep_prob)
lstm_output, final_state = tf.nn.dynamic_rnn(lstm_cell, inputs, initial_state=initial_state)
lstm_outout_seq = lstm_output[:,-1,:]
dense_0 = tf.layers.dense(inputs=lstm_outout_seq, units=120, activation=tf.nn.relu)
dropout_0 = tf.layers.dropout(dense_0, rate=0.7)
with tf.variable_scope('sigmoid'):
W = tf.Variable(tf.truncated_normal((120, 1), stddev=0.1), name='weights')
b = tf.Variable(tf.zeros(1), name='bias')
logits = tf.matmul(dropout_0, W) + b
output = tf.nn.sigmoid(logits, name='output')
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
correct_predictions = tf.abs(output - labels)
total_correct = tf.ones_like(correct_predictions)
accuracy = tf.reduce_mean(total_correct - correct_predictions)
learning_rate = tf.placeholder(tf.float32, name='learning_rate')
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
l_rate = 0.001
epochs = 500
kp_prob = 0.7
with tf.Session() as session:
session.run(tf.global_variables_initializer())
for e in range(epochs):
new_state = session.run([initial_state])
feeder = {
inputs: wnd_x,
labels: wnd_y_scl,
keep_prob: kp_prob,
learning_rate: l_rate,
initial_state: new_state
}
session_loss, session_accuracy, session_output, _, last_state = session.run(
    [loss, accuracy, output, optimizer, final_state], feed_dict=feeder)
print('Epoch {0}/{1}:\t'.format(e, epochs),
'training loss {0}\t'.format(session_loss),
'accuracy {0}\t'.format(session_accuracy))
new_state = session.run([initial_state])
feeder = {
inputs: unseen_data_rsp,
keep_prob: 1.0,
initial_state: new_state
}
session_output = session.run([output], feed_dict=feeder)
As mentioned before, during the inference phase the predictions have a fixed step down but somehow show the right behavior, i.e. the derivative of the curve changes correctly across time-steps.
During the training phase I have the following output:
Epoch 999/1000: training loss = 0.5913468599319458 | accuracy = 0.9909629225730896
Input Label Output
[[ 1],[ 2],[ 3],[ 4]] --> [0.0493] ... [0.0591]
[[ 2],[ 3],[ 4],[ 5]] --> [0.0634] ... [0.0802]
[[ 3],[ 4],[ 5],[ 6]] --> [0.0773] ... [0.0777]
[[ 4],[ 5],[ 6],[ 7]] --> [0.0909] ... [0.1035]
.. .. .. .. ... ... ...
[[55],[56],[57],[58]] --> [0.5503] ... [0.5609]
[[56],[57],[58],[59]] --> [0.5567] ... [0.5465]
[[57],[58],[59],[60]] --> [0.5630] ... [0.5543]
[[58],[59],[60],[61]] --> [0.5693] ... [0.5614]
And during inference phase I have the following output:
Input Prediction
[[ 58],[ 59],[ 60],[ 61]] --> [0.4408]
[[ 59],[ 60],[ 61],[ 62]] --> [0.4459]
[[ 60],[ 61],[ 62],[ 63]] --> [0.4510]
[[ 61],[ 62],[ 63],[ 64]] --> [0.4559]
... ... ... ... ... ...
[[112],[113],[114],[115]] --> [0.6089]
[[113],[114],[115],[116]] --> [0.6101]
[[114],[115],[116],[117]] --> [0.6113]
[[115],[116],[117],[118]] --> [0.6124]
As you can see, the first input of the inference is the same as the last input of the training. What I don't understand here is why the same input gives me two different outputs, and why these outputs have a fixed step down of around 0.11. Thank you guys for any help, and sorry for the long text; I can make it shorter upon request.
During inference you are resetting the state, so you are getting two different values for the same input because the state of the network is different in the two cases.
To keep the state after a prediction you would need to do something like this:
# repeat this for every prediction step, feeding the previous state back in
feeder = {
    inputs: unseen_data_rsp,
    keep_prob: 1.0,
    initial_state: last_state
}
session_output, last_state = session.run([output, final_state], feed_dict=feeder)
Also, to get exactly the training result for the first input of inference, you would need to first present all the training examples, to ensure that you start inference with the correct state (see the sketch below). Another approach would be to save the state of the network, which you could then reuse during prediction.
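As an illustrative sketch of the first suggestion (names reused from the question's script; it assumes the unseen windows have the same batch size as the training windows, since the LSTM state tensors carry a batch dimension): run the training windows through the network once more without the optimizer op, keep the final_state it returns, and start inference from that state.
# warm the state up on the training windows (no optimizer op);
# initial_state is not fed here, so it evaluates to the zero state from cell.zero_state
warm_feeder = {inputs: wnd_x, keep_prob: 1.0}
_, last_state = session.run([output, final_state], feed_dict=warm_feeder)

# inference now starts from the state left by the training windows
feeder = {
    inputs: unseen_data_rsp,
    keep_prob: 1.0,
    initial_state: last_state
}
session_output, last_state = session.run([output, final_state], feed_dict=feeder)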
I am trying to use an LSTM recurrent neural net in Keras to forecast future purchases. My input variables are a time window of purchases for the previous 5 days, and a categorical variable which I encoded as dummy variables A, B, ..., I. My input data looks like the following:
>>> dataframe.head()
day price A B C D E F G H I TS_bigHolidays
0 2015-06-16 7.031160 1 0 0 0 0 0 0 0 0 0
1 2015-06-17 10.732429 1 0 0 0 0 0 0 0 0 0
2 2015-06-18 21.312692 1 0 0 0 0 0 0 0 0 0
My problem is that my forecasts/fitted values (for both training and test data) seem to be shifted forward. Here is a plot:
My question is: which parameter of the Keras LSTM should I change to correct this issue? Or do I need to change anything in my input data?
Here is my code:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas
import math
import time
import csv
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from sklearn.preprocessing import MinMaxScaler
np.random.seed(1234)
exo_feature = ["A","B","C","D","E","F","G","H","I", "TS_bigHolidays"]
look_back = 5 #this is number of days we are looking back for sliding window of time series
forecast_period_length = 40
# load the dataset
dataframe = pandas.read_csv('processedDataframeGameSphere.csv', header = 0, engine='python', skipfooter=6)
dataframe["price"] = dataframe['price'].astype('float32')
scaler = MinMaxScaler(feature_range=(0, 100))
dataframe["price"] = scaler.fit_transform(dataframe['price'])
# this function is used to make sliding window for time series data
def create_dataframe(dataframe, look_back=1):
dataX, dataY = [], []
for i in range(dataframe.shape[0]-look_back-1):
price_lookback = dataframe['price'][i: (i + look_back)] #i+look_back is exclusive here
exog_feature = dataframe[exo_feature].ix[i + look_back - 1] #Y is i+ look_back ,that's why
row_i = price_lookback.append(exog_feature)
dataX.append(row_i)
dataY.append(dataframe["price"][i + look_back])
return np.array(dataX), np.array(dataY)
window_dataframe, Y = create_dataframe(dataframe, look_back)
# split into train and test sets
train_size = int(dataframe.shape[0] - forecast_period_length) #28 is the number of days we want to forecast , 4 weeks
test_size = dataframe.shape[0] - train_size
test_size_start_point_with_lookback = train_size - look_back
trainX, trainY = window_dataframe[0:train_size,:], Y[0:train_size]
print(trainX.shape)
print(trainY.shape)
#below changed datawindowY indexing, since it's just array.
testX, testY = window_dataframe[train_size:dataframe.shape[0],:], Y[train_size:dataframe.shape[0]]
# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
print(trainX.shape)
print(testX.shape)
# create and fit the LSTM network
dimension_input = testX.shape[2]
model = Sequential()
layers = [dimension_input, 50, 100, 1]
epochs = 100
model.add(LSTM(
input_dim=layers[0],
output_dim=layers[1],
return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(
layers[2],
return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(
output_dim=layers[3]))
model.add(Activation("linear"))
start = time.time()
model.compile(loss="mse", optimizer="rmsprop")
print "Compilation Time : ", time.time() - start
model.fit(
trainX, trainY,
batch_size= 10, nb_epoch=epochs, validation_split=0.05,verbose =2)
# Estimate model performance
trainScore = model.evaluate(trainX, trainY, verbose=0)
trainScore = math.sqrt(trainScore)
trainScore = scaler.inverse_transform(np.array([[trainScore]]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = model.evaluate(testX, testY, verbose=0)
testScore = math.sqrt(testScore)
testScore = scaler.inverse_transform(np.array([[testScore]]))
print('Test Score: %.2f RMSE' % (testScore))
# generate predictions for training
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# shift train predictions for plotting
np_price = np.array(dataframe["price"])
print(np_price.shape)
np_price = np_price.reshape(np_price.shape[0],1)
trainPredictPlot = np.empty_like(np_price)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(trainPredict)+look_back, :] = trainPredict
testPredictPlot = np.empty_like(np_price)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(trainPredict)+look_back+1:dataframe.shape[0], :] = testPredict
# plot baseline and predictions
plt.plot(dataframe["price"])
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()
It's not a problem with the LSTM; if you use just a simple feed-forward network, the effect will be the same.
The problem is that the network tends to mimic yesterday's value instead of 'forecasting' as you expect
(which is a nice strategy in terms of reducing the MSE loss).
You need more 'care' to avoid this issue, and it's not a simple one.
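One practical way to act on that advice (an illustrative sketch, not part of the original answer): first measure the naive "copy yesterday's value" baseline, so you know whether the LSTM is actually beating a lagged copy of the series, and if it is not, train on the one-step differences of the (scaled) price series instead of the level, then integrate the predictions back.
import numpy as np

def persistence_rmse(series):
    # RMSE of the naive forecast price[t] = price[t-1]; the score a real forecaster has to beat
    series = np.asarray(series, dtype='float32')
    return float(np.sqrt(np.mean((series[1:] - series[:-1]) ** 2)))

def to_differences(series):
    # one-step differences price[t] - price[t-1]; train the model on these instead of levels
    series = np.asarray(series, dtype='float32')
    return series[1:] - series[:-1]

def from_differences(first_value, diffs):
    # rebuild the level series (from index 1 onwards) from predicted differences
    return first_value + np.cumsum(diffs)
For example, persistence_rmse(dataframe['price']) gives the baseline RMSE on the scaled series; a model that forecasts rather than copies should land clearly below it.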