Unable to write gradient step in theano for rnn - machine-learning

I have following code in which I convert words to one hot vectors and do a gradient descent in theano using rnn for predicting next words given a sequence of words(basically a language model).
# coding: utf-8
# In[68]:
#Importing stuff
import theano
import theano.tensor as T
import numpy as np
# In[69]:
import nltk
import sys
import operator
import csv
import itertools
from utils import *
from datetime import datetime
# In[70]:
#Fixing vocabulary size for one hot vectors and some initialization stuff
v_size = 8000
unknown_token = "UNKNOWN_TOKEN"
start_token = "<s>"
end_token = "</s>"
# In[71]:
#Read data and start preprocessing
with open('reddit-comments-2015-08.csv','rb') as f:
reader = csv.reader(f, skipinitialspace=True)
reader.next()
sentences = list(itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8')) for x in reader]))
print len(sentences)
# In[72]:
#Tokenize the sentences and add start and end tokens
tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
tokenized_sentences = [[start_token] + s + [end_token] for s in tokenized_sentences]
# In[73]:
#Get word frequencies and use only most frequent words in vocabulary
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
vocab = word_freq.most_common(v_size-1)
# In[74]:
#Do mapping and reverse mapping
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = {w:i for i,w in enumerate(index_to_word)}
#Removing less frequent words
for i, s in enumerate(tokenized_sentences):
tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in s]
#Got vectors but they are not one hot
X_train = np.asarray([[word_to_index[w] for w in s[:-1]] for s in tokenized_sentences])
Y_train = np.asarray([[word_to_index[w] for w in s[1:]] for s in tokenized_sentences])
#Preprocessing ends here
# In[75]:
#Take only one sentence for now
X_train = X_train[0]
Y_train = Y_train[0]
# In[76]:
#Make input and output as onehot vectors. This can easily be replaced with vectors generated by word2vec.
X_train_onehot = np.eye(v_size)[X_train]
X = theano.shared(np.array(X_train_onehot).astype('float32'), name = 'X')
Y_train_onehot = np.eye(v_size)[Y_train]
Y = theano.shared(np.array(Y_train_onehot).astype('float32'), name = 'Y')
# In[77]:
#Initializing U, V and W
i_dim = v_size
h_dim = 100
o_dim = v_size
U = theano.shared(np.random.randn(i_dim, h_dim).astype('float32'), name = 'U')
W = theano.shared(np.random.randn(h_dim, h_dim).astype('float32'), name = 'W')
V = theano.shared(np.random.randn(h_dim, o_dim).astype('float32'), name = 'V')
# In[78]:
#forward propagation
s = T.vector('s')
results, updates = theano.scan(lambda x, sm1: T.tanh( T.dot(x, U) + T.dot(sm1, W)),
sequences = X_train_onehot,
outputs_info = s
)
y_hat = T.dot(results, V)
forward_propagation = theano.function(inputs=[s], outputs = y_hat)
# In[80]:
#loss
loss = T.sum(T.nnet.categorical_crossentropy(y_hat, Y))
# In[81]:
#Gradients
dw = T.grad(loss, W)
du = T.grad(loss, U)
dv = T.grad(loss, V)
# In[82]:
#BPTT
learning_rate = T.scalar('learning_rate')
gradient_step = theano.function(inputs = [s, learning_rate],
updates = (
(U, U - learning_rate * du),
(V, V - learning_rate * dv),
(W, W - learning_rate * dw)
)
)
# In[ ]:
But it keeps throwing error at gradient step. I am posting full code because I don't know which step is affecting the error. The following is the screenshot of error in jupyter notebook.

I solved it. The problem is with mismatch of types. I had to typecast du, dv, dw, learning rate to float32. By default, they are float64.

Related

How to deal with big dataset when using pyG?

I am a beginer learning to using torch_geometric to build my GNN models. I refered the sample of the pyG example of node classification and build my own dataset, however, I tried to use my GPU to run the code and it tells me that it run out of memory, maybe my dataset is too large to allocate the GPU memory? I don't know. I shared an machine of 8 A100 with my classmates. Could you please give me some suggestions, thank you!
from torch_geometric.nn import GATConv,GCNConv
from torch_geometric.data import Dataset,DataLoader,HeteroData,Data
import torch.nn as nn
from torch_geometric.nn import DataParallel
from torch_geometric.loader import DataListLoader
import torch.nn.functional as F
import torch
import pandas as pd
from transformers import BertTokenizer,BertModel
import pickle
import time
from tqdm import tqdm
from numba import jit
import json
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
plt.grid(True)
plt.grid(color='gray',
linestyle='--',
linewidth=1,
alpha=0.3)
begin = time.time()
punctuation = "!#$%&'\(\)-*+,-./:;<=>?#\\\[\]^_`{|}~():;,。【】·、“”‘’《》\"%……——·"
def dataCleanifier(s):
for i in punctuation:
s.replace(i," ")
s = s.replace(" "," ")
s = s.replace("\n","")
return s
class BertClassifier(nn.Module):
def __init__(self,bertType:str,max_length,tag_size):
super(BertClassifier,self).__init__()
self.bertType = bertType
self.tokenizer = BertTokenizer.from_pretrained(self.bertType)
self.encoder = BertModel.from_pretrained(self.bertType)
self.outputDim = self.encoder.pooler.dense.out_features
self.max_length = max_length
self.tag_size = tag_size
self.dropout = nn.Dropout(0.1)
self.activation = nn.LeakyReLU(0.1)
self.convs = nn.ModuleList(
[nn.Conv2d(1, 512, (k, self.outputDim)) for k in (2,3,4)])
self.fc_cnn = nn.Linear(512 * len((2,3,4)), self.tag_size)
def conv_and_pool(self, x, conv):
x = F.relu(conv(x)).squeeze(3)
x = F.max_pool1d(x, x.size(2)).squeeze(2)
return x
def forward(self,x):
x = self.tokenizer.batch_encode_plus(x,return_tensors="pt",max_length=self.max_length,truncation=True,padding="max_length")
attention = x["attention_mask"]
x = x["input_ids"]
x = x.cuda(2)
x = self.encoder(x,attention_mask=attention.cuda(2))['last_hidden_state'][:]
x = x.unsqueeze(1)
encoded = torch.cat([self.conv_and_pool(x,conv) for conv in self.convs],1)
x = self.fc_cnn(encoded)
x = self.activation(x)
# x = F.softmax(x,dim=1)
return x,encoded
class ContrastiveLoss(nn.Module):
def __init__(self):
super(ContrastiveLoss, self).__init__()
def forward(self,representations,label,y_hat):
n = label.shape[0]
T = 0.5
similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)
mask = torch.ones_like(similarity_matrix) * (label.expand(n, n).eq(label.expand(n, n).t()))
mask_no_sim = torch.ones_like(mask) - mask
mask_dui_jiao_0 = torch.ones(n ,n) - torch.eye(n, n )
similarity_matrix = torch.exp(similarity_matrix/T)
similarity_matrix = similarity_matrix*mask_dui_jiao_0
sim = mask*similarity_matrix
no_sim = similarity_matrix - sim
no_sim_sum = torch.sum(no_sim , dim=1)
no_sim_sum_expend = no_sim_sum.repeat(n, 1).T
sim_sum = sim + no_sim_sum_expend
loss = torch.div(sim , sim_sum)
loss = mask_no_sim + loss + torch.eye(n, n )
#接下来就是算一个批次中的loss了
loss = -torch.log(loss) #求-log
loss = torch.sum(torch.sum(loss, dim=1) )/(2*n)+nn.CrossEntropyLoss()(y_hat,label)
return loss
class GAT(nn.Module):
def __init__(self, hidden_channels) -> None:
super().__init__()
self.conv1 = GATConv(data.num_features,hidden_channels)
self.conv2 = GATConv(hidden_channels,9)
self.activation = nn.ReLU()
def forward(self,x,edge_index):
x = self.conv1(x,edge_index)
x = self.activation(x)
# print(x)
# x = F.dropout(x,p=0.2)
x = self.conv2(x,edge_index)
return x
x=None
y=None
edge_index = None
train_mask = None
with open("X.pkl","rb") as f1:
x = pickle.load(f1)
with open("Y.pkl","rb") as f2:
y = pickle.load(f2)
y = y.long()
with open("edge_index.pkl","rb") as f3:
edge_index = pickle.load(f3)
# print(edge_index.shape)
with open("train_mask.pkl","rb") as f4:
train_mask = pickle.load(f4)
data = Data(x=x,y=y,edge_index=edge_index)
data.train_mask = train_mask
model = GAT(hidden_channels=32)
model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = lr_scheduler.StepLR(optimizer, 100, 0.8)
criterion = ContrastiveLoss()
def train():
model.train()
optimizer.zero_grad() # Clear gradients.
out = model(data.x,data.edge_index) # Perform a single forward pass.
loss = criterion(data.x[data.train_mask], data.y[data.train_mask],out[data.train_mask]) # Compute the loss solely based on the training nodes.
loss.backward() # Derive gradients.
optimizer.step() # Update parameters based on gradients.
return loss
def test():
model.eval()
out = model(data.x, data.edge_index)
pred = out.argmax(dim=1) # Use the class with highest probability.
test_correct = pred[data.train_mask] == data.y[data.train_mask] # Check against ground-truth labels.
test_acc = int(test_correct.sum()) / int(data.train_mask.sum()) # Derive ratio of correct predictions.
return test_acc
accs = []
for epoch in range(1, 1025):
loss = train()
print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}',end=" ")
acc = test()
print("acc:",acc)
accs.append(acc)
scheduler.step()
plt.plot(range(len(accs)),accs)
print(time.time()-begin)
with open("./accs_gat_GCL.pkl","wb") as f1:
pickle.dump(accs,f1)
plt.savefig("./res_GAT_GCL.png",dpi=600)
I have tried to use DataPararel to use multiple GPU to load my model and dataset but failed.

How does one implement a meta-trainable step size in Pytorch?

I want to implement a (meta) trainable step size. I tried it with this post:
https://discuss.pytorch.org/t/how-does-one-have-the-parameters-of-a-model-not-be-leafs/70076/17
and with the higher library (https://github.com/facebookresearch/higher) with no luck...
I tried:
eta = torch.tensor([0.5], requires_grad=True).view(1)
inner_opt = torch.optim.Adam(child_model.parameters(), lr=eta)
#meta_params = itertools.chain(child_model.parameters(),eta.parameters())
meta_params = itertools.chain(child_model.parameters())
meta_opt = torch.optim.Adam(meta_params, lr=1e-3)
# do meta-training/outer training minimize outerloop: min_{theta} sum_t L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
nb_outer_steps = 10 # note, in this case it's the same as number of meta-train steps (but it's could not be the same depending how you loop through the val set)
for outer_i, (outer_inputs, outer_targets) in enumerate(testloader, 0):
meta_opt.zero_grad()
if outer_i >= nb_outer_steps:
break
# do inner-training/MAML; minimize innerloop: theta^{T} - eta* Grad L^train(theta^{T}) ~ argmin L^train(theta)
nb_inner_steps = 3
with higher.innerloop_ctx(child_model, inner_opt) as (fmodel, diffopt):
with error:
Exception has occurred: RuntimeError
Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment
which wouldn't work anyway cuz eta might become negative suddenly so I really want to cap it with a sigmoid function but had to try something...
It thinks my step size NN is not in the graph but it is because of this line of code:
p_new = p + lr*g
group['params'][p_idx] = p_new
but somehow that is not enough to have gradients...
Full script self contained script:
import torch
import torch.nn as nn
from torch.optim.optimizer import Optimizer
import higher
from higher.optim import DifferentiableOptimizer
from higher.optim import DifferentiableSGD
import torchvision
import torchvision.transforms as transforms
from torchviz import make_dot
import copy
import itertools
from collections import OrderedDict
#mini class to add a flatten layer to the ordered dictionary
class Flatten(nn.Module):
def forward(self, input):
'''
Note that input.size(0) is usually the batch size.
So what it does is that given any input with input.size(0) # of batches,
will flatten to be 1 * nb_elements.
'''
batch_size = input.size(0)
out = input.view(batch_size,-1)
return out # (batch_size, *size)
def get_cifar10():
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
shuffle=False, num_workers=2)
return trainloader, testloader
class MySGD(Optimizer):
def __init__(self, params, eta, prev_lr):
defaults = {'eta':eta, 'prev_lr':prev_lr}
super().__init__(params, defaults)
class TrainableSGD(DifferentiableOptimizer):
def _update(self, grouped_grads, **kwargs):
prev_lr = self.param_groups[0]['prev_lr']
eta = self.param_groups[0]['eta']
# start differentiable & trainable update
zipped = zip(self.param_groups, grouped_grads)
lr = 0.1*eta(prev_lr).view(1)
for group_idx, (group, grads) in enumerate(zipped):
for p_idx, (p, g) in enumerate(zip(group['params'], grads)):
if g is None:
continue
#group['params'][p_idx] = _add(p, -group['lr'], g)
p_new = p + lr*g
group['params'][p_idx] = p_new
# fake returns
self.param_groups[0]['prev_lr'] = lr
higher.register_optim(MySGD, TrainableSGD)
def main():
# get dataloaders
trainloader, testloader = get_cifar10()
criterion = nn.CrossEntropyLoss()
child_model = nn.Sequential(OrderedDict([
('conv1', nn.Conv2d(in_channels=3,out_channels=2,kernel_size=5)),
('relu1', nn.ReLU()),
('Flatten', Flatten()),
('fc', nn.Linear(in_features=28*28*2,out_features=10) )
]))
hidden = torch.randn(size=(1,1),requires_grad=True)
print(f'-> hidden = {hidden}')
eta = nn.Sequential(OrderedDict([
('fc', nn.Linear(1,1)),
('sigmoid', nn.Sigmoid())
]))
inner_opt = MySGD(child_model.parameters(), eta=eta, prev_lr=hidden)
meta_params = itertools.chain(child_model.parameters(),eta.parameters())
#meta_params = itertools.chain(eta.parameters(),[hidden])
meta_opt = torch.optim.Adam(meta_params, lr=1e-3)
# do meta-training/outer training minimize outerloop: min_{theta} sum_t L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
print()
nb_outer_steps = 1 # note, in this case it's the same as number of meta-train steps (but it's could not be the same depending how you loop through the val set)
for outer_i, (outer_inputs, outer_targets) in enumerate(testloader, 0):
meta_opt.zero_grad()
if outer_i >= nb_outer_steps:
break
# do inner-training/MAML; minimize innerloop: theta^{T} - eta * Grad L^train(theta^{T}) ~ argmin L^train(theta)
nb_inner_steps = 3
#with higher.innerloop_ctx(child_model, inner_opt, copy_initial_weights=False) as (fmodel, diffopt):
with higher.innerloop_ctx(child_model, inner_opt) as (fmodel, diffopt):
for inner_i, (inner_inputs, inner_targets) in enumerate(trainloader, 0):
if inner_i >= nb_inner_steps:
break
logits = fmodel(inner_inputs)
inner_loss = criterion(logits, inner_targets)
print(f'--> inner_i = {inner_i}')
print(f'inner_loss^<{inner_i}>: {inner_loss}')
print(f'lr^<{inner_i-1}> = {diffopt.param_groups[0]["prev_lr"]}')
diffopt.step(inner_loss) # changes params P[t+1] using P[t] and loss[t] in a differentiable manner
print(f'lr^<{inner_i}> = {diffopt.param_groups[0]["prev_lr"]}')
print()
# compute the meta-loss L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
outer_outputs = fmodel(outer_inputs)
meta_loss = criterion(outer_outputs, outer_targets) # L^val
make_dot(meta_loss).render('meta_loss',format='png')
meta_loss.backward()
#grad_of_grads = torch.autograd.grad(outputs=meta_loss, inputs=eta.parameters()) # dmeta_loss/dw0
print(f'----> outer_i = {outer_i}')
print(f'-> outer_loss/meta_loss^<{outer_i}>: {meta_loss}')
print(f'child_model.fc.weight.grad = {child_model.fc.weight.grad}')
print(f'hidden.grad = {hidden.grad}')
print(f'eta.fc.weight = {eta.fc.weight.grad}')
meta_opt.step() # meta-optimizer step: more or less theta^<t> := theta^<t> - meta_eta * Grad L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
if __name__ == "__main__":
main()
print('---> Done\a')
notice the None's:
Files already downloaded and verifiedFiles already downloaded and verified
-> hidden = tensor([[0.8459]], requires_grad=True)
--> inner_i = 0
inner_loss^<0>: 2.2696359157562256
lr^<-1> = tensor([[0.8459]], requires_grad=True)
lr^<0> = tensor([0.0567], grad_fn=)
--> inner_i = 1
inner_loss^<1>: 2.0114920139312744
lr^<0> = tensor([0.0567], grad_fn=)
lr^<1> = tensor([0.0720], grad_fn=)
--> inner_i = 2
inner_loss^<2>: 2.3866422176361084
lr^<1> = tensor([0.0720], grad_fn=)
lr^<2> = tensor([0.0717], grad_fn=)
----> outer_i = 0
-> outer_loss/meta_loss^<0>: 4.021303176879883
child_model.fc.weight.grad = None
hidden.grad = None
eta.fc.weight = None
---> Done
related:
pytorch forum: https://discuss.pytorch.org/t/implement-a-meta-trainable-step-size/70396
gitissue: https://github.com/facebookresearch/higher/issues/32
related SO Q: How does one have parameters in a pytorch model not be leafs and be in the computation graph?

How to save self-trained word2vec to a txt file with format like 'word2vec-google-news' or 'glove.6b.50d'

I wonder that how can I save a self-trained word2vec to txt file with the format like 'word2vec-google-news' or 'glove.6b.50d' which has the tokens followed by matched vectors.
I export my self-trained vectors to txt file which only has vectors but no tokens in the front of those vectors.
My code for training my own word2vec:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import random
import numpy as np
from six.moves import xrange
import zipfile
import tensorflow as tf
import pandas as pd
filename = ('data/data.zip')
# Step 1: Read the data into a list of strings.
def read_data(filename):
with zipfile.ZipFile(filename) as f:
data = tf.compat.as_str(f.read(f.namelist()[0])).split()
return data
words = read_data(filename)
#print('Data size', len(words))
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000
def build_dataset(words):
count = [['UNK', -1]]
count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
#print("count",len(count))
dictionary = dict()
for word, _ in count:
dictionary[word] = len(dictionary)
data = list()
unk_count = 0
for word in words:
if word in dictionary:
index = dictionary[word]
else:
index = 0
unk_count += 1
data.append(index)
count[0][1] = unk_count
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
#del words # Hint to reduce memory.
#print('Most common words (+UNK)', count[:5])
#print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
data_index = 0
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
global data_index
assert batch_size % num_skips == 0
assert num_skips <= 2 * skip_window
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1 # [ skip_window target skip_window ]
buffer = collections.deque(maxlen=span)
for _ in range(span):
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
for i in range(batch_size // num_skips):
target = skip_window # target label at the center of the buffer
targets_to_avoid = [skip_window]
for j in range(num_skips):
while target in targets_to_avoid:
target = random.randint(0, span - 1)
targets_to_avoid.append(target)
batch[i * num_skips + j] = buffer[skip_window]
labels[i * num_skips + j, 0] = buffer[target]
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
return batch, labels
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
#for i in range(8):
#print(batch[i], reverse_dictionary[batch[i]],'->', labels[i, 0], reverse_dictionary[labels[i, 0]])
# Step 4: Build and train a skip-gram model.
batch_size = 128
embedding_size = 128
skip_window = 2
num_skips = 2
valid_size = 9
valid_window = 100
num_sampled = 64 # Number of negative examples to sample.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
graph = tf.Graph()
with graph.as_default():
# Input data.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
# Ops and variables pinned to the CPU because of missing GPU implementation
with tf.device('/cpu:0'):
# Look up embeddings for inputs.
embeddings = tf.Variable(
tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
# Construct the variables for the NCE loss
nce_weights = tf.Variable(
tf.truncated_normal([vocabulary_size, embedding_size],
stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]),dtype=tf.float32)
# Compute the average NCE loss for the batch.
# tf.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
loss = tf.reduce_mean(
tf.nn.nce_loss(weights=nce_weights,biases=nce_biases, inputs=embed, labels=train_labels,
num_sampled=num_sampled, num_classes=vocabulary_size))
# Construct the SGD optimizer using a learning rate of 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
# Add variable initializer.
init = tf.global_variables_initializer()
# Step 5: Begin training.
num_steps = 20000
with tf.Session(graph=graph) as session:
# We must initialize all variables before we use them.
init.run()
#print("Initialized")
average_loss = 0
for step in xrange(num_steps):
batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
# We perform one update step by evaluating the optimizer op (including it
# in the list of returned values for session.run()
_, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
average_loss += loss_val
#if step % 2000 == 0:
# if step > 0:
# average_loss /= 2000
# The average loss is an estimate of the loss over the last 2000 batches.
# print("Average loss at step ", step, ": ", average_loss)
#average_loss = 0
final_embeddings = normalized_embeddings.eval()
np.savetxt('data/w2v.txt', final_embeddings)
You may want to look at the implementation of _save_word2vec_format() in gensim for an example of Python code which writes that format:
https://github.com/RaRe-Technologies/gensim/blob/e859c11f6f57bf3c883a718a9ab7067ac0c2d4cf/gensim/models/utils_any2vec.py#L104
def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None):
"""Store the input-hidden weight matrix in the same format used by the original
C word2vec-tool, for compatibility.
Parameters
----------
fname : str
The file path used to save the vectors in.
vocab : dict
The vocabulary of words.
vectors : numpy.array
The vectors to be stored.
fvocab : str, optional
File path used to save the vocabulary.
binary : bool, optional
If True, the data wil be saved in binary word2vec format, else it will be saved in plain text.
total_vec : int, optional
Explicitly specify total number of vectors
(in case word vectors are appended with document vectors afterwards).
"""
if not (vocab or vectors):
raise RuntimeError("no input")
if total_vec is None:
total_vec = len(vocab)
vector_size = vectors.shape[1]
if fvocab is not None:
logger.info("storing vocabulary in %s", fvocab)
with utils.open(fvocab, 'wb') as vout:
for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
assert (len(vocab), vector_size) == vectors.shape
with utils.open(fname, 'wb') as fout:
fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
# store in sorted order: most frequent words at the top
for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
row = vectors[vocab_.index]
if binary:
row = row.astype(REAL)
fout.write(utils.to_utf8(word) + b" " + row.tostring())
else:
fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))

Why should the training label for Generator in GAN be always True?

I am currently learning deep learning especially GAN.
I found a simple code of GAN from a web site below.
https://medium.com/#devnag/generative-adversarial-networks-gans-in-50-lines-of-code-pytorch-e81b79659e3f
However, in the code, I don't understand why we always need to give true label to Generator as below.
for g_index in range(g_steps):
# 2. Train G on D's response (but DO NOT train D on these labels)
G.zero_grad()
gen_input = Variable(gi_sampler(minibatch_size, g_input_size))
g_fake_data = G(gen_input)
dg_fake_decision = D(preprocess(g_fake_data.t()))
g_error = criterion(dg_fake_decision, Variable(torch.ones(1))) # we want to fool, so pretend it's all genuine
g_error.backward()
g_optimizer.step() # Only optimizes G's parameters
Specifically, on this line.
g_error = criterion(dg_fake_decision, Variable(torch.ones(1))) # we want to fool, so pretend it's all genuine
Input data for Generator is fake data(includes noise), so if we assign True labels on those input data, I think Generator ends up creating data which is similar to fake data(doesn't look like genuine). Is my understanding wrong? Sorry for the silly question, but if you have knowledge, plz help me out.
I'll put a whole code below.
#!/usr/bin/env python
# Generative Adversarial Networks (GAN) example in PyTorch.
# See related blog post at https://medium.com/#devnag/generative-adversarial-networks-gans-in-50-lines-of-code-pytorch-e81b79659e3f#.sch4xgsa9
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
# Data params
data_mean = 4
data_stddev = 1.25
# Model params
g_input_size = 1 # Random noise dimension coming into generator, per output vector
g_hidden_size = 50 # Generator complexity
g_output_size = 1 # size of generated output vector
d_input_size = 100 # Minibatch size - cardinality of distributions
d_hidden_size = 50 # Discriminator complexity
d_output_size = 1 # Single dimension for 'real' vs. 'fake'
minibatch_size = d_input_size
d_learning_rate = 2e-4 # 2e-4
g_learning_rate = 2e-4
optim_betas = (0.9, 0.999)
num_epochs = 30000
print_interval = 200
d_steps = 1 # 'k' steps in the original GAN paper. Can put the discriminator on higher training freq than generator
g_steps = 1
# ### Uncomment only one of these
#(name, preprocess, d_input_func) = ("Raw data", lambda data: data, lambda x: x)
(name, preprocess, d_input_func) = ("Data and variances", lambda data: decorate_with_diffs(data, 2.0), lambda x: x * 2)
print("Using data [%s]" % (name))
# ##### DATA: Target data and generator input data
def get_distribution_sampler(mu, sigma):
return lambda n: torch.Tensor(np.random.normal(mu, sigma, (1, n))) # Gaussian
def get_generator_input_sampler():
return lambda m, n: torch.rand(m, n) # Uniform-dist data into generator, _NOT_ Gaussian
# ##### MODELS: Generator model and discriminator model
class Generator(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(Generator, self).__init__()
self.map1 = nn.Linear(input_size, hidden_size)
self.map2 = nn.Linear(hidden_size, hidden_size)
self.map3 = nn.Linear(hidden_size, output_size)
def forward(self, x):
x = F.elu(self.map1(x))
x = F.sigmoid(self.map2(x))
return self.map3(x)
class Discriminator(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(Discriminator, self).__init__()
self.map1 = nn.Linear(input_size, hidden_size)
self.map2 = nn.Linear(hidden_size, hidden_size)
self.map3 = nn.Linear(hidden_size, output_size)
def forward(self, x):
x = F.elu(self.map1(x))
x = F.elu(self.map2(x))
return F.sigmoid(self.map3(x))
def extract(v):
return v.data.storage().tolist()
def stats(d):
return [np.mean(d), np.std(d)]
def decorate_with_diffs(data, exponent):
mean = torch.mean(data.data, 1, keepdim=True)
mean_broadcast = torch.mul(torch.ones(data.size()), mean.tolist()[0][0])
diffs = torch.pow(data - Variable(mean_broadcast), exponent)
return torch.cat([data, diffs], 1)
d_sampler = get_distribution_sampler(data_mean, data_stddev)
gi_sampler = get_generator_input_sampler()
G = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size)
D = Discriminator(input_size=d_input_func(d_input_size), hidden_size=d_hidden_size, output_size=d_output_size)
criterion = nn.BCELoss() # Binary cross entropy: http://pytorch.org/docs/nn.html#bceloss
d_optimizer = optim.Adam(D.parameters(), lr=d_learning_rate, betas=optim_betas)
g_optimizer = optim.Adam(G.parameters(), lr=g_learning_rate, betas=optim_betas)
for epoch in range(num_epochs):
for d_index in range(d_steps):
# 1. Train D on real+fake
D.zero_grad()
# 1A: Train D on real
d_real_data = Variable(d_sampler(d_input_size))
d_real_decision = D(preprocess(d_real_data))
d_real_error = criterion(d_real_decision, Variable(torch.ones(1))) # ones = true
d_real_error.backward() # compute/store gradients, but don't change params
# 1B: Train D on fake
d_gen_input = Variable(gi_sampler(minibatch_size, g_input_size))
d_fake_data = G(d_gen_input).detach() # detach to avoid training G on these labels
d_fake_decision = D(preprocess(d_fake_data.t()))
d_fake_error = criterion(d_fake_decision, Variable(torch.zeros(1))) # zeros = fake
d_fake_error.backward()
d_optimizer.step() # Only optimizes D's parameters; changes based on stored gradients from backward()
for g_index in range(g_steps):
# 2. Train G on D's response (but DO NOT train D on these labels)
G.zero_grad()
gen_input = Variable(gi_sampler(minibatch_size, g_input_size))
g_fake_data = G(gen_input)
dg_fake_decision = D(preprocess(g_fake_data.t()))
g_error = criterion(dg_fake_decision, Variable(torch.ones(1))) # we want to fool, so pretend it's all genuine
g_error.backward()
g_optimizer.step() # Only optimizes G's parameters
if epoch % print_interval == 0:
print("%s: D: %s/%s G: %s (Real: %s, Fake: %s) " % (epoch,
extract(d_real_error)[0],
extract(d_fake_error)[0],
extract(g_error)[0],
stats(extract(d_real_data)),
stats(extract(d_fake_data))))
In this part of the code you are training G to fool D, so G generates fake data and asks D whether it thinks it's real (true labels), D's gradients are then propogated all the way to G (this is possible as D's input was G's output) so that it will learn to better fool D in the next iteration.
The inputs of G are not trainable and G only tries to transform them into real data (data similar to what d_sampler generates)

Tensorflow Grid3LSTMCell visualization

I'm having a difficult time visualizing what this Tensorflow class creates. I want to implement a LSTM RNN that handles 3D data.
class Grid3LSTMCell(GridRNNCell):
"""3D BasicLSTM cell
This creates a 2D cell which receives input and gives output in the first dimension.
The first dimension can optionally be non-recurrent if `non_recurrent_fn` is specified.
The second and third dimensions are LSTM.
"""
def __init__(self, num_units, tied=False, non_recurrent_fn=None,
use_peepholes=False, forget_bias=1.0):
super(Grid3LSTMCell, self).__init__(num_units=num_units, num_dims=3,
input_dims=0, output_dims=0, priority_dims=0, tied=tied,
non_recurrent_dims=None if non_recurrent_fn is None else 0,
cell_fn=lambda n, i: rnn_cell.LSTMCell(
num_units=n, input_size=i, forget_bias=forget_bias,
use_peepholes=use_peepholes),
non_recurrent_fn=non_recurrent_fn)
The class is found in `from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell`.
This is difficult to explain, so I've provided a drawing. Here is what I want it to do...
However the comment sounds like it isn't doing this. The comment makes it sound like the RNN is still a flat RNN, where the first dimension is outputting to, what is commonly called, the outputs variable (see below). The second dimension is outputting to the next step in the RNN, and the third dimension is outputting to the next hidden layer.
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
If this is the case, what is the point in having the first and second dimensions? Aren't they essentially the same thing? The BasicLSTMCell sends the output to the next step into outputs -- in other words they are one in the same.
Clarity?
For reference, here is my example code...
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell
import numpy as np
#define parameters
learning_rate = 0.01
batch_size = 2
n_input_x = 10
n_input_y = 10
n_input_z = 10
n_hidden = 128
n_classes = 2
n_output = n_input_x * n_classes
x = tf.placeholder("float", [n_input_x, n_input_y, n_input_z])
y = tf.placeholder("float", [n_input_x, n_input_y, n_input_z, n_classes])
weights = {}
biases = {}
for i in xrange(n_input_y * n_input_z):
weights[i] = tf.Variable(tf.random_normal([n_hidden, n_output]))
biases[i] = tf.Variable(tf.random_normal([n_output]))
#generate random data
input_data = np.random.rand(n_input_x, n_input_y, n_input_z)
ground_truth = np.random.rand(n_input_x, n_input_y, n_input_z, n_classes)
#build GridLSTM
def GridLSTM_network(x):
x = tf.reshape(x, [-1,n_input_x])
x = tf.split(0, n_input_y * n_input_z, x)
lstm_cell = grid_rnn_cell.Grid3LSTMCell(n_hidden)
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
output = []
for i in xrange(n_input_y * n_input_z):
output.append(tf.matmul(outputs[i], weights[i]) + biases[i])
return output
#initialize network, cost, optimizer and all variables
pred = GridLSTM_network(x)
# import pdb
# pdb.set_trace()
pred = tf.pack(pred)
pred = tf.transpose(pred,[1,0,2])
pred= tf.reshape(pred, [-1, n_input_x, n_input_y, n_input_z, n_classes])
temp_pred = tf.reshape(pred, [-1,n_classes])
temp_y = tf.reshape(y,[-1, n_classes])
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(temp_pred, temp_y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Evaluate model
correct_pred = tf.equal(0,tf.cast(tf.sub(tf.nn.sigmoid(temp_pred),temp_y), tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initializing the variables
init = tf.initialize_all_variables()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
step = 0
while 1:
print step
step = step + 1
# pdb.set_trace
sess.run(optimizer, feed_dict={x: input_data, y: ground_truth})

Resources