RandomCrop does not seem to add the padding when I create my images. Why is that?
Reproducible script 1
todo with cifar...
Reproducible script 2:
def check_size_of_mini_imagenet_original_img():
import random
import numpy as np
import torch
import os
seed = 0
os.environ["PYTHONHASHSEED"] = str(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
import learn2learn
batch_size = 5
kwargs: dict = dict(name='mini-imagenet', train_ways=2, train_samples=2, test_ways=2, test_samples=2)
kwargs['data_augmentation'] = 'lee2019'
benchmark: learn2learn.BenchmarkTasksets = learn2learn.vision.benchmarks.get_tasksets(**kwargs)
tasksets = [(split, getattr(benchmark, split)) for split in splits]
for i, (split, taskset) in enumerate(tasksets):
for task_num in range(batch_size):
X, y = taskset.sample()
assert X.size(2) == 84
for img_idx in range(X.size(0)):
visualize_pytorch_tensor_img(X[img_idx], show_img_now=True)
if img_idx >= 5: # print 5 images only
# visualize_pytorch_batch_of_imgs(X, show_img_now=True)
if task_num >= 4: # so to get a MI image finally (note omniglot does not have padding at train...oops!)
def visualize_pytorch_tensor_img(tensor_image: torch.Tensor, show_img_now: bool = False):
    """
    Draw a single image tensor with matplotlib.

    Due to channel orders not agreeing in pt and matplot lib.
    Given a Tensor representing the image, use .permute() to put the channels as the last dimension:

    ref: https://stackoverflow.com/questions/53623472/how-do-i-display-a-single-image-in-pytorch

    :param tensor_image: tensor with exactly three dimensions (asserted below).
    :param show_img_now: when True, plt.show() is called right after drawing.
    """
    from matplotlib import pyplot as plt
    assert len(tensor_image.size()) == 3, f'Err your tensor is the wrong shape {tensor_image.size()=}' \
                                          f'likely it should have been a single tensor with 3 channels' \
                                          f'i.e. CHW.'
    if tensor_image.size(0) == 3:  # three chanels
        # matplotlib wants the channel axis last, so move axis 0 to the end
        plt.imshow(tensor_image.permute(1, 2, 0))
    if show_img_now:
        # the flag had no body before, so nothing was ever displayed
        plt.show()
images here: https://github.com/learnables/learn2learn/issues/376#issuecomment-1319368831
first one:
I am getting odd images despite printing the transform the data is using:
-- splits[i]='train'
taskset=<learn2learn.data.task_dataset.TaskDataset object at 0x7fbc38345880>
RandomCrop(size=(84, 84), padding=8)
ColorJitter(brightness=[0.6, 1.4], contrast=[0.6, 1.4], saturation=[0.6, 1.4], hue=None)
Normalize(mean=[0.47214064400000005, 0.45330829125490196, 0.4099612805098039], std=[0.2771838538039216, 0.26775040952941176, 0.28449041290196075])
but the padding is missing:
but when I use this instead:
train_data_transform = Compose([
RandomResizedCrop((size - padding*2, size - padding*2), scale=scale, ratio=ratio),
ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
Normalize(mean=mean, std=std),
it seems to work:
why don't both have the 8 and 8 padding on both sides I expect?
I tried seeing the images with mini-imagenet for torch-meta and it also didn't seem the padding was there:
RandomCrop(size=(84, 84), padding=8)
ColorJitter(brightness=[0.6, 1.4], contrast=[0.6, 1.4], saturation=[0.6, 1.4], hue=[-0.2, 0.2])
Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
X.size()=torch.Size([25, 3, 84, 84])
The code is much harder to make compact and reproducible but you can see my torchmeta_plot_images_is_the_padding_there ultimate-utils library.
For now, since two datasets show no padding being inserted even though the transform says it should be, I am concluding that there is a bug in PyTorch, or in my PyTorch version, or that I simply don't understand RandomCrop. But the description is clear to me:
padding (int or sequence, optional) –
Optional padding on each border of the image. Default is None. If a single int is provided this is used to pad all borders.
and the normal padding Pad(...) says something very similar:
padding (int or sequence) –
Padding on each border. If a single int is provided this is used to pad all borders.
so what else could go wrong? The bottom img I provided with a pad is done with the above Pad() function not with RandomCrop.
gitissues: https://github.com/learnables/learn2learn/issues/376
pytorch forum: https://discuss.pytorch.org/t/why-isnt-randomcrop-inserting-the-padding-in-pytorch/166244

They are padded to 84+8 then cropped back to 84: you can see the black padding on each image (eg, on the left for the 2nd image).
I discovered and confirmed that by doing it on CIFAR. But note this is NOT what the docs say for RandomCrop:
Optional padding on each border of the image. Default is None. If a single int is provided this is used to pad all borders.
it says something very similar to pad:
Padding on each border. If a single int is provided this is used to pad all borders.
See: https://github.com/learnables/learn2learn/issues/376#issuecomment-1319405466
I am going to report this to pytorch as a bug https://github.com/pytorch/pytorch/issues/89253. Reproducible code in cifar:
def check_padding_random_crop_cifar_pure_torch():
    """
    Check on CIFAR-100 how Pad(...) and RandomCrop(..., padding=...) affect image size.

    For each transform the first sample is loaded through a DataLoader, its
    spatial size is asserted, and the image is displayed:
      - Pad(padding=4) and Pad(padding=8) on a 32x32 image must give 40 and 48.
      - RandomCrop(28, padding=8) and RandomCrop(28) must both give 28.

    Downloads CIFAR-100 into ~/data/ if it is not already there.
    """
    # -
    import sys
    print(f'python version: {sys.version=}')
    # -
    from pathlib import Path

    import torch
    import torchvision
    from torch.utils.data import DataLoader
    from torchvision.transforms import Compose
    from torchvision.transforms import Pad
    from torchvision.transforms import RandomCrop
    from torchvision.transforms import Resize
    from torchvision.transforms import ToTensor

    from uutils.plot.image_visualization import visualize_pytorch_tensor_img

    root = Path('~/data/').expanduser()

    def first_batch(transform):
        """Return the images of the first batch of CIFAR-100 train under `transform`."""
        dataset = torchvision.datasets.CIFAR100(
            root=root, train=True, download=True,
            # the transform has to be handed to the dataset, otherwise it is never applied
            transform=transform,
            target_transform=lambda data: torch.tensor(data, dtype=torch.long),
        )
        x, _ = next(iter(DataLoader(dataset)))
        return x

    # -- see if pad doubles length
    print(f'--- test padding doubles length with Pad(...)')
    x = first_batch(Compose([Resize((32, 32)), Pad(padding=4), ToTensor()]))
    assert x[0].size(2) == 32 + 4 * 2
    assert x[0].size(2) == 32 + 8
    visualize_pytorch_tensor_img(x[0], show_img_now=True)

    x = first_batch(Compose([Resize((32, 32)), Pad(padding=8), ToTensor()]))
    assert x.size(2) == 32 + 8 * 2
    assert x.size(2) == 32 + 16
    visualize_pytorch_tensor_img(x[0], show_img_now=True)

    # -- see if RandomCrop also puts the pad
    print(f'--- test RandomCrop indeed puts padding')
    x = first_batch(Compose([Resize((32, 32)), RandomCrop(28, padding=8), ToTensor()]))
    assert x[0].size(2) == 28
    visualize_pytorch_tensor_img(x[0], show_img_now=True)

    x = first_batch(Compose([Resize((32, 32)), RandomCrop(28), ToTensor()]))
    assert x.size(2) == 28
    visualize_pytorch_tensor_img(x[0], show_img_now=True)


Richardson-Lucy not sharpening image

I had posted a question previously about the Richardson-Lucy algorithm. I have a follow-up question I would appreciate help with.
Below is the Python code I am using. My input image is already blurry so I removed program lines that I originally had to intentionally blur the image. I am getting the error "RuntimeWarning: invalid value encountered in true_divide relative_blur = image / convolve(im_deconv, psf, mode='same')" I would appreciate help with debugging this. I kept the lines in the program that I commented out based on the suggestion below.
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageFilter
from scipy.signal import convolve2d as conv2
from skimage import color, data, restoration
astro = Image.open('TOFA-003_UV_Cured_Lincoln_Corrected_gray.bmp')
psf = np.ones((5, 5)) / 25
#psf = np.ones((8, 8)) / 25
astro = conv2(astro, psf, 'same')
astro = astro/255
# Add Noise to Image
#astro_noisy = astro.copy()
#astro_noisy += (np.random.poisson(lam=25, size=astro.shape) - 10) / 255
#astro_noisy = astro_noisy/255
# Restore Image using Richardson-Lucy algorithm
deconvolved_RL = restoration.richardson_lucy(astro, psf, iterations=2)
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(8, 5))
for a in (ax[0], ax[1], ax[2]):
ax[0].set_title('Original Data')
#ax[1].set_title('Noisy data')
ax[2].imshow(deconvolved_RL, vmin=astro.min(), vmax=astro.max())
ax[2].set_title('Restoration using\nRichardson-Lucy')
fig.subplots_adjust(wspace=0.02, hspace=0.2,
top=0.9, bottom=0.05, left=0, right=1)

[OpenCV]how to fix contours to rectangle?

I'm new to OpenCV (and to computer vision), so even just telling me the right search terms would be very helpful!
What I want to ask
I want to write a program that extract the business cards from pictures.
I was able to extract a rough outline, but reflected light becomes noise and I can't extract an accurate outline. Please tell me your idea.
image(raw data)
raw data
output data(rough outline)
import math
import itertools
from glob import glob
import cv2
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
def read_images():
    """Lazily read every file matching ``data/*.jpg`` with ``cv2.imread``.

    The file list is globbed immediately; the returned ``map`` iterator
    loads each image only when it is consumed.
    """
    return map(lambda path: cv2.imread(path), glob('data/*.jpg'))
def blur(img):
    """Return ``img`` smoothed by ``cv2.GaussianBlur`` with a 25x25 kernel and sigmaX=0."""
    kernel_size = (25, 25)
    return cv2.GaussianBlur(img, kernel_size, 0)
def show_images(images, column, color_type=cv2.COLOR_BGR2RGB):
"plot images with matplotlib"
plt.figure(figsize=(10,10), dpi=150)
for n, img in zip(range(len(images)), images):
p = plt.subplot(math.ceil(len(images) / column), column, n + 1)
if color_type is None:
p.imshow(cv2.cvtColor(img, color_type))
def detect_background_color(img):
"detect background color"
# Assume that the perimeter is all background
height, width, *_ = img.shape
background_colors = np.concatenate([
img[5:height-5, 5], img[5, 5:width-5],
img[5:height-5, width-5], img[height-5, 5:width-5]
background_colors = background_colors.astype(np.float32)
# Assume that the background color is only one.
K = 2
_, labels, centers = cv2.kmeans(
background_colors, K, None, (iter_flg, 10, 1.0), 10,
cnt1 = len(labels[labels==0])
cnt2 = len(labels[labels==1])
return centers[0] if cnt1 > cnt2 else centers[1]
def scale(img):
    """Map each pixel to its distance from the detected background colour.

    The per-pixel Euclidean distance over axis 2 is divided by 1.732,
    truncated toward zero and returned as ``uint8``.
    """
    background = detect_background_color(img)
    squared_offset = np.square(img - background)
    distance = np.sqrt(np.sum(squared_offset, axis=2))
    return np.fix(distance / 1.732).astype(np.uint8)
def binarize(img):
    """Threshold ``img`` at 40 with ``cv2.THRESH_BINARY`` (max value 255) and return the mask."""
    # cv2.threshold returns (threshold_used, image); only the image is needed
    return cv2.threshold(img, 40, 255, cv2.THRESH_BINARY)[1]
binarized = [binarize(scale(blur(img))) for img in read_images()]
show_images(binarized, 4, None)
It looks like you need to apply morphological operations: try cv2.erode followed by cv2.dilate.
The first removes regions smaller than the erosion kernel; the second restores the large blobs to their initial size. You need to use the same kernel size for both operations.
Check also this: medium article

Tensorflow and Scikit learn: Same solution but different outputs

I'm implementing a simple linear regression with scikit-learn and TensorFlow.
My scikit-learn solution seems fine, but with TensorFlow the evaluation output shows some crazy numbers.
The problem is basically to predict a salary based on years of experience.
I'm not sure what I'm doing wrong in the TensorFlow code.
ScikitLearn solution
import pandas as pd
data = pd.read_csv('Salary_Data.csv')
X = data.iloc[:, :-1].values
y = data.iloc[:, 1].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
X_single_data = [[4.6]]
y_single_pred = regressor.predict(X_single_data)
print(f'Train score: {regressor.score(X_train, y_train)}')
print(f'Test score: {regressor.score(X_test, y_test)}')
Train score: 0.960775692121653
Test score: 0.9248580247217076
Tensorflow solution
import tensorflow as tf
f_cols = [tf.feature_column.numeric_column(key='X', shape=[1])]
estimator = tf.estimator.LinearRegressor(feature_columns=f_cols)
train_input_fn = tf.estimator.inputs.numpy_input_fn(x={'X': X_train}, y=y_train,shuffle=False)
test_input_fn = tf.estimator.inputs.numpy_input_fn(x={'X': X_test}, y=y_test,shuffle=False)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn)
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
({'average_loss': 7675087400.0,
'label/mean': 84588.11,
'loss': 69075790000.0,
'prediction/mean': 5.0796494,
'global_step': 6},
Per your code request in the comments: Though I had used my online curve and surface fitting web site zunzun.com for this equation at http://zunzun.com/Equation/2/Sigmoidal/Sigmoid%20B/ for the modeling work, here is a graphing source code example using the scipy differential_evolution genetic algorithm module to estimate initial parameter estimates. The scipy implementation of Differential Evolution uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, which requires bounds within which to search - in this example those bounds are taken from the data maximum and minimum values, and the fit statistics and parameter values are almost identical to those from the web site.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings
xData = numpy.array([ 1.1, 1.3, 1.5, 2.0, 2.2, 2.9, 3.0, 3.2, 3.2, 3.7, 3.9, 4.0, 4.0, 4.1, 4.5, 4.9, 5.1, 5.3, 5.9, 6.0, 6.8, 7.1, 7.9, 8.2, 8.7, 9.0, 9.5, 9.6, 10.3, 10.5])
yData = numpy.array([ 39.343, 46.205, 37.731, 43.525, 39.891, 56.642, 60.15, 54.445, 64.445, 57.189, 63.218, 55.794, 56.957, 57.081, 61.111, 67.938, 66.029, 83.088, 81.363, 93.94, 91.738, 98.273, 101.302, 113.812, 109.431, 105.582, 116.969, 112.635, 122.391, 121.872])
def func(x, a, b, c):
    """Evaluate the sigmoid ``a / (1.0 + exp(-(x - b) / c))`` with numpy."""
    exponent = -(x - b) / c
    denominator = 1.0 + numpy.exp(exponent)
    return a / denominator
# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    """Return the sum of squared residuals of ``func`` against the data.

    ``parameterTuple`` is unpacked into the parameters of ``func`` after
    ``xData``; the result is ``sum((yData - func(xData, ...)) ** 2)``.
    """
    # Do not print warnings raised while the genetic algorithm probes the
    # parameter space. The suppression is scoped to this evaluation: a bare
    # filterwarnings("ignore") stays installed and hides every later warning
    # in the process (including those from the final curve_fit call).
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        val = func(xData, *parameterTuple)
        return numpy.sum((yData - val) ** 2.0)
def generate_Initial_Parameters():
    """Estimate starting parameters for ``func`` with differential evolution.

    The search bounds come from the data: ``a`` is searched between the
    minimum and maximum of ``yData``; ``b`` and ``c`` between the minimum
    and maximum of ``xData``. Returns the ``x`` attribute of the result.
    """
    # min and max used for bounds
    x_low, x_high = min(xData), max(xData)
    y_low, y_high = min(yData), max(yData)

    parameterBounds = [
        [y_low, y_high],  # search bounds for a
        [x_low, x_high],  # search bounds for b
        [x_low, x_high],  # search bounds for c
    ]

    # a fixed seed makes the search repeatable
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
# by default, differential_evolution finishes by polishing its best member with scipy.optimize.minimize (L-BFGS-B) inside the parameter bounds
geneticParameters = generate_Initial_Parameters()
# now call curve_fit without passing bounds from the genetic algorithm,
# just in case the best fit parameters are outside those bounds
fittedParameters, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters:', fittedParameters)
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    """Plot the raw data as a scatter plot with the fitted curve on top.

    ``graphWidth`` and ``graphHeight`` are divided by 100.0 to get the
    figure size in inches, and the figure is created at dpi=100.
    """
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plot
    xModel = numpy.linspace(min(xData), max(xData))
    yModel = func(xModel, *fittedParameters)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    axes.set_xlabel('Years of experience') # X axis data label
    axes.set_ylabel('Salary in thousands') # Y axis data label

    # display the figure before closing it; without this call the plot is
    # built and immediately discarded
    plt.show()
    plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
I cannot place an image in a comment, and so place it here. I suspected the relationship might be sigmoidal rather than linear, and found the following sigmoidal equation and fit statistics using units of thousands for salary: "y = a / (1.0 + exp(-(x-b)/c))" with fitted parameters a = 1.5535069418318591E+02, b = 5.4580059234664899E+00, and c = 3.7724942500630938E+00 giving an R-squared = 0.96 and RMSE = 5.30 (thousand)

RNN can't learn integral function

To study deep learning (RNNs, LSTMs and so on), I tried to make an RNN fit an integration function. I feed a random signal in the range 0 to 1 as input to the RNN; the target it has to learn is the integral of that input signal biased by -0.5, with the integral limited to the range 0 to 1. Blue - random input, orange - integrated input
So I have time series with only one input (random) and one output (limited integral of input) and I want RNN to predict output by the input.
I used Pytorch and tried to use vanilla RNN, GRU cell, different sizes of hidden layers, stacking several RNN, putting dense connected layers to the RNN output, different deep in backpropagation through time (from 2 to 50 gradients rolling-back). And I can't get a good result at all! It works somehow, but I can't find a way to fit integral function precisely. Here is the best of my results:
green - RNN output. Green line (model output) does not fit orange line in many cases - that is the problem.
Here is my source code in jupyter.
My questions: is it possible - to learn a saturated integral function by RNN? Where is my problem? What can I try more to achieve good quality? Ideally I want to RNN output be equal desired output (integral function) through all time series.
My code in raw format:
import numpy as np
from scipy.stats import truncnorm
import random
import math
import copy
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.cm as cm
def generate_data(num_of_data):
for i in range(num_of_data):
if (random.random()<0.1):
# current_output_value=0
if (current_output_value<0):
if (current_output_value>1):
return input_data,output_data
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (20, 6)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
class RNN(nn.Module):
    """GRU followed by a Linear -> ReLU -> Linear head.

    Args:
        input_size: number of features per time step fed to the GRU.
        hidden_size: width of the GRU state and of the first dense layer.
        output_size: number of features produced per time step.
        number_of_layers: number of stacked GRU layers (default 1).
    """

    def __init__(self, input_size, hidden_size, output_size, number_of_layers=1):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        # must be set before building the GRU and before initHidden() reads it;
        # it was previously read without ever being assigned
        self.number_of_layers = number_of_layers
        self.gru = nn.GRU(input_size, hidden_size, self.number_of_layers)
        self.Dense1 = nn.Linear(hidden_size, hidden_size)
        self.Dense1A = nn.ReLU()
        self.Dense2 = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Run the GRU on ``input`` from state ``hidden``; return (output, new hidden)."""
        gru_output, hidden = self.gru(input, hidden)
        # pass the GRU output through the dense head defined in __init__;
        # `output` was previously returned without being computed
        output = self.Dense2(self.Dense1A(self.Dense1(gru_output)))
        return output, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (number_of_layers, 1, hidden_size)."""
        return torch.zeros(self.number_of_layers, 1, self.hidden_size)
import time
import math
import operator
def timeSince(since):
    """Format the time elapsed since the timestamp ``since`` as ``'<m>m <s>s'``.

    ``since`` is compared against ``time.time()``; both fields are rendered
    with ``%d``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)
rnn = RNN(1, 50, 1)
n_iters = 250000
print_every = 2000
plot_every = 2000
all_losses = []
total_loss_print = 0
total_loss_plot = 0
start = time.time()
optimizer = optim.Adam(rnn.parameters(), lr=0.0002)
rnn_hidden = rnn.initHidden()
loss = 0
#for gata_q in range(int(n_iters/500)):
# rnn_hidden = rnn.initHidden()
for data_index in range(len(input_data)):
input_tensor=torch.zeros(1, 1, 1)
output_tensor=torch.zeros(1, 1, 1)
rnn_output, rnn_hidden = rnn(Variable(input_tensor), rnn_hidden)
loss += criterion(rnn_output, Variable(output_tensor))
if data_index%2==0:
total_loss_print += loss.data[0]
total_loss_plot += loss.data[0]
loss = 0
if data_index % print_every == 0:
print('%s (%d %d%%) tl=%.4f' % (timeSince(start), data_index, data_index / n_iters * 100,total_loss_print/print_every))
total_loss_print = 0
if data_index % plot_every == 0:
all_losses.append(total_loss_plot / plot_every)
total_loss_plot = 0
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
rnn_hidden = rnn.initHidden()
loss = 0
for data_index in range(len(input_data)):
input_tensor=torch.zeros(1, 1, 1)
rnn_output, rnn_hidden = rnn(Variable(input_tensor), rnn_hidden)
I found the problem myself. It was a kind of overfitting to the most recent data, similar to how, in reinforcement learning, a model can overfit by exploiting its latest strategy. I was not using mini-batches and applied the optimiser directly after every new data point; because the data points are similar over runs of 20-50 samples, the optimiser simply fitted the network to the latest points and forgot how to fit the earlier ones. I solved it by accumulating gradients over 50 points and only then applying one optimiser step. The network now learns much better, but still not perfectly.
Here is modification of code to make it work:
rnn_output, rnn_hidden = rnn(Variable(input_tensor), rnn_hidden)
loss += criterion(rnn_output, Variable(output_tensor))
if data_index % 2==0:
total_loss_print += loss.data[0]
loss = 0
# torch.nn.utils.clip_grad_norm(rnn.parameters(), 0.01)
if data_index % 50==0:
and new result of learning of integral:

unsupervised learning how to get number of clusters

In this code below the author says that -
"Before I begin the kmeans clustering I want to use a hierarchial clustering to figure how many clusters I should have. I truncated the dendrogram because if I didn't the dendrogram will be hard to read. I cut at 20 because it has the second biggest distance jump (the first big jump is at 60). After the cut there are 7 clusters."
I am not able to see in the Dendrogram how he arrived at the numbers he mentioned - 20, 60 or 7
I am attaching the dendrogram that I have got from the sample data taken from his github example and am wondering if anyone can shed light on how he arrived at the numbers 20, 60 or 7
he also says "Let's fit k-means on the matrix with a range of clusters 1 - 19." where did he get that range 1 to 19 from? is it cause of the drop at 20 (or the cut off at 20)
github - https://github.com/moyphilip/SKU-Clustering
Also what would one say should be the number of clusters in this second image attached here ? 6 clusters ? (its a different dataset)
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
import re
import numpy as np
df = pd.read_csv('sample-data.csv')
def split_description(string):
    """Return the part of ``string`` before the first ``' - '`` separator.

    When the separator does not occur, the whole string is returned.
    """
    name, _, _ = string.partition(' - ')
    return name
df_new = pd.DataFrame()
df_new['name'] = df.loc[:,'description'].apply(lambda x: split_description(x))
df_new['id'] = df['id']
def remove(name):
    """Strip the digits 0-9 from ``name`` and collapse whitespace.

    After the digits are removed, the text is split on whitespace and
    re-joined with single spaces, which also trims both ends.
    """
    digits_stripped = re.sub("[0-9]", '', name)
    words = digits_stripped.split()
    return ' '.join(words)
df_new['name'] = df_new.loc[:,'name'].apply(lambda x: remove(x))
tfidf_vectorizer = TfidfVectorizer(
stop_words = 'english',
ngram_range=(1,4), min_df = 0.01, max_df = 0.8)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_new['name'])
print (tfidf_matrix.shape)
print (tfidf_vectorizer.get_feature_names())
from sklearn.metrics.pairwise import cosine_similarity
dist = 1.0 - cosine_similarity(tfidf_matrix)
print (dist)
from scipy.cluster.hierarchy import ward, dendrogram
#run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix,
truncate_mode='lastp', # show only the last p merged clusters
p=20, # show only the last p merged clusters
plt.axhline(y=20, linewidth = 2, color = 'black')
fig.suptitle("Hierarchial Clustering Dendrogram Truncated", fontsize = 35, fontweight = 'bold')
from sklearn.cluster import KMeans
num_clusters = range(1,20)
KM = [KMeans(n_clusters=k, random_state = 1).fit(tfidf_matrix) for k in num_clusters]
# Let's plot the within cluster sum of squares for each k to see which k I should choose.
# The plot shows a steady decline from from 0 to 19. Since the elbow rule does not apply for this I will choose k = 7 because of the previous dendrogram.
# In[17]:
import matplotlib.pyplot as plt
#get_ipython().run_line_magic('matplotlib', 'inline')
with_in_cluster = [KM[k].inertia_ for k in range(0,len(num_clusters))]
plt.plot(num_clusters, with_in_cluster)
plt.ylim(min(with_in_cluster)-1000, max(with_in_cluster)+1000)
plt.ylabel('with-in cluster sum of squares')
plt.xlabel('# of clusters')
plt.title('kmeans within ss for k value')
# I add the cluster label to each record in df_new
# In[18]:
model = KM[6]
clusters = model.labels_.tolist()
df_new['cluster'] = clusters
# Here is the distribution of clusters. Cluster 0 has a records, then cluster 1. Cluster 2 - 4 seem pretty even.
# In[19]:
# I print the top terms per cluster and the names in the respective cluster.
# In[20]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()
for i in range(model.n_clusters):
print ("Cluster %d : " %i )
for ind in order_centroids[i, :10]:
print ( '%s' % terms[ind])
print ("Cluster %d names:" %i)
for idx in df_new[df_new['cluster'] == i]['name'].sample(n = 10):
print ( ' %s' %idx)
# I reduce the dist to 2 dimensions with MDS. The dissimilarity is precomputed because we provide 1 - cosine similarity. Then I assign the x and y variables.
# In[21]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)
xs, ys = pos[:, 0], pos[:, 1]
# In[22]:
cluster_colors = {0: '#85C1E9', 1: '#FF0000', 2: '#800000', 3: '#04B320',
4: '#6033FF', 5: '#33FF49', 6: '#F9E79F', 7: '#935116',
8: '#9B59B6', 9: '#95A5A6'}
cluster_labels = {0: 'vest dress print', 1: 'shirt merino island',
2: 'pants guide pants guide', 3: 'shorts board board shorts',
4: 'simply live live simply', 5: 'cap cap bottoms bottoms',
6: 'jkt zip jkt guide'}
#some ipython magic to show the matplotlib plots inline
#get_ipython().run_line_magic('matplotlib', 'inline')
#create data frame that has the result of the MDS plus the cluster numbers and titles
df_plot = pd.DataFrame(dict(x=xs, y=ys, label=clusters, name=df_new['name']))
#group by cluster
groups = df_plot.groupby('label')
# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
for name, group in groups:
ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
label = cluster_labels[name],
color = cluster_colors[name])
ax.legend(numpoints = 1)
fig.suptitle("SKU Clustering", fontsize = 35, fontweight = 'bold')
