How to convert a text file by word2vec using python - machine-learning

I am beginner of python language,natural language processing,deep learning,neural networks.I want to execute a program which convert text file into vector by using word2vec in python..someone please help me
import math
import nltk file = "/home/stephy/Demo/textfile.txt"
import numpy as np
def loadGloveModel(gloveFile):
with open(gloveFile, encoding="utf8" ) as f:
content = f.readlines()
model = {} for line in content:
splitLine = line.split()
word = splitLine[0]
embedding = np.array([float(val) for val in splitLine[1:]])
model[word] = embedding
print ("Done.",len(model)," words loaded!")
return model
model= loadGloveModel(file)
print (model['file'])

Related

How to handle alphanumeric values in machine learning

I am trying to the find the best algorithm for my claims data. The claims data include some diagnosis code which are alphanumeric like 'EA43454' . when i run the below code to evaluate the models
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=None)
cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
i get the error
ValueError: could not convert string to float: 'U0003'
How to handle these alphanumeric values?
You need to convert your strings to an indicator variable (dummy variables). Each value of the string variable has to be associated with a number so that the models can train on that data.
Scikit-learn has several preprocessors to help you with this such as OneHotEncoder. You can also use pandas.get_dummies, but using sklearn's own classes is more composable - for example, you can use them as part of a pipeline.
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
rng = np.random.default_rng()
animals = pd.DataFrame({"animal": rng.choice(["cat", "dog"], size=10),
"age": rng.integers(1, 20, size=10)})
animals_ohe = OneHotEncoder().fit_transform(animals.drop(columns=["age"]))

Reading Pointcloud from .csv to ROS PointCloud2

I have a .csv file which has /raw_points rostopic, and i'm trying to convert that file into PointCloud2 data(http://docs.ros.org/en/api/sensor_msgs/html/msg/PointCloud2.html).
import csv
import sys
csv.field_size_limit(sys.maxsize)
file = open("points_raw.csv")
csvreader = csv.reader(file)
header = next(csvreader)
print(header)
This is my header:
['Time', 'header.seq', 'header.stamp.secs', 'header.stamp.nsecs', 'header.frame_id', 'height', 'width', 'fields', 'is_bigendian', 'point_step', 'row_step', 'data', 'is_dense']
These information match the CloudPoint2, but I'm not sure how to convert it to this type.
You need to simply iterate over each row and for every row store the relative fields in a PointCloud2 message and publish it out. For example:
import rospy
import csv
from sensor_msgs.msg import PointCloud2
def main():
#Setup ros param/init here
some_pub = rospy.Publisher('output_topic', PointCloud2, queue_size=10)
with open('some_file.csv', 'r') as f:
reader = csv.reader(f)
for line in reader:
split_line = line.split(',')
new_msg = PointCloud2()
new_msg.header.seq = split_line[1]
new_msg.header.stamp.secs = split_line[2]
#Iterate over the rest
new_msg.data = split_line[11]
new_msg.is_dense = split_line[12]
some_pub.publish(new_msg)
rospy.Rate(10).sleep() #Sleep at 10Hz

am getting NotFittedError: This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator

my code is
import streamlit as st
import pickle
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def transform_text(text):
text = text.lower()
text = nltk.word_tokenize(text)
y = []
for i in text:
if i.isalnum():
y.append(i)
text = y[:]
y.clear()
for i in text:
if i not in stopwords.words('english') and i not in string.punctuation:
y.append(i)
text = y[:]
y.clear()
for i in text:
y.append(ps.stem(i))
return " ".join(y)
tfidf = pickle.load(open('vectorizer.pkl','rb'))
model = pickle.load(open('model.pkl','rb'))
st.title("Email/SMS Spam Classifier")
input_sms = st.text_area("Enter the message")
if st.button('Predict'):
# 1. preprocess
transformed_sms = transform_text(input_sms)
# 2. vectorize
vector_input = tfidf.transform([transformed_sms])
# 3. predict
result = model.predict(vector_input)[0]
# 4. Display
if result == 1:
st.header("Spam")
else:
st.header("Not Spam")

Error while trying to transpose the matrix

Code for a single raster file:
import geopandas as gpd
#import os
import rasterio
import scipy.sparse as sparse
import pandas as pd
import numpy as np
# Create an empty pandas dataframe called 'table'
table = pd.DataFrame(index = np.arange(0,1))
# Read the points shapefile using GeoPandas
stations = gpd.read_file(r'E:/anakonda/Shape files/AAQ_st1/AAQ_ST1.shp')
stations['lon'] = stations['geometry'].x
stations['lat'] = stations['geometry'].y
Matrix = pd.DataFrame()
# Iterate through the rasters and save the data as individual arrays to a Matrix
dataset = rasterio.open(r'E:/anakonda/LST_day/MOD11A1.006_LST_Day_1km_doy2019082_aid0001.tif')
data_array = dataset.read(1)
data_array_sparse = sparse.coo_matrix(data_array, shape = (351, 545))
for records_date in Matrix.columns.tolist():
a = Matrix
LST_day_value = a.loc[int(row)][int(col)]
table[records_date] = LST_day_value
transpose_mat = table.T
transpose_mat.rename(columns = {0: 'LST_Day(Kel)'}, inplace = True)
transpose_mat.to_csv(r'E:/anakonda/LST_day'+'\\'+station_name+'.csv')
Error code lines:
LST_day_value = a.loc[int(row)][int(col)]
transpose_mat.to_csv(r'E:/anakonda/LST_day'+'\'+station_name+'.csv')
Errors Shown:
Undefined Name 'row' (pyflakes E)
Undefined Name 'col' (pyflakes E)
NameError: name 'transpose_mat' is not defined
I'm using the above code for creating a Raster Time-series for Modis LST data. the code ran well till 'transposing the matrix'. the error shown is mentioned below the code. Im new to python, so kindly help me with this issue.
import os
import rasterio
import scipy.sparse as sparse
import pandas as pd
import numpy as np
# Create an empty pandas dataframe called 'table'
table = pd.DataFrame(index = np.arange(0,1))
# Read the points shapefile using GeoPandas
stations = gpd.read_file(r'E:/anakonda/Shape files/AAQ_st1/AAQ_ST1.shp')
stations['lon'] = stations['geometry'].x
stations['lat'] = stations['geometry'].y
Matrix = pd.DataFrame()
# Iterate through the rasters and save the data as individual arrays to a Matrix
for files in os.listdir(r'E:/anakonda/LST_Night'):
if files[-4: ] == '.tif':
dataset = rasterio.open(r'E:/anakonda/LST_Night'+'\\'+files)
data_array = dataset.read(1)
data_array_sparse = sparse.coo_matrix(data_array, shape = (351,545))
data = files[ :-20]
Matrix[data] = data_array_sparse.toarray().tolist()
print('Processing is done for the raster: '+ files[:-20])
# Iterate through the stations and get the corresponding row and column for the related x, y coordinates
for index, row in stations.iterrows():
station_name = str(row['Station'])
lon = float(row['lon'])
lat = float(row['lat'])
x,y = (lon, lat)
row, col = dataset.index(x, y)
print('Processing: '+ station_name)
# Pick the LST value from each stored raster array and record it into the previously created 'table'
for records_date in Matrix.columns.tolist():
a = Matrix[records_date]
LST_Night_value = a.loc[int(row)][int(col)]
table[records_date] = LST_Night_value
transpose_mat = table.T
transpose_mat.rename(columns = {0: 'LstNight(Kel)'}, inplace = True)
transpose_mat.to_csv(r'E:/anakonda/LST_Night'+'\\'+station_name+'.csv')```
This is the error shown:
```File "C:\Anaconda\envs\timeseries\lib\site-packages\pandas\core\indexes\range.py", line 357, in get_loc
raise KeyError(key) from err
KeyError: 2278```

NLTK Word Extraction

So I am trying to read a txt file, process it by taking out the stop words, and then output that result into a new file. However, I keep getting the following error:
TypeError: expected a string or other character buffer object
This is my code:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
f=open('tess1.txt','rU')
stop_words = set(stopwords.words('english'))
raw=f.read()
word_tokens = word_tokenize(raw)
text = nltk.Text(word_tokens)
filtered_sentence = [w for w in word_tokens if not w in stop_words]
if w not in stop_words:
filtered_sentence.append(w)
K = open("tess12.txt", "w")
K.write(filtered_sentence)
K.close()
print(filtered_sentence)
The solution's to write a string inside the buffer:
K.write(str(filtered_sentence))

Resources