Error while trying to transpose the matrix - time-series

Code for a single raster file:
import geopandas as gpd
#import os
import rasterio
import scipy.sparse as sparse
import pandas as pd
import numpy as np
# Create an empty pandas dataframe called 'table'
table = pd.DataFrame(index = np.arange(0,1))
# Read the points shapefile using GeoPandas
stations = gpd.read_file(r'E:/anakonda/Shape files/AAQ_st1/AAQ_ST1.shp')
stations['lon'] = stations['geometry'].x
stations['lat'] = stations['geometry'].y
Matrix = pd.DataFrame()
# Iterate through the rasters and save the data as individual arrays to a Matrix
dataset = rasterio.open(r'E:/anakonda/LST_day/MOD11A1.006_LST_Day_1km_doy2019082_aid0001.tif')
data_array = dataset.read(1)
data_array_sparse = sparse.coo_matrix(data_array, shape = (351, 545))
for records_date in Matrix.columns.tolist():
a = Matrix
LST_day_value = a.loc[int(row)][int(col)]
table[records_date] = LST_day_value
transpose_mat = table.T
transpose_mat.rename(columns = {0: 'LST_Day(Kel)'}, inplace = True)
transpose_mat.to_csv(r'E:/anakonda/LST_day'+'\\'+station_name+'.csv')
Error code lines:
LST_day_value = a.loc[int(row)][int(col)]
transpose_mat.to_csv(r'E:/anakonda/LST_day'+'\'+station_name+'.csv')
Errors Shown:
Undefined Name 'row' (pyflakes E)
Undefined Name 'col' (pyflakes E)
NameError: name 'transpose_mat' is not defined
I'm using the above code for creating a Raster Time-series for Modis LST data. the code ran well till 'transposing the matrix'. the error shown is mentioned below the code. Im new to python, so kindly help me with this issue.
import os
import rasterio
import scipy.sparse as sparse
import pandas as pd
import numpy as np
# Create an empty pandas dataframe called 'table'
table = pd.DataFrame(index = np.arange(0,1))
# Read the points shapefile using GeoPandas
stations = gpd.read_file(r'E:/anakonda/Shape files/AAQ_st1/AAQ_ST1.shp')
stations['lon'] = stations['geometry'].x
stations['lat'] = stations['geometry'].y
Matrix = pd.DataFrame()
# Iterate through the rasters and save the data as individual arrays to a Matrix
for files in os.listdir(r'E:/anakonda/LST_Night'):
if files[-4: ] == '.tif':
dataset = rasterio.open(r'E:/anakonda/LST_Night'+'\\'+files)
data_array = dataset.read(1)
data_array_sparse = sparse.coo_matrix(data_array, shape = (351,545))
data = files[ :-20]
Matrix[data] = data_array_sparse.toarray().tolist()
print('Processing is done for the raster: '+ files[:-20])
# Iterate through the stations and get the corresponding row and column for the related x, y coordinates
for index, row in stations.iterrows():
station_name = str(row['Station'])
lon = float(row['lon'])
lat = float(row['lat'])
x,y = (lon, lat)
row, col = dataset.index(x, y)
print('Processing: '+ station_name)
# Pick the LST value from each stored raster array and record it into the previously created 'table'
for records_date in Matrix.columns.tolist():
a = Matrix[records_date]
LST_Night_value = a.loc[int(row)][int(col)]
table[records_date] = LST_Night_value
transpose_mat = table.T
transpose_mat.rename(columns = {0: 'LstNight(Kel)'}, inplace = True)
transpose_mat.to_csv(r'E:/anakonda/LST_Night'+'\\'+station_name+'.csv')```
This is the error shown:
```File "C:\Anaconda\envs\timeseries\lib\site-packages\pandas\core\indexes\range.py", line 357, in get_loc
raise KeyError(key) from err
KeyError: 2278```

Related

Outcput of Chi2 is showing as in dataframe fromat:

I am trying loop the ChiSquare test and outcome is not shown as required, that is in Dataframe.
All columns are coming one row..
Please help
# Import the function
from scipy.stats import chi2_contingency
chi2_check = []
for i in df_clean.select_dtypes(['object']):
if chi2_contingency(pd.crosstab(df_clean['Final_Comments'], df_clean[i]))[1] < 0.05:
chi2_check.append('Reject Null Hypothesis')
else:
chi2_check.append('Fail to Reject Null Hypothesis')
res = pd.DataFrame(data = [df_clean.select_dtypes(['object']), chi2_check]
).T
res.columns = ['Column', 'Hypothesis']
print(res)
res.columns

am getting NotFittedError: This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator

my code is
import streamlit as st
import pickle
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def transform_text(text):
text = text.lower()
text = nltk.word_tokenize(text)
y = []
for i in text:
if i.isalnum():
y.append(i)
text = y[:]
y.clear()
for i in text:
if i not in stopwords.words('english') and i not in string.punctuation:
y.append(i)
text = y[:]
y.clear()
for i in text:
y.append(ps.stem(i))
return " ".join(y)
tfidf = pickle.load(open('vectorizer.pkl','rb'))
model = pickle.load(open('model.pkl','rb'))
st.title("Email/SMS Spam Classifier")
input_sms = st.text_area("Enter the message")
if st.button('Predict'):
# 1. preprocess
transformed_sms = transform_text(input_sms)
# 2. vectorize
vector_input = tfidf.transform([transformed_sms])
# 3. predict
result = model.predict(vector_input)[0]
# 4. Display
if result == 1:
st.header("Spam")
else:
st.header("Not Spam")

Is there a function to get log and pct change?

I would like to compare log and pct change f the two symbols, but the following error appears:
KeyError: 'Adj Close'
import datetime
import pandas as pd
import numpy as np
import yfinance as yf
start = datetime.datetime(2017, 10, 1)
end = datetime.datetime.now()
symbols = ['BTC-USD', 'ETH-USD']
df = pd.DataFrame()
for i in symbols:
data = yf.download(i, start=None, end=None,show_errors=("True"),
period="4y", interval="1mo")
df[i] = data['Adj Close'].pct_change().dropna()
df['log_stuff'] = \
np.log(df['Adj Close'].astype('float64')/df['Adj Close'].astype('float64').shift(1))
df[['pct_change', 'log_stuff','df']].plot();
You could try the following. Please note, that you can also pass a list to download(), so no loops are required.
import numpy as np
import pandas as pd
import yfinance as yf
symbols = ['BTC-USD', 'ETH-USD']
data = yf.download(symbols, period="4y", interval="1mo")
# calculate pct return
pct_data = data['Adj Close'].pct_change()
pct_data = pct_data.add_suffix('_pct')
# calculate log returns
log_data = np.log(data['Adj Close']) - np.log(data['Adj Close'].shift(1))
log_data = log_data.add_suffix('_log')
# combine returns and drop na values
combined_data = pd.concat([pct_data,log_data], axis=1).dropna()
print(combined_data)
This will yield the following output:
BTC-USD_pct ETH-USD_pct BTC-USD_log ETH-USD_log
Date
2017-12-01 0.383326 0.692483 0.324490 0.526197
2018-01-01 -0.277987 0.477813 -0.325713 0.390564
2018-02-01 0.017298 -0.235276 0.017150 -0.268240
...

Getting the column names chosen after a feature selection method

Given a simple feature selection code below, I want to know the selected columns after the feature selection (The dataset includes a header V1 ... V20)
import pandas as pd
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
def feature_selection(data):
y = data['Class']
X = data.drop(['Class'], axis=1)
fs = SelectKBest(score_func=f_regression, k=10)
# Applying feature selection
X_selected = fs.fit_transform(X, y)
# TODO: determine the columns being selected
return X_selected
data = pd.read_csv("../dataset.csv")
new_data = feature_selection(data)
I appreciate any help.
I have used the iris dataset for my example but you can probably easily modify your code to match your use case.
The SelectKBest method has the scores_ attribute I used to sort the features.
Feel free to ask for any clarifications.
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.datasets import load_iris
def feature_selection(data):
y = data[1]
X = data[0]
column_names = ["A", "B", "C", "D"] # Here you should use your dataframe's column names
k = 2
fs = SelectKBest(score_func=f_regression, k=k)
# Applying feature selection
X_selected = fs.fit_transform(X, y)
# Find top features
# I create a list like [[ColumnName1, Score1] , [ColumnName2, Score2], ...]
# Then I sort in descending order on the score
top_features = sorted(zip(column_names, fs.scores_), key=lambda x: x[1], reverse=True)
print(top_features[:k])
return X_selected
data = load_iris(return_X_y=True)
new_data = feature_selection(data)
I don't know the in-build method, but it can be easily coded.
n_columns_selected = X_new.shape[0]
new_columns = list(sorted(zip(fs.scores_, X.columns))[-n_columns_selected:])
# new_columns order is perturbed, we need to restore it. We use the names of the columns of X as a reference
new_columns = list(sorted(cols_new, key=lambda x: list(X.columns).index(x)))

how to convert pandas str.split call to to dask

I have a dask data frame where the index is a string which looks like this:
12/09/2016 00:00;32.0046;-106.259
12/09/2016 00:00;32.0201;-108.838
12/09/2016 00:00;32.0224;-106.004
(its basically a string encoding the datetime;latitude;longitude of the row)
I'd like to split that while still in the dask context to individual columns representing each of the fields.
I can do that with a pandas dataframe as:
df['date'], df['Lat'], df['Lon'] = df.index.str.split(';', 2).str
But that doesn't work in dask for several of the attempts I've tried. If I directly substitute the df for a dask df I get the error:
'Index' object has no attribute 'str'
If I use the column name instead of index as:
forecastDf['date'], forecastDf['Lat'], forecastDf['Lon'] = forecastDf['dateLocation'].str.split(';', 2).str
I get the error:
TypeError: 'StringAccessor' object is not iterable
Here is an runnable example of this working in Pandas
import pandas as pd
df = pd.DataFrame()
df['dateLocation'] = ['12/09/2016 00:00;32.0046;-106.259','12/09/2016 00:00;32.0201;-108.838','12/09/2016 00:00;32.0224;-106.004']
df = df.set_index('dateLocation')
df['date'], df['Lat'], df['Lon'] = df.index.str.split(';', 2).str
df.head()
Here is the error I get if I directly convert that to dask
import dask.dataframe as dd
dd = dd.from_pandas(df, npartitions=1)
dd['date'], dd['Lat'], dd['Lon'] = dd.index.str.split(';', 2).str
>>TypeError: 'StringAccessor' object is not iterable
forecastDf['date'] = forecastDf['dateLocation'].str.partition(';')[0]
forecastDf['Lat'] = forecastDf['dateLocation'].str.partition(';')[2]
forecastDf['Lon'] = forecastDf['dateLocation'].str.partition(';')[4]
Let me know if this works for you!
First make sure the column is string dtype
forecastDD['dateLocation'] = forecastDD['dateLocation'].astype('str')
Then you can use this to split in dask
splitColumns = client.persist(forecastDD['dateLocation'].str.split(';',2))
You can then index the columns in the new dataframe splitColumns and add them back to the original data frame.
forecastDD = forecastDD.assign(Lat=splitColumns.apply(lambda x: x[0], meta=('Lat', 'f8')), Lon=splitColumns.apply(lambda x: x[1], meta=('Lat', 'f8')), date=splitColumns.apply(lambda x: x[2], meta=('Lat', np.dtype(str))))
Unfortunately I couldn't figure out how to do it without calling compute and creating the temp dataframe.

Resources