Outcput of Chi2 is showing as in dataframe fromat: - machine-learning

I am trying loop the ChiSquare test and outcome is not shown as required, that is in Dataframe.
All columns are coming one row..
Please help
# Import the function
from scipy.stats import chi2_contingency
chi2_check = []
for i in df_clean.select_dtypes(['object']):
if chi2_contingency(pd.crosstab(df_clean['Final_Comments'], df_clean[i]))[1] < 0.05:
chi2_check.append('Reject Null Hypothesis')
else:
chi2_check.append('Fail to Reject Null Hypothesis')
res = pd.DataFrame(data = [df_clean.select_dtypes(['object']), chi2_check]
).T
res.columns = ['Column', 'Hypothesis']
print(res)
res.columns

Related

Is there a function to get log and pct change?

I would like to compare log and pct change f the two symbols, but the following error appears:
KeyError: 'Adj Close'
import datetime
import pandas as pd
import numpy as np
import yfinance as yf
start = datetime.datetime(2017, 10, 1)
end = datetime.datetime.now()
symbols = ['BTC-USD', 'ETH-USD']
df = pd.DataFrame()
for i in symbols:
data = yf.download(i, start=None, end=None,show_errors=("True"),
period="4y", interval="1mo")
df[i] = data['Adj Close'].pct_change().dropna()
df['log_stuff'] = \
np.log(df['Adj Close'].astype('float64')/df['Adj Close'].astype('float64').shift(1))
df[['pct_change', 'log_stuff','df']].plot();
You could try the following. Please note, that you can also pass a list to download(), so no loops are required.
import numpy as np
import pandas as pd
import yfinance as yf
symbols = ['BTC-USD', 'ETH-USD']
data = yf.download(symbols, period="4y", interval="1mo")
# calculate pct return
pct_data = data['Adj Close'].pct_change()
pct_data = pct_data.add_suffix('_pct')
# calculate log returns
log_data = np.log(data['Adj Close']) - np.log(data['Adj Close'].shift(1))
log_data = log_data.add_suffix('_log')
# combine returns and drop na values
combined_data = pd.concat([pct_data,log_data], axis=1).dropna()
print(combined_data)
This will yield the following output:
BTC-USD_pct ETH-USD_pct BTC-USD_log ETH-USD_log
Date
2017-12-01 0.383326 0.692483 0.324490 0.526197
2018-01-01 -0.277987 0.477813 -0.325713 0.390564
2018-02-01 0.017298 -0.235276 0.017150 -0.268240
...

Error while trying to transpose the matrix

Code for a single raster file:
import geopandas as gpd
#import os
import rasterio
import scipy.sparse as sparse
import pandas as pd
import numpy as np
# Create an empty pandas dataframe called 'table'
table = pd.DataFrame(index = np.arange(0,1))
# Read the points shapefile using GeoPandas
stations = gpd.read_file(r'E:/anakonda/Shape files/AAQ_st1/AAQ_ST1.shp')
stations['lon'] = stations['geometry'].x
stations['lat'] = stations['geometry'].y
Matrix = pd.DataFrame()
# Iterate through the rasters and save the data as individual arrays to a Matrix
dataset = rasterio.open(r'E:/anakonda/LST_day/MOD11A1.006_LST_Day_1km_doy2019082_aid0001.tif')
data_array = dataset.read(1)
data_array_sparse = sparse.coo_matrix(data_array, shape = (351, 545))
for records_date in Matrix.columns.tolist():
a = Matrix
LST_day_value = a.loc[int(row)][int(col)]
table[records_date] = LST_day_value
transpose_mat = table.T
transpose_mat.rename(columns = {0: 'LST_Day(Kel)'}, inplace = True)
transpose_mat.to_csv(r'E:/anakonda/LST_day'+'\\'+station_name+'.csv')
Error code lines:
LST_day_value = a.loc[int(row)][int(col)]
transpose_mat.to_csv(r'E:/anakonda/LST_day'+'\'+station_name+'.csv')
Errors Shown:
Undefined Name 'row' (pyflakes E)
Undefined Name 'col' (pyflakes E)
NameError: name 'transpose_mat' is not defined
I'm using the above code for creating a Raster Time-series for Modis LST data. the code ran well till 'transposing the matrix'. the error shown is mentioned below the code. Im new to python, so kindly help me with this issue.
import os
import rasterio
import scipy.sparse as sparse
import pandas as pd
import numpy as np
# Create an empty pandas dataframe called 'table'
table = pd.DataFrame(index = np.arange(0,1))
# Read the points shapefile using GeoPandas
stations = gpd.read_file(r'E:/anakonda/Shape files/AAQ_st1/AAQ_ST1.shp')
stations['lon'] = stations['geometry'].x
stations['lat'] = stations['geometry'].y
Matrix = pd.DataFrame()
# Iterate through the rasters and save the data as individual arrays to a Matrix
for files in os.listdir(r'E:/anakonda/LST_Night'):
if files[-4: ] == '.tif':
dataset = rasterio.open(r'E:/anakonda/LST_Night'+'\\'+files)
data_array = dataset.read(1)
data_array_sparse = sparse.coo_matrix(data_array, shape = (351,545))
data = files[ :-20]
Matrix[data] = data_array_sparse.toarray().tolist()
print('Processing is done for the raster: '+ files[:-20])
# Iterate through the stations and get the corresponding row and column for the related x, y coordinates
for index, row in stations.iterrows():
station_name = str(row['Station'])
lon = float(row['lon'])
lat = float(row['lat'])
x,y = (lon, lat)
row, col = dataset.index(x, y)
print('Processing: '+ station_name)
# Pick the LST value from each stored raster array and record it into the previously created 'table'
for records_date in Matrix.columns.tolist():
a = Matrix[records_date]
LST_Night_value = a.loc[int(row)][int(col)]
table[records_date] = LST_Night_value
transpose_mat = table.T
transpose_mat.rename(columns = {0: 'LstNight(Kel)'}, inplace = True)
transpose_mat.to_csv(r'E:/anakonda/LST_Night'+'\\'+station_name+'.csv')```
This is the error shown:
```File "C:\Anaconda\envs\timeseries\lib\site-packages\pandas\core\indexes\range.py", line 357, in get_loc
raise KeyError(key) from err
KeyError: 2278```

Getting the column names chosen after a feature selection method

Given a simple feature selection code below, I want to know the selected columns after the feature selection (The dataset includes a header V1 ... V20)
import pandas as pd
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
def feature_selection(data):
y = data['Class']
X = data.drop(['Class'], axis=1)
fs = SelectKBest(score_func=f_regression, k=10)
# Applying feature selection
X_selected = fs.fit_transform(X, y)
# TODO: determine the columns being selected
return X_selected
data = pd.read_csv("../dataset.csv")
new_data = feature_selection(data)
I appreciate any help.
I have used the iris dataset for my example but you can probably easily modify your code to match your use case.
The SelectKBest method has the scores_ attribute I used to sort the features.
Feel free to ask for any clarifications.
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.datasets import load_iris
def feature_selection(data):
y = data[1]
X = data[0]
column_names = ["A", "B", "C", "D"] # Here you should use your dataframe's column names
k = 2
fs = SelectKBest(score_func=f_regression, k=k)
# Applying feature selection
X_selected = fs.fit_transform(X, y)
# Find top features
# I create a list like [[ColumnName1, Score1] , [ColumnName2, Score2], ...]
# Then I sort in descending order on the score
top_features = sorted(zip(column_names, fs.scores_), key=lambda x: x[1], reverse=True)
print(top_features[:k])
return X_selected
data = load_iris(return_X_y=True)
new_data = feature_selection(data)
I don't know the in-build method, but it can be easily coded.
n_columns_selected = X_new.shape[0]
new_columns = list(sorted(zip(fs.scores_, X.columns))[-n_columns_selected:])
# new_columns order is perturbed, we need to restore it. We use the names of the columns of X as a reference
new_columns = list(sorted(cols_new, key=lambda x: list(X.columns).index(x)))

OpenAI gym breakout-ram-v4 unable to learn

I am using Q learning and the program should be able to play the game after some tries but it is not learning even when the epsilon value if 0.1.
I have tried changing the batch size the memory size. I have changed the code to give -1 reward if the player dies.
import gym
import numpy as np
import random
import tensorflow as tf
import numpy as np
from time import time
import keyboard
import sys
import time
env = gym.make("Breakout-ram-v4")
observationSpace = env.observation_space
actionSpace= env.action_space
episode = 500
class Model_QNN :
def __init__(self):
self.memory = []
self.MAX_MEMORY_TO_USE = 60_000
self.gamma = 0.9
self.model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=(128,1)),
tf.keras.layers.Dense(256,activation="relu"),
tf.keras.layers.Dense(64,activation="relu"),
tf.keras.layers.Dense(actionSpace.n , activation= "softmax")
])
self.model.compile(optimizer="adam",loss="mse",metrics=["accuracy"])
def remember(self, steps , done):
self.memory.append([steps,done])
if(len(self.memory) >= self.MAX_MEMORY_TO_USE):
del self.memory[0]
def replay(self,batch_size= 32):
states, targets_f = [], []
if(len(self.memory)< batch_size) :
return
else:
mini = random.sample(self.memory,batch_size)
states ,targets = [], []
for steps , done in mini :
target= steps[2] ;
if not done :
target = steps[2] + (self.gamma* np.amax(self.model.predict(steps[3].reshape(1,128,1))[0]))
target_f = self.model.predict(steps[0].reshape(1,128,1))
target_f[0][steps[1]] = target
states.append(steps[0])
targets.append(target_f[0])
self.model.fit(np.array(states).reshape(len(states),128,1), np.array(targets),verbose=0,epochs=10)
def act(self,state,ep):
if(random.random()< ep):
action = actionSpace.sample()
else :
np.array([state]).shape
action= self.model.predict(state.reshape(1,128,1))
action = np.argmax(action)
return action;
def saveModel (self):
print("Saving")
self.model.save("NEWNAMEDONE")
def saveBackup(self,num):
self.model.save("NEWNAME"+str(int(num)))
def main():
agent= Model_QNN();
epsilon=0.9
t_end = time.time()
score= 0
for e in range(2000):
print("Working on episode : "+str(e)+" eps "+str(epsilon)+" Score " + str(score))
preState = env.reset()
preState,reward,done,_ = env.step(1)
mainLife=5
done = False
score= 0
icount = 0
render=False
if e % 400 ==0 and not e==0:
render =True
while not done:
icount+=1
if render:
env.render()
if keyboard.is_pressed('q'):
agent.saveBackup(100)
agent.saveModel()
quit()
rewrd=0
if ( _["ale.lives"] < mainLife ):
mainLife-=1
rewrd=-1
action=1
else:
action = agent.act(preState,epsilon)
newState,reward,done,_ = env.step(action)
if rewrd ==-1 :
reward =-1
agent.remember([preState/255,action,reward,newState/255],done);
preState= newState;
score+=reward
if done :
break
agent.replay(1024)
if epsilon >= 0.18 :
epsilon = epsilon * 0.995;
if ((e+1)%500==0):
agent.saveBackup((e+1)/20)
agent.saveModel()
if __name__=='__main__':
main()
There is no error message the program should learn and it is not
Why are you using Softmax on your output layer?
If you want to use Softmax use Cross-Entropy as your loss. However, it looks like you're trying to implement a value based learning system. The activation function on your output layer should be linear.
I suggest you try your implementation on Cartpole-v0 then LunarLanding-v2 first.
Those are solved environments and a great place to sanity check your code.
"There is no error message the program should learn and it is not."
Welcome to ML where things fail silently.

how to convert pandas str.split call to to dask

I have a dask data frame where the index is a string which looks like this:
12/09/2016 00:00;32.0046;-106.259
12/09/2016 00:00;32.0201;-108.838
12/09/2016 00:00;32.0224;-106.004
(its basically a string encoding the datetime;latitude;longitude of the row)
I'd like to split that while still in the dask context to individual columns representing each of the fields.
I can do that with a pandas dataframe as:
df['date'], df['Lat'], df['Lon'] = df.index.str.split(';', 2).str
But that doesn't work in dask for several of the attempts I've tried. If I directly substitute the df for a dask df I get the error:
'Index' object has no attribute 'str'
If I use the column name instead of index as:
forecastDf['date'], forecastDf['Lat'], forecastDf['Lon'] = forecastDf['dateLocation'].str.split(';', 2).str
I get the error:
TypeError: 'StringAccessor' object is not iterable
Here is an runnable example of this working in Pandas
import pandas as pd
df = pd.DataFrame()
df['dateLocation'] = ['12/09/2016 00:00;32.0046;-106.259','12/09/2016 00:00;32.0201;-108.838','12/09/2016 00:00;32.0224;-106.004']
df = df.set_index('dateLocation')
df['date'], df['Lat'], df['Lon'] = df.index.str.split(';', 2).str
df.head()
Here is the error I get if I directly convert that to dask
import dask.dataframe as dd
dd = dd.from_pandas(df, npartitions=1)
dd['date'], dd['Lat'], dd['Lon'] = dd.index.str.split(';', 2).str
>>TypeError: 'StringAccessor' object is not iterable
forecastDf['date'] = forecastDf['dateLocation'].str.partition(';')[0]
forecastDf['Lat'] = forecastDf['dateLocation'].str.partition(';')[2]
forecastDf['Lon'] = forecastDf['dateLocation'].str.partition(';')[4]
Let me know if this works for you!
First make sure the column is string dtype
forecastDD['dateLocation'] = forecastDD['dateLocation'].astype('str')
Then you can use this to split in dask
splitColumns = client.persist(forecastDD['dateLocation'].str.split(';',2))
You can then index the columns in the new dataframe splitColumns and add them back to the original data frame.
forecastDD = forecastDD.assign(Lat=splitColumns.apply(lambda x: x[0], meta=('Lat', 'f8')), Lon=splitColumns.apply(lambda x: x[1], meta=('Lat', 'f8')), date=splitColumns.apply(lambda x: x[2], meta=('Lat', np.dtype(str))))
Unfortunately I couldn't figure out how to do it without calling compute and creating the temp dataframe.

Resources