Python module import error, infinite loop - python-import

Failing code:
import sys
from sys import stdin
sys.stdin = open("input.txt", "r")
input = stdin.readline
n = int(input())
Passing code:
from sys import stdin
stdin = open("input.txt", "r")
input = stdin.readline
n = int(input())
I'm not good at English, but I'm curious: can you see the problem?
The program seems to fall into an infinite loop after using the sys module.
In the failing version I do import sys, also import stdin with from sys import stdin, reassign sys.stdin to the file, and then bind input = stdin.readline.
In the passing version I only use from sys import stdin, rebind stdin to the file, and then call stdin.readline.
Why is there an infinite loop in the first version? Is it a module crash?
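For what it's worth, here is a minimal sketch of the name binding involved (it assumes an input.txt exists in the working directory): from sys import stdin copies a reference to the current console stream, so reassigning sys.stdin afterwards does not change what the name stdin points to, and stdin.readline keeps waiting for keyboard input, which looks like a hang rather than a crash.

import sys
from sys import stdin               # binds the name stdin to the current console stream

sys.stdin = open("input.txt", "r")  # rebinds the module attribute only
print(stdin is sys.stdin)           # False: stdin still points at the console
# stdin.readline would therefore wait for keyboard input, which looks like an infinite loop

stdin = open("input.txt", "r")      # rebinding the name itself, as in the passing code
print(stdin.readline())             # now reads from input.txt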

Related

Troubles using dask distributed with datashader: 'can't pickle weakref objects'

I'm working with datashader and dask but I'm having problems when trying to plot with a cluster running. To make it more concrete, I have the following example (embedded in a bokeh plot):
import holoviews as hv
import pandas as pd
import dask.dataframe as dd
import numpy as np
from holoviews.operation.datashader import datashade
import datashader.transfer_functions as tf
from dask.distributed import Client, LocalCluster

# initialize the client/cluster
cluster = LocalCluster(n_workers=4, threads_per_worker=1)
dask_client = Client(cluster)

def datashade_plot():
    hv.extension('bokeh')
    # create some random data (in the actual code this is a parquet file with
    # millions of rows, this is just an example)
    delta = 1/1000
    x = np.arange(0, 1, delta)
    y = np.cumsum(np.sqrt(delta)*np.random.normal(size=len(x)))
    df = pd.DataFrame({'X': x, 'Y': y})
    # create dask dataframe
    points_dd = dd.from_pandas(df, npartitions=3)
    # create plot
    points = hv.Curve(points_dd)
    return datashade(points)

dask_client.submit(datashade_plot).result()
This raises a:
TypeError: can't pickle weakref objects
I have the theory that this happens because you can't distribute the datashade operations in the cluster. Sorry if this is a noob question, I'd be very grateful for any advice you could give me.
I think you want to go the other way. That is, pass datashader a dask dataframe instead of a pandas dataframe:
>>> from dask import dataframe as dd
>>> import multiprocessing as mp
>>> dask_df = dd.from_pandas(df, npartitions=mp.cpu_count())
>>> dask_df.persist()
...
>>> cvs = datashader.Canvas(...)
>>> agg = cvs.points(dask_df, ...)
XREF: https://datashader.org/user_guide/Performance.html
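For a fuller picture, a minimal runnable sketch along those lines (the canvas size and the use of points here are placeholders chosen for this example, not anything from the original post):

import numpy as np
import pandas as pd
import datashader as ds
import datashader.transfer_functions as tf
from dask import dataframe as dd

# toy random-walk data, mirroring the example in the question
delta = 1 / 1000
x = np.arange(0, 1, delta)
y = np.cumsum(np.sqrt(delta) * np.random.normal(size=len(x)))
df = pd.DataFrame({'X': x, 'Y': y})

# hand datashader a dask dataframe; the aggregation then runs per partition
dask_df = dd.from_pandas(df, npartitions=4).persist()

cvs = ds.Canvas(plot_width=400, plot_height=400)
agg = cvs.points(dask_df, 'X', 'Y')   # aggregate directly on the dask dataframe
img = tf.shade(agg)                   # render the aggregate to an image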

How to parallelize sklearn's random forest regressor on SLURM

I am currently trying to make sklearn's random forest run in parallel on a SLURM cluster. I have submitted the jobs to the nodes, and then I noticed that the parameter n_jobs=-1 no longer had any effect on SLURM.
I have tried the ipyparallel package, but it gave me error messages. I am not tied to ipyparallel, so I would appreciate any module with which I can parallelize the random forest on the cluster.
from sklearn.ensemble import RandomForestRegressor
from joblib import parallel_backend, register_parallel_backend
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend
import sys
import time
import pickle
import numpy as np

def fit_predict(self, X_train, y, X_test):
    """
    train a model by X_train and y, and then return the prediction of X_test
    """
    pred = None
    client = Client(profile='myprofile')
    bview = client.load_balanced_view()
    register_parallel_backend('ipyparallel',
                              lambda: IPythonParallelBackend(view=bview))
    regr = RandomForestRegressor(n_jobs=-1)
    try:
        with parallel_backend('ipyparallel'):
            regr.fit(X_train, y)
            pred = regr.predict(X_test)
    except Exception as e:
        print(e)
    return pred
Error:
Traceback (most recent call last):
File "job.py", line 124, in <module>
pred = rf.fit_predict(X_train, y_train, X_test)
File "job.py", line 50, in fit_predict
client = Client(profile='myprofile')
File "/home/lfz/.conda/envs/mvi/lib/python3.7/site-packages/ipyparallel/client/client.py", line 419, in __init__
raise IOError(no_file_msg)
OSError: You have attempted to connect to an IPython Cluster but no Controller could be found.
Please double-check your configuration and ensure that a cluster is running.
srun: error: c6-28: task 0: Exited with exit code 1
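For reference, the OSError above is ipyparallel reporting that no controller is reachable for that profile, so the controller and engines have to be running inside the SLURM allocation before the script connects. A rough sketch of the connection step under that assumption (the profile name, engine count, and toy data are placeholders):

# assumes something like `ipcluster start -n 4 --profile=myprofile &` has already been
# run inside the SLURM job, so a controller and engines exist for this profile
import numpy as np
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend
from joblib import parallel_backend, register_parallel_backend
from sklearn.ensemble import RandomForestRegressor

client = Client(profile='myprofile')
print(len(client.ids), "engines connected")   # should be > 0 before fitting

bview = client.load_balanced_view()
register_parallel_backend('ipyparallel',
                          lambda: IPythonParallelBackend(view=bview))

X_train = np.random.random((100, 5))          # toy data for this sketch
y_train = np.random.random(100)

regr = RandomForestRegressor(n_jobs=-1)
with parallel_backend('ipyparallel'):
    regr.fit(X_train, y_train)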

Dask dataframe get second highest value and column name

This code gives me the highest value and column name.
import numpy as np
import pandas as pd
import dask.dataframe as dd
cols=[0,1,2,3,4]
df = pd.DataFrame(np.random.randn(1000, len(cols)), columns=cols)
ddf = dd.from_pandas(df, npartitions=4)
ddf['max_col'] = ddf[cols].idxmax(axis=1)
ddf['max_val'] = ddf[cols].max(axis=1)
I want to get the second highest as well. Something like:
ddf['max2_col'] = ddf[cols].idxmax2(axis=1)
ddf['max2_val'] = ddf[cols].max2(axis=1)
Are there functions like idxmax2 or max2? Or any other optimized way for doing this?
You should normally try to figure out how to do what you want with pandas first. If you cannot, pose that question instead, with the pandas tag, and you will get a faster answer.
The following appears to work for pandas, although it may not be elegant:
import numpy as np
import pandas as pd
import dask.dataframe as dd

cols = [0, 1, 2, 3, 4]
df = pd.DataFrame(np.random.randn(1000, len(cols)), columns=cols)

def make_cols(df):
    df['max2_col'] = df[cols].values.argsort(axis=1)[:, -2]
    df2 = df[cols].values.copy()
    df2.sort(axis=1)
    df['max2_val'] = df2[:, -2]
    return df
To apply it to the dask variant, you can do:
ddf = dd.from_pandas(df, npartitions=4)
ddf = ddf.map_partitions(make_cols)
ddf.head()
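If the columns were not literally named 0 to 4, the positional index from argsort would need to be mapped back to a column label; a small label-aware variant of the same idea (just a sketch) could look like this:

def make_cols_labeled(df):
    vals = df[cols].values
    order = vals.argsort(axis=1)                    # column positions, ascending by value
    df['max2_col'] = np.array(cols)[order[:, -2]]   # label of the second-largest column
    df['max2_val'] = np.sort(vals, axis=1)[:, -2]   # the second-largest value itself
    return df

ddf = ddf.map_partitions(make_cols_labeled)
ddf.head()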

Google Cloud ML exited with a non-zero status of 245 when training

I tried to train my model on Google Cloud ML using this sample code:
import keras
from keras import optimizers
from keras import losses
from keras import metrics
from keras.models import Model, Sequential
from keras.layers import Dense, Lambda, RepeatVector, TimeDistributed
import numpy as np

def test():
    model = Sequential()
    model.add(Dense(2, input_shape=(3,)))
    model.add(RepeatVector(3))
    model.add(TimeDistributed(Dense(3)))
    model.compile(loss=losses.MSE,
                  optimizer=optimizers.RMSprop(lr=0.0001),
                  metrics=[metrics.categorical_accuracy],
                  sample_weight_mode='temporal')
    x = np.random.random((1, 3))
    y = np.random.random((1, 3, 3))
    model.train_on_batch(x, y)

if __name__ == '__main__':
    test()
and I got this error:
The replica master 0 exited with a non-zero status of 245. Termination reason: Error.
The detailed error output is big, so I'm pasting it here in a pastebin.
Note this output:
Module raised an exception for failing to call a subprocess Command '['python', '-m', u'trainer.test', '--job-dir', u'gs://my_test_bucket_keras/s_27_100630']' returned non-zero exit status -11.
I guess Google Cloud runs your code with an extra parameter called --job-dir, so perhaps you can try adding the following to your example code:
import ...
import argparse

def test():
    model = Sequential()
    model.add(Dense(2, input_shape=(3,)))
    model.add(RepeatVector(3))
    model.add(TimeDistributed(Dense(3)))
    model.compile(loss=losses.MSE,
                  optimizer=optimizers.RMSprop(lr=0.0001),
                  metrics=[metrics.categorical_accuracy],
                  sample_weight_mode='temporal')
    x = np.random.random((1, 3))
    y = np.random.random((1, 3, 3))
    model.train_on_batch(x, y)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Input Arguments
    parser.add_argument(
        '--job-dir',
        help='GCS location to write checkpoints and export models',
        required=True
    )
    args = parser.parse_args()
    arguments = args.__dict__
    test()
    # test(**arguments)  # or if you want to use the job_dir parameter in your code
I'm not 100% sure this will work, but I think you can give it a try.
I also have a post here doing something similar; perhaps you can take a look at that as well.
The problem is resolved. All I had to do was use TensorFlow 1.1.0 instead of the default 1.0.1.
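For completeness, one way to pin the TensorFlow version for a Cloud ML training job is through the trainer package's setup.py; this is just a sketch assuming the usual trainer/ package layout, not something from the original post:

# setup.py at the root of the package submitted to Cloud ML
from setuptools import find_packages, setup

setup(
    name='trainer',
    version='0.1',
    packages=find_packages(),
    # pin the framework versions the job needs instead of relying on the defaults
    install_requires=['tensorflow==1.1.0', 'keras'],
)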

dask + luigi: raise ValueError('url type not understood: %s' % urlpath)

I am trying to combine dask with luigi, and while the business logic works fine by itself, the code starts throwing errors when I run it as a Luigi task:
raise ValueError('url type not understood: %s' % urlpath)
ValueError: url type not understood: <_io.TextIOWrapper name='../data/2017_04_11_oldsource_geocoded.csv-luigi-tmp-1647603946' mode='wb' encoding='UTF-8'>
The code is below (I dropped the business-logic part to make it shorter):
import pandas as pd
import geopandas as gp
from geopandas.tools import sjoin
from dask import dataframe as dd
from shapely.geometry import Point
from os import path
import luigi

class geocode_tweets(luigi.Task):
    boundaries = _load_geoboundaries()
    nyc = boundaries[0].unary_union

    def requires(self):
        return []

    def output(self):
        self.path = '../data/2017_04_11_oldsource_geocoded.csv'
        return luigi.LocalTarget(self.path)

    def run(self):
        df = dd.read_csv(path.join(data_dir, '2017_03_22_oldsource.csv'))
        df['geometry'] = df.apply(_get_point, axis=1)
        meta = _form_meta(df)
        S = df.map_partitions(
            distributed_sjoin, boundaries=self.boundaries,
            nyc_border=self.nyc, meta=meta).drop('geometry', axis=1)
        f = self.output().open('w')
        S.to_csv(f)
        f.close()
The problem, it looks like, is in the output part.
As far as I understand, the problem is that dask does not accept a Luigi file object as a substitute for the filename string.
Dask defines DataFrame.to_csv(filename, **kwargs) and you are sending it a file instead of a filename. Replace those last three lines with:
S.to_csv(self.output().path)
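Put back into the task, the end of run() would then look roughly like this (a sketch; note that depending on the dask version, to_csv may expect a filename pattern and write one file per partition):

def run(self):
    df = dd.read_csv(path.join(data_dir, '2017_03_22_oldsource.csv'))
    df['geometry'] = df.apply(_get_point, axis=1)
    meta = _form_meta(df)
    S = df.map_partitions(
        distributed_sjoin, boundaries=self.boundaries,
        nyc_border=self.nyc, meta=meta).drop('geometry', axis=1)
    # pass the target's path (a string), not an opened Luigi file handle
    S.to_csv(self.output().path)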