Store returned values in variable - return

I've got some code that renames files to randomly chosen numbers. The code works, but I can't seem to figure out how to store the original filenames together with their respective renamed filenames (the random numbers).
When I run the code, 'return' only gives me the values of the last iteration. How do I store every original filename and its respective renamed filename?
So, I want to end up with a list of 'file_name' (which contains all the original filenames) and 'rand_keynumber' (the generated random numbers).
Thank you.
import os
import random

numbers = range(1, 1025)
numbers_list = list(map(str, numbers))

def keynumber():
    # Generate a random index
    rand_index = random.randint(0, len(numbers_list)-1)
    # Get the keynumber
    global rand_keynumber
    rand_keynumber = numbers_list[rand_index]
    # Remove the used-up keynumber from the list to
    # prevent randomly selecting it again when renaming
    numbers_list.remove(rand_keynumber)
    return rand_keynumber

def renam_name():
    os.chdir(r"C:\Users\samwi\OneDrive\Bureaublad\videos_anonimisatie\video_to_rename")
    file_list = os.listdir(r"C:\Users\samwi\OneDrive\Bureaublad\videos_anonimisatie\video_to_rename")
    global file_name
    for f in file_list:
        # get the file extension
        file_name, img_type = os.path.splitext(f)
        os.rename(f, keynumber() + img_type)
    return file_name

renam_name()

It looks like you are assigning a new value to the global file_name variable on each iteration. You need to append each filename and the selected key number to a list before moving to the next file in file_list.
import os
import random

numbers = range(1, 1025)
numbers_list = list(map(str, numbers))
file_names = []

def keynumber():
    # Generate a random index
    rand_index = random.randint(0, len(numbers_list)-1)
    # Get the keynumber
    rand_keynumber = numbers_list[rand_index]
    # Remove the used-up keynumber from the list to
    # prevent randomly selecting it again when renaming
    numbers_list.remove(rand_keynumber)
    return rand_keynumber

def renam_name():
    os.chdir(r"C:\Users\samwi\OneDrive\Bureaublad\videos_anonimisatie\video_to_rename")
    file_list = os.listdir(r"C:\Users\samwi\OneDrive\Bureaublad\videos_anonimisatie\video_to_rename")
    for f in file_list:
        # get the file extension
        next_file_name, img_type = os.path.splitext(f)
        next_keynumber = keynumber()
        file_names.append([next_file_name, next_keynumber])
        os.rename(f, next_keynumber + img_type)

renam_name()

You could also just create an empty list under import random and, in your functions, just before returning rand_keynumber and file_name, append what you want to it using .append. A sketch of that idea is shown below.
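
For example, a minimal sketch of that approach (the folder path is a placeholder rather than the original one, and the two list names are purely illustrative):

import os
import random

numbers_list = list(map(str, range(1, 1025)))
original_names = []  # original filenames are appended here
key_numbers = []     # the chosen random key numbers are appended here

def keynumber():
    rand_index = random.randint(0, len(numbers_list) - 1)
    rand_keynumber = numbers_list[rand_index]
    numbers_list.remove(rand_keynumber)
    key_numbers.append(rand_keynumber)  # store the key number before returning it
    return rand_keynumber

def renam_name(folder):
    os.chdir(folder)
    for f in os.listdir(folder):
        file_name, img_type = os.path.splitext(f)
        original_names.append(file_name)  # store the original name before renaming
        os.rename(f, keynumber() + img_type)

renam_name(r"C:\path\to\video_to_rename")  # placeholder folder, not the original path
print(list(zip(original_names, key_numbers)))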

Related

How to avoid importing nil object when reading spreadsheet with roo on Rails 5.2?

My application manages hierarchical classifications based on lists of values (dictionaries). At some point, I need to import the parent-child relationships from an Excel sheet and create persisted ValuesToValues objects.
Based on Ryan Bates' RailsCast 396, I created the import model in which the main loop is:
(2..spreadsheet.last_row).map do |i|
  # Read columns indexes
  parent = header.index("Parent") + 1
  level = header.index("Level") + 1
  code = header.index("Code") + 1
  # Skip if parent is blank
  next if spreadsheet.cell(i, parent).blank?
  # Count links
  @links_counter += 1
  parent_values_list_id = values_lists[(spreadsheet.cell(i, level).to_i) - 1]
  child_values_list_id = values_lists[spreadsheet.cell(i, level).to_i]
  parent_value_id = Value.find_by(values_list_id: parent_values_list_id, code: spreadsheet.cell(i, parent).to_s).id
  child_value_id = Value.find_by(values_list_id: child_values_list_id, code: spreadsheet.cell(i, code).to_s).id
  link_code = "#{parent_values_list_id}/#{spreadsheet.cell(i, parent)} - #{child_values_list_id}/#{spreadsheet.cell(i, code)}"
  link_name = "#{spreadsheet.cell(i, parent)} #{spreadsheet.cell(i, code)}"
  link = ValuesToValues.new(playground_id: playground_id,
                            classification_id: @classification.id,
                            parent_values_list_id: parent_values_list_id,
                            child_values_list_id: child_values_list_id,
                            parent_value_id: parent_value_id,
                            child_value_id: child_value_id,
                            code: link_code,
                            name: link_name)
end
The issue is that, when encountering a root value (one without a parent value), the loop creates a nil object, which does not pass the later validation.
How can I build the loop so that it only considers rows where the Parent cell is not empty?
I finally decided to manage my own array of imported values instead of using the array based on the filtered sheet rows.
I added the following code around the main loop:
# Create array of links
linked_values = Array.new
# start loading links for each values list
(2..spreadsheet.last_row).map do |i|
  ...

and

  ...
  linked_values << link
end

linked_values
The linked_values array is then returned, and it contains only the valid link records.

Saving SEC 10-K annual report text to files (trouble with decoding)

I am trying to bulk-download the text visible to the "end-user" from 10-K SEC EDGAR reports (I don't care about tables) and save it to a text file. I found the code below on YouTube, but I am facing 2 challenges:
I am not sure whether I am capturing all of the text, and when I print what I scrape from the URL below, I get very weird output (special characters, e.g. at the very end of the print-out).
I can't seem to save the text to a .txt file; I'm not sure whether this is due to encoding (I am entirely new to programming).
import re
import requests
import unicodedata
from bs4 import BeautifulSoup

def restore_windows_1252_characters(restore_string):
    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''
    return re.sub(r'[\u0080-\u0099]', to_windows_1252, restore_string)

# define the url to specific html_text file
new_html_text = r"https://www.sec.gov/Archives/edgar/data/796343/0000796343-14-000004.txt"

# grab the response
response = requests.get(new_html_text)
page_soup = BeautifulSoup(response.content, 'html5lib')
page_text = page_soup.html.body.get_text(' ', strip=True)

# normalize the text, remove characters. Additionally, restore missing windows characters.
page_text_norm = restore_windows_1252_characters(unicodedata.normalize('NFKD', page_text))

# print: this works, however it gives me weird special characters in the print (e.g., at the very end)
print(page_text_norm)

# save to file: this only gives me an empty text file
with open('testfile.txt', 'w') as file:
    file.write(page_text_norm)
Try this. If you give an example of the data you expect, it will be easier for people to understand your needs.
from simplified_scrapy import SimplifiedDoc,req,utils
url = 'https://www.sec.gov/Archives/edgar/data/796343/0000796343-14-000004.txt'
html = req.get(url)
doc = SimplifiedDoc(html)
# text = doc.body.text
text = doc.body.unescape() # Converting HTML entities
utils.saveFile("testfile.txt",text)
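
If you prefer to keep the requests/BeautifulSoup code from the question, the empty file is quite possibly an encoding error on write: on Windows, open() defaults to a legacy codec that may not be able to represent every character in the scraped text. A minimal sketch of just the saving step with an explicit encoding, reusing the question's page_text_norm variable:

# write with an explicit UTF-8 encoding so non-ASCII characters in the filing
# do not raise UnicodeEncodeError and leave an empty or truncated file
with open('testfile.txt', 'w', encoding='utf-8') as file:
    file.write(page_text_norm)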

How broadcast variables are used in dask parallelization

I have some code that applies a map function to a dask bag. I need a lookup dictionary to apply that function, and it doesn't work with client.scatter.
I don't know if I am doing the right thing, because the workers start but they don't do anything. I have tried different configurations based on different examples, but I can't get it to work. Any support will be appreciated.
I know that in Spark you define a broadcast variable and access its content via variable.value inside the function you want to apply. I don't see the same thing with dask.
# Function to map
def transform_contacts_add_to_historic_sin(data, historic_dict):
    raw_buffer = ''
    line = json.loads(data)
    if line['timestamp'] > historic_dict['timestamp']:
        raw_buffer = raw_buffer + line['vid']
    return raw_buffer

# main program
# historic_dict is a dictionary filled earlier; it is the lookup variable for the map function
# file_records will be a list of JSON lines read from an S3 file

from distributed import Client
client = Client()

historic_dict_scattered = client.scatter(historic_dict, broadcast=True)

file_records = []
raw_data = s3_procedure.read_raw_file(... S3 file.......)
data = TextIOWrapper(raw_data)
for line in data:
    file_records.append(line)

bag_chunk = db.from_sequence(file_records, npartitions=16)
bag_transform = bag_chunk.map(lambda x: transform_contacts_add_to_historic(x), args=[historic_dict_scattered])
bag_transform.compute()
If your dictionary is small you can just include it directly
def func(partition, d):
    return ...

my_dict = {...}
b = b.map(func, d=my_dict)
If it's large then you might want to wrap it up in Dask delayed first
my_dict = dask.delayed(my_dict)
b = b.map(func, d=my_dict)
If it's very large then yes, you might want to scatter it first (though I would avoid this if things work out with either of the approaches above).
[my_dict] = client.scatter([my_dict])
b = b.map(func, d=my_dict)
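
For a concrete, runnable sketch of the first (small-dictionary) approach, with made-up records and lookup values standing in for the question's S3 data:

import dask.bag as db

historic_dict = {'timestamp': 100}           # illustrative lookup dictionary
records = [{'timestamp': 90, 'vid': 'a'},    # illustrative records standing in for the S3 lines
           {'timestamp': 120, 'vid': 'b'}]

def newer_vids(record, d):
    # keep the vid only when the record is newer than the lookup timestamp
    return record['vid'] if record['timestamp'] > d['timestamp'] else ''

bag = db.from_sequence(records, npartitions=2)
result = bag.map(newer_vids, d=historic_dict).compute()
print(result)  # ['', 'b']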

Rails application with regular expression validation

I have created a users-style application using Rails. In this app, a text file is imported into the DB. I need to validate the mobile number: it should not contain + or * or any other special characters; if any are present, they should be ignored and only the rest kept. I have used the following code to read the text file into an array.
File.open('text file') do |f|
  while line = f.gets
    array = line.split(',')
    user = User.new
    user.user_name = array[0]
    user.email_id = array[1]
    user.mobile_number = array[2]
    user.save
  end
end
Use a global substitution with a regex to remove the non-digit part:
user.mobile_number = array[2].gsub(/[^0-9]/,'')

How can you join two or more dictionaries created by Bio.SeqIO.index?

I would like to be able to join the two "dictionaries" stored in "indata" and "pairdata", but this code,
indata = SeqIO.index(infile, infmt)
pairdata = SeqIO.index(pairfile, infmt)
indata.update(pairdata)
produces the following error:
indata.update(pairdata)
TypeError: update() takes exactly 1 argument (2 given)
I have tried using,
indata = SeqIO.to_dict(SeqIO.parse(infile, infmt))
pairdata = SeqIO.to_dict(SeqIO.parse(pairfile, infmt))
indata.update(pairdata)
which does work, but the resulting dictionaries take up too much memory to be practical for the sizes of infile and pairfile I have.
The final option I have explored is:
indata = SeqIO.index_db(indexfile, [infile, pairfile], infmt)
which works perfectly, but is very slow. Does anyone know how/whether I can successfully join the two indexes from the first example above?
SeqIO.index returns a read-only dictionary-like object, so update will not work on it (apologies for the confusing error message; I just checked in a fix for that to the main Biopython repository).
The best approach is to either use index_db, which will be slower but only needs to index the file once, or to define a higher-level object which acts like a dictionary over your multiple files. Here is a simple example:
from Bio import SeqIO

class MultiIndexDict:
    def __init__(self, *indexes):
        self._indexes = indexes
    def __getitem__(self, key):
        for idx in self._indexes:
            try:
                return idx[key]
            except KeyError:
                pass
        raise KeyError("{0} not found".format(key))

indata = SeqIO.index("f001", "fasta")
pairdata = SeqIO.index("f002", "fasta")
combo = MultiIndexDict(indata, pairdata)

print(combo['gi|3318709|pdb|1A91|'].description)
print(combo['gi|1348917|gb|G26685|G26685'].description)
print(combo["key_failure"])
If you don't plan to use the index again and memory isn't a limitation (which both appear to be true in your case), you can tell Bio.SeqIO.index_db(...) to use an in-memory SQLite3 index with the special index name ":memory:", like so:
indata = SeqIO.index_db(":memory:", [infile, pairfile], infmt)
where infile and pairfile are filenames, and infmt is their format type as defined in Bio.SeqIO (e.g. "fasta").
This is actually a general trick with Python's SQLite3 library. For a small set of files this should be much faster than building the SQLite index on disk.
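
For completeness, a short sketch that combines the in-memory index with the two example files used above (f001 and f002 in FASTA format, with the same record id shown in the MultiIndexDict example):

from Bio import SeqIO

# Build a combined read-only index over both files, backed by an in-memory SQLite database
combined = SeqIO.index_db(":memory:", ["f001", "f002"], "fasta")

print(len(combined))                                 # total number of records in both files
print(combined["gi|3318709|pdb|1A91|"].description)  # records are parsed from disk on demand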
