Using AKSampleDescriptor - iOS

I am using an adapted AKSampler example, in which I try to use the sforzando output of the Fluid.sf3 melodic sounds. Sforzando creates a .sfz file for each instrument, but they all point to one huge .wav file as the global sample.
In each instrument's .sfz file there is an offset and an end point describing the part of the wave file to be used.
When I load the .sfz file I get a crash due to memory problems. It seems that for every region defined in the .sfz file the complete .wav file (140 MB) is loaded again.
Most likely, loading the sample file with AKSampleDescriptor as done in the AKSampler example ignores the offset and end point (AKSampleDescriptor.startPoint and AKSampleDescriptor.endPoint) and reloads the complete .wav file each time.
Is there a way to load just the wanted start-to-end part of the sample file? The complete file has all the sample data for all the instruments. (I know and use polyphony, which extracts only one instrument at a time and works fine, but this is for other use.)
Or, and that seems best to me, load the file once and then have the sample descriptors point to the data in memory.
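(Aside: reading just a slice of a WAV at the file level is cheap. Here is a minimal Python sketch using the standard wave module, with made-up frame offsets and a hypothetical file name; this is not the AKSampler API:)
import wave
start, end = 44100, 88200            # made-up sample-frame offsets for one region
w = wave.open('big.wav', 'rb')       # hypothetical big sample file
w.setpos(start)                      # seek to the region's first sample frame
frames = w.readframes(end - start)   # read only that region's frames, not the whole file
w.close()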

Good suggestions, Rob. I just ran into this one-giant-WAV issue myself, having never seen it before. I was also using Sforzando for conversion. I'll look into adding the necessary capabilities to AKSampler. In the meantime, it might be easier to write a program to cut up the one WAV file into smaller pieces and adjust the SFZ accordingly.
Here is some Python 2.7 code to do this, which I have used successfully with a Sforzando-converted sf2 soundfont. It might need changes to work for you (there is huge variability among .sfz files), but at least it might help you get started. This code requires the PyDub library for manipulating WAV audio.
import os
import re
from pydub import AudioSegment

def stripComments(text):
    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            return " "  # note: a space and not an empty string
        else:
            return s
    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )
    return re.sub(pattern, replacer, text)

def updateSplitList(splitList, regionLabels, values):
    if len(values) > 3:
        start = int(values['offset'])
        length = int(values['end']) - start
        name = regionLabels.pop(0)
        splitList.add((name, start, length))

def lookupSplitName(splitList, offset, end):
    # the tuples hold (name, start, length), so compare end against start + length
    for (name, start, length) in splitList:
        if offset == start and end == start + length:
            return name
    return None

def outputGroupAndRegion(outputFile, splitList, values):
    if values.has_key('lokey') and values.has_key('hikey') and values.has_key('pitch_keycenter'):
        outputFile.write('<group> lokey=%s hikey=%s pitch_keycenter=%s\n' % (values['lokey'], values['hikey'], values['pitch_keycenter']))
    elif values.has_key('key') and values.has_key('pitch_keycenter'):
        outputFile.write('<group> key=%s pitch_keycenter=%s\n' % (values['key'], values['pitch_keycenter']))
    if len(values) > 3:
        outputFile.write(' <region> ')
        if values.has_key('lovel') and values.has_key('hivel'):
            outputFile.write('lovel=%s hivel=%s ' % (values['lovel'], values['hivel']))
        if values.has_key('tune'):
            outputFile.write('tune=%s ' % values['tune'])
        if values.has_key('volume'):
            outputFile.write('volume=%s ' % values['volume'])
        if values.has_key('offset'):
            outputFile.write('offset=0 ')
        if values.has_key('end'):
            outputFile.write('end=%d ' % (int(values['end']) - int(values['offset'])))
        if values.has_key('loop_mode'):
            outputFile.write('loop_mode=%s ' % values['loop_mode'])
        if values.has_key('loop_start'):
            outputFile.write('loop_start=%d ' % (int(values['loop_start']) - int(values['offset'])))
        if values.has_key('loop_end'):
            outputFile.write('loop_end=%d ' % (int(values['loop_end']) - int(values['offset'])))
        outputFile.write('sample=samples/%s' % lookupSplitName(splitList, int(values['offset']), int(values['end'])) + '.wav\n')

def process(inputFile, outputFile):
    # create a list of region labels
    regionLabels = list()
    for line in open(inputFile):
        if line.strip().startswith('region_label'):
            regionLabels.append(line.strip().split('=')[1])
    # read entire input SFZ file
    sfz = open(inputFile).read()
    # strip comments and create a mixed list of <header> tags and key=value pairs
    sfz_list = stripComments(sfz).split()
    inSection = "none"
    default_path = ""
    global_sample = None
    values = dict()
    splitList = set()
    # parse the input SFZ data and build up splitList
    for item in sfz_list:
        if item.startswith('<'):
            inSection = item
            updateSplitList(splitList, regionLabels, values)
            values.clear()
            continue
        elif item.find('=') < 0:
            #print 'unknown:', item
            continue
        key, value = item.split('=')
        if inSection == '<control>' and key == 'default_path':
            default_path = value.replace('\\', '/')
        elif inSection == '<global>' and key == 'sample':
            global_sample = value.replace('\\', '/')
        elif inSection == '<region>':
            values[key] = value
    updateSplitList(splitList, regionLabels, values)  # flush the last region, which would otherwise be dropped
    values.clear()
    # split the wav file
    bigWav = AudioSegment.from_wav(global_sample)
    #print "%d channels, %d bytes/sample, %d frames/sec" % (bigWav.channels, bigWav.sample_width, bigWav.frame_rate)
    frate = float(bigWav.frame_rate)
    for (name, start, length) in splitList:
        startMs = 1000 * start / frate
        endMs = 1000 * (start + length) / frate
        wav = bigWav[startMs : endMs]
        wavName = 'samples/' + name + '.wav'
        wav.export(wavName, format='wav')
    # parse the input SFZ data again and generate the output SFZ
    for item in sfz_list:
        if item.startswith('<'):
            inSection = item
            outputGroupAndRegion(outputFile, splitList, values)
            values.clear()
            continue
        elif item.find('=') < 0:
            #print 'unknown:', item
            continue
        key, value = item.split('=')
        if inSection == '<control>' and key == 'default_path':
            default_path = value.replace('\\', '/')
        elif inSection == '<global>' and key == 'sample':
            global_sample = value.replace('\\', '/')
        elif inSection == '<region>':
            values[key] = value
    outputGroupAndRegion(outputFile, splitList, values)  # flush the last region

dirPath = '000'  # directory containing the input .sfz files
fileNameList = os.listdir(dirPath)
for fileName in fileNameList:
    if fileName.endswith('.sfz'):
        inputFile = os.path.join(dirPath, fileName)
        outputFile = open(fileName, 'w')
        print fileName
        process(inputFile, outputFile)
        outputFile.close()
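One caveat: the script writes the split WAVs into a samples/ subdirectory of the current directory and assumes it already exists, so create it before running, e.g.:
import os
if not os.path.isdir('samples'):
    os.makedirs('samples')  # output directory the script's sample= paths assume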


Using Google Colab, how to list more than 1000 files from Google Drive with drive.files().list

About once a month I get a Google Drive folder with lots of videos in it (usually around 700-800) and a spreadsheet whose column A gets populated with the names of all of the video files, in order of the timestamp in the video file name. I've already got the code that does this (posted below), but this time I've got about 8,400 video files in the folder, and this algorithm has a pageSize limit of 1,000 (it was originally 100; I changed it to 1,000, but that's the highest it will accept). How do I change this code to accept more than 1,000?
This is the part that initializes everything
!pip install gspread_formatting
import time
import gspread
from gspread import urls
from google.colab import auth
from datetime import datetime
from datetime import timedelta
from gspread_formatting import *
from googleapiclient.discovery import build
from oauth2client.client import GoogleCredentials
from google.auth import default
folder_id = '************************' # change to whatever folder the required videos are in
base_dir = '/Example/drive/videofolder' # change this to whatever folder path you want to grab videos from same as above
file_name_qry_filter = "name contains 'mp4' and name contains 'cam'"
file_pattern="cam*.mp4"
spreadSheetUrl = 'https://docs.google.com/spreadsheets/d/SpreadsheetIDExample/edit#gid=0'
data_drive_id = '***********' # This is the ID of the shared Drive
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)
#gc = gspread.authorize(GoogleCredentials.get_application_default())
wb = gc.open_by_url(spreadSheetUrl)
sheet = wb.worksheet('Sheet1')
And this is the main part of the code
prevTimeStamp = None
prevHour = None

def dateChecker(fileName, prevHour):
    strippedFileName = fileName[:-len(".mp4")]  # get rid of the .mp4 from the end of the file name (str.strip(".mp4") would also eat a trailing '4' from the seconds)
    parsedFileName = strippedFileName.split("_")  # split the file name into an array of (0 = Cam#, 1 = yyyy-mm-dd, 2 = hh-mm-ss)
    timeStamp = parsedFileName[2]  # grab specifically the hh-mm-ss time section from the original file name
    parsedTimeStamp = timeStamp.split("-")  # split the time stamp into an array of (0 = hour, 1 = minute, 2 = second)
    hour = int(parsedTimeStamp[0])
    minute = int(parsedTimeStamp[1])
    second = int(parsedTimeStamp[2])  # set hour, minute, and second each to its own variable
    commentCell = "Reset"
    if prevHour == None:
        commentCell = " "
        prevHour = hour
    else:
        if 0 <= hour < 24:
            if hour == 0:
                if prevHour == 23:
                    commentCell = " "
                else:
                    commentCell = "Missing Video1"
            else:
                if hour - prevHour == 1:
                    commentCell = " "
                else:
                    commentCell = "Missing Video2"
        else:
            commentCell = "Error hour is not between 0 and 23"
        if minute != 0 or 1 < second < 60:
            commentCell = "Check Length"
        prevHour = hour
    return commentCell, prevHour

# Drive query variables
parent_folder_qry_filter = "'" + folder_id + "' in parents"  # you shouldn't ever need to change this
query = file_name_qry_filter + " and " + parent_folder_qry_filter
drive_service = build('drive', 'v3')

# Build request and call Drive API
page_token = None
response = drive_service.files().list(q=query,
                                      corpora='drive',
                                      supportsAllDrives='true',
                                      includeItemsFromAllDrives='true',
                                      driveId=data_drive_id,
                                      pageSize=1000,
                                      fields='nextPageToken, files(id, name, webViewLink)',  # you can add extra fields in the files() if you need more information about the files you're grabbing
                                      pageToken=page_token).execute()
i = 1
array = [[],[]]
# Parse/print results
for file in response.get('files', []):
    array.insert(i-1, [file.get('name'), file.get('webViewLink')])  # if you add extra fields above, this is where you will have to start changing the code to accommodate them
    i = i + 1
array.sort()
array_sorted = [x for x in array if x]  # copied from the internet; it removes the extra blank lists left over from initializing the array
arrayLength = len(array_sorted)
print(arrayLength)
commentCell = 'Error'
# for file_name in array_sorted:
#     date_gap, start_date, end_date = date_checker(file_name[0])
#     if prev_end_date == None:
#         print('hello')
#     elif start_date != prev_end_date:
#         date_gap = 'Missing Video'
for file_name in array_sorted:
    commentCell, prevHour = dateChecker(file_name[0], prevHour)
    time.sleep(0.3)
    #insertRow = [file_name[0], "Not Processed", " ", date_gap, " ", " ", " ", " ", base_dir + '/' + file_name[0], " ", file_name[1], " ", " ", " "]
    insertRow = [file_name[0], "Not Processed", " ", commentCell, " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " "]
    sheet.append_row(insertRow, value_input_option='USER_ENTERED')
Now I know the problem has to do with the
page_token = None
response = drive_service.files().list(q=query,
                                      corpora='drive',
                                      supportsAllDrives='true',
                                      includeItemsFromAllDrives='true',
                                      driveId=data_drive_id,
                                      pageSize=1000,
                                      fields='nextPageToken, files(id, name, webViewLink)',  # you can add extra fields in the files() if you need more information about the files you're grabbing
                                      pageToken=page_token).execute()
in the middle of the main part of the code. I've obviously already tried just changing the pageSize limit to 10,000, but I knew that wouldn't work, and I was right; it came back with
HttpError: <HttpError 400 when requesting https://www.googleapis.com/drive/v3/files?q=name+contains+%27mp4%27+and+name+contains+%27cam%27+and+%271ANmLGlNr-Cu0BvH2aRrAh_GXEDk1nWvf%27+in+parents&corpora=drive&supportsAllDrives=true&includeItemsFromAllDrives=true&driveId=0AF92uuRq-00KUk9PVA&pageSize=10000&fields=nextPageToken%2C+files%28id%2C+name%2C+webViewLink%29&alt=json returned "Invalid value '10000'. Values must be within the range: [1, 1000]". Details: "Invalid value '10000'. Values must be within the range: [1, 1000]">
The one idea I have is to have multiple pages with 1,000 each and then iterate through them, but I barely understood how this part of the code worked a year ago when I set it up, and since then I haven't touched Google Colab except to run this algorithm. Every time I try to google how to do this, or look up the Google Drive API, or anything else, everything always comes back with how to download and upload a couple of files, when what I need is just to get a list of the names of all the files.
The documentation explains how to use the pageToken for pagination (the page is for Calendar API but it works the same in Drive):
In order to retrieve the next page, perform the exact same request as previously and append a pageToken field with the value of nextPageToken from the previous page. A new nextPageToken is provided on the following pages until all the results are retrieved.
Essentially you want a loop where you run files.list(), retrieve the pageToken, and run it again while feeding it the previous token until you stop getting tokens.
For your specific scenario you can try to replace the "problem" snippet with the following:
page_token = ""
filelist = {}
while True:
response = drive_service.files().list(q=query,
corpora='drive',
supportsAllDrives='true',
includeItemsFromAllDrives='true',
driveId=data_drive_id,
pageSize=1000,
fields='nextPageToken, files(id, name, webViewLink)',
pageToken=page_token).execute()
page_token = response.get('nextPageToken', None)
filelist.setdefault("files",[]).extend(response.get('files'))
if (not page_token):
break
response = filelist
This does as I described, looping files.list() and adding the results to the filelist variable, then breaking the loop when the API stops returning page tokens. At the end I just assigned the value of filelist to the response variable since that's what you're using in the rest of your code. It should parse the same way but with the full list of results this time.
Sources:
Page through list of resources
Files.list()

How to speed up "POS tag" with StanfordPOSTagger?

I wanted to extract noun phrases from tweets; the code is below. The problem is that it only processes 300 tweets at a time and takes 5 minutes. How can I speed it up?
By the way, some of the code was adapted from TextBlob.
I use the gate-EN-twitter model (https://gate.ac.uk/wiki/twitter-postagger.html) and the NLTK interface to the Stanford POS tagger to tag tweets.
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import time, nltk

start_time = time.time()
CFG = {
    ('NNP', 'NNP'): 'NNP',
    ('NN', 'NN'): 'NNI',
    ('NNI', 'NN'): 'NNI',
    ('JJ', 'JJ'): 'JJ',
    ('JJ', 'NN'): 'NNI',
}
st = StanfordPOSTagger('/models/gate-EN-twitter.model', '/twitie_tagger/twitie_tag.jar', encoding='utf-8')

def _normalize_tags(chunk):
    '''Normalize the corpus tags.
    ("NN", "NN-PL", "NNS") -> "NN"
    '''
    ret = []
    for word, tag in chunk:
        if tag == 'NP-TL' or tag == 'NP':
            ret.append((word, 'NNP'))
            continue
        if tag.endswith('-TL'):
            ret.append((word, tag[:-3]))
            continue
        if tag.endswith('S'):
            ret.append((word, tag[:-1]))
            continue
        ret.append((word, tag))
    return ret

def noun_phrase_count(text):
    matches1 = []
    print('len(text)', len(text))
    for i in range(len(text) // 1000):
        tokenized_text = word_tokenize(text[i*1000:(i+1)*1000])  # fixed: was text[i*1000:i*10000+1000]
        classified_text = st.tag(tokenized_text)
        tags = _normalize_tags(classified_text)
        merge = True
        while merge:
            merge = False
            for x in range(0, len(tags) - 1):
                t1 = tags[x]
                t2 = tags[x + 1]
                key = t1[1], t2[1]
                value = CFG.get(key, '')
                if value:
                    merge = True
                    tags.pop(x)
                    tags.pop(x)
                    match = '%s %s' % (t1[0], t2[0])
                    pos = value
                    tags.insert(x, (match, pos))
                    break
        matches = [t[0] for t in tags if t[1] in ['NNP', 'NNI']]
        matches1 += matches
    print("--- %s seconds ---" % (time.time() - start_time))
    fdist = nltk.FreqDist(matches1)
    return [(tag, num) for (tag, num) in fdist.most_common()]

noun_phrase_count(tweets)
Looks like a duplicate of "Stanford POS tagger with GATE twitter model is slow", so you may find more info there.
Additionally, if there's any chance of stumbling upon identical inputs (tweets) twice (or more), you can consider a dictionary with the tweet (plain str) as key and the tagged result as value, so that when you encounter a tweet you first check whether it's already in your dict. If not, tag it and put it there (and if this route is viable, why not pickle/unpickle that dictionary, so that debugging and subsequent runs of your code go faster as well).
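A minimal sketch of that caching idea (the cache file name is made up; st and word_tokenize come from the question's code):
import os
import pickle
CACHE_FILE = 'tag_cache.pickle'  # hypothetical file name
# load the cache saved by a previous run, if any
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, 'rb') as f:
        tag_cache = pickle.load(f)
else:
    tag_cache = {}
def tag_cached(tweet):
    # tag each distinct tweet only once; identical tweets reuse the stored result
    if tweet not in tag_cache:
        tag_cache[tweet] = st.tag(word_tokenize(tweet))
    return tag_cache[tweet]
# after processing, persist the cache so the next run starts warm
with open(CACHE_FILE, 'wb') as f:
    pickle.dump(tag_cache, f)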

LLDB Python access of iOS variables?

As part of debugging a problem that might be related to my UIViews, I want to write a Python script to run from LLDB. I had thought to extract all settings for a view at a breakpoint, and all the view's children, to allow me to compare states. I checked out the WWDC video on the topic and then spent time reading things at lldb.llvm.org/scripting.html, but didn't find them very helpful. A web search for examples led to nothing substantially different from those.
My problem is that I'm trying to figure out how to access iOS variables at my breakpoint. The examples I've seen do things like convert numbers and mimic shell commands: interesting stuff, but not useful for my purposes. I've been reading my way through the help info with "script help(lldb.SBValue)" and the like, but it is slow going, as the results are huge and it is not clear what the use patterns are. I feel like one decent example of how to traverse a few iOS objects would help me understand the system. Does anyone know of one, or can you share a snippet of code?
UPDATE:
I wrote this to help me track down a bug in my UIView use. I want to do a bit more work to refine this to see if I could show the whole view tree, but this was sufficient to solve my problem, so I'll put it here to save others some time.
import lldb

max_depth = 6
filters = {'_view': 'UIView *', '_layer': 'CALayer *', '_viewFlags': 'struct'}

def print_value(var, depth, prefix):
    """ print values and recurse """
    global max_depth
    local_depth = max_depth - depth
    pad = ' ' * local_depth
    name = var.GetName()
    typ = str(var.GetType()).split('\n')[0].split('{')[0].split(':')[0].strip()
    found = name in filters.keys()  # only visit filter items' children
    if found:
        found = (filters.get(name) == typ)
    value = var.GetValue()
    if value is None or str(value) == '0x00000000':
        value = ''
    else:
        value = ' Val: %s' % value
    if var.GetNumChildren() == 0 and var.IsInScope():
        path = lldb.SBStream()
        var.GetExpressionPath(path)
        path = ' pathData: %s' % path.GetData()
    else:
        path = ''
    print '^' * local_depth, prefix, ' Adr:', var.GetAddress(), ' Name:', name, ' Type:', typ, value, path
    if var.GetNumChildren() > 0:
        if local_depth < 2 or found:
            print pad, var.GetNumChildren(), 'children, to depth', local_depth + 1
            counter = 0
            for subvar in var:
                subprefix = '%d/%d' % (counter, var.GetNumChildren())
                print_value(subvar, depth - 1, subprefix)
                counter += 1

def printvh(debugger, command_line, result, dict):
    """ print view hierarchy """
    global max_depth
    args = command_line.split()
    if len(args) > 0:
        var = lldb.frame.FindVariable(args[0])
        depth = max_depth
        if len(args) > 1:
            depth = int(args[1])
            max_depth = depth
        print_value(var, depth, 'ROOT')
    else:
        print 'pass a variable name and optional depth'
And I added the following to my .lldbinit :
script import os, sys
# So that files in my dir takes precedence.
script sys.path[:0] = [os.path.expanduser("~/lldbpy")]
script import views
command script add -f views.printvh printvh
so that I can just type "printvh self 3" at the LLDB prompt.
Maybe this will help. Here's an example of how to dump simple local variables when a breakpoint is hit. I'm not displaying char* arrays correctly; I'm not sure how I should get the data for these to display them like "frame variable" would, but I'll figure that out later when I have a free minute.
struct datastore {
    int val1;
    int val2;
    struct {
        int val3;
    } subdata;
    char *name;
};

int main (int argc, char **argv)
{
    struct datastore data = {1, 5, {3}, "a string"};
    return data.val2;
}
Current executable set to 'a.out' (x86_64).
(lldb) br se -l 13
Breakpoint created: 1: file ='a.c', line = 13, locations = 1
(lldb) br comm add -s python
Enter your Python command(s). Type 'DONE' to end.
> def printvar_or_children(var):
>     if var.GetNumChildren() == 0 and var.IsInScope():
>         path = lldb.SBStream()
>         var.GetExpressionPath(path)
>         print '%s: %s' % (path.GetData(), var.GetValue())
>     else:
>         for subvar in var:
>             printvar_or_children(subvar)
>
> print 'variables visible at breakpoint %s' % bp_loc
> for var in frame.arguments:
>     printvar_or_children(var)
> for var in frame.locals:
>     printvar_or_children(var)
>
> DONE
(lldb) r
variables visible at breakpoint 1.1: where = a.out`main + 51 at a.c:13, address = 0x0000000100000f33, resolved, hit count = 1
argc: 1
*(*(argv)): '/'
data.val1: 1
data.val2: 5
data.subdata.val3: 3
*(data.name): 'a'
Process 84865 stopped
* thread #1: tid = 0x1f03, 0x0000000100000f33 a.out`main + 51 at a.c:13, stop reason = breakpoint 1.1
frame #0: 0x0000000100000f33 a.out`main + 51 at a.c:13
   10   int main (int argc, char **argv)
   11   {
   12       struct datastore data = {1, 5, {3}, "a string"};
-> 13       return data.val2;
(lldb)
Tip - for sanity's sake I worked on the python over in a side text editor and pasted it into lldb as I experimented.
If you use the frame variable command in lldb to explore your variables at a given stop location, that's the same basic way that you can access them via the SBFrame that is provided to your breakpoint python command in the 'frame' object.
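For instance, a minimal sketch of that pattern (the variable name "self" is just an example):
# inside a breakpoint command, 'frame' is the SBFrame at the stop location
var = frame.FindVariable('self')      # returns an SBValue
print '%s = %s' % (var.GetName(), var.GetValue())
for child in var:                     # an SBValue iterates over its children
    print '  %s = %s' % (child.GetName(), child.GetValue())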
Hope that helps to get you started.
Did you try looking at the Python LLDB formatting templates stored in:
Xcode.app/Contents/SharedFrameworks/LLDB.framework/Resources/Python/lldb/formatters/objc

Decompressing LZW in Lua [duplicate]

Here is the pseudocode for Lempel-Ziv-Welch compression.
pattern = get input character
while ( not end-of-file ) {
    K = get input character
    if ( <<pattern, K>> is NOT in the string table ) {
        output the code for pattern
        add <<pattern, K>> to the string table
        pattern = K
    }
    else { pattern = <<pattern, K>> }
}
output the code for pattern
output EOF_CODE
I am trying to code this in Lua, but it is not really working. Here is the code I modeled after an LZW function in Python, but I am getting an "attempt to call a string value" error on line 8.
function compress(uncompressed)
    local dict_size = 256
    local dictionary = {}
    w = ""
    result = {}
    for c in uncompressed do
        -- while c is in the function compress
        local wc = w + c
        if dictionary[wc] == true then
            w = wc
        else
            dictionary[w] = ""
            -- Add wc to the dictionary.
            dictionary[wc] = dict_size
            dict_size = dict_size + 1
            w = c
        end
        -- Output the code for w.
        if w then
            dictionary[w] = ""
        end
    end
    return dictionary
end
compressed = compress('TOBEORNOTTOBEORTOBEORNOT')
print (compressed)
I would really like some help either getting my code to run, or helping me code the LZW compression in Lua. Thank you so much!
Assuming uncompressed is a string, you'll need to use something like this to iterate over it:
for i = 1, #uncompressed do
    local c = string.sub(uncompressed, i, i)
    -- etc
end
There's another issue on line 10; .. is used for string concatenation in Lua, so this line should be local wc = w .. c.
You may also want to read this with regard to the performance of string concatenation. Long story short, it's often more efficient to keep each element in a table and return it with table.concat().
You should also take a look here to download the source for a high-performance LZW compression algorithm in Lua...

Reading the fileset from a torrent

I want to (quickly) put a program/script together to read the fileset from a .torrent file. I want to then use that set to delete any files from a specific directory that do not belong to the torrent.
Any recommendations on a handy library for reading this index from the .torrent file? Whilst I don't object to it, I don't want to be digging deep into the bittorrent spec and rolling a load of code from scratch for this simple purpose.
I have no preference on language.
I would use rasterbar's libtorrent which is a small and fast C++ library.
To iterate over the files you could use the torrent_info class (begin_files(), end_files()).
There's also a python interface for libtorrent:
import libtorrent
info = libtorrent.torrent_info('test.torrent')
for f in info.files():
    print "%s - %s" % (f.path, f.size)
Effbot has your question answered. Here is the complete code to read the list of files from .torrent file (Python 2.4+):
import re

def tokenize(text, match=re.compile("([idel])|(\d+):|(-?\d+)").match):
    i = 0
    while i < len(text):
        m = match(text, i)
        s = m.group(m.lastindex)
        i = m.end()
        if m.lastindex == 2:
            yield "s"
            yield text[i:i+int(s)]
            i = i + int(s)
        else:
            yield s

def decode_item(next, token):
    if token == "i":
        # integer: "i" value "e"
        data = int(next())
        if next() != "e":
            raise ValueError
    elif token == "s":
        # string: "s" value (virtual tokens)
        data = next()
    elif token == "l" or token == "d":
        # container: "l" (or "d") values "e"
        data = []
        tok = next()
        while tok != "e":
            data.append(decode_item(next, tok))
            tok = next()
        if token == "d":
            data = dict(zip(data[0::2], data[1::2]))
    else:
        raise ValueError
    return data

def decode(text):
    try:
        src = tokenize(text)
        data = decode_item(src.next, src.next())
        for token in src:  # look for more tokens
            raise SyntaxError("trailing junk")
    except (AttributeError, ValueError, StopIteration):
        raise SyntaxError("syntax error")
    return data

if __name__ == "__main__":
    data = open("test.torrent", "rb").read()
    torrent = decode(data)
    for file in torrent["info"]["files"]:
        print "%r - %d bytes" % ("/".join(file["path"]), file["length"])
Here's the code from Constantine's answer above, slightly modified to handle Unicode characters in torrent filenames and fileset filenames in torrent info:
import re

def tokenize(text, match=re.compile("([idel])|(\d+):|(-?\d+)").match):
    i = 0
    while i < len(text):
        m = match(text, i)
        s = m.group(m.lastindex)
        i = m.end()
        if m.lastindex == 2:
            yield "s"
            yield text[i:i+int(s)]
            i = i + int(s)
        else:
            yield s

def decode_item(next, token):
    if token == "i":
        # integer: "i" value "e"
        data = int(next())
        if next() != "e":
            raise ValueError
    elif token == "s":
        # string: "s" value (virtual tokens)
        data = next()
    elif token == "l" or token == "d":
        # container: "l" (or "d") values "e"
        data = []
        tok = next()
        while tok != "e":
            data.append(decode_item(next, tok))
            tok = next()
        if token == "d":
            data = dict(zip(data[0::2], data[1::2]))
    else:
        raise ValueError
    return data

def decode(text):
    try:
        src = tokenize(text)
        data = decode_item(src.next, src.next())
        for token in src:  # look for more tokens
            raise SyntaxError("trailing junk")
    except (AttributeError, ValueError, StopIteration):
        raise SyntaxError("syntax error")
    return data

n = 0
if __name__ == "__main__":
    data = open("C:\\Torrents\\test.torrent", "rb").read()
    torrent = decode(data)
    for file in torrent["info"]["files"]:
        n = n + 1
        filenamepath = file["path"]
        print str(n) + " -- " + ', '.join(map(str, filenamepath))
        fname = ', '.join(map(str, filenamepath))
        print fname + " -- " + str(file["length"])
bencode.py from the original Mainline BitTorrent 5.x client (http://download.bittorrent.com/dl/BitTorrent-5.2.2.tar.gz) would give you pretty much the reference implementation in Python.
It has an import dependency on the BTL package but that's trivially easy to remove. You'd then look at bencode.bdecode(filecontent)['info']['files'].
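For example, a short sketch under those assumptions (bencode.py copied out of the tarball with the BTL import removed; the file name is illustrative):
from bencode import bdecode
data = open("test.torrent", "rb").read()
torrent = bdecode(data)
# multi-file torrents keep their file list under info/files
for f in torrent["info"]["files"]:
    print "/".join(f["path"]), "-", f["length"]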
Expanding on the ideas above, I did the following:
~> cd ~/bin
~/bin> ls torrent*
torrent-parse.py torrent-parse.sh
~/bin> cat torrent-parse.py
# torrent-parse.py
import sys
import libtorrent
# get the input torrent file
if (len(sys.argv) > 1):
    torrent = sys.argv[1]
else:
    print "Missing param: torrent filename"
    sys.exit()
# get names of files in the torrent file
info = libtorrent.torrent_info(torrent);
for f in info.files():
    print "%s - %s" % (f.path, f.size)
~/bin> cat torrent-parse.sh
#!/bin/bash
if [ $# -lt 1 ]; then
echo "Missing param: torrent filename"
exit 0
fi
python torrent-parse.py "$*"
You'll want to set permissions appropriately to make the shell script executable:
~/bin> chmod a+x torrent-parse.sh
Hope this helps someone :)
