Fast way to get total line number of a large file - delphi

I'm dealing with large text files (bigger than 100MB). I need the total number of lines as fast as possible. I'm currently using the code below (update: added try-finally):
var
SR: TStreamReader;
totallines: int64;
str: string;
begin
SR:=TStreamReader.Create(myfilename, TEncoding.UTF8);
try
totallines:=0;
while not SR.EndOfStream do
begin
str:=SR.ReadLine;
inc(totallines);
end;
finally
SR.Free;
end;
end;
Is there any faster way to get totallines?

Program LineCount;
{$APPTYPE CONSOLE}
{$WEAKLINKRTTI ON}
{$RTTI EXPLICIT METHODS([]) PROPERTIES([]) FIELDS([])}
{$SetPEFlags 1}
{ Compile with XE8 or above... }
USES
SysUtils,
BufferedFileStream;
VAR
LineCnt: Int64;
Ch: Char;
BFS: TReadOnlyCachedFileStream;
function Komma(const S: string; const C: Char = ','): string;
{ About 4 times faster than Comma... }
var
I: Integer; // loops through separator position
begin
Result := S;
I := Length(S) - 2;
while I > 1 do
begin
Insert(C, Result, I);
I := I - 3;
end;
end; {Komma}
BEGIN
writeln('LineCount - Copyright (C) 2020 by Walter L. Chester.');
writeln('Counts lines in the given textfile.');
if ParamCount <> 1 then
begin
writeln('USAGE: LineCount <filename>');
writeln;
writeln('No file size limit! Counts lines: takes 4 minutes on a 16GB file.');
Halt;
end;
if not FileExists(ParamStr(1)) then
begin
writeln('File not found!');
halt;
end;
writeln('Counting lines in file...');
BFS := TReadOnlyCachedFileStream.Create(ParamStr(1), fmOpenRead);
try
LineCnt := 0;
while BFS.Read(ch,1) = 1 do
begin
if ch = #13 then
Inc(LineCnt);
if (LineCnt mod 1000000) = 0 then
write('.');
end;
writeln;
writeln('Total Lines: ' + Komma(LineCnt.ToString));
finally
BFS.Free;
end;
END.

The answer is simply: No. Your algorithm is the fastest but the implementation isn't. You must read the whole file and count the lines. At least if lines are not fixed size.
How you read the file may impact the global performance.
Read the file block by block in a binary buffer (Array of bytes) as large as possible. Then count the lines in the buffer and loop with the block in same buffer.

Related

How to read last line in a text file using Delphi

I need to read the last line in some very large textfiles (to get the timestamp from the data). TStringlist would be a simple approach but it returns an out of memory error. I'm trying to use seek and blockread, but the characters in the buffer are all nonsense. Is this something to do with unicode?
Function TForm1.ReadLastLine2(FileName: String): String;
var
FileHandle: File;
s,line: string;
ok: 0..1;
Buf: array[1..8] of Char;
k: longword;
i,ReadCount: integer;
begin
AssignFile (FileHandle,FileName);
Reset (FileHandle); // or for binary files: Reset (FileHandle,1);
ok := 0;
k := FileSize (FileHandle);
Seek (FileHandle, k-1);
s := '';
while ok<>1 do begin
BlockRead (FileHandle, buf, SizeOf(Buf)-1, ReadCount); //BlockRead ( var FileHandle : File; var Buffer; RecordCount : Integer {; var RecordsRead : Integer} ) ;
if ord (buf[1]) <>13 then //Arg to integer
s := s + buf[1]
else
ok := ok + 1;
k := k-1;
seek (FileHandle,k);
end;
CloseFile (FileHandle);
// Reverse the order in the line read
setlength (line,length(s));
for i:=1 to length(s) do
line[length(s) - i+1 ] := s[i];
Result := Line;
end;
Based on www.delphipages.com/forum/showthread.php?t=102965
The testfile is a simple CSV I created in excel ( this is not the 100MB I ultimately need to read).
a,b,c,d,e,f,g,h,i,j,blank
A,B,C,D,E,F,G,H,I,J,blank
1,2,3,4,5,6,7,8,9,0,blank
Mary,had,a,little,lamb,His,fleece,was,white,as,snow
And,everywhere,that,Mary,went,The,lamb,was,sure,to,go
You really have to read the file in LARGE chunks from the tail to the head.
Since it is so large it does not fit the memory - then reading it line by line from start to end would be very slow. With ReadLn - twice slow.
You also has to be ready that the last line might end with EOL or may not.
Personally I would also account for three possible EOL sequences:
CR/LF aka #13#10=^M^J - DOS/Windows style
CR without LF - just #13=^M - Classic MacOS file
LF without CR - just #10=^J - UNIX style, including MacOS version 10
If you are sure your CSV files would only ever be generated by native Windows programs it would be safe to assume full CR/LF be used. But if there can be other Java programs, non-Windows platforms, mobile programs - I would be less sure. Of course pure CR without LF would be the least probable case of them all.
uses System.IOUtils, System.Math, System.Classes;
type FileChar = AnsiChar; FileString = AnsiString; // for non-Unicode files
// type FileChar = WideChar; FileString = UnicodeString;// for UTF16 and UCS-2 files
const FileCharSize = SizeOf(FileChar);
// somewhere later in the code add: Assert(FileCharSize = SizeOf(FileString[1]);
function ReadLastLine(const FileName: String): FileString; overload; forward;
const PageSize = 4*1024;
// the minimal read atom of most modern HDD and the memory allocation atom of Win32
// since the chances your file would have lines longer than 4Kb are very small - I would not increase it to several atoms.
function ReadLastLine(const Lines: TStringDynArray): FileString; overload;
var i: integer;
begin
Result := '';
i := High(Lines);
if i < Low(Lines) then exit; // empty array - empty file
Result := Lines[i];
if Result > '' then exit; // we got the line
Dec(i); // skip the empty ghost line, in case last line was CRLF-terminated
if i < Low(Lines) then exit; // that ghost was the only line in the empty file
Result := Lines[i];
end;
// scan for EOLs in not-yet-scanned part
function FindLastLine(buffer: TArray<FileChar>; const OldRead : Integer;
const LastChunk: Boolean; out Line: FileString): boolean;
var i, tailCRLF: integer; c: FileChar;
begin
Result := False;
if Length(Buffer) = 0 then exit;
i := High(Buffer);
tailCRLF := 0; // test for trailing CR/LF
if Buffer[i] = ^J then begin // LF - single, or after CR
Dec(i);
Inc(tailCRLF);
end;
if (i >= Low(Buffer)) and (Buffer[i] = ^M) then begin // CR, alone or before LF
Inc(tailCRLF);
end;
i := High(Buffer) - Max(OldRead, tailCRLF);
if i - Low(Buffer) < 0 then exit; // no new data to read - results would be like before
if OldRead > 0 then Inc(i); // the CR/LF pair could be sliced between new and previous buffer - so need to start a bit earlier
for i := i downto Low(Buffer) do begin
c := Buffer[i];
if (c=^J) or (c=^M) then begin // found EOL
SetString( Line, #Buffer[i+1], High(Buffer) - tailCRLF - i);
exit(True);
end;
end;
// we did not find non-terminating EOL in the buffer (except maybe trailing),
// now we should ask for more file content, if there is still left any
// or take the entire file (without trailing EOL if any)
if LastChunk then begin
SetString( Line, #Buffer[ Low(Buffer) ], Length(Buffer) - tailCRLF);
Result := true;
end;
end;
function ReadLastLine(const FileName: String): FileString; overload;
var Buffer, tmp: TArray<FileChar>;
// dynamic arrays - eases memory management and protect from stack corruption
FS: TFileStream; FSize, NewPos: Int64;
OldRead, NewLen : Integer; EndOfFile: boolean;
begin
Result := '';
FS := TFile.OpenRead(FileName);
try
FSize := FS.Size;
if FSize <= PageSize then begin // small file, we can be lazy!
FreeAndNil(FS); // free the handle and avoid double-free in finally
Result := ReadLastLine( TFile.ReadAllLines( FileName, TEncoding.ANSI ));
// or TEncoding.UTF16
// warning - TFIle is not share-aware, if the file is being written to by another app
exit;
end;
SetLength( Buffer, PageSize div FileCharSize);
OldRead := 0;
repeat
NewPos := FSize - Length(Buffer)*FileCharSize;
EndOfFile := NewPos <= 0;
if NewPos < 0 then NewPos := 0;
FS.Position := NewPos;
FS.ReadBuffer( Buffer[Low(Buffer)], (Length(Buffer) - OldRead)*FileCharSize);
if FindLastLine(Buffer, OldRead, EndOfFile, Result) then
exit; // done !
tmp := Buffer; Buffer := nil; // flip-flop: preparing to broaden our mouth
OldRead := Length(tmp); // need not to re-scan the tail again and again when expanding our scanning range
NewLen := Min( 2*Length(tmp), FSize div FileCharSize );
SetLength(Buffer, NewLen); // this may trigger EOutOfMemory...
Move( tmp[Low(tmp)], Buffer[High(Buffer)-OldRead+1], OldRead*FileCharSize);
tmp := nil; // free old buffer
until EndOfFile;
finally
FS.Free;
end;
end;
PS. Note one extra special case - if you would use Unicode chars (two-bytes ones) and would give odd-length file (3 bytes, 5 bytes, etc) - you would never be ble to scan the starting single byte (half-widechar). Maybe you should add the extra guard there, like Assert( 0 = FS.Size mod FileCharSize)
PPS. As a rule of thumb you better keep those functions out of the form class, - because WHY mixing them? In general you should separate concerns into small blocks. Reading file has nothing with user interaction - so should better be offloaded to an extra UNIT. Then you would be able to use functions from that unit in one form or 10 forms, in main thread or in multi-threaded application. Like LEGO parts - they give you flexibility by being small and separate.
PPPS. Another approach here would be using memory-mapped files. Google for MMF implementations for Delphi and articles about benefits and problems with MMF approach. Personally I think rewriting the code above to use MMF would greatly simplify it, removing several "special cases" and the troublesome and memory copying flip-flop. OTOH it would demand you to be very strict with pointers arithmetic.
https://en.wikipedia.org/wiki/Memory-mapped_file
https://msdn.microsoft.com/en-us/library/ms810613.aspx
http://torry.net/quicksearchd.php?String=memory+map&Title=No
Your char type is two byte, so that buffer is 16 byte. Then with blockread you read sizeof(buffer)-1 byte into it, and check the first 2 byte char if it is equal to #13.
The sizeof(buffer)-1 is dodgy (where does that -1 come from?), and the rest is valid, but only if your input file is utf16.
Also your read 8 (or 16) characters each time, but compare only one and then do a seek again. That is not very logical either.
If your encoding is not utf16, I suggest you change the type of a buffer element to ansichar and remove the -1
In response to kopiks suggestion, I figured out how to do it with TFilestream, it works ok with the simple test file, though there may be some further tweeks when I use it on a variety of csv files. Also, I don't make any claims that this is the most efficient method.
procedure TForm1.Button6Click(Sender: TObject);
Var
StreamSize, ApproxNumRows : Integer;
TempStr : String;
begin
if OpenDialog1.Execute then begin
TempStr := ReadLastLineOfTextFile(OpenDialog1.FileName,StreamSize, ApproxNumRows);
// TempStr := ReadFileStream('c:\temp\CSVTestFile.csv');
ShowMessage ('approximately '+ IntToStr(ApproxNumRows)+' Rows');
ListBox1.Items.Add(TempStr);
end;
end;
Function TForm1.ReadLastLineOfTextFile(const FileName: String; var StreamSize, ApproxNumRows : Integer): String;
const
MAXLINELENGTH = 256;
var
Stream: TFileStream;
BlockSize,CharCount : integer;
Hash13Found : Boolean;
Buffer : array [0..MAXLINELENGTH] of AnsiChar;
begin
Hash13Found := False;
Result :='';
Stream := TFileStream.Create(FileName, fmOpenRead or fmShareDenyWrite);
StreamSize := Stream.size;
if StreamSize < MAXLINELENGTH then
BlockSize := StreamSize
Else
BlockSize := MAXLINELENGTH;
// for CharCount := 0 to Length(Buffer)-1 do begin
// Buffer[CharCount] := #0; // zeroing the buffer can aid diagnostics
// end;
CharCount := 0;
Repeat
Stream.Seek(-(CharCount+3), 2); //+3 misses out the #0,#10,#13 at the end of the file
Stream.Read( Buffer[CharCount], 1);
Result := String(Buffer[CharCount]) + result;
if Buffer[CharCount] =#13 then
Hash13Found := True;
Inc(CharCount);
Until Hash13Found OR (CharCount = BlockSize);
ShowMessage(Result);
ApproxNumRows := Round(StreamSize / CharCount);
end;
Just thought of a new solution.
Again, there could be better ones, but this one is the best i thought of.
function GetLastLine(textFilePath: string): string;
var
list: tstringlist;
begin
list := tstringlist.Create;
try
list.LoadFromFile(textFilePath);
result := list[list.Count-1];
finally
list.free;
end;
end;

Find and Replace Text in a Large TextFile (Delphi XE5)

I am trying to find and replace text in a text file. I have been able to do this in the past with methods like:
procedure SmallFileFindAndReplace(FileName, Find, ReplaceWith: string);
begin
with TStringList.Create do
begin
LoadFromFile(FileName);
Text := StringReplace(Text, Find, ReplaceWith, [rfReplaceAll, rfIgnoreCase]);
SaveToFile(FileName);
Free;
end;
end;
The above works fine when a file is relatively small, however; when the the file size is something like 170 Mb the above code will cause the following error:
EOutOfMemory with message 'Out of memory'
I have tried the following with success, however it takes a long time to run:
procedure Tfrm_Main.button_MakeReplacementClick(Sender: TObject);
var
fs : TFileStream;
s : AnsiString;
//s : string;
begin
fs := TFileStream.Create(edit_SQLFile.Text, fmOpenread or fmShareDenyNone);
try
SetLength(S, fs.Size);
fs.ReadBuffer(S[1], fs.Size);
finally
fs.Free;
end;
s := StringReplace(s, edit_Find.Text, edit_Replace.Text, [rfReplaceAll, rfIgnoreCase]);
fs := TFileStream.Create(edit_SQLFile.Text, fmCreate);
try
fs.WriteBuffer(S[1], Length(S));
finally
fs.Free;
end;
end;
I am new to "Streams" and working with buffers.
Is there a better way to do this?
Thank You.
You have two mistakes in first code example and three - in second example:
Do not load whole large file in memory, especially in 32bit application. If file size more than ~1 Gb, you always get "Out of memory"
StringReplace slows with large strings, because of repeated memory reallocation
In second code you don`t use text encoding in file, so (for Windows) your code "think" that file has UCS2 encoding (two bytes per character). But what you get, if file encoding is Ansi (one byte per character) or UTF8 (variable size of char)?
Thus, for correct find&replace you must use file encoding and read/write parts of file, as LU RD said:
interface
uses
System.Classes,
System.SysUtils;
type
TFileSearchReplace = class(TObject)
private
FSourceFile: TFileStream;
FtmpFile: TFileStream;
FEncoding: TEncoding;
public
constructor Create(const AFileName: string);
destructor Destroy; override;
procedure Replace(const AFrom, ATo: string; ReplaceFlags: TReplaceFlags);
end;
implementation
uses
System.IOUtils,
System.StrUtils;
function Max(const A, B: Integer): Integer;
begin
if A > B then
Result := A
else
Result := B;
end;
{ TFileSearchReplace }
constructor TFileSearchReplace.Create(const AFileName: string);
begin
inherited Create;
FSourceFile := TFileStream.Create(AFileName, fmOpenReadWrite);
FtmpFile := TFileStream.Create(ChangeFileExt(AFileName, '.tmp'), fmCreate);
end;
destructor TFileSearchReplace.Destroy;
var
tmpFileName: string;
begin
if Assigned(FtmpFile) then
tmpFileName := FtmpFile.FileName;
FreeAndNil(FtmpFile);
FreeAndNil(FSourceFile);
TFile.Delete(tmpFileName);
inherited;
end;
procedure TFileSearchReplace.Replace(const AFrom, ATo: string;
ReplaceFlags: TReplaceFlags);
procedure CopyPreamble;
var
PreambleSize: Integer;
PreambleBuf: TBytes;
begin
// Copy Encoding preamble
SetLength(PreambleBuf, 100);
FSourceFile.Read(PreambleBuf, Length(PreambleBuf));
FSourceFile.Seek(0, soBeginning);
PreambleSize := TEncoding.GetBufferEncoding(PreambleBuf, FEncoding);
if PreambleSize <> 0 then
FtmpFile.CopyFrom(FSourceFile, PreambleSize);
end;
function GetLastIndex(const Str, SubStr: string): Integer;
var
i: Integer;
tmpSubStr, tmpStr: string;
begin
if not(rfIgnoreCase in ReplaceFlags) then
begin
i := Pos(SubStr, Str);
Result := i;
while i > 0 do
begin
i := PosEx(SubStr, Str, i + 1);
if i > 0 then
Result := i;
end;
if Result > 0 then
Inc(Result, Length(SubStr) - 1);
end
else
begin
tmpStr := UpperCase(Str);
tmpSubStr := UpperCase(SubStr);
i := Pos(tmpSubStr, tmpStr);
Result := i;
while i > 0 do
begin
i := PosEx(tmpSubStr, tmpStr, i + 1);
if i > 0 then
Result := i;
end;
if Result > 0 then
Inc(Result, Length(tmpSubStr) - 1);
end;
end;
var
SourceSize: int64;
procedure ParseBuffer(Buf: TBytes; var IsReplaced: Boolean);
var
i: Integer;
ReadedBufLen: Integer;
BufStr: string;
DestBytes: TBytes;
LastIndex: Integer;
begin
if IsReplaced and (not(rfReplaceAll in ReplaceFlags)) then
begin
FtmpFile.Write(Buf, Length(Buf));
Exit;
end;
// 1. Get chars from buffer
ReadedBufLen := 0;
for i := Length(Buf) downto 0 do
if FEncoding.GetCharCount(Buf, 0, i) <> 0 then
begin
ReadedBufLen := i;
Break;
end;
if ReadedBufLen = 0 then
raise EEncodingError.Create('Cant convert bytes to str');
FSourceFile.Seek(ReadedBufLen - Length(Buf), soCurrent);
BufStr := FEncoding.GetString(Buf, 0, ReadedBufLen);
if rfIgnoreCase in ReplaceFlags then
IsReplaced := ContainsText(BufStr, AFrom)
else
IsReplaced := ContainsStr(BufStr, AFrom);
if IsReplaced then
begin
LastIndex := GetLastIndex(BufStr, AFrom);
LastIndex := Max(LastIndex, Length(BufStr) - Length(AFrom) + 1);
end
else
LastIndex := Length(BufStr);
SetLength(BufStr, LastIndex);
FSourceFile.Seek(FEncoding.GetByteCount(BufStr) - ReadedBufLen, soCurrent);
BufStr := StringReplace(BufStr, AFrom, ATo, ReplaceFlags);
DestBytes := FEncoding.GetBytes(BufStr);
FtmpFile.Write(DestBytes, Length(DestBytes));
end;
var
Buf: TBytes;
BufLen: Integer;
bReplaced: Boolean;
begin
FSourceFile.Seek(0, soBeginning);
FtmpFile.Size := 0;
CopyPreamble;
SourceSize := FSourceFile.Size;
BufLen := Max(FEncoding.GetByteCount(AFrom) * 5, 2048);
BufLen := Max(FEncoding.GetByteCount(ATo) * 5, BufLen);
SetLength(Buf, BufLen);
bReplaced := False;
while FSourceFile.Position < SourceSize do
begin
BufLen := FSourceFile.Read(Buf, Length(Buf));
SetLength(Buf, BufLen);
ParseBuffer(Buf, bReplaced);
end;
FSourceFile.Size := 0;
FSourceFile.CopyFrom(FtmpFile, 0);
end;
how to use:
procedure TForm2.btn1Click(Sender: TObject);
var
Replacer: TFileSearchReplace;
StartTime: TDateTime;
begin
StartTime:=Now;
Replacer:=TFileSearchReplace.Create('c:\Temp\123.txt');
try
Replacer.Replace('some текст', 'some', [rfReplaceAll, rfIgnoreCase]);
finally
Replacer.Free;
end;
Caption:=FormatDateTime('nn:ss.zzz', Now - StartTime);
end;
Your first try creates several copies of the file in memory:
it loads the whole file into memory (TStringList)
it creates a copy of this memory when accessing the .Text property
it creates yet another copy of this memory when passing that string to StringReplace (The copy is the result which is built in StringReplace.)
You could try to solve the out of memory problem by getting rid of one or more of these copies:
e.g. read the file into a simple string variable rather than a TStringList
or keep the string list but run the StringReplace on each line separately and write the result to the file line by line.
That would increase the maximum file size your code can handle, but you will still run out of memory for huge files. If you want to handle files of any size, your second approach is the way to go.
No - I don't think there's a faster way that the 2nd option (if you want a completely generic search'n'replace function for any file of any size). It may be possible to make a faster version if you code it specifically according to your requirements, but as a general-purpose search'n'replace function, I don't believe you can go faster...
For instance, are you sure you need case-insensitive replacement? I would expect that this would be a large part of the time spent in the replace function. Try (just for kicks) to remove that requirement and see if it doesn't speed up the execution quite a bit on large files (this depends on how the internal coding of the StringReplace function is made - if it has a specific optimization for case-sensitive searches)
I believe refinement of Kami's code is needed to account for the string not being found, but the start of a new instance of the string might occur at the end of the buffer. The else clause is different:
if IsReplaced then begin
LastIndex := GetLastIndex(BufStr, AFrom);
LastIndex := Max(LastIndex, Length(BufStr) - Length(AFrom) + 1);
end else
LastIndex :=Length(BufStr) - Length(AFrom) + 1;
Correct fix is this one:
if IsReplaced then
begin
LastIndex := GetLastIndex(BufStr, AFrom);
LastIndex := Max(LastIndex, Length(BufStr) - Length(AFrom) + 1);
end
else
if FSourceFile.Position < SourceSize then
LastIndex := Length(BufStr) - Length(AFrom) + 1
else
LastIndex := Length(BufStr);

Faster way to split text in Delphi TStringList

I have an app that needs to do heavy text manipulation in a TStringList. Basically i need to split text by a delimiter ; for instance, if i have a singe line with 1000 chars and this delimiter occurs 3 times in this line, then i need to split it in 3 lines. The delimiter can contain more than one char, it can be a tag like '[test]' for example.
I've wrote two functions to do this task with 2 different approaches, but both are slow in big amounts of text (more then 2mbytes usually).
How can i achieve this goal in a faster way ?
Here are both functions, both receive 2 paramaters : 'lines' which is the original tstringlist and 'q' which is the delimiter.
function splitlines(lines : tstringlist; q: string) : integer;
var
s, aux, ant : string;
i,j : integer;
flag : boolean;
m2 : tstringlist;
begin
try
m2 := tstringlist.create;
m2.BeginUpdate;
result := 0;
for i := 0 to lines.count-1 do
begin
s := lines[i];
for j := 1 to length(s) do
begin
flag := lowercase(copy(s,j,length(q))) = lowercase(q);
if flag then
begin
inc(result);
m2.add(aux);
aux := s[j];
end
else
aux := aux + s[j];
end;
m2.add(aux);
aux := '';
end;
m2.EndUpdate;
lines.text := m2.text;
finally
m2.free;
end;
end;
function splitLines2(lines : tstringlist; q: string) : integer;
var
aux, p : string;
i : integer;
flag : boolean;
begin
//maux1 and maux2 are already instanced in the parent class
try
maux2.text := lines.text;
p := '';
i := 0;
flag := false;
maux1.BeginUpdate;
maux2.BeginUpdate;
while (pos(lowercase(q),lowercase(maux2.text)) > 0) and (i < 5000) do
begin
flag := true;
aux := p+copy(maux2.text,1,pos(lowercase(q),lowercase(maux2.text))-1);
maux1.add(aux);
maux2.text := copy(maux2.text,pos(lowercase(q),lowercase(maux2.text)),length(maux2.text));
p := copy(maux2.text,1,1);
maux2.text := copy(maux2.text,2,length(maux2.text));
inc(i);
end;
finally
result := i;
maux1.EndUpdate;
maux2.EndUpdate;
if flag then
begin
maux1.add(p+maux2.text);
lines.text := maux1.text;
end;
end;
end;
I've not tested the speed, but for academic purposes, here's an easy way to split the strings:
myStringList.Text :=
StringReplace(myStringList.Text, myDelimiter, #13#10, [rfReplaceAll]);
// Use [rfReplaceAll, rfIgnoreCase] if you want to ignore case
When you set the Text property of TStringList, it parses on new lines and splits there, so converting to a string, replacing the delimiter with new lines, then assigning it back to the Text property works.
The problems with your code (at least second approach) are
You are constantly using lowecase which is slow if called so many times
If I saw correctly you are copying the whole remaining text back to the original source. This is sure to be extra slow for large strings (eg files)
I have a tokenizer in my library. Its not the fastest or best but it should do (you can get it from Cromis Library, just use the units Cromis.StringUtils and Cromis.Unicode):
type
TTokens = array of ustring;
TTextTokenizer = class
private
FTokens: TTokens;
FDelimiters: array of ustring;
public
constructor Create;
procedure Tokenize(const Text: ustring);
procedure AddDelimiters(const Delimiters: array of ustring);
property Tokens: TTokens read FTokens;
end;
{ TTextTokenizer }
procedure TTextTokenizer.AddDelimiters(const Delimiters: array of ustring);
var
I: Integer;
begin
if Length(Delimiters) > 0 then
begin
SetLength(FDelimiters, Length(Delimiters));
for I := 0 to Length(Delimiters) - 1 do
FDelimiters[I] := Delimiters[I];
end;
end;
constructor TTextTokenizer.Create;
begin
SetLength(FTokens, 0);
SetLength(FDelimiters, 0);
end;
procedure TTextTokenizer.Tokenize(const Text: ustring);
var
I, K: Integer;
Counter: Integer;
NewToken: ustring;
Position: Integer;
CurrToken: ustring;
begin
SetLength(FTokens, 100);
CurrToken := '';
Counter := 0;
for I := 1 to Length(Text) do
begin
CurrToken := CurrToken + Text[I];
for K := 0 to Length(FDelimiters) - 1 do
begin
Position := Pos(FDelimiters[K], CurrToken);
if Position > 0 then
begin
NewToken := Copy(CurrToken, 1, Position - 1);
if NewToken <> '' then
begin
if Counter > Length(FTokens) then
SetLength(FTokens, Length(FTokens) * 2);
FTokens[Counter] := Trim(NewToken);
Inc(Counter)
end;
CurrToken := '';
end;
end;
end;
if CurrToken <> '' then
begin
if Counter > Length(FTokens) then
SetLength(FTokens, Length(FTokens) * 2);
FTokens[Counter] := Trim(CurrToken);
Inc(Counter)
end;
SetLength(FTokens, Counter);
end;
How about just using StrTokens from the JCL library
procedure StrTokens(const S: string; const List: TStrings);
It's open source
http://sourceforge.net/projects/jcl/
As an additional option, you can use regular expressions. Recent versions of Delphi (XE4 and XE5) come with built in regular expression support; older versions can find a free regex library download (zip file) at Regular-Expressions.info.
For the built-in regex support (uses the generic TArray<string>):
var
RegexObj: TRegEx;
SplitArray: TArray<string>;
begin
SplitArray := nil;
try
RegexObj := TRegEx.Create('\[test\]'); // Your sample expression. Replace with q
SplitArray := RegexObj.Split(Lines, 0);
except
on E: ERegularExpressionError do begin
// Syntax error in the regular expression
end;
end;
// Use SplitArray
end;
For using TPerlRegEx in earlier Delphi versions:
var
Regex: TPerlRegEx;
m2: TStringList;
begin
m2 := TStringList.Create;
try
Regex := TPerlRegEx.Create;
try
Regex.RegEx := '\[test\]'; // Using your sample expression - replace with q
Regex.Options := [];
Regex.State := [preNotEmpty];
Regex.Subject := Lines.Text;
Regex.SplitCapture(m2, 0);
finally
Regex.Free;
end;
// Work with m2
finally
m2.Free;
end;
end;
(For those unaware, the \ in the sample expression used are because the [] characters are meaningful in regular expressions and need to be escaped to be used in the regular expression text. Typically, they're not required in the text.)

Why do I get access violations using Mike Heydon's TStringBuilder class?

I am using a TStringBuilder class ported from .Net to Delphi 7.
And here is my code snippet:
procedure TForm1.btn1Click(Sender: TObject);
const
FILE_NAME = 'PATH TO A TEXT FILE';
var
sBuilder: TStringBuilder;
I: Integer;
fil: TStringList;
sResult: string;
randInt: Integer;
begin
randomize;
sResult := '';
for I := 1 to 100 do
begin
fil := TStringList.Create;
try
fil.LoadFromFile(FILE_NAME);
randInt := Random(1024);
sBuilder := TStringBuilder.Create(randInt);
try
sBuilder.Append(fil.Text);
sResult := sBuilder.AsString;
finally
sBuilder.free;
end;
mmo1.Text := sResult;
finally
FreeAndNil(fil);
end;
end;
showmessage ('DOne');
end;
I am experiencing AV errors. I can alleviate the problem when I create memory with the size multiple by 1024, however sometimes it still occurs.
Am I doing something wrong?
Your code is fine. The TStringBuilder code you're using is faulty. Consider this method:
procedure TStringBuilder.Append(const AString : string);
var iLen : integer;
begin
iLen := length(AString);
if iLen + FIndex > FBuffMax then _ExpandBuffer;
move(AString[1],FBuffer[FIndex],iLen);
inc(FIndex,iLen);
end;
If the future length is too long for the current buffer size, the buffer is expanded. _ExpandBuffer doubles the size of the buffer, but once that's done, it never checks whether the new buffer size is sufficient. If the original buffer size is 1024, and the file you're loading is 3 KB, then doubling the buffer size to 2048 will still leave the buffer too small in Append, and you'll end up overwriting 1024 bytes beyond the end of the buffer.
Change the if to a while in Append.

(Wide)String - storing in TFileStream, Delphi 7. What is the fastest way?

I'm using Delphi7 (non-unicode VCL), I need to store lots of WideStrings inside a TFileStream. I can't use TStringStream as the (wide)strings are mixed with binary data, the format is projected to speed up loading and writing the data ... However I believe that current way I'm loading/writing the strings might be a bottleneck of my code ...
currently I'm writing length of a string, then writing it char by char ...
while loading, first I'm loading the length, then loading char by char ...
So, what is the fastest way to save and load WideString to TFileStream?
Thanks in advance
Rather than read and write one character at a time, read and write them all at once:
procedure WriteWideString(const ws: WideString; stream: TStream);
var
nChars: LongInt;
begin
nChars := Length(ws);
stream.WriteBuffer(nChars, SizeOf(nChars);
if nChars > 0 then
stream.WriteBuffer(ws[1], nChars * SizeOf(ws[1]));
end;
function ReadWideString(stream: TStream): WideString;
var
nChars: LongInt;
begin
stream.ReadBuffer(nChars, SizeOf(nChars));
SetLength(Result, nChars);
if nChars > 0 then
stream.ReadBuffer(Result[1], nChars * SizeOf(Result[1]));
end;
Now, technically, since WideString is a Windows BSTR, it can contain an odd number of bytes. The Length function reads the number of bytes and divides by two, so it's possible (although not likely) that the code above will cut off the last byte. You could use this code instead:
procedure WriteWideString(const ws: WideString; stream: TStream);
var
nBytes: LongInt;
begin
nBytes := SysStringByteLen(Pointer(ws));
stream.WriteBuffer(nBytes, SizeOf(nBytes));
if nBytes > 0 then
stream.WriteBuffer(Pointer(ws)^, nBytes);
end;
function ReadWideString(stream: TStream): WideString;
var
nBytes: LongInt;
buffer: PAnsiChar;
begin
stream.ReadBuffer(nBytes, SizeOf(nBytes));
if nBytes > 0 then begin
GetMem(buffer, nBytes);
try
stream.ReadBuffer(buffer^, nBytes);
Result := SysAllocStringByteLen(buffer, nBytes)
finally
FreeMem(buffer);
end;
end else
Result := '';
end;
Inspired by Mghie's answer, have replaced my Read and Write calls with ReadBuffer and WriteBuffer. The latter will raise exceptions if they are unable to read or write the requested number of bytes.
There is nothing special about wide strings, to read and write them as fast as possible you need to read and write as much as possible in one go:
procedure TForm1.Button1Click(Sender: TObject);
var
Str: TStream;
W, W2: WideString;
L: integer;
begin
W := 'foo bar baz';
Str := TFileStream.Create('test.bin', fmCreate);
try
// write WideString
L := Length(W);
Str.WriteBuffer(L, SizeOf(integer));
if L > 0 then
Str.WriteBuffer(W[1], L * SizeOf(WideChar));
Str.Seek(0, soFromBeginning);
// read back WideString
Str.ReadBuffer(L, SizeOf(integer));
if L > 0 then begin
SetLength(W2, L);
Str.ReadBuffer(W2[1], L * SizeOf(WideChar));
end else
W2 := '';
Assert(W = W2);
finally
Str.Free;
end;
end;
WideStrings contain a 'string' of WideChar's, which use 2 bytes each. If you want to store the UTF-16 (which WideStrings use internally) strings in a file, and be able to use this file in other programs like notepad, you need to write a byte order mark first: #$FEFF.
If you know this, writing can look like this:
Stream1.Write(WideString1[1],Length(WideString)*2); //2=SizeOf(WideChar)
reading can look like this:
Stream1.Read(WideChar1,2);//assert returned 2 and WideChar1=#$FEFF
SetLength(WideString1,(Stream1.Size div 2)-1);
Stream1.Read(WideString1[1],(Stream1.Size div 2)-1);
You can also use TFastFileStream for reading the data or strings, I pasted the unit at http://pastebin.com/m6ecdc8c2 and a sample below:
program Project36;
{$APPTYPE CONSOLE}
uses
SysUtils, Classes,
FastStream in 'FastStream.pas';
const
WideNull: WideChar = #0;
procedure WriteWideStringToStream(Stream: TFileStream; var Data: WideString);
var
len: Word;
begin
len := Length(Data);
// Write WideString length
Stream.Write(len, SizeOf(len));
if (len > 0) then
begin
// Write WideString
Stream.Write(Data[1], len * SizeOf(WideChar));
end;
// Write null termination
Stream.Write(WideNull, SizeOf(WideNull));
end;
procedure CreateTestFile;
var
Stream: TFileStream;
MyString: WideString;
begin
Stream := TFileStream.Create('test.bin', fmCreate);
try
MyString := 'Hello World!';
WriteWideStringToStream(Stream, MyString);
MyString := 'Speed is Delphi!';
WriteWideStringToStream(Stream, MyString);
finally
Stream.Free;
end;
end;
function ReadWideStringFromStream(Stream: TFastFileStream): WideString;
var
len: Word;
begin
// Read length of WideString
Stream.Read(len, SizeOf(len));
// Read WideString
Result := PWideChar(Cardinal(Stream.Memory) + Stream.Position);
// Update position and skip null termination
Stream.Position := Stream.Position + (len * SizeOf(WideChar)) + SizeOf(WideNull);
end;
procedure ReadTestFile;
var
Stream: TFastFileStream;
my_wide_string: WideString;
begin
Stream := TFastFileStream.Create('test.bin');
try
Stream.Position := 0;
// Read WideString
my_wide_string := ReadWideStringFromStream(Stream);
WriteLn(my_wide_string);
// Read another WideString
my_wide_string := ReadWideStringFromStream(Stream);
WriteLn(my_wide_string);
finally
Stream.Free;
end;
end;
begin
CreateTestFile;
ReadTestFile;
ReadLn;
end.

Resources