Asynchronous webcrawling in F#, something wrong? - F#

Not quite sure if it is OK to ask this, but my question is: is there something wrong with my code? It doesn't go as fast as I would like, and since I am using lots of async workflows, maybe I am doing something wrong. The goal here is to build something that can crawl 20,000 pages in less than an hour.
open System
open System.Text
open System.Net
open System.IO
open System.Text.RegularExpressions
open System.Collections.Generic
open System.ComponentModel
open Microsoft.FSharp
open System.Threading
//This is the Parallel.Fs file
type ComparableUri(uri: string) =
    inherit System.Uri(uri)

    let elts (uri: System.Uri) =
        uri.Scheme, uri.Host, uri.Port, uri.Segments

    interface System.IComparable with
        member this.CompareTo(uri2) =
            compare (elts this) (elts (uri2 :?> ComparableUri))

    override this.Equals(uri2) =
        compare this (uri2 :?> ComparableUri) = 0

    // constant hash is degenerate but harmless here: F# Set is comparison-based
    override this.GetHashCode() = 0
///////////////////////////////////////////////Functions to retrieve html string//////////////////////////////
let mutable error = Set.empty<ComparableUri>
let mutable visited = Set.empty<ComparableUri>
let getHtmlPrimitiveAsyncDelay (delay: int) (uri: ComparableUri) =
    async {
        try
            let req = WebRequest.Create(uri) :?> HttpWebRequest
            req.UserAgent <- "Mozilla"
            //Console.WriteLine("Waiting")
            do! Async.Sleep(delay * 250)
            let! resp = req.AsyncGetResponse()
            Console.WriteLine(uri.AbsoluteUri + " got response after delay " + string delay)
            // 'use' is the equivalent of 'using' in C# for an IDisposable
            use stream = resp.GetResponseStream()
            use reader = new StreamReader(stream)
            let html = reader.ReadToEnd()
            return html
        with ex ->
            Console.WriteLine(ex.ToString())
            lock error (fun () -> error <- error.Add uri)
            lock visited (fun () -> visited <- visited.Add uri)
            return "BadUri"
    }
///////////////////////////////////////////////Active Pattern Matching to retreive href//////////////////////////////
let (|Matches|_|) (pat: string) (inp: string) =
    let m = Regex.Matches(inp, pat)
    // a MatchCollection contains whole matches only (there is no "entire input" group),
    // so keep every element rather than dropping the head with List.tail
    if m.Count > 0 then
        Some [ for g in m -> g.Value ]
    else
        None

let (|Match|_|) (pat: string) (inp: string) =
    let m = Regex.Match(inp, pat)
    // note the List.tail, since Groups.[0] is always the entirety of the matched string
    if m.Success then
        Some(List.tail [ for g in m.Groups -> g.Value ])
    else
        None
///////////////////////////////////////////////Find Bad href//////////////////////////////
let isEmail (link: string) =
    // note: '#' really detects fragment/anchor links rather than email addresses
    link.Contains("#")

let isMailto (link: string) =
    Seq.length link >= 6 && link.[0..5] = "mailto"

let isJavascript (link: string) =
    Seq.length link >= 10 && link.[0..9] = "javascript"

let isBadUri (link: string) =
    link = "BadUri"

let isEmptyHttp (link: string) =
    link = "http://"

let isFile (link: string) =
    Seq.length link >= 6 && link.[0..5] = "file:/"

let containsPipe (link: string) =
    link.Contains("|")

let isAdLink (link: string) =
    // StartsWith avoids the original's unreachable elif branch and
    // its comparison of a 9-character slice against a 13-character string
    let lower = link.ToLowerInvariant()
    lower.StartsWith("adlink") || lower.StartsWith("http://adlink")
///////////////////////////////////////////////Retrieve href links//////////////////////////////
let getHref (htmlString: string) =
    let urlPat = "href=\"([^\"]+)"
    let links =
        match htmlString with
        | Matches urlPat urls ->
            urls
            |> List.map (fun href ->
                match href with
                | Match urlPat [ link ] -> link
                | _ -> failwith "The href was not in the correct format: there was more than one match")
        | _ ->
            Console.WriteLine("No links for this page")
            []
    // binding the match result first makes sure the filters apply to both branches
    links
    |> List.filter (fun link -> not (isEmail link))
    |> List.filter (fun link -> not (isMailto link))
    |> List.filter (fun link -> not (isJavascript link))
    |> List.filter (fun link -> not (isBadUri link))
    |> List.filter (fun link -> not (isEmptyHttp link))
    |> List.filter (fun link -> not (isFile link))
    |> List.filter (fun link -> not (containsPipe link))
    |> List.filter (fun link -> not (isAdLink link))

let treatAjax (href: System.Uri) =
    // strip any '#fragment' so ajax anchors do not yield distinct uris
    let link = href.ToString()
    let firstPart = link.Split([| "#" |], System.StringSplitOptions.None).[0]
    new Uri(firstPart)
//only follow pages with certain extensions, or ones with no extension
let followHref (href: System.Uri) =
    let valid2 = set [ ".py" ]
    let valid3 = set [ ".php"; ".htm"; ".asp" ]
    let valid4 = set [ ".php3"; ".php4"; ".php5"; ".html"; ".aspx" ]
    let arrLength = href.Segments |> Array.length
    let lastExtension = href.Segments.[arrLength - 1]
    let lengthLastExtension = Seq.length lastExtension
    if lengthLastExtension <= 3 then
        not (lastExtension.Contains("."))
    else
        // test for the 2-letter extensions (.py)
        let last4 = lastExtension.[lengthLastExtension - 4 .. lengthLastExtension - 1]
        let isValid2 = valid2 |> Seq.exists (fun validEnd -> last4.EndsWith(validEnd))
        if isValid2 then
            true
        elif lengthLastExtension <= 4 then
            not (last4.Contains("."))
        else
            let last5 = lastExtension.[lengthLastExtension - 5 .. lengthLastExtension - 1]
            let isValid3 = valid3 |> Seq.exists (fun validEnd -> last5.EndsWith(validEnd))
            if isValid3 then
                true
            elif lengthLastExtension <= 5 then
                not (last5.Contains("."))
            else
                let last6 = lastExtension.[lengthLastExtension - 6 .. lengthLastExtension - 1]
                let isValid4 = valid4 |> Seq.exists (fun validEnd -> last6.EndsWith(validEnd))
                if isValid4 then
                    true
                else
                    not (last6.Contains(".")) && not (lastExtension.[0..5] = "mailto")
//Create the correct links: resolve relative ones against the homepage, then make them ComparableUris
let hrefLinksToUri (uri: ComparableUri) (hrefLinks: string list) =
    hrefLinks
    |> List.map (fun link ->
        try
            if Seq.length link < 4 then
                Some(new Uri(uri, link))
            elif link.[0..3] = "http" then
                Some(new Uri(link))
            else
                Some(new Uri(uri, link))
        with _ ->
            Console.WriteLine(link)
            lock error (fun () -> error <- error.Add uri)
            None)
    |> List.filter (fun link -> link.IsSome)
    |> List.map (fun o -> o.Value)
    |> List.map (fun uri -> new ComparableUri(string uri))
//Treat each uri: remove any trailing ajax fragment and only follow the link types specified by Benoit
let linksToFollow (hrefUris: ComparableUri list) =
    hrefUris
    |> List.map treatAjax
    |> List.filter (fun link -> followHref link)
    |> List.map (fun uri -> new ComparableUri(string uri))
    |> Set.ofList

let needToVisit uri =
    (lock visited (fun () -> not (visited.Contains uri)))
    && (lock error (fun () -> not (error.Contains uri)))

let getLinksToFollowAsyncDelay (delay: int) (uri: ComparableUri) =
    async {
        let! links = getHtmlPrimitiveAsyncDelay delay uri
        lock visited (fun () -> visited <- visited.Add uri)
        let linksToFollow =
            getHref links
            |> hrefLinksToUri uri
            |> linksToFollow
            |> Set.filter needToVisit
        return linksToFollow
    }
let getDelay (uri: ComparableUri) (authorityDelay: Dictionary<string, System.Diagnostics.Stopwatch>) =
    let uriAuthority = uri.Authority
    let hasAuthority, watch = authorityDelay.TryGetValue(uriAuthority)
    if hasAuthority then
        // wait out whatever remains of a 500 ms window since this authority was last hit
        let elapsed = watch.Elapsed
        let s = TimeSpan(0, 0, 0, 0, 500) - elapsed
        if s.TotalMilliseconds < 0.0 then 0
        else int s.TotalMilliseconds
    else
        let temp = System.Diagnostics.Stopwatch()
        temp.Start()
        authorityDelay.Add(uriAuthority, temp)
        0
let rec getLinksToFollowFromSetAsync maxIteration (uris: seq<ComparableUri>) =
    let authorityDelay = Dictionary<string, System.Diagnostics.Stopwatch>()
    if maxIteration = 100 then
        Console.WriteLine("Finished")
    else
        // group by authority: delay uris that share an authority, let the others go straight through
        let stopwatch = System.Diagnostics.Stopwatch()
        stopwatch.Start()
        let newLinks =
            uris
            |> Seq.map (fun uri ->
                let delay = lock authorityDelay (fun () -> getDelay uri authorityDelay)
                getLinksToFollowAsyncDelay delay uri)
            |> Async.Parallel
            |> Async.RunSynchronously
            |> Seq.concat
        stopwatch.Stop()
        Console.WriteLine("\n\n\n\n\n\n\nTime elapsed : " + string stopwatch.Elapsed + "\n\n\n\n\n\n\n\n\n")
        getLinksToFollowFromSetAsync (maxIteration + 1) newLinks

// PSeq is assumed to come from a parallel sequence library (the F# PowerPack at the time)
seq [ set [ ComparableUri("http://rue89.com/") ] ]
|> PSeq.ofSeq
|> PSeq.iter (getLinksToFollowFromSetAsync 0)

getLinksToFollowFromSetAsync 0 (seq [ ComparableUri("http://twitter.com/") ])
Console.WriteLine("Finished")
Some feedback would be great! Thank you (note: this is just something I am doing for fun).

I think the culprit is the line do! Async.Sleep(delay * 250): you gradually wait longer and longer. What is the reason for it?
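For what it's worth, getDelay already returns the remaining wait in milliseconds (at most 500), so multiplying it by 250 stretches a sub-second politeness pause into minutes per request. A minimal sketch of the fix, assuming the 500 ms per-authority window is what was intended:

// delay is already expressed in milliseconds, so sleep it directly
do! Async.Sleep(delay)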

Related

Sending SSO cookies when doing an ajax call in websharper

I've created a website using WebSharper and have stumbled into a problem. I wish to integrate the site with the VSTS REST API. To do that (seamlessly) I need to forward a session cookie. How do I do that in a WebSharper Ajax call? My current implementation of the Ajax call, prior to needing this, looks like the following and works just fine for the other needs I've had so far:
let Ajax (request : Request) =
    let httpMethod = request.Method
    let url = request.EndPoint
    let data = request.AsJson
    let success ok =
        System.Action<obj, string, JqXHR>(
            fun res _ _ ->
                let result = (res :?> string |> Json.Parse)
                if JS.HasOwnProperty result "error" then
                    { ErrorType = result?error
                      Reason = result?reason }
                    |> pushError
                else
                    result
                    |> Success
                    |> ok
        )
    let contentType = Union<bool, string>.Union2Of2("application/json")
    try
        Async.FromContinuations
        <| fun (ok, ko, _) ->
            let settings =
                JQuery.AjaxSettings(
                    Url = url,
                    DataType = JQuery.DataType.Text,
                    Type = As<JQuery.RequestType> httpMethod,
                    Success = success ok,
                    ContentType = contentType,
                    Error =
                        System.Action<JqXHR, string, string>(fun jqXHR _ _ ->
                            let error = jqXHR?responseText |> Json.Parse
                            { ErrorType = error?error
                              Reason = error?reason }
                            |> pushError
                            |> ok)
                )
            match data with
            | Some data -> settings.Data <- data
            | None -> ()
            JQuery.Ajax(settings) |> ignore
    with e ->
        async {
            return
                { ErrorType = "uncaught exception"
                  Reason = e.Message }
                |> Error
        }
It turns out that the solution is pretty easy. After creating the AjaxSettings object, simply use dynamic typing to add the xhrFields object:
settings?xhrFields <- obj()
settings?xhrFields?withCredentials <- true
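With withCredentials set, the browser includes cookies on the cross-origin XHR, which is what forwards the SSO session cookie. A sketch of where the two lines sit, reusing the names from the code above:

let settings = JQuery.AjaxSettings(Url = url, DataType = JQuery.DataType.Text)
// attach xhrFields dynamically before handing the settings to JQuery.Ajax
settings?xhrFields <- obj()
settings?xhrFields?withCredentials <- true
JQuery.Ajax(settings) |> ignore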

How do I get items from an RSS feed using .Net Core?

The following code doesn't appear to work:
// assumed extras: open System.Xml for XmlReader, plus NUnit/FsUnit for [<Test>] and should
open Microsoft.SyndicationFeed
open Microsoft.SyndicationFeed.Rss

[<Test>]
let ``Get links from iTunes RSS Feed`` () =
    let url = "http://www.pwop.com/feed.aspx?show=dotnetrocks&filetype=master&tags=F%23"
    use reader = XmlReader.Create(url)
    let feedReader = RssFeedReader(reader)
    let linkTemplate = {
        Title = ""
        Url = ""
    }
    let links =
        async {
            let links = Collections.Generic.List<Link>()
            while feedReader.Read() |> Async.AwaitTask |> Async.RunSynchronously do
                match feedReader.ElementType with
                | SyndicationElementType.Link ->
                    let item = feedReader.ReadLink() |> Async.AwaitTask |> Async.RunSynchronously
                    let link = { linkTemplate with Title = item.Title; Url = item.Uri.AbsolutePath }
                    links.Add(link)
                | _ -> ()
            return links
        }
        |> Async.RunSynchronously
    reader.Close()
    System.Diagnostics.Debug.WriteLine(links.[0].Title)
    links.[0].Title |> should not' (equal "")
Specifically, items are read but there's no actual data after the read.
I used the XElement class as recommended:
[<Test>]
let ``Get links from iTunes RSS Feed`` () =
    // maps an <item> element to the app's own Link record
    let toLink (item: XElement) = {
        Id = -1
        ProfileId = "to be derived..."
        Title = item.Element(XName.Get("title")) |> string
        Url = item.Element(XName.Get("link")) |> string
        Description = item.Element(XName.Get("description")) |> string
        ContentType = Podcast |> contentTypeToString
        Topics = []
        IsFeatured = false
    }
    let baseAddress = "http://www.pwop.com/"
    let url = "feed.aspx?show=dotnetrocks&filetype=master&tags=F%23"
    use client = httpClient baseAddress
    let response =
        client.GetAsync(url)
        |> Async.AwaitTask
        |> Async.RunSynchronously
    let links =
        if response.IsSuccessStatusCode then
            let text =
                response.Content.ReadAsStringAsync()
                |> Async.AwaitTask
                |> Async.RunSynchronously
            XElement.Parse(text).Descendants(XName.Get("item"))
            |> Seq.toList
            |> List.map toLink
        else []
    links |> List.isEmpty |> should equal false
links |> List.isEmpty |> should equal false

Suave and DotLiquid

edited for clarity
somehow this works:
path "/" >=> warbler (fun _ -> OK (string DateTime.Now))
but this one does not:
let txnAccA =
    let sqlStr = "select JSON from Store.Txn"
    let result = Db.execute sqlStr Config.oConnStr
    match result with
    | Some a ->
        [ for i in a do
            let msg = JsonConvert.DeserializeObject<TxnAccA>(i)
            yield msg ]
    | _ ->
        List.empty<TxnAccA>

let txnAmtA =
    let sqlStr = "select JSON from Store.Amt"
    let result = Db.execute sqlStr Config.oConnStr
    match result with
    | Some a ->
        [ for i in a do
            let msg = JsonConvert.DeserializeObject<TxnAmtA>(i)
            yield msg ]
    | _ ->
        List.empty<TxnAmtA>

let result () = { Acc = txnAccA; Amt = txnAmtA }

path "/txn" >=> warbler (fun _ -> page "txn.html" (result()))
By "works" I mean that the page is not static, it displays latest data from database. Any idea why?
txnAccA and txnAmtA need to be functions (similar to result). They are defined as values now, so get assigned once and will not query the DB for every request. result will create a new record every time when called, but the values stay always the same.
let txnAccA () = //...
let txnAmtA () = //...

let result () = { Acc = txnAccA(); Amt = txnAmtA() }

path "/txn" >=> warbler (fun _ -> page "txn.html" (result()))

Why does ZeroMQ's socket.recv call hang in the following code?

I have encountered a problem with a simple pub-sub example in ZeroMQ. I have read plenty of documentation, but I cannot seem to find an answer.
I got libzmq and clrzmq from NuGet. For both functions below, the socket address is:
let sktAddr = "tcp://127.0.0.1:3456"
Here is a simple publisher that sends a message every second.
// Publisher - this seems to work fine
let publisher () : unit =
    let skt = (new ZMQ.Context()).Socket(ZMQ.SocketType.PUB)
    skt.SetSockOpt(ZMQ.SocketOpt.LINGER, 0)
    skt.Bind sktAddr
    skt.SendMore("TEST_TOPIC", Text.Encoding.Unicode) |> ignore
    let rec h1 () : unit =
        let nv = DateTime.Now.ToUniversalTime().ToString()
        printfn "Sending value: %s" nv
        skt.Send(Text.Encoding.Unicode.GetBytes nv) |> ignore
        Threading.Thread.Sleep 1000
        let swt = new Threading.SpinWait()
        swt.SpinOnce()
        if Console.KeyAvailable then
            match Console.ReadKey().Key with
            | ConsoleKey.Q -> ()
            | _ -> h1()
        else
            h1()
    h1()
The following simple subscriber throws no error, but hangs at the line indicated below.
// Subscriber
let subscriber () : unit =
    let skt = (new ZMQ.Context()).Socket(ZMQ.SocketType.SUB)
    skt.Connect sktAddr
    skt.Subscribe("TEST_TOPIC", Text.Encoding.Unicode)
    let rec h1 () : unit =
        let oDat = skt.Recv() // THE PROGRAMME HANGS HERE!
        let strODat = (new Text.UnicodeEncoding()).GetString oDat
        if oDat <> null then
            printfn "Received: %s" strODat
        else
            printfn "No data received"
        let swt = new System.Threading.SpinWait()
        swt.SpinOnce()
        if Console.KeyAvailable then
            match Console.ReadKey().Key with
            | ConsoleKey.Q -> ()
            | _ -> h1()
        else
            h1()
    h1()
I have read this question, but no solution is provided. So I am posting a new question here.
Thanks in advance for your help.
I believe the problem is in the publisher:
skt.SendMore("TEST_TOPIC", Text.Encoding.Unicode)
Not knowing F#, it appears the above statement happens outside the loop. If the subscriber is listening on TEST_TOPIC, every message originating from the publisher needs the topic name to precede its content, so the publisher must do this each time it sends:
skt.SendMore("TEST_TOPIC", Text.Encoding.Unicode)
skt.Send("some data here", Text.Encoding.Unicode)
...try this:
let publisher () : unit =
    let skt = (new ZMQ.Context()).Socket(ZMQ.SocketType.PUB)
    skt.SetSockOpt(ZMQ.SocketOpt.LINGER, 0)
    skt.Bind sktAddr
    let rec h1 () : unit =
        let nv = DateTime.Now.ToUniversalTime().ToString()
        printfn "Sending value: %s" nv
        // send the topic frame, then the payload, on every iteration
        skt.SendMore("TEST_TOPIC", Text.Encoding.Unicode) |> ignore
        skt.Send(Text.Encoding.Unicode.GetBytes nv) |> ignore
        Threading.Thread.Sleep 1000
        let swt = new Threading.SpinWait()
        swt.SpinOnce()
        if Console.KeyAvailable then
            match Console.ReadKey().Key with
            | ConsoleKey.Q -> ()
            | _ -> h1()
        else
            h1()
    h1()
...and the subscriber has to receive twice for each message:
// Subscriber
let subscriber () : unit =
    let skt = (new ZMQ.Context()).Socket(ZMQ.SocketType.SUB)
    skt.Connect sktAddr
    skt.Subscribe("TEST_TOPIC", Text.Encoding.Unicode)
    let rec h1 () : unit =
        let topicName = skt.Recv() // first frame: the topic name
        let oDat = skt.Recv()      // second frame: the payload
        if oDat <> null then
            let strODat = (new Text.UnicodeEncoding()).GetString oDat
            printfn "Received: %s" strODat
        else
            printfn "No data received"
        let swt = new System.Threading.SpinWait()
        swt.SpinOnce()
        if Console.KeyAvailable then
            match Console.ReadKey().Key with
            | ConsoleKey.Q -> ()
            | _ -> h1()
        else
            h1()
    h1()

Webcrawler - Fetch links

I'm trying to crawl a webpage, get all the links, and add them to a list<string> which the function returns in the end.
My code:
let getUrls s : seq<string> =
    let doc = new HtmlDocument() in
    doc.LoadHtml s
    doc.DocumentNode.SelectNodes "//a[@href]"
    |> Seq.map (fun z -> string z.Attributes.["href"])

let crawler uri : seq<string> =
    let rec crawl url =
        let web = new WebClient()
        let data = web.DownloadString url
        getUrls data |> Seq.map crawl (* <-- ERROR HERE *)
    crawl uri
The problem is that the last line in the crawl function (the getUrls ... |> Seq.map crawl) throws an error:

Type mismatch. Expecting a string -> 'a but given a string -> seq<'a>. The resulting type would be infinite when unifying ''a' and 'seq<'a>'
crawl is returning unit, but is expected to return seq<string>. I think you want something like:
let crawler uri =
    let rec crawl url =
        seq {
            let web = new WebClient()
            let data = web.DownloadString url
            for url in getUrls data do
                yield url
                yield! crawl url
        }
    crawl uri
Adding a type annotation to crawl should point out the issue.
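For instance, a sketch of the annotated version; the compiler then reports the mismatch on the Seq.map line instead of an infinite-type error:

let crawler uri : seq<string> =
    let rec crawl (url: string) : seq<string> =
        let web = new WebClient()
        let data = web.DownloadString url
        getUrls data |> Seq.map crawl // mismatch reported here: seq<seq<string>> vs seq<string>
    crawl uri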
I think something like this:
let crawler (uri : seq<string>) =
    let rec crawl url =
        let data = Seq.empty
        getUrls data
        |> Seq.toList
        |> function
            | h :: t ->
                crawl h
                t |> List.iter crawl
            | _ -> ()
    crawl uri
In order to fetch links:
open System.Net
open System.IO
open System.Text.RegularExpressions

type Url(x: string) =
    member this.tostring = sprintf "%A" x
    member this.request = System.Net.WebRequest.Create(x)
    member this.response = this.request.GetResponse()
    member this.stream = this.response.GetResponseStream()
    member this.reader = new System.IO.StreamReader(this.stream)
    member this.html = this.reader.ReadToEnd()

// verbatim string so \s and the embedded quotes survive as regex syntax
let linkex = @"href=\s*""[^""h]*(http://[^&""]*)"""

let getLinks (txt: string) =
    [ for m in Regex.Matches(txt, linkex) -> m.Groups.Item(1).Value ]

let collectLinks (url: Url) =
    url.html |> getLinks
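A usage sketch, assuming the definitions above (note the regex only captures absolute http:// links):

collectLinks (Url "http://www.example.com/")
|> List.iter (printfn "%s")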
