FSharpx.Extras


  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
// ----------------------------------------------------------------------------
// F# async extensions (Crawler.fsx)
// (c) Tomas Petricek, 2011, Available under Apache 2.0 license.
// ----------------------------------------------------------------------------

// This example demonstrates how to use asynchronous sequences and
// blocking agents to implement a web crawler. The sample also uses
// various AsyncSeq combinators to process the resulting async sequence.
//
// The first version performs a single-threaded random walk (returned 
// as an asynchronous sequence); the second version crawls concurrently.

#r @"../../bin/v4.0/FSharpx.Extras.dll"
#r @"..\..\packages\HtmlAgilityPack.1.4.2\lib\HtmlAgilityPack.dll"

open System
open System.Net
open System.Text.RegularExpressions
open HtmlAgilityPack

open FSharpx.Control

// ----------------------------------------------------------------------------
// Helper functions for downloading documents, extracting links etc.

/// Asynchronously download the document at `url` and parse it as HTML.
/// Returns `Some doc` on success and `None` if anything fails (bad URL,
/// network error, ...); failures are deliberately swallowed so that the
/// crawler can simply skip pages it cannot fetch.
let downloadDocument url = async {
  try
    // 'use' disposes the WebClient once the download has finished or failed
    use wc = new WebClient()
    let! html = wc.AsyncDownloadString(Uri(url))
    let doc = new HtmlDocument()
    doc.LoadHtml(html)
    return Some doc 
  with _ -> return None }

/// Extract all links from the document that start with "http://".
/// Everything from the first '?' onwards (the query string) is dropped.
/// Returns an empty list when the page has no anchors or when anything
/// goes wrong while reading them (best-effort).
let extractLinks (doc:HtmlDocument) = 
  try
    match doc.DocumentNode.SelectNodes("//a") with
    // No matching nodes: handle it explicitly instead of relying on the
    // exception that iterating a null collection would raise
    | null -> []
    | anchors ->
        [ for a in anchors do
            if a.Attributes.Contains("href") then
              let href = a.Attributes.["href"].Value
              // Ordinal comparison: URL schemes are not culture-sensitive text
              if href.StartsWith("http://", StringComparison.Ordinal) then 
                let queryStart = href.IndexOf('?')
                yield if queryStart > 0 then href.Substring(0, queryStart) else href ]
  with _ -> []

/// Returns the trimmed text of the page's <title> element,
/// or "Untitled" when the document has no such element
let getTitle (doc:HtmlDocument) =
  match doc.DocumentNode.SelectSingleNode("//title") with
  | null -> "Untitled"
  | node -> node.InnerText.Trim()

// ----------------------------------------------------------------------------
// Basic crawling - crawl web pages and follow just one link from every page

/// Crawl the web starting from the specified page. The result is an
/// asynchronous sequence of (url, title) pairs: each page that downloads
/// successfully is yielded, followed by the pages reachable through its
/// links. A URL that has already been seen is never visited again.
let randomCrawl url = 
  // URLs already handed to the crawler (shared by all recursive visits)
  let seen = new System.Collections.Generic.HashSet<_>()

  // Visits one page and then, link by link, everything it references
  let rec visit address = asyncSeq {
    if seen.Add(address) then
      let! page = downloadDocument address
      match page with 
      | None -> ()
      | Some html ->
          // The page itself comes first ...
          yield address, getTitle html
          // ... followed by whatever its links lead to
          for next in extractLinks html do
            yield! visit next }
  visit url

// Start a background crawl and print the titles of the first 10
// pages whose URL does not contain "bing.com"
randomCrawl "http://news.bing.com"
|> AsyncSeq.filter (fun (url, _) -> not (url.Contains("bing.com")))
|> AsyncSeq.map (fun (_, title) -> title)
|> AsyncSeq.take 10
|> AsyncSeq.iter (fun title -> printfn "%s" title)
|> Async.Start


// ----------------------------------------------------------------------------
// Better crawler - crawls the web concurrently using the specified number of
// workers, stores results and pending URLs in blocking buffers and returns
// all results as an asynchronous sequence. After the caller stops taking
// elements from the asynchronous sequence, the blocking buffers will
// eventually fill up and crawling will stop. 

/// Number of crawler workers started by 'concurrentCrawl'
let concurrentWorkers = 20

/// Crawl the web concurrently, starting from the specified page, and return
/// the (url, title) pairs of downloaded pages as an asynchronous sequence.
/// The sequence never ends on its own; the caller decides how much to take.
let concurrentCrawl url = asyncSeq {
  // Number of pending requests is usually very high 
  // (when the queue fills up, the workers will stop, so set this to 10k)
  let requests = BlockingQueueAgent<_>(10000)
  let results = BlockingQueueAgent<_>(40)
  let visited = ConcurrentSetAgent<_>()

  /// Worker repeatedly takes URL from the queue and processes it
  let worker() = async {
    while true do
      let! url = requests.AsyncGet()
      let! doc = downloadDocument url
      match doc with 
      | Some doc ->
          // Publish url and title as the next result
          do! results.AsyncAdd( (url, getTitle doc) )
          // Queue every link that has not been seen before
          for link in extractLinks doc do
            let! added = visited.AsyncAdd(link)
            if added then
              do! requests.AsyncAdd(link) 
      | _ -> () }

  // Mark the start page as visited so that a link pointing back
  // to it does not cause it to be downloaded a second time
  do! visited.AsyncAdd(url) |> Async.Ignore
  // Send the initial request to the crawler, start the workers and
  // then repeatedly take results from the results queue.
  do! requests.AsyncAdd(url)
  // '1 ..' (not '0 ..') so that exactly 'concurrentWorkers' workers run
  for i in 1 .. concurrentWorkers do 
    worker () |> Async.Start
  while true do
    let! res = results.AsyncGet()
    yield res }

// ----------------------------------------------------------------------------
// Visualize the results of crawling - show the most common words in titles

// Create user interface with text box for displaying words
open System.Windows.Forms
// Always-on-top 400x600 window; its title is later used to show progress
let frm = new Form(TopMost=true, Visible=true, Width=400, Height=600)
// Multi-line text box filling the whole form, with a vertical scroll bar
let txt = new TextBox( Multiline = true, Dock = DockStyle.Fill, 
                       Font = new System.Drawing.Font("Cambria", 12.0f),
                       ScrollBars = ScrollBars.Vertical )
frm.Controls.Add(txt)

// Asynchronous sequence of Map<string, int> values: each element maps
// the words seen so far in crawled page titles to their number of
// occurrences (an updated table is produced after every crawled page)
let tables = 
  // Characters that separate words inside a title
  let separators = [|' '; '.'; '-'; '|'; ','; ';' |]
  // Returns the table with the count for 'word' increased by one
  let countWord table word =
    let soFar = defaultArg (Map.tryFind word table) 0
    Map.add word (soFar + 1) table

  concurrentCrawl "http://news.bing.com"
  // Turn every title into an array of lowercase words 
  |> AsyncSeq.map (fun (_, title) -> 
        title.Split(separators, StringSplitOptions.RemoveEmptyEntries)
        |> Array.map (fun s -> s.ToLower()) )
  // Fold the words of each title into the running table,
  // emitting every intermediate table
  |> AsyncSeq.scan (fun table words -> Seq.fold countWord table words) Map.empty

// Take the first 200 word tables and, for each of them, show the number
// of tables processed so far in the window title and the words (most
// frequent first) in the text box
async { 
  let processed = ref 0
  for table in AsyncSeq.take 200 tables do
    processed := !processed + 1
    frm.Text <- sprintf "Processed %d" !processed
    let lines =
      table 
      |> Seq.sortBy (fun (KeyValue(_, count)) -> -count)
      |> Seq.map (fun (KeyValue(word, count)) -> sprintf "%s (%d)" word count)
    txt.Text <- String.concat "\r\n" lines } 
|> Async.StartImmediate
namespace System
namespace System.Net
namespace System.Text
namespace System.Text.RegularExpressions
namespace Microsoft.FSharp.Control
val downloadDocument : url:string -> Async<'a option>

Full name: Crawler.downloadDocument


 Asynchronously download the document and parse the HTML
val url : string
val async : AsyncBuilder

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.async
val wc : WebClient
Multiple items
type WebClient =
  inherit Component
  new : unit -> WebClient
  member BaseAddress : string with get, set
  member CachePolicy : RequestCachePolicy with get, set
  member CancelAsync : unit -> unit
  member Credentials : ICredentials with get, set
  member DownloadData : address:string -> byte[] + 1 overload
  member DownloadDataAsync : address:Uri -> unit + 1 overload
  member DownloadFile : address:string * fileName:string -> unit + 1 overload
  member DownloadFileAsync : address:Uri * fileName:string -> unit + 1 overload
  member DownloadString : address:string -> string + 1 overload
  ...

Full name: System.Net.WebClient

--------------------
WebClient() : unit
val html : string
member WebClient.AsyncDownloadString : address:Uri -> Async<string>
Multiple items
type Uri =
  new : uriString:string -> Uri + 5 overloads
  member AbsolutePath : string
  member AbsoluteUri : string
  member Authority : string
  member DnsSafeHost : string
  member Equals : comparand:obj -> bool
  member Fragment : string
  member GetComponents : components:UriComponents * format:UriFormat -> string
  member GetHashCode : unit -> int
  member GetLeftPart : part:UriPartial -> string
  ...

Full name: System.Uri

--------------------
Uri(uriString: string) : unit
Uri(uriString: string, uriKind: UriKind) : unit
Uri(baseUri: Uri, relativeUri: string) : unit
Uri(baseUri: Uri, relativeUri: Uri) : unit
val doc : 'a
union case Option.Some: Value: 'T -> Option<'T>
union case Option.None: Option<'T>
val extractLinks : doc:'a -> 'b list

Full name: Crawler.extractLinks


 Extract all links from the document that start with "http://"
val a : obj
val href : 'b
val endl : int
val getTitle : doc:'a -> string

Full name: Crawler.getTitle


 Extract the <title> of the web page
val title : obj
val randomCrawl : url:'a -> 'b

Full name: Crawler.randomCrawl


 Crawl the internet starting from the specified page
 From each page follow the first not-yet-visited page
val url : 'a
val visited : Collections.Generic.HashSet<obj>
namespace System.Collections
namespace System.Collections.Generic
Multiple items
type HashSet<'T> =
  new : unit -> HashSet<'T> + 3 overloads
  member Add : item:'T -> bool
  member Clear : unit -> unit
  member Comparer : IEqualityComparer<'T>
  member Contains : item:'T -> bool
  member CopyTo : array:'T[] -> unit + 2 overloads
  member Count : int
  member ExceptWith : other:IEnumerable<'T> -> unit
  member GetEnumerator : unit -> Enumerator<'T>
  member GetObjectData : info:SerializationInfo * context:StreamingContext -> unit
  ...
  nested type Enumerator

Full name: System.Collections.Generic.HashSet<_>

--------------------
Collections.Generic.HashSet() : unit
Collections.Generic.HashSet(comparer: Collections.Generic.IEqualityComparer<'T>) : unit
Collections.Generic.HashSet(collection: Collections.Generic.IEnumerable<'T>) : unit
Collections.Generic.HashSet(collection: Collections.Generic.IEnumerable<'T>, comparer: Collections.Generic.IEqualityComparer<'T>) : unit
val loop : ('c -> 'd)
val url : 'c
Collections.Generic.HashSet.Add(item: obj) : bool
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
val snd : tuple:('T1 * 'T2) -> 'T2

Full name: Microsoft.FSharp.Core.Operators.snd
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
Multiple items
type Async
static member AsBeginEnd : computation:('Arg -> Async<'T>) -> ('Arg * AsyncCallback * obj -> IAsyncResult) * (IAsyncResult -> 'T) * (IAsyncResult -> unit)
static member AwaitEvent : event:IEvent<'Del,'T> * ?cancelAction:(unit -> unit) -> Async<'T> (requires delegate and 'Del :> Delegate)
static member AwaitIAsyncResult : iar:IAsyncResult * ?millisecondsTimeout:int -> Async<bool>
static member AwaitTask : task:Task -> Async<unit>
static member AwaitTask : task:Task<'T> -> Async<'T>
static member AwaitWaitHandle : waitHandle:WaitHandle * ?millisecondsTimeout:int -> Async<bool>
static member CancelDefaultToken : unit -> unit
static member Catch : computation:Async<'T> -> Async<Choice<'T,exn>>
static member FromBeginEnd : beginAction:(AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg:'Arg1 * beginAction:('Arg1 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * beginAction:('Arg1 * 'Arg2 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromBeginEnd : arg1:'Arg1 * arg2:'Arg2 * arg3:'Arg3 * beginAction:('Arg1 * 'Arg2 * 'Arg3 * AsyncCallback * obj -> IAsyncResult) * endAction:(IAsyncResult -> 'T) * ?cancelAction:(unit -> unit) -> Async<'T>
static member FromContinuations : callback:(('T -> unit) * (exn -> unit) * (OperationCanceledException -> unit) -> unit) -> Async<'T>
static member Ignore : computation:Async<'T> -> Async<unit>
static member OnCancel : interruption:(unit -> unit) -> Async<IDisposable>
static member Parallel : computations:seq<Async<'T>> -> Async<'T []>
static member RunSynchronously : computation:Async<'T> * ?timeout:int * ?cancellationToken:CancellationToken -> 'T
static member Sleep : millisecondsDueTime:int -> Async<unit>
static member Start : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions * ?cancellationToken:CancellationToken -> Task<'T>
static member StartChild : computation:Async<'T> * ?millisecondsTimeout:int -> Async<Async<'T>>
static member StartChildAsTask : computation:Async<'T> * ?taskCreationOptions:TaskCreationOptions -> Async<Task<'T>>
static member StartImmediate : computation:Async<unit> * ?cancellationToken:CancellationToken -> unit
static member StartWithContinuations : computation:Async<'T> * continuation:('T -> unit) * exceptionContinuation:(exn -> unit) * cancellationContinuation:(OperationCanceledException -> unit) * ?cancellationToken:CancellationToken -> unit
static member SwitchToContext : syncContext:SynchronizationContext -> Async<unit>
static member SwitchToNewThread : unit -> Async<unit>
static member SwitchToThreadPool : unit -> Async<unit>
static member TryCancelled : computation:Async<'T> * compensation:(OperationCanceledException -> unit) -> Async<'T>
static member CancellationToken : Async<CancellationToken>
static member DefaultCancellationToken : CancellationToken

Full name: Microsoft.FSharp.Control.Async

--------------------
type Async<'T>

Full name: Microsoft.FSharp.Control.Async<_>
static member Async.Start : computation:Async<unit> * ?cancellationToken:Threading.CancellationToken -> unit
val concurrentWorkers : int

Full name: Crawler.concurrentWorkers
val concurrentCrawl : url:'a -> 'b

Full name: Crawler.concurrentCrawl
namespace System.Windows
namespace System.Windows.Forms
val frm : Form

Full name: Crawler.frm
Multiple items
type Form =
  inherit ContainerControl
  new : unit -> Form
  member AcceptButton : IButtonControl with get, set
  member Activate : unit -> unit
  member ActiveMdiChild : Form
  member AddOwnedForm : ownedForm:Form -> unit
  member AllowTransparency : bool with get, set
  member AutoScale : bool with get, set
  member AutoScaleBaseSize : Size with get, set
  member AutoScroll : bool with get, set
  member AutoSize : bool with get, set
  ...
  nested type ControlCollection

Full name: System.Windows.Forms.Form

--------------------
Form() : unit
val txt : TextBox

Full name: Crawler.txt
Multiple items
type TextBox =
  inherit TextBoxBase
  new : unit -> TextBox
  member AcceptsReturn : bool with get, set
  member AutoCompleteCustomSource : AutoCompleteStringCollection with get, set
  member AutoCompleteMode : AutoCompleteMode with get, set
  member AutoCompleteSource : AutoCompleteSource with get, set
  member CharacterCasing : CharacterCasing with get, set
  member Multiline : bool with get, set
  member PasswordChar : char with get, set
  member Paste : text:string -> unit
  member ScrollBars : ScrollBars with get, set
  ...

Full name: System.Windows.Forms.TextBox

--------------------
TextBox() : unit
type DockStyle =
  | None = 0
  | Top = 1
  | Bottom = 2
  | Left = 3
  | Right = 4
  | Fill = 5

Full name: System.Windows.Forms.DockStyle
field DockStyle.Fill = 5
namespace System.Drawing
Multiple items
type Font =
  inherit MarshalByRefObject
  new : prototype:Font * newStyle:FontStyle -> Font + 12 overloads
  member Bold : bool
  member Clone : unit -> obj
  member Dispose : unit -> unit
  member Equals : obj:obj -> bool
  member FontFamily : FontFamily
  member GdiCharSet : byte
  member GdiVerticalFont : bool
  member GetHashCode : unit -> int
  member GetHeight : unit -> float32 + 2 overloads
  ...

Full name: System.Drawing.Font

--------------------
Drawing.Font(prototype: Drawing.Font, newStyle: Drawing.FontStyle) : unit
   (+0 other overloads)
Drawing.Font(family: Drawing.FontFamily, emSize: float32) : unit
   (+0 other overloads)
Drawing.Font(familyName: string, emSize: float32) : unit
   (+0 other overloads)
Drawing.Font(family: Drawing.FontFamily, emSize: float32, style: Drawing.FontStyle) : unit
   (+0 other overloads)
Drawing.Font(family: Drawing.FontFamily, emSize: float32, unit: Drawing.GraphicsUnit) : unit
   (+0 other overloads)
Drawing.Font(familyName: string, emSize: float32, style: Drawing.FontStyle) : unit
   (+0 other overloads)
Drawing.Font(familyName: string, emSize: float32, unit: Drawing.GraphicsUnit) : unit
   (+0 other overloads)
Drawing.Font(family: Drawing.FontFamily, emSize: float32, style: Drawing.FontStyle, unit: Drawing.GraphicsUnit) : unit
   (+0 other overloads)
Drawing.Font(familyName: string, emSize: float32, style: Drawing.FontStyle, unit: Drawing.GraphicsUnit) : unit
   (+0 other overloads)
Drawing.Font(family: Drawing.FontFamily, emSize: float32, style: Drawing.FontStyle, unit: Drawing.GraphicsUnit, gdiCharSet: byte) : unit
   (+0 other overloads)
type ScrollBars =
  | None = 0
  | Horizontal = 1
  | Vertical = 2
  | Both = 3

Full name: System.Windows.Forms.ScrollBars
field ScrollBars.Vertical = 2
property Control.Controls: Control.ControlCollection
Control.ControlCollection.Add(value: Control) : unit
val tables : obj

Full name: Crawler.tables
type StringSplitOptions =
  | None = 0
  | RemoveEmptyEntries = 1

Full name: System.StringSplitOptions
field StringSplitOptions.RemoveEmptyEntries = 1
type Array =
  member Clone : unit -> obj
  member CopyTo : array:Array * index:int -> unit + 1 overload
  member GetEnumerator : unit -> IEnumerator
  member GetLength : dimension:int -> int
  member GetLongLength : dimension:int -> int64
  member GetLowerBound : dimension:int -> int
  member GetUpperBound : dimension:int -> int
  member GetValue : [<ParamArray>] indices:int[] -> obj + 7 overloads
  member Initialize : unit -> unit
  member IsFixedSize : bool
  ...

Full name: System.Array
val map : mapping:('T -> 'U) -> array:'T [] -> 'U []

Full name: Microsoft.FSharp.Collections.Array.map
module Seq

from Microsoft.FSharp.Collections
val fold : folder:('State -> 'T -> 'State) -> state:'State -> source:seq<'T> -> 'State

Full name: Microsoft.FSharp.Collections.Seq.fold
Multiple items
module Map

from Microsoft.FSharp.Collections

--------------------
type Map<'Key,'Value (requires comparison)> =
  interface IEnumerable
  interface IComparable
  interface IEnumerable<KeyValuePair<'Key,'Value>>
  interface ICollection<KeyValuePair<'Key,'Value>>
  interface IDictionary<'Key,'Value>
  new : elements:seq<'Key * 'Value> -> Map<'Key,'Value>
  member Add : key:'Key * value:'Value -> Map<'Key,'Value>
  member ContainsKey : key:'Key -> bool
  override Equals : obj -> bool
  member Remove : key:'Key -> Map<'Key,'Value>
  ...

Full name: Microsoft.FSharp.Collections.Map<_,_>

--------------------
new : elements:seq<'Key * 'Value> -> Map<'Key,'Value>
val tryFind : key:'Key -> table:Map<'Key,'T> -> 'T option (requires comparison)

Full name: Microsoft.FSharp.Collections.Map.tryFind
val add : key:'Key -> value:'T -> table:Map<'Key,'T> -> Map<'Key,'T> (requires comparison)

Full name: Microsoft.FSharp.Collections.Map.add
val empty<'Key,'T (requires comparison)> : Map<'Key,'T> (requires comparison)

Full name: Microsoft.FSharp.Collections.Map.empty
val counter : int ref
Multiple items
val ref : value:'T -> 'T ref

Full name: Microsoft.FSharp.Core.Operators.ref

--------------------
type 'T ref = Ref<'T>

Full name: Microsoft.FSharp.Core.ref<_>
val table : seq<Collections.Generic.KeyValuePair<string,int>>
property Form.Text: string
val sprintf : format:Printf.StringFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.sprintf
property TextBox.Text: string
val sortBy : projection:('T -> 'Key) -> source:seq<'T> -> seq<'T> (requires comparison)

Full name: Microsoft.FSharp.Collections.Seq.sortBy
active recognizer KeyValue: Collections.Generic.KeyValuePair<'Key,'Value> -> 'Key * 'Value

Full name: Microsoft.FSharp.Core.Operators.( |KeyValue| )
val k : string
val v : int
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>

Full name: Microsoft.FSharp.Collections.Seq.map
Multiple items
type String =
  new : value:char -> string + 7 overloads
  member Chars : int -> char
  member Clone : unit -> obj
  member CompareTo : value:obj -> int + 1 overload
  member Contains : value:string -> bool
  member CopyTo : sourceIndex:int * destination:char[] * destinationIndex:int * count:int -> unit
  member EndsWith : value:string -> bool + 2 overloads
  member Equals : obj:obj -> bool + 2 overloads
  member GetEnumerator : unit -> CharEnumerator
  member GetHashCode : unit -> int
  ...

Full name: System.String

--------------------
String(value: nativeptr<char>) : unit
String(value: nativeptr<sbyte>) : unit
String(value: char []) : unit
String(c: char, count: int) : unit
String(value: nativeptr<char>, startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int) : unit
String(value: char [], startIndex: int, length: int) : unit
String(value: nativeptr<sbyte>, startIndex: int, length: int, enc: Text.Encoding) : unit
val concat : sep:string -> strings:seq<string> -> string

Full name: Microsoft.FSharp.Core.String.concat
static member Async.StartImmediate : computation:Async<unit> * ?cancellationToken:Threading.CancellationToken -> unit
Fork me on GitHub