1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55 | import java.util.Date
import akka.actor.{ActorSystem, Cancellable}
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._
import scala.collection.mutable
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.io.Codec
/** Polls a leboncoin.fr search page on a fixed schedule and reports every newly seen offer
  * that passes [[MyApp.myFilter]] through [[MyApp.myNotifier]].
  */
object MyApp {

  import scala.util.control.NonFatal

  /** Simple model for an offer.
    *
    * @param description text extracted from the offer's description block
    * @param vendor      vendor name extracted from the offer's "pro" line
    * @param url         absolute URL of the offer page
    */
  case class Offer(description: String, vendor: String, url: String)

  /** Search result page that is scraped on every run. */
  val searchUrl: String =
    """https://www.leboncoin.fr/_loisirs_/offres/bretagne/?th=1&f=p&q=manga&location=Lannion%2022300"""

  /** Offer URL -> time at which that offer page was loaded; used to skip already processed offers. */
  val alreadyChecked: mutable.HashMap[String, Date] = new mutable.HashMap[String, Date]()

  /** Entry point: starts the periodic scraping. */
  def main(args: Array[String]): Unit = startScheduler()

  /** Creates the actor system and schedules [[scrap]] to run immediately and then every 10 minutes.
    *
    * @return the handle that can be used to cancel the repeated task
    */
  def startScheduler(): Cancellable =
    ActorSystem("MySystem").scheduler.schedule(0.seconds, 10.minutes)(scrap())

  /** Runs one scraping pass: loads the search page, processes the offers that were not seen
    * before, notifies the ones accepted by [[myFilter]], then drops old bookkeeping entries.
    *
    * Non-fatal failures (network errors, unexpected page content, ...) are logged instead of
    * being rethrown: an exception escaping a task scheduled with Akka's `schedule` aborts the
    * repeated scheduling, which would silently stop the scraper after the first hiccup.
    */
  def scrap(): Unit = {
    println("Scrapping...")
    try {
      notifyNewOffers()
    } catch {
      case NonFatal(e) => println(s"Scraping failed, will retry on the next run: $e")
    }
    purgeOldEntries()
  }

  /** Keeps offers whose description mentions "dragon" (case-insensitive) or that are sold by "MyFavoriteVendor". */
  def myFilter(offer: Offer): Boolean =
    (offer.description.toLowerCase contains "dragon") || (offer.vendor == "MyFavoriteVendor")

  /** Notification hook: prints the offer to standard output. */
  def myNotifier(offer: Offer): Unit = println(offer)

  /** Loads the search page, extracts the offers not processed yet and notifies the accepted ones. */
  private def notifyNewOffers(): Unit = {
    val browser = JsoupBrowser()
    val page = browser.parseString(readSearchPage())

    // href of every offer link in the result list
    val offerHrefs: Seq[String] =
      page >> elementList("a.list_item") flatMap (_ >?> element("a") map (_ attr "href"))
    // the extracted hrefs lack the protocol part
    val withProtocolHrefs: Seq[String] = offerHrefs map ("https:" + _)

    // Loads one offer page and extracts its description and vendor, if both are present.
    // The URL is recorded as processed only once its page was loaded, so that an offer whose
    // page could not be fetched is retried on the next run instead of being lost.
    def loadOffer(offerHref: String): Option[Offer] =
      try {
        val offerSection = browser.get(offerHref) >?> element("section #adview")
        val offer = for {
          propertiesDescription <- offerSection >?> element("div.properties_description")
          description <- propertiesDescription flatMap (_ >?> text("p.value"))
          linePro <- offerSection >?> element("div.line_pro")
          vendor <- linePro flatMap (_ >?> text("p a"))
        } yield Offer(description, vendor, offerHref)
        alreadyChecked += offerHref -> new Date()
        offer
      } catch {
        case NonFatal(e) =>
          println(s"Could not load offer $offerHref, will retry on the next run: $e")
          None
      }

    // `distinct` prevents an offer listed twice on the page from being loaded and notified twice
    val newHrefs = withProtocolHrefs.distinct.filterNot(alreadyChecked.contains)
    newHrefs.flatMap(href => loadOffer(href).toList).filter(myFilter).foreach(myNotifier)
  }

  /** Downloads the search page, decoded as ISO-8859-1, and always closes the underlying stream. */
  private def readSearchPage(): String = {
    val source = scala.io.Source.fromURL(searchUrl, Codec.ISO8859.name)
    try source.mkString finally source.close()
  }

  /** Drops the entries recorded more than one day ago so the map does not grow without bound.
    *
    * NOTE(review): an offer that is still listed once its entry has expired is processed and
    * notified again - confirm whether that is intended.
    */
  private def purgeOldEntries(): Unit = {
    val cutoff = new Date().getTime - 1.day.toMillis
    alreadyChecked retain { case (_, date) => date.getTime > cutoff }
  }
}
// build.sbt
//scalaVersion := "2.11.11"
//libraryDependencies += "net.ruippeixotog" %% "scala-scraper" % "2.0.0"
//libraryDependencies += "com.typesafe.akka" %% "akka-actor" % "2.5.4"
|