deliter 203 Posted February 24, 2016 Report Share Posted February 24, 2016 example #1 Email scraper: feed in a random site and the bot will go there and try to scrape an email address by looking for a mailto: href. If none is found, it looks for any link whose text contains "contact", goes to those URLs and tries to find a mailto: href there. I don't know much about relative and absolute href paths, so I downloaded a JavaScript library to do it for me, but I don't know how well it works. I will make proper tutorial examples soon; this example just shows how easy it is to scrape hrefs.
define $emailScrape(#url) {
    comment("Returns the list of email addresses found for the given url. The page itself is checked first; if it has no mailto link, links whose text contains the word contact are followed until one of them yields an address.")
    comment("Probe for the URI helper library; the probe variable stays undefined when the library has not been loaded yet.")
    run javascript("var myUri = new URI(\"http://example.org/\");")
    if($comparison($eval("myUri === undefined"),"=","True")) {
        then {
            comment("NOTE(review): this downloads and executes remote JavaScript from pastebin. Consider shipping a trusted local copy of the library instead.")
            plugin command("SocketCommands.dll", "socket container") {
                plugin command("SocketCommands.dll", "socket navigate", "GET", "http://pastebin.com/raw/4PGiXQZc")
                run javascript($plugin function("SocketCommands.dll", "$socket page html"))
            }
        }
    }
    plugin command("SocketCommands.dll", "socket container") {
        plugin command("SocketCommands.dll", "socket navigate", "GET", #url)
        set(#page,$plugin function("SocketCommands.dll", "$socket page html"),"Local")
    }
    comment("Strip the mailto scheme and any trailing query such as ?subject=... so only the address remains.")
    set(#emailadd,$replace regular expression($plugin function("DeliterCSS.dll", "Deliter CSS Selector", #page, "a[href^=mailto]", "href"),"mailto:|\\?.*",""),"Local")
    if($comparison(#emailadd,">",$nothing)) {
        then {
            add item to list(%emails,#emailadd,"Don\'t Delete","Local")
        }
        else {
            comment("Two index-aligned lists: the text and the href of every anchor. Duplicates are kept so the positions stay in step.")
            add list to list(%a innertext,$plugin function("DeliterCSS.dll", "Deliter CSS Selector", #page, "a", "TextContent"),"Don\'t Delete","Local")
            add list to list(%a href,$plugin function("DeliterCSS.dll", "Deliter CSS Selector", #page, "a", "href"),"Don\'t Delete","Local")
            set(#position,0,"Local")
            loop($list total(%a href)) {
                set(#listItem,$list item(%a innertext,#position),"Local")
                if($comparison($find regular expression(#listItem,"(?i)contact"),">",$nothing)) {
                    then {
                        if($comparison($eval("var uri = new URI(\"{$list item(%a href,#position)}\");uri.is(\"relative\") === true; "),"=","True")) {
                            then {
                                comment("Resolve the relative href against the full page url. Simple string joining breaks when the page url has a path of its own.")
                                set(#fixURL,$eval("new URI(\"{$list item(%a href,#position)}\").absoluteTo(\"{#url}\").toString();"),"Local")
                                add item to list(%contactHref,#fixURL,"Delete","Local")
                            }
                            else {
                                add item to list(%contactHref,$list item(%a href,#position),"Delete","Local")
                            }
                        }
                    }
                }
                increment(#position)
            }
            comment("Visit the collected contact pages one by one and stop as soon as an address has been found.")
            set(#position,0,"Local")
            loop while($comparison(#position,"<",$list total(%contactHref)) AND $comparison($list total(%emails),"=",0)) {
                plugin command("SocketCommands.dll", "socket container") {
                    plugin command("SocketCommands.dll", "socket navigate", "GET", $list item(%contactHref,#position))
                    set(#page,$plugin function("SocketCommands.dll", "$socket page html"),"Local")
                    set(#emailadd,$replace regular expression($plugin function("DeliterCSS.dll", "Deliter CSS Selector", #page, "a[href^=mailto]", "href"),"mailto:|\\?.*",""),"Local")
                    if($comparison(#emailadd,">",$nothing)) {
                        then {
                            add item to list(%emails,#emailadd,"Don\'t Delete","Local")
                        }
                    }
                }
                increment(#position)
            }
        }
    }
    return(%emails)
}
1 Quote Link to post Share on other sites
deliter 203 Posted February 27, 2016 Author Report Share Posted February 27, 2016 example #2 Here is a simple Google scraper that returns the result URLs. The links are "a" tags inside of class r, <div class ="r"><a href="example.com">example.com</a></div>, so the expression is simply .r a ("." means class, "#" means id, and a bare "tagname" means a tag name).
define $googleScraper(#search) {
    comment("Runs a Google search for the given text and returns the list of result link targets.")
    comment("Whitespace in the search text is turned into + for the query string.")
    set(#query,$replace regular expression(#search,"\\s","+"),"Local")
    plugin command("SocketCommands.dll", "socket container") {
        plugin command("SocketCommands.dll", "socket navigate", "GET", "https://www.google.com/search?q={#query}")
        set(#resultHtml,$plugin function("SocketCommands.dll", "$socket page html"),"Local")
        comment("Remove the /url?q= redirect prefix and everything from &sa= onward from each href.")
        set(#cleanLinks,$replace regular expression($plugin function("WpfApplication1.dll", "Deliter CSS Selector", #resultHtml, ".r a", "href"),"/url\\?q=|&sa=.+",""),"Local")
        add list to list(%hrefs,$list from text(#cleanLinks,$new line),"Delete","Local")
        return(%hrefs)
    }
}
set(#searchGoogle,$googleScraper("ubot studio"),"Global")
Use this with example 1 to create a nice business scraper: input a search query and have email addresses returned to you.
comment("Step 1: collect the search result links.")
set(#position,0,"Global")
add list to list(%mylinks,$list from text($googleScraper("salon London UK"),$new line),"Delete","Global")
comment("Step 2: keep only the entries that start with http.")
loop($list total(%mylinks)) {
    set(#myurl,$list item(%mylinks,#position),"Global")
    if($comparison($find regular expression(#myurl,"^http"),">",$nothing)) {
        then {
            add item to list(%newitems,#myurl,"Don\'t Delete","Global")
        }
    }
    increment(#position)
}
comment("Step 3: run the email scraper from example 1 on every remaining link.")
set(#position,0,"Global")
clear list(%mylinks)
loop($list total(%newitems)) {
    add list to list(%results,$list from text($emailScrape($list item(%newitems,#position)),$new line),"Delete","Global")
    increment(#position)
}
Quote Link to post Share on other sites
deliter 203 Posted February 27, 2016 Author Report Share Posted February 27, 2016 example #3 Simple examples of selecting elements
comment("Load a small sample document into the browser and keep its source for the selector calls below.")
load html("<html> <head> </head> <body> <ul> <li >I am a sibling of all other li tags</li> <li > <p>I am a child of the second li</p> </li> <li>To auto generate CSS Paths,click View --> Web Inspector --> Select Magnify Glass in top left Corner</li> <li> -->Select element on page,go back to web inspector,your element should be highlighted in blue</li> <li> --> right click and select copy css path,web dev available in chrome 39 only,or use chrome/firefox dev tools</li> </ul> </body> </html>")
set(#page,$document text,"Global")
comment("Plain selector: text content of the elements matching li.")
add list to list(%each li,$plugin function("WpfApplication1.dll", "Deliter CSS Selector", #page, "li", "TextContent"),"Delete","Global")
comment("Sibling selector: outer html for the siblings of the first li.")
add list to list(%Siblings,$plugin function("WpfApplication1.dll", "Deliter CSS Sibling Selector", #page, "body > ul > li:nth-child(1)", "OuterHtml"),"Delete","Global")
comment("Child elements selector: text content of the children of the second li.")
set(#child of li,$plugin function("WpfApplication1.dll", "Deliter CSS Child Elements Selector", #page, "body > ul > li:nth-child(2)", "TextContent"),"Global")
comment("Child elements selector: outer html of the children of the ul.")
add list to list(%UL child html,$plugin function("WpfApplication1.dll", "Deliter CSS Child Elements Selector", #page, "ul", "OuterHtml"),"Delete","Global")
Quote Link to post Share on other sites
deliter 203 Posted February 27, 2016 Author Report Share Posted February 27, 2016 example #4 Scrape HTML tables to Ubot Tables with http get. There are ways of doing this faster, by using a css path of "tr td" and checking first whether the "tr th" path is greater than nothing, but for this tutorial I will keep it simple. First I create a list of the outerhtml of each tr and run through them in a list, making a new list of each tr's children (table items along the row), then add it to a ubot table.
define CSS Table scraper(#url, #cssPathOfTable) {
    comment("Downloads the page at the given url, selects the table at the given css path and copies it row by row into the results table.")
    plugin command("SocketCommands.dll", "socket container") {
        plugin command("SocketCommands.dll", "socket navigate", "GET", #url)
        set(#page,$plugin function("SocketCommands.dll", "$socket page html"),"Local")
        set(#html,$plugin function("WpfApplication1.dll", "Deliter CSS Selector", #page, #cssPathOfTable, "OuterHtml"),"Local")
        comment("Duplicates must be kept: two identical rows in the page are still two rows in the table.")
        add list to list(%Each Row,$plugin function("WpfApplication1.dll", "Deliter CSS Selector", #html, "tr", "OuterHtml"),"Don\'t Delete","Local")
        set(#position,0,"Local")
        loop($list total(%Each Row)) {
            comment("Wrap the row in a table element before selecting the children of its tr. Duplicates are kept so repeated cell values do not shift the columns.")
            add list to list(%rows,$plugin function("WpfApplication1.dll", "Deliter CSS Child Elements Selector", "<table>{$list item(%Each Row,#position)}</table>", "tr", "TextContent"),"Don\'t Delete","Local")
            add list to table as row(&results,#position,0,%rows)
            clear list(%rows)
            increment(#position)
        }
    }
}
CSS Table scraper("http://www.w3schools.com/html/html_tables.asp", "#main > table:nth-child(6)")
Quote Link to post Share on other sites
deliter 203 Posted March 28, 2016 Author Report Share Posted March 28, 2016 Example #5 Matching elements by attribute and using regular-expression-style attribute selectors to match; see this for more info: http://thegeekyway.com/css-regex-selector-using-regular-expression-css/ This script will match the email address and match the 2 example posts. When scraping a forum like this one, you often find IDs by post.
load html("<a href=\"mailto:example@example.com\">Click to Email Me</a> <div id=\"post_01\"><p>first post</p></div> <div id=\"post_02\"><p>second post</p></div> ")
comment("Attribute prefix match: select the anchor whose href starts with mailto: and read the href, not the link text, so the alert shows the address itself.")
alert("email:{$replace regular expression($plugin function("WpfApplication1.dll", "Deliter CSS Selector", $document text, "a[href^=\"mailto:\"]", "href"),"mailto:","")}")
comment("Attribute substring match: select every div whose id contains post_ and collect its text.")
add list to list(%posts InnerText,$plugin function("WpfApplication1.dll", "Deliter CSS Selector", $document text, "div[id*=\"post_\"]", "TextContent"),"Delete","Global")
Quote Link to post Share on other sites
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.