Legend 181 Posted September 27, 2012 Report Share Posted September 27, 2012 Anyone have any ideas how to scrape this so that the output is more manageable? Ideally I'd just like the reviews in a list... I don't need names/numbers, etc.

comment("Open the product page and switch to the 5-star review listing.")
navigate("http://www.amazon.co...8717612&sr=1-1", "Wait")
click(<innertext="5 star">, "Left Click", "No")
collectreviewpage()
comment("Keep paging for as long as a Next link is present on the page.")
loop while($exists(<innertext="Next ›">)) {
    click(<innertext="Next ›">, "Left Click", "No")
    collectreviewpage()
    wait(2)
}
define collectreviewpage {
    comment("Wait for the 5-star listing header, then append everything after By to %reviews.")
    wait for element(<innertext="Showing 5-star reviews">, "", "Appear")
    add list to list(%reviews, $scrape attribute(<after="By ">, "innertext"), "Delete", "Global")
}

TIA Quote Link to post Share on other sites
kin 0 Posted September 27, 2012 Report Share Posted September 27, 2012 Great idea. Try this: Side note: If you were planning to sell this, let me know. I don't want to jack your idea. Also: check the navigate URL, as the forum may shorten it. (I need to learn how to make an attachment. Just can't figure that out ... lol) - code removed Quote Link to post Share on other sites
k1lv9h 76 Posted September 27, 2012 Report Share Posted September 27, 2012 Hi, Code:

comment("Scrapes the 5-star review text of one Amazon product into %reviews, page by page.")
comment("Tick the Stop Next Page Loop check box to stop before the next page is loaded.")
ui check box("Stop Next Page Loop", #stopnextpageloop)
navigate("http://www.amazon.com/No-Easy-Day-Firsthand-ebook/dp/B008MG1E4A/ref=sr_1_1?s=digital-text&ie=UTF8&qid=1348717612&sr=1-1", "Wait")
comment("Poll once per second for the 5 star link; reload the page when the counter reaches 9.")
set(#checkedpagedata, "false", "Global")
set(#checkedpagedatacount, 0, "Global")
loop while($comparison(#checkedpagedata, "=", "false")) {
    if($exists(<innertext="5 star">)) {
        then {
            set(#checkedpagedata, "true", "Global")
        }
        else {
            if($comparison(#checkedpagedatacount, ">=", 9)) {
                then {
                    run javascript("window.location.reload()")
                    wait for element(<innertext="5 star">, 30, "Appear")
                    set(#checkedpagedatacount, 0, "Global")
                }
                else {
                }
            }
            wait(1)
            increment(#checkedpagedatacount)
        }
    }
}
wait($rand(3, 5))
click(<innertext="5 star">, "Left Click", "No")
wait($rand(3, 5))
wait for element(<innertext="Showing 5-star reviews">, 30, "Appear")
wait for browser event("Everything Loaded", 30)
clear list(%reviewstemp)
clear list(%reviews)
comment("scrapes all text by review")
add list to list(%reviewstemp, $scrape attribute(<outerhtml=w"<div style=\"margin-left:0.5em;\">*</div>">, "innertext"), "Delete", "Global")
removeextra()
wait($rand(15, $rand(16, 25)))
comment("Follow the Next link until it disappears or the stop check box is ticked.")
loop while($both($exists(<innertext="Next ›">), $comparison(#stopnextpageloop, "=", "false"))) {
    if($exists(<innertext="Next ›">)) {
        then {
            clear list(%nextpageurls)
            add list to list(%nextpageurls, $scrape attribute(<outerhtml=w"<a href=\"http://www.amazon.com/*\">Next ›</a>">, "fullhref"), "Delete", "Global")
            set(#navnextpage, $list item(%nextpageurls, 1), "Global")
        }
        else {
        }
    }
    navigate(#navnextpage, "Wait")
    wait for browser event("Everything Loaded", 30)
    wait for element(<innertext="Showing 5-star reviews">, 30, "Appear")
    comment("Same polling pattern as above, now for the 5-star listing header.")
    set(#checkedpagedata, "false", "Global")
    set(#checkedpagedatacount, 0, "Global")
    loop while($comparison(#checkedpagedata, "=", "false")) {
        if($exists(<innertext="Showing 5-star reviews">)) {
            then {
                set(#checkedpagedata, "true", "Global")
            }
            else {
                if($comparison(#checkedpagedatacount, ">=", 9)) {
                    then {
                        run javascript("window.location.reload()")
                        wait for element(<innertext="Showing 5-star reviews">, 30, "Appear")
                        set(#checkedpagedatacount, 0, "Global")
                    }
                    else {
                    }
                }
                wait(1)
                increment(#checkedpagedatacount)
            }
        }
    }
    comment("Same polling pattern again, for the Customers who viewed block.")
    set(#checkedpagedata, "false", "Global")
    set(#checkedpagedatacount, 0, "Global")
    loop while($comparison(#checkedpagedata, "=", "false")) {
        if($exists(<innertext="Customers who viewed this item also viewed">)) {
            then {
                set(#checkedpagedata, "true", "Global")
            }
            else {
                if($comparison(#checkedpagedatacount, ">=", 9)) {
                    then {
                        run javascript("window.location.reload()")
                        wait for element(<innertext="Customers who viewed this item also viewed">, 30, "Appear")
                        comment("Reset the counter after a reload, as the two loops above do; otherwise every later pass reloads again.")
                        set(#checkedpagedatacount, 0, "Global")
                    }
                    else {
                    }
                }
                wait(1)
                increment(#checkedpagedatacount)
            }
        }
    }
    wait for browser event("Everything Loaded", 30)
    clear list(%reviewstemp)
    comment("scrapes all text by review")
    add list to list(%reviewstemp, $scrape attribute(<outerhtml=w"<div style=\"margin-left:0.5em;\">*</div>">, "innertext"), "Delete", "Global")
    removeextra()
    wait($rand(15, $rand(16, 25)))
}
define removeextra {
    comment("Strips the boilerplate text from every item of %reviewstemp and adds the result to %reviews.")
    comment("Start from the first item on every call. NOTE(review): this reset was disabled with loop 0; confirm whether clear list already resets the list position.")
    set list position(%reviewstemp, 0)
    loop($list total(%reviewstemp)) {
        if($comparison($list position(%reviewstemp), "<", $list total(%reviewstemp))) {
            then {
                set(#reviewtempitem, $next list item(%reviewstemp), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\d\{1,4\} of \\d\{1,4\} people found the following review helpful{$new line}", $nothing), "Global")
                comment("loop 1 runs its body once and loop 0 never runs: exactly one of the two star-rating removals below is active.")
                loop(1) {
                    comment("remove just 5 stars")
                    set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\d\\.\\d\{1,3\} out of \\d stars ", $nothing), "Global")
                }
                loop(0) {
                    comment("remove 5 star line")
                    set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\d\\.\\d\{1,3\} out of \\d stars.*{$new line}", $nothing), "Global")
                }
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, " - See all my reviews", $nothing), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\(TOP 500 REVIEWER\\).\{1,4\}", $nothing), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\(VINE VOICE\\).\{1,4\}", $nothing), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\(REAL NAME\\)...{$new line}", $nothing), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Amazon Verified Purchase\\(What\'s this\\?\\){$new line}", $nothing), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "This review is from:.*{$new line}", $nothing), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Help other customers find the most helpful reviews.{$new line}", $nothing), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Was this review helpful to you\\?.Yes.No{$new line}", $nothing), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Report abuse \\| Permalink{$new line}", $nothing), "Global")
                set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Comment.Comment.\\(\\d\{1,4\}\\)\\s\{1,4\}|Comment.Comment\\s\{1,4\}|Comment.Comments.\\(\\d\{1,4\}\\)\\s\{1,4\}{$new line}", $nothing), "Global")
                add item to list(%reviews, #reviewtempitem, "Delete", "Global")
            }
            else {
            }
        }
    }
}

sample-amazon-5star-0032.ubot Kevin 2 Quote Link to post Share on other sites
Legend 181 Posted September 28, 2012 Author Report Share Posted September 28, 2012 Thanks a lot guys! @kin - you had a very interesting workaround, just had a chance to look at it now and it's nice, thanks! @Kevin - awesome code... much appreciated! Quote Link to post Share on other sites
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.