Jump to content
UBot Underground

Amazon 5 Star Review Scraper


Recommended Posts

Anyone have any ideas how to scrape this so that the output is more manageable? Ideally I'd just like the reviews in a list... I don't need names/numbers, etc.

 

comment("Scrapes the text after each By marker on every 5-star review page into the reviews list.")
comment("The address below is the full product URL; the forum had shortened it with an ellipsis.")
navigate("http://www.amazon.com/No-Easy-Day-Firsthand-ebook/dp/B008MG1E4A/ref=sr_1_1?s=digital-text&ie=UTF8&qid=1348717612&sr=1-1", "Wait")
click(<innertext="5 star">, "Left Click", "No")
wait for element(<innertext="Showing 5-star reviews">, "", "Appear")
add list to list(%reviews, $scrape attribute(<after="By ">, "innertext"), "Delete", "Global")
comment("Keep clicking the Next link for as long as one exists, scraping each page the same way.")
loop while($exists(<innertext="Next ›">)) {
    click(<innertext="Next ›">, "Left Click", "No")
    wait for element(<innertext="Showing 5-star reviews">, "", "Appear")
    add list to list(%reviews, $scrape attribute(<after="By ">, "innertext"), "Delete", "Global")
    wait(2)
}

 

TIA

:rolleyes:

Link to post
Share on other sites

Great Idea.

 

Try this:

 

Sidenote: If you were planning to sell this, let me know. I don't want to jack your idea.

Also: check the navigate Url as the forum may shorten it.

 

(I need to learn how to make an attachment. Just can't figure that out ... lol)

 

- code removed

Link to post
Share on other sites

Hi,

 

Code:

ui check box("Stop Next Page Loop", #stopnextpageloop)
comment("Open the product page, wait for the 5 star link, open the 5-star reviews and scrape the first page.")
navigate("http://www.amazon.com/No-Easy-Day-Firsthand-ebook/dp/B008MG1E4A/ref=sr_1_1?s=digital-text&ie=UTF8&qid=1348717612&sr=1-1", "Wait")
comment("Poll once per second until the 5 star link exists. When the counter has reached 9, reload the page and start counting again.")
set(#checkedpagedata, "false", "Global")
set(#checkedpagedatacount, 0, "Global")
loop while($comparison(#checkedpagedata, "=", "false")) {
    if($exists(<innertext="5 star">)) {
        then {
            set(#checkedpagedata, "true", "Global")
        }
        else {
            if($comparison(#checkedpagedatacount, ">=", 9)) {
                then {
                    run javascript("window.location.reload()")
                    wait for element(<innertext="5 star">, 30, "Appear")
                    set(#checkedpagedatacount, 0, "Global")
                }
                else {
                }
            }
            wait(1)
            increment(#checkedpagedatacount)
        }
    }
}
comment("Random pauses surround the click that switches the page to the 5-star reviews.")
wait($rand(3, 5))
click(<innertext="5 star">, "Left Click", "No")
wait($rand(3, 5))
wait for element(<innertext="Showing 5-star reviews">, 30, "Appear")
wait for browser event("Everything Loaded", 30)
comment("Start from empty lists: reviewstemp holds the raw scrape of one page, reviews collects the cleaned text.")
clear list(%reviewstemp)
clear list(%reviews)
comment("scrapes all text by review")
add list to list(%reviewstemp, $scrape attribute(<outerhtml=w"<div style=\"margin-left:0.5em;\">*</div>">, "innertext"), "Delete", "Global")
removeextra()
comment("Random pause before moving on to the next result page.")
wait($rand(15, $rand(16, 25)))
comment("Walk the remaining result pages while a Next link exists and the stop checkbox variable is false.")
loop while($both($exists(<innertext="Next ›">), $comparison(#stopnextpageloop, "=", "false"))) {
    comment("Read the address of the Next link and navigate to it directly.")
    if($exists(<innertext="Next ›">)) {
        then {
            clear list(%nextpageurls)
            add list to list(%nextpageurls, $scrape attribute(<outerhtml=w"<a href=\"http://www.amazon.com/*\">Next ›</a>">, "fullhref"), "Delete", "Global")
            set(#navnextpage, $list item(%nextpageurls, 1), "Global")
        }
        else {
        }
    }
    navigate(#navnextpage, "Wait")
    wait for browser event("Everything Loaded", 30)
    wait for element(<innertext="Showing 5-star reviews">, 30, "Appear")
    comment("Poll once per second until the 5-star heading exists. When the counter has reached 9, reload and start counting again.")
    set(#checkedpagedata, "false", "Global")
    set(#checkedpagedatacount, 0, "Global")
    loop while($comparison(#checkedpagedata, "=", "false")) {
        if($exists(<innertext="Showing 5-star reviews">)) {
            then {
                set(#checkedpagedata, "true", "Global")
            }
            else {
                if($comparison(#checkedpagedatacount, ">=", 9)) {
                    then {
                        run javascript("window.location.reload()")
                        wait for element(<innertext="Showing 5-star reviews">, 30, "Appear")
                        set(#checkedpagedatacount, 0, "Global")
                    }
                    else {
                    }
                }
                wait(1)
                increment(#checkedpagedatacount)
            }
        }
    }
    comment("Same polling pattern for the also-viewed section further down the page.")
    set(#checkedpagedata, "false", "Global")
    set(#checkedpagedatacount, 0, "Global")
    loop while($comparison(#checkedpagedata, "=", "false")) {
        if($exists(<innertext="Customers who viewed this item also viewed">)) {
            then {
                set(#checkedpagedata, "true", "Global")
            }
            else {
                if($comparison(#checkedpagedatacount, ">=", 9)) {
                    then {
                        run javascript("window.location.reload()")
                        wait for element(<innertext="Customers who viewed this item also viewed">, 30, "Appear")
                        comment("Reset the counter after a reload, as the other two polling loops do. Without this, every later pass reloads the page again.")
                        set(#checkedpagedatacount, 0, "Global")
                    }
                    else {
                    }
                }
                wait(1)
                increment(#checkedpagedatacount)
            }
        }
    }
    wait for browser event("Everything Loaded", 30)
    clear list(%reviewstemp)
    comment("scrapes all text by review")
    add list to list(%reviewstemp, $scrape attribute(<outerhtml=w"<div style=\"margin-left:0.5em;\">*</div>">, "innertext"), "Delete", "Global")
    removeextra()
    comment("Random pause before the next page.")
    wait($rand(15, $rand(16, 25)))
}
define removeextra {
    comment("Cleans each entry of reviewstemp with a series of regular expression replacements and appends the result to reviews.")
    comment("The loop count below is 0, so its body never runs and the list position is not reset here. NOTE(review): confirm this is intended.")
    loop(0) {
        set list position(%reviewstemp, 0)
    }
    comment("Take items one at a time until the list position reaches the list total.")
    loop while($comparison($list position(%reviewstemp), "<", $list total(%reviewstemp))) {
        set(#reviewtempitem, $next list item(%reviewstemp), "Global")
        comment("drop the helpful-votes line")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\d\{1,4\} of \\d\{1,4\} people found the following review helpful{$new line}", $nothing), "Global")
        comment("remove just 5 stars")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\d\\.\\d\{1,3\} out of \\d stars ", $nothing), "Global")
        comment("The alternative below has a loop count of 0, so it never runs.")
        loop(0) {
            comment("remove 5 star line")
            set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\d\\.\\d\{1,3\} out of \\d stars.*{$new line}", $nothing), "Global")
        }
        comment("drop reviewer badges and profile link text")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, " - See all my reviews", $nothing), "Global")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\(TOP 500 REVIEWER\\).\{1,4\}", $nothing), "Global")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\(VINE VOICE\\).\{1,4\}", $nothing), "Global")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "\\(REAL NAME\\)...{$new line}", $nothing), "Global")
        comment("drop purchase and edition lines")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Amazon Verified Purchase\\(What\'s this\\?\\){$new line}", $nothing), "Global")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "This review is from:.*{$new line}", $nothing), "Global")
        comment("drop the voting, abuse-report and comment-link footer text")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Help other customers find the most helpful reviews.{$new line}", $nothing), "Global")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Was this review helpful to you\\?.Yes.No{$new line}", $nothing), "Global")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Report abuse \\| Permalink{$new line}", $nothing), "Global")
        set(#reviewtempitem, $replace regular expression(#reviewtempitem, "Comment.Comment.\\(\\d\{1,4\}\\)\\s\{1,4\}|Comment.Comment\\s\{1,4\}|Comment.Comments.\\(\\d\{1,4\}\\)\\s\{1,4\}{$new line}", $nothing), "Global")
        add item to list(%reviews, #reviewtempitem, "Delete", "Global")
    }
}

 

sample-amazon-5star-0032.ubot

 

Kevin

  • Like 2
Link to post
Share on other sites

Thanks a lot guys!

 

@kin - you had a very interesting workaround. I just had a chance to look at it now and it's nice, thanks!

 

@Kevin - awesome code... much appreciated!

Link to post
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

Loading...
×
×
  • Create New...