commit 43716e59e5eca3abfcd3ce49868bcf3a403bf572
parent 7e8d50732a1133e3320c4875b7489e4986a277f2
Author: Ruben Rodriguez <ruben@gnu.org>
Date: Thu, 9 Oct 2014 15:40:40 +0200
Addded AddonsScraper.py
Diffstat:
1 file changed, 55 insertions(+), 0 deletions(-)
diff --git a/AddonsScraper.py b/AddonsScraper.py
@@ -0,0 +1,55 @@
+import re
+import urllib2
+import BeautifulSoup
+
+server="https://addons.mozilla.org"
+validlicenses=['http://www.gnu.org/licenses/gpl-3.0.html',
+'http://www.mozilla.org/MPL/MPL-1.1.html',
+'http://www.opensource.org/licenses/bsd-license.php',
+'http://www.gnu.org/licenses/gpl-2.0.html',
+'http://www.opensource.org/licenses/mit-license.php',
+'http://www.mozilla.org/MPL/2.0/']
+
+def normalink(string):
+ return re.sub('\?.*', '', string)
+
+def parselist(url):
+ l = []
+ request = urllib2.Request(url)
+ response = urllib2.urlopen(request)
+ soup = BeautifulSoup.BeautifulSoup(response)
+ for infodiv in soup.findAll('div',{'class':'info'}):
+ for h3 in infodiv.findAll('h3'):
+ for link in h3.findAll('a'):
+ l.append(re.sub('\?.*', '', link['href']))
+ return l
+
+
+def parsepage(url):
+ request = urllib2.Request(url)
+ response = urllib2.urlopen(request)
+ soup = BeautifulSoup.BeautifulSoup(response)
+ try:
+ licenseli = soup.findAll('li',{'class':'source-license'})[0]
+ license = licenseli.findAll('a')[0]['href']
+ if license not in validlicenses:
+ print "INVALID LICENSE: " + license
+ return 0
+ except:
+ return 0
+ print soup.findAll(attrs={"property":"og:title"})[0]['content']
+ """print soup.findAll(attrs={"property":"og:description"})[0]['content']
+ print normalink(soup.findAll(attrs={"property":"og:image"})[0]['content'])
+ print license"""
+ addondiv = soup.findAll('div',{'id':'addon'})[0]
+ addonp = addondiv.findAll('p',{'class':'install-button'})[0]
+ button = addonp.findAll('a')[0]
+ print server + normalink(button['href'])
+
+links=[]
+for page in range(1,2):
+ links = links + parselist(server + "/en-US/firefox/search/?q=+&platform=linux&page=" + str(page))
+
+for link in links:
+ parsepage(server+link)
+