summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- AddonsScraper.py 55
1 files changed, 55 insertions, 0 deletions
diff --git a/AddonsScraper.py b/AddonsScraper.py
new file mode 100644
index 0000000..869e1e8
--- /dev/null
+++ b/AddonsScraper.py
@@ -0,0 +1,55 @@
+import re
+import urllib2
+import BeautifulSoup
+
+# Base URL of the add-ons site; scraped relative links are appended to it.
+server = "https://addons.mozilla.org"
+
+# License URLs that parsepage() accepts; any other license link is rejected.
+validlicenses = [
+    'http://www.gnu.org/licenses/gpl-3.0.html',
+    'http://www.mozilla.org/MPL/MPL-1.1.html',
+    'http://www.opensource.org/licenses/bsd-license.php',
+    'http://www.gnu.org/licenses/gpl-2.0.html',
+    'http://www.opensource.org/licenses/mit-license.php',
+    'http://www.mozilla.org/MPL/2.0/',
+]
+
+def normalink(string):
+    """Return string with each '?' and the rest of its line removed.
+
+    Used to drop the query part of a scraped link.
+    """
+    query_part = re.compile('\?.*')
+    return query_part.sub('', string)
+
+def parselist(url):
+    """Fetch a listing page and return the add-on links found on it.
+
+    Collects the href of every <a> inside an <h3> inside a
+    <div class="info">, with the query string stripped by normalink().
+
+    url -- address of the listing page to download.
+    Returns a list of hrefs in the order they were encountered; the list is
+    empty if nothing matches.
+    Errors raised by urllib2.urlopen are not caught here.
+    """
+    response = urllib2.urlopen(urllib2.Request(url))
+    try:
+        soup = BeautifulSoup.BeautifulSoup(response)
+    finally:
+        # Release the connection even if parsing fails.
+        response.close()
+
+    found = []
+    for infodiv in soup.findAll('div', {'class': 'info'}):
+        for h3 in infodiv.findAll('h3'):
+            # href=True skips anchors without an href instead of letting
+            # link['href'] raise KeyError and abort the whole scrape.
+            for link in h3.findAll('a', href=True):
+                found.append(normalink(link['href']))
+    return found
+
+
+def parsepage(url):
+    """Fetch one add-on page and print its title and download link.
+
+    The page is reported only if its source-license entry links to one of
+    the URLs in validlicenses.
+
+    url -- address of the add-on page to download.
+    Returns 0 when the page is skipped (no license link, a license that is
+    not in validlicenses, or a missing title or install button) and None
+    after the title and download link have been printed.
+    Errors raised by urllib2.urlopen are not caught here.
+    """
+    response = urllib2.urlopen(urllib2.Request(url))
+    try:
+        soup = BeautifulSoup.BeautifulSoup(response)
+    finally:
+        # Release the connection even if parsing fails.
+        response.close()
+
+    # Only a failed lookup means "no usable license"; a bare except here
+    # would also swallow KeyboardInterrupt and genuine bugs.
+    try:
+        licenseli = soup.findAll('li', {'class': 'source-license'})[0]
+        license_url = licenseli.findAll('a')[0]['href']
+    except (IndexError, KeyError):
+        return 0
+    if license_url not in validlicenses:
+        print "INVALID LICENSE: " + license_url
+        return 0
+
+    # Resolve every piece before printing, so a page that lacks the title or
+    # the install button is skipped as a whole instead of crashing the run
+    # after partial output.
+    try:
+        title = soup.findAll(attrs={"property": "og:title"})[0]['content']
+        addondiv = soup.findAll('div', {'id': 'addon'})[0]
+        addonp = addondiv.findAll('p', {'class': 'install-button'})[0]
+        button = addonp.findAll('a')[0]
+        download = server + normalink(button['href'])
+    except (IndexError, KeyError):
+        return 0
+
+    # NOTE: og:description, og:image (via normalink) and the license URL can
+    # be read the same way as og:title but are deliberately not printed.
+    print title
+    print download
+
+# Gather the add-on links from the search result pages, then report each one.
+links = []
+for page in range(1, 2):  # range(1, 2) covers page 1 only
+    links.extend(parselist(
+        server + "/en-US/firefox/search/?q=+&platform=linux&page=" + str(page)))
+
+for link in links:
+    parsepage(server + link)
+