Diffstat (limited to 'tools/AddonsScraper.py')
-rw-r--r--  tools/AddonsScraper.py | 169
1 file changed, 169 insertions(+), 0 deletions(-)
diff --git a/tools/AddonsScraper.py b/tools/AddonsScraper.py
new file mode 100644
index 0000000..3813252
--- /dev/null
+++ b/tools/AddonsScraper.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2014 Rubén Rodríguez <ruben@gnu.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+#
+# This script parses https://addons.mozilla.org and generates a database
+# listing the add-ons that are published under a valid (free) license.
+# addons.mozilla.org is itself licensed as "CC Attribution Share-Alike v3.0
+# or any later version." Note that screenshots are licensed under the same
+# license as the add-on itself.
+#
+# To run, install mysql-server, python-mysqldb and python-beautifulsoup
+# (the script targets Python 2, BeautifulSoup 3 and MySQLdb).
+#
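+# Usage (assumes a local MySQL server reachable as root with an empty
+# password, matching the defaults configured below):
+#
+#     python AddonsScraper.py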
+
+
+import re
+import sys
+import urllib2
+import BeautifulSoup
+
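+# Basic configuration: AMO base URL, number of listing pages to crawl per
+# category, and local MySQL credentials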
+server = "https://addons.mozilla.org"
+parsepages = 2  # number of pages per category to parse
+dbuser = "root"
+dbpass = ""
+
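+# License URLs (as linked from AMO add-on pages) that are accepted as valid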
+validlicenses = ['http://www.gnu.org/licenses/gpl-3.0.html',
+                 'http://www.gnu.org/licenses/gpl-2.0.html',
+                 'http://www.gnu.org/licenses/lgpl-3.0.html',
+                 'http://www.gnu.org/licenses/lgpl-2.1.html',
+                 'http://www.opensource.org/licenses/bsd-license.php',
+                 'http://www.opensource.org/licenses/mit-license.php',
+                 'http://www.mozilla.org/MPL/MPL-1.1.html',
+                 'http://www.mozilla.org/MPL/2.0/']
+
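+# AMO extension categories to crawl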
+categories = ['alerts-updates', 'appearance', 'bookmarks', 'download-management',
+              'feeds-news-blogging', 'games-entertainment', 'language-support',
+              'photos-music-videos', 'privacy-security', 'search-tools', 'shopping',
+              'social-communication', 'tabs', 'web-development', 'other']
+
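+# Strip the query string (everything from '?' onwards) from a URL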
+def normalink(string):
+    return re.sub(r'\?.*', '', string)
+
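+# Return the add-on detail-page links listed on one category page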
+def parselist(url):
+ print "PARSING LIST: " + url
+ l = []
+ request = urllib2.Request(url)
+ response = urllib2.urlopen(request)
+ soup = BeautifulSoup.BeautifulSoup(response)
+ for infodiv in soup.findAll('div',{'class':'info'}):
+ for h3 in infodiv.findAll('h3'):
+ for link in h3.findAll('a'):
+                l.append(normalink(link['href']))
+ return l
+
+
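+# Scrape a single add-on page and, if its license is in validlicenses,
+# insert a row for it into the addons.addons table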
+def parsepage(url, category):
+ request = urllib2.Request(url)
+ response = urllib2.urlopen(request)
+ soup = BeautifulSoup.BeautifulSoup(response)
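+    # Skip add-ons that carry no recognised license (or no license info at all)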
+ try:
+ licenseli = soup.findAll('li',{'class':'source-license'})[0]
+ license = licenseli.findAll('a')[0]['href']
+ if license not in validlicenses:
+ if license[0] == "h":
+ print "INVALID LICENSE: " + license
+ return 0
+ except:
+ return 0
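+    # Derive the add-on slug from the URL and collect the page metadata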
+ name = re.sub('/$','', normalink(url))
+ name = re.sub('.*/','', name)
+ prettyname = soup.findAll(attrs={"property":"og:title"})[0]['content']
+ description = soup.findAll(attrs={"property":"og:description"})[0]['content']
+ rating = soup.findAll(attrs={"itemprop":"ratingValue"})[0]['content']
+ popularity = soup.findAll(attrs={"itemprop":"interactionCount"})[0]['content']
+ popularity = re.sub('UserDownloads:', '', popularity)
+ htmldescription = soup.findAll('div',{'id':'addon-description'})[0]
+ icon = normalink(soup.findAll(attrs={"property":"og:image"})[0]['content'])
+ screenshots = []
+ try:
+ previewdiv = soup.findAll('ul',{'id':'preview'})[0]
+ for a in previewdiv.findAll('a'):
+ screenshots.append(normalink(a['href']))
+ except:
+ pass
+ version = soup.findAll('span',{'class':'version-number'})[0].text
+ addondiv = soup.findAll('div',{'id':'addon'})[0]
+ addonp = addondiv.findAll('p',{'class':'install-button'})[0]
+ button = addonp.findAll('a')[0]
+ downloadlink = server + normalink(button['href'])
+ try:
+ homelink = soup.findAll('a',{'class':'home'})[1]['href']
+ homelink = re.sub('.*//','http://',homelink)
+ except:
+ homelink = ""
+ try:
+ supportlink = soup.findAll('a',{'class':'support'})[0]['href']
+ supportlink = re.sub('.*//','http://',supportlink)
+ except:
+ supportlink = ""
+
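+    # Crude quoting for the hand-built INSERT below: only single quotes are
+    # escaped, so the query is not safe against arbitrary input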
+ htmldescription = unicode(htmldescription)
+ description = unicode(description)
+ prettyname = unicode(prettyname)
+    description = description.replace("'", "\\'")
+    htmldescription = htmldescription.replace("'", "\\'")
+    prettyname = prettyname.replace("'", "\\'")
+ screenshots = unicode(screenshots)
+
+ sql = u'INSERT INTO addons.addons (`name`, `prettyname`, `description`, `htmldescription`, `icon`, `screenshots`, `version`, `rating`, `popularity`, `downloadlink`, `homelink`, `supportlink`, `retrievedlink`, `license`, `category`) VALUES ("' +name+ '", \'' +prettyname+ '\', \'' +description+ '\', \'' +htmldescription+ '\', "' +icon+ '", "' +screenshots+ '", "' +version+ '", "' +rating+ '", "' +popularity+ '", "' +downloadlink+ '", "' +homelink+ '", "' +supportlink+ '", "' +url+ '" , "' +license+ '", "' +category+ '");'
+ try:
+ cursor = db.cursor()
+ cursor.execute(sql)
+ cursor.close()
+ db.commit()
+ except:
+        print 'Failed to insert "' + name + '", query: "' + sql + '"'
+ else:
+ print "Added " + name + " " + url
+
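+# (Re)create the "addons" database and its single table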
+import MySQLdb
+db = MySQLdb.connect(host="localhost",user=dbuser,passwd=dbpass, charset="utf8", use_unicode=True)
+cursor = db.cursor()
+sql = """DROP DATABASE IF EXISTS addons;
+CREATE DATABASE addons CHARACTER SET utf8 COLLATE utf8_general_ci;
+USE addons;
+CREATE TABLE addons(
+id INT PRIMARY KEY AUTO_INCREMENT,
+name VARCHAR(50),
+prettyname VARCHAR(50),
+description TEXT,
+htmldescription TEXT,
+icon VARCHAR(255),
+screenshots TEXT,
+version VARCHAR(20),
+rating VARCHAR(10),
+popularity INT,
+downloadlink VARCHAR(255),
+homelink VARCHAR(255),
+supportlink VARCHAR(255),
+retrievedlink VARCHAR(255),
+license VARCHAR(255),
+category VARCHAR(20)
+);
+"""
+cursor.execute(sql)
+cursor.close()
+db.commit()
+
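+# Crawl the most popular add-ons of every category, parsepages pages each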
+for category in categories:
+    links = []
+    for page in range(1, parsepages + 1):
+        links += parselist(server + "/en-US/firefox/extensions/" + category + "/?sort=popular&page=" + str(page))
+    for link in links:
+        parsepage(server + link, category)
+
+#tests
+#parsepage("https://addons.mozilla.org/en-US/firefox/addon/twoo/", "test")
+#parsepage("https://addons.mozilla.org/en-US/firefox/addon/what-about/", "test")