Diffstat (limited to 'AddonsScraper.py')
-rw-r--r--  AddonsScraper.py  144
1 file changed, 129 insertions, 15 deletions
diff --git a/AddonsScraper.py b/AddonsScraper.py
index 869e1e8..3813252 100644
--- a/AddonsScraper.py
+++ b/AddonsScraper.py
@@ -1,19 +1,58 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2014 Rubén Rodríguez <ruben@gnu.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+#
+# This package parses https://addons.mozilla.org and generates a database
+# listing the addons that are under a valid license. addons.mozilla.org is
+# itself licensed as "CC Attribution Share-Alike v3.0 or any later version."
+# Note that screenshots get licensed under the same license as the program.
+#
+# To run, install mysql-server, python-mysqldb and python-beautifulsoup
+#
+
+
import re
+import sys
import urllib2
import BeautifulSoup
server="https://addons.mozilla.org"
-validlicenses=['http://www.gnu.org/licenses/gpl-3.0.html',
-'http://www.mozilla.org/MPL/MPL-1.1.html',
-'http://www.opensource.org/licenses/bsd-license.php',
+parsepages=2 #Number of pages per category to parse
+dbuser="root"
+dbpass=""
+
+validlicenses = ['http://www.gnu.org/licenses/gpl-3.0.html',
'http://www.gnu.org/licenses/gpl-2.0.html',
+'http://www.gnu.org/licenses/lgpl-3.0.html',
+'http://www.gnu.org/licenses/lgpl-2.1.html',
+'http://www.opensource.org/licenses/bsd-license.php',
'http://www.opensource.org/licenses/mit-license.php',
+'http://www.mozilla.org/MPL/MPL-1.1.html',
'http://www.mozilla.org/MPL/2.0/']
+categories = ['alerts-updates', 'appearance', 'bookmarks', 'download-management', 'feeds-news-blogging', 'games-entertainment', 'language-support', 'photos-music-videos', 'privacy-security', 'search-tools', 'shopping', 'social-communication', 'tabs', 'web-development', 'other']
+
def normalink(string):
    return re.sub('\?.*', '', string)
def parselist(url):
+    print "PARSING LIST: " + url
    l = []
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
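For reference, normalink() above only strips the query string from a link, so tracking variants of the same addon URL collapse to one canonical path. A minimal illustration, not part of the patch (the sample path is made up):

    import re
    def normalink(string):
        return re.sub('\?.*', '', string)
    # A hypothetical listing link carrying a tracking parameter:
    print normalink('/en-US/firefox/addon/example/?src=search')
    # prints: /en-US/firefox/addon/example/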
@@ -25,7 +64,7 @@ def parselist(url):
    return l
-def parsepage(url):
+def parsepage(url, category):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup.BeautifulSoup(response)
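parsepage() fetches each addon page with a bare urllib2.urlopen(), so a slow or missing page aborts the run with an uncaught exception. A possible hardening sketch, using only stock urllib2 from Python 2.6+; this is an assumption about desired behaviour, not something the patch does:

    import urllib2
    import BeautifulSoup

    def fetch(url, timeout=30):
        # Hypothetical helper: return a parsed page, or None on network errors,
        # so one bad URL does not stop the whole scrape.
        try:
            response = urllib2.urlopen(urllib2.Request(url), timeout=timeout)
        except urllib2.URLError:  # HTTPError is a subclass of URLError
            return None
        return BeautifulSoup.BeautifulSoup(response)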
@@ -33,23 +72,98 @@ def parsepage(url):
        licenseli = soup.findAll('li',{'class':'source-license'})[0]
        license = licenseli.findAll('a')[0]['href']
        if license not in validlicenses:
-            print "INVALID LICENSE: " + license
+            if license[0] == "h":
+                print "INVALID LICENSE: " + license
            return 0
    except:
        return 0
-    print soup.findAll(attrs={"property":"og:title"})[0]['content']
-    """print soup.findAll(attrs={"property":"og:description"})[0]['content']
-    print normalink(soup.findAll(attrs={"property":"og:image"})[0]['content'])
-    print license"""
+    name = re.sub('/$','', normalink(url))
+    name = re.sub('.*/','', name)
+    prettyname = soup.findAll(attrs={"property":"og:title"})[0]['content']
+    description = soup.findAll(attrs={"property":"og:description"})[0]['content']
+    rating = soup.findAll(attrs={"itemprop":"ratingValue"})[0]['content']
+    popularity = soup.findAll(attrs={"itemprop":"interactionCount"})[0]['content']
+    popularity = re.sub('UserDownloads:', '', popularity)
+    htmldescription = soup.findAll('div',{'id':'addon-description'})[0]
+    icon = normalink(soup.findAll(attrs={"property":"og:image"})[0]['content'])
+    screenshots = []
+    try:
+        previewdiv = soup.findAll('ul',{'id':'preview'})[0]
+        for a in previewdiv.findAll('a'):
+            screenshots.append(normalink(a['href']))
+    except:
+        pass
+    version = soup.findAll('span',{'class':'version-number'})[0].text
    addondiv = soup.findAll('div',{'id':'addon'})[0]
    addonp = addondiv.findAll('p',{'class':'install-button'})[0]
    button = addonp.findAll('a')[0]
-    print server + normalink(button['href'])
+    downloadlink = server + normalink(button['href'])
+    try:
+        homelink = soup.findAll('a',{'class':'home'})[1]['href']
+        homelink = re.sub('.*//','http://',homelink)
+    except:
+        homelink = ""
+    try:
+        supportlink = soup.findAll('a',{'class':'support'})[0]['href']
+        supportlink = re.sub('.*//','http://',supportlink)
+    except:
+        supportlink = ""
+
+    htmldescription = unicode(htmldescription)
+    description = unicode(description)
+    prettyname = unicode(prettyname)
+    description = re.sub('\'', '\\\'', description)
+    htmldescription = re.sub('\'', '\\\'', htmldescription)
+    prettyname = re.sub('\'', '\\\'', prettyname)
+    screenshots = unicode(screenshots)
+
+    sql = u'INSERT INTO addons.addons (`name`, `prettyname`, `description`, `htmldescription`, `icon`, `screenshots`, `version`, `rating`, `popularity`, `downloadlink`, `homelink`, `supportlink`, `retrievedlink`, `license`, `category`) VALUES ("' +name+ '", \'' +prettyname+ '\', \'' +description+ '\', \'' +htmldescription+ '\', "' +icon+ '", "' +screenshots+ '", "' +version+ '", "' +rating+ '", "' +popularity+ '", "' +downloadlink+ '", "' +homelink+ '", "' +supportlink+ '", "' +url+ '" , "' +license+ '", "' +category+ '");'
+    try:
+        cursor = db.cursor()
+        cursor.execute(sql)
+        cursor.close()
+        db.commit()
+    except:
+        print 'Failed to insert "' + name + '", query: "' + sql
+    else:
+        print "Added " + name + " " + url
-links=[]
-for page in range(1,2):
-    links = links + parselist(server + "/en-US/firefox/search/?q=+&platform=linux&page=" + str(page))
+import MySQLdb
+db = MySQLdb.connect(host="localhost",user=dbuser,passwd=dbpass, charset="utf8", use_unicode=True)
+cursor = db.cursor()
+sql = """DROP DATABASE IF EXISTS addons;
+CREATE DATABASE addons CHARACTER SET utf8 COLLATE utf8_general_ci;
+USE addons;
+CREATE TABLE addons(
+id INT PRIMARY KEY AUTO_INCREMENT,
+name VARCHAR(50),
+prettyname VARCHAR(50),
+description TEXT,
+htmldescription TEXT,
+icon VARCHAR(255),
+screenshots TEXT,
+version VARCHAR(20),
+rating VARCHAR(10),
+popularity INT,
+downloadlink VARCHAR(255),
+homelink VARCHAR(255),
+supportlink VARCHAR(255),
+retrievedlink VARCHAR(255),
+license VARCHAR(255),
+category VARCHAR(20)
+);
+"""
+cursor.execute(sql)
+cursor.close()
+db.commit()
-for link in links:
-    parsepage(server+link)
+for category in categories:
+    links=[]
+    for page in range(1,1+parsepages):
+        links = links + parselist(server + "/en-US/firefox/extensions/" + category + "/?sort=popular&page="+str(page))
+    for link in links:
+        parsepage(server+link, category)
+#tests
+#parsepage("https://addons.mozilla.org/en-US/firefox/addon/twoo/", "test")
+#parsepage("https://addons.mozilla.org/en-US/firefox/addon/what-about/", "test")