summaryrefslogtreecommitdiff
path: root/AddonsScraper.py
diff options
context:
space:
mode:
Diffstat (limited to 'AddonsScraper.py')
-rw-r--r--AddonsScraper.py169
1 files changed, 0 insertions, 169 deletions
diff --git a/AddonsScraper.py b/AddonsScraper.py
deleted file mode 100644
index 3813252..0000000
--- a/AddonsScraper.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2014 Rubén Rodríguez <ruben@gnu.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#
-#
-# This package parses https://addons.mozilla.org and generates a database
-# listing the addons that are under a valid license. addons.mozilla.org is
-# itself licensed as "CC Attribution Share-Alike v3.0 or any later version."
-# Note that screenshots get licensed under the same license as the program.
-#
-# To run, install mysql-server, python-mysqldb and python-beautifulsoup
-#
-
-
-import re
-import sys
-import urllib2
-import BeautifulSoup
-
-server="https://addons.mozilla.org"
-parsepages=2 #Number of pages per category to parse
-dbuser="root"
-dbpass=""
-
-validlicenses = ['http://www.gnu.org/licenses/gpl-3.0.html',
-'http://www.gnu.org/licenses/gpl-2.0.html',
-'http://www.gnu.org/licenses/lgpl-3.0.html',
-'http://www.gnu.org/licenses/lgpl-2.1.html',
-'http://www.opensource.org/licenses/bsd-license.php',
-'http://www.opensource.org/licenses/mit-license.php',
-'http://www.mozilla.org/MPL/MPL-1.1.html',
-'http://www.mozilla.org/MPL/2.0/']
-
-categories = ['alerts-updates', 'appearance', 'bookmarks', 'download-management', 'feeds-news-blogging', 'games-entertainment', 'language-support', 'photos-music-videos', 'privacy-security', 'search-tools', 'shopping', 'social-communication', 'tabs', 'web-development', 'other']
-
-def normalink(string):
- return re.sub('\?.*', '', string)
-
-def parselist(url):
- print "PARSING LIST: " + url
- l = []
- request = urllib2.Request(url)
- response = urllib2.urlopen(request)
- soup = BeautifulSoup.BeautifulSoup(response)
- for infodiv in soup.findAll('div',{'class':'info'}):
- for h3 in infodiv.findAll('h3'):
- for link in h3.findAll('a'):
- l.append(re.sub('\?.*', '', link['href']))
- return l
-
-
-def parsepage(url, category):
- request = urllib2.Request(url)
- response = urllib2.urlopen(request)
- soup = BeautifulSoup.BeautifulSoup(response)
- try:
- licenseli = soup.findAll('li',{'class':'source-license'})[0]
- license = licenseli.findAll('a')[0]['href']
- if license not in validlicenses:
- if license[0] == "h":
- print "INVALID LICENSE: " + license
- return 0
- except:
- return 0
- name = re.sub('/$','', normalink(url))
- name = re.sub('.*/','', name)
- prettyname = soup.findAll(attrs={"property":"og:title"})[0]['content']
- description = soup.findAll(attrs={"property":"og:description"})[0]['content']
- rating = soup.findAll(attrs={"itemprop":"ratingValue"})[0]['content']
- popularity = soup.findAll(attrs={"itemprop":"interactionCount"})[0]['content']
- popularity = re.sub('UserDownloads:', '', popularity)
- htmldescription = soup.findAll('div',{'id':'addon-description'})[0]
- icon = normalink(soup.findAll(attrs={"property":"og:image"})[0]['content'])
- screenshots = []
- try:
- previewdiv = soup.findAll('ul',{'id':'preview'})[0]
- for a in previewdiv.findAll('a'):
- screenshots.append(normalink(a['href']))
- except:
- pass
- version = soup.findAll('span',{'class':'version-number'})[0].text
- addondiv = soup.findAll('div',{'id':'addon'})[0]
- addonp = addondiv.findAll('p',{'class':'install-button'})[0]
- button = addonp.findAll('a')[0]
- downloadlink = server + normalink(button['href'])
- try:
- homelink = soup.findAll('a',{'class':'home'})[1]['href']
- homelink = re.sub('.*//','http://',homelink)
- except:
- homelink = ""
- try:
- supportlink = soup.findAll('a',{'class':'support'})[0]['href']
- supportlink = re.sub('.*//','http://',supportlink)
- except:
- supportlink = ""
-
- htmldescription = unicode(htmldescription)
- description = unicode(description)
- prettyname = unicode(prettyname)
- description = re.sub('\'', '\\\'', description)
- htmldescription = re.sub('\'', '\\\'', htmldescription)
- prettyname = re.sub('\'', '\\\'', prettyname)
- screenshots = unicode(screenshots)
-
- sql = u'INSERT INTO addons.addons (`name`, `prettyname`, `description`, `htmldescription`, `icon`, `screenshots`, `version`, `rating`, `popularity`, `downloadlink`, `homelink`, `supportlink`, `retrievedlink`, `license`, `category`) VALUES ("' +name+ '", \'' +prettyname+ '\', \'' +description+ '\', \'' +htmldescription+ '\', "' +icon+ '", "' +screenshots+ '", "' +version+ '", "' +rating+ '", "' +popularity+ '", "' +downloadlink+ '", "' +homelink+ '", "' +supportlink+ '", "' +url+ '" , "' +license+ '", "' +category+ '");'
- try:
- cursor = db.cursor()
- cursor.execute(sql)
- cursor.close()
- db.commit()
- except:
- print 'Failed to insert "' + name + '", query: "' + sql
- else:
- print "Added " + name + " " + url
-
-import MySQLdb
-db = MySQLdb.connect(host="localhost",user=dbuser,passwd=dbpass, charset="utf8", use_unicode=True)
-cursor = db.cursor()
-sql = """DROP DATABASE IF EXISTS addons;
-CREATE DATABASE addons CHARACTER SET utf8 COLLATE utf8_general_ci;
-USE addons;
-CREATE TABLE addons(
-id INT PRIMARY KEY AUTO_INCREMENT,
-name VARCHAR(50),
-prettyname VARCHAR(50),
-description TEXT,
-htmldescription TEXT,
-icon VARCHAR(255),
-screenshots TEXT,
-version VARCHAR(20),
-rating VARCHAR(10),
-popularity INT,
-downloadlink VARCHAR(255),
-homelink VARCHAR(255),
-supportlink VARCHAR(255),
-retrievedlink VARCHAR(255),
-license VARCHAR(255),
-category VARCHAR(20)
-);
-"""
-cursor.execute(sql)
-cursor.close()
-db.commit()
-
-for category in categories:
- links=[]
- for page in range(1,1+parsepages):
- links = links + parselist(server + "/en-US/firefox/extensions/" + category + "/?sort=popular&page="+str(page))
- for link in links:
- parsepage(server+link, category)
-
-#tests
-#parsepage("https://addons.mozilla.org/en-US/firefox/addon/twoo/", "test")
-#parsepage("https://addons.mozilla.org/en-US/firefox/addon/what-about/", "test")