From c47f3dda10695dc1e71851e8cf7fdfb99d9d1c66 Mon Sep 17 00:00:00 2001 From: Ruben Rodriguez Date: Tue, 21 Oct 2014 01:10:26 +0200 Subject: Added tools dir --- AddonsScraper.py | 169 ------------------------------------------------------- 1 file changed, 169 deletions(-) delete mode 100644 AddonsScraper.py (limited to 'AddonsScraper.py') diff --git a/AddonsScraper.py b/AddonsScraper.py deleted file mode 100644 index 3813252..0000000 --- a/AddonsScraper.py +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2014 Rubén Rodríguez -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# -# This package parses https://addons.mozilla.org and generates a database -# listing the addons that are under a valid license. addons.mozilla.org is -# itself licensed as "CC Attribution Share-Alike v3.0 or any later version." -# Note that screenshots get licensed under the same license as the program. 
#
# To run, install mysql-server, python-mysqldb and python-beautifulsoup
#

import re
import sys
import urllib2

import BeautifulSoup
import MySQLdb

server = "https://addons.mozilla.org"
parsepages = 2  # Number of pages per category to parse
dbuser = "root"
dbpass = ""

# Only addons whose source-license link is exactly one of these URLs are kept.
validlicenses = [
    'http://www.gnu.org/licenses/gpl-3.0.html',
    'http://www.gnu.org/licenses/gpl-2.0.html',
    'http://www.gnu.org/licenses/lgpl-3.0.html',
    'http://www.gnu.org/licenses/lgpl-2.1.html',
    'http://www.opensource.org/licenses/bsd-license.php',
    'http://www.opensource.org/licenses/mit-license.php',
    'http://www.mozilla.org/MPL/MPL-1.1.html',
    'http://www.mozilla.org/MPL/2.0/',
]

categories = [
    'alerts-updates',
    'appearance',
    'bookmarks',
    'download-management',
    'feeds-news-blogging',
    'games-entertainment',
    'language-support',
    'photos-music-videos',
    'privacy-security',
    'search-tools',
    'shopping',
    'social-communication',
    'tabs',
    'web-development',
    'other',
]

# Schema setup, executed one statement per call so no result set is left
# pending on the connection between statements.
_SCHEMA_STATEMENTS = (
    "DROP DATABASE IF EXISTS addons",
    "CREATE DATABASE addons CHARACTER SET utf8 COLLATE utf8_general_ci",
    "USE addons",
    """CREATE TABLE addons(
id INT PRIMARY KEY AUTO_INCREMENT,
name VARCHAR(50),
prettyname VARCHAR(50),
description TEXT,
htmldescription TEXT,
icon VARCHAR(255),
screenshots TEXT,
version VARCHAR(20),
rating VARCHAR(10),
popularity INT,
downloadlink VARCHAR(255),
homelink VARCHAR(255),
supportlink VARCHAR(255),
retrievedlink VARCHAR(255),
license VARCHAR(255),
category VARCHAR(20)
)""",
)

# Columns filled by parsepage(), in the order the values are supplied.
_INSERT_COLUMNS = (
    'name', 'prettyname', 'description', 'htmldescription', 'icon',
    'screenshots', 'version', 'rating', 'popularity', 'downloadlink',
    'homelink', 'supportlink', 'retrievedlink', 'license', 'category',
)

# Parameterized INSERT: values are passed separately to cursor.execute() so
# the driver does the quoting instead of hand-built string concatenation.
_INSERT_SQL = (
    "INSERT INTO addons.addons ("
    + ", ".join("`" + column + "`" for column in _INSERT_COLUMNS)
    + ") VALUES ("
    + ", ".join(["%s"] * len(_INSERT_COLUMNS))
    + ")"
)


def normalink(string):
    """Return *string* with everything from the first '?' onwards removed."""
    return re.sub(r'\?.*', '', string)


def _fetch_soup(url):
    """Download *url* and return the page parsed with BeautifulSoup."""
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return BeautifulSoup.BeautifulSoup(response)
    finally:
        # The original never closed the HTTP response.
        response.close()


def parselist(url):
    """Return the addon links found on the listing page at *url*.

    Collects the href of every <a> inside an <h3> inside a <div class="info">,
    with query strings stripped by normalink().
    """
    print("PARSING LIST: " + url)
    soup = _fetch_soup(url)
    links = []
    for infodiv in soup.findAll('div', {'class': 'info'}):
        for h3 in infodiv.findAll('h3'):
            for link in h3.findAll('a'):
                links.append(normalink(link['href']))
    return links


def parsepage(url, category):
    """Scrape the addon page at *url* and insert one row into addons.addons.

    Returns 0 without touching the database when the page has no
    source-license link or the link is not in ``validlicenses``; otherwise
    returns None after attempting the insert. Insert failures are reported
    on stdout and do not raise.

    Raises IndexError/KeyError if a required element (title, description,
    rating, download count, description div, icon, version, install button)
    is missing from the page, as the lookups index the first match directly.
    """
    soup = _fetch_soup(url)

    try:
        licenseli = soup.findAll('li', {'class': 'source-license'})[0]
        license_url = licenseli.findAll('a')[0]['href']
    except (IndexError, KeyError):
        return 0
    if license_url not in validlicenses:
        # Only absolute (http...) links are worth reporting.
        if license_url.startswith("h"):
            print("INVALID LICENSE: " + license_url)
        return 0

    # Addon slug: last path component of the URL, ignoring a trailing slash.
    name = re.sub(r'/$', '', normalink(url))
    name = re.sub(r'.*/', '', name)

    prettyname = soup.findAll(attrs={"property": "og:title"})[0]['content']
    description = soup.findAll(
        attrs={"property": "og:description"})[0]['content']
    rating = soup.findAll(attrs={"itemprop": "ratingValue"})[0]['content']
    popularity = soup.findAll(
        attrs={"itemprop": "interactionCount"})[0]['content']
    popularity = re.sub('UserDownloads:', '', popularity)
    htmldescription = soup.findAll('div', {'id': 'addon-description'})[0]
    icon = normalink(
        soup.findAll(attrs={"property": "og:image"})[0]['content'])

    # Screenshots are optional; keep whatever was collected before a miss.
    screenshots = []
    try:
        previewdiv = soup.findAll('ul', {'id': 'preview'})[0]
        for a in previewdiv.findAll('a'):
            screenshots.append(normalink(a['href']))
    except (IndexError, KeyError):
        pass

    version = soup.findAll('span', {'class': 'version-number'})[0].text
    addondiv = soup.findAll('div', {'id': 'addon'})[0]
    addonp = addondiv.findAll('p', {'class': 'install-button'})[0]
    button = addonp.findAll('a')[0]
    downloadlink = server + normalink(button['href'])

    # Home and support links are optional and default to an empty string.
    try:
        homelink = soup.findAll('a', {'class': 'home'})[1]['href']
        homelink = re.sub(r'.*//', 'http://', homelink)
    except (IndexError, KeyError):
        homelink = ""
    try:
        supportlink = soup.findAll('a', {'class': 'support'})[0]['href']
        supportlink = re.sub(r'.*//', 'http://', supportlink)
    except (IndexError, KeyError):
        supportlink = ""

    params = (
        name,
        unicode(prettyname),
        unicode(description),
        unicode(htmldescription),
        icon,
        unicode(screenshots),  # stored as the text form of the list
        version,
        rating,
        popularity,
        downloadlink,
        homelink,
        supportlink,
        url,
        license_url,
        category,
    )

    cursor = db.cursor()
    try:
        cursor.execute(_INSERT_SQL, params)
        db.commit()
    except (MySQLdb.Error, MySQLdb.Warning, UnicodeError) as error:
        # Best effort: report the failure and carry on with the next addon.
        print('Failed to insert "' + name + '": ' + str(error))
    else:
        print("Added " + name + " " + url)
    finally:
        cursor.close()


# (Re)create the database from scratch on every run.
db = MySQLdb.connect(host="localhost", user=dbuser, passwd=dbpass,
                     charset="utf8", use_unicode=True)
cursor = db.cursor()
for statement in _SCHEMA_STATEMENTS:
    cursor.execute(statement)
cursor.close()
db.commit()

# Crawl the first `parsepages` listing pages of every category, then scrape
# each addon page found there.
for category in categories:
    links = []
    for page in range(1, 1 + parsepages):
        links.extend(parselist(
            server + "/en-US/firefox/extensions/" + category
            + "/?sort=popular&page=" + str(page)))
    for link in links:
        parsepage(server + link, category)

#tests
#parsepage("https://addons.mozilla.org/en-US/firefox/addon/twoo/", "test")
#parsepage("https://addons.mozilla.org/en-US/firefox/addon/what-about/", "test")