Utilisateur:Hoolen/StatsWikipedia - Labels au 1er fevrier 2015
Scripts Python utilisés
""" Récupération des tailles de chaque article ayant un label """
from io import StringIO
import re
import requests
from lxml import etree
# Fetch the search-results page listing the articles in the "Bon article"
# category and print one "title,size" CSV line per article on stdout.
MAX = 3000  # there are fewer than 3000 BA, so a single request returns them all
# TODO: automate the process to handle AdQ and BA in the same run
URL = "https://fr.wikipedia.org/w/index.php?title=Sp%C3%A9cial:Recherche&limit={}&profile=default&search=incategory%3A%22Bon+article%22".format(MAX)
# Matches the metadata text of a search result ("<size> Kio (<count> mots)");
# group 1 is the size, group 2 the word count.
article_size = re.compile(r'(.+)\sKio \((.+) mots\)')
spaces = re.compile(r'\s+')  # not used below; kept as in the original script

req = requests.get(URL)
# Fail loudly on an HTTP error instead of trying to parse an error page.
req.raise_for_status()
# The response is an HTML page, which is not guaranteed to be well-formed XML,
# so parse it with lxml's lenient HTML parser rather than the default XML one.
root = etree.parse(StringIO(req.text), etree.HTMLParser())
for li in root.xpath(".//ul[@class='mw-search-results']/li"):
    titles = li.xpath("./div[@class='mw-search-result-heading']/a/text()")
    data = li.xpath(".//div[@class='mw-search-result-data']/text()")
    # Skip result items that lack a heading link or a metadata line instead of
    # crashing with IndexError on the [0] lookup.
    if not titles or not data:
        continue
    title = titles[0]
    size = article_size.search(data[0])
    # Only emit a line when the size could be extracted, so that every output
    # line ends with the captured size text.
    if size:
        size = size.group(1)
        print("%s,%s" % (title, size))
""" Groupement des articles par tranche de taille en Kio """
from collections import defaultdict
import math
import operator
# Group the articles listed in BA_sizes.csv ("title,size" per line) into
# buckets of 10 and print the number of articles per bucket, smallest first.
sizes = defaultdict(int)  # bucket upper bound -> number of articles
with open('BA_sizes.csv') as f:
    for line in f:
        # Tolerate blank lines (e.g. an empty last line), which would
        # otherwise make the unpacking below raise ValueError.
        if not line.strip():
            continue
        # Split on the last comma only, so a comma inside the title is harmless.
        title, size = line.rsplit(',', 1)
        # Round up to the next multiple of ten
        rounded = math.ceil(float(size) / 10) * 10
        sizes[rounded] += 1
# Sort by bucket bound so the output is in increasing size order.
items = sorted(sizes.items(), key=operator.itemgetter(0))
for (k, v) in items:
    print("<=", k, ",", v)
Résultats
Taille de l'article,AdQ,BA
<= 10,0,16
<= 20,21,214
<= 30,74,382
<= 40,94,316
<= 50,113,281
<= 60,135,222
<= 70,120,166
<= 80,113,135
<= 90,98,93
<= 100,96,91
<= 110,79,54
<= 120,69,52
<= 130,65,40
<= 140,68,35
<= 150,45,23
<= 160,39,28
<= 170,41,10
<= 180,21,16
<= 190,19,7
<= 200,20,10
<= 210,13,9
<= 220,13,5
<= 230,8,2
<= 240,9,3
<= 250,2,3
<= 260,2,1
<= 270,3,1
<= 280,4,1
<= 290,3,0
<= 300,2,0
<= 310,2,2
<= 320,0,1
<= 330,2,0
<= 340,1,0
<= 350,0,0
<= 360,0,0
<= 370,0,0
<= 380,0,0
<= 390,0,1