Statistiche sulla partecipazione al Bebras italiano 2016/17

from IPython.display import HTML

# Inject a button into the rendered notebook that shows/hides every code input
# cell (`div.input`). The toggle also runs once when the page is ready, so the
# code starts hidden.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<input type="button" value="Clicca per vedere/nascondere il codice Python" onclick="code_toggle()">''')

Insegnanti, stima delle squadre e alunni

import pandas as pd
import urllib2
from IPython.display import display, Markdown

# Refresh the local JSON caches ("teachers.json", "overview.json") from the
# remote service whose URL is stored in secret.key. This is best effort: on
# any failure the notebook carries on with the cache files already on disk.
try:
    with open('secret.key') as k:
        key = k.readline().strip()
    for url, cache in ((key, "teachers.json"),
                       (key.replace("teachers", "overview"), "overview.json")):
        t = urllib2.urlopen(url)
        try:
            # Read the whole payload before opening the cache for writing, so
            # a failed or interrupted download never truncates a good cache.
            payload = t.read()
        finally:
            t.close()
        with open(cache, "w") as tw:
            tw.write(payload)
except Exception:
    # Unlike a bare "except:", this lets KeyboardInterrupt and SystemExit
    # propagate, so the cell can still be interrupted.
    print("Caricamento dati dalla cache")

# Load the cached teacher records; the first three rows are dropped and the
# remaining ones are renumbered from zero.
teachers = None
with open("teachers.json", "r") as t:
    teachers = pd.read_json(t, convert_axes=True)[3:]
teachers.index = range(len(teachers))

# keep only the teachers who declared (or already activated) at least one
# team and who filled in a non-empty school type
has_teams = (teachers['teams_expected'] > 0) | (teachers['teams_active'] > 0)
has_school_type = teachers['school_type'].notnull() & (teachers['school_type'] != '')
filledt = teachers[has_teams & has_school_type]

# headline figures: number of such teachers and their team totals
filled = len(filledt)
expected = filledt.sum()
expteams, regteams = expected['teams_expected'], expected['teams_active']

# Headline summary; pupil figures are estimated as four per team.
today = pd.datetime.today()
exp_pupils = expteams*4
reg_pupils = regteams*4
s = """Alla data del *{}:* **{:d}** insegnanti hanno stimato il numero di squadre, per un totale di **{:d}** squadre (~*{:d}* alunni); 
ci sono **{:d}** squadre già registrate (~*{:d}* alunni)."""
display(Markdown(s.format(str(today)[:19], filled, expteams, exp_pupils, regteams, reg_pupils)))

# Before 2016-11-07, record today's figures in a per-day "stats-<date>.txt" file
if today < pd.datetime(2016,11,7):
    isotoday = today.isoformat()[:10]
    snapshot_line = "{:d} {:d} {:d}\n".format(filled, expteams, exp_pupils)
    with open("stats-" + isotoday + ".txt", "w") as stat:
        stat.write(snapshot_line)

import os
# Collect the daily snapshots stored as "stats-YYYY-MM-DD.txt" anywhere under
# the current directory. Each file holds one line with three integers
# (teachers, teams, pupils).
data = []
for path, dirs, files in os.walk("."):
    for f in files:
        if not f.startswith("stats-"):
            continue
        d = [int(x) for x in f.split('.')[0].split('-')[1:4]]
        # os.walk yields bare file names: join them with the directory being
        # visited, otherwise snapshots found in sub-directories cannot be opened.
        with open(os.path.join(path, f), "r") as df:
            nn = [int(x) for x in df.readline().strip().split(" ")]
        d = pd.Timestamp("{:04d}-{:02d}-{:02d}".format(*d))
        data.append((d, nn))
# Directory listing order is arbitrary: sort by date so the series is chronological.
data.sort(key=lambda item: item[0])
# Plain constructor instead of the deprecated DataFrame.from_items.
data = pd.DataFrame([nn for d, nn in data],
                    index=[d for d, nn in data],
                    columns=["insegnanti","squadre","alunni"])

# Baseline from the 2015 edition: a straight line from zero on 2016-09-15 to
# last year's totals on 2016-11-01, plotted for comparison with this year.
# NOTE(review): 66+82+124 is presumably a sum of three 2015 teacher counts — confirm.
lasty_teachers = 66+82+124
lasty_teams = 3465
# Built with the plain constructor (DataFrame.from_items and pd.datetime are
# deprecated and removed in recent pandas); the resulting frame is the same.
lasty = pd.DataFrame([[0, 0, 0],
                      [lasty_teachers, lasty_teams, lasty_teams*4]],
                     index=[pd.Timestamp("2016-09-15"),
                            pd.Timestamp("2016-11-01")],
                     columns=["insegnanti 2015 (interpolazione lineare)",
                              "squadre 2015 (interpolazione lineare)",
                              "alunni 2015 (interpolazione lineare)"])

import matplotlib.pyplot as plt
# Use the ggplot look for every figure in the notebook
plt.style.use('ggplot')

%matplotlib inline
plt.figure(figsize=(12,5))

# Left panel: expected teams and pupils over time, with the 2015 baseline dashed
plt.subplot(121)
ax = data['squadre'].plot(legend=True)
data['alunni'].plot(ax=ax, legend=True)
lasty['alunni 2015 (interpolazione lineare)'].plot(style="--", ax=ax, legend=True)
lasty['squadre 2015 (interpolazione lineare)'].plot(style="--", ax=ax, legend=True)
# thin vertical marker at 2016-11-01, the end point of the 2015 baseline
ax.axvline(x=pd.datetime(2016,11,1), linewidth=.5, color='gray')
# Right panel: teachers over time, with the 2015 baseline dashed
plt.subplot(122)
ax = data['insegnanti'].plot(legend=True)
lasty['insegnanti 2015 (interpolazione lineare)'].plot(style="--", ax=ax, legend=True)
# the result is bound to a name so the cell does not echo it
p = ax.axvline(x=pd.datetime(2016,11,1), linewidth=.5, color='gray')

Insegnanti che hanno partecipato anche nel 2015

Si considera il numero di account riutilizzati: il numero di insegnanti che ripetono l'esperienza è quindi stimato per difetto, in quanto un insegnante iscritto con una nuova username risulta nuovo.

# Share of last year's teacher count represented by accounts with id <= 324
# that have at least one active team.
# NOTE(review): 324 is presumably the highest account id issued for the 2015 edition — confirm.
reused = teachers[(teachers["id"] <= 324) & (teachers["teams_active"] > 0)]
print("Account riutilizzati: {:0.1f}%".format(100*len(reused) / float(lasty_teachers)))

Account riutilizzati: 51.8%

La popolazione studentesca nazionale

Dati ISTAT popolazione studentesca 2014 (fonte: http://dati.istat.it)

# Student population per region, one column per school-type code (E, M, S)
# plus their total. "ESTERO" (abroad) has no population figures, so every
# value derived from it is NaN.
nan = float("nan")
istat_rows = [
    ("PIEMONTE",              (191399, 117997, 168439)),
    ("VALLE D'AOSTA",         (  5981,   3691,   5309)),
    ("LIGURIA",               ( 61566,  39213,  60184)),
    ("LOMBARDIA",             (468662, 283007, 381619)),
    ("TRENTINO-ALTO ADIGE",   ( 27028,  16890,  21836)),
    ("VENETO",                (232694, 142401, 204262)),
    ("FRIULI-VENEZIA GIULIA", ( 51830,  32143,  46949)),
    ("EMILIA-ROMAGNA",        (198417, 118460, 176968)),
    ("TOSCANA",               (161001,  98203, 152886)),
    ("UMBRIA",                ( 39181,  23488,  36946)),
    ("MARCHE",                ( 67996,  42095,  70602)),
    ("LAZIO",                 (268133, 161573, 249145)),
    ("ABRUZZO",               ( 57146,  35828,  58578)),
    ("MOLISE",                ( 12595,   8354,  14990)),
    ("CAMPANIA",              (317346, 204223, 326644)),
    ("PUGLIA",                (198662, 130675, 213545)),
    ("BASILICATA",            ( 25237,  17097,  30214)),
    ("CALABRIA",              ( 93277,  59624, 101208)),
    ("SICILIA",               (254023, 164520, 252730)),
    ("SARDEGNA",              ( 67379,  44105,  74003)),
    ("ESTERO",                (   nan,    nan,    nan)),
]
# Plain constructor instead of DataFrame.from_items / pd.np, which are
# deprecated and removed in recent pandas; row order and dtypes are unchanged.
istat = pd.DataFrame([row[1] for row in istat_rows],
                     index=[row[0] for row in istat_rows],
                     columns=['E', 'M', 'S'])
istat['totale'] = istat['E'] + istat['M'] + istat['S']
# Render the population table in the notebook
display(istat)

Analisi della distribuzione geografica delle squadre

def norm_region(r):
    """Normalize the name of a region, correcting known wrong spellings.

    The name is stripped and upper-cased. Known abbreviations and variants
    of Friuli-Venezia Giulia, Emilia-Romagna and Trentino-Alto Adige are
    mapped to one canonical spelling, and the foreign places 'ALBANIA' and
    'BAVIERA' are collapsed into 'ESTERO'.

    Args:
        r: the region name. Values without string methods (e.g. ``None`` or
            ``NaN`` for a missing region) are returned unchanged instead of
            raising, so the function can be mapped over a column with gaps.

    Returns:
        The normalized upper-case name, or ``r`` itself when it is not a
        string.
    """
    try:
        r = r.strip().upper()
    except AttributeError:
        # not a string (missing value): nothing to normalize
        return r
    if r == 'FVG' or r.startswith('FRIULI'):
        return 'FRIULI-VENEZIA GIULIA'
    if r.startswith('EMILIA'):
        return 'EMILIA-ROMAGNA'
    if r.startswith('TRENTINO'):
        return 'TRENTINO-ALTO ADIGE'
    if r in ('ALBANIA', 'BAVIERA'):
        return 'ESTERO'
    return r

# One row per selected teacher: normalized region, school type, expected teams
stat = filledt[['school_region', 'school_type', 'teams_expected']].copy()
stat.columns = ['regione', 'tipo', 'squadre attese']
stat['regione'] = stat['regione'].map(norm_region)

# Sum the expected teams per (region, school type) and attach the ISTAT
# population of that region and type. A type label longer than one character
# has its teams shared equally among its single characters.
# NOTE(review): such labels presumably denote schools spanning several levels
# (e.g. "EM") — confirm.
# NOTE(review): under Python 2, `row[0] / len(tipo)` is an integer division if
# the counts are integers — confirm this truncation is intended.
expected = stat.groupby(['regione', 'tipo']).aggregate('sum')
for (reg, tipo), row in expected.iterrows():
    if len(tipo) > 1:
       # composite label: give each component type an equal share
       for t in tipo:
           try:
            expected.loc[(reg, t), 'squadre attese'] += row[0] / len(tipo)
           except:
            # the in-place update failed (presumably no (reg, t) row exists
            # yet — confirm): assign the share and look up its population
            expected.loc[(reg, t), 'squadre attese'] = row[0] / len(tipo)
            try:
                expected.loc[(reg, t), 'popolazione'] = istat.loc[reg, t]
            except:
                # lookup in the ISTAT table failed: report it and store NaN
                print ":{}:{}:NOT FOUND".format(reg, t)
                expected.loc[(reg, t), 'popolazione'] = pd.np.NaN
    else:
        try:
            expected.loc[(reg, tipo), 'popolazione'] = istat.loc[reg, tipo]
        except:
            # lookup in the ISTAT table failed: report it and store NaN
            print "_{}_{}_NOT FOUND".format(reg, tipo)
            expected.loc[(reg, tipo), 'popolazione'] = pd.np.NaN
# keep only the rows whose type is exactly E, M or S (this drops the composite
# labels already redistributed above) and sort the index
expected = expected[expected.index.isin(['E','M','S'], level=1)].sort_index()
# pupils are estimated as four per team; coverage is pupils per thousand students
expected['alunni attesi'] = expected['squadre attese'] * 4
expected['copertura (alunni ogni mille)'] = 1000 * expected['alunni attesi'] / expected['popolazione']
display(expected)

# Totals per school type over all regions, with the resulting coverage
summed_columns = ['squadre attese','alunni attesi', 'popolazione']
tot = expected.groupby(level='tipo')[summed_columns].sum()
coverage_by_type = 1000 * tot['alunni attesi'] / tot['popolazione']
tot['copertura (alunni ogni mille)'] = coverage_by_type
display(tot)

# Grand totals over the school types, printed as a two-line summary
glob = tot.sum()
overall_coverage = 1000 * glob["alunni attesi"] / glob["popolazione"]
summary = """squadre attese: {}\t alunni attesi: {}
popolazione: {}\t copertura (alunni ogni mille) {:0.1f}"""
print(summary.format(int(glob["squadre attese"]),
                     int(glob["alunni attesi"]),
                     int(glob["popolazione"]),
                     overall_coverage))

squadre attese: 8141	 alunni attesi: 32564
popolazione: 7181197	 copertura (alunni ogni mille) 4.5

# Totals per region over the school types, with the resulting coverage
region_totals = expected[['squadre attese','alunni attesi', 'popolazione']]
exp_reg = region_totals.groupby(level='regione').sum()
coverage_by_region = 1000 * exp_reg['alunni attesi'] / exp_reg['popolazione']
exp_reg['copertura (alunni ogni mille)'] = coverage_by_region
display(exp_reg)

Cartografia ISTAT 2011 (fonte: http://www.istat.it/it/archivio/24613), convertita con il comando:

ogr2ogr -f GeoJSON -s_srs reg2011_g.prj -t_srs EPSG:4326 it.json reg2011_g.shp

(fonte: https://gist.github.com/riccardoscalco/6029355)

import geopandas as gpd
# Regional boundaries of Italy, read from the GeoJSON file "it.json"
it = gpd.read_file("it.json")

# Labels of the map panels: the overall total, then one per school type
# (used below for the codes E, M and S, in this order)
TYPES = ('totale', 'primaria', 'secondaria primo grado', 'secondaria secondo grado')


def get_data_with_default(geo, i, t, data, j, label='squadre attese', default=0):
    """Copy one value of ``data`` into ``geo``, falling back to a default.

    The value ``data.loc[j, label]`` is stored in row ``i`` of ``geo`` under
    the column named ``label + ' ' + t``. When ``data`` has no such entry
    the default is stored instead.

    Args:
        geo: frame to update in place.
        i: index label of the row of ``geo`` to write.
        t: suffix of the destination column name.
        data: frame to read from.
        j: index label (or tuple, for a MultiIndex) of the row of ``data``.
        label: column of ``data`` to read and prefix of the destination
            column name.
        default: value stored when the lookup fails.

    Returns:
        The value now stored in ``geo`` at the destination cell.
    """
    column = label + ' ' + t
    try:
        value = data.loc[j, label]
    except (LookupError, TypeError):
        # Only a failed lookup falls back to the default. The previous bare
        # "except" combined with "return" inside "finally" silently swallowed
        # every other error as well.
        value = default
    geo.loc[i, column] = value
    return geo.loc[i, column]

# Match every map region to an ISTAT region on the first five characters of
# the name, then copy onto the map the expected teams and the population for
# the total and for each school type.
ISTAT_COLUMNS = ('totale', 'E', 'M', 'S')   # parallel to TYPES
for i, r in it.iterrows():
    prefix = r['NOME_REG'][0:5]
    for cname in istat.index:
        if cname[0:5] != prefix:
            continue
        # adopt the ISTAT spelling of the region name
        it.loc[i, 'NOME_REG'] = cname
        get_data_with_default(it, i, TYPES[0], exp_reg, cname)
        for t, code in zip(TYPES[1:], ISTAT_COLUMNS[1:]):
            get_data_with_default(it, i, t, expected, (cname, code))

        for t, code in zip(TYPES, ISTAT_COLUMNS):
            it.loc[i, 'popolazione ' + t] = istat.loc[cname, code]
        # only the first matching ISTAT region is used
        break

# Per panel: expected pupils (four per team) and coverage per thousand students
for t in TYPES:
    it['alunni attesi ' + t] = it['squadre attese ' + t] * 4
    it['copertura ' + t] = 1000 * it['alunni attesi ' + t] / it['popolazione ' + t]


# 2x2 grid of choropleth maps, one per entry of TYPES, coloured by coverage
# with quantile classes
plt.figure(figsize=(15,15))
for i, t in enumerate(TYPES):
    ax = plt.subplot(2,2, i+1)
    ax.set_aspect("equal")
    ax.set_axis_off()
    ax.set_title("Alunni attesi ogni mille ({})".format(t))
    it.plot(ax=ax, column='copertura ' + t, cmap='YlOrRd', scheme='quantiles', legend=True)

Il Bebras nel mondo (dati 2015)

# World geometries, indexed by the "name" column
w = gpd.read_file("world.json")
w = w.set_index("name")

# Per-country figures; the code below reads the columns "bebras" and "oecd"
with open("wbebras.json", "r") as t:
    wbebras = pd.DataFrame(pd.read_json(t, convert_axes=True, orient='index'))

# Copy the figures onto the map and derive the coverage per thousand
# ("bebras" over "oecd"); a row for which anything fails is printed and skipped.
# NOTE(review): the bare "except" hides the reason for the failure — presumably
# names missing from the map; confirm and narrow the exception.
for i in wbebras.index:
    try:
        w.loc[i, "bebras"] = wbebras.loc[i, "bebras"]
        w.loc[i, "oecd"]   = wbebras.loc[i, "oecd"]
        w.loc[i, "copertura"]   = 1000 * wbebras.loc[i, "bebras"] / wbebras.loc[i, "oecd"]
    except:
        print i

plt.figure(figsize=(10,10))
# Bottom panel: coverage per thousand, drawn only for rows without any missing value
ax = plt.subplot(212)
ax.set_aspect("equal")
ax.set_axis_off()
ax.set_title("Partecipanti 2015 ogni 1000 studenti (dati OECD 2012)")
w.dropna().plot(ax=ax,column='copertura', cmap='Blues', scheme='quantiles', legend=True)


# Top panel: absolute "bebras" figures, drawn for every row that has one
ax = plt.subplot(211)
ax.set_aspect("equal")
ax.set_axis_off()
ax.set_title("Partecipanti Bebras 2015")
# the result is bound to a name so the cell does not echo it
p = w.dropna(subset=["bebras"]).plot(ax=ax,column='bebras', cmap='YlOrRd', scheme='quantiles', legend=True)

Numeri assoluti

# The "bebras" column, sorted in descending order
display(wbebras.sort_values("bebras",ascending=False)["bebras"])

France                      344976
Germany                     248084
Ukraine                      93820
Slovakia                     66842
Belarus                      53587
Czech Republic               52596
United States of America     39213
Republic of Serbia           30823
South Africa                 28543
Lithuania                    24709
Russia                       24543
Slovenia                     24158
Netherlands                  21086
Macedonia                    19608
Austria                      17641
Australia                    16925
Taiwan                       13784
Hungary                      13438
Poland                       13392
Italy                        12017
Canada                       10288
Pakistan                      7369
Kazakhstan                    7311
Finland                       5598
Azerbaijan                    4065
Estonia                       4020
Japan                         3538
Iran                          2967
Belgium                       1762
Ireland                       1362
Latvia                        1209
Spain                          851
New Zealand                    783
Malaysia                       600
Iceland                        475
Bulgaria                       474
Cyprus                         314
Israel                         300
Bosnia and Herzegovina           0
Croatia                          0
Egypt                            0
Indonesia                        0
Name: bebras, dtype: int64

Analisi delle gare

import json

# Load the cached contest overview
with open("overview.json", "r") as t:
    overview = json.load(t)

# One row per entry of overview['teams']. The 'view_exam_list' text is mined
# with regular expressions for the "<points>p/<minutes>min" pair and for the
# "Server End Date". Patterns are raw strings so that "\d" is not treated as
# an (invalid) string escape sequence.
dfov = pd.DataFrame(overview['teams'])
gare = pd.DataFrame()
gare['categoria'] = dfov['u_class'].str.extract(r'(.+)_.+', expand=False)
gare['insegnante'] = dfov['t_id'].astype('int64')
gare['login'] = dfov['u_id']
gare['status'] = dfov['u_investigation']
gare['risultato'] = dfov['view_exam_list'].str.extract(r'(\d+)p/\d+min', expand=False)
gare['tempo'] = dfov['view_exam_list'].str.extract(r'\d+p/(\d+)min', expand=False)
gare['data'] = pd.to_datetime(dfov['view_exam_list'].str.extract(r'Server End Date: ([0-9/ :]+)', expand=False))
# Experimental timestamp columns, currently disabled:
#gare['prova'] = pd.to_datetime(dfov['11'].astype('int64'), unit='s')
#gare['prova1'] = pd.to_datetime(dfov['15'].fillna(0).astype('int64'), unit='s')
#gare['prova2'] = pd.to_datetime(dfov['e_last_creation_date'].fillna(0).astype('int64'), unit='s')
#gare['prova3'] = pd.to_datetime(dfov['u_lastlogindate'].astype('int64'), unit='s')

# Attach to every team the normalized region of its teacher (matched on the
# teacher id), then keep the teams whose status is not 'Empty'
fid = filledt.set_index('id')
fid = fid.assign(regione=fid['school_region'].map(norm_region))
gare = gare.join(fid[['regione']], on='insegnante')

not_empty = gare['status'] != 'Empty'
done = gare[not_empty]

Insegnanti per regione che hanno partecipato

# Number of distinct teachers per region among the non-empty teams
display(done.groupby(['regione'])['insegnante'].nunique())

regione
ABRUZZO                   15
BASILICATA                 4
CALABRIA                  19
CAMPANIA                  96
EMILIA-ROMAGNA            36
ESTERO                     2
FRIULI-VENEZIA GIULIA     19
LAZIO                     51
LIGURIA                   15
LOMBARDIA                133
MARCHE                    11
MOLISE                     9
PIEMONTE                  47
PUGLIA                    62
SARDEGNA                  14
SICILIA                   40
TOSCANA                   18
TRENTINO-ALTO ADIGE        5
UMBRIA                    12
VALLE D'AOSTA              1
VENETO                    60
Name: insegnante, dtype: int64

Insegnanti per categoria

# Number of distinct teachers per category among the non-empty teams
display(done.groupby(['categoria'])['insegnante'].nunique())

categoria
giga    163
kilo    325
mega    204
peta     99
tera    104
Name: insegnante, dtype: int64

Squadre per categoria

# Non-empty teams counted per (region, category)
dcount = done.groupby(['regione', 'categoria']).count()

# listed by hand to force the order
TYPES = ('kilo', 'mega', 'giga', 'tera', 'peta') # done['categoria'].unique()


# For every map region matched to an ISTAT region (first five characters of
# the name), store the team count of each category and their overall total
for i, r in it.iterrows():
    prefix = r['NOME_REG'][0:5]
    for cname in istat.index:
        if cname[0:5] != prefix:
            continue
        it.loc[i, "Squadre totali"] = sum(
            get_data_with_default(it, i, t, dcount, (cname, t), label='login')
            for t in TYPES)
        break


    
# 2x3 grid of choropleth maps: one per category, then the overall total
plt.figure(figsize=(16,16))
for i, t in enumerate(TYPES):
    ax = plt.subplot(2,3, i+1)
    ax.set_aspect("equal")
    ax.set_axis_off()
    ax.set_title("Squadre ({})".format(t))
    it.plot(ax=ax, column='login ' + t, cmap='YlOrRd', scheme='quantiles', legend=True)

# sixth panel: total teams per region
ax = plt.subplot(2,3,6)
ax.set_aspect("equal")
ax.set_axis_off()
ax.set_title("Squadre ({})".format('totale'))
# the result is bound to a name so the cell does not echo it
p = it.plot(ax=ax, column='Squadre totali', cmap='Blues', scheme='quantiles', legend=True)

	E	M	S	totale
PIEMONTE	191399.0	117997.0	168439.0	477835.0
VALLE D'AOSTA	5981.0	3691.0	5309.0	14981.0
LIGURIA	61566.0	39213.0	60184.0	160963.0
LOMBARDIA	468662.0	283007.0	381619.0	1133288.0
TRENTINO-ALTO ADIGE	27028.0	16890.0	21836.0	65754.0
VENETO	232694.0	142401.0	204262.0	579357.0
FRIULI-VENEZIA GIULIA	51830.0	32143.0	46949.0	130922.0
EMILIA-ROMAGNA	198417.0	118460.0	176968.0	493845.0
TOSCANA	161001.0	98203.0	152886.0	412090.0
UMBRIA	39181.0	23488.0	36946.0	99615.0
MARCHE	67996.0	42095.0	70602.0	180693.0
LAZIO	268133.0	161573.0	249145.0	678851.0
ABRUZZO	57146.0	35828.0	58578.0	151552.0
MOLISE	12595.0	8354.0	14990.0	35939.0
CAMPANIA	317346.0	204223.0	326644.0	848213.0
PUGLIA	198662.0	130675.0	213545.0	542882.0
BASILICATA	25237.0	17097.0	30214.0	72548.0
CALABRIA	93277.0	59624.0	101208.0	254109.0
SICILIA	254023.0	164520.0	252730.0	671273.0
SARDEGNA	67379.0	44105.0	74003.0	185487.0
ESTERO	NaN	NaN	NaN	NaN

		squadre attese	popolazione	alunni attesi	copertura (alunni ogni mille)
regione	tipo
ABRUZZO	E	79.0	57146.0	316.0	5.529696
	M	42.0	35828.0	168.0	4.689070
	S	41.0	58578.0	164.0	2.799686
BASILICATA	E	1.0	25237.0	4.0	0.158497
	M	7.0	17097.0	28.0	1.637714
	S	17.0	30214.0	68.0	2.250612
CALABRIA	E	62.0	93277.0	248.0	2.658748
	M	109.0	59624.0	436.0	7.312492
	S	50.0	101208.0	200.0	1.976128
CAMPANIA	E	463.0	317346.0	1852.0	5.835902
	M	440.0	204223.0	1760.0	8.618030
	S	92.0	326644.0	368.0	1.126609
EMILIA-ROMAGNA	E	201.0	198417.0	804.0	4.052072
	M	163.0	118460.0	652.0	5.503968
	S	168.0	176968.0	672.0	3.797297
ESTERO	E	9.0	NaN	36.0	NaN
	M	2.0	NaN	8.0	NaN
	S	2.0	NaN	8.0	NaN
FRIULI-VENEZIA GIULIA	E	16.0	51830.0	64.0	1.234806
	M	95.0	32143.0	380.0	11.822170
	S	224.0	46949.0	896.0	19.084539
LAZIO	E	357.0	268133.0	1428.0	5.325715
	M	133.0	161573.0	532.0	3.292629
	S	158.0	249145.0	632.0	2.536675
LIGURIA	E	62.0	61566.0	248.0	4.028197
	M	28.0	39213.0	112.0	2.856196
	S	129.0	60184.0	516.0	8.573707
LOMBARDIA	E	574.0	468662.0	2296.0	4.899053
	M	455.0	283007.0	1820.0	6.430936
	S	456.0	381619.0	1824.0	4.779636
...	...	...	...	...	...
MARCHE	M	142.0	42095.0	568.0	13.493289
MARCHE	S	92.0	70602.0	368.0	5.212317
MOLISE	E	18.0	12595.0	72.0	5.716554
	M	59.0	8354.0	236.0	28.249940
	S	18.0	14990.0	72.0	4.803202
PIEMONTE	E	161.0	191399.0	644.0	3.364699
	M	196.0	117997.0	784.0	6.644237
	S	140.0	168439.0	560.0	3.324646
PUGLIA	E	355.0	198662.0	1420.0	7.147819
	M	402.0	130675.0	1608.0	12.305338
	S	69.0	213545.0	276.0	1.292468
SARDEGNA	E	32.0	67379.0	128.0	1.899702
	M	90.0	44105.0	360.0	8.162340
	S	16.0	74003.0	64.0	0.864830
SICILIA	E	102.0	254023.0	408.0	1.606154
	M	146.0	164520.0	584.0	3.549720
	S	85.0	252730.0	340.0	1.345309
TOSCANA	E	78.0	161001.0	312.0	1.937876
	M	102.0	98203.0	408.0	4.154659
	S	6.0	152886.0	24.0	0.156980
TRENTINO-ALTO ADIGE	E	4.0	27028.0	16.0	0.591979
	M	155.0	16890.0	620.0	36.708111
	S	11.0	21836.0	44.0	2.015021
UMBRIA	E	61.0	39181.0	244.0	6.227508
	M	12.0	23488.0	48.0	2.043597
	S	0.0	36946.0	0.0	0.000000
VALLE D'AOSTA	E	6.0	5981.0	24.0	4.012707
VENETO	E	279.0	232694.0	1116.0	4.795998
	M	343.0	142401.0	1372.0	9.634764
	S	290.0	204262.0	1160.0	5.678981

	squadre attese	alunni attesi	popolazione	copertura (alunni ogni mille)
tipo
E	2956.0	11824.0	2799553.0	4.223531
M	3121.0	12484.0	1739896.0	7.175142
S	2064.0	8256.0	2641748.0	3.125203

	squadre attese	alunni attesi	popolazione	copertura (alunni ogni mille)
regione
ABRUZZO	162.0	648.0	151552.0	4.275760
BASILICATA	25.0	100.0	72548.0	1.378398
CALABRIA	221.0	884.0	254109.0	3.478822
CAMPANIA	995.0	3980.0	848213.0	4.692218
EMILIA-ROMAGNA	532.0	2128.0	493845.0	4.309044
ESTERO	13.0	52.0	NaN	NaN
FRIULI-VENEZIA GIULIA	335.0	1340.0	130922.0	10.235102
LAZIO	648.0	2592.0	678851.0	3.818216
LIGURIA	219.0	876.0	160963.0	5.442244
LOMBARDIA	1485.0	5940.0	1133288.0	5.241386
MARCHE	270.0	1080.0	180693.0	5.976989
MOLISE	95.0	380.0	35939.0	10.573472
PIEMONTE	497.0	1988.0	477835.0	4.160432
PUGLIA	826.0	3304.0	542882.0	6.086037
SARDEGNA	138.0	552.0	185487.0	2.975950
SICILIA	333.0	1332.0	671273.0	1.984290
TOSCANA	186.0	744.0	412090.0	1.805431
TRENTINO-ALTO ADIGE	170.0	680.0	65754.0	10.341576
UMBRIA	73.0	292.0	99615.0	2.931285
VALLE D'AOSTA	6.0	24.0	5981.0	4.012707
VENETO	912.0	3648.0	579357.0	6.296636