Auto-update: 2025-10-23 15:04:00
This commit is contained in:
parent
7698240a79
commit
6170c48371
28 changed files with 19938 additions and 0 deletions
744
activite1/knn_corse_interactive_v2.ipynb
Normal file
744
activite1/knn_corse_interactive_v2.ipynb
Normal file
|
|
@ -0,0 +1,744 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 🗺️ Classification k-NN : Haute-Corse ou Corse du Sud ?\n",
|
||||
"\n",
|
||||
"## Objectif\n",
|
||||
"Utiliser l'algorithme des **k plus proches voisins (k-NN)** pour déterminer si un point de la carte de Corse se situe en **Haute-Corse (2B)** ou en **Corse du Sud (2A)**, en se basant sur les villages les plus proches.\n",
|
||||
"\n",
|
||||
"## Principe\n",
|
||||
"1. On charge les données des villages corses avec leurs coordonnées GPS et leur département\n",
|
||||
"2. On choisit un point sur la carte\n",
|
||||
"3. On calcule les distances entre ce point et tous les villages\n",
|
||||
"4. On identifie les k villages les plus proches\n",
|
||||
"5. On vote : le département majoritaire parmi ces k villages devient la prédiction"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 📦 Installation et imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Installation des bibliothèques nécessaires (si besoin)\n",
|
||||
"import sys\n",
|
||||
"!{sys.executable} -m pip install folium pandas numpy -q"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import folium\n",
|
||||
"from folium.plugins import MarkerCluster\n",
|
||||
"import math\n",
|
||||
"import json\n",
|
||||
"from collections import Counter"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 📊 Chargement des données"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Load the tab-separated village file.
# Replace 'villages_corse.csv' with the path to your own copy of the data.
df = pd.read_csv('villages_corse.csv', delimiter='\t', encoding='utf-8')

# Quick overview: row count, column names, then the first rows.
print(f"Nombre de villages : {df.shape[0]}")
print(f"\nColonnes : {df.columns.tolist()}")
df.head(5)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🔧 Préparation des données"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
def parse_coordinates(point_geo_str):
    """
    Parse a "latitude, longitude" string into a pair of floats.

    Expected format: "latitude, longitude", e.g. "41.984099158, 8.798384636".
    Anything after a second comma is ignored.

    Parameters:
    - point_geo_str: the raw cell value; it is converted with str() first, so
      missing values (None / NaN) are accepted and reported as invalid.

    Returns:
    - (lat, lon) as floats when both values are finite and within
      [-90, 90] / [-180, 180];
    - (None, None) otherwise, after printing a diagnostic message, so that the
      caller can drop the row.
    """
    try:
        # Split on the comma separating the two components.
        parts = str(point_geo_str).split(',')
        lat = float(parts[0].strip())
        lon = float(parts[1].strip())

        # float() happily parses "nan" and "inf"; those are not usable positions.
        if not (math.isfinite(lat) and math.isfinite(lon)):
            raise ValueError("coordonnées non finies")
        if not (-90 <= lat <= 90 and -180 <= lon <= 180):
            raise ValueError("coordonnées hors limites")

        return lat, lon
    except (ValueError, IndexError) as e:
        # ValueError: not a number / rejected above; IndexError: fewer than two parts.
        print(f"Erreur parsing: {point_geo_str} - {e}")
        return None, None
|
||||
"\n",
|
# Extract latitude / longitude from the 'Point_Geo' column.
df[['latitude', 'longitude']] = df['Point_Geo'].apply(
    lambda x: pd.Series(parse_coordinates(x))
)

# Drop rows without valid coordinates. The explicit copy makes `df` an
# independent frame, so adding columns below does not act on a view of the
# unfiltered data (and avoids pandas' chained-assignment warning).
df = df.dropna(subset=['latitude', 'longitude']).copy()

# Rows that failed to parse yield None, which can leave the columns with an
# object dtype; force a numeric dtype now that only valid rows remain.
df[['latitude', 'longitude']] = df[['latitude', 'longitude']].astype(float)

# Simplify the department code: '2A' stays '2A', anything else is treated as '2B'.
# The code is trimmed and upper-cased first so that ' 2A' or '2a' is not
# silently classified as '2B'.
df['dept_simple'] = df['Code Département'].apply(
    lambda x: '2A' if str(x).strip().upper() == '2A' else '2B'
)

print(f"Villages avec coordonnées valides : {len(df)}")
print(f"\nRépartition par département :")
print(df['dept_simple'].value_counts())

df[['Nom français', 'dept_simple', 'latitude', 'longitude']].head(10)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 📏 Fonction de calcul de distance\n",
|
||||
"\n",
|
||||
"Nous utilisons la **formule de Haversine** pour calculer la distance entre deux points GPS sur la surface de la Terre."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Return the great-circle distance in kilometres between two GPS points.

    Uses the haversine formula on a sphere of radius 6371 km.

    Parameters:
    - lat1, lon1: latitude and longitude of the first point, in decimal degrees
    - lat2, lon2: latitude and longitude of the second point, in decimal degrees

    Returns:
    - the distance in kilometres (float, >= 0)
    """
    R = 6371  # Earth radius in km

    # Convert to radians.
    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    delta_lat = math.radians(lat2 - lat1)
    delta_lon = math.radians(lon2 - lon1)

    # Haversine formula.
    a = (math.sin(delta_lat / 2) ** 2
         + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2) ** 2)

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise a domain error.
    a = min(1.0, max(0.0, a))

    c = 2 * math.asin(math.sqrt(a))

    return R * c
|
||||
"\n",
|
# Sanity check of the distance function:
# Ajaccio (41.9267, 8.7369) to Bastia (42.7028, 9.4500).
dist_test = haversine_distance(41.9267, 8.7369, 42.7028, 9.4500)
print("Distance Ajaccio-Bastia : {:.1f} km".format(dist_test))

# Second check with two villages taken from the dataset (Afa and Alando).
afa = df.loc[df['Nom français'] == 'Afa'].iloc[0]
alando = df.loc[df['Nom français'] == 'Alando'].iloc[0]
dist_afa_alando = haversine_distance(
    afa['latitude'],
    afa['longitude'],
    alando['latitude'],
    alando['longitude'],
)
print("Distance Afa-Alando : {:.1f} km".format(dist_afa_alando))
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🎯 Algorithme k-NN"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
def knn_classify(test_lat, test_lon, df, k=5):
    """
    Classify the point (test_lat, test_lon) with the k-nearest-neighbours rule.

    Parameters:
    - test_lat, test_lon: coordinates of the point to classify (decimal degrees)
    - df: DataFrame of villages with the columns 'Nom français', 'Nom corse',
      'dept_simple', 'latitude' and 'longitude'
    - k: number of neighbours taking part in the vote (>= 1); if k exceeds the
      number of villages, all villages vote

    Returns:
    - prediction: the predicted department ('2A' or '2B')
    - neighbors: DataFrame of the k nearest villages, sorted by increasing
      distance, with the columns 'village', 'nom_corse', 'departement',
      'latitude', 'longitude' and 'distance' (km)
    - votes: Counter mapping each department to its number of votes

    Raises:
    - ValueError: if k < 1 or if df contains no village

    On a tied vote, the result of Counter.most_common decides; in practice
    this is the department encountered first among the sorted neighbours.
    """
    if k < 1:
        # head(0) would give an empty vote and head(-n) silently drops the
        # n farthest villages instead of failing.
        raise ValueError(f"k doit être >= 1 (reçu : {k})")
    if len(df) == 0:
        raise ValueError("Le jeu de données ne contient aucun village")

    # Step 1: distance from the test point to every village (haversine formula,
    # Earth radius 6371 km), computed on whole columns at once rather than row
    # by row, which matters when thousands of points are classified.
    earth_radius_km = 6371
    village_lat = df['latitude'].to_numpy(dtype=float)
    village_lon = df['longitude'].to_numpy(dtype=float)

    delta_lat = np.radians(village_lat - test_lat)
    delta_lon = np.radians(village_lon - test_lon)
    a = (np.sin(delta_lat / 2) ** 2
         + np.cos(np.radians(test_lat)) * np.cos(np.radians(village_lat))
         * np.sin(delta_lon / 2) ** 2)
    # Clip guards against rounding pushing `a` marginally outside [0, 1].
    distance_km = earth_radius_km * 2 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))

    dist_df = pd.DataFrame({
        'village': df['Nom français'].to_numpy(),
        'nom_corse': df['Nom corse'].to_numpy(),
        'departement': df['dept_simple'].to_numpy(),
        'latitude': village_lat,
        'longitude': village_lon,
        'distance': distance_km,
    })

    # Step 2: keep the k closest villages. A stable sort keeps equidistant
    # villages in dataset order, so results are reproducible.
    neighbors = dist_df.sort_values('distance', kind='mergesort').head(k)

    # Step 3: majority vote among the neighbours.
    votes = Counter(neighbors['departement'])
    prediction = votes.most_common(1)[0][0]

    return prediction, neighbors, votes
|
||||
"\n",
|
# Try the classifier on a point in the centre of Corsica.
test_lat = 42.15
test_lon = 9.05
k = 5

prediction, neighbors, votes = knn_classify(test_lat, test_lon, df, k=k)

# Report the outcome: tested point, predicted department, vote breakdown,
# then the table of the k nearest villages.
print(
    f"\n🎯 Point de test : ({test_lat}, {test_lon})",
    f"\nAvec k={k} :",
    f"Prédiction : {'Corse du Sud (2A)' if prediction == '2A' else 'Haute-Corse (2B)'}",
    f"Votes : {dict(votes)}",
    f"\nLes {k} plus proches voisins :",
    sep="\n",
)
print(neighbors[['village', 'nom_corse', 'departement', 'distance']])
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🗺️ Visualisation avec Folium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
def create_map(test_lat=None, test_lon=None, k=5, show_all_villages=False, show_boundaries=False):
    """
    Build an interactive Folium map of Corsica.

    The village data is read from the notebook-level DataFrame `df`.

    Parameters:
    - test_lat, test_lon: coordinates of the point to classify; when both are
      given, the point, its k nearest villages and the connecting lines are drawn
    - k: number of neighbours used for the classification
    - show_all_villages: draw every village as a small clustered circle
    - show_boundaries: draw the commune boundaries from the 'Zone_geo' column
      (can be slow)

    Returns:
    - the folium.Map object
    """
    # Centre of Corsica.
    center_lat = 42.15
    center_lon = 9.05

    # Base map.
    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=9,
        tiles='OpenStreetMap'
    )

    # Commune boundaries (optional).
    if show_boundaries:
        print("Affichage des frontières des communes...")
        skipped = 0
        for idx, row in df.iterrows():
            try:
                zone_geo = json.loads(row['Zone_geo'])
                color = 'red' if row['dept_simple'] == '2A' else 'blue'

                folium.GeoJson(
                    zone_geo,
                    # `color=color` binds the current value; a plain closure
                    # would see only the colour of the last row.
                    style_function=lambda x, color=color: {
                        'fillColor': color,
                        'color': color,
                        'weight': 1,
                        'fillOpacity': 0.1
                    },
                    tooltip=row['Nom français']
                ).add_to(m)
            except (KeyError, TypeError, ValueError):
                # Missing column, missing value (NaN is not a string) or
                # malformed JSON/GeoJSON: skip this commune but keep count, so
                # a systematic problem does not go unnoticed.
                skipped += 1
        if skipped:
            print(f"{skipped} commune(s) ignorée(s) : géométrie absente ou invalide")

    # All villages (optional).
    if show_all_villages:
        marker_cluster = MarkerCluster().add_to(m)

        for idx, row in df.iterrows():
            color = 'red' if row['dept_simple'] == '2A' else 'blue'
            folium.CircleMarker(
                location=[row['latitude'], row['longitude']],
                radius=3,
                color=color,
                fill=True,
                fillColor=color,
                fillOpacity=0.4,
                popup=f"<b>{row['Nom français']}</b><br>{row['Nom corse']}<br>({row['dept_simple']})"
            ).add_to(marker_cluster)

    # Test point, when provided.
    if test_lat is not None and test_lon is not None:
        # Classification.
        prediction, neighbors, votes = knn_classify(test_lat, test_lon, df, k=k)

        # Marker for the test point, coloured by predicted department.
        color = 'darkred' if prediction == '2A' else 'darkblue'
        dept_name = 'Corse du Sud (2A)' if prediction == '2A' else 'Haute-Corse (2B)'

        folium.Marker(
            location=[test_lat, test_lon],
            popup=f"<b>Point à classifier</b><br>Prédiction : {dept_name}<br>Votes : {dict(votes)}",
            icon=folium.Icon(color=color, icon='star', prefix='fa')
        ).add_to(m)

        # The k nearest neighbours.
        for idx, neighbor in neighbors.iterrows():
            # One marker per neighbour.
            color = 'red' if neighbor['departement'] == '2A' else 'blue'
            folium.Marker(
                location=[neighbor['latitude'], neighbor['longitude']],
                popup=f"<b>{neighbor['village']}</b><br>{neighbor['nom_corse']}<br>{neighbor['departement']}<br>Distance: {neighbor['distance']:.2f} km",
                icon=folium.Icon(color=color, icon='info-sign')
            ).add_to(m)

            # Line between the test point and the neighbour.
            folium.PolyLine(
                locations=[
                    [test_lat, test_lon],
                    [neighbor['latitude'], neighbor['longitude']]
                ],
                color=color,
                weight=2,
                opacity=0.5,
                tooltip=f"{neighbor['distance']:.2f} km"
            ).add_to(m)

    # Legend, injected as a fixed-position HTML overlay.
    legend_html = '''
    <div style="position: fixed; 
    bottom: 50px; right: 50px; width: 220px; height: 130px; 
    background-color: white; border:2px solid grey; z-index:9999; 
    font-size:14px; padding: 10px">
    <p><strong>Légende</strong></p>
    <p><i class="fa fa-circle" style="color:red"></i> Corse du Sud (2A)</p>
    <p><i class="fa fa-circle" style="color:blue"></i> Haute-Corse (2B)</p>
    <p><i class="fa fa-star" style="color:darkred"></i> Point à classifier</p>
    </div>
    '''
    m.get_root().html.add_child(folium.Element(legend_html))

    return m

# Build the map for the test point.
map_with_test = create_map(test_lat=42.15, test_lon=9.05, k=5, show_all_villages=False)
map_with_test
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🔬 Expérimentation : Influence de k"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Compare the prediction obtained for several values of k.
test_point = (42.15, 9.05)  # point in the centre of Corsica

print(f"Point testé : {test_point}\n")
print("{:<5} {:<15} {:<10} {:<10}".format('k', 'Prédiction', 'Votes 2A', 'Votes 2B'))
print("-" * 50)

for k in [1, 3, 5, 7, 9, 15, 21]:
    prediction, neighbors, votes = knn_classify(*test_point, df, k=k)
    votes_2a = votes.get('2A', 0)
    votes_2b = votes.get('2B', 0)
    if prediction == '2A':
        dept_name = 'Corse du Sud'
    else:
        dept_name = 'Haute-Corse'
    print("{:<5} {:<15} {:<10} {:<10}".format(k, dept_name, votes_2a, votes_2b))
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🎮 Mode interactif : Testez vos propres points !\n",
|
||||
"\n",
|
||||
"Modifiez les coordonnées ci-dessous pour tester différents points de la Corse.\n",
|
||||
"\n",
|
||||
"**Quelques repères géographiques :**\n",
|
||||
"- Ajaccio : (41.9267, 8.7369)\n",
|
||||
"- Bastia : (42.7028, 9.4500)\n",
|
||||
"- Corte : (42.3062, 9.1509)\n",
|
||||
"- Porto-Vecchio : (41.5914, 9.2795)\n",
|
||||
"- Calvi : (42.5679, 8.7575)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# === EDIT THESE VALUES ===
test_latitude = 42.3    # between 41.3 (south) and 43.0 (north)
test_longitude = 9.15   # between 8.5 (west) and 9.5 (east)
k_value = 7             # number of neighbours
# =========================

prediction, neighbors, votes = knn_classify(test_latitude, test_longitude, df, k=k_value)

# Text summary: point, k, prediction, votes, then the neighbour table.
print(
    f"📍 Point : ({test_latitude}, {test_longitude})",
    f"🔢 k = {k_value}",
    f"\n🎯 Prédiction : {'Corse du Sud (2A)' if prediction == '2A' else 'Haute-Corse (2B)'}",
    f"\n📊 Votes : {dict(votes)}",
    f"\n🏘️ Les {k_value} plus proches villages :",
    sep="\n",
)
neighbor_table = neighbors[['village', 'nom_corse', 'departement', 'distance']]
print(neighbor_table.to_string(index=False))
del neighbor_table

# Show the map for this point.
map_interactive = create_map(
    test_lat=test_latitude,
    test_lon=test_longitude,
    k=k_value,
    show_all_villages=False,
)
map_interactive
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🌍 Carte complète avec tous les villages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Map of every village in Corsica, without a test point.
# Note: this cell can take a few seconds to run.

map_all = create_map(
    show_all_villages=True,
    show_boundaries=False,
)
map_all
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🗺️ Carte avec frontières des communes (BONUS)\n",
|
||||
"\n",
|
||||
"Cette cellule affiche les frontières réelles des communes. **Attention : cela peut prendre du temps à charger !**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Carte avec frontières - peut être lent !\n",
|
||||
"# Décommentez la ligne suivante pour l'exécuter\n",
|
||||
"# map_boundaries = create_map(test_lat=42.15, test_lon=9.05, k=5, show_boundaries=True)\n",
|
||||
"# map_boundaries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 📈 Visualisation de la frontière de décision"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Classify every point of a regular grid to reveal the k-NN "border".

def create_decision_boundary_map(k=5, grid_resolution=40):
    """
    Build a Folium map showing the decision regions of k-NN.

    A grid of grid_resolution x grid_resolution points covering Corsica is
    classified with knn_classify (using the notebook-level DataFrame `df`);
    each grid point is drawn as a pale red (2A) or pale blue (2B) disc, and
    the villages themselves are drawn on top.

    Parameters:
    - k: number of neighbours used for each classification
    - grid_resolution: number of grid points along each axis

    Returns:
    - the folium.Map object
    """
    # Bounding box of Corsica.
    south, north = 41.3, 43.0
    west, east = 8.5, 9.6

    # Regular grid over the bounding box.
    grid_lats = np.linspace(south, north, grid_resolution)
    grid_lons = np.linspace(west, east, grid_resolution)

    m = folium.Map(location=[42.15, 9.05], zoom_start=8, tiles='OpenStreetMap')

    # Classify each grid point, latitude-major, reporting progress every 100 points.
    print(f"Classification d'une grille de {grid_resolution}x{grid_resolution} points...")
    total = len(grid_lats) * len(grid_lons)
    grid_points = ((lat, lon) for lat in grid_lats for lon in grid_lons)

    for count, (lat, lon) in enumerate(grid_points, start=1):
        prediction, _, _ = knn_classify(lat, lon, df, k=k)
        if prediction == '2A':
            color = '#ffcccc'
        else:
            color = '#ccccff'

        disc = folium.CircleMarker(
            location=[lat, lon],
            radius=4,
            color=color,
            fill=True,
            fillColor=color,
            fillOpacity=0.3,
            weight=0,
        )
        disc.add_to(m)

        if count % 100 == 0:
            print(f" {count}/{total} points traités ({100*count/total:.1f}%)")

    print("Terminé !")

    # Overlay the villages, coloured by their true department.
    for _, row in df.iterrows():
        if row['dept_simple'] == '2A':
            color = 'red'
        else:
            color = 'blue'
        village_dot = folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=2,
            color=color,
            fill=True,
            fillColor=color,
            fillOpacity=0.8,
            popup=row['Nom français'],
        )
        village_dot.add_to(m)

    return m

# Build the map (lower grid_resolution if this is too slow).
print("Création de la carte de frontière avec k=5...")
print("Note : cela peut prendre 1-2 minutes...")
boundary_map = create_decision_boundary_map(k=5, grid_resolution=30)
boundary_map
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 💡 Validation de l'algorithme"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# BONUS : Validation croisée\n",
|
||||
"# Tester la précision en utilisant les villages eux-mêmes\n",
|
||||
"\n",
|
def cross_validation(df, k=5, sample_size=100):
    """
    Estimate the accuracy of k-NN on a random sample of villages.

    Each sampled village is removed from the dataset, classified from the
    remaining villages with knn_classify, and the prediction is compared with
    its true department (a leave-one-out check on the sample). The sample is
    drawn with a fixed seed, so repeated calls give the same result.

    Parameters:
    - df: DataFrame of villages (columns 'Nom français', 'dept_simple',
      'latitude', 'longitude', plus those required by knn_classify)
    - k: number of neighbours
    - sample_size: maximum number of villages to test

    Returns:
    - accuracy: percentage of correct predictions (0.0 when nothing was tested)
    - correct: number of correct predictions
    - total: number of villages tested
    - errors: list of dicts ('village', 'vrai_dept', 'prediction', 'votes')
      describing each misclassified village
    """
    n_samples = min(sample_size, len(df))
    if n_samples == 0:
        # Nothing to test (empty dataset or sample_size=0): report an empty
        # result instead of dividing by zero below.
        return 0.0, 0, 0, []

    # Random sample, reproducible thanks to the fixed seed.
    sample = df.sample(n=n_samples, random_state=42)

    correct = 0
    total = 0
    errors = []

    for idx, row in sample.iterrows():
        # Dataset without this village. drop() works on index labels, so the
        # index of df is assumed to be unique.
        df_without = df.drop(idx)

        # Classify the village from its neighbours only.
        prediction, neighbors, votes = knn_classify(
            row['latitude'],
            row['longitude'],
            df_without,
            k=k
        )

        if prediction == row['dept_simple']:
            correct += 1
        else:
            errors.append({
                'village': row['Nom français'],
                'vrai_dept': row['dept_simple'],
                'prediction': prediction,
                'votes': dict(votes)
            })
        total += 1

    accuracy = (correct / total) * 100
    return accuracy, correct, total, errors
|
||||
"\n",
|
print("Test de précision de l'algorithme k-NN...\n")
print("Validation croisée : chaque village est classifié en fonction de ses voisins.\n")

# Keep the misclassified villages for each k, so the k=5 examples shown below
# reuse the loop's result instead of running the whole validation a second time.
errors_by_k = {}
for k in [1, 3, 5, 10, 15]:
    accuracy, correct, total, errors = cross_validation(df, k=k, sample_size=100)
    errors_by_k[k] = errors
    print(f"k={k:2d} : {accuracy:.1f}% de précision ({correct}/{total} corrects)")

# Show a few misclassified villages for k=5.
print("\n📋 Exemples d'erreurs avec k=5 :")
errors_k5 = errors_by_k[5]
if errors_k5:
    for error in errors_k5[:5]:
        print(f" • {error['village']} : prédit {error['prediction']} (vrai: {error['vrai_dept']}) - votes: {error['votes']}")
else:
    print(" Aucune erreur !")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🎓 Questions de réflexion\n",
|
||||
"\n",
|
||||
"1. **Influence de k** : Comment la prédiction change-t-elle avec différentes valeurs de k ?\n",
|
||||
"\n",
|
||||
"2. **Points frontières** : Trouvez des coordonnées où la classification est ambiguë (votes proches).\n",
|
||||
"\n",
|
||||
"3. **Zones problématiques** : Où se situent les villages difficiles à classifier correctement ?\n",
|
||||
"\n",
|
||||
"4. **Validité géographique** : Cette méthode respecte-t-elle toujours les vraies frontières administratives ?\n",
|
||||
"\n",
|
||||
"5. **Améliorations** : Comment pourrait-on améliorer l'algorithme ?\n",
|
||||
" - Pondération par distance inverse\n",
|
||||
" - Prise en compte d'autres critères (altitude, population...)\n",
|
||||
" - k adaptatif selon la densité de villages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 💡 Exercices supplémentaires\n",
|
||||
"\n",
|
||||
"1. **Trouver la frontière** : Trouvez des points sur la \"frontière\" k-NN (là où un changement de k change la classification)\n",
|
||||
"\n",
|
||||
"2. **Villages isolés** : Identifiez les villages dont le département diffère de leurs k plus proches voisins\n",
|
||||
"\n",
|
||||
"3. **Pondération** : Implémentez une version pondérée où les villages plus proches ont plus d'influence\n",
|
||||
"\n",
|
||||
"4. **Comparaison** : Comparez la frontière k-NN avec la vraie frontière administrative"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# EXERCICE : Villages \"anomaliques\"\n",
|
||||
"# Trouver les villages dont les k plus proches voisins sont majoritairement de l'autre département\n",
|
||||
"\n",
|
def find_anomalous_villages(df, k=5):
    """
    Find the villages that k-NN would assign to the wrong department.

    Each village is removed from the dataset and classified from the remaining
    villages with knn_classify; it is reported when the prediction differs
    from its true department.

    Parameters:
    - df: DataFrame of villages (columns 'Nom français', 'Nom corse',
      'dept_simple', 'latitude', 'longitude')
    - k: number of neighbours

    Returns:
    - DataFrame with one row per misclassified village and the columns
      'village', 'nom_corse', 'vrai_dept', 'prediction', 'votes_2A',
      'votes_2B', 'latitude', 'longitude'. The columns are present even when
      no village is misclassified, so callers can always select them.
    """
    columns = [
        'village', 'nom_corse', 'vrai_dept', 'prediction',
        'votes_2A', 'votes_2B', 'latitude', 'longitude',
    ]
    anomalies = []

    for idx, row in df.iterrows():
        # Dataset without this village (drop() works on index labels).
        df_without = df.drop(idx)

        # Classify the village from its neighbours only.
        prediction, neighbors, votes = knn_classify(
            row['latitude'],
            row['longitude'],
            df_without,
            k=k
        )

        # Keep it when the prediction disagrees with the true department.
        if prediction != row['dept_simple']:
            anomalies.append({
                'village': row['Nom français'],
                'nom_corse': row['Nom corse'],
                'vrai_dept': row['dept_simple'],
                'prediction': prediction,
                'votes_2A': votes.get('2A', 0),
                'votes_2B': votes.get('2B', 0),
                'latitude': row['latitude'],
                'longitude': row['longitude']
            })

    # Explicit columns: an empty list would otherwise give a column-less frame.
    return pd.DataFrame(anomalies, columns=columns)
|
||||
"\n",
|
print("Recherche des villages 'anomaliques' avec k=5...\n")
anomalies_df = find_anomalous_villages(df, k=5)

print(f"Nombre de villages anomaliques : {len(anomalies_df)}")

# Only list and map the villages when there is at least one: selecting columns
# from an empty result would fail, and an empty map adds nothing.
if len(anomalies_df) > 0:
    print(f"\nVillages qui seraient classifiés dans le mauvais département :\n")
    print(anomalies_df[['village', 'nom_corse', 'vrai_dept', 'prediction', 'votes_2A', 'votes_2B']])

    # Show these villages on a map.
    m_anomalies = folium.Map(location=[42.15, 9.05], zoom_start=9)

    for idx, row in anomalies_df.iterrows():
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            popup=f"<b>{row['village']}</b><br>Vrai: {row['vrai_dept']}<br>Prédit: {row['prediction']}<br>Votes: {row['votes_2A']}-{row['votes_2B']}",
            icon=folium.Icon(color='orange', icon='exclamation-triangle', prefix='fa')
        ).add_to(m_anomalies)

    # display() is provided by IPython inside the notebook.
    display(m_anomalies)
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue