knnCorsica/activite1/knn_corse_interactive_v2.ipynb

744 lines
27 KiB
Text
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 🗺️ Classification k-NN : Haute-Corse ou Corse du Sud ?\n",
"\n",
"## Objectif\n",
"Utiliser l'algorithme des **k plus proches voisins (k-NN)** pour déterminer si un point de la carte de Corse se situe en **Haute-Corse (2B)** ou en **Corse du Sud (2A)**, en se basant sur les villages les plus proches.\n",
"\n",
"## Principe\n",
"1. On charge les données des villages corses avec leurs coordonnées GPS et leur département\n",
"2. On choisit un point sur la carte\n",
"3. On calcule les distances entre ce point et tous les villages\n",
"4. On identifie les k villages les plus proches\n",
"5. On vote : le département majoritaire parmi ces k villages devient la prédiction"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 📦 Installation et imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Installation des bibliothèques nécessaires (si besoin)\n",
"import sys\n",
"!{sys.executable} -m pip install folium pandas numpy -q"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import folium\n",
"from folium.plugins import MarkerCluster\n",
"import math\n",
"import json\n",
"from collections import Counter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 📊 Chargement des données"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Charger le fichier CSV\n",
"# Remplacez 'villages_corse.csv' par le chemin de votre fichier\n",
"df = pd.read_csv('villages_corse.csv', sep='\\t', encoding='utf-8')\n",
"\n",
"# Afficher les premières lignes\n",
"print(f\"Nombre de villages : {len(df)}\")\n",
"print(f\"\\nColonnes : {list(df.columns)}\")\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🔧 Préparation des données"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def parse_coordinates(point_geo_str):\n",
" \"\"\"\n",
" Parse la colonne Point_Geo pour extraire latitude et longitude.\n",
" Format attendu : \"latitude, longitude\"\n",
" Exemple : \"41.984099158, 8.798384636\"\n",
" \"\"\"\n",
" try:\n",
" # Séparer par la virgule\n",
" parts = str(point_geo_str).split(',')\n",
" lat = float(parts[0].strip())\n",
" lon = float(parts[1].strip())\n",
" return lat, lon\n",
" except Exception as e:\n",
" print(f\"Erreur parsing: {point_geo_str} - {e}\")\n",
" return None, None\n",
"\n",
"# Extraire les coordonnées\n",
"df[['latitude', 'longitude']] = df['Point_Geo'].apply(\n",
" lambda x: pd.Series(parse_coordinates(x))\n",
")\n",
"\n",
"# Supprimer les lignes sans coordonnées valides\n",
"df = df.dropna(subset=['latitude', 'longitude'])\n",
"\n",
"# Simplifier les noms de départements\n",
"df['dept_simple'] = df['Code Département'].apply(lambda x: '2A' if str(x) == '2A' else '2B')\n",
"\n",
"print(f\"Villages avec coordonnées valides : {len(df)}\")\n",
"print(f\"\\nRépartition par département :\")\n",
"print(df['dept_simple'].value_counts())\n",
"\n",
"df[['Nom français', 'dept_simple', 'latitude', 'longitude']].head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 📏 Fonction de calcul de distance\n",
"\n",
"Nous utilisons la **formule de Haversine** pour calculer la distance entre deux points GPS sur la surface de la Terre."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def haversine_distance(lat1, lon1, lat2, lon2):\n",
" \"\"\"\n",
" Calcule la distance en kilomètres entre deux points GPS.\n",
" Formule de Haversine.\n",
" \"\"\"\n",
" R = 6371 # Rayon de la Terre en km\n",
" \n",
" # Conversion en radians\n",
" lat1_rad = math.radians(lat1)\n",
" lat2_rad = math.radians(lat2)\n",
" delta_lat = math.radians(lat2 - lat1)\n",
" delta_lon = math.radians(lon2 - lon1)\n",
" \n",
" # Formule de Haversine\n",
" a = math.sin(delta_lat/2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon/2)**2\n",
" c = 2 * math.asin(math.sqrt(a))\n",
" \n",
" return R * c\n",
"\n",
"# Test de la fonction\n",
"# Distance entre Ajaccio (41.9267, 8.7369) et Bastia (42.7028, 9.4500)\n",
"dist_test = haversine_distance(41.9267, 8.7369, 42.7028, 9.4500)\n",
"print(f\"Distance Ajaccio-Bastia : {dist_test:.1f} km\")\n",
"\n",
"# Test avec Afa et Alando (vos exemples)\n",
"afa = df[df['Nom français'] == 'Afa'].iloc[0]\n",
"alando = df[df['Nom français'] == 'Alando'].iloc[0]\n",
"dist_afa_alando = haversine_distance(afa['latitude'], afa['longitude'], \n",
" alando['latitude'], alando['longitude'])\n",
"print(f\"Distance Afa-Alando : {dist_afa_alando:.1f} km\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🎯 Algorithme k-NN"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def knn_classify(test_lat, test_lon, df, k=5):\n",
" \"\"\"\n",
" Classifie un point (test_lat, test_lon) en utilisant k-NN.\n",
" \n",
" Retourne :\n",
" - prediction : le département prédit ('2A' ou '2B')\n",
" - neighbors : DataFrame des k plus proches voisins\n",
" - votes : dictionnaire des votes\n",
" \"\"\"\n",
" # Calculer les distances pour tous les villages\n",
" distances = []\n",
" for idx, row in df.iterrows():\n",
" dist = haversine_distance(test_lat, test_lon, row['latitude'], row['longitude'])\n",
" distances.append({\n",
" 'village': row['Nom français'],\n",
" 'nom_corse': row['Nom corse'],\n",
" 'departement': row['dept_simple'],\n",
" 'latitude': row['latitude'],\n",
" 'longitude': row['longitude'],\n",
" 'distance': dist\n",
" })\n",
" \n",
" # Créer un DataFrame et trier par distance\n",
" dist_df = pd.DataFrame(distances)\n",
" dist_df = dist_df.sort_values('distance')\n",
" \n",
" # Sélectionner les k plus proches\n",
" neighbors = dist_df.head(k)\n",
" \n",
" # Voter\n",
" votes = Counter(neighbors['departement'])\n",
" prediction = votes.most_common(1)[0][0]\n",
" \n",
" return prediction, neighbors, votes\n",
"\n",
"# Test de l'algorithme avec un point au centre de la Corse\n",
"test_lat, test_lon = 42.15, 9.05\n",
"k = 5\n",
"\n",
"prediction, neighbors, votes = knn_classify(test_lat, test_lon, df, k=k)\n",
"\n",
"print(f\"\\n🎯 Point de test : ({test_lat}, {test_lon})\")\n",
"print(f\"\\nAvec k={k} :\")\n",
"print(f\"Prédiction : {'Corse du Sud (2A)' if prediction == '2A' else 'Haute-Corse (2B)'}\")\n",
"print(f\"Votes : {dict(votes)}\")\n",
"print(f\"\\nLes {k} plus proches voisins :\")\n",
"print(neighbors[['village', 'nom_corse', 'departement', 'distance']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🗺️ Visualisation avec Folium"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def create_map(test_lat=None, test_lon=None, k=5, show_all_villages=False, show_boundaries=False):\n",
" \"\"\"\n",
" Crée une carte interactive avec Folium.\n",
" \n",
" Paramètres:\n",
" - test_lat, test_lon: coordonnées du point à tester\n",
" - k: nombre de voisins\n",
" - show_all_villages: afficher tous les villages\n",
" - show_boundaries: afficher les frontières des communes (peut être lent)\n",
" \"\"\"\n",
" # Centre de la Corse\n",
" center_lat = 42.15\n",
" center_lon = 9.05\n",
" \n",
" # Créer la carte\n",
" m = folium.Map(\n",
" location=[center_lat, center_lon],\n",
" zoom_start=9,\n",
" tiles='OpenStreetMap'\n",
" )\n",
" \n",
" # Afficher les frontières des communes (optionnel)\n",
" if show_boundaries:\n",
" print(\"Affichage des frontières des communes...\")\n",
" for idx, row in df.iterrows():\n",
" try:\n",
" zone_geo = json.loads(row['Zone_geo'])\n",
" color = 'red' if row['dept_simple'] == '2A' else 'blue'\n",
" \n",
" folium.GeoJson(\n",
" zone_geo,\n",
" style_function=lambda x, color=color: {\n",
" 'fillColor': color,\n",
" 'color': color,\n",
" 'weight': 1,\n",
" 'fillOpacity': 0.1\n",
" },\n",
" tooltip=row['Nom français']\n",
" ).add_to(m)\n",
" except:\n",
" pass\n",
" \n",
" # Afficher tous les villages (optionnel)\n",
" if show_all_villages:\n",
" marker_cluster = MarkerCluster().add_to(m)\n",
" \n",
" for idx, row in df.iterrows():\n",
" color = 'red' if row['dept_simple'] == '2A' else 'blue'\n",
" folium.CircleMarker(\n",
" location=[row['latitude'], row['longitude']],\n",
" radius=3,\n",
" color=color,\n",
" fill=True,\n",
" fillColor=color,\n",
" fillOpacity=0.4,\n",
" popup=f\"<b>{row['Nom français']}</b><br>{row['Nom corse']}<br>({row['dept_simple']})\"\n",
" ).add_to(marker_cluster)\n",
" \n",
" # Si un point de test est fourni\n",
" if test_lat is not None and test_lon is not None:\n",
" # Classification\n",
" prediction, neighbors, votes = knn_classify(test_lat, test_lon, df, k=k)\n",
" \n",
" # Marqueur pour le point de test\n",
" color = 'darkred' if prediction == '2A' else 'darkblue'\n",
" dept_name = 'Corse du Sud (2A)' if prediction == '2A' else 'Haute-Corse (2B)'\n",
" \n",
" folium.Marker(\n",
" location=[test_lat, test_lon],\n",
" popup=f\"<b>Point à classifier</b><br>Prédiction : {dept_name}<br>Votes : {dict(votes)}\",\n",
" icon=folium.Icon(color=color, icon='star', prefix='fa')\n",
" ).add_to(m)\n",
" \n",
" # Afficher les k plus proches voisins\n",
" for idx, neighbor in neighbors.iterrows():\n",
" # Marqueur pour chaque voisin\n",
" color = 'red' if neighbor['departement'] == '2A' else 'blue'\n",
" folium.Marker(\n",
" location=[neighbor['latitude'], neighbor['longitude']],\n",
" popup=f\"<b>{neighbor['village']}</b><br>{neighbor['nom_corse']}<br>{neighbor['departement']}<br>Distance: {neighbor['distance']:.2f} km\",\n",
" icon=folium.Icon(color=color, icon='info-sign')\n",
" ).add_to(m)\n",
" \n",
" # Ligne entre le point test et le voisin\n",
" folium.PolyLine(\n",
" locations=[\n",
" [test_lat, test_lon],\n",
" [neighbor['latitude'], neighbor['longitude']]\n",
" ],\n",
" color=color,\n",
" weight=2,\n",
" opacity=0.5,\n",
" tooltip=f\"{neighbor['distance']:.2f} km\"\n",
" ).add_to(m)\n",
" \n",
" # Légende\n",
" legend_html = '''\n",
" <div style=\"position: fixed; \n",
" bottom: 50px; right: 50px; width: 220px; height: 130px; \n",
" background-color: white; border:2px solid grey; z-index:9999; \n",
" font-size:14px; padding: 10px\">\n",
" <p><strong>Légende</strong></p>\n",
" <p><i class=\"fa fa-circle\" style=\"color:red\"></i> Corse du Sud (2A)</p>\n",
" <p><i class=\"fa fa-circle\" style=\"color:blue\"></i> Haute-Corse (2B)</p>\n",
" <p><i class=\"fa fa-star\" style=\"color:darkred\"></i> Point à classifier</p>\n",
" </div>\n",
" '''\n",
" m.get_root().html.add_child(folium.Element(legend_html))\n",
" \n",
" return m\n",
"\n",
"# Créer la carte avec le point de test\n",
"map_with_test = create_map(test_lat=42.15, test_lon=9.05, k=5, show_all_villages=False)\n",
"map_with_test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🔬 Expérimentation : Influence de k"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test avec différentes valeurs de k\n",
"test_point = (42.15, 9.05) # Point au centre de la Corse\n",
"\n",
"print(f\"Point testé : {test_point}\\n\")\n",
"print(f\"{'k':<5} {'Prédiction':<15} {'Votes 2A':<10} {'Votes 2B':<10}\")\n",
"print(\"-\" * 50)\n",
"\n",
"for k in [1, 3, 5, 7, 9, 15, 21]:\n",
" prediction, neighbors, votes = knn_classify(test_point[0], test_point[1], df, k=k)\n",
" votes_2a = votes.get('2A', 0)\n",
" votes_2b = votes.get('2B', 0)\n",
" dept_name = 'Corse du Sud' if prediction == '2A' else 'Haute-Corse'\n",
" print(f\"{k:<5} {dept_name:<15} {votes_2a:<10} {votes_2b:<10}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🎮 Mode interactif : Testez vos propres points !\n",
"\n",
"Modifiez les coordonnées ci-dessous pour tester différents points de la Corse.\n",
"\n",
"**Quelques repères géographiques :**\n",
"- Ajaccio : (41.9267, 8.7369)\n",
"- Bastia : (42.7028, 9.4500)\n",
"- Corte : (42.3062, 9.1509)\n",
"- Porto-Vecchio : (41.5914, 9.2795)\n",
"- Calvi : (42.5679, 8.7575)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# === MODIFIEZ CES VALEURS ===\n",
"test_latitude = 42.3 # Entre 41.3 (sud) et 43.0 (nord)\n",
"test_longitude = 9.15 # Entre 8.5 (ouest) et 9.5 (est)\n",
"k_value = 7 # Nombre de voisins\n",
"# =============================\n",
"\n",
"prediction, neighbors, votes = knn_classify(test_latitude, test_longitude, df, k=k_value)\n",
"\n",
"print(f\"📍 Point : ({test_latitude}, {test_longitude})\")\n",
"print(f\"🔢 k = {k_value}\")\n",
"print(f\"\\n🎯 Prédiction : {'Corse du Sud (2A)' if prediction == '2A' else 'Haute-Corse (2B)'}\")\n",
"print(f\"\\n📊 Votes : {dict(votes)}\")\n",
"print(f\"\\n🏘 Les {k_value} plus proches villages :\")\n",
"print(neighbors[['village', 'nom_corse', 'departement', 'distance']].to_string(index=False))\n",
"\n",
"# Afficher la carte\n",
"map_interactive = create_map(test_latitude, test_longitude, k=k_value, show_all_villages=False)\n",
"map_interactive"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🌍 Carte complète avec tous les villages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Afficher tous les villages de Corse\n",
"# Note : cette cellule peut prendre quelques secondes à s'exécuter\n",
"\n",
"map_all = create_map(show_all_villages=True, show_boundaries=False)\n",
"map_all"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🗺️ Carte avec frontières des communes (BONUS)\n",
"\n",
"Cette cellule affiche les frontières réelles des communes. **Attention : cela peut prendre du temps à charger !**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Carte avec frontières - peut être lent !\n",
"# Décommentez la ligne suivante pour l'exécuter\n",
"# map_boundaries = create_map(test_lat=42.15, test_lon=9.05, k=5, show_boundaries=True)\n",
"# map_boundaries"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 📈 Visualisation de la frontière de décision"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Créer une grille de points et classifier chacun\n",
"# Cela permet de visualiser la \"frontière\" selon k-NN\n",
"\n",
"def create_decision_boundary_map(k=5, grid_resolution=40):\n",
" \"\"\"\n",
" Crée une carte montrant la frontière de décision de k-NN.\n",
" \"\"\"\n",
" # Limites de la Corse\n",
" lat_min, lat_max = 41.3, 43.0\n",
" lon_min, lon_max = 8.5, 9.6\n",
" \n",
" # Créer une grille\n",
" lats = np.linspace(lat_min, lat_max, grid_resolution)\n",
" lons = np.linspace(lon_min, lon_max, grid_resolution)\n",
" \n",
" m = folium.Map(\n",
" location=[42.15, 9.05],\n",
" zoom_start=8,\n",
" tiles='OpenStreetMap'\n",
" )\n",
" \n",
" # Classifier chaque point de la grille\n",
" print(f\"Classification d'une grille de {grid_resolution}x{grid_resolution} points...\")\n",
" total = len(lats) * len(lons)\n",
" count = 0\n",
" \n",
" for lat in lats:\n",
" for lon in lons:\n",
" prediction, _, _ = knn_classify(lat, lon, df, k=k)\n",
" color = '#ffcccc' if prediction == '2A' else '#ccccff'\n",
" \n",
" folium.CircleMarker(\n",
" location=[lat, lon],\n",
" radius=4,\n",
" color=color,\n",
" fill=True,\n",
" fillColor=color,\n",
" fillOpacity=0.3,\n",
" weight=0\n",
" ).add_to(m)\n",
" \n",
" count += 1\n",
" if count % 100 == 0:\n",
" print(f\" {count}/{total} points traités ({100*count/total:.1f}%)\")\n",
" \n",
" print(\"Terminé !\")\n",
" \n",
" # Ajouter les villages\n",
" for idx, row in df.iterrows():\n",
" color = 'red' if row['dept_simple'] == '2A' else 'blue'\n",
" folium.CircleMarker(\n",
" location=[row['latitude'], row['longitude']],\n",
" radius=2,\n",
" color=color,\n",
" fill=True,\n",
" fillColor=color,\n",
" fillOpacity=0.8,\n",
" popup=row['Nom français']\n",
" ).add_to(m)\n",
" \n",
" return m\n",
"\n",
"# Créer la carte (réduire grid_resolution si c'est trop lent)\n",
"print(f\"Création de la carte de frontière avec k=5...\")\n",
"print(\"Note : cela peut prendre 1-2 minutes...\")\n",
"boundary_map = create_decision_boundary_map(k=5, grid_resolution=30)\n",
"boundary_map"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 💡 Validation de l'algorithme"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# BONUS : Validation croisée\n",
"# Tester la précision en utilisant les villages eux-mêmes\n",
"\n",
"def cross_validation(df, k=5, sample_size=100):\n",
" \"\"\"\n",
" Teste la précision de k-NN en utilisant un échantillon de villages.\n",
" \"\"\"\n",
" # Prendre un échantillon aléatoire\n",
" sample = df.sample(n=min(sample_size, len(df)), random_state=42)\n",
" \n",
" correct = 0\n",
" total = 0\n",
" errors = []\n",
" \n",
" for idx, row in sample.iterrows():\n",
" # Créer un dataset sans ce village\n",
" df_without = df.drop(idx)\n",
" \n",
" # Classifier ce village\n",
" prediction, neighbors, votes = knn_classify(\n",
" row['latitude'], \n",
" row['longitude'], \n",
" df_without, \n",
" k=k\n",
" )\n",
" \n",
" if prediction == row['dept_simple']:\n",
" correct += 1\n",
" else:\n",
" errors.append({\n",
" 'village': row['Nom français'],\n",
" 'vrai_dept': row['dept_simple'],\n",
" 'prediction': prediction,\n",
" 'votes': dict(votes)\n",
" })\n",
" total += 1\n",
" \n",
" accuracy = (correct / total) * 100\n",
" return accuracy, correct, total, errors\n",
"\n",
"print(\"Test de précision de l'algorithme k-NN...\\n\")\n",
"print(\"Validation croisée : chaque village est classifié en fonction de ses voisins.\\n\")\n",
"\n",
"for k in [1, 3, 5, 10, 15]:\n",
" accuracy, correct, total, errors = cross_validation(df, k=k, sample_size=100)\n",
" print(f\"k={k:2d} : {accuracy:.1f}% de précision ({correct}/{total} corrects)\")\n",
"\n",
"# Afficher quelques erreurs pour k=5\n",
"print(\"\\n📋 Exemples d'erreurs avec k=5 :\")\n",
"_, _, _, errors_k5 = cross_validation(df, k=5, sample_size=100)\n",
"if errors_k5:\n",
" for error in errors_k5[:5]:\n",
" print(f\" • {error['village']} : prédit {error['prediction']} (vrai: {error['vrai_dept']}) - votes: {error['votes']}\")\n",
"else:\n",
" print(\" Aucune erreur !\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 🎓 Questions de réflexion\n",
"\n",
"1. **Influence de k** : Comment la prédiction change-t-elle avec différentes valeurs de k ?\n",
"\n",
"2. **Points frontières** : Trouvez des coordonnées où la classification est ambiguë (votes proches).\n",
"\n",
"3. **Zones problématiques** : Où se situent les villages difficiles à classifier correctement ?\n",
"\n",
"4. **Validité géographique** : Cette méthode respecte-t-elle toujours les vraies frontières administratives ?\n",
"\n",
"5. **Améliorations** : Comment pourrait-on améliorer l'algorithme ?\n",
" - Pondération par distance inverse\n",
" - Prise en compte d'autres critères (altitude, population...)\n",
" - k adaptatif selon la densité de villages"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 💡 Exercices supplémentaires\n",
"\n",
"1. **Trouver la frontière** : Trouvez des points sur la \"frontière\" k-NN (là où un changement de k change la classification)\n",
"\n",
"2. **Villages isolés** : Identifiez les villages dont le département diffère de leurs k plus proches voisins\n",
"\n",
"3. **Pondération** : Implémentez une version pondérée où les villages plus proches ont plus d'influence\n",
"\n",
"4. **Comparaison** : Comparez la frontière k-NN avec la vraie frontière administrative"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# EXERCICE : Villages \"anomaliques\"\n",
"# Trouver les villages dont les k plus proches voisins sont majoritairement de l'autre département\n",
"\n",
"def find_anomalous_villages(df, k=5):\n",
" \"\"\"\n",
" Trouve les villages qui seraient mal classifiés par k-NN.\n",
" \"\"\"\n",
" anomalies = []\n",
" \n",
" for idx, row in df.iterrows():\n",
" # Créer un dataset sans ce village\n",
" df_without = df.drop(idx)\n",
" \n",
" # Classifier ce village\n",
" prediction, neighbors, votes = knn_classify(\n",
" row['latitude'], \n",
" row['longitude'], \n",
" df_without, \n",
" k=k\n",
" )\n",
" \n",
" # Si la prédiction ne correspond pas au vrai département\n",
" if prediction != row['dept_simple']:\n",
" anomalies.append({\n",
" 'village': row['Nom français'],\n",
" 'nom_corse': row['Nom corse'],\n",
" 'vrai_dept': row['dept_simple'],\n",
" 'prediction': prediction,\n",
" 'votes_2A': votes.get('2A', 0),\n",
" 'votes_2B': votes.get('2B', 0),\n",
" 'latitude': row['latitude'],\n",
" 'longitude': row['longitude']\n",
" })\n",
" \n",
" return pd.DataFrame(anomalies)\n",
"\n",
"print(\"Recherche des villages 'anomaliques' avec k=5...\\n\")\n",
"anomalies_df = find_anomalous_villages(df, k=5)\n",
"\n",
"print(f\"Nombre de villages anomaliques : {len(anomalies_df)}\")\n",
"print(f\"\\nVillages qui seraient classifiés dans le mauvais département :\\n\")\n",
"print(anomalies_df[['village', 'nom_corse', 'vrai_dept', 'prediction', 'votes_2A', 'votes_2B']])\n",
"\n",
"# Afficher ces villages sur une carte\n",
"if len(anomalies_df) > 0:\n",
" m_anomalies = folium.Map(location=[42.15, 9.05], zoom_start=9)\n",
" \n",
" for idx, row in anomalies_df.iterrows():\n",
" folium.Marker(\n",
" location=[row['latitude'], row['longitude']],\n",
" popup=f\"<b>{row['village']}</b><br>Vrai: {row['vrai_dept']}<br>Prédit: {row['prediction']}<br>Votes: {row['votes_2A']}-{row['votes_2B']}\",\n",
" icon=folium.Icon(color='orange', icon='exclamation-triangle', prefix='fa')\n",
" ).add_to(m_anomalies)\n",
" \n",
" display(m_anomalies)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}