From 7236fef0d789db7796013df738752d7e643e6687 Mon Sep 17 00:00:00 2001 From: lprik Date: Thu, 23 Oct 2025 13:50:58 +0000 Subject: [PATCH] Supprimer activite1/knn_corse_interactive_v2.ipynb --- activite1/knn_corse_interactive_v2.ipynb | 744 ----------------------- 1 file changed, 744 deletions(-) delete mode 100644 activite1/knn_corse_interactive_v2.ipynb diff --git a/activite1/knn_corse_interactive_v2.ipynb b/activite1/knn_corse_interactive_v2.ipynb deleted file mode 100644 index a5ac185..0000000 --- a/activite1/knn_corse_interactive_v2.ipynb +++ /dev/null @@ -1,744 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 🗺️ Classification k-NN : Haute-Corse ou Corse du Sud ?\n", - "\n", - "## Objectif\n", - "Utiliser l'algorithme des **k plus proches voisins (k-NN)** pour déterminer si un point de la carte de Corse se situe en **Haute-Corse (2B)** ou en **Corse du Sud (2A)**, en se basant sur les villages les plus proches.\n", - "\n", - "## Principe\n", - "1. On charge les données des villages corses avec leurs coordonnées GPS et leur département\n", - "2. On choisit un point sur la carte\n", - "3. On calcule les distances entre ce point et tous les villages\n", - "4. On identifie les k villages les plus proches\n", - "5. On vote : le département majoritaire parmi ces k villages devient la prédiction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📦 Installation et imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Installation des bibliothèques nécessaires (si besoin)\n", - "import sys\n", - "!{sys.executable} -m pip install folium pandas numpy -q" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import folium\n", - "from folium.plugins import MarkerCluster\n", - "import math\n", - "import json\n", - "from collections import Counter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📊 Chargement des données" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Charger le fichier CSV\n", - "# Remplacez 'villages_corse.csv' par le chemin de votre fichier\n", - "df = pd.read_csv('villages_corse.csv', sep='\\t', encoding='utf-8')\n", - "\n", - "# Afficher les premières lignes\n", - "print(f\"Nombre de villages : {len(df)}\")\n", - "print(f\"\\nColonnes : {list(df.columns)}\")\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🔧 Préparation des données" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def parse_coordinates(point_geo_str):\n", - " \"\"\"\n", - " Parse la colonne Point_Geo pour extraire latitude et longitude.\n", - " Format attendu : \"latitude, longitude\"\n", - " Exemple : \"41.984099158, 8.798384636\"\n", - " \"\"\"\n", - " try:\n", - " # Séparer par la virgule\n", - " parts = str(point_geo_str).split(',')\n", - " lat = float(parts[0].strip())\n", - " lon = float(parts[1].strip())\n", - " return lat, lon\n", - " except Exception as e:\n", - " print(f\"Erreur parsing: {point_geo_str} - {e}\")\n", - " return None, None\n", - "\n", - "# Extraire les coordonnées\n", - "df[['latitude', 'longitude']] = df['Point_Geo'].apply(\n", - " lambda x: pd.Series(parse_coordinates(x))\n", - ")\n", - "\n", - "# Supprimer les lignes sans coordonnées valides\n", - "df = df.dropna(subset=['latitude', 'longitude'])\n", - "\n", - "# Simplifier les noms de départements\n", - "df['dept_simple'] = df['Code Département'].apply(lambda x: '2A' if str(x) == '2A' else '2B')\n", - "\n", - "print(f\"Villages avec coordonnées valides : {len(df)}\")\n", - "print(f\"\\nRépartition par département :\")\n", - "print(df['dept_simple'].value_counts())\n", - "\n", - "df[['Nom français', 'dept_simple', 'latitude', 'longitude']].head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📏 Fonction de calcul de distance\n", - "\n", - "Nous utilisons la **formule de Haversine** pour calculer la distance entre deux points GPS sur la surface de la Terre." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def haversine_distance(lat1, lon1, lat2, lon2):\n", - " \"\"\"\n", - " Calcule la distance en kilomètres entre deux points GPS.\n", - " Formule de Haversine.\n", - " \"\"\"\n", - " R = 6371 # Rayon de la Terre en km\n", - " \n", - " # Conversion en radians\n", - " lat1_rad = math.radians(lat1)\n", - " lat2_rad = math.radians(lat2)\n", - " delta_lat = math.radians(lat2 - lat1)\n", - " delta_lon = math.radians(lon2 - lon1)\n", - " \n", - " # Formule de Haversine\n", - " a = math.sin(delta_lat/2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon/2)**2\n", - " c = 2 * math.asin(math.sqrt(a))\n", - " \n", - " return R * c\n", - "\n", - "# Test de la fonction\n", - "# Distance entre Ajaccio (41.9267, 8.7369) et Bastia (42.7028, 9.4500)\n", - "dist_test = haversine_distance(41.9267, 8.7369, 42.7028, 9.4500)\n", - "print(f\"Distance Ajaccio-Bastia : {dist_test:.1f} km\")\n", - "\n", - "# Test avec Afa et Alando (vos exemples)\n", - "afa = df[df['Nom français'] == 'Afa'].iloc[0]\n", - "alando = df[df['Nom français'] == 'Alando'].iloc[0]\n", - "dist_afa_alando = haversine_distance(afa['latitude'], afa['longitude'], \n", - " alando['latitude'], alando['longitude'])\n", - "print(f\"Distance Afa-Alando : {dist_afa_alando:.1f} km\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🎯 Algorithme k-NN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def knn_classify(test_lat, test_lon, df, k=5):\n", - " \"\"\"\n", - " Classifie un point (test_lat, test_lon) en utilisant k-NN.\n", - " \n", - " Retourne :\n", - " - prediction : le département prédit ('2A' ou '2B')\n", - " - neighbors : DataFrame des k plus proches voisins\n", - " - votes : dictionnaire des votes\n", - " \"\"\"\n", - " # Calculer les distances pour tous les villages\n", - " distances = []\n", - " for idx, row in df.iterrows():\n", - " dist = haversine_distance(test_lat, test_lon, row['latitude'], row['longitude'])\n", - " distances.append({\n", - " 'village': row['Nom français'],\n", - " 'nom_corse': row['Nom corse'],\n", - " 'departement': row['dept_simple'],\n", - " 'latitude': row['latitude'],\n", - " 'longitude': row['longitude'],\n", - " 'distance': dist\n", - " })\n", - " \n", - " # Créer un DataFrame et trier par distance\n", - " dist_df = pd.DataFrame(distances)\n", - " dist_df = dist_df.sort_values('distance')\n", - " \n", - " # Sélectionner les k plus proches\n", - " neighbors = dist_df.head(k)\n", - " \n", - " # Voter\n", - " votes = Counter(neighbors['departement'])\n", - " prediction = votes.most_common(1)[0][0]\n", - " \n", - " return prediction, neighbors, votes\n", - "\n", - "# Test de l'algorithme avec un point au centre de la Corse\n", - "test_lat, test_lon = 42.15, 9.05\n", - "k = 5\n", - "\n", - "prediction, neighbors, votes = knn_classify(test_lat, test_lon, df, k=k)\n", - "\n", - "print(f\"\\n🎯 Point de test : ({test_lat}, {test_lon})\")\n", - "print(f\"\\nAvec k={k} :\")\n", - "print(f\"Prédiction : {'Corse du Sud (2A)' if prediction == '2A' else 'Haute-Corse (2B)'}\")\n", - "print(f\"Votes : {dict(votes)}\")\n", - "print(f\"\\nLes {k} plus proches voisins :\")\n", - "print(neighbors[['village', 'nom_corse', 'departement', 'distance']])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🗺️ Visualisation avec Folium" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def create_map(test_lat=None, test_lon=None, k=5, show_all_villages=False, show_boundaries=False):\n", - " \"\"\"\n", - " Crée une carte interactive avec Folium.\n", - " \n", - " Paramètres:\n", - " - test_lat, test_lon: coordonnées du point à tester\n", - " - k: nombre de voisins\n", - " - show_all_villages: afficher tous les villages\n", - " - show_boundaries: afficher les frontières des communes (peut être lent)\n", - " \"\"\"\n", - " # Centre de la Corse\n", - " center_lat = 42.15\n", - " center_lon = 9.05\n", - " \n", - " # Créer la carte\n", - " m = folium.Map(\n", - " location=[center_lat, center_lon],\n", - " zoom_start=9,\n", - " tiles='OpenStreetMap'\n", - " )\n", - " \n", - " # Afficher les frontières des communes (optionnel)\n", - " if show_boundaries:\n", - " print(\"Affichage des frontières des communes...\")\n", - " for idx, row in df.iterrows():\n", - " try:\n", - " zone_geo = json.loads(row['Zone_geo'])\n", - " color = 'red' if row['dept_simple'] == '2A' else 'blue'\n", - " \n", - " folium.GeoJson(\n", - " zone_geo,\n", - " style_function=lambda x, color=color: {\n", - " 'fillColor': color,\n", - " 'color': color,\n", - " 'weight': 1,\n", - " 'fillOpacity': 0.1\n", - " },\n", - " tooltip=row['Nom français']\n", - " ).add_to(m)\n", - " except:\n", - " pass\n", - " \n", - " # Afficher tous les villages (optionnel)\n", - " if show_all_villages:\n", - " marker_cluster = MarkerCluster().add_to(m)\n", - " \n", - " for idx, row in df.iterrows():\n", - " color = 'red' if row['dept_simple'] == '2A' else 'blue'\n", - " folium.CircleMarker(\n", - " location=[row['latitude'], row['longitude']],\n", - " radius=3,\n", - " color=color,\n", - " fill=True,\n", - " fillColor=color,\n", - " fillOpacity=0.4,\n", - " popup=f\"{row['Nom français']}
{row['Nom corse']}
({row['dept_simple']})\"\n", - " ).add_to(marker_cluster)\n", - " \n", - " # Si un point de test est fourni\n", - " if test_lat is not None and test_lon is not None:\n", - " # Classification\n", - " prediction, neighbors, votes = knn_classify(test_lat, test_lon, df, k=k)\n", - " \n", - " # Marqueur pour le point de test\n", - " color = 'darkred' if prediction == '2A' else 'darkblue'\n", - " dept_name = 'Corse du Sud (2A)' if prediction == '2A' else 'Haute-Corse (2B)'\n", - " \n", - " folium.Marker(\n", - " location=[test_lat, test_lon],\n", - " popup=f\"Point à classifier
Prédiction : {dept_name}
Votes : {dict(votes)}\",\n", - " icon=folium.Icon(color=color, icon='star', prefix='fa')\n", - " ).add_to(m)\n", - " \n", - " # Afficher les k plus proches voisins\n", - " for idx, neighbor in neighbors.iterrows():\n", - " # Marqueur pour chaque voisin\n", - " color = 'red' if neighbor['departement'] == '2A' else 'blue'\n", - " folium.Marker(\n", - " location=[neighbor['latitude'], neighbor['longitude']],\n", - " popup=f\"{neighbor['village']}
{neighbor['nom_corse']}
{neighbor['departement']}
Distance: {neighbor['distance']:.2f} km\",\n", - " icon=folium.Icon(color=color, icon='info-sign')\n", - " ).add_to(m)\n", - " \n", - " # Ligne entre le point test et le voisin\n", - " folium.PolyLine(\n", - " locations=[\n", - " [test_lat, test_lon],\n", - " [neighbor['latitude'], neighbor['longitude']]\n", - " ],\n", - " color=color,\n", - " weight=2,\n", - " opacity=0.5,\n", - " tooltip=f\"{neighbor['distance']:.2f} km\"\n", - " ).add_to(m)\n", - " \n", - " # Légende\n", - " legend_html = '''\n", - "
\n", - "

Légende

\n", - "

Corse du Sud (2A)

\n", - "

Haute-Corse (2B)

\n", - "

Point à classifier

\n", - "
\n", - " '''\n", - " m.get_root().html.add_child(folium.Element(legend_html))\n", - " \n", - " return m\n", - "\n", - "# Créer la carte avec le point de test\n", - "map_with_test = create_map(test_lat=42.15, test_lon=9.05, k=5, show_all_villages=False)\n", - "map_with_test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🔬 Expérimentation : Influence de k" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Test avec différentes valeurs de k\n", - "test_point = (42.15, 9.05) # Point au centre de la Corse\n", - "\n", - "print(f\"Point testé : {test_point}\\n\")\n", - "print(f\"{'k':<5} {'Prédiction':<15} {'Votes 2A':<10} {'Votes 2B':<10}\")\n", - "print(\"-\" * 50)\n", - "\n", - "for k in [1, 3, 5, 7, 9, 15, 21]:\n", - " prediction, neighbors, votes = knn_classify(test_point[0], test_point[1], df, k=k)\n", - " votes_2a = votes.get('2A', 0)\n", - " votes_2b = votes.get('2B', 0)\n", - " dept_name = 'Corse du Sud' if prediction == '2A' else 'Haute-Corse'\n", - " print(f\"{k:<5} {dept_name:<15} {votes_2a:<10} {votes_2b:<10}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🎮 Mode interactif : Testez vos propres points !\n", - "\n", - "Modifiez les coordonnées ci-dessous pour tester différents points de la Corse.\n", - "\n", - "**Quelques repères géographiques :**\n", - "- Ajaccio : (41.9267, 8.7369)\n", - "- Bastia : (42.7028, 9.4500)\n", - "- Corte : (42.3062, 9.1509)\n", - "- Porto-Vecchio : (41.5914, 9.2795)\n", - "- Calvi : (42.5679, 8.7575)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# === MODIFIEZ CES VALEURS ===\n", - "test_latitude = 42.3 # Entre 41.3 (sud) et 43.0 (nord)\n", - "test_longitude = 9.15 # Entre 8.5 (ouest) et 9.5 (est)\n", - "k_value = 7 # Nombre de voisins\n", - "# =============================\n", - "\n", - "prediction, neighbors, votes = knn_classify(test_latitude, test_longitude, df, k=k_value)\n", - "\n", - "print(f\"📍 Point : ({test_latitude}, {test_longitude})\")\n", - "print(f\"🔢 k = {k_value}\")\n", - "print(f\"\\n🎯 Prédiction : {'Corse du Sud (2A)' if prediction == '2A' else 'Haute-Corse (2B)'}\")\n", - "print(f\"\\n📊 Votes : {dict(votes)}\")\n", - "print(f\"\\n🏘️ Les {k_value} plus proches villages :\")\n", - "print(neighbors[['village', 'nom_corse', 'departement', 'distance']].to_string(index=False))\n", - "\n", - "# Afficher la carte\n", - "map_interactive = create_map(test_latitude, test_longitude, k=k_value, show_all_villages=False)\n", - "map_interactive" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🌍 Carte complète avec tous les villages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Afficher tous les villages de Corse\n", - "# Note : cette cellule peut prendre quelques secondes à s'exécuter\n", - "\n", - "map_all = create_map(show_all_villages=True, show_boundaries=False)\n", - "map_all" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🗺️ Carte avec frontières des communes (BONUS)\n", - "\n", - "Cette cellule affiche les frontières réelles des communes. **Attention : cela peut prendre du temps à charger !**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Carte avec frontières - peut être lent !\n", - "# Décommentez la ligne suivante pour l'exécuter\n", - "# map_boundaries = create_map(test_lat=42.15, test_lon=9.05, k=5, show_boundaries=True)\n", - "# map_boundaries" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📈 Visualisation de la frontière de décision" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Créer une grille de points et classifier chacun\n", - "# Cela permet de visualiser la \"frontière\" selon k-NN\n", - "\n", - "def create_decision_boundary_map(k=5, grid_resolution=40):\n", - " \"\"\"\n", - " Crée une carte montrant la frontière de décision de k-NN.\n", - " \"\"\"\n", - " # Limites de la Corse\n", - " lat_min, lat_max = 41.3, 43.0\n", - " lon_min, lon_max = 8.5, 9.6\n", - " \n", - " # Créer une grille\n", - " lats = np.linspace(lat_min, lat_max, grid_resolution)\n", - " lons = np.linspace(lon_min, lon_max, grid_resolution)\n", - " \n", - " m = folium.Map(\n", - " location=[42.15, 9.05],\n", - " zoom_start=8,\n", - " tiles='OpenStreetMap'\n", - " )\n", - " \n", - " # Classifier chaque point de la grille\n", - " print(f\"Classification d'une grille de {grid_resolution}x{grid_resolution} points...\")\n", - " total = len(lats) * len(lons)\n", - " count = 0\n", - " \n", - " for lat in lats:\n", - " for lon in lons:\n", - " prediction, _, _ = knn_classify(lat, lon, df, k=k)\n", - " color = '#ffcccc' if prediction == '2A' else '#ccccff'\n", - " \n", - " folium.CircleMarker(\n", - " location=[lat, lon],\n", - " radius=4,\n", - " color=color,\n", - " fill=True,\n", - " fillColor=color,\n", - " fillOpacity=0.3,\n", - " weight=0\n", - " ).add_to(m)\n", - " \n", - " count += 1\n", - " if count % 100 == 0:\n", - " print(f\" {count}/{total} points traités ({100*count/total:.1f}%)\")\n", - " \n", - " print(\"Terminé !\")\n", - " \n", - " # Ajouter les villages\n", - " for idx, row in df.iterrows():\n", - " color = 'red' if row['dept_simple'] == '2A' else 'blue'\n", - " folium.CircleMarker(\n", - " location=[row['latitude'], row['longitude']],\n", - " radius=2,\n", - " color=color,\n", - " fill=True,\n", - " fillColor=color,\n", - " fillOpacity=0.8,\n", - " popup=row['Nom français']\n", - " ).add_to(m)\n", - " \n", - " return m\n", - "\n", - "# Créer la carte (réduire grid_resolution si c'est trop lent)\n", - "print(f\"Création de la carte de frontière avec k=5...\")\n", - "print(\"Note : cela peut prendre 1-2 minutes...\")\n", - "boundary_map = create_decision_boundary_map(k=5, grid_resolution=30)\n", - "boundary_map" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 💡 Validation de l'algorithme" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# BONUS : Validation croisée\n", - "# Tester la précision en utilisant les villages eux-mêmes\n", - "\n", - "def cross_validation(df, k=5, sample_size=100):\n", - " \"\"\"\n", - " Teste la précision de k-NN en utilisant un échantillon de villages.\n", - " \"\"\"\n", - " # Prendre un échantillon aléatoire\n", - " sample = df.sample(n=min(sample_size, len(df)), random_state=42)\n", - " \n", - " correct = 0\n", - " total = 0\n", - " errors = []\n", - " \n", - " for idx, row in sample.iterrows():\n", - " # Créer un dataset sans ce village\n", - " df_without = df.drop(idx)\n", - " \n", - " # Classifier ce village\n", - " prediction, neighbors, votes = knn_classify(\n", - " row['latitude'], \n", - " row['longitude'], \n", - " df_without, \n", - " k=k\n", - " )\n", - " \n", - " if prediction == row['dept_simple']:\n", - " correct += 1\n", - " else:\n", - " errors.append({\n", - " 'village': row['Nom français'],\n", - " 'vrai_dept': row['dept_simple'],\n", - " 'prediction': prediction,\n", - " 'votes': dict(votes)\n", - " })\n", - " total += 1\n", - " \n", - " accuracy = (correct / total) * 100\n", - " return accuracy, correct, total, errors\n", - "\n", - "print(\"Test de précision de l'algorithme k-NN...\\n\")\n", - "print(\"Validation croisée : chaque village est classifié en fonction de ses voisins.\\n\")\n", - "\n", - "for k in [1, 3, 5, 10, 15]:\n", - " accuracy, correct, total, errors = cross_validation(df, k=k, sample_size=100)\n", - " print(f\"k={k:2d} : {accuracy:.1f}% de précision ({correct}/{total} corrects)\")\n", - "\n", - "# Afficher quelques erreurs pour k=5\n", - "print(\"\\n📋 Exemples d'erreurs avec k=5 :\")\n", - "_, _, _, errors_k5 = cross_validation(df, k=5, sample_size=100)\n", - "if errors_k5:\n", - " for error in errors_k5[:5]:\n", - " print(f\" • {error['village']} : prédit {error['prediction']} (vrai: {error['vrai_dept']}) - votes: {error['votes']}\")\n", - "else:\n", - " print(\" Aucune erreur !\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🎓 Questions de réflexion\n", - "\n", - "1. **Influence de k** : Comment la prédiction change-t-elle avec différentes valeurs de k ?\n", - "\n", - "2. **Points frontières** : Trouvez des coordonnées où la classification est ambiguë (votes proches).\n", - "\n", - "3. **Zones problématiques** : Où se situent les villages difficiles à classifier correctement ?\n", - "\n", - "4. **Validité géographique** : Cette méthode respecte-t-elle toujours les vraies frontières administratives ?\n", - "\n", - "5. **Améliorations** : Comment pourrait-on améliorer l'algorithme ?\n", - " - Pondération par distance inverse\n", - " - Prise en compte d'autres critères (altitude, population...)\n", - " - k adaptatif selon la densité de villages" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 💡 Exercices supplémentaires\n", - "\n", - "1. **Trouver la frontière** : Trouvez des points sur la \"frontière\" k-NN (là où un changement de k change la classification)\n", - "\n", - "2. **Villages isolés** : Identifiez les villages dont le département diffère de leurs k plus proches voisins\n", - "\n", - "3. **Pondération** : Implémentez une version pondérée où les villages plus proches ont plus d'influence\n", - "\n", - "4. **Comparaison** : Comparez la frontière k-NN avec la vraie frontière administrative" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# EXERCICE : Villages \"anomaliques\"\n", - "# Trouver les villages dont les k plus proches voisins sont majoritairement de l'autre département\n", - "\n", - "def find_anomalous_villages(df, k=5):\n", - " \"\"\"\n", - " Trouve les villages qui seraient mal classifiés par k-NN.\n", - " \"\"\"\n", - " anomalies = []\n", - " \n", - " for idx, row in df.iterrows():\n", - " # Créer un dataset sans ce village\n", - " df_without = df.drop(idx)\n", - " \n", - " # Classifier ce village\n", - " prediction, neighbors, votes = knn_classify(\n", - " row['latitude'], \n", - " row['longitude'], \n", - " df_without, \n", - " k=k\n", - " )\n", - " \n", - " # Si la prédiction ne correspond pas au vrai département\n", - " if prediction != row['dept_simple']:\n", - " anomalies.append({\n", - " 'village': row['Nom français'],\n", - " 'nom_corse': row['Nom corse'],\n", - " 'vrai_dept': row['dept_simple'],\n", - " 'prediction': prediction,\n", - " 'votes_2A': votes.get('2A', 0),\n", - " 'votes_2B': votes.get('2B', 0),\n", - " 'latitude': row['latitude'],\n", - " 'longitude': row['longitude']\n", - " })\n", - " \n", - " return pd.DataFrame(anomalies)\n", - "\n", - "print(\"Recherche des villages 'anomaliques' avec k=5...\\n\")\n", - "anomalies_df = find_anomalous_villages(df, k=5)\n", - "\n", - "print(f\"Nombre de villages anomaliques : {len(anomalies_df)}\")\n", - "print(f\"\\nVillages qui seraient classifiés dans le mauvais département :\\n\")\n", - "print(anomalies_df[['village', 'nom_corse', 'vrai_dept', 'prediction', 'votes_2A', 'votes_2B']])\n", - "\n", - "# Afficher ces villages sur une carte\n", - "if len(anomalies_df) > 0:\n", - " m_anomalies = folium.Map(location=[42.15, 9.05], zoom_start=9)\n", - " \n", - " for idx, row in anomalies_df.iterrows():\n", - " folium.Marker(\n", - " location=[row['latitude'], row['longitude']],\n", - " popup=f\"{row['village']}
Vrai: {row['vrai_dept']}
Prédit: {row['prediction']}
Votes: {row['votes_2A']}-{row['votes_2B']}\",\n", - " icon=folium.Icon(color='orange', icon='exclamation-triangle', prefix='fa')\n", - " ).add_to(m_anomalies)\n", - " \n", - " display(m_anomalies)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}