diff --git a/activite1/knn_corse_interactive.ipynb b/activite1/knn_corse_interactive.ipynb deleted file mode 100644 index df21d27..0000000 --- a/activite1/knn_corse_interactive.ipynb +++ /dev/null @@ -1,561 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 🗺️ Classification k-NN : Haute-Corse ou Corse du Sud ?\n", - "\n", - "## Objectif\n", - "Utiliser l'algorithme des **k plus proches voisins (k-NN)** pour déterminer si un point de la carte de Corse se situe en **Haute-Corse (2B)** ou en **Corse du Sud (2A)**, en se basant sur les villages les plus proches.\n", - "\n", - "## Principe\n", - "1. On charge les données des villages corses avec leurs coordonnées GPS et leur département\n", - "2. On choisit un point sur la carte\n", - "3. On calcule les distances entre ce point et tous les villages\n", - "4. On identifie les k villages les plus proches\n", - "5. On vote : le département majoritaire parmi ces k villages devient la prédiction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📦 Installation et imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Installation des bibliothèques nécessaires (si besoin)\n", - "import sys\n", - "!{sys.executable} -m pip install folium pandas numpy -q" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import folium\n", - "from folium.plugins import MarkerCluster\n", - "import math\n", - "from collections import Counter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 📊 Chargement des données" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Charger le fichier CSV\n", - "# Remplacez 'villages_corse.csv' par le chemin de votre fichier\n", - "df = 
def parse_coordinates(point_geo_str):
    """
    Extract (latitude, longitude) from a point string.

    Expected format: "POINT(longitude latitude)" or a bare
    "longitude latitude" pair separated by whitespace.

    Parameters
    ----------
    point_geo_str : str
        Raw value of the geometry column. Non-string values (for example
        the float NaN pandas uses for missing cells) are treated as invalid.

    Returns
    -------
    tuple
        (latitude, longitude) as floats, or (None, None) when the value is
        not a string or does not hold exactly two numeric tokens.
    """
    # Missing cells arrive as float NaN, not str: reject them explicitly
    # instead of relying on a catch-all exception handler.
    if not isinstance(point_geo_str, str):
        return None, None

    # Strip the 'POINT(' prefix and the closing ')'.
    coords = point_geo_str.replace('POINT(', '').replace(')', '').strip()

    try:
        # The string stores longitude first; callers expect (lat, lon).
        lon, lat = coords.split()
        return float(lat), float(lon)
    except ValueError:
        # Wrong number of tokens or a non-numeric token. Only ValueError is
        # caught so that KeyboardInterrupt / SystemExit and genuine
        # programming errors are no longer silently swallowed.
        return None, None
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """
    Great-circle distance between two GPS points (Haversine formula).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, optional
        Sphere radius in kilometres (default 6371).

    Returns
    -------
    float
        Distance in the unit of ``radius_km`` (kilometres by default).
    """
    # Convert to radians.
    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    delta_lat = math.radians(lat2 - lat1)
    delta_lon = math.radians(lon2 - lon1)

    # Haversine formula.
    a = (
        math.sin(delta_lat / 2) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2) ** 2
    )
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal
    # points, which would make sqrt/asin raise a domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))

    return radius_km * c


def knn_classify(test_lat, test_lon, df, k=5):
    """
    Classify the point (test_lat, test_lon) with k-nearest neighbours.

    Parameters
    ----------
    test_lat, test_lon : float
        Coordinates of the point to classify, in decimal degrees.
    df : pandas.DataFrame
        Reference villages. The columns read are 'Nom français',
        'dept_simple', 'latitude' and 'longitude'.
    k : int, optional
        Number of neighbours that vote (default 5). When k exceeds the
        number of villages, every village votes.

    Returns
    -------
    prediction : str
        Majority 'dept_simple' value among the k nearest villages. On a
        tie, the department of the closest tied neighbour wins (Counter
        keeps insertion order and neighbours are inserted nearest-first).
    neighbors : pandas.DataFrame
        The k nearest villages sorted by increasing distance, with columns
        'village', 'departement', 'latitude', 'longitude', 'distance'.
    votes : collections.Counter
        Vote count per department.

    Raises
    ------
    ValueError
        If k is smaller than 1 or df contains no rows.
    """
    # head(k) with k <= 0 would either return nothing (k == 0) or silently
    # return "all but the last |k|" rows (k < 0); fail loudly instead.
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    if len(df) == 0:
        raise ValueError("df must contain at least one village")

    # Iterate over the two coordinate columns directly: same distances as a
    # row-by-row iterrows() loop without building a Series per row.
    distances = [
        haversine_distance(test_lat, test_lon, lat, lon)
        for lat, lon in zip(df['latitude'], df['longitude'])
    ]

    # Fresh positional index (0..n-1), as when building from a record list.
    dist_df = pd.DataFrame({
        'village': df['Nom français'].to_numpy(),
        'departement': df['dept_simple'].to_numpy(),
        'latitude': df['latitude'].to_numpy(),
        'longitude': df['longitude'].to_numpy(),
        'distance': distances,
    })

    # Stable sort so that equidistant villages keep their input order.
    neighbors = dist_df.sort_values('distance', kind='mergesort').head(k)

    # Majority vote.
    votes = Counter(neighbors['departement'])
    prediction = votes.most_common(1)[0][0]

    return prediction, neighbors, votes
popup=f\"{row['Nom français']} ({row['dept_simple']})\"\n", - " ).add_to(marker_cluster)\n", - " \n", - " # Si un point de test est fourni\n", - " if test_lat is not None and test_lon is not None:\n", - " # Classification\n", - " prediction, neighbors, votes = knn_classify(test_lat, test_lon, df, k=k)\n", - " \n", - " # Marqueur pour le point de test\n", - " color = 'darkred' if prediction == '2A' else 'darkblue'\n", - " folium.Marker(\n", - " location=[test_lat, test_lon],\n", - " popup=f\"Point à classifier
Prédiction : {prediction}
Votes : {dict(votes)}\",\n", - " icon=folium.Icon(color=color, icon='star', prefix='fa')\n", - " ).add_to(m)\n", - " \n", - " # Afficher les k plus proches voisins\n", - " for idx, neighbor in neighbors.iterrows():\n", - " # Marqueur pour chaque voisin\n", - " color = 'red' if neighbor['departement'] == '2A' else 'blue'\n", - " folium.Marker(\n", - " location=[neighbor['latitude'], neighbor['longitude']],\n", - " popup=f\"{neighbor['village']}
{neighbor['departement']}
Distance: {neighbor['distance']:.2f} km\",\n", - " icon=folium.Icon(color=color, icon='info-sign')\n", - " ).add_to(m)\n", - " \n", - " # Ligne entre le point test et le voisin\n", - " folium.PolyLine(\n", - " locations=[\n", - " [test_lat, test_lon],\n", - " [neighbor['latitude'], neighbor['longitude']]\n", - " ],\n", - " color=color,\n", - " weight=2,\n", - " opacity=0.5\n", - " ).add_to(m)\n", - " \n", - " # Légende\n", - " legend_html = '''\n", - "
\n", - "

Légende

\n", - "

Corse du Sud (2A)

\n", - "

Haute-Corse (2B)

\n", - "

Point à classifier

\n", - "
\n", - " '''\n", - " m.get_root().html.add_child(folium.Element(legend_html))\n", - " \n", - " return m\n", - "\n", - "# Créer la carte avec le point de test\n", - "map_with_test = create_map(test_lat=42.15, test_lon=9.05, k=5, show_all_villages=False)\n", - "map_with_test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🔬 Expérimentation : Influence de k" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Test avec différentes valeurs de k\n", - "test_point = (42.15, 9.05) # Point au centre de la Corse\n", - "\n", - "print(f\"Point testé : {test_point}\\n\")\n", - "print(f\"{'k':<5} {'Prédiction':<12} {'Votes 2A':<10} {'Votes 2B':<10}\")\n", - "print(\"-\" * 45)\n", - "\n", - "for k in [1, 3, 5, 7, 9, 15]:\n", - " prediction, neighbors, votes = knn_classify(test_point[0], test_point[1], df, k=k)\n", - " votes_2a = votes.get('2A', 0)\n", - " votes_2b = votes.get('2B', 0)\n", - " print(f\"{k:<5} {prediction:<12} {votes_2a:<10} {votes_2b:<10}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 🎮 Mode interactif : Testez vos propres points !\n", - "\n", - "Modifiez les coordonnées ci-dessous pour tester différents points de la Corse." 
def create_decision_boundary_map(k=5, grid_resolution=50, step=5):
    """
    Build a folium map showing the k-NN decision boundary.

    A regular latitude/longitude grid is laid over a fixed bounding box,
    every `step`-th grid line is kept, and each remaining grid point is
    classified with `knn_classify` against the module-level `df`. Points
    are drawn red when the prediction is '2A' and blue otherwise.

    Parameters
    ----------
    k : int, optional
        Number of neighbours passed to `knn_classify` (default 5).
    grid_resolution : int, optional
        Number of grid lines per axis before sub-sampling (default 50).
    step : int, optional
        Keep one grid line out of `step` on each axis (default 5, the
        stride that used to be hard-coded). Use 1 to classify the full
        grid; the cost grows with (grid_resolution / step) ** 2.

    Returns
    -------
    folium.Map
        The map with one CircleMarker per classified grid point.

    Raises
    ------
    ValueError
        If step is smaller than 1.
    """
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")

    # Bounding box used for the grid.
    lat_min, lat_max = 41.3, 43.0
    lon_min, lon_max = 8.5, 9.6

    # Build and sub-sample the grid once, rather than re-slicing the
    # arrays inside the loops.
    grid_lats = np.linspace(lat_min, lat_max, grid_resolution)[::step]
    grid_lons = np.linspace(lon_min, lon_max, grid_resolution)[::step]
    n_rows = len(grid_lats)

    m = folium.Map(
        location=[42.15, 9.05],
        zoom_start=8,
        tiles='OpenStreetMap'
    )

    # Classify every grid point.
    print("Classification de la grille en cours...")
    for i, lat in enumerate(grid_lats):
        for lon in grid_lons:
            prediction, _, _ = knn_classify(lat, lon, df, k=k)
            color = 'red' if prediction == '2A' else 'blue'

            folium.CircleMarker(
                location=[lat, lon],
                radius=3,
                color=color,
                fill=True,
                fillColor=color,
                fillOpacity=0.2,
                weight=0
            ).add_to(m)

        # Progress message every five grid rows.
        if (i + 1) % 5 == 0:
            print(f" {(i+1)*100//n_rows}% complété")

    print("Terminé !")
    return m
def cross_validation(df, k=5, sample_size=100, random_state=42):
    """
    Estimate k-NN accuracy by leave-one-out on a random sample of villages.

    Each sampled village is removed from the data set, classified with
    `knn_classify` from the remaining villages, and the prediction is
    compared with its own 'dept_simple' value.

    Parameters
    ----------
    df : pandas.DataFrame
        Villages; the columns read here are 'latitude', 'longitude' and
        'dept_simple'. `knn_classify` receives the frame minus the tested
        row. Index labels are presumably unique: `df.drop(idx)` removes
        every row carrying that label.
    k : int, optional
        Number of neighbours passed to `knn_classify` (default 5).
    sample_size : int, optional
        Maximum number of villages to test (default 100); capped at len(df).
    random_state : int, optional
        Seed for the sampling (default 42, the previously hard-coded value).

    Returns
    -------
    tuple
        (accuracy, correct, total) where accuracy is a percentage.
        (0.0, 0, 0) is returned when nothing can be evaluated: fewer than
        two villages, or a non-positive sample_size.
    """
    n_samples = min(sample_size, len(df))
    # Leave-one-out needs at least one other village to vote, and an empty
    # sample would otherwise end in a division by zero.
    if len(df) < 2 or n_samples < 1:
        return 0.0, 0, 0

    # Draw the random sample of villages to test.
    sample = df.sample(n=n_samples, random_state=random_state)

    correct = 0
    total = 0

    for idx, row in sample.iterrows():
        # Data set without the village being tested.
        df_without = df.drop(idx)

        # Classify this village from the others.
        prediction, _, _ = knn_classify(
            row['latitude'],
            row['longitude'],
            df_without,
            k=k
        )

        if prediction == row['dept_simple']:
            correct += 1
        total += 1

    accuracy = (correct / total) * 100
    return accuracy, correct, total
"python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}