From d92b69a2b03b1f596780b900ae783bc8076c3b17 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 23 Apr 2024 16:06:34 +0200 Subject: [PATCH 001/100] Implemented which uses the overpass API to download power features for individual countries. --- config/config.default.yaml | 5 +- envs/environment.yaml | 2 + rules/build_electricity.smk | 34 +++++++++ scripts/clean_osm_data.py | 39 +++++++++++ scripts/retrieve_osm_data.py | 130 +++++++++++++++++++++++++++++++++++ 5 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 scripts/clean_osm_data.py create mode 100644 scripts/retrieve_osm_data.py diff --git a/config/config.default.yaml b/config/config.default.yaml index 42132f226..05418edf2 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -64,6 +64,10 @@ snapshots: end: "2014-01-01" inclusive: 'left' +osm: + retrieve: true + use-prebuilt: false + # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#enable enable: retrieve: auto @@ -79,7 +83,6 @@ enable: custom_busmap: false drop_leap_day: true - # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget co2_budget: 2020: 0.701 diff --git a/envs/environment.yaml b/envs/environment.yaml index ee1d1605a..47dcdd620 100644 --- a/envs/environment.yaml +++ b/envs/environment.yaml @@ -48,6 +48,7 @@ dependencies: - pyxlsb - graphviz - pre-commit +- geojson # Keep in conda environment when calling ipython - ipython @@ -64,3 +65,4 @@ dependencies: - snakemake-executor-plugin-slurm - snakemake-executor-plugin-cluster-generic - highspy + - overpass diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index ed341d2f8..589dfab6f 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -606,3 +606,37 @@ rule prepare_network: "../envs/environment.yaml" script: "../scripts/prepare_network.py" + + +if config["osm"].get("retrieve", True): + rule retrieve_osm_data: + output: + 
cables_way="data/osm/raw/{country}/cables_way_raw.geojson", + lines_way="data/osm/raw/{country}/lines_way_raw.geojson", + substations_way="data/osm/raw/{country}/substations_way_raw.geojson", + substations_node="data/osm/raw/{country}/substations_node_raw.geojson", + transformers_way="data/osm/raw/{country}/transformers_way_raw.geojson", + transformers_node="data/osm/raw/{country}/transformers_node_raw.geojson", + log: + logs("retrieve_osm_data_{country}.log"), + script: + "../scripts/retrieve_osm_data.py" + +rule clean_osm_data: + # params: + # countries=config["countries"], + input: + cables_way=[f"data/osm/raw/{country}/cables_way_raw.geojson" for country in config["countries"]], + lines_way=[f"data/osm/raw/{country}/lines_way_raw.geojson" for country in config["countries"]], + substations_way=[f"data/osm/raw/{country}/substations_way_raw.geojson" for country in config["countries"]], + substations_node=[f"data/osm/raw/{country}/substations_node_raw.geojson" for country in config["countries"]], + transformers_way=[f"data/osm/raw/{country}/transformers_way_raw.geojson" for country in config["countries"]], + transformers_node=[f"data/osm/raw/{country}/transformers_node_raw.geojson" for country in config["countries"]], + output: + dummy="data/osm/raw/dummy.txt" + # cables="resources/RDIR/cables_clean_.geojson" + # lines= + log: + logs("clean_osm_data.log"), + script: + "../scripts/clean_osm_data.py" \ No newline at end of file diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py new file mode 100644 index 000000000..d4c3ba36e --- /dev/null +++ b/scripts/clean_osm_data.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: : 2020-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: MIT +""" +TODO To fill later +""" + +# import geojson +import logging +# import numpy as np +# import overpass as op +# import os +# import pandas as pd +# import pypsa +# import requests + +from _helpers import configure_logging +logger = 
logging.getLogger(__name__) + +def clean_osm_data(output): + with open(output, "w") as file: + file.write("Hello, world!\n") + + +if __name__ == "__main__": + # Detect running outside of snakemake and mock snakemake for testing + if "snakemake" not in globals(): + from _helpers import mock_snakemake + + snakemake = mock_snakemake("clean_osm_data") + + configure_logging(snakemake) + logger.info("Dummy log: clean_osm_data()") + + output = str(snakemake.output) + clean_osm_data(output) + + diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py new file mode 100644 index 000000000..47592d296 --- /dev/null +++ b/scripts/retrieve_osm_data.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: : 2020-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: MIT +""" +TODO To fill later +""" + +import geojson +import logging +import overpass as op +import os +import requests +import time + +from _helpers import configure_logging +logger = logging.getLogger(__name__) + + +def _get_overpass_areas(countries): + # If a single country code is provided, convert it to a list + if not isinstance(countries, list): + countries = [countries] + + # Overpass API endpoint URL + overpass_url = "https://overpass-api.de/api/interpreter" + + osm_areas = [] + for c in countries: + # Overpass query to fetch the relation for the specified country code + overpass_query = f""" + [out:json]; + area["ISO3166-1"="{c}"]; + out; + """ + + # Send the request to Overpass API + response = requests.post(overpass_url, data=overpass_query) + + # Parse the response + data = response.json() + + # Check if the response contains any results + if "elements" in data and len(data["elements"]) > 0: + # Extract the area ID from the relation + osm_area_id = data["elements"][0]["id"] + osm_areas.append(f"area({osm_area_id})") + else: + # Print a warning if no results are found for the country code + logger.info(f"No area code found for the specified country code: {c}. 
Ommitted from the list.") + + # Create a dictionary mapping country codes to their corresponding OSM area codes + op_areas_dict = dict(zip(countries, osm_areas)) + + return op_areas_dict + + +def retrieve_osm_data( + country, + output, + features=[ + "cables_way", + "lines_way", + "substations_way", + "substations_node", + "transformers_way", + "transformers_node", + ]): + + op_area = _get_overpass_areas(country) + + features_dict= { + 'cables_way': 'way["power"="cable"]', + 'lines_way': 'way["power"="line"]', + 'substations_way': 'way["power"="substation"]', + 'substations_node': 'node["power"="substation"]', + 'transformers_way': 'way["power"="transformer"]', + 'transformers_node': 'node["power"="transformer"]', + } + + for f in features: + if f not in features_dict: + raise ValueError(f"Invalid feature: {f}. Supported features: {list(features_dict.keys())}") + logger.info(f"Invalid feature: {f}. Supported features: {list(features_dict.keys())}") + + logger.info(f" - Fetching OSM data for feature '{f}' in {country}...") + # Build the overpass query + op_query = f''' + {op_area[country]}->.searchArea; + ( + {features_dict[f]}(area.searchArea); + ); + out body geom; + ''' + + # Send the request + # response = requests.post(overpass_url, data = op_query) + response = op.API(timeout=300).get(op_query) # returns data in geojson format. Timeout (max.) 
set to 300s + + filepath = output[f] + parentfolder = os.path.dirname(filepath) + if not os.path.exists(parentfolder): + # Create the folder and its parent directories if they don't exist + os.makedirs(parentfolder) + + with open(filepath, mode = "w") as f: + geojson.dump(response,f,indent=2) + # geojson.dump(response.json(),f,indent=2) + logger.info(" - Done.") + time.sleep(5) + + +if __name__ == "__main__": + # Detect running outside of snakemake and mock snakemake for testing + if "snakemake" not in globals(): + from _helpers import mock_snakemake + + snakemake = mock_snakemake("retrieve_osm_data", country="BE") + + configure_logging(snakemake) + + # Retrieve the OSM data + country = snakemake.wildcards.country + output = snakemake.output + + # Wait 5 seconds before fetching the OSM data to prevent too many requests error + # TODO pypsa-eur: Add try catch to implement this only when needed + logger.info(f"Waiting 5 seconds... Retrieving OSM data for {country}:") + time.sleep(5) + retrieve_osm_data(country, output) \ No newline at end of file From 6352c03c75993fe19a69bbd0d4119d2c1d125646 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 24 Apr 2024 17:30:09 +0200 Subject: [PATCH 002/100] Extended rule by input. 
--- rules/build_electricity.smk | 28 +++++++++++++++++++++------- scripts/clean_osm_data.py | 11 +++++++++-- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 589dfab6f..a23bffc6f 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -622,16 +622,21 @@ if config["osm"].get("retrieve", True): script: "../scripts/retrieve_osm_data.py" +FEATURES = ["cables_way", "lines_way", "substations_way", "substations_node", "transformers_way", "transformers_node"] rule clean_osm_data: # params: # countries=config["countries"], input: - cables_way=[f"data/osm/raw/{country}/cables_way_raw.geojson" for country in config["countries"]], - lines_way=[f"data/osm/raw/{country}/lines_way_raw.geojson" for country in config["countries"]], - substations_way=[f"data/osm/raw/{country}/substations_way_raw.geojson" for country in config["countries"]], - substations_node=[f"data/osm/raw/{country}/substations_node_raw.geojson" for country in config["countries"]], - transformers_way=[f"data/osm/raw/{country}/transformers_way_raw.geojson" for country in config["countries"]], - transformers_node=[f"data/osm/raw/{country}/transformers_node_raw.geojson" for country in config["countries"]], + **{ + f"{country}": [f"data/osm/raw/{country}/{feature}.geojson" for feature in FEATURES] + for country in config["countries"] + }, + # cables_way[country]=[f"data/osm/raw/{country}/cables_way_raw.geojson" for country in config["countries"]], + # lines_way=[f"data/osm/raw/{country}/lines_way_raw.geojson" for country in config["countries"]], + # substations_way=[f"data/osm/raw/{country}/substations_way_raw.geojson" for country in config["countries"]], + # substations_node=[f"data/osm/raw/{country}/substations_node_raw.geojson" for country in config["countries"]], + # transformers_way=[f"data/osm/raw/{country}/transformers_way_raw.geojson" for country in config["countries"]], + # 
transformers_node=[f"data/osm/raw/{country}/transformers_node_raw.geojson" for country in config["countries"]], output: dummy="data/osm/raw/dummy.txt" # cables="resources/RDIR/cables_clean_.geojson" @@ -639,4 +644,13 @@ rule clean_osm_data: log: logs("clean_osm_data.log"), script: - "../scripts/clean_osm_data.py" \ No newline at end of file + "../scripts/clean_osm_data.py" + + +# { +# f"{country}": f"{ +# f"{feature}": f"data/osm/raw/{country}/{feature}.geojson" +# }" +# for feature in FEATURES +# for country in config["countries"] +# } \ No newline at end of file diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index d4c3ba36e..305a9fb98 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -7,13 +7,14 @@ """ # import geojson +import geopandas as gpd import logging # import numpy as np -# import overpass as op # import os -# import pandas as pd +import pandas as pd # import pypsa # import requests +import tqdm.auto as tqdm from _helpers import configure_logging logger = logging.getLogger(__name__) @@ -36,4 +37,10 @@ def clean_osm_data(output): output = str(snakemake.output) clean_osm_data(output) + # Create df by iterating over lines_way and append them to df_lines_way + gdf1 = gpd.read_file(snakemake.input["lines_way"]) + + + snakemake.wildcards + snakemake.input["lines_way"].keys() From 87b4ccedf1d5aaf8cae59d4d16f9f7bc0ce279c0 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Mon, 29 Apr 2024 21:03:22 +0200 Subject: [PATCH 003/100] Bug fixes and improvements to clean_osm_data.py. Added in retrieve_osm_data.py. 
--- rules/build_electricity.smk | 36 ++-- scripts/clean_osm_data.py | 361 +++++++++++++++++++++++++++++++++-- scripts/retrieve_osm_data.py | 25 ++- 3 files changed, 386 insertions(+), 36 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 8d68cfb34..c4c89c472 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -611,32 +611,34 @@ rule prepare_network: if config["osm"].get("retrieve", True): rule retrieve_osm_data: output: - cables_way="data/osm/raw/{country}/cables_way_raw.geojson", - lines_way="data/osm/raw/{country}/lines_way_raw.geojson", - substations_way="data/osm/raw/{country}/substations_way_raw.geojson", - substations_node="data/osm/raw/{country}/substations_node_raw.geojson", - transformers_way="data/osm/raw/{country}/transformers_way_raw.geojson", - transformers_node="data/osm/raw/{country}/transformers_node_raw.geojson", + cables_way="data/osm/raw/{country}/cables_way_raw.json", + lines_way="data/osm/raw/{country}/lines_way_raw.json", + substations_way="data/osm/raw/{country}/substations_way_raw.json", + substations_node="data/osm/raw/{country}/substations_node_raw.json", + transformers_way="data/osm/raw/{country}/transformers_way_raw.json", + transformers_node="data/osm/raw/{country}/transformers_node_raw.json", + relations="data/osm/raw/{country}/relations_raw.json", log: logs("retrieve_osm_data_{country}.log"), script: "../scripts/retrieve_osm_data.py" -FEATURES = ["cables_way", "lines_way", "substations_way", "substations_node", "transformers_way", "transformers_node"] +# FEATURES = ["cables_way", "lines_way", "substations_way", "substations_node", "transformers_way", "transformers_node"] rule clean_osm_data: # params: # countries=config["countries"], input: - **{ - f"{country}": [f"data/osm/raw/{country}/{feature}.geojson" for feature in FEATURES] - for country in config["countries"] - }, - # cables_way[country]=[f"data/osm/raw/{country}/cables_way_raw.geojson" for country in 
config["countries"]], - # lines_way=[f"data/osm/raw/{country}/lines_way_raw.geojson" for country in config["countries"]], - # substations_way=[f"data/osm/raw/{country}/substations_way_raw.geojson" for country in config["countries"]], - # substations_node=[f"data/osm/raw/{country}/substations_node_raw.geojson" for country in config["countries"]], - # transformers_way=[f"data/osm/raw/{country}/transformers_way_raw.geojson" for country in config["countries"]], - # transformers_node=[f"data/osm/raw/{country}/transformers_node_raw.geojson" for country in config["countries"]], + # **{ + # f"{country}": [f"data/osm/raw/{country}/{feature}.geojson" for feature in FEATURES] + # for country in config["countries"] + # }, + cables_way=[f"data/osm/raw/{country}/cables_way_raw.json" for country in config["countries"]], + lines_way=[f"data/osm/raw/{country}/lines_way_raw.json" for country in config["countries"]], + substations_way=[f"data/osm/raw/{country}/substations_way_raw.json" for country in config["countries"]], + substations_node=[f"data/osm/raw/{country}/substations_node_raw.json" for country in config["countries"]], + transformers_way=[f"data/osm/raw/{country}/transformers_way_raw.json" for country in config["countries"]], + transformers_node=[f"data/osm/raw/{country}/transformers_node_raw.json" for country in config["countries"]], + relations=[f"data/osm/raw/{country}/relations_raw.json" for country in config["countries"]], output: dummy="data/osm/raw/dummy.txt" # cables="resources/RDIR/cables_clean_.geojson" diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 305a9fb98..e534801c3 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -6,14 +6,12 @@ TODO To fill later """ -# import geojson import geopandas as gpd +import json import logging -# import numpy as np -# import os import pandas as pd -# import pypsa -# import requests +import re +from shapely.geometry import LineString, Point import tqdm.auto as tqdm from _helpers 
import configure_logging @@ -24,6 +22,125 @@ def clean_osm_data(output): file.write("Hello, world!\n") +def _create_linestring(row): + coords = [(coord['lon'], coord['lat']) for coord in row["geometry"]] + return LineString(coords) + + +def _clean_voltage(column): + """ + Function to clean the raw voltage column: manual fixing and drop nan values + + Args: + - column: pandas Series, the column to be cleaned + + Returns: + - column: pandas Series, the cleaned column + """ + column = ( + column + .astype(str) + .str.lower() + .str.replace("fixme", "") + .str.replace("(temp 150000)", "") + .str.replace("low", "1000") + .str.replace("minor", "1000") + .str.replace("medium", "33000") + .str.replace("med", "33000") + .str.replace("m", "33000") + .str.replace("high", "150000") + .str.replace("unknown", "") + .str.replace("23000-109000", "109000") + .str.replace("INF", "") + .str.replace("<", "") + .str.replace("?", "") + .str.replace(",", "") + .str.replace(" ", "") + .str.replace("_", "") + .str.replace("kv", "000") + .str.replace("v", "") + .str.replace("/", ";") + .str.replace("nan", "") + .str.replace("", "") + ) + + # Remove all remaining non-numeric characters except for semicolons + column = column.apply(lambda x: re.sub(r'[^0-9;]', '', x)) + + column.dropna(inplace=True) + return column + + +def _clean_circuits(column): + """ + Function to clean the raw circuits column: manual fixing and drop nan values + + Args: + - column: pandas Series, the column to be cleaned + + Returns: + - column: pandas Series, the cleaned column + """ + column = column.copy() + column = ( + column + .astype(str) + .str.replace("partial", "") + .str.replace("1operator=RTE operator:wikidata=Q2178795", "") + .str.lower() + .str.replace("1/3", "1") + .str.replace("", "") + .str.replace("nan", "") + ) + + # Remove all remaining non-numeric characters except for semicolons + column = column.apply(lambda x: re.sub(r'[^0-9;]', '', x)) + + column.dropna(inplace=True) + return column.astype(str) + 
+ +def _clean_frequency(column): + column = column.copy() + to_fifty = column.astype(str) != "0" + column[to_fifty] = "50" + + return column + + +def _split_voltage(df): + to_split = df['voltage'].str.contains(';') + new_rows = [] + for index, row in df[to_split].iterrows(): + split_values = row["voltage"].split(';') + new_sub_id_len = int(len(split_values)) + for i, value in enumerate(split_values): + new_sub_id = str(i+1) + new_id = str(row['id']) + '_' + new_sub_id + new_row = { + 'id': new_id, + 'sub_id': new_sub_id, + 'sub_id_len': new_sub_id_len, + 'bounds': row['bounds'], + 'nodes': row['nodes'], + 'geometry': row['geometry'], + 'power': row['power'], + 'cables': row['cables'], + 'circuits': row['circuits'], + 'frequency': row['frequency'], + 'voltage': value, + 'wires': row['wires'],} + new_rows.append(new_row) + + # Create DataFrame from split rows + split_df = pd.DataFrame(new_rows) + df_new = pd.concat([df[~to_split], split_df]) + df_new["sub_id_len"] = df_new["sub_id_len"].astype(int) + + # Append the original DataFrame with split_df + return df_new + + if __name__ == "__main__": # Detect running outside of snakemake and mock snakemake for testing if "snakemake" not in globals(): @@ -34,13 +151,235 @@ def clean_osm_data(output): configure_logging(snakemake) logger.info("Dummy log: clean_osm_data()") - output = str(snakemake.output) - clean_osm_data(output) + # input_path = snakemake.input.lines_way + snakemake.input.cables_way + # input_path = { + # "lines": snakemake.input.lines_way, + # "cables": snakemake.input.cables_way, + # } + + # columns = ["id", "sub_id", "sub_id_len", "bounds", "nodes", "geometry", "power", "cables", "circuits", "frequency", "voltage", "wires"] + # df_lines = pd.DataFrame(columns=columns) + # crs = "EPSG:4326" + + # # using tqdm loop over input path + + # for key in input_path: + # logger.info(f"Processing {key}...") + # for idx, ip in enumerate(input_path[key]): + # if os.path.exists(ip) and os.path.getsize(ip) > 400: # 
unpopulated OSM json is about 51 bytes + # logger.info(f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(input_path[key])).zfill(2)}: {ip}") + # with open(ip, "r") as f: + # data = json.load(f) + + # df = pd.DataFrame(data['elements']) + # df["id"] = df["id"].astype(str) + # df["sub_id"] = "0" # initiate sub_id column with 0 + # df["sub_id_len"] = 0 # initiate sub_id column with 0 + + # col_tags = ["power", "cables", "circuits", "frequency", "voltage", "wires"] + + # tags = pd.json_normalize(df["tags"]) \ + # .map(lambda x: str(x) if pd.notnull(x) else x) + + # for ct in col_tags: + # if ct not in tags.columns: + # tags[ct] = pd.NA + + # tags = tags.loc[:, col_tags] + + # df = pd.concat([df, tags], axis="columns") + # df.drop(columns=["type", "tags"], inplace=True) + + # df_lines = pd.concat([df_lines, df], axis="rows") + + # else: + # logger.info(f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(input_path[key])).zfill(2)} (empty): {ip}") + # continue + # logger.info("---") + + # # Drop duplicates + # df_lines.drop_duplicates(subset="id", inplace=True) + + # df_lines["voltage"] = _clean_voltage(df_lines["voltage"]) + # # drop voltage = "" + # df_lines = _split_voltage(df_lines) + # df_lines = df_lines[df_lines["voltage"] != ""] + # df_lines["voltage"] = df_lines["voltage"].astype(int, errors="ignore") + + # # Drop voltages below 220 kV + # df_lines = df_lines[df_lines["voltage"] >= 220000] - # Create df by iterating over lines_way and append them to df_lines_way - gdf1 = gpd.read_file(snakemake.input["lines_way"]) + # # Clean frequencies + # df_lines["frequency"] = _clean_frequency(df_lines["frequency"]) + # df_lines["frequency"] = df_lines["frequency"].astype(int, errors="ignore") + + # # Clean circuits + # df_lines["circuits"] = _clean_circuits(df_lines["circuits"]) + # # Map correct circuits to lines that where split + + # # Initiate new column for cleaned circuits with values that are already valid: + # # Condition 1: Length of sub_id is 0, the line was not 
split + # # Condition 2: Number of entries in circuits separated by semicolon is 1, value is unique + # # Condition 3: Circuits is not an empty string + # # Condition 4: Circuits is not "0" + # bool_circuits_valid = (df_lines["sub_id_len"] == 0) & \ + # (df_lines["circuits"].apply(lambda x: len(x.split(";"))) == 1) & \ + # (df_lines["circuits"] != "") & \ + # (df_lines["circuits"] != "0") + + # df_lines.loc[bool_circuits_valid, "circuits_clean"] = df_lines.loc[bool_circuits_valid, "circuits"] + # # Boolean to check if sub_id_len is equal to the number of circuits + # bool_equal = df_lines["sub_id_len"] == df_lines["circuits"] \ + # .apply(lambda x: len(x.split(";"))) + # op_equal = lambda row: row["circuits"].split(";")[int(row["sub_id"])-1] + + # df_lines.loc[bool_equal, "circuits_clean"] = df_lines[bool_equal] \ + # .apply(op_equal, axis=1) + + # bool_larger = df_lines["sub_id_len"] > \ + # df_lines["circuits"].apply(lambda x: len(x.split(";"))) + + # pd.set_option('display.max_rows', None) + # df_lines.loc[bool_larger, ["id", "sub_id", "sub_id_len", "cables", "circuits", "circuits_clean", "frequency"]] + + + + + + # df_lines[df_lines["sub_id_len"] > 0]["circuits"] + + + # df_lines["geometry"] = df_lines.apply(_create_linestring, axis=1) + # gdf = gpd.GeoDataFrame( + # df_lines[["id", "sub_id", "sub_id_len", "power", "cables", "circuits", "voltage", "geometry"]], + # geometry = "geometry", crs = "EPSG:4326" + # ) - snakemake.wildcards - snakemake.input["lines_way"].keys() + # gdf.explore() + # df_lines.voltage.unique() + + # df_lines.circuits.apply(lambda x: x.split(";")).explode().unique() + + # ol_lines_way = ["id", "power", "cables", "circuits", "frequency", "voltage"] + + # # gdf = gpd.read_file(lines_way[3]) + # # gdf2 = gpd.GeoDataFrame(gdf, geometry=gdf.geometry) + # # df = gdf.to_json() + + # # gdf.to_file("example.geojson", layer_options={"ID_GENERATE": "YES"}) + + + output = str(snakemake.output) + clean_osm_data(output) + + + + +# # Example DataFrame 
+# data = {'id': ["ID1", "ID2", "ID3", "ID4", "ID5"], +# 'A': ["220000", "380000", ";100000", "220000;220000;380000", "220000;;400000;700000"], +# 'B': [1, 2, 3, 4, 5], +# 'C': [6, 7, 8, 9, 10]} +# df = pd.DataFrame(data) + +# # Split the entries in column A that contain a semicolon +# split_rows = df[df['A'].str.contains(';')] +# split_values = split_rows['A'].str.split(';', expand=True) + +# # Create two copies of the rows containing semicolons, one for each split value +# split_rows_1 = split_rows.copy() +# split_rows_2 = split_rows.copy() + +# # Update column A in the split rows to contain the split values +# split_rows_1['A'] = split_values[0] +# split_rows_2['A'] = split_values[1] + +# # Concatenate the split rows with the original DataFrame, excluding the rows containing semicolons +# result_df = pd.concat([df[~df.index.isin(split_rows.index)], split_rows_1, split_rows_2], ignore_index=True) + +# # Display the result +# print(result_df) + + +# '# Sample DataFrame +# data = {'id': ["ID1", "ID2", "ID3", "ID4", "ID5"], +# 'voltage': ["220000", "380000", ";100000", "220000;220000;380000", "220000;;400000;700000"], +# 'B': [1, 2, 3, 4, 5], +# 'C': [6, 7, 8, 9, 10]} +# df = pd.DataFrame(data) + +# # Find rows to split +# to_split = df['voltage'].str.contains(';') + +# # Splitting entries and creating new rows + + +# new_rows = [] + +# for index, row in df[to_split].iterrows(): +# split_values = row["voltage"].split(';') +# for i, value in enumerate(split_values): +# new_id = str(row['id']) + '_' + str(i+1) +# new_row = { +# 'id': new_id, +# 'bounds': row['bounds'], +# 'nodes': row['nodes'], +# 'geometry': row['geometry'], +# 'cables': row['cables'], +# 'circuits': row['circuits'], +# 'frequency': row['frequency'], +# 'voltage': value, +# 'wires': row['wires'],} +# new_rows.append(new_row) + +# # Create DataFrame from split rows +# split_df = pd.DataFrame(new_rows) + +# # Append the original DataFrame with split_df +# final_df = pd.concat([df[~to_split], split_df]) 
+ +# print(final_df) + + + +# from shapely.geometry import LineString +# import numpy as np +# import matplotlib.pyplot as plt + +# def offset_line(original_line, distance): +# # Compute the direction vector between the two endpoints +# direction_vector = np.array(original_line.coords[1]) - np.array(original_line.coords[0]) + +# # Compute the orthogonal vector +# orthogonal_vector = np.array([-direction_vector[1], direction_vector[0]]) + +# # Normalize the orthogonal vector +# orthogonal_vector /= np.linalg.norm(orthogonal_vector) + +# # Compute the offset LineString +# offset_points = [] +# for point in original_line.coords: +# offset_point = np.array(point) + distance * orthogonal_vector +# offset_points.append((offset_point[0], offset_point[1])) + +# return LineString(offset_points) + +# # Example usage: +# original_line = lines.iloc[5] +# offset_distance = 1.0 +# b = offset_line(original_line, offset_distance) +# # Plot both LineStrings +# fig, ax = plt.subplots() +# x, y = original_line.xy +# ax.plot(x, y, label='Original LineString') +# x, y = offset_line.xy +# ax.plot(x, y, label='Offset LineString') +# ax.set_aspect('equal') +# ax.legend() +# plt.xlabel('X') +# plt.ylabel('Y') +# plt.title('Original and Offset LineStrings') +# plt.grid(True) +# plt.show() \ No newline at end of file diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 47592d296..15eec040d 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -6,9 +6,9 @@ TODO To fill later """ -import geojson +import json import logging -import overpass as op +# import overpass as op import os import requests import time @@ -43,7 +43,10 @@ def _get_overpass_areas(countries): # Check if the response contains any results if "elements" in data and len(data["elements"]) > 0: # Extract the area ID from the relation - osm_area_id = data["elements"][0]["id"] + if c == "FR": # take second one for France + osm_area_id = data["elements"][1]["id"] + else: + 
osm_area_id = data["elements"][0]["id"] osm_areas.append(f"area({osm_area_id})") else: # Print a warning if no results are found for the country code @@ -65,10 +68,14 @@ def retrieve_osm_data( "substations_node", "transformers_way", "transformers_node", + "relations", ]): op_area = _get_overpass_areas(country) + # Overpass API endpoint URL + overpass_url = "https://overpass-api.de/api/interpreter" + features_dict= { 'cables_way': 'way["power"="cable"]', 'lines_way': 'way["power"="line"]', @@ -76,6 +83,7 @@ def retrieve_osm_data( 'substations_node': 'node["power"="substation"]', 'transformers_way': 'way["power"="transformer"]', 'transformers_node': 'node["power"="transformer"]', + 'relations': 'rel["route"="power"]["type"="route"]' } for f in features: @@ -86,6 +94,7 @@ def retrieve_osm_data( logger.info(f" - Fetching OSM data for feature '{f}' in {country}...") # Build the overpass query op_query = f''' + [out:json]; {op_area[country]}->.searchArea; ( {features_dict[f]}(area.searchArea); @@ -94,8 +103,8 @@ def retrieve_osm_data( ''' # Send the request - # response = requests.post(overpass_url, data = op_query) - response = op.API(timeout=300).get(op_query) # returns data in geojson format. Timeout (max.) set to 300s + response = requests.post(overpass_url, data = op_query) + # response = op.API(timeout=300).get(op_query) # returns data in geojson format. Timeout (max.) 
set to 300s filepath = output[f] parentfolder = os.path.dirname(filepath) @@ -104,10 +113,10 @@ def retrieve_osm_data( os.makedirs(parentfolder) with open(filepath, mode = "w") as f: - geojson.dump(response,f,indent=2) - # geojson.dump(response.json(),f,indent=2) + # geojson.dump(response,f,indent=2) + json.dump(response.json(),f,indent=2) logger.info(" - Done.") - time.sleep(5) + # time.sleep(5) if __name__ == "__main__": From e6c9acce29a2981a2c6bc8830bae7d656847d3b0 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Fri, 3 May 2024 17:14:28 +0200 Subject: [PATCH 004/100] Updated clean_osm_data and retrieve_osm_data to create clean substations. --- rules/build_electricity.smk | 18 +- scripts/clean_osm_data.py | 716 +++++++++++++++++++++++++---------- scripts/retrieve_osm_data.py | 21 +- 3 files changed, 546 insertions(+), 209 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index c4c89c472..23bf99969 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -615,9 +615,10 @@ if config["osm"].get("retrieve", True): lines_way="data/osm/raw/{country}/lines_way_raw.json", substations_way="data/osm/raw/{country}/substations_way_raw.json", substations_node="data/osm/raw/{country}/substations_node_raw.json", - transformers_way="data/osm/raw/{country}/transformers_way_raw.json", - transformers_node="data/osm/raw/{country}/transformers_node_raw.json", - relations="data/osm/raw/{country}/relations_raw.json", + substations_relation="data/osm/raw/{country}/substations_relation_raw.json", + # transformers_way="data/osm/raw/{country}/transformers_way_raw.json", + # transformers_node="data/osm/raw/{country}/transformers_node_raw.json", + # route_relations="data/osm/raw/{country}/route_relations_raw.json", log: logs("retrieve_osm_data_{country}.log"), script: @@ -636,13 +637,12 @@ rule clean_osm_data: lines_way=[f"data/osm/raw/{country}/lines_way_raw.json" for country in config["countries"]], 
substations_way=[f"data/osm/raw/{country}/substations_way_raw.json" for country in config["countries"]], substations_node=[f"data/osm/raw/{country}/substations_node_raw.json" for country in config["countries"]], - transformers_way=[f"data/osm/raw/{country}/transformers_way_raw.json" for country in config["countries"]], - transformers_node=[f"data/osm/raw/{country}/transformers_node_raw.json" for country in config["countries"]], - relations=[f"data/osm/raw/{country}/relations_raw.json" for country in config["countries"]], + substations_relation=[f"data/osm/raw/{country}/substations_relation_raw.json" for country in config["countries"]], + # transformers_way=[f"data/osm/raw/{country}/transformers_way_raw.json" for country in config["countries"]], + # transformers_node=[f"data/osm/raw/{country}/transformers_node_raw.json" for country in config["countries"]], + # route_relations=[f"data/osm/raw/{country}/route_relations_raw.json" for country in config["countries"]], output: - dummy="data/osm/raw/dummy.txt" - # cables="resources/RDIR/cables_clean_.geojson" - # lines= + substations="data/osm/clean/substations.geojson", log: logs("clean_osm_data.log"), script: diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index e534801c3..90e3ca17e 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -6,12 +6,17 @@ TODO To fill later """ +from branca.element import Figure +import folium import geopandas as gpd import json import logging +import os +import numpy as np import pandas as pd import re -from shapely.geometry import LineString, Point +from shapely.geometry import LineString, Point, Polygon +from shapely.ops import linemerge import tqdm.auto as tqdm from _helpers import configure_logging @@ -27,6 +32,29 @@ def _create_linestring(row): return LineString(coords) +def _create_polygon(row): + """ + Create a Shapely Polygon from a list of coordinate dictionaries. 
+ + Parameters: + coords (list): List of dictionaries with 'lat' and 'lon' keys representing coordinates. + + Returns: + shapely.geometry.Polygon: The constructed polygon object. + """ + # Extract coordinates as tuples + point_coords = [(coord['lon'], coord['lat']) for coord in row["geometry"]] + + # Ensure closure by repeating the first coordinate as the last coordinate + if point_coords[0] != point_coords[-1]: + point_coords.append(point_coords[0]) + + # Create Polygon object + polygon = Polygon(point_coords) + + return polygon + + def _clean_voltage(column): """ Function to clean the raw voltage column: manual fixing and drop nan values @@ -37,11 +65,21 @@ def _clean_voltage(column): Returns: - column: pandas Series, the cleaned column """ + column = column.copy() + + column = ( + column + .astype(str) + .str.lower() + .str.replace("400/220/110 kV'", "400000;220000;110000") + .str.replace("400/220/110/20_kv", "400000;220000;110000;20000") + .str.replace("2x25000", "25000;25000") + ) + column = ( column .astype(str) .str.lower() - .str.replace("fixme", "") .str.replace("(temp 150000)", "") .str.replace("low", "1000") .str.replace("minor", "1000") @@ -49,23 +87,20 @@ def _clean_voltage(column): .str.replace("med", "33000") .str.replace("m", "33000") .str.replace("high", "150000") - .str.replace("unknown", "") .str.replace("23000-109000", "109000") - .str.replace("INF", "") - .str.replace("<", "") - .str.replace("?", "") - .str.replace(",", "") - .str.replace(" ", "") - .str.replace("_", "") + .str.replace("380000>220000", "380000;220000") + .str.replace(":", ";") + .str.replace("<", ";") + .str.replace(",", ";") .str.replace("kv", "000") - .str.replace("v", "") + .str.replace("kva", "000") .str.replace("/", ";") .str.replace("nan", "") - .str.replace("", "") + .str.replace("", "") ) # Remove all remaining non-numeric characters except for semicolons - column = column.apply(lambda x: re.sub(r'[^0-9;]', '', x)) + column = column.apply(lambda x: re.sub(r'[^0-9;]', 
'', str(x))) column.dropna(inplace=True) return column @@ -88,8 +123,75 @@ def _clean_circuits(column): .str.replace("partial", "") .str.replace("1operator=RTE operator:wikidata=Q2178795", "") .str.lower() + .str.replace("1,5", "3") # (way 998005838, should be corrected in OSM soon) .str.replace("1/3", "1") - .str.replace("", "") + .str.replace("", "") + .str.replace("nan", "") + ) + + # Remove all remaining non-numeric characters except for semicolons + column = column.apply(lambda x: re.sub(r'[^0-9;]', '', x)) + + column.dropna(inplace=True) + return column.astype(str) + + +def _clean_cables(column): + """ + Function to clean the raw cables column: manual fixing and drop nan values + + Args: + - column: pandas Series, the column to be cleaned + + Returns: + - column: pandas Series, the cleaned column + """ + column = column.copy() + column = ( + column + .astype(str) + .str.lower() + .str.replace("1/3", "1") + .str.replace("3x2;2", "3") + .str.replace("", "") + .str.replace("nan", "") + ) + + # Remove all remaining non-numeric characters except for semicolons + column = column.apply(lambda x: re.sub(r'[^0-9;]', '', x)) + + column.dropna(inplace=True) + return column.astype(str) + + +def _clean_wires(column): + """ + Function to clean the raw wires column: manual fixing and drop nan values + + Args: + - column: pandas Series, the column to be cleaned + + Returns: + - column: pandas Series, the cleaned column + """ + column = column.copy() + column = ( + column + .astype(str) + .str.lower() + .str.replace("?", "") + .str.replace("trzyprzewodowe", "3") + .str.replace("pojedyƄcze", "1") + .str.replace("single", "1") + .str.replace("double", "2") + .str.replace("triple", "3") + .str.replace("quad", "4") + .str.replace("fivefold", "5") + .str.replace("yes", "3") + .str.replace("1/3", "1") + .str.replace("3x2;2", "3") + .str.replace("_", "") + .str.replace("", "") .str.replace("nan", "") ) @@ -100,7 +202,7 @@ def _clean_circuits(column): return column.astype(str) -def 
_clean_frequency(column): +def _set_frequency(column): column = column.copy() to_fifty = column.astype(str) != "0" column[to_fifty] = "50" @@ -108,6 +210,46 @@ def _clean_frequency(column): return column +def _check_voltage(voltage, list_voltages): + voltages = voltage.split(';') + for v in voltages: + if v in list_voltages: + return True + return False + + +def _clean_frequency(column): + column = column.copy() + """ + Function to clean the raw frequency column: manual fixing and drop nan values + + Args: + - column: pandas Series, the column to be cleaned + + Returns: + - column: pandas Series, the cleaned column + """ + column = column.copy() + column = ( + column + .astype(str) + .str.lower() + .str.replace("16.67", "16.7") + .str.replace("16,7", "16.7") + .str.replace("?", "") + .str.replace("hz", "") + .str.replace(" ", "") + .str.replace("", "") + .str.replace("nan", "") + ) + + # Remove all remaining non-numeric characters except for semicolons + column = column.apply(lambda x: re.sub(r'[^0-9;.]', '', x)) + + column.dropna(inplace=True) + return column.astype(str) + + def _split_voltage(df): to_split = df['voltage'].str.contains(';') new_rows = [] @@ -124,6 +266,7 @@ def _split_voltage(df): 'bounds': row['bounds'], 'nodes': row['nodes'], 'geometry': row['geometry'], + 'country': row['country'], 'power': row['power'], 'cables': row['cables'], 'circuits': row['circuits'], @@ -141,6 +284,66 @@ def _split_voltage(df): return df_new +def _split_cells(df, cols=["voltage"]): + """ + Split semicolon separated cells i.e. [66000;220000] and create new + identical rows. 
+ + Parameters + ---------- + df : dataframe + Dataframe under analysis + cols : list + List of target columns over which to perform the analysis + + Example + ------- + Original data: + row 1: '66000;220000', '50' + + After applying split_cells(): + row 1, '66000', '50', 2 + row 2, '220000', '50', 2 + """ + if df.empty: + return df + + # Create a dictionary to store the suffix count for each original ID + suffix_counts = {} + # Create a dictionary to store the number of splits associated with each original ID + num_splits = {} + + # Split cells and create new rows + x = df.assign(**{col: df[col].str.split(";") for col in cols}) + x = x.explode(cols, ignore_index=True) + + # Count the number of splits associated with each original ID + num_splits = x.groupby('id').size().to_dict() + + # Update the 'split_elements' column + x["split_elements"] = x["id"].map(num_splits) + + # Function to generate the new ID with suffix and update the number of splits + def generate_new_id(row): + original_id = row["id"] + if row["split_elements"] == 1: + return original_id + else: + suffix_counts[original_id] = suffix_counts.get(original_id, 0) + 1 + return f"{original_id}_{suffix_counts[original_id]}" + + # Update the ID column with the new IDs + x["id"] = x.apply(generate_new_id, axis=1) + + return x + + +# Function to check if any substring is in valid_strings +def _any_substring_in_list(s, list_strings): + substrings = s.split(';') + return any(sub in list_strings for sub in substrings) + + if __name__ == "__main__": # Detect running outside of snakemake and mock snakemake for testing if "snakemake" not in globals(): @@ -151,235 +354,360 @@ def _split_voltage(df): configure_logging(snakemake) logger.info("Dummy log: clean_osm_data()") - # input_path = snakemake.input.lines_way + snakemake.input.cables_way - # input_path = { - # "lines": snakemake.input.lines_way, - # "cables": snakemake.input.cables_way, - # } - - # columns = ["id", "sub_id", "sub_id_len", "bounds", "nodes", 
"geometry", "power", "cables", "circuits", "frequency", "voltage", "wires"] - # df_lines = pd.DataFrame(columns=columns) - # crs = "EPSG:4326" - - # # using tqdm loop over input path - - # for key in input_path: - # logger.info(f"Processing {key}...") - # for idx, ip in enumerate(input_path[key]): - # if os.path.exists(ip) and os.path.getsize(ip) > 400: # unpopulated OSM json is about 51 bytes - # logger.info(f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(input_path[key])).zfill(2)}: {ip}") - # with open(ip, "r") as f: - # data = json.load(f) + ############# BUSES / SUBSTATIONS ###################### + input_path_substations = { + "substations_way": snakemake.input.substations_way, + "substations_relation": snakemake.input.substations_relation, + } + + cols_substations_way = ["id", "geometry", "country", "power", "substation", "voltage", "frequency"] + cols_substations_relation = ["id", "country", "power", "substation", "voltage", "frequency"] + df_substations_way = pd.DataFrame(columns = cols_substations_way) + df_substations_relation = pd.DataFrame(columns = cols_substations_relation) + + for key in input_path_substations: + logger.info(f"Processing {key}...") + for idx, ip in enumerate(input_path_substations[key]): + if os.path.exists(ip) and os.path.getsize(ip) > 400: # unpopulated OSM json is about 51 bytes + country = os.path.basename(os.path.dirname(input_path_substations[key][idx])) + logger.info(f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(input_path_substations[key])).zfill(2)}: {ip}") + with open(ip, "r") as f: + data = json.load(f) - # df = pd.DataFrame(data['elements']) - # df["id"] = df["id"].astype(str) - # df["sub_id"] = "0" # initiate sub_id column with 0 - # df["sub_id_len"] = 0 # initiate sub_id column with 0 + df = pd.DataFrame(data['elements']) + df["id"] = df["id"].astype(str) + df["country"] = country - # col_tags = ["power", "cables", "circuits", "frequency", "voltage", "wires"] + col_tags = ["power", "substation", "voltage", 
"frequency"] - # tags = pd.json_normalize(df["tags"]) \ - # .map(lambda x: str(x) if pd.notnull(x) else x) + tags = pd.json_normalize(df["tags"]) \ + .map(lambda x: str(x) if pd.notnull(x) else x) - # for ct in col_tags: - # if ct not in tags.columns: - # tags[ct] = pd.NA + for ct in col_tags: + if ct not in tags.columns: + tags[ct] = pd.NA - # tags = tags.loc[:, col_tags] + tags = tags.loc[:, col_tags] - # df = pd.concat([df, tags], axis="columns") - # df.drop(columns=["type", "tags"], inplace=True) - - # df_lines = pd.concat([df_lines, df], axis="rows") + df = pd.concat([df, tags], axis="columns") - # else: - # logger.info(f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(input_path[key])).zfill(2)} (empty): {ip}") - # continue - # logger.info("---") - - # # Drop duplicates - # df_lines.drop_duplicates(subset="id", inplace=True) + if key == "substations_way": + df.drop(columns=["type", "tags", "bounds", "nodes"], inplace=True) + df_substations_way = pd.concat([df_substations_way, df], axis="rows") + elif key == "substations_relation": + df.drop(columns=["type", "tags", "bounds"], inplace=True) + df_substations_relation = pd.concat([df_substations_relation, df], axis="rows") - # df_lines["voltage"] = _clean_voltage(df_lines["voltage"]) - # # drop voltage = "" - # df_lines = _split_voltage(df_lines) - # df_lines = df_lines[df_lines["voltage"] != ""] - # df_lines["voltage"] = df_lines["voltage"].astype(int, errors="ignore") + else: + logger.info(f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(input_path_substations[key])).zfill(2)} (empty): {ip}") + continue + logger.info("---") - # # Drop voltages below 220 kV - # df_lines = df_lines[df_lines["voltage"] >= 220000] + df_substations_way.drop_duplicates(subset='id', keep='first', inplace=True) + df_substations_relation.drop_duplicates(subset='id', keep='first', inplace=True) - # # Clean frequencies - # df_lines["frequency"] = _clean_frequency(df_lines["frequency"]) - # df_lines["frequency"] = 
df_lines["frequency"].astype(int, errors="ignore") + df_substations_way["geometry"] = df_substations_way.apply(_create_polygon, axis=1) - # # Clean circuits - # df_lines["circuits"] = _clean_circuits(df_lines["circuits"]) - # # Map correct circuits to lines that where split + # Normalise the members column of df_substations_relation + cols_members = ["id", "type", "ref", "role", "geometry"] + df_substations_relation_members = pd.DataFrame(columns = cols_members) + + for index, row in df_substations_relation.iterrows(): + col_members = ["type", "ref", "role", "geometry"] + df = pd.json_normalize(row["members"]) + + for cm in col_members: + if cm not in df.columns: + df[cm] = pd.NA + + df = df.loc[:, col_members] + df["id"] = str(row["id"]) + df["ref"] = df["ref"].astype(str) + df = df[df["type"] != "node"] + df = df.dropna(subset=["geometry"]) + df = df[~df["role"].isin(["", "incoming_line", "substation", "inner"])] + df_substations_relation_members = pd.concat([df_substations_relation_members, df], axis="rows") - # # Initiate new column for cleaned circuits with values that are already valid: - # # Condition 1: Length of sub_id is 0, the line was not split - # # Condition 2: Number of entries in circuits separated by semicolon is 1, value is unique - # # Condition 3: Circuits is not an empty string - # # Condition 4: Circuits is not "0" - # bool_circuits_valid = (df_lines["sub_id_len"] == 0) & \ - # (df_lines["circuits"].apply(lambda x: len(x.split(";"))) == 1) & \ - # (df_lines["circuits"] != "") & \ - # (df_lines["circuits"] != "0") - - # df_lines.loc[bool_circuits_valid, "circuits_clean"] = df_lines.loc[bool_circuits_valid, "circuits"] + df_substations_relation_members.reset_index(inplace=True) + df_substations_relation_members["linestring"] = df_substations_relation_members.apply(_create_linestring, axis=1) + df_substations_relation_members_grouped = df_substations_relation_members.groupby('id')['linestring'] \ + .apply(lambda x: 
linemerge(x.tolist())).reset_index() + df_substations_relation_members_grouped["geometry"] = df_substations_relation_members_grouped["linestring"].apply(lambda x: x.convex_hull) - # # Boolean to check if sub_id_len is equal to the number of circuits - # bool_equal = df_lines["sub_id_len"] == df_lines["circuits"] \ - # .apply(lambda x: len(x.split(";"))) - # op_equal = lambda row: row["circuits"].split(";")[int(row["sub_id"])-1] - - # df_lines.loc[bool_equal, "circuits_clean"] = df_lines[bool_equal] \ - # .apply(op_equal, axis=1) + df_substations_relation = df_substations_relation.join( + df_substations_relation_members_grouped.set_index('id'), + on='id', how='left' + ).drop(columns=["members", "linestring"]) \ + .dropna(subset=["geometry"]) - # bool_larger = df_lines["sub_id_len"] > \ - # df_lines["circuits"].apply(lambda x: len(x.split(";"))) + # reorder columns and concatenate + df_substations_relation = df_substations_relation[cols_substations_way] + df_substations = pd.concat([df_substations_way, df_substations_relation], axis="rows") + + # Create centroids from geometries + df_substations.loc[:, "geometry"] = df_substations["geometry"].apply(lambda x: x.centroid) + df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x) + df_substations.loc[:, "lat"] = df_substations["geometry"].apply(lambda x: x.y) + + # Clean columns + df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) + df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) + df_substations["frequency"] = df_substations["frequency"].astype(str, errors="ignore") + + list_voltages = df_substations["voltage"].str.split(";").explode().unique().astype(str) + list_voltages = list_voltages[np.vectorize(len)(list_voltages) >= 6] + list_voltages = list_voltages[~np.char.startswith(list_voltages, '1')] + + bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) + df_substations = df_substations[bool_voltages] + + 
df_substations = _split_cells(df_substations) + bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) + df_substations = df_substations[bool_voltages] + df_substations["split_count"] = df_substations["id"].apply(lambda x: x.split("_")[1] if "_" in x else "0") + df_substations["split_count"] = df_substations["split_count"].astype(int) + + bool_split = df_substations["split_elements"] > 1 + bool_frequency_len = df_substations["frequency"].apply(lambda x: len(x.split(";"))) == df_substations["split_elements"] + df_substations.loc[bool_frequency_len & bool_split, "frequency"] = df_substations.loc[bool_frequency_len & bool_split, "frequency"] \ - # pd.set_option('display.max_rows', None) - # df_lines.loc[bool_larger, ["id", "sub_id", "sub_id_len", "cables", "circuits", "circuits_clean", "frequency"]] - - - + op_freq = lambda row: row["frequency"].split(";")[row["split_count"]-1] - - # df_lines[df_lines["sub_id_len"] > 0]["circuits"] - - - # df_lines["geometry"] = df_lines.apply(_create_linestring, axis=1) - # gdf = gpd.GeoDataFrame( - # df_lines[["id", "sub_id", "sub_id_len", "power", "cables", "circuits", "voltage", "geometry"]], - # geometry = "geometry", crs = "EPSG:4326" - # ) + df_substations.loc[bool_frequency_len & bool_split, ["frequency"]] = df_substations.loc[bool_frequency_len & bool_split, ] \ + .apply(op_freq, axis=1) + + df_substations = _split_cells(df_substations, cols=["frequency"]) + bool_invalid_frequency = df_substations["frequency"].apply(lambda x: x not in ["50", "0"]) + df_substations.loc[bool_invalid_frequency, "frequency"] = "50" + df_substations["power"] = "substation" + df_substations["substation"] = "transmission" + df_substations["dc"] = False + df_substations.loc[df_substations["frequency"] == "0", "dc"] = True + df_substations["under_construction"] = False + df_substations["station_id"] = None + df_substations["tag_area"] = None + + # rename columns + df_substations.rename( + columns={ + "id": 
"bus_id", + "power": "symbol", + "substation":"tag_substation", + }, inplace=True) - # gdf.explore() - # df_lines.voltage.unique() + df_substations = df_substations[[ + "bus_id", + "symbol", + "tag_substation", + "voltage", + "lon", + "lat", + "dc", + "under_construction", + "station_id", + "tag_area", + "country", + "geometry", + ]] + + gdf_substations = gpd.GeoDataFrame(df_substations, geometry = "geometry", crs = "EPSG:4326") - # df_lines.circuits.apply(lambda x: x.split(";")).explode().unique() + filepath_substations = snakemake.output["substations"] + # save substations output + logger.info(f"Exporting clean substations to {filepath_substations}") + parentfolder_substations = os.path.dirname(filepath_substations) + if not os.path.exists(parentfolder_substations): + # Create the folder and its parent directories if they don't exist + os.makedirs(parentfolder_substations) - # ol_lines_way = ["id", "power", "cables", "circuits", "frequency", "voltage"] + gdf_substations.to_file(filepath_substations, driver="GeoJSON") - # # gdf = gpd.read_file(lines_way[3]) - # # gdf2 = gpd.GeoDataFrame(gdf, geometry=gdf.geometry) - # # df = gdf.to_json() + ############# LINES AND CABLES ###################### - # # gdf.to_file("example.geojson", layer_options={"ID_GENERATE": "YES"}) + input_path_lines_cables = { + "lines": snakemake.input.lines_way, + "cables": snakemake.input.cables_way, + } + columns = ["id", "sub_id", "sub_id_len", "bounds", "nodes", "geometry", "country", "power", "cables", "circuits", "frequency", "voltage", "wires"] + df_lines = pd.DataFrame(columns=columns) + crs = "EPSG:4326" - output = str(snakemake.output) - clean_osm_data(output) + # using tqdm loop over input path + for key in input_path_lines_cables: + logger.info(f"Processing {key}...") + for idx, ip in enumerate(input_path_lines_cables[key]): + if os.path.exists(ip) and os.path.getsize(ip) > 400: # unpopulated OSM json is about 51 bytes + country = 
os.path.basename(os.path.dirname(input_path_lines_cables[key][idx])) + + logger.info(f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(input_path_lines_cables[key])).zfill(2)}: {ip}") + with open(ip, "r") as f: + data = json.load(f) + + df = pd.DataFrame(data['elements']) + df["id"] = df["id"].astype(str) + df["sub_id"] = "0" # initiate sub_id column with 0 + df["sub_id_len"] = 0 # initiate sub_id column with 0 + df["country"] = country + col_tags = ["power", "cables", "circuits", "frequency", "voltage", "wires"] + tags = pd.json_normalize(df["tags"]) \ + .map(lambda x: str(x) if pd.notnull(x) else x) + + for ct in col_tags: + if ct not in tags.columns: + tags[ct] = pd.NA + + tags = tags.loc[:, col_tags] -# # Example DataFrame -# data = {'id': ["ID1", "ID2", "ID3", "ID4", "ID5"], -# 'A': ["220000", "380000", ";100000", "220000;220000;380000", "220000;;400000;700000"], -# 'B': [1, 2, 3, 4, 5], -# 'C': [6, 7, 8, 9, 10]} -# df = pd.DataFrame(data) + df = pd.concat([df, tags], axis="columns") + df.drop(columns=["type", "tags"], inplace=True) + + df_lines = pd.concat([df_lines, df], axis="rows") + + else: + logger.info(f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(input_path_lines_cables[key])).zfill(2)} (empty): {ip}") + continue + logger.info("---") + + # Initiate boolean with False, only set to true if all cleaning steps are passed + df_lines["cleaned"] = False + df_lines["voltage"] = _clean_voltage(df_lines["voltage"]) + + list_voltages = df_lines["voltage"].str.split(";").explode().unique().astype(str) + list_voltages = list_voltages[np.vectorize(len)(list_voltages) >= 6] + list_voltages = list_voltages[~np.char.startswith(list_voltages, '1')] + + bool_voltages = df_lines["voltage"].apply(_check_voltage, list_voltages=list_voltages) + df_lines = df_lines[bool_voltages] + + # Additional cleaning + df_lines["circuits"] = _clean_circuits(df_lines["circuits"]) + df_lines["cables"] = _clean_cables(df_lines["cables"]) + df_lines["frequency"] = 
_clean_frequency(df_lines["frequency"]) + df_lines["wires"] = _clean_wires(df_lines["wires"]) + + df_lines = _split_cells(df_lines) + bool_voltages = df_lines["voltage"].apply(_check_voltage, list_voltages=list_voltages) + df_lines = df_lines[bool_voltages] + + bool_ac = df_lines["frequency"] != "0" + bool_dc = ~bool_ac + bool_noinfo = (df_lines["cables"] == "") & (df_lines["circuits"] == "") + valid_frequency = ["50", "0"] + bool_invalid_frequency = df_lines["frequency"].apply(lambda x: x not in valid_frequency) + + # Fill in all values where cables info and circuits does not exist. Assuming 1 circuit + df_lines.loc[bool_noinfo, "circuits"] = "1" + df_lines.loc[bool_noinfo & bool_invalid_frequency, "frequency"] = "50" + df_lines.loc[bool_noinfo, "cleaned"] = True + + df_lines + + df_lines[bool_dc] + + df_lines["geometry"] = df_lines.apply(_create_linestring, axis=1) + gdf_lines = gpd.GeoDataFrame( + df_lines[["id", "power", "cables", "circuits", "voltage", "geometry"]], + geometry = "geometry", crs = "EPSG:4326" + ) + + gdf_lines.explore() -# # Split the entries in column A that contain a semicolon -# split_rows = df[df['A'].str.contains(';')] -# split_values = split_rows['A'].str.split(';', expand=True) + ### Split into AC and DC + df_lines_ac = df_lines[df_lines["frequency"] != "0"].copy() + df_lines_dc = df_lines[df_lines["frequency"] == "0"].copy() -# # Create two copies of the rows containing semicolons, one for each split value -# split_rows_1 = split_rows.copy() -# split_rows_2 = split_rows.copy() + df_lines_dc["cleaned"] = False + + -# # Update column A in the split rows to contain the split values -# split_rows_1['A'] = split_values[0] -# split_rows_2['A'] = split_values[1] -# # Concatenate the split rows with the original DataFrame, excluding the rows containing semicolons -# result_df = pd.concat([df[~df.index.isin(split_rows.index)], split_rows_1, split_rows_2], ignore_index=True) -# # Display the result -# print(result_df) + ######## + ######## + 
######## -# '# Sample DataFrame -# data = {'id': ["ID1", "ID2", "ID3", "ID4", "ID5"], -# 'voltage': ["220000", "380000", ";100000", "220000;220000;380000", "220000;;400000;700000"], -# 'B': [1, 2, 3, 4, 5], -# 'C': [6, 7, 8, 9, 10]} -# df = pd.DataFrame(data) + fig = Figure(width = "50%", height = 600) -# # Find rows to split -# to_split = df['voltage'].str.contains(';') + m = gdf_substations.explore(name = "Buses", color = "red") + m = gdf_lines.explore(m = m, name = "Lines") -# # Splitting entries and creating new rows + folium.LayerControl(collapsed = False).add_to(m) + fig.add_child(m) + m -# new_rows = [] + gdf_substations.explore() + df_lines.voltage.unique() -# for index, row in df[to_split].iterrows(): -# split_values = row["voltage"].split(';') -# for i, value in enumerate(split_values): -# new_id = str(row['id']) + '_' + str(i+1) -# new_row = { -# 'id': new_id, -# 'bounds': row['bounds'], -# 'nodes': row['nodes'], -# 'geometry': row['geometry'], -# 'cables': row['cables'], -# 'circuits': row['circuits'], -# 'frequency': row['frequency'], -# 'voltage': value, -# 'wires': row['wires'],} -# new_rows.append(new_row) + np.set_printoptions(threshold=np.inf) -# # Create DataFrame from split rows -# split_df = pd.DataFrame(new_rows) -# # Append the original DataFrame with split_df -# final_df = pd.concat([df[~to_split], split_df]) + # duplicate_lines = df_lines[df_lines.duplicated(subset=['id'], keep=False)].copy() -# print(final_df) + # grouped_duplicates = duplicate_rows.groupby('id').agg({'country': 'list'}) + a = df_lines[(df_lines["cables"].apply(lambda x: len(x.split(";"))) == 1) & ((df_lines["voltage"].apply(lambda x: len(x.split(";"))) == 1)) & (df_lines["cables"] != "")] + # Drop duplicates + df_lines.drop_duplicates(subset="id", inplace=True) + df_lines["voltage"] = _clean_voltage(df_lines["voltage"]) + # df_lines["frequency"] = _clean_frequency(df_lines["frequency"]) + df_lines["circuits"] = _clean_circuits(df_lines["circuits"]) + + list_voltages = 
df_lines["voltage"].str.split(";").explode().unique().astype(str) + list_voltages = list_voltages[np.vectorize(len)(list_voltages) >= 6] + list_voltages[~np.char.startswith(list_voltages, '1')] -# from shapely.geometry import LineString -# import numpy as np -# import matplotlib.pyplot as plt + # df_lines_subset = df_lines[df_lines["voltage"].apply(_any_substring_in_list, list_voltages)] -# def offset_line(original_line, distance): -# # Compute the direction vector between the two endpoints -# direction_vector = np.array(original_line.coords[1]) - np.array(original_line.coords[0]) + # drop voltage = "" + df_lines = _split_voltage(df_lines) + df_lines = df_lines[df_lines["voltage"] != ""] + df_lines["voltage"] = df_lines["voltage"].astype(int, errors="ignore") -# # Compute the orthogonal vector -# orthogonal_vector = np.array([-direction_vector[1], direction_vector[0]]) + # Drop voltages below 220 kV + df_lines = df_lines[df_lines["voltage"] >= 200000] -# # Normalize the orthogonal vector -# orthogonal_vector /= np.linalg.norm(orthogonal_vector) + # set frequencies + df_lines["frequency"] = _set_frequency(df_lines["frequency"]) + df_lines["frequency"] = df_lines["frequency"].astype(int, errors="ignore") -# # Compute the offset LineString -# offset_points = [] -# for point in original_line.coords: -# offset_point = np.array(point) + distance * orthogonal_vector -# offset_points.append((offset_point[0], offset_point[1])) + # Clean circuits + # Map correct circuits to lines that where split + + # Initiate new column for cleaned circuits with values that are already valid: + # Condition 1: Length of sub_id is 0, the line was not split + # Condition 2: Number of entries in circuits separated by semicolon is 1, value is unique + # Condition 3: Circuits is not an empty string + # Condition 4: Circuits is not "0" + bool_circuits_valid = (df_lines["sub_id_len"] == 0) & \ + (df_lines["circuits"].apply(lambda x: len(x.split(";"))) == 1) & \ + (df_lines["circuits"] != "") & \ + 
(df_lines["circuits"] != "0") + + df_lines.loc[bool_circuits_valid, "circuits_clean"] = df_lines.loc[bool_circuits_valid, "circuits"] + + # Boolean to check if sub_id_len is equal to the number of circuits + bool_equal = df_lines["sub_id_len"] == df_lines["circuits"] \ + .apply(lambda x: len(x.split(";"))) + op_equal = lambda row: row["circuits"].split(";")[int(row["sub_id"])-1] + + df_lines.loc[bool_equal, "circuits_clean"] = df_lines[bool_equal] \ + .apply(op_equal, axis=1) + + bool_larger = df_lines["sub_id_len"] > \ + df_lines["circuits"].apply(lambda x: len(x.split(";"))) + + pd.set_option('display.max_rows', None) + pd.set_option('display.max_columns', None) + df_lines.loc[bool_larger, ["id", "sub_id", "sub_id_len", "cables", "circuits", "circuits_clean", "frequency"]] -# return LineString(offset_points) -# # Example usage: -# original_line = lines.iloc[5] -# offset_distance = 1.0 -# b = offset_line(original_line, offset_distance) -# # Plot both LineStrings -# fig, ax = plt.subplots() -# x, y = original_line.xy -# ax.plot(x, y, label='Original LineString') -# x, y = offset_line.xy -# ax.plot(x, y, label='Offset LineString') -# ax.set_aspect('equal') -# ax.legend() -# plt.xlabel('X') -# plt.ylabel('Y') -# plt.title('Original and Offset LineStrings') -# plt.grid(True) -# plt.show() \ No newline at end of file + output = str(snakemake.output) + clean_osm_data(output) \ No newline at end of file diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 15eec040d..9a4526a5f 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -66,9 +66,10 @@ def retrieve_osm_data( "lines_way", "substations_way", "substations_node", - "transformers_way", - "transformers_node", - "relations", + "substations_relation", + # "transformers_way", + # "transformers_node", + # "route_relations", ]): op_area = _get_overpass_areas(country) @@ -76,14 +77,22 @@ def retrieve_osm_data( # Overpass API endpoint URL overpass_url = 
"https://overpass-api.de/api/interpreter" + # features_dict= { + # 'cables_way': 'way["power"="cable"]', + # 'lines_way': 'way["power"="line"]', + # 'substations_way': 'way["power"="substation"]', + # 'substations_node': 'node["power"="substation"]', + # 'transformers_way': 'way["power"="transformer"]', + # 'transformers_node': 'node["power"="transformer"]', + # 'route_relations': 'rel["route"="power"]["type"="route"]' + # } + features_dict= { 'cables_way': 'way["power"="cable"]', 'lines_way': 'way["power"="line"]', 'substations_way': 'way["power"="substation"]', 'substations_node': 'node["power"="substation"]', - 'transformers_way': 'way["power"="transformer"]', - 'transformers_node': 'node["power"="transformer"]', - 'relations': 'rel["route"="power"]["type"="route"]' + 'substations_relation': 'relation["power"="substation"]', } for f in features: From 85aa3f1f240c67323d4bc812f16020504c51c9a8 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Mon, 6 May 2024 16:04:37 +0200 Subject: [PATCH 005/100] Finished clean_osm_data function. 
--- rules/build_electricity.smk | 1 + scripts/clean_osm_data.py | 281 +++++++++++++++++++++--------------- 2 files changed, 165 insertions(+), 117 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 23bf99969..4d4495adc 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -643,6 +643,7 @@ rule clean_osm_data: # route_relations=[f"data/osm/raw/{country}/route_relations_raw.json" for country in config["countries"]], output: substations="data/osm/clean/substations.geojson", + lines="data/osm/clean/lines.geojson", log: logs("clean_osm_data.log"), script: diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 90e3ca17e..1ccef3644 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -250,40 +250,6 @@ def _clean_frequency(column): return column.astype(str) -def _split_voltage(df): - to_split = df['voltage'].str.contains(';') - new_rows = [] - for index, row in df[to_split].iterrows(): - split_values = row["voltage"].split(';') - new_sub_id_len = int(len(split_values)) - for i, value in enumerate(split_values): - new_sub_id = str(i+1) - new_id = str(row['id']) + '_' + new_sub_id - new_row = { - 'id': new_id, - 'sub_id': new_sub_id, - 'sub_id_len': new_sub_id_len, - 'bounds': row['bounds'], - 'nodes': row['nodes'], - 'geometry': row['geometry'], - 'country': row['country'], - 'power': row['power'], - 'cables': row['cables'], - 'circuits': row['circuits'], - 'frequency': row['frequency'], - 'voltage': value, - 'wires': row['wires'],} - new_rows.append(new_row) - - # Create DataFrame from split rows - split_df = pd.DataFrame(new_rows) - df_new = pd.concat([df[~to_split], split_df]) - df_new["sub_id_len"] = df_new["sub_id_len"].astype(int) - - # Append the original DataFrame with split_df - return df_new - - def _split_cells(df, cols=["voltage"]): """ Split semicolon separated cells i.e. 
[66000;220000] and create new @@ -338,6 +304,19 @@ def generate_new_id(row): return x +def _distribute_to_circuits(row): + if row["circuits"] != "": + circuits = int(row["circuits"]) + else: + cables = int(row["cables"]) + circuits = cables / 3 + + single_circuit = int(max(1, np.floor_divide(circuits, row["split_elements"]))) + single_circuit = str(single_circuit) + + return single_circuit + + # Function to check if any substring is in valid_strings def _any_substring_in_list(s, list_strings): substrings = s.split(';') @@ -529,7 +508,7 @@ def _any_substring_in_list(s, list_strings): "cables": snakemake.input.cables_way, } - columns = ["id", "sub_id", "sub_id_len", "bounds", "nodes", "geometry", "country", "power", "cables", "circuits", "frequency", "voltage", "wires"] + columns = ["id", "bounds", "nodes", "geometry", "country", "power", "cables", "circuits", "frequency", "voltage", "wires"] df_lines = pd.DataFrame(columns=columns) crs = "EPSG:4326" @@ -547,8 +526,6 @@ def _any_substring_in_list(s, list_strings): df = pd.DataFrame(data['elements']) df["id"] = df["id"].astype(str) - df["sub_id"] = "0" # initiate sub_id column with 0 - df["sub_id_len"] = 0 # initiate sub_id column with 0 df["country"] = country col_tags = ["power", "cables", "circuits", "frequency", "voltage", "wires"] @@ -572,6 +549,18 @@ def _any_substring_in_list(s, list_strings): continue logger.info("---") + # Find duplicates based on id column + duplicate_rows = df_lines[df_lines.duplicated(subset=['id'], keep=False)].copy() + # group rows by id and aggregate the country column to a string split by semicolon + grouped_duplicates = duplicate_rows.groupby('id')["country"].agg(lambda x: ';'.join(x)).reset_index() + duplicate_rows.drop_duplicates(subset="id", inplace=True) + duplicate_rows.drop(columns=["country"], inplace=True) + duplicate_rows = duplicate_rows.join(grouped_duplicates.set_index('id'), on='id', how='left') + + # Drop duplicates and update the df_lines dataframe with the cleaned data 
+ df_lines = df_lines[~df_lines["id"].isin(duplicate_rows["id"])] + df_lines = pd.concat([df_lines, duplicate_rows], axis="rows") + # Initiate boolean with False, only set to true if all cleaning steps are passed df_lines["cleaned"] = False df_lines["voltage"] = _clean_voltage(df_lines["voltage"]) @@ -589,6 +578,9 @@ def _any_substring_in_list(s, list_strings): df_lines["frequency"] = _clean_frequency(df_lines["frequency"]) df_lines["wires"] = _clean_wires(df_lines["wires"]) + df_lines["voltage_original"] = df_lines["voltage"] + df_lines["circuits_original"] = df_lines["circuits"] + df_lines = _split_cells(df_lines) bool_voltages = df_lines["voltage"].apply(_check_voltage, list_voltages=list_voltages) df_lines = df_lines[bool_voltages] @@ -604,27 +596,147 @@ def _any_substring_in_list(s, list_strings): df_lines.loc[bool_noinfo & bool_invalid_frequency, "frequency"] = "50" df_lines.loc[bool_noinfo, "cleaned"] = True - df_lines - - df_lines[bool_dc] - - df_lines["geometry"] = df_lines.apply(_create_linestring, axis=1) - gdf_lines = gpd.GeoDataFrame( - df_lines[["id", "power", "cables", "circuits", "voltage", "geometry"]], - geometry = "geometry", crs = "EPSG:4326" - ) + # Fill in all values where cables info exists and split_elements == 1 + bool_cables_ac = (df_lines["cables"] != "") & \ + (df_lines["split_elements"] == 1) & \ + (df_lines["cables"] != "0") & \ + (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & \ + (df_lines["circuits"] == "") & \ + (df_lines["cleaned"] == False) & \ + bool_ac - gdf_lines.explore() - - ### Split into AC and DC - df_lines_ac = df_lines[df_lines["frequency"] != "0"].copy() - df_lines_dc = df_lines[df_lines["frequency"] == "0"].copy() + df_lines.loc[bool_cables_ac, "circuits"] = df_lines.loc[bool_cables_ac, "cables"] \ + .apply(lambda x: str(int(max(1, np.floor_divide(int(x),3))))) + + df_lines.loc[bool_cables_ac, "frequency"] = "50" + df_lines.loc[bool_cables_ac, "cleaned"] = True + + bool_cables_dc = (df_lines["cables"] 
!= "") & \ + (df_lines["split_elements"] == 1) & \ + (df_lines["cables"] != "0") & \ + (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & \ + (df_lines["circuits"] == "") & \ + (df_lines["cleaned"] == False) & \ + bool_dc + + df_lines.loc[bool_cables_dc, "circuits"] = df_lines.loc[bool_cables_dc, "cables"] \ + .apply(lambda x: str(int(max(1, np.floor_divide(int(x),2))))) + + df_lines.loc[bool_cables_dc, "frequency"] = "0" + df_lines.loc[bool_cables_dc, "cleaned"] = True + + # Fill in all values where circuits info exists and split_elements == 1 + bool_lines = (df_lines["circuits"] != "") & \ + (df_lines["split_elements"] == 1) & \ + (df_lines["circuits"] != "0") & \ + (df_lines["circuits"].apply(lambda x: len(x.split(";")) == 1)) & \ + (df_lines["cleaned"] == False) + + df_lines.loc[bool_lines & bool_ac, "frequency"] = "50" + df_lines.loc[bool_lines & bool_dc, "frequency"] = "0" + df_lines.loc[bool_lines, "cleaned"] = True + + # Clean those values where number of voltages split by semicolon is larger than no cables or no circuits + bool_cables = (df_lines["voltage_original"].apply(lambda x: len(x.split(";")) > 1)) & \ + (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & \ + (df_lines["circuits"].apply(lambda x: len(x.split(";")) == 1)) & \ + (df_lines["cleaned"] == False) + + df_lines.loc[bool_cables, "circuits"] = df_lines[bool_cables] \ + .apply(_distribute_to_circuits, axis=1) + df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" + df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" + df_lines.loc[bool_cables, "cleaned"] = True + + # Clean those values where multiple circuit values are present, divided by semicolon + bool_cables = (df_lines["circuits"].apply(lambda x: len(x.split(";")) > 1)) & \ + (df_lines.apply(lambda row: len(row["circuits"].split(";")) == row["split_elements"], axis=1)) & \ + (df_lines["cleaned"] == False) + + df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables] \ + .apply(lambda row: 
str(row["circuits"].split(";")[ + int(row["id"].split("_")[-1])-1 + ]), axis=1) + + df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" + df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" + df_lines.loc[bool_cables, "cleaned"] = True + + # Clean those values where multiple cables values are present, divided by semicolon + bool_cables = (df_lines["cables"].apply(lambda x: len(x.split(";")) > 1)) & \ + (df_lines.apply(lambda row: len(row["cables"].split(";")) == row["split_elements"], axis=1)) & \ + (df_lines["cleaned"] == False) + + df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables] \ + .apply(lambda row: + str(max(1, + np.floor_divide( + int(row["cables"].split(";")[int(row["id"].split("_")[-1])-1]), + 3 + ) + )), + axis=1) + + df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" + df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" + df_lines.loc[bool_cables, "cleaned"] = True + + # All remaining lines to circuits == 1 + bool_leftover = (df_lines["cleaned"] == False) + str_id = "; ".join(str(id) for id in df_lines.loc[bool_leftover, "id"]) + logger.info(f"Setting circuits of remaining {sum(bool_leftover)} lines to 1...") + logger.info(f"Lines affected: {str_id}") + df_lines.loc[bool_leftover, "circuits"] = "1" + df_lines.loc[bool_leftover & bool_ac, "frequency"] = "50" + df_lines.loc[bool_leftover & bool_dc, "frequency"] = "0" + df_lines.loc[bool_leftover, "cleaned"] = True - df_lines_dc["cleaned"] = False + # rename columns + df_lines.rename( + columns={ + "id": "line_id", + "power": "tag_type", + "frequency":"tag_frequency", + }, inplace=True) + df_lines["bus0"] = None + df_lines["bus1"] = None + df_lines["length"] = None + df_lines.loc[df_lines["tag_type"] == "line", "underground"] = False + df_lines.loc[df_lines["tag_type"] == "cable", "underground"] = True + df_lines["under_construction"] = False + df_lines.loc[df_lines["tag_frequency"] == "0", "dc"] = True + df_lines.loc[df_lines["tag_frequency"] == "50", "dc"] = False + + 
df_lines = df_lines[[ + "line_id", + "circuits", + "tag_type", + "voltage", + "tag_frequency", + "bus0", + "bus1", + "length", + "underground", + "under_construction", + "dc", + "country", + "geometry", + ]] + df_lines["geometry"] = df_lines.apply(_create_linestring, axis=1) + gdf_lines = gpd.GeoDataFrame(df_lines, geometry = "geometry", crs = "EPSG:4326") + filepath_lines = snakemake.output["lines"] + # save substations output + logger.info(f"Exporting clean lines to {filepath_lines}") + parentfolder_lines = os.path.dirname(filepath_lines) + if not os.path.exists(parentfolder_lines): + # Create the folder and its parent directories if they don't exist + os.makedirs(parentfolder_lines) + gdf_lines.to_file(filepath_lines, driver="GeoJSON") + ######## ######## @@ -642,71 +754,6 @@ def _any_substring_in_list(s, list_strings): m gdf_substations.explore() - df_lines.voltage.unique() - - np.set_printoptions(threshold=np.inf) - - - # duplicate_lines = df_lines[df_lines.duplicated(subset=['id'], keep=False)].copy() - - # grouped_duplicates = duplicate_rows.groupby('id').agg({'country': 'list'}) - - a = df_lines[(df_lines["cables"].apply(lambda x: len(x.split(";"))) == 1) & ((df_lines["voltage"].apply(lambda x: len(x.split(";"))) == 1)) & (df_lines["cables"] != "")] - # Drop duplicates - df_lines.drop_duplicates(subset="id", inplace=True) - - df_lines["voltage"] = _clean_voltage(df_lines["voltage"]) - # df_lines["frequency"] = _clean_frequency(df_lines["frequency"]) - df_lines["circuits"] = _clean_circuits(df_lines["circuits"]) - - list_voltages = df_lines["voltage"].str.split(";").explode().unique().astype(str) - list_voltages = list_voltages[np.vectorize(len)(list_voltages) >= 6] - list_voltages[~np.char.startswith(list_voltages, '1')] - - # df_lines_subset = df_lines[df_lines["voltage"].apply(_any_substring_in_list, list_voltages)] - - # drop voltage = "" - df_lines = _split_voltage(df_lines) - df_lines = df_lines[df_lines["voltage"] != ""] - df_lines["voltage"] = 
df_lines["voltage"].astype(int, errors="ignore") - - # Drop voltages below 220 kV - df_lines = df_lines[df_lines["voltage"] >= 200000] - - # set frequencies - df_lines["frequency"] = _set_frequency(df_lines["frequency"]) - df_lines["frequency"] = df_lines["frequency"].astype(int, errors="ignore") - - # Clean circuits - # Map correct circuits to lines that where split - - # Initiate new column for cleaned circuits with values that are already valid: - # Condition 1: Length of sub_id is 0, the line was not split - # Condition 2: Number of entries in circuits separated by semicolon is 1, value is unique - # Condition 3: Circuits is not an empty string - # Condition 4: Circuits is not "0" - bool_circuits_valid = (df_lines["sub_id_len"] == 0) & \ - (df_lines["circuits"].apply(lambda x: len(x.split(";"))) == 1) & \ - (df_lines["circuits"] != "") & \ - (df_lines["circuits"] != "0") - - df_lines.loc[bool_circuits_valid, "circuits_clean"] = df_lines.loc[bool_circuits_valid, "circuits"] - - # Boolean to check if sub_id_len is equal to the number of circuits - bool_equal = df_lines["sub_id_len"] == df_lines["circuits"] \ - .apply(lambda x: len(x.split(";"))) - op_equal = lambda row: row["circuits"].split(";")[int(row["sub_id"])-1] - - df_lines.loc[bool_equal, "circuits_clean"] = df_lines[bool_equal] \ - .apply(op_equal, axis=1) - - bool_larger = df_lines["sub_id_len"] > \ - df_lines["circuits"].apply(lambda x: len(x.split(";"))) - - pd.set_option('display.max_rows', None) - pd.set_option('display.max_columns', None) - df_lines.loc[bool_larger, ["id", "sub_id", "sub_id_len", "cables", "circuits", "circuits_clean", "frequency"]] - output = str(snakemake.output) From 49c1baffd92317cdc4af603d103e19948d96b1b8 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Mon, 6 May 2024 17:05:00 +0200 Subject: [PATCH 006/100] Added check whether line is a circle. If so, drop it. 
--- scripts/clean_osm_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 1ccef3644..63f27d7f1 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -725,6 +725,10 @@ def _any_substring_in_list(s, list_strings): ]] df_lines["geometry"] = df_lines.apply(_create_linestring, axis=1) + # Drop all rows where the geometry has equal start and end point + bool_circle = df_lines["geometry"].apply(lambda x: x.coords[0] == x.coords[-1]) + df_lines = df_lines[~bool_circle] + gdf_lines = gpd.GeoDataFrame(df_lines, geometry = "geometry", crs = "EPSG:4326") filepath_lines = snakemake.output["lines"] From 75cffe444bdfadd24e971f5cedebeb6223568ad9 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Mon, 6 May 2024 17:10:59 +0200 Subject: [PATCH 007/100] Extended build_electricity.smk by build_osm_network.py --- rules/build_electricity.smk | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 4d4495adc..e844fb818 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -624,7 +624,7 @@ if config["osm"].get("retrieve", True): script: "../scripts/retrieve_osm_data.py" -# FEATURES = ["cables_way", "lines_way", "substations_way", "substations_node", "transformers_way", "transformers_node"] + rule clean_osm_data: # params: # countries=config["countries"], @@ -650,10 +650,22 @@ rule clean_osm_data: "../scripts/clean_osm_data.py" -# { -# f"{country}": f"{ -# f"{feature}": f"data/osm/raw/{country}/{feature}.geojson" -# }" -# for feature in FEATURES -# for country in config["countries"] -# } \ No newline at end of file +rule build_osm_network: + input: + substations="data/osm/clean/substations.geojson", + lines="data/osm/clean/lines.geojson", + output: + lines="data/osm/lines.csv", + converters="data/osm/converters.csv", + transformers="data/osm/transformers.csv", + 
substations="data/osm/buses.csv", + lines_geojson="data/osm/lines.geojson", + converters_geojson="data/osm/converters.geojson", + transformers_geojson="data/osm/transformers.geojson", + substations_geojson="data/osm/buses.geojson", + log: + logs("build_osm_network.log"), + benchmark: + benchmarks("build_osm_network") + script: + "../scripts/build_osm_network.py" \ No newline at end of file From efb96118e85216c22a41f0042ade99471f4e5ea7 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 7 May 2024 13:28:43 +0200 Subject: [PATCH 008/100] Added build_osm_network --- scripts/build_osm_network.py | 1151 ++++++++++++++++++++++++++++++++++ scripts/clean_osm_data.py | 473 ++++++++------ 2 files changed, 1435 insertions(+), 189 deletions(-) create mode 100644 scripts/build_osm_network.py diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py new file mode 100644 index 000000000..bc0e46541 --- /dev/null +++ b/scripts/build_osm_network.py @@ -0,0 +1,1151 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: PyPSA-Earth and PyPSA-Eur Authors +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +# -*- coding: utf-8 -*- + +import logging +import os + +import geopandas as gpd +import numpy as np +import pandas as pd +from _helpers import ( + configure_logging, +) +from shapely.geometry import LineString, Point +from shapely.ops import linemerge, split +from shapely import wkt +from tqdm import tqdm +from _benchmark import memory_logger +import yaml + +logger = logging.getLogger(__name__) + +# list of recognised nan values (NA and na excluded as may be confused with Namibia 2-letter country code) +NA_VALUES = ["NULL", "", "N/A", "NAN", "NaN", "nan", "Nan", "n/a", "null"] + +def read_csv_nafix(file, **kwargs): + "Function to open a csv as pandas file and standardize the na value" + if "keep_default_na" not in kwargs: + kwargs["keep_default_na"] = False + if "na_values" not in kwargs: + kwargs["na_values"] = NA_VALUES + + if os.stat(file).st_size > 0: + return 
pd.read_csv(file, **kwargs) + else: + return pd.DataFrame() + + +def save_to_geojson(df, fn): + if os.path.exists(fn): + os.unlink(fn) # remove file if it exists + + # save file if the (Geo)DataFrame is non-empty + if df.empty: + # create empty file to avoid issues with snakemake + with open(fn, "w") as fp: + pass + else: + # save file + df.to_file(fn, driver="GeoJSON") + + +def read_geojson(fn, cols=[], dtype=None, crs="EPSG:4326"): + """ + Function to read a geojson file fn. When the file is empty, then an empty + GeoDataFrame is returned having columns cols, the specified crs and the + columns specified by the dtype dictionary it not none. + + Parameters: + ------------ + fn : str + Path to the file to read + cols : list + List of columns of the GeoDataFrame + dtype : dict + Dictionary of the type of the object by column + crs : str + CRS of the GeoDataFrame + """ + # if the file is non-zero, read the geodataframe and return it + if os.path.getsize(fn) > 0: + return gpd.read_file(fn) + else: + # else return an empty GeoDataFrame + df = gpd.GeoDataFrame(columns=cols, geometry=[], crs=crs) + if isinstance(dtype, dict): + for k, v in dtype.items(): + df[k] = df[k].astype(v) + return df + + +def to_csv_nafix(df, path, **kwargs): + if "na_rep" in kwargs: + del kwargs["na_rep"] + # if len(df) > 0: + if not df.empty or not df.columns.empty: + return df.to_csv(path, **kwargs, na_rep=NA_VALUES[0]) + else: + with open(path, "w") as fp: + pass + + +def line_endings_to_bus_conversion(lines): + # Assign to every line a start and end point + + lines["bounds"] = lines["geometry"].boundary # create start and end point + + lines["bus_0_coors"] = lines["bounds"].map(lambda p: p.geoms[0]) + lines["bus_1_coors"] = lines["bounds"].map(lambda p: p.geoms[-1]) + + # splits into coordinates + lines["bus0_lon"] = lines["bus_0_coors"].x + lines["bus0_lat"] = lines["bus_0_coors"].y + lines["bus1_lon"] = lines["bus_1_coors"].x + lines["bus1_lat"] = lines["bus_1_coors"].y + + return lines + 
+ +# tol in m +def set_substations_ids(buses, distance_crs, tol=5000): + """ + Function to set substations ids to buses, accounting for location + tolerance. + + The algorithm is as follows: + + 1. initialize all substation ids to -1 + 2. if the current substation has been already visited [substation_id < 0], then skip the calculation + 3. otherwise: + 1. identify the substations within the specified tolerance (tol) + 2. when all the substations in tolerance have substation_id < 0, then specify a new substation_id + 3. otherwise, if one of the substation in tolerance has a substation_id >= 0, then set that substation_id to all the others; + in case of multiple substations with substation_ids >= 0, the first value is picked for all + """ + + buses["station_id"] = -1 + + # create temporary series to execute distance calculations using m as reference distances + temp_bus_geom = buses.geometry.to_crs(distance_crs) + + # set tqdm options for substation ids + tqdm_kwargs_substation_ids = dict( + ascii=False, + unit=" buses", + total=buses.shape[0], + desc="Set substation ids ", + ) + + station_id = 0 + for i, row in tqdm(buses.iterrows(), **tqdm_kwargs_substation_ids): + if buses.loc[i, "station_id"] >= 0: + continue + + # get substations within tolerance + close_nodes = np.flatnonzero( + temp_bus_geom.distance(temp_bus_geom.loc[i]) <= tol + ) + + if len(close_nodes) == 1: + # if only one substation is in tolerance, then the substation is the current one + # Note that the node cannot be with substation_id >= 0, given the preliminary check + # at the beginning of the for loop + buses.loc[buses.index[i], "station_id"] = station_id + # update station id + station_id += 1 + else: + # several substations in tolerance + # get their ids + subset_substation_ids = buses.loc[buses.index[close_nodes], "station_id"] + # check if all substation_ids are negative (<0) + all_neg = subset_substation_ids.max() < 0 + # check if at least a substation_id is negative (<0) + some_neg = 
subset_substation_ids.min() < 0 + + if all_neg: + # when all substation_ids are negative, then this is a new substation id + # set the current station_id and increment the counter + buses.loc[buses.index[close_nodes], "station_id"] = station_id + station_id += 1 + elif some_neg: + # otherwise, when at least a substation_id is non-negative, then pick the first value + # and set it to all the other substations within tolerance + sub_id = -1 + for substation_id in subset_substation_ids: + if substation_id >= 0: + sub_id = substation_id + break + buses.loc[buses.index[close_nodes], "station_id"] = sub_id + + +def set_lines_ids(lines, buses, distance_crs): + """ + Function to set line buses ids to the closest bus in the list. + """ + # set tqdm options for set lines ids + tqdm_kwargs_line_ids = dict( + ascii=False, + unit=" lines", + total=lines.shape[0], + desc="Set line bus ids ", + ) + + # initialization + lines["bus0"] = -1 + lines["bus1"] = -1 + + busesepsg = buses.to_crs(distance_crs) + linesepsg = lines.to_crs(distance_crs) + + for i, row in tqdm(linesepsg.iterrows(), **tqdm_kwargs_line_ids): + # select buses having the voltage level of the current line + buses_sel = busesepsg[ + (buses["voltage"] == row["voltage"]) & (buses["dc"] == row["dc"]) + ] + + # find the closest node of the bus0 of the line + bus0_id = buses_sel.geometry.distance(row.geometry.boundary.geoms[0]).idxmin() + lines.loc[i, "bus0"] = buses.loc[bus0_id, "bus_id"] + + # check if the line starts exactly in the node, otherwise modify the linestring + distance_bus0 = busesepsg.geometry.loc[bus0_id].distance( + row.geometry.boundary.geoms[0] + ) + if distance_bus0 > 0.0: + # the line does not start in the node, thus modify the linestring + lines.loc[i, "geometry"] = linemerge( + [ + LineString( + [ + buses.geometry.loc[bus0_id], + lines.geometry.loc[i].boundary.geoms[0], + ] + ), + lines.geometry.loc[i], + ] + ) + + # find the closest node of the bus1 of the line + bus1_id = 
buses_sel.geometry.distance(row.geometry.boundary.geoms[1]).idxmin() + lines.loc[i, "bus1"] = buses.loc[bus1_id, "bus_id"] + + # check if the line ends exactly in the node, otherwise modify the linestring + distance_bus1 = busesepsg.geometry.loc[bus1_id].distance( + row.geometry.boundary.geoms[1] + ) + if distance_bus1 > 0.0: + # the line does not end in the node, thus modify the linestring + lines.loc[i, "geometry"] = linemerge( + [ + lines.geometry.loc[i], + LineString( + [ + lines.geometry.loc[i].boundary.geoms[1], + buses.geometry.loc[bus1_id], + ] + ), + ] + ) + + return lines, buses + + +def merge_stations_same_station_id( + buses, delta_lon=0.001, delta_lat=0.001, precision=4 +): + """ + Function to merge buses with same voltage and station_id. This function + iterates over all substation ids and creates a bus_id for every substation + and voltage level. + + Therefore, a substation with multiple voltage levels is represented + with different buses, one per voltage level + """ + # initialize list of cleaned buses + buses_clean = [] + + # initialize the number of buses + n_buses = 0 + + for g_name, g_value in buses.groupby(by="station_id"): + # average location of the buses having the same station_id + station_point_x = np.round(g_value.geometry.x.mean(), precision) + station_point_y = np.round(g_value.geometry.y.mean(), precision) + is_dclink_boundary_point = any(g_value["is_dclink_boundary_point"]) + + # loop for every voltage level in the bus + # The location of the buses is averaged; in the case of multiple voltage levels for the same station_id, + # each bus corresponding to a voltage level and each polarity is located at a distance regulated by delta_lon/delta_lat + v_it = 0 + for v_name, bus_row in g_value.groupby(by=["voltage", "dc"]): + lon_bus = np.round(station_point_x + v_it * delta_lon, precision) + lat_bus = np.round(station_point_y + v_it * delta_lat, precision) + + # add the bus + buses_clean.append( + [ + n_buses, # "bus_id" + g_name, # 
"station_id" + v_name[0], # "voltage" + bus_row["dc"].all(), # "dc" + "|".join(bus_row["symbol"].unique()), # "symbol" + bus_row["under_construction"].any(), # "under_construction" + "|".join(bus_row["tag_substation"].unique()), # "tag_substation" + bus_row["tag_area"].sum(), # "tag_area" + lon_bus, # "lon" + lat_bus, # "lat" + bus_row["country"].iloc[0], # "country", + is_dclink_boundary_point, # check if new bus was formed of at least one DC link boundary point + Point( + lon_bus, + lat_bus, + ), # "geometry" + ] + ) + + # increase counters + v_it += 1 + n_buses += 1 + + # names of the columns + buses_clean_columns = [ + "bus_id", + "station_id", + "voltage", + "dc", + "symbol", + "under_construction", + "tag_substation", + "tag_area", + "x", + "y", + "country", + "is_dclink_boundary_point", + "geometry", + ] + + gdf_buses_clean = gpd.GeoDataFrame(buses_clean, columns=buses_clean_columns).set_crs( + crs=buses.crs, inplace=True + ) + + return gdf_buses_clean + + +def get_ac_frequency(df, fr_col="tag_frequency"): + """ + # Function to define a default frequency value. + + Attempts to find the most usual non-zero frequency across the + dataframe; 50 Hz is assumed as a back-up value + """ + + # Initialize a default frequency value + ac_freq_default = 50 + + grid_freq_levels = df[fr_col].value_counts(sort=True, dropna=True) + if not grid_freq_levels.empty: + # AC lines frequency shouldn't be 0Hz + ac_freq_levels = grid_freq_levels.loc[ + grid_freq_levels.index.get_level_values(0) != "0" + ] + ac_freq_default = ac_freq_levels.index.get_level_values(0)[0] + + return ac_freq_default + + +def get_transformers(buses, lines): + """ + Function to create fake transformer lines that connect buses of the same + station_id at different voltage. + """ + + ac_freq = get_ac_frequency(lines) + df_transformers = [] + + # Transformers should be added between AC buses only + # TODO pypsa-eur: Fix this! 
instead of tilde use != + buses_ac = buses[buses["dc"] != True] + for g_name, g_value in buses_ac.sort_values("voltage", ascending=True).groupby( + by="station_id" + ): + # note: by construction there cannot be more than two buses with the same station_id and same voltage + n_voltages = len(g_value) + + if n_voltages > 1: + for id in range(0, n_voltages - 1): + # when g_value has more than one node, it means that there are multiple voltages for the same bus + geom_trans = LineString( + [g_value.geometry.iloc[id], g_value.geometry.iloc[id + 1]] + ) + + df_transformers.append( + [ + f"transf_{g_name}_{id}", # "line_id" + g_value["bus_id"].iloc[id], # "bus0" + g_value["bus_id"].iloc[id + 1], # "bus1" + g_value.voltage.iloc[id], # "voltage_bus0" + g_value.voltage.iloc[id + 1], # "voltage_bus1" + g_value.country.iloc[id], # "country" + geom_trans, # "geometry" + ] + ) + # TODO pypsa-eur: fix bug in pypsa-earth, where the id column is wrongly named "line_id" instead of "transformer_id" + # name of the columns + trasf_columns = [ + "transformer_id", + "bus0", + "bus1", + "voltage_bus0", + "voltage_bus1", + "country", + "geometry", + ] + + df_transformers = gpd.GeoDataFrame(df_transformers, columns=trasf_columns) + if not df_transformers.empty: + init_index = 0 if lines.empty else lines.index[-1] + 1 + df_transformers.set_index(init_index + df_transformers.index, inplace=True) + # update line endings + df_transformers = line_endings_to_bus_conversion(df_transformers) + + return df_transformers + + +def get_converters(buses, lines): + """ + Function to create fake converter lines that connect buses of the same + station_id of different polarities. 
+ """ + + df_converters = [] + + for g_name, g_value in buses.sort_values("voltage", ascending=True).groupby( + by="station_id" + ): + # note: by construction there cannot be more that two buses with the same station_id and same voltage + n_voltages = len(g_value) + + # A converter stations should have both AC and DC parts + if g_value["dc"].any() & ~g_value["dc"].all(): + dc_voltage = g_value[g_value.dc]["voltage"].values + + for u in dc_voltage: + id_0 = g_value[g_value["dc"] & g_value["voltage"].isin([u])].index[0] + + ac_voltages = g_value[~g_value.dc]["voltage"] + # A converter is added between a DC nodes and AC one with the closest voltage + id_1 = ac_voltages.sub(u).abs().idxmin() + + geom_conv = LineString( + [g_value.geometry.loc[id_0], g_value.geometry.loc[id_1]] + ) + + # check if bus is a dclink boundary point, only then add converter + if g_value["is_dclink_boundary_point"].loc[id_0]: + df_converters.append( + [ + f"convert_{g_name}_{id_0}", # "line_id" + g_value["bus_id"].loc[id_0], # "bus0" + g_value["bus_id"].loc[id_1], # "bus1" + False, # "underground" + False, # "under_construction" + g_value.country.loc[id_0], # "country" + geom_conv, # "geometry" + ] + ) + + # name of the columns + conv_columns = [ + "converter_id", + "bus0", + "bus1", + "underground", + "under_construction", + "country", + "geometry", + ] + + df_converters = gpd.GeoDataFrame(df_converters, columns=conv_columns).reset_index() + + return df_converters + + +def connect_stations_same_station_id(lines, buses): + """ + Function to create fake links between substations with the same + substation_id. 
+ """ + ac_freq = get_ac_frequency(lines) + station_id_list = buses.station_id.unique() + + add_lines = [] + from shapely.geometry import LineString + + for s_id in station_id_list: + buses_station_id = buses[buses.station_id == s_id] + + if len(buses_station_id) > 1: + for b_it in range(1, len(buses_station_id)): + add_lines.append( + [ + f"link{buses_station_id}_{b_it}", # "line_id" + buses_station_id.index[0], # "bus0" + buses_station_id.index[b_it], # "bus1" + 400000, # "voltage" + 1, # "circuits" + 0.0, # "length" + False, # "underground" + False, # "under_construction" + "transmission", # "tag_type" + ac_freq, # "tag_frequency" + buses_station_id.country.iloc[0], # "country" + LineString( + [ + buses_station_id.geometry.iloc[0], + buses_station_id.geometry.iloc[b_it], + ] + ), # "geometry" + LineString( + [ + buses_station_id.geometry.iloc[0], + buses_station_id.geometry.iloc[b_it], + ] + ).bounds, # "bounds" + buses_station_id.geometry.iloc[0], # "bus_0_coors" + buses_station_id.geometry.iloc[b_it], # "bus_1_coors" + buses_station_id.lon.iloc[0], # "bus0_lon" + buses_station_id.lat.iloc[0], # "bus0_lat" + buses_station_id.lon.iloc[b_it], # "bus1_lon" + buses_station_id.lat.iloc[b_it], # "bus1_lat" + ] + ) + + # name of the columns + add_lines_columns = [ + "line_id", + "bus0", + "bus1", + "voltage", + "circuits", + "length", + "underground", + "under_construction", + "tag_type", + "tag_frequency", + "country", + "geometry", + "bounds", + "bus_0_coors", + "bus_1_coors", + "bus0_lon", + "bus0_lat", + "bus1_lon", + "bus1_lat", + ] + + df_add_lines = gpd.GeoDataFrame(pd.concat(add_lines), columns=add_lines_columns) + lines = pd.concat([lines, df_add_lines], ignore_index=True) + + return lines + + +def set_lv_substations(buses): + """ + Function to set what nodes are lv, thereby setting substation_lv The + current methodology is to set lv nodes to buses where multiple voltage + level are found, hence when the station_id is duplicated. 
+ """ + # initialize column substation_lv to true + buses["substation_lv"] = True + + # For each station number with multiple buses make lowest voltage `substation_lv = TRUE` + bus_with_stations_duplicates = buses[ + buses.station_id.duplicated(keep=False) + ].sort_values(by=["station_id", "voltage"]) + lv_bus_at_station_duplicates = ( + buses[buses.station_id.duplicated(keep=False)] + .sort_values(by=["station_id", "voltage"]) + .drop_duplicates(subset=["station_id"]) + ) + # Set all buses with station duplicates "False" + buses.loc[bus_with_stations_duplicates.index, "substation_lv"] = False + # Set lv_buses with station duplicates "True" + buses.loc[lv_bus_at_station_duplicates.index, "substation_lv"] = True + + return buses + + +# Note tolerance = 0.01 means around 700m +# TODO: the current tolerance is high to avoid an issue in the Nigeria case where line 565939360-1 +# seems to be interconnected to both ends, but at the eastern one, the node is actually not connected +# another line seems to be exactly touching the node, but from the data point of view it only fly over it. +# There may be the need to split a line in several segments in the case the line is within tolerance with +# respect to a node + + +def merge_stations_lines_by_station_id_and_voltage( + lines, buses, geo_crs, distance_crs, tol=5000 +): + """ + Function to merge close stations and adapt the line datasets to adhere to + the merged dataset. 
+ """ + + logger.info( + "Stage 3a/4: Set substation ids with tolerance of %.2f km" % (tol / 1000) + ) + + # TODO pypsa-eur: Add this fix to pypsa-earth: Buses should not be clustered geographically if they are different + # bus types (AC != DC) + buses_ac = buses[buses["dc"] == False].reset_index() + buses_dc = buses[buses["dc"] == True].reset_index() + + # set substation ids + # set_substations_ids(buses, distance_crs, tol=tol) + set_substations_ids(buses_ac, distance_crs, tol=tol) + set_substations_ids(buses_dc, distance_crs, tol=tol) + + # Find boundary points of DC links + # lines_dc_shape = lines[lines["dc"] == True].unary_union + # lines_dc_bounds = lines_dc_shape.boundary + # lines_dc_points = [p for p in lines_dc_bounds.geoms] + lines_dc = lines[lines['dc'] == True].reset_index() + lines_dc["adj_idx"] = range(0, len(lines_dc)) + + # Initialize an empty adjacency matrix + dc_adj_matrix = np.zeros((len(lines_dc), len(lines_dc)), dtype=int) + + # Fill the adjacency matrix + for i in range(len(lines_dc)): + for j in range(len(lines_dc)): + if are_lines_connected(lines_dc.iloc[i], lines_dc.iloc[j]): + dc_adj_matrix[i, j] = 1 + + dc_paths = find_paths(dc_adj_matrix) + + all_dc_boundary_points = pd.Series() + + for path in dc_paths: + bus_0_coors = lines_dc.iloc[path]["bus_0_coors"] + bus_1_coors = lines_dc.iloc[path]["bus_1_coors"] + + # Create DataFrame containing all points within a path + dc_points = pd.concat([bus_0_coors, bus_1_coors], ignore_index = True) + + # Determine the value counts of individual points. 
If it occurs more than + # once, it cannot be an end-point of a path + bool_duplicates = dc_points.apply(lambda p: sum([are_almost_equal(p, s) for s in dc_points])) > 1 + + # Drop all duplicates + dc_boundary_points = dc_points[~bool_duplicates] + + if dc_boundary_points.empty: + all_dc_boundary_points = dc_boundary_points + else: + all_dc_boundary_points = pd.concat([all_dc_boundary_points, dc_boundary_points], ignore_index = True) + + + # TODO pypsa-eur: Add to pypsa-earth for all related entries on is_dclink_boundary_point + # check for each entry in buses_dc whether it is included in lines_dc_points + buses_ac["is_dclink_boundary_point"] = False + buses_dc["is_dclink_boundary_point"] = buses_dc.geometry.apply( + lambda p: any([p.within(l) for l in all_dc_boundary_points]) + ) + + logger.info("Stage 3b/4: Merge substations with the same id") + + # merge buses with same station id and voltage + if not buses.empty: + buses_ac = merge_stations_same_station_id(buses_ac) + buses_dc = merge_stations_same_station_id(buses_dc) + buses_dc["bus_id"] = buses_ac["bus_id"].max() + buses_dc["bus_id"] + 1 + buses = pd.concat([buses_ac, buses_dc], ignore_index=True) + set_substations_ids(buses, distance_crs, tol=tol) + + logger.info("Stage 3c/4: Specify the bus ids of the line endings") + + # set the bus ids to the line dataset + lines, buses = set_lines_ids(lines, buses, distance_crs) + + # drop lines starting and ending in the same node + lines.drop(lines[lines["bus0"] == lines["bus1"]].index, inplace=True) + # update line endings + lines = line_endings_to_bus_conversion(lines) + + # set substation_lv + set_lv_substations(buses) + + logger.info("Stage 3d/4: Add converters to lines") + + # append fake converters + # lines = pd.concat([lines, converters], ignore_index=True) + + # reset index + lines.reset_index(drop=True, inplace=True) + # if len(links) > 0: + # links.reset_index(drop=True, inplace=True) + + return lines, buses + + +def create_station_at_equal_bus_locations( + 
def create_station_at_equal_bus_locations(
    lines, buses, geo_crs, distance_crs, tol=5000
):
    """
    Assign station ids to buses at (nearly) co-located positions and update
    the line endings accordingly.

    Returns the adapted (lines, buses) tuple. ``geo_crs`` is not referenced
    in this function body.
    """
    # V1. Create station_id at same bus location
    # - We saw that buses are not connected exactly at one point, they are
    #   usually connected to a substation "area" (analysed on maps)
    # - Create station_id at exactly the same location might therefore be not
    #   always correct
    # - Though as you can see below, it might be still sometime the case.
    #   Examples are **station 4** (2 lines with the same voltage connect at the
    #   same point) and **station 23** (4 lines with two different voltages connect
    #   at the same point)
    # TODO: Filter out the generator lines - defined as going from generator to
    #       the next station which is connected to a load. Excluding generator
    #       lines make probably sense because they are not transmission expansion
    #       relevant. For now we simplify and include generator lines.

    # If same location/geometry make station
    # NOTE: bus_all is an ALIAS of buses (no copy); set_lv_substations(bus_all)
    # below therefore mutates buses as well.
    bus_all = buses

    # set substation ids
    set_substations_ids(buses, distance_crs, tol=tol)

    # set the bus ids to the line dataset
    lines, buses = set_lines_ids(lines, buses, distance_crs)

    # update line endings
    lines = line_endings_to_bus_conversion(lines)

    # For each station number with multiple buses make lowest voltage `substation_lv = TRUE`
    set_lv_substations(bus_all)

    # TRY: Keep only buses that are not duplicated & lv_substation = True
    # TODO: Check if this is necessary. What effect do duplicates have?
    bus_all = bus_all[bus_all["substation_lv"] == True]

    lines = connect_stations_same_station_id(lines, buses)

    return lines, buses
def _split_linestring_by_point(linestring, points):
    """
    Split a linestring geometry at multiple inner points.

    Parameters
    ----------
    linestring : LineString
        Linestring of the line to be split.
    points : list
        Points at which to split the linestring.

    Return
    ------
    list
        The resulting linestring segments.
    """
    # Start with the whole line and iteratively cut every current segment
    # at each point in turn.
    segments = [linestring]

    for point in points:
        next_segments = []
        for segment in segments:
            # split() returns a GeometryCollection; flatten its parts.
            next_segments.extend(split(segment, point).geoms)
        segments = next_segments

    return segments
def fix_overpassing_lines(lines, buses, distance_crs, tol=1):
    """
    Function to avoid buses overpassing lines with no connection when the bus
    is within a given tolerance from the line.

    Parameters
    ----------
    lines : GeoDataFrame
        Geodataframe of lines
    buses : GeoDataFrame
        Geodataframe of substations
    distance_crs : CRS-like
        Projected CRS used for metric distance computations
    tol : float
        Tolerance in meters of the distance between the substation and the line
        below which the line will be split

    Returns
    -------
    tuple(GeoDataFrame, GeoDataFrame)
        (lines, buses): overpassed lines are split at the nearby buses; buses
        are returned unchanged.
    """

    lines_to_add = []  # list of lines to be added
    lines_to_split = []  # list of lines that have been split

    # Work in the projected CRS so distances are in metres.
    lines_epsgmod = lines.to_crs(distance_crs)
    buses_epsgmod = buses.to_crs(distance_crs)

    # set tqdm options for substation ids
    tqdm_kwargs_substation_ids = dict(
        ascii=False,
        unit=" lines",
        total=lines.shape[0],
        desc="Verify lines overpassing nodes ",
    )

    for l in tqdm(lines.index, **tqdm_kwargs_substation_ids):
        # bus indices being within tolerance from the line
        bus_in_tol_epsg = buses_epsgmod[
            buses_epsgmod.geometry.distance(lines_epsgmod.geometry.loc[l]) <= tol
        ]

        # exclude endings of the lines: keep only buses farther than tol from
        # at least one of the two line boundary points
        bus_in_tol_epsg = bus_in_tol_epsg[
            (
                (
                    bus_in_tol_epsg.geometry.distance(
                        lines_epsgmod.geometry.loc[l].boundary.geoms[0]
                    )
                    > tol
                )
                | (
                    bus_in_tol_epsg.geometry.distance(
                        lines_epsgmod.geometry.loc[l].boundary.geoms[1]
                    )
                    > tol
                )
            )
        ]

        if not bus_in_tol_epsg.empty:
            # add index of line to split
            lines_to_split.append(l)

            # split in the ORIGINAL (geographic) CRS geometries
            buses_locs = buses.geometry.loc[bus_in_tol_epsg.index]

            # get new line geometries
            new_geometries = _split_linestring_by_point(lines.geometry[l], buses_locs)
            n_geoms = len(new_geometries)

            # create temporary copies of the line
            df_append = gpd.GeoDataFrame([lines.loc[l]] * n_geoms)
            # update geometries
            df_append["geometry"] = new_geometries
            # update name of the line (suffix _0.._n-1 per segment)
            # NOTE: `id` shadows the builtin inside this comprehension only
            df_append["line_id"] = [
                str(df_append["line_id"].iloc[0]) + f"_{id}" for id in range(n_geoms)
            ]

            lines_to_add.append(df_append)

    # nothing was split: return the inputs untouched
    if not lines_to_add:
        return lines, buses

    df_to_add = gpd.GeoDataFrame(pd.concat(lines_to_add, ignore_index=True))
    df_to_add.set_crs(lines.crs, inplace=True)
    # NOTE(review): offsetting by lines.index[-1] assumes a numeric index on
    # lines — confirm upstream guarantees this.
    df_to_add.set_index(lines.index[-1] + df_to_add.index, inplace=True)

    # update length (recomputed in the metric CRS)
    df_to_add["length"] = df_to_add.to_crs(distance_crs).geometry.length

    # update line endings
    df_to_add = line_endings_to_bus_conversion(df_to_add)

    # remove original lines
    lines.drop(lines_to_split, inplace=True)

    lines = gpd.GeoDataFrame(
        pd.concat([lines, df_to_add], ignore_index=True).reset_index(drop=True),
        crs=lines.crs,
    )

    return lines, buses
def add_buses_to_empty_countries(country_list, fp_country_shapes, buses):
    """
    Add a synthetic 220 kV bus at the centroid of every country without
    substation data.

    Parameters
    ----------
    country_list : list
        Country codes that should be represented in the bus dataset.
    fp_country_shapes : str or Path
        Path to a vector file of country shapes with "name" and "geometry".
    buses : GeoDataFrame
        Existing bus dataset; must contain a "country" column.

    Returns
    -------
    GeoDataFrame
        ``buses`` extended by one centroid bus per missing country.
    """
    country_shapes = gpd.read_file(fp_country_shapes).set_index("name")["geometry"]
    bus_country_list = buses["country"].unique().tolist()

    # it may happen that bus_country_list contains entries not relevant as a country name (e.g. "not found")
    # difference can't give negative values; the following will return only relevant country names
    no_data_countries = list(set(country_list).difference(set(bus_country_list)))

    if len(no_data_countries) > 0:
        logger.info(
            f"No buses for the following countries: {no_data_countries}. Adding a node for each of them."
        )
        no_data_countries_shape = (
            country_shapes[country_shapes.index.isin(no_data_countries)]
            .reset_index()
            # NOTE(review): geo_crs is read from module scope (set in __main__)
            # — confirm it is defined when this function is called elsewhere.
            .to_crs(geo_crs)
        )
        length = len(no_data_countries)
        df = gpd.GeoDataFrame(
            {
                "voltage": [220000] * length,
                "country": no_data_countries_shape["name"],
                "x": no_data_countries_shape["geometry"].centroid.x,
                "y": no_data_countries_shape["geometry"].centroid.y,
                "bus_id": np.arange(len(buses) + 1, len(buses) + (length + 1), 1),
                "station_id": [np.nan] * length,
                # All lines for the countries with NA bus data are assumed to be AC
                "dc": [False] * length,
                "under_construction": [False] * length,
                "tag_area": [0.0] * length,
                "symbol": ["substation"] * length,
                "tag_substation": ["transmission"] * length,
                "geometry": no_data_countries_shape["geometry"].centroid,
                "substation_lv": [True] * length,
            },
            crs=geo_crs,
        ).astype(
            buses.dtypes.to_dict()
        )  # keep the same dtypes as buses
        buses = gpd.GeoDataFrame(
            pd.concat([buses, df], ignore_index=True).reset_index(drop=True),
            crs=buses.crs,
        )

    # update country list by buses dataframe
    bus_country_list = buses["country"].unique().tolist()

    non_allocated_countries = list(
        set(country_list).symmetric_difference(set(bus_country_list))
    )

    if len(non_allocated_countries) > 0:
        # Fixed grammar of the error message ("There following countries ...")
        logger.error(
            f"The following countries could not be allocated properly: {non_allocated_countries}"
        )

    return buses
def build_network(
    inputs,
    outputs,
    build_osm_network_config,
    countries_config,
    geo_crs,
    distance_crs,
):
    """
    Assemble the OSM-based grid model and write it out.

    Reads cleaned substations and lines, optionally splits overpassing lines
    and merges close stations, derives transformers (buses of different
    voltage at one station) and converters, then writes CSV and GeoJSON
    outputs in a PyPSA-Eur-friendly format.

    Parameters
    ----------
    inputs, outputs : snakemake-like mappings
        File paths keyed by "substations", "lines", "converters",
        "transformers" and their *_geojson counterparts.
    build_osm_network_config : dict
        Options "split_overpassing_lines", "overpassing_lines_tolerance",
        "group_close_buses", "group_tolerance_buses".
    countries_config : list
        Country codes used to add placeholder buses for empty countries.
    geo_crs, distance_crs : CRS-like
        Geographic / projected CRS.

    Returns
    -------
    None
    """
    # Expected columns and dtypes of the cleaned OSM inputs.
    osm_clean_columns = {
        'substation': {
            'bus_id': 'object',
            'station_id': 'float',
            'voltage': 'float',
            'dc': 'bool',
            'symbol': 'object',
            'under_construction': 'bool',
            'tag_substation': 'str',
            'tag_area': 'str',
            'lon': 'float',
            'lat': 'float',
            'country': 'str',
            'geometry': 'object',
            'tag_source': 'str',
        },
        'line': {
            'line_id': 'object',
            'bus0': 'object',
            'bus1': 'object',
            'voltage': 'float',
            'circuits': 'float',
            'length': 'float',
            'underground': 'bool',
            'under_construction': 'bool',
            'tag_type': 'str',
            'tag_frequency': 'float',
            'dc': 'bool',
            'country': 'object',
            'geometry': 'object',
        }
    }

    logger.info("Stage 1/5: Read input data")
    buses = read_geojson(
        inputs["substations"],
        osm_clean_columns["substation"].keys(),
        dtype=osm_clean_columns["substation"],
    )

    lines = read_geojson(
        inputs["lines"],
        osm_clean_columns["line"].keys(),
        dtype=osm_clean_columns["line"],
    )

    lines = line_endings_to_bus_conversion(lines)

    logger.info("Stage 2/5: AC and DC network: enabled")

    # Address the overpassing line issue Step 3/5
    if build_osm_network_config.get("split_overpassing_lines", False):
        tol = build_osm_network_config.get("overpassing_lines_tolerance", 1)
        logger.info("Stage 3/5: Avoid nodes overpassing lines: enabled with tolerance")

        lines, buses = fix_overpassing_lines(lines, buses, distance_crs, tol=tol)
    else:
        logger.info("Stage 3/5: Avoid nodes overpassing lines: disabled")

    # Add bus to countries with no buses
    buses = add_buses_to_empty_countries(countries_config, inputs.country_shapes, buses)

    # METHOD to merge buses with same voltage and within tolerance Step 4/5
    if build_osm_network_config.get("group_close_buses", False):
        tol = build_osm_network_config.get("group_tolerance_buses", 5000)
        logger.info(
            f"Stage 4/5: Aggregate close substations: enabled with tolerance {tol} m"
        )
        lines, buses = merge_stations_lines_by_station_id_and_voltage(
            lines, buses, geo_crs, distance_crs, tol=tol
        )
    else:
        logger.info("Stage 4/5: Aggregate close substations: disabled")

    logger.info("Stage 5/5: Add augmented substation to country with no data")

    # Recalculate lengths of lines in a suitable projected (UTM) CRS
    utm = lines.estimate_utm_crs(datum_name = "WGS 84")
    lines["length"] = lines.to_crs(utm).length

    # get transformers: modelled as lines connecting buses with different voltage
    transformers = get_transformers(buses, lines)

    # get converters: currently modelled as links connecting buses with different polarity
    converters = get_converters(buses, lines)

    logger.info("Save outputs")

    # create clean directory if not already exist
    if not os.path.exists(outputs["lines"]):
        os.makedirs(os.path.dirname(outputs["lines"]), exist_ok=True)


    ### Convert output to pypsa-eur friendly format
    # Rename "substation" in buses["symbol"] to "Substation"
    buses["symbol"] = buses["symbol"].replace({"substation": "Substation"})

    # Drop unnecessary index column and set respective element ids as index
    lines.set_index("line_id", inplace=True)
    converters.set_index("converter_id", inplace=True)
    transformers.set_index("transformer_id", inplace=True)
    buses.set_index("bus_id", inplace=True)


    # Convert voltages from V to kV
    lines["voltage"] = lines["voltage"] / 1000
    transformers["voltage_bus0"], transformers["voltage_bus1"] = transformers["voltage_bus0"] / 1000, \
        transformers["voltage_bus1"] / 1000
    buses["voltage"] = buses["voltage"] / 1000

    # Convert 'true' and 'false' to 't' and 'f'
    lines = lines.replace({True: "t", False: "f"})
    converters = converters.replace({True: "t", False: "f"})
    buses = buses.replace({True: "t", False: "f"})

    # Change column orders
    cols_lines = ["bus0", "bus1", "voltage", "circuits", "length", "underground", "under_construction", "geometry",
        "tag_type", "tag_frequency", "country", "bounds",
        "bus_0_coors", "bus_1_coors", "bus0_lon", "bus0_lat", "bus1_lon", "bus1_lat"]

    lines = lines[cols_lines]
    # NOTE(review): cols_lines_csv appears unused below — confirm it can be removed.
    cols_lines_csv = ["bus_id", "station_id", "voltage", "dc", "symbol", "under_construction", "tags", "x","y"]

    to_csv_nafix(lines, outputs["lines"])  # Generate CSV
    to_csv_nafix(converters, outputs["converters"])  # Generate CSV
    to_csv_nafix(transformers, outputs["transformers"])  # Generate CSV

    # geometry helper columns not wanted in the GeoJSON exports
    colstodrop = ["bounds", "bus_0_coors", "bus_1_coors"]

    # Export to GeoJSON for quick validations
    save_to_geojson(gpd.GeoDataFrame(lines.drop(columns = colstodrop), geometry = "geometry", crs = geo_crs), outputs["lines_geojson"])
    save_to_geojson(gpd.GeoDataFrame(converters, geometry = "geometry", crs = geo_crs), outputs["converters_geojson"])
    save_to_geojson(gpd.GeoDataFrame(transformers.drop(columns = colstodrop), geometry = "geometry", crs = geo_crs), outputs["transformers_geojson"])

    # create clean directory if not already exist
    if not os.path.exists(outputs["substations"]):
        os.makedirs(os.path.dirname(outputs["substations"]), exist_ok=True)
    # Generate CSV
    to_csv_nafix(buses, outputs["substations"])
    save_to_geojson(gpd.GeoDataFrame(buses, geometry = "geometry", crs = geo_crs), outputs["substations_geojson"])

    return None
# Function to check if two lines are connected
def are_lines_connected(line1, line2):
    """
    Return True if ``line1`` and ``line2`` share (approximately) an endpoint.

    Endpoints are taken from the "bus_0_coors"/"bus_1_coors" entries and
    compared pairwise with are_almost_equal.
    """
    # return (line1['geometry'].touches(line2['geometry']))
    # BUGFIX: the original returned the 4-tuple itself; a non-empty tuple is
    # always truthy, so every pair of lines counted as connected. Reduce the
    # four endpoint comparisons with any() instead.
    return any(
        (
            are_almost_equal(line1["bus_0_coors"], line2["bus_0_coors"]),
            are_almost_equal(line1["bus_0_coors"], line2["bus_1_coors"]),
            are_almost_equal(line1["bus_1_coors"], line2["bus_0_coors"]),
            are_almost_equal(line1["bus_1_coors"], line2["bus_1_coors"]),
        )
    )


def _dfs(adj_matrix, visited, current_vertex, path):
    """
    Depth-first search: append every vertex reachable from current_vertex
    to path (marking it in visited) and return path.

    NOTE(review): recursion depth grows with the longest chain of connected
    lines — confirm this stays below the interpreter recursion limit.
    """
    visited[current_vertex] = True
    path.append(current_vertex)
    for neighbor in range(len(adj_matrix)):
        if adj_matrix[current_vertex][neighbor] == 1 and not visited[neighbor]:
            _dfs(adj_matrix, visited, neighbor, path)
    return path


# Returns all connected paths as a vector
def find_paths(adj_matrix):
    """
    Return the connected components of the graph given by adj_matrix as
    lists of vertex indices (in DFS visiting order).
    """
    visited = [False] * len(adj_matrix)
    paths = []
    for vertex in range(len(adj_matrix)):
        if not visited[vertex]:
            path = _dfs(adj_matrix, visited, vertex, [])
            if path:
                paths.append(path)
    return paths


def are_almost_equal(point1, point2, tolerance=1e-6):
    """
    Check if two Shapely points are almost equal with a given tolerance.

    Args:
        point1 (Point): First Shapely point.
        point2 (Point): Second Shapely point.
        tolerance (float): Tolerance for coordinate deviation.

    Returns:
        bool: True if the points are almost equal, False otherwise.
    """
    return abs(point1.x - point2.x) < tolerance and abs(point1.y - point2.y) < tolerance
if __name__ == "__main__":
    # Detect running outside of snakemake and mock snakemake for testing
    if "snakemake" not in globals():
        from _helpers import mock_snakemake

        snakemake = mock_snakemake("build_osm_network")

    configure_logging(snakemake)

    # load default crs from the snakemake config
    geo_crs = snakemake.config["crs"]["geo_crs"]
    distance_crs = snakemake.config["crs"]["distance_crs"]

    build_osm_network = snakemake.config["build_osm_network"]
    countries = snakemake.config["countries"]

    # Track peak memory while the network is being built; the optional
    # "memory" log target receives the samples.
    with memory_logger(
        filename=getattr(snakemake.log, "memory", None), interval=30.0
    ) as mem:
        build_network(
            snakemake.input,
            snakemake.output,
            build_osm_network,
            countries,
            geo_crs,
            distance_crs,
        )

    logger.info(f"Maximum memory usage: {mem.mem_usage}")
"snakemake" not in globals(): - from _helpers import mock_snakemake - - snakemake = mock_snakemake("clean_osm_data") - - configure_logging(snakemake) - logger.info("Dummy log: clean_osm_data()") +def add_line_endings_tosubstations(substations, lines): + if lines.empty: + return substations - ############# BUSES / SUBSTATIONS ###################### - input_path_substations = { - "substations_way": snakemake.input.substations_way, - "substations_relation": snakemake.input.substations_relation, - } - - cols_substations_way = ["id", "geometry", "country", "power", "substation", "voltage", "frequency"] - cols_substations_relation = ["id", "country", "power", "substation", "voltage", "frequency"] - df_substations_way = pd.DataFrame(columns = cols_substations_way) - df_substations_relation = pd.DataFrame(columns = cols_substations_relation) - - for key in input_path_substations: - logger.info(f"Processing {key}...") - for idx, ip in enumerate(input_path_substations[key]): - if os.path.exists(ip) and os.path.getsize(ip) > 400: # unpopulated OSM json is about 51 bytes - country = os.path.basename(os.path.dirname(input_path_substations[key][idx])) - logger.info(f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(input_path_substations[key])).zfill(2)}: {ip}") - with open(ip, "r") as f: - data = json.load(f) - - df = pd.DataFrame(data['elements']) - df["id"] = df["id"].astype(str) - df["country"] = country - - col_tags = ["power", "substation", "voltage", "frequency"] - - tags = pd.json_normalize(df["tags"]) \ - .map(lambda x: str(x) if pd.notnull(x) else x) - - for ct in col_tags: - if ct not in tags.columns: - tags[ct] = pd.NA - - tags = tags.loc[:, col_tags] - - df = pd.concat([df, tags], axis="columns") + # extract columns from substation df + bus_s = pd.DataFrame(columns=substations.columns) + bus_e = pd.DataFrame(columns=substations.columns) - if key == "substations_way": - df.drop(columns=["type", "tags", "bounds", "nodes"], inplace=True) - df_substations_way = 
pd.concat([df_substations_way, df], axis="rows") - elif key == "substations_relation": - df.drop(columns=["type", "tags", "bounds"], inplace=True) - df_substations_relation = pd.concat([df_substations_relation, df], axis="rows") - - else: - logger.info(f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(input_path_substations[key])).zfill(2)} (empty): {ip}") - continue - logger.info("---") - - df_substations_way.drop_duplicates(subset='id', keep='first', inplace=True) - df_substations_relation.drop_duplicates(subset='id', keep='first', inplace=True) - - df_substations_way["geometry"] = df_substations_way.apply(_create_polygon, axis=1) - - # Normalise the members column of df_substations_relation - cols_members = ["id", "type", "ref", "role", "geometry"] - df_substations_relation_members = pd.DataFrame(columns = cols_members) + # Read information from line.csv + bus_s[["voltage", "country"]] = lines[["voltage", "country"]].astype(str) + bus_s["geometry"] = lines.geometry.boundary.map( + lambda p: p.geoms[0] if len(p.geoms) >= 2 else None + ) + bus_s["lon"] = bus_s["geometry"].map(lambda p: p.x if p != None else None) + bus_s["lat"] = bus_s["geometry"].map(lambda p: p.y if p != None else None) + bus_s["bus_id"] = ( + (substations["bus_id"].max() if "bus_id" in substations else 0) + + 1 + + bus_s.index + ) + bus_s["dc"] = lines["dc"] - for index, row in df_substations_relation.iterrows(): - col_members = ["type", "ref", "role", "geometry"] - df = pd.json_normalize(row["members"]) - - for cm in col_members: - if cm not in df.columns: - df[cm] = pd.NA + bus_e[["voltage", "country"]] = lines[["voltage", "country"]].astype(str) + bus_e["geometry"] = lines.geometry.boundary.map( + lambda p: p.geoms[1] if len(p.geoms) >= 2 else None + ) + bus_e["lon"] = bus_e["geometry"].map(lambda p: p.x if p != None else None) + bus_e["lat"] = bus_e["geometry"].map(lambda p: p.y if p != None else None) + bus_e["bus_id"] = bus_s["bus_id"].max() + 1 + bus_e.index + bus_e["dc"] = lines["dc"] 
- df = df.loc[:, col_members] - df["id"] = str(row["id"]) - df["ref"] = df["ref"].astype(str) - df = df[df["type"] != "node"] - df = df.dropna(subset=["geometry"]) - df = df[~df["role"].isin(["", "incoming_line", "substation", "inner"])] - df_substations_relation_members = pd.concat([df_substations_relation_members, df], axis="rows") - - df_substations_relation_members.reset_index(inplace=True) - df_substations_relation_members["linestring"] = df_substations_relation_members.apply(_create_linestring, axis=1) - df_substations_relation_members_grouped = df_substations_relation_members.groupby('id')['linestring'] \ - .apply(lambda x: linemerge(x.tolist())).reset_index() - df_substations_relation_members_grouped["geometry"] = df_substations_relation_members_grouped["linestring"].apply(lambda x: x.convex_hull) - - df_substations_relation = df_substations_relation.join( - df_substations_relation_members_grouped.set_index('id'), - on='id', how='left' - ).drop(columns=["members", "linestring"]) \ - .dropna(subset=["geometry"]) - - # reorder columns and concatenate - df_substations_relation = df_substations_relation[cols_substations_way] - df_substations = pd.concat([df_substations_way, df_substations_relation], axis="rows") + bus_all = pd.concat([bus_s, bus_e], ignore_index=True) - # Create centroids from geometries - df_substations.loc[:, "geometry"] = df_substations["geometry"].apply(lambda x: x.centroid) - df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x) - df_substations.loc[:, "lat"] = df_substations["geometry"].apply(lambda x: x.y) + # Initialize default values + bus_all["station_id"] = np.nan + # Assuming substations completed for installed lines + bus_all["under_construction"] = False + bus_all["tag_area"] = 0.0 + bus_all["symbol"] = "substation" + # TODO: this tag may be improved, maybe depending on voltage levels + bus_all["tag_substation"] = "transmission" + bus_all["tag_source"] = "line_ending" - # Clean columns - 
df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) - df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) - df_substations["frequency"] = df_substations["frequency"].astype(str, errors="ignore") + buses = pd.concat([substations, bus_all], ignore_index=True) - list_voltages = df_substations["voltage"].str.split(";").explode().unique().astype(str) - list_voltages = list_voltages[np.vectorize(len)(list_voltages) >= 6] - list_voltages = list_voltages[~np.char.startswith(list_voltages, '1')] + # # Assign index to bus_id + buses["bus_id"] = buses.index - bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) - df_substations = df_substations[bool_voltages] + # TODO: pypsa-eur: change this later to improve country assignment + bool_multiple_countries = buses["country"].str.contains(";") + buses.loc[bool_multiple_countries, "country"] = buses.loc[bool_multiple_countries, "country"].str.split(";").str[0] - df_substations = _split_cells(df_substations) - bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) - df_substations = df_substations[bool_voltages] - df_substations["split_count"] = df_substations["id"].apply(lambda x: x.split("_")[1] if "_" in x else "0") - df_substations["split_count"] = df_substations["split_count"].astype(int) + return buses - bool_split = df_substations["split_elements"] > 1 - bool_frequency_len = df_substations["frequency"].apply(lambda x: len(x.split(";"))) == df_substations["split_elements"] - df_substations.loc[bool_frequency_len & bool_split, "frequency"] = df_substations.loc[bool_frequency_len & bool_split, "frequency"] \ - - op_freq = lambda row: row["frequency"].split(";")[row["split_count"]-1] - df_substations.loc[bool_frequency_len & bool_split, ["frequency"]] = df_substations.loc[bool_frequency_len & bool_split, ] \ - .apply(op_freq, axis=1) - - df_substations = _split_cells(df_substations, cols=["frequency"]) - 
bool_invalid_frequency = df_substations["frequency"].apply(lambda x: x not in ["50", "0"]) - df_substations.loc[bool_invalid_frequency, "frequency"] = "50" - df_substations["power"] = "substation" - df_substations["substation"] = "transmission" - df_substations["dc"] = False - df_substations.loc[df_substations["frequency"] == "0", "dc"] = True - df_substations["under_construction"] = False - df_substations["station_id"] = None - df_substations["tag_area"] = None +if __name__ == "__main__": + # Detect running outside of snakemake and mock snakemake for testing + if "snakemake" not in globals(): + from _helpers import mock_snakemake - # rename columns - df_substations.rename( - columns={ - "id": "bus_id", - "power": "symbol", - "substation":"tag_substation", - }, inplace=True) - - df_substations = df_substations[[ - "bus_id", - "symbol", - "tag_substation", - "voltage", - "lon", - "lat", - "dc", - "under_construction", - "station_id", - "tag_area", - "country", - "geometry", - ]] + snakemake = mock_snakemake("clean_osm_data") - gdf_substations = gpd.GeoDataFrame(df_substations, geometry = "geometry", crs = "EPSG:4326") - - filepath_substations = snakemake.output["substations"] - # save substations output - logger.info(f"Exporting clean substations to {filepath_substations}") - parentfolder_substations = os.path.dirname(filepath_substations) - if not os.path.exists(parentfolder_substations): - # Create the folder and its parent directories if they don't exist - os.makedirs(parentfolder_substations) - - gdf_substations.to_file(filepath_substations, driver="GeoJSON") + configure_logging(snakemake) + logger.info("Dummy log: clean_osm_data()") - ############# LINES AND CABLES ###################### + ############# LINES AND CABLES ###################### input_path_lines_cables = { "lines": snakemake.input.lines_way, @@ -655,7 +542,7 @@ def _any_substring_in_list(s, list_strings): df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables] \ .apply(lambda row: 
str(row["circuits"].split(";")[ - int(row["id"].split("_")[-1])-1 + int(row["id"].split("-")[-1])-1 ]), axis=1) df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" @@ -671,7 +558,7 @@ def _any_substring_in_list(s, list_strings): .apply(lambda row: str(max(1, np.floor_divide( - int(row["cables"].split(";")[int(row["id"].split("_")[-1])-1]), + int(row["cables"].split(";")[int(row["id"].split("-")[-1])-1]), 3 ) )), @@ -683,9 +570,11 @@ def _any_substring_in_list(s, list_strings): # All remaining lines to circuits == 1 bool_leftover = (df_lines["cleaned"] == False) - str_id = "; ".join(str(id) for id in df_lines.loc[bool_leftover, "id"]) - logger.info(f"Setting circuits of remaining {sum(bool_leftover)} lines to 1...") - logger.info(f"Lines affected: {str_id}") + if sum(bool_leftover) > 0: + str_id = "; ".join(str(id) for id in df_lines.loc[bool_leftover, "id"]) + logger.info(f"Setting circuits of remaining {sum(bool_leftover)} lines to 1...") + logger.info(f"Lines affected: {str_id}") + df_lines.loc[bool_leftover, "circuits"] = "1" df_lines.loc[bool_leftover & bool_ac, "frequency"] = "50" df_lines.loc[bool_leftover & bool_dc, "frequency"] = "0" @@ -702,11 +591,13 @@ def _any_substring_in_list(s, list_strings): df_lines["bus0"] = None df_lines["bus1"] = None df_lines["length"] = None + df_lines["underground"] = False df_lines.loc[df_lines["tag_type"] == "line", "underground"] = False df_lines.loc[df_lines["tag_type"] == "cable", "underground"] = True df_lines["under_construction"] = False - df_lines.loc[df_lines["tag_frequency"] == "0", "dc"] = True + df_lines["dc"] = False df_lines.loc[df_lines["tag_frequency"] == "50", "dc"] = False + df_lines.loc[df_lines["tag_frequency"] == "0", "dc"] = True df_lines = df_lines[[ "line_id", @@ -728,9 +619,183 @@ def _any_substring_in_list(s, list_strings): # Drop all rows where the geometry has equal start and end point bool_circle = df_lines["geometry"].apply(lambda x: x.coords[0] == x.coords[-1]) df_lines = 
df_lines[~bool_circle] + + # TODO pypsa-eur: Temporary solution as one AC line between converters will create an error in simplify_network + # As this case is not considered there: + lines_to_drop = ["775580659"] + if lines_to_drop in df_lines["line_id"].values: + df_lines.drop(df_lines[df_lines["line_id"].isin(lines_to_drop)].index, inplace=True) gdf_lines = gpd.GeoDataFrame(df_lines, geometry = "geometry", crs = "EPSG:4326") + # Lines data types + gdf_lines["circuits"] = gdf_lines["circuits"].astype(int) + gdf_lines["voltage"] = gdf_lines["voltage"].astype(int) + gdf_lines["tag_frequency"] = gdf_lines["tag_frequency"].astype(int) + + + ############# BUSES / SUBSTATIONS ###################### + input_path_substations = { + "substations_way": snakemake.input.substations_way, + "substations_relation": snakemake.input.substations_relation, + } + + cols_substations_way = ["id", "geometry", "country", "power", "substation", "voltage", "frequency"] + cols_substations_relation = ["id", "country", "power", "substation", "voltage", "frequency"] + df_substations_way = pd.DataFrame(columns = cols_substations_way) + df_substations_relation = pd.DataFrame(columns = cols_substations_relation) + + for key in input_path_substations: + logger.info(f"Processing {key}...") + for idx, ip in enumerate(input_path_substations[key]): + if os.path.exists(ip) and os.path.getsize(ip) > 400: # unpopulated OSM json is about 51 bytes + country = os.path.basename(os.path.dirname(input_path_substations[key][idx])) + logger.info(f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(input_path_substations[key])).zfill(2)}: {ip}") + with open(ip, "r") as f: + data = json.load(f) + + df = pd.DataFrame(data['elements']) + df["id"] = df["id"].astype(str) + # new string that adds "way/" to id + df["id"] = df["id"].apply(lambda x: f"way/{x}" if key == "substations_way" else f"relation/{x}") + df["country"] = country + + col_tags = ["power", "substation", "voltage", "frequency"] + + tags = 
pd.json_normalize(df["tags"]) \ + .map(lambda x: str(x) if pd.notnull(x) else x) + + for ct in col_tags: + if ct not in tags.columns: + tags[ct] = pd.NA + + tags = tags.loc[:, col_tags] + + df = pd.concat([df, tags], axis="columns") + + if key == "substations_way": + df.drop(columns=["type", "tags", "bounds", "nodes"], inplace=True) + df_substations_way = pd.concat([df_substations_way, df], axis="rows") + elif key == "substations_relation": + df.drop(columns=["type", "tags", "bounds"], inplace=True) + df_substations_relation = pd.concat([df_substations_relation, df], axis="rows") + + else: + logger.info(f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(input_path_substations[key])).zfill(2)} (empty): {ip}") + continue + logger.info("---") + + df_substations_way.drop_duplicates(subset='id', keep='first', inplace=True) + df_substations_relation.drop_duplicates(subset='id', keep='first', inplace=True) + + df_substations_way["geometry"] = df_substations_way.apply(_create_polygon, axis=1) + + # Normalise the members column of df_substations_relation + cols_members = ["id", "type", "ref", "role", "geometry"] + df_substations_relation_members = pd.DataFrame(columns = cols_members) + + for index, row in df_substations_relation.iterrows(): + col_members = ["type", "ref", "role", "geometry"] + df = pd.json_normalize(row["members"]) + + for cm in col_members: + if cm not in df.columns: + df[cm] = pd.NA + + df = df.loc[:, col_members] + df["id"] = str(row["id"]) + df["ref"] = df["ref"].astype(str) + df = df[df["type"] != "node"] + df = df.dropna(subset=["geometry"]) + df = df[~df["role"].isin(["", "incoming_line", "substation", "inner"])] + df_substations_relation_members = pd.concat([df_substations_relation_members, df], axis="rows") + + df_substations_relation_members.reset_index(inplace=True) + df_substations_relation_members["linestring"] = df_substations_relation_members.apply(_create_linestring, axis=1) + df_substations_relation_members_grouped = 
df_substations_relation_members.groupby('id')['linestring'] \ + .apply(lambda x: linemerge(x.tolist())).reset_index() + df_substations_relation_members_grouped["geometry"] = df_substations_relation_members_grouped["linestring"].apply(lambda x: x.convex_hull) + + df_substations_relation = df_substations_relation.join( + df_substations_relation_members_grouped.set_index('id'), + on='id', how='left' + ).drop(columns=["members", "linestring"]) \ + .dropna(subset=["geometry"]) + + # reorder columns and concatenate + df_substations_relation = df_substations_relation[cols_substations_way] + df_substations = pd.concat([df_substations_way, df_substations_relation], axis="rows") + + # Create centroids from geometries + df_substations.loc[:, "polygon"] = df_substations["geometry"] + df_substations.loc[:, "geometry"] = df_substations["geometry"].apply(lambda x: x.centroid) + df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x) + df_substations.loc[:, "lat"] = df_substations["geometry"].apply(lambda x: x.y) + + # Clean columns + df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) + df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) + df_substations["frequency"] = df_substations["frequency"].astype(str, errors="ignore") + + list_voltages = df_substations["voltage"].str.split(";").explode().unique().astype(str) + list_voltages = list_voltages[np.vectorize(len)(list_voltages) >= 6] + list_voltages = list_voltages[~np.char.startswith(list_voltages, '1')] + + bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) + df_substations = df_substations[bool_voltages] + + df_substations = _split_cells(df_substations) + bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) + df_substations = df_substations[bool_voltages] + df_substations["split_count"] = df_substations["id"].apply(lambda x: x.split("-")[1] if "-" in x else "0") + 
df_substations["split_count"] = df_substations["split_count"].astype(int) + + bool_split = df_substations["split_elements"] > 1 + bool_frequency_len = df_substations["frequency"].apply(lambda x: len(x.split(";"))) == df_substations["split_elements"] + df_substations.loc[bool_frequency_len & bool_split, "frequency"] = df_substations.loc[bool_frequency_len & bool_split, "frequency"] \ + + op_freq = lambda row: row["frequency"].split(";")[row["split_count"]-1] + + df_substations.loc[bool_frequency_len & bool_split, ["frequency"]] = df_substations.loc[bool_frequency_len & bool_split, ] \ + .apply(op_freq, axis=1) + + df_substations = _split_cells(df_substations, cols=["frequency"]) + bool_invalid_frequency = df_substations["frequency"].apply(lambda x: x not in ["50", "0"]) + df_substations.loc[bool_invalid_frequency, "frequency"] = "50" + df_substations["power"] = "substation" + df_substations["substation"] = "transmission" + df_substations["dc"] = False + df_substations.loc[df_substations["frequency"] == "0", "dc"] = True + df_substations["under_construction"] = False + df_substations["station_id"] = None + df_substations["tag_area"] = None + df_substations["tag_source"] = df_substations["id"] + + + # Create an empty list to store the results + results = [] + + for index, row in tqdm(gdf_lines.iterrows(), total=len(gdf_lines), desc="Processing LineStrings"): + line = row['geometry'] + # Check if the LineString is within any Polygon in 'substations_df' + is_within_any_substation = any(line.within(substation_polygon) for substation_polygon in df_substations["polygon"]) + results.append(is_within_any_substation) + + # Add the results to 'gdf_lines' + gdf_lines['within_substation'] = results + + # gdf_sub = gpd.GeoDataFrame(df_substations[["id", "polygon"]], geometry = "polygon", crs = "EPSG:4326") + # fig = Figure(width = "70%", height = 600) + + # m = gdf_sub.explore(name = "Subs", color = "red") + # m = gdf_lines.explore(m = m, name = "lines") + + # 
folium.LayerControl(collapsed = False).add_to(m) + + # fig.add_child(m) + # m + gdf_lines = gdf_lines[~gdf_lines["within_substation"]] + filepath_lines = snakemake.output["lines"] # save substations output logger.info(f"Exporting clean lines to {filepath_lines}") @@ -740,25 +805,55 @@ def _any_substring_in_list(s, list_strings): os.makedirs(parentfolder_lines) gdf_lines.to_file(filepath_lines, driver="GeoJSON") - - - ######## - ######## - ######## - - - fig = Figure(width = "50%", height = 600) - m = gdf_substations.explore(name = "Buses", color = "red") - m = gdf_lines.explore(m = m, name = "Lines") - folium.LayerControl(collapsed = False).add_to(m) + # rename columns + df_substations.rename( + columns={ + "id": "bus_id", + "power": "symbol", + "substation":"tag_substation", + }, inplace=True) + + df_substations = df_substations[[ + "bus_id", + "symbol", + "tag_substation", + "voltage", + "lon", + "lat", + "dc", + "under_construction", + "station_id", + "tag_area", + "country", + "geometry", + "tag_source", + ]] + + df_substations["bus_id"] = df_substations.index - fig.add_child(m) - m + df_substations = add_line_endings_tosubstations( + df_substations, gdf_lines + ) + + #group gdf_substations by voltage and and geometry (dropping duplicates) + df_substations = df_substations.groupby(["voltage", "lon", "lat", "tag_source"]).first().reset_index() + df_substations["bus_id"] = df_substations.index + + gdf_substations = gpd.GeoDataFrame(df_substations, geometry = "geometry", crs = "EPSG:4326") - gdf_substations.explore() + # Substation data types + gdf_substations["bus_id"] = gdf_substations["bus_id"].astype(int) + gdf_substations["voltage"] = gdf_substations["voltage"].astype(int) + filepath_substations = snakemake.output["substations"] + # save substations output + logger.info(f"Exporting clean substations to {filepath_substations}") + parentfolder_substations = os.path.dirname(filepath_substations) + if not os.path.exists(parentfolder_substations): + # Create the 
folder and its parent directories if they don't exist + os.makedirs(parentfolder_substations) - output = str(snakemake.output) - clean_osm_data(output) \ No newline at end of file + gdf_substations.to_file(filepath_substations, driver="GeoJSON") + \ No newline at end of file From 266a8d0ca69964b14b88e570a1ba288524be0046 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 7 May 2024 22:32:17 +0200 Subject: [PATCH 009/100] Working osm-network-fast --- rules/build_electricity.smk | 121 ++-- scripts/base_network_osm.py | 1133 ++++++++++++++++++++++++++++++++++ scripts/build_osm_network.py | 13 +- scripts/clean_osm_data.py | 13 +- scripts/simplify_network.py | 7 +- 5 files changed, 1228 insertions(+), 59 deletions(-) create mode 100644 scripts/base_network_osm.py diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 5e7b362de..630d1b46d 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -64,42 +64,80 @@ rule build_powerplants: script: "../scripts/build_powerplants.py" +if config["base_network"] == "eegk": + rule base_network: + params: + countries=config_provider("countries"), + snapshots=config_provider("snapshots"), + drop_leap_day=config_provider("enable", "drop_leap_day"), + lines=config_provider("lines"), + links=config_provider("links"), + transformers=config_provider("transformers"), + input: + eg_buses="data/entsoegridkit/buses.csv", + eg_lines="data/entsoegridkit/lines.csv", + eg_links="data/entsoegridkit/links.csv", + eg_converters="data/entsoegridkit/converters.csv", + eg_transformers="data/entsoegridkit/transformers.csv", + parameter_corrections="data/parameter_corrections.yaml", + links_p_nom="data/links_p_nom.csv", + links_tyndp="data/links_tyndp.csv", + country_shapes=resources("country_shapes.geojson"), + offshore_shapes=resources("offshore_shapes.geojson"), + europe_shape=resources("europe_shape.geojson"), + output: + base_network=resources("networks/base.nc"), + 
regions_onshore=resources("regions_onshore.geojson"), + regions_offshore=resources("regions_offshore.geojson"), + log: + logs("base_network.log"), + benchmark: + benchmarks("base_network") + threads: 1 + resources: + mem_mb=1500, + conda: + "../envs/environment.yaml" + script: + "../scripts/base_network.py" -rule base_network: - params: - countries=config_provider("countries"), - snapshots=config_provider("snapshots"), - drop_leap_day=config_provider("enable", "drop_leap_day"), - lines=config_provider("lines"), - links=config_provider("links"), - transformers=config_provider("transformers"), - input: - eg_buses="data/entsoegridkit/buses.csv", - eg_lines="data/entsoegridkit/lines.csv", - eg_links="data/entsoegridkit/links.csv", - eg_converters="data/entsoegridkit/converters.csv", - eg_transformers="data/entsoegridkit/transformers.csv", - parameter_corrections="data/parameter_corrections.yaml", - links_p_nom="data/links_p_nom.csv", - links_tyndp="data/links_tyndp.csv", - country_shapes=resources("country_shapes.geojson"), - offshore_shapes=resources("offshore_shapes.geojson"), - europe_shape=resources("europe_shape.geojson"), - output: - base_network=resources("networks/base.nc"), - regions_onshore=resources("regions_onshore.geojson"), - regions_offshore=resources("regions_offshore.geojson"), - log: - logs("base_network.log"), - benchmark: - benchmarks("base_network") - threads: 1 - resources: - mem_mb=1500, - conda: - "../envs/environment.yaml" - script: - "../scripts/base_network.py" + +if config["base_network"] == "osm": + rule base_network: + params: + countries=config_provider("countries"), + snapshots=config_provider("snapshots"), + drop_leap_day=config_provider("enable", "drop_leap_day"), + lines=config_provider("lines"), + links=config_provider("links"), + transformers=config_provider("transformers"), + input: + eg_buses="data/osm/buses.csv", + eg_lines="data/osm/lines.csv", + # eg_links="data/entsoegridkit/links.csv", + 
eg_converters="data/osm/converters.csv", + eg_transformers="data/osm/transformers.csv", + # parameter_corrections="data/parameter_corrections.yaml", + links_p_nom="data/links_p_nom.csv", + links_tyndp="data/links_tyndp_osm.csv", + country_shapes=resources("country_shapes.geojson"), + offshore_shapes=resources("offshore_shapes.geojson"), + europe_shape=resources("europe_shape.geojson"), + output: + base_network=resources("networks/base.nc"), + regions_onshore=resources("regions_onshore.geojson"), + regions_offshore=resources("regions_offshore.geojson"), + log: + logs("base_network.log"), + benchmark: + benchmarks("base_network") + threads: 1 + resources: + mem_mb=1500, + conda: + "../envs/environment.yaml" + script: + "../scripts/base_network_osm.py" rule build_shapes: @@ -597,9 +635,6 @@ if config["osm"].get("retrieve", True): substations_way="data/osm/raw/{country}/substations_way_raw.json", substations_node="data/osm/raw/{country}/substations_node_raw.json", substations_relation="data/osm/raw/{country}/substations_relation_raw.json", - # transformers_way="data/osm/raw/{country}/transformers_way_raw.json", - # transformers_node="data/osm/raw/{country}/transformers_node_raw.json", - # route_relations="data/osm/raw/{country}/route_relations_raw.json", log: logs("retrieve_osm_data_{country}.log"), script: @@ -607,21 +642,12 @@ if config["osm"].get("retrieve", True): rule clean_osm_data: - # params: - # countries=config["countries"], input: - # **{ - # f"{country}": [f"data/osm/raw/{country}/{feature}.geojson" for feature in FEATURES] - # for country in config["countries"] - # }, cables_way=[f"data/osm/raw/{country}/cables_way_raw.json" for country in config["countries"]], lines_way=[f"data/osm/raw/{country}/lines_way_raw.json" for country in config["countries"]], substations_way=[f"data/osm/raw/{country}/substations_way_raw.json" for country in config["countries"]], substations_node=[f"data/osm/raw/{country}/substations_node_raw.json" for country in 
config["countries"]], substations_relation=[f"data/osm/raw/{country}/substations_relation_raw.json" for country in config["countries"]], - # transformers_way=[f"data/osm/raw/{country}/transformers_way_raw.json" for country in config["countries"]], - # transformers_node=[f"data/osm/raw/{country}/transformers_node_raw.json" for country in config["countries"]], - # route_relations=[f"data/osm/raw/{country}/route_relations_raw.json" for country in config["countries"]], output: substations="data/osm/clean/substations.geojson", lines="data/osm/clean/lines.geojson", @@ -635,6 +661,7 @@ rule build_osm_network: input: substations="data/osm/clean/substations.geojson", lines="data/osm/clean/lines.geojson", + country_shapes=resources("country_shapes.geojson"), output: lines="data/osm/lines.csv", converters="data/osm/converters.csv", diff --git a/scripts/base_network_osm.py b/scripts/base_network_osm.py new file mode 100644 index 000000000..874c778fe --- /dev/null +++ b/scripts/base_network_osm.py @@ -0,0 +1,1133 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: : 2017-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: MIT + +# coding: utf-8 +""" +Creates the network topology from a `ENTSO-E map extract. + +`_ (March 2022) as a PyPSA +network. + +Relevant Settings +----------------- + +.. code:: yaml + + countries: + + electricity: + voltages: + + lines: + types: + s_max_pu: + under_construction: + + links: + p_max_pu: + under_construction: + include_tyndp: + + transformers: + x: + s_nom: + type: + +.. seealso:: + Documentation of the configuration file ``config/config.yaml`` at + :ref:`snapshots_cf`, :ref:`toplevel_cf`, :ref:`electricity_cf`, :ref:`load_cf`, + :ref:`lines_cf`, :ref:`links_cf`, :ref:`transformers_cf` + +Inputs +------ + +- ``data/entsoegridkit``: Extract from the geographical vector data of the online `ENTSO-E Interactive Map `_ by the `GridKit `_ toolkit dating back to March 2022. 
+- ``data/parameter_corrections.yaml``: Corrections for ``data/entsoegridkit`` +- ``data/links_p_nom.csv``: confer :ref:`links` +- ``data/links_tyndp.csv``: List of projects in the `TYNDP 2018 `_ that are at least *in permitting* with fields for start- and endpoint (names and coordinates), length, capacity, construction status, and project reference ID. +- ``resources/country_shapes.geojson``: confer :ref:`shapes` +- ``resources/offshore_shapes.geojson``: confer :ref:`shapes` +- ``resources/europe_shape.geojson``: confer :ref:`shapes` + +Outputs +------- + +- ``networks/base.nc`` + + .. image:: img/base.png + :scale: 33 % + +- ``resources/regions_onshore.geojson``: + + .. image:: img/regions_onshore.png + :scale: 33 % + +- ``resources/regions_offshore.geojson``: + + .. image:: img/regions_offshore.png + :scale: 33 % + +Description +----------- +Creates the network topology from an ENTSO-E map extract, and create Voronoi shapes for each bus representing both onshore and offshore regions. 
+""" + +import logging +from itertools import product + +import geopandas as gpd +import networkx as nx +import numpy as np +import pandas as pd +import pypsa +import shapely +import shapely.prepared +import shapely.wkt +import yaml +from _helpers import REGION_COLS, configure_logging, get_snapshots, set_scenario_config +from packaging.version import Version, parse +from scipy import spatial +from scipy.sparse import csgraph +from shapely.geometry import LineString, Point, Polygon + +PD_GE_2_2 = parse(pd.__version__) >= Version("2.2") + +logger = logging.getLogger(__name__) + + +def _get_oid(df): + if "tags" in df.columns: + return df.tags.str.extract('"oid"=>"(\d+)"', expand=False) + else: + return pd.Series(np.nan, df.index) + + +def _get_country(df): + if "tags" in df.columns: + return df.tags.str.extract('"country"=>"([A-Z]{2})"', expand=False) + else: + return pd.Series(np.nan, df.index) + + +def _find_closest_links(links, new_links, distance_upper_bound=1.5): + treecoords = np.asarray( + [ + np.asarray(shapely.wkt.loads(s).coords)[[0, -1]].flatten() + for s in links.geometry + ] + ) + querycoords = np.vstack( + [new_links[["x1", "y1", "x2", "y2"]], new_links[["x2", "y2", "x1", "y1"]]] + ) + tree = spatial.KDTree(treecoords) + dist, ind = tree.query(querycoords, distance_upper_bound=distance_upper_bound) + found_b = ind < len(links) + found_i = np.arange(len(new_links) * 2)[found_b] % len(new_links) + return ( + pd.DataFrame( + dict(D=dist[found_b], i=links.index[ind[found_b] % len(links)]), + index=new_links.index[found_i], + ) + .sort_values(by="D")[lambda ds: ~ds.index.duplicated(keep="first")] + .sort_index()["i"] + ) + + +def _load_buses_from_eg(eg_buses, europe_shape, config_elec): + buses = ( + pd.read_csv( + eg_buses, + quotechar="'", + true_values=["t"], + false_values=["f"], + dtype=dict(bus_id="str"), + ) + .set_index("bus_id") + .drop(["station_id"], axis=1) + .rename(columns=dict(voltage="v_nom")) + ) + + buses["carrier"] = 
buses.pop("dc").map({True: "DC", False: "AC"}) + buses["under_construction"] = buses.under_construction.where( + lambda s: s.notnull(), False + ).astype(bool) + + # remove all buses outside of all countries including exclusive economic zones (offshore) + europe_shape = gpd.read_file(europe_shape).loc[0, "geometry"] + # TODO pypsa-eur: Temporary fix: Convex hull, this is important when nodes are between countries + europe_shape = europe_shape.convex_hull + + europe_shape_prepped = shapely.prepared.prep(europe_shape) + buses_in_europe_b = buses[["x", "y"]].apply( + lambda p: europe_shape_prepped.contains(Point(p)), axis=1 + ) + + + # TODO pypsa-eur: Find a long-term solution + # buses_with_v_nom_to_keep_b = ( + # buses.v_nom.isin(config_elec["voltages"]) | buses.v_nom.isnull() + # ) + + # Quick fix: + buses_with_v_nom_to_keep_b = (min(config_elec["voltages"]) <= buses.v_nom) & (buses.v_nom <= max(config_elec["voltages"])) + + logger.info( + f'Removing buses with voltages {pd.Index(buses.v_nom.unique()).dropna().difference(config_elec["voltages"])}' + ) + return pd.DataFrame(buses.loc[buses_in_europe_b & buses_with_v_nom_to_keep_b]) + + +def _load_transformers_from_eg(buses, eg_transformers): + transformers = pd.read_csv( + eg_transformers, + quotechar="'", + true_values=["t"], + false_values=["f"], + dtype=dict(transformer_id="str", bus0="str", bus1="str"), + ).set_index("transformer_id") + + transformers = _remove_dangling_branches(transformers, buses) + + return transformers + + +def _load_converters_from_eg(buses, eg_converters): + converters = pd.read_csv( + eg_converters, + quotechar="'", + true_values=["t"], + false_values=["f"], + dtype=dict(converter_id="str", bus0="str", bus1="str"), + ).set_index("converter_id") + + converters = _remove_dangling_branches(converters, buses) + + converters["carrier"] = "B2B" + + return converters + + +def _load_links_from_eg(buses, eg_links): + links = pd.read_csv( + eg_links, + quotechar="'", + true_values=["t"], + 
false_values=["f"], + dtype=dict(link_id="str", bus0="str", bus1="str", under_construction="bool"), + ).set_index("link_id") + + links["length"] /= 1e3 + + # Skagerrak Link is connected to 132kV bus which is removed in _load_buses_from_eg. + # Connect to neighboring 380kV bus + links.loc[links.bus1 == "6396", "bus1"] = "6398" + + links = _remove_dangling_branches(links, buses) + + # Add DC line parameters + links["carrier"] = "DC" + + return links + + +def _add_links_from_tyndp(buses, links, links_tyndp, europe_shape): + links_tyndp = pd.read_csv(links_tyndp) + + # remove all links from list which lie outside all of the desired countries + europe_shape = gpd.read_file(europe_shape).loc[0, "geometry"] + europe_shape_prepped = shapely.prepared.prep(europe_shape) + x1y1_in_europe_b = links_tyndp[["x1", "y1"]].apply( + lambda p: europe_shape_prepped.contains(Point(p)), axis=1 + ) + x2y2_in_europe_b = links_tyndp[["x2", "y2"]].apply( + lambda p: europe_shape_prepped.contains(Point(p)), axis=1 + ) + is_within_covered_countries_b = x1y1_in_europe_b & x2y2_in_europe_b + + if not is_within_covered_countries_b.all(): + logger.info( + "TYNDP links outside of the covered area (skipping): " + + ", ".join(links_tyndp.loc[~is_within_covered_countries_b, "Name"]) + ) + + links_tyndp = links_tyndp.loc[is_within_covered_countries_b] + if links_tyndp.empty: + return buses, links + + has_replaces_b = links_tyndp.replaces.notnull() + oids = dict(Bus=_get_oid(buses), Link=_get_oid(links)) + keep_b = dict( + Bus=pd.Series(True, index=buses.index), Link=pd.Series(True, index=links.index) + ) + for reps in links_tyndp.loc[has_replaces_b, "replaces"]: + for comps in reps.split(":"): + oids_to_remove = comps.split(".") + c = oids_to_remove.pop(0) + keep_b[c] &= ~oids[c].isin(oids_to_remove) + buses = buses.loc[keep_b["Bus"]] + links = links.loc[keep_b["Link"]] + + links_tyndp["j"] = _find_closest_links( + links, links_tyndp, distance_upper_bound=0.20 + ) + # Corresponds approximately to 20km 
tolerances + + if links_tyndp["j"].notnull().any(): + logger.info( + "TYNDP links already in the dataset (skipping): " + + ", ".join(links_tyndp.loc[links_tyndp["j"].notnull(), "Name"]) + ) + links_tyndp = links_tyndp.loc[links_tyndp["j"].isnull()] + if links_tyndp.empty: + return buses, links + + tree = spatial.KDTree(buses[["x", "y"]]) + _, ind0 = tree.query(links_tyndp[["x1", "y1"]]) + ind0_b = ind0 < len(buses) + links_tyndp.loc[ind0_b, "bus0"] = buses.index[ind0[ind0_b]] + + _, ind1 = tree.query(links_tyndp[["x2", "y2"]]) + ind1_b = ind1 < len(buses) + links_tyndp.loc[ind1_b, "bus1"] = buses.index[ind1[ind1_b]] + + links_tyndp_located_b = ( + links_tyndp["bus0"].notnull() & links_tyndp["bus1"].notnull() + ) + if not links_tyndp_located_b.all(): + logger.warning( + "Did not find connected buses for TYNDP links (skipping): " + + ", ".join(links_tyndp.loc[~links_tyndp_located_b, "Name"]) + ) + links_tyndp = links_tyndp.loc[links_tyndp_located_b] + + logger.info("Adding the following TYNDP links: " + ", ".join(links_tyndp["Name"])) + + links_tyndp = links_tyndp[["bus0", "bus1"]].assign( + carrier="DC", + p_nom=links_tyndp["Power (MW)"], + length=links_tyndp["Length (given) (km)"].fillna( + links_tyndp["Length (distance*1.2) (km)"] + ), + under_construction=True, + underground=False, + geometry=( + links_tyndp[["x1", "y1", "x2", "y2"]].apply( + lambda s: str(LineString([[s.x1, s.y1], [s.x2, s.y2]])), axis=1 + ) + ), + tags=( + '"name"=>"' + + links_tyndp["Name"] + + '", ' + + '"ref"=>"' + + links_tyndp["Ref"] + + '", ' + + '"status"=>"' + + links_tyndp["status"] + + '"' + ), + ) + + links_tyndp.index = "T" + links_tyndp.index.astype(str) + + links = pd.concat([links, links_tyndp], sort=True) + + return buses, links + + +def _load_lines_from_eg(buses, eg_lines): + lines = ( + pd.read_csv( + eg_lines, + quotechar="'", + true_values=["t"], + false_values=["f"], + dtype=dict( + line_id="str", + bus0="str", + bus1="str", + underground="bool", + 
under_construction="bool", + ), + ) + .set_index("line_id") + .rename(columns=dict(voltage="v_nom", circuits="num_parallel")) + ) + + lines["length"] /= 1e3 + + # lines["carrier"] = "AC" #TODO pypsa-eur clean/remove this + lines = _remove_dangling_branches(lines, buses) + + return lines + + +def _apply_parameter_corrections(n, parameter_corrections): + with open(parameter_corrections) as f: + corrections = yaml.safe_load(f) + + if corrections is None: + return + + for component, attrs in corrections.items(): + df = n.df(component) + oid = _get_oid(df) + if attrs is None: + continue + + for attr, repls in attrs.items(): + for i, r in repls.items(): + if i == "oid": + r = oid.map(repls["oid"]).dropna() + elif i == "index": + r = pd.Series(repls["index"]) + else: + raise NotImplementedError() + inds = r.index.intersection(df.index) + df.loc[inds, attr] = r[inds].astype(df[attr].dtype) + + +def _reconnect_crimea(lines): + logger.info("Reconnecting Crimea to the Ukrainian grid.") + lines_to_crimea = pd.DataFrame( + { + "bus0": ["3065", "3181", "3181"], + "bus1": ["3057", "3055", "3057"], + "v_nom": [300, 300, 300], + "num_parallel": [1, 1, 1], + "length": [140, 120, 140], + "carrier": ["AC", "AC", "AC"], + "underground": [False, False, False], + "under_construction": [False, False, False], + }, + index=["Melitopol", "Liubymivka left", "Luibymivka right"], + ) + + return pd.concat([lines, lines_to_crimea]) + + +# def _set_electrical_parameters_lines(lines, config): +# v_noms = config["electricity"]["voltages"] +# linetypes = config["lines"]["types"] + +# for v_nom in v_noms: +# lines.loc[lines["v_nom"] == v_nom, "type"] = linetypes[v_nom] + +def _set_electrical_parameters_lines(lines_config, voltages, lines): + if lines.empty: + lines["type"] = [] + return lines + + linetypes = _get_linetypes_config(lines_config["types"], voltages) + + lines["carrier"] = "AC" + lines["dc"] = False + + lines.loc[:, "type"] = lines.v_nom.apply( + lambda x: _get_linetype_by_voltage(x, 
linetypes) + ) + + lines["s_max_pu"] = lines_config["s_max_pu"] + + return lines + + +def _set_lines_s_nom_from_linetypes(n): + n.lines["s_nom"] = ( + np.sqrt(3) + * n.lines["type"].map(n.line_types.i_nom) + * n.lines["v_nom"] + * n.lines["num_parallel"] + ) + # Re-define s_nom for DC lines + n.lines.loc[n.lines["carrier"] == "DC", "s_nom"] = n.lines["type"].map( + n.line_types.i_nom + ) * n.lines.eval("v_nom * num_parallel") + + +def _set_electrical_parameters_dc_lines(lines_config, voltages, lines): + if lines.empty: + lines["type"] = [] + return lines + + linetypes = _get_linetypes_config(lines_config["dc_types"], voltages) + + lines["carrier"] = "DC" + lines["dc"] = True + lines.loc[:, "type"] = lines.v_nom.apply( + lambda x: _get_linetype_by_voltage(x, linetypes) + ) + + lines["s_max_pu"] = lines_config["s_max_pu"] + + return lines + +# TODO pypsa-eur: Clean/fix this, update list p_noms +def _set_electrical_parameters_links(links, config, links_p_nom): + if links.empty: + return links + + p_max_pu = config["links"].get("p_max_pu", 1.0) + links["p_max_pu"] = p_max_pu + links["p_min_pu"] = -p_max_pu + + links_p_nom = pd.read_csv(links_p_nom) + + # filter links that are not in operation anymore + removed_b = links_p_nom.Remarks.str.contains("Shut down|Replaced", na=False) + links_p_nom = links_p_nom[~removed_b] + + # find closest link for all links in links_p_nom + links_p_nom["j"] = _find_closest_links(links, links_p_nom) + + links_p_nom = links_p_nom.groupby(["j"], as_index=False).agg({"Power (MW)": "sum"}) + + p_nom = links_p_nom.dropna(subset=["j"]).set_index("j")["Power (MW)"] + + # Don't update p_nom if it's already set + p_nom_unset = ( + p_nom.drop(links.index[links.p_nom.notnull()], errors="ignore") + if "p_nom" in links + else p_nom + ) + links.loc[p_nom_unset.index, "p_nom"] = p_nom_unset + + links["carrier"] = "DC" + links["dc"] = True + + return links + + +def _set_electrical_parameters_converters(converters, config): + p_max_pu = 
config["links"].get("p_max_pu", 1.0) + converters["p_max_pu"] = p_max_pu + converters["p_min_pu"] = -p_max_pu + + converters["p_nom"] = 2000 + + # Converters are combined with links + converters["under_construction"] = False + converters["underground"] = False + + return converters + + +def _set_electrical_parameters_transformers(transformers, config): + config = config["transformers"] + + ## Add transformer parameters + transformers["x"] = config.get("x", 0.1) + transformers["s_nom"] = config.get("s_nom", 2000) + transformers["type"] = config.get("type", "") + + return transformers + + +def _remove_dangling_branches(branches, buses): + return pd.DataFrame( + branches.loc[branches.bus0.isin(buses.index) & branches.bus1.isin(buses.index)] + ) + + +def _remove_unconnected_components(network, threshold=6): + _, labels = csgraph.connected_components(network.adjacency_matrix(), directed=False) + component = pd.Series(labels, index=network.buses.index) + + component_sizes = component.value_counts() + components_to_remove = component_sizes.loc[component_sizes < threshold] + + logger.info( + f"Removing {len(components_to_remove)} unconnected network components with less than {components_to_remove.max()} buses. In total {components_to_remove.sum()} buses." 
+ ) + + return network[component == component_sizes.index[0]] + + +def _set_countries_and_substations(n, config, country_shapes, offshore_shapes): + buses = n.buses + + def buses_in_shape(shape): + shape = shapely.prepared.prep(shape) + return pd.Series( + np.fromiter( + ( + shape.contains(Point(x, y)) + for x, y in buses.loc[:, ["x", "y"]].values + ), + dtype=bool, + count=len(buses), + ), + index=buses.index, + ) + + countries = config["countries"] + country_shapes = gpd.read_file(country_shapes).set_index("name")["geometry"] + # reindexing necessary for supporting empty geo-dataframes + offshore_shapes = gpd.read_file(offshore_shapes) + offshore_shapes = offshore_shapes.reindex(columns=["name", "geometry"]).set_index( + "name" + )["geometry"] + substation_b = buses["symbol"].str.contains( + "substation|converter station", case=False + ) + + def prefer_voltage(x, which): + index = x.index + if len(index) == 1: + return pd.Series(index, index) + key = ( + x.index[0] + if x["v_nom"].isnull().all() + else getattr(x["v_nom"], "idx" + which)() + ) + return pd.Series(key, index) + + compat_kws = dict(include_groups=False) if PD_GE_2_2 else {} + gb = buses.loc[substation_b].groupby( + ["x", "y"], as_index=False, group_keys=False, sort=False + ) + bus_map_low = gb.apply(prefer_voltage, "min", **compat_kws) + lv_b = (bus_map_low == bus_map_low.index).reindex(buses.index, fill_value=False) + bus_map_high = gb.apply(prefer_voltage, "max", **compat_kws) + hv_b = (bus_map_high == bus_map_high.index).reindex(buses.index, fill_value=False) + + onshore_b = pd.Series(False, buses.index) + offshore_b = pd.Series(False, buses.index) + + for country in countries: + onshore_shape = country_shapes[country] + onshore_country_b = buses_in_shape(onshore_shape) + onshore_b |= onshore_country_b + + buses.loc[onshore_country_b, "country"] = country + + if country not in offshore_shapes.index: + continue + offshore_country_b = buses_in_shape(offshore_shapes[country]) + offshore_b |= 
offshore_country_b + + buses.loc[offshore_country_b, "country"] = country + + # Only accept buses as low-voltage substations (where load is attached), if + # they have at least one connection which is not under_construction + has_connections_b = pd.Series(False, index=buses.index) + for b, df in product(("bus0", "bus1"), (n.lines, n.links)): + has_connections_b |= ~df.groupby(b).under_construction.min() + + buses["onshore_bus"] = onshore_b + buses["substation_lv"] = ( + lv_b & onshore_b & (~buses["under_construction"]) & has_connections_b + ) + + # TODO: fix this in pypsa-eur master branch + # buses["substation_off"] = offshore_b & ( + # ~buses["under_construction"] + # ) + + buses["substation_off"] = (offshore_b | (hv_b & onshore_b)) & ( + ~buses["under_construction"] + ) + + c_nan_b = buses.country.fillna("na") == "na" + if c_nan_b.sum() > 0: + c_tag = _get_country(buses.loc[c_nan_b]) + c_tag.loc[~c_tag.isin(countries)] = np.nan + n.buses.loc[c_nan_b, "country"] = c_tag + + c_tag_nan_b = n.buses.country.isnull() + + # Nearest country in path length defines country of still homeless buses + # Work-around until commit 705119 lands in pypsa release + n.transformers["length"] = 0.0 + graph = n.graph(weight="length") + n.transformers.drop("length", axis=1, inplace=True) + + for b in n.buses.index[c_tag_nan_b]: + df = ( + pd.DataFrame( + dict( + pathlength=nx.single_source_dijkstra_path_length( + graph, b, cutoff=200 + ) + ) + ) + .join(n.buses.country) + .dropna() + ) + assert ( + not df.empty + ), "No buses with defined country within 200km of bus `{}`".format(b) + n.buses.at[b, "country"] = df.loc[df.pathlength.idxmin(), "country"] + + logger.warning( + "{} buses are not in any country or offshore shape," + " {} have been assigned from the tag of the entsoe map," + " the rest from the next bus in terms of pathlength.".format( + c_nan_b.sum(), c_nan_b.sum() - c_tag_nan_b.sum() + ) + ) + + return buses + + +def _replace_b2b_converter_at_country_border_by_link(n): + # 
Affects only the B2B converter in Lithuania at the Polish border at the moment + buscntry = n.buses.country + linkcntry = n.links.bus0.map(buscntry) + converters_i = n.links.index[ + (n.links.carrier == "B2B") & (linkcntry == n.links.bus1.map(buscntry)) + ] + + def findforeignbus(G, i): + cntry = linkcntry.at[i] + for busattr in ("bus0", "bus1"): + b0 = n.links.at[i, busattr] + for b1 in G[b0]: + if buscntry[b1] != cntry: + return busattr, b0, b1 + return None, None, None + + for i in converters_i: + G = n.graph() + busattr, b0, b1 = findforeignbus(G, i) + if busattr is not None: + comp, line = next(iter(G[b0][b1])) + if comp != "Line": + logger.warning( + "Unable to replace B2B `{}` expected a Line, but found a {}".format( + i, comp + ) + ) + continue + + n.links.at[i, busattr] = b1 + n.links.at[i, "p_nom"] = min( + n.links.at[i, "p_nom"], n.lines.at[line, "s_nom"] + ) + n.links.at[i, "carrier"] = "DC" + n.links.at[i, "underwater_fraction"] = 0.0 + n.links.at[i, "length"] = n.lines.at[line, "length"] + + n.remove("Line", line) + n.remove("Bus", b0) + + logger.info( + "Replacing B2B converter `{}` together with bus `{}` and line `{}` by an HVDC tie-line {}-{}".format( + i, b0, line, linkcntry.at[i], buscntry.at[b1] + ) + ) + + +def _set_links_underwater_fraction(n, offshore_shapes): + if n.links.empty: + return + + if not hasattr(n.links, "geometry"): + n.links["underwater_fraction"] = 0.0 + else: + offshore_shape = gpd.read_file(offshore_shapes).unary_union + links = gpd.GeoSeries(n.links.geometry.dropna().map(shapely.wkt.loads)) + n.links["underwater_fraction"] = ( + links.intersection(offshore_shape).length / links.length + ) + + +def _adjust_capacities_of_under_construction_branches(n, config): + lines_mode = config["lines"].get("under_construction", "undef") + if lines_mode == "zero": + n.lines.loc[n.lines.under_construction, "num_parallel"] = 0.0 + n.lines.loc[n.lines.under_construction, "s_nom"] = 0.0 + elif lines_mode == "remove": + n.mremove("Line", 
n.lines.index[n.lines.under_construction]) + elif lines_mode != "keep": + logger.warning( + "Unrecognized configuration for `lines: under_construction` = `{}`. Keeping under construction lines." + ) + + links_mode = config["links"].get("under_construction", "undef") + if links_mode == "zero": + n.links.loc[n.links.under_construction, "p_nom"] = 0.0 + elif links_mode == "remove": + n.mremove("Link", n.links.index[n.links.under_construction]) + elif links_mode != "keep": + logger.warning( + "Unrecognized configuration for `links: under_construction` = `{}`. Keeping under construction links." + ) + + if lines_mode == "remove" or links_mode == "remove": + # We might need to remove further unconnected components + n = _remove_unconnected_components(n) + + return n + + +def _set_shapes(n, country_shapes, offshore_shapes): + # Write the geodataframes country_shapes and offshore_shapes to the network.shapes component + country_shapes = gpd.read_file(country_shapes).rename(columns={"name": "idx"}) + country_shapes["type"] = "country" + offshore_shapes = gpd.read_file(offshore_shapes).rename(columns={"name": "idx"}) + offshore_shapes["type"] = "offshore" + all_shapes = pd.concat([country_shapes, offshore_shapes], ignore_index=True) + n.madd( + "Shape", + all_shapes.index, + geometry=all_shapes.geometry, + idx=all_shapes.idx, + type=all_shapes["type"], + ) + + +def base_network_osm( + eg_buses, + eg_converters, + eg_transformers, + eg_lines, + links_p_nom, + europe_shape, + country_shapes, + offshore_shapes, + config, +): + buses = _load_buses_from_eg(eg_buses, europe_shape, config["electricity"]) + + + + #TODO pypsa-eur add this + # links = _load_links_from_eg(buses, eg_links) + # if config["links"].get("include_tyndp"): + # buses, links = _add_links_from_tyndp(buses, links, links_tyndp, europe_shape) + + converters = _load_converters_from_eg(buses, eg_converters) + + lines = _load_lines_from_eg(buses, eg_lines) + transformers = _load_transformers_from_eg(buses, 
eg_transformers) + + if config["lines"].get("reconnect_crimea", True) and "UA" in config["countries"]: + lines = _reconnect_crimea(lines) + + lines_ac = lines[lines.tag_frequency.astype(float) != 0].copy() + lines_dc = lines[lines.tag_frequency.astype(float) == 0].copy() + + lines_ac = _set_electrical_parameters_lines( + config["lines"], + config["electricity"]["voltages"], + lines_ac + ) + + lines_dc = _set_electrical_parameters_dc_lines( + config["lines"], + config["electricity"]["voltages"], + lines_dc + ) + + # lines = _set_electrical_parameters_lines(lines, config) + transformers = _set_electrical_parameters_transformers(transformers, config) + # links = _set_electrical_parameters_links(links, config, links_p_nom) + converters = _set_electrical_parameters_converters(converters, config) + + n = pypsa.Network() + n.name = "PyPSA-Eur (OSM)" + + time = get_snapshots(snakemake.params.snapshots, snakemake.params.drop_leap_day) + n.set_snapshots(time) + n.madd("Carrier", ["AC", "DC"]) # TODO: fix hard code and check if AC/DC truly exist + + n.import_components_from_dataframe(buses, "Bus") + + if config["base_network_osm"]["hvdc_as_lines"]: + lines = pd.concat([lines_ac, lines_dc]) + n.import_components_from_dataframe(lines, "Line") + else: + lines_dc = _set_electrical_parameters_links(lines_dc, config, links_p_nom) + # parse line information into p_nom required for converters + lines_dc["p_nom"] = lines_dc.apply( + lambda x: x["v_nom"] * n.line_types.i_nom[x["type"]], + axis=1, + result_type="reduce", + ) + n.import_components_from_dataframe(lines_ac, "Line") + # The columns which names starts with "bus" are mixed up with the third-bus specification + # when executing additional_linkports() + # lines_dc.drop( + # labels=[ + # "bus0_lon", + # "bus0_lat", + # "bus1_lon", + # "bus1_lat", + # "bus_0_coors", + # "bus_1_coors", + # ], + # axis=1, + # inplace=True, + # ) + n.import_components_from_dataframe(lines_dc, "Link") + + # n.import_components_from_dataframe(lines, 
"Line") + n.import_components_from_dataframe(transformers, "Transformer") + # n.import_components_from_dataframe(links, "Link") + n.import_components_from_dataframe(converters, "Link") + + _set_lines_s_nom_from_linetypes(n) + + #TODO pypsa-eur add this + # _apply_parameter_corrections(n, parameter_corrections) + + # TODO: what about this? + n = _remove_unconnected_components(n) + + _set_countries_and_substations(n, config, country_shapes, offshore_shapes) + + #TODO pypsa-eur add this + _set_links_underwater_fraction(n, offshore_shapes) + + _replace_b2b_converter_at_country_border_by_link(n) + + n = _adjust_capacities_of_under_construction_branches(n, config) + + _set_shapes(n, country_shapes, offshore_shapes) + + return n + +def _get_linetypes_config(line_types, voltages): + """ + Return the dictionary of linetypes for selected voltages. The dictionary is + a subset of the dictionary line_types, whose keys match the selected + voltages. + + Parameters + ---------- + line_types : dict + Dictionary of linetypes: keys are nominal voltages and values are linetypes. + voltages : list + List of selected voltages. + + Returns + ------- + Dictionary of linetypes for selected voltages. + """ + # get voltages value that are not availabile in the line types + vnoms_diff = set(voltages).symmetric_difference(set(line_types.keys())) + if vnoms_diff: + logger.warning( + f"Voltages {vnoms_diff} not in the {line_types} or {voltages} list." + ) + return {k: v for k, v in line_types.items() if k in voltages} + +def _get_linetype_by_voltage(v_nom, d_linetypes): + """ + Return the linetype of a specific line based on its voltage v_nom. + + Parameters + ---------- + v_nom : float + The voltage of the line. + d_linetypes : dict + Dictionary of linetypes: keys are nominal voltages and values are linetypes. + + Returns + ------- + The linetype of the line whose nominal voltage is closest to the line voltage. 
+ """ + v_nom_min, line_type_min = min( + d_linetypes.items(), + key=lambda x: abs(x[0] - v_nom), + ) + return line_type_min + + +def voronoi_partition_pts(points, outline): + """ + Compute the polygons of a voronoi partition of `points` within the polygon + `outline`. Taken from + https://github.com/FRESNA/vresutils/blob/master/vresutils/graph.py. + + Attributes + ---------- + points : Nx2 - ndarray[dtype=float] + outline : Polygon + Returns + ------- + polygons : N - ndarray[dtype=Polygon|MultiPolygon] + """ + points = np.asarray(points) + + if len(points) == 1: + polygons = [outline] + else: + xmin, ymin = np.amin(points, axis=0) + xmax, ymax = np.amax(points, axis=0) + xspan = xmax - xmin + yspan = ymax - ymin + + # to avoid any network positions outside all Voronoi cells, append + # the corners of a rectangle framing these points + vor = spatial.Voronoi( + np.vstack( + ( + points, + [ + [xmin - 3.0 * xspan, ymin - 3.0 * yspan], + [xmin - 3.0 * xspan, ymax + 3.0 * yspan], + [xmax + 3.0 * xspan, ymin - 3.0 * yspan], + [xmax + 3.0 * xspan, ymax + 3.0 * yspan], + ], + ) + ) + ) + + polygons = [] + for i in range(len(points)): + poly = Polygon(vor.vertices[vor.regions[vor.point_region[i]]]) + + if not poly.is_valid: + poly = poly.buffer(0) + + with np.errstate(invalid="ignore"): + poly = poly.intersection(outline) + + polygons.append(poly) + + return polygons + + +def build_bus_shapes(n, country_shapes, offshore_shapes, countries): + country_shapes = gpd.read_file(country_shapes).set_index("name")["geometry"] + offshore_shapes = gpd.read_file(offshore_shapes) + offshore_shapes = offshore_shapes.reindex(columns=REGION_COLS).set_index("name")[ + "geometry" + ] + + onshore_regions = [] + offshore_regions = [] + + for country in countries: + c_b = n.buses.country == country + + onshore_shape = country_shapes[country] + onshore_locs = ( + n.buses.loc[c_b & n.buses.onshore_bus] + .sort_values( + by="substation_lv", ascending=False + ) # preference for substations + 
.drop_duplicates(subset=["x", "y"], keep="first")[["x", "y"]] + ) + onshore_regions.append( + gpd.GeoDataFrame( + { + "name": onshore_locs.index, + "x": onshore_locs["x"], + "y": onshore_locs["y"], + "geometry": voronoi_partition_pts( + onshore_locs.values, onshore_shape + ), + "country": country, + } + ) + ) + + if country not in offshore_shapes.index: + continue + offshore_shape = offshore_shapes[country] + offshore_locs = n.buses.loc[c_b & n.buses.substation_off, ["x", "y"]] + offshore_regions_c = gpd.GeoDataFrame( + { + "name": offshore_locs.index, + "x": offshore_locs["x"], + "y": offshore_locs["y"], + "geometry": voronoi_partition_pts(offshore_locs.values, offshore_shape), + "country": country, + } + ) + offshore_regions_c = offshore_regions_c.loc[offshore_regions_c.area > 1e-2] + offshore_regions.append(offshore_regions_c) + + shapes = pd.concat(onshore_regions, ignore_index=True) + + return onshore_regions, offshore_regions, shapes + + +def append_bus_shapes(n, shapes, type): + """ + Append shapes to the network. If shapes with the same component and type + already exist, they will be removed. + + Parameters: + n (pypsa.Network): The network to which the shapes will be appended. + shapes (geopandas.GeoDataFrame): The shapes to be appended. + **kwargs: Additional keyword arguments used in `n.madd`. 
+ + Returns: + None + """ + remove = n.shapes.query("component == 'Bus' and type == @type").index + n.mremove("Shape", remove) + + offset = n.shapes.index.astype(int).max() + 1 if not n.shapes.empty else 0 + shapes = shapes.rename(lambda x: int(x) + offset) + n.madd( + "Shape", + shapes.index, + geometry=shapes.geometry, + idx=shapes.name, + component="Bus", + type=type, + ) + + +if __name__ == "__main__": + if "snakemake" not in globals(): + from _helpers import mock_snakemake + + snakemake = mock_snakemake("base_network") + configure_logging(snakemake) + set_scenario_config(snakemake) + + #TODO pypsa-eur add this + # n = base_network( + # snakemake.input.eg_buses, + # snakemake.input.eg_converters, + # snakemake.input.eg_transformers, + # snakemake.input.eg_lines, + # snakemake.input.eg_links, + # snakemake.input.links_p_nom, + # snakemake.input.links_tyndp, + # snakemake.input.europe_shape, + # snakemake.input.country_shapes, + # snakemake.input.offshore_shapes, + # snakemake.input.parameter_corrections, + # snakemake.config, + # ) + + n = base_network_osm( + snakemake.input.eg_buses, + snakemake.input.eg_converters, + snakemake.input.eg_transformers, + snakemake.input.eg_lines, + snakemake.input.links_p_nom, + snakemake.input.europe_shape, + snakemake.input.country_shapes, + snakemake.input.offshore_shapes, + snakemake.config, + ) + + onshore_regions, offshore_regions, shapes = build_bus_shapes( + n, + snakemake.input.country_shapes, + snakemake.input.offshore_shapes, + snakemake.params.countries, + ) + + shapes.to_file(snakemake.output.regions_onshore) + append_bus_shapes(n, shapes, "onshore") + + if offshore_regions: + shapes = pd.concat(offshore_regions, ignore_index=True) + shapes.to_file(snakemake.output.regions_offshore) + append_bus_shapes(n, shapes, "offshore") + else: + offshore_shapes.to_frame().to_file(snakemake.output.regions_offshore) + + n.meta = snakemake.config + n.export_to_netcdf(snakemake.output.base_network) \ No newline at end of file diff 
--git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index bc0e46541..c64a23706 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -1050,12 +1050,15 @@ def build_network( "tag_type", "tag_frequency", "country", "bounds", "bus_0_coors", "bus_1_coors", "bus0_lon", "bus0_lat", "bus1_lon", "bus1_lat"] + cols_lines_csv = ["bus0", "bus1", "voltage", "circuits", "tag_frequency", "length", "underground", "under_construction", "geometry"] + lines_csv = lines[cols_lines_csv] lines = lines[cols_lines] - cols_lines_csv = ["bus_id", "station_id", "voltage", "dc", "symbol", "under_construction", "tags", "x","y"] + + - to_csv_nafix(lines, outputs["lines"]) # Generate CSV - to_csv_nafix(converters, outputs["converters"]) # Generate CSV - to_csv_nafix(transformers, outputs["transformers"]) # Generate CSV + to_csv_nafix(lines_csv, outputs["lines"], quotechar="'") # Generate CSV + to_csv_nafix(converters, outputs["converters"], quotechar="'") # Generate CSV + to_csv_nafix(transformers, outputs["transformers"], quotechar="'") # Generate CSV colstodrop = ["bounds", "bus_0_coors", "bus_1_coors"] @@ -1068,7 +1071,7 @@ def build_network( if not os.path.exists(outputs["substations"]): os.makedirs(os.path.dirname(outputs["substations"]), exist_ok=True) # Generate CSV - to_csv_nafix(buses, outputs["substations"]) + to_csv_nafix(buses, outputs["substations"], quotechar="'") save_to_geojson(gpd.GeoDataFrame(buses, geometry = "geometry", crs = geo_crs), outputs["substations_geojson"]) return None diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index a87e30823..c7fe13f46 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -386,7 +386,6 @@ def add_line_endings_tosubstations(substations, lines): snakemake = mock_snakemake("clean_osm_data") configure_logging(snakemake) - logger.info("Dummy log: clean_osm_data()") ############# LINES AND CABLES ###################### @@ -401,6 +400,7 @@ def 
add_line_endings_tosubstations(substations, lines): # using tqdm loop over input path + logger.info("Importing lines and cables") for key in input_path_lines_cables: logger.info(f"Processing {key}...") for idx, ip in enumerate(input_path_lines_cables[key]): @@ -436,6 +436,7 @@ def add_line_endings_tosubstations(substations, lines): continue logger.info("---") + logger.info("Cleaning lines and cables") # Find duplicates based on id column duplicate_rows = df_lines[df_lines.duplicated(subset=['id'], keep=False)].copy() # group rows by id and aggregate the country column to a string split by semicolon @@ -645,6 +646,7 @@ def add_line_endings_tosubstations(substations, lines): df_substations_way = pd.DataFrame(columns = cols_substations_way) df_substations_relation = pd.DataFrame(columns = cols_substations_relation) + logger.info("Importing substations") for key in input_path_substations: logger.info(f"Processing {key}...") for idx, ip in enumerate(input_path_substations[key]): @@ -732,6 +734,7 @@ def add_line_endings_tosubstations(substations, lines): df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x) df_substations.loc[:, "lat"] = df_substations["geometry"].apply(lambda x: x.y) + logger.info("Cleaning substations") # Clean columns df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) @@ -771,11 +774,11 @@ def add_line_endings_tosubstations(substations, lines): df_substations["tag_area"] = None df_substations["tag_source"] = df_substations["id"] - # Create an empty list to store the results results = [] - for index, row in tqdm(gdf_lines.iterrows(), total=len(gdf_lines), desc="Processing LineStrings"): + logger.info("Removing linestrings within substation polygons...") + for index, row in tqdm(gdf_lines.iterrows(), total=len(gdf_lines)): line = row['geometry'] # Check if the LineString is within any Polygon in 'substations_df' 
is_within_any_substation = any(line.within(substation_polygon) for substation_polygon in df_substations["polygon"]) @@ -804,9 +807,9 @@ def add_line_endings_tosubstations(substations, lines): # Create the folder and its parent directories if they don't exist os.makedirs(parentfolder_lines) + logger.info(f"Exporting clean lines to {filepath_lines}") gdf_lines.to_file(filepath_lines, driver="GeoJSON") - # rename columns df_substations.rename( columns={ @@ -833,6 +836,7 @@ def add_line_endings_tosubstations(substations, lines): df_substations["bus_id"] = df_substations.index + logger.info("Adding line endings to substations") df_substations = add_line_endings_tosubstations( df_substations, gdf_lines ) @@ -855,5 +859,6 @@ def add_line_endings_tosubstations(substations, lines): # Create the folder and its parent directories if they don't exist os.makedirs(parentfolder_substations) + logger.info(f"Exporting clean substations to {filepath_substations}") gdf_substations.to_file(filepath_substations, driver="GeoJSON") \ No newline at end of file diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py index 558e4cf28..ceefb3dda 100644 --- a/scripts/simplify_network.py +++ b/scripts/simplify_network.py @@ -108,7 +108,7 @@ logger = logging.getLogger(__name__) -def simplify_network_to_380(n): +def simplify_network_to_380(n, linetype_380): """ Fix all lines to a voltage level of 380 kV and remove all transformers. 
@@ -124,7 +124,7 @@ def simplify_network_to_380(n): n.buses["v_nom"] = 380.0 - (linetype_380,) = n.lines.loc[n.lines.v_nom == 380.0, "type"].unique() + # TODO pypsa-eur: In the future, make this even more generic (voltage level) n.lines["type"] = linetype_380 n.lines["v_nom"] = 380 n.lines["i_nom"] = n.line_types.i_nom[linetype_380] @@ -536,7 +536,8 @@ def cluster( # remove integer outputs for compatibility with PyPSA v0.26.0 n.generators.drop("n_mod", axis=1, inplace=True, errors="ignore") - n, trafo_map = simplify_network_to_380(n) + linetype_380 = snakemake.config["lines"]["types"][380] + n, trafo_map = simplify_network_to_380(n, linetype_380) technology_costs = load_costs( snakemake.input.tech_costs, From 6264bf8ae8f7395ce2d9dfae75414611fad36d22 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 7 May 2024 22:44:34 +0200 Subject: [PATCH 010/100] Bug fixes. --- scripts/base_network_osm.py | 2 +- scripts/clean_osm_data.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/base_network_osm.py b/scripts/base_network_osm.py index 874c778fe..64e059b59 100644 --- a/scripts/base_network_osm.py +++ b/scripts/base_network_osm.py @@ -157,7 +157,7 @@ def _load_buses_from_eg(eg_buses, europe_shape, config_elec): # remove all buses outside of all countries including exclusive economic zones (offshore) europe_shape = gpd.read_file(europe_shape).loc[0, "geometry"] # TODO pypsa-eur: Temporary fix: Convex hull, this is important when nodes are between countries - europe_shape = europe_shape.convex_hull + # europe_shape = europe_shape.convex_hull europe_shape_prepped = shapely.prepared.prep(europe_shape) buses_in_europe_b = buses[["x", "y"]].apply( diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index c7fe13f46..ae51cfe19 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -777,7 +777,7 @@ def add_line_endings_tosubstations(substations, lines): # Create an empty list to store the results results = [] - 
logger.info("Removing linestrings within substation polygons...") + logger.info("Identifying and removing lines within substation polygons...") for index, row in tqdm(gdf_lines.iterrows(), total=len(gdf_lines)): line = row['geometry'] # Check if the LineString is within any Polygon in 'substations_df' @@ -798,6 +798,7 @@ def add_line_endings_tosubstations(substations, lines): # fig.add_child(m) # m gdf_lines = gdf_lines[~gdf_lines["within_substation"]] + logger.info(f"Removed {sum(results)} lines within substations.") filepath_lines = snakemake.output["lines"] # save substations output From 027fbdee6d793b27a1f64878b4b9d0abd8278797 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 15 May 2024 14:20:40 +0200 Subject: [PATCH 011/100] Finalised and cleaned including docstrings. --- scripts/retrieve_osm_data.py | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 9a4526a5f..91bf1782e 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -18,6 +18,20 @@ def _get_overpass_areas(countries): + """ + Retrieve the OSM area codes for the specified country codes. + + Parameters + ---------- + countries : str or list + A single country code or a list of country codes for which the OSM area codes should be retrieved. + + Returns + ------- + dict + A dictionary mapping country codes to their corresponding OSM area codes. + """ + # If a single country code is provided, convert it to a list if not isinstance(countries, list): countries = [countries] @@ -67,10 +81,26 @@ def retrieve_osm_data( "substations_way", "substations_node", "substations_relation", - # "transformers_way", - # "transformers_node", - # "route_relations", ]): + """ + Retrieve OSM data for the specified country and save it to the specified output files. + + Parameters + ---------- + country : str + The country code for which the OSM data should be retrieved. 
+ output : dict + A dictionary mapping feature names to the corresponding output file paths. Saving the OSM data to .json files. + features : list, optional + A list of OSM features to retrieve. The default is [ + "cables_way", + "lines_way", + "substations_way", + "substations_node", + "substations_relation", + ]. + """ + op_area = _get_overpass_areas(country) From dc829d2ec8ff04e9edd29e8417794837c0925e9d Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 15 May 2024 18:59:32 +0200 Subject: [PATCH 012/100] Added try catch to retrieve_osm_data. Allows for parallelisation of downloads. --- rules/build_electricity.smk | 95 +++++++++++----------- scripts/base_network_osm.py | 68 ++++++---------- scripts/build_osm_network.py | 21 +++-- scripts/clean_osm_data.py | 25 ++---- scripts/retrieve_osm_data.py | 150 ++++++++++++++++++++++------------- 5 files changed, 185 insertions(+), 174 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index ec47e1cbe..b4b53b621 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -64,7 +64,8 @@ rule build_powerplants: script: "../scripts/build_powerplants.py" -if config["base_network"] == "eegk": + +if config["electricity_network"]["base_network"] == "gridkit": rule base_network: params: countries=config_provider("countries"), @@ -102,7 +103,7 @@ if config["base_network"] == "eegk": "../scripts/base_network.py" -if config["base_network"] == "osm": +if config["electricity_network"]["base_network"] == "osm": rule base_network: params: countries=config_provider("countries"), @@ -112,12 +113,11 @@ if config["base_network"] == "osm": links=config_provider("links"), transformers=config_provider("transformers"), input: - eg_buses="data/osm/buses.csv", - eg_lines="data/osm/lines.csv", + eg_buses=resources("osm/buses.csv"), + eg_lines=resources("osm/lines.csv"), # eg_links="data/entsoegridkit/links.csv", - eg_converters="data/osm/converters.csv", - 
eg_transformers="data/osm/transformers.csv", - # parameter_corrections="data/parameter_corrections.yaml", + eg_converters=resources("osm/converters.csv"), + eg_transformers=resources("osm/transformers.csv"), links_p_nom="data/links_p_nom.csv", links_tyndp="data/links_tyndp_osm.csv", country_shapes=resources("country_shapes.geojson"), @@ -609,53 +609,56 @@ rule prepare_network: "../scripts/prepare_network.py" -if config["osm"].get("retrieve", True): - rule retrieve_osm_data: - output: - cables_way="data/osm/raw/{country}/cables_way_raw.json", - lines_way="data/osm/raw/{country}/lines_way_raw.json", - substations_way="data/osm/raw/{country}/substations_way_raw.json", - substations_node="data/osm/raw/{country}/substations_node_raw.json", - substations_relation="data/osm/raw/{country}/substations_relation_raw.json", - log: - logs("retrieve_osm_data_{country}.log"), - script: - "../scripts/retrieve_osm_data.py" +rule retrieve_osm_data: + output: + cables_way="data/osm/raw/{country}/cables_way.json", + lines_way="data/osm/raw/{country}/lines_way.json", + substations_way="data/osm/raw/{country}/substations_way.json", + substations_node="data/osm/raw/{country}/substations_node.json", + substations_relation="data/osm/raw/{country}/substations_relation.json", + log: + logs("retrieve_osm_data_{country}.log"), + script: + "../scripts/retrieve_osm_data.py" rule clean_osm_data: input: - cables_way=[f"data/osm/raw/{country}/cables_way_raw.json" for country in config["countries"]], - lines_way=[f"data/osm/raw/{country}/lines_way_raw.json" for country in config["countries"]], - substations_way=[f"data/osm/raw/{country}/substations_way_raw.json" for country in config["countries"]], - substations_node=[f"data/osm/raw/{country}/substations_node_raw.json" for country in config["countries"]], - substations_relation=[f"data/osm/raw/{country}/substations_relation_raw.json" for country in config["countries"]], + cables_way=[f"data/osm/raw/{country}/cables_way.json" for country in 
config["countries"]], + lines_way=[f"data/osm/raw/{country}/lines_way.json" for country in config["countries"]], + substations_way=[f"data/osm/raw/{country}/substations_way.json" for country in config["countries"]], + substations_node=[f"data/osm/raw/{country}/substations_node.json" for country in config["countries"]], + substations_relation=[f"data/osm/raw/{country}/substations_relation.json" for country in config["countries"]], output: - substations="data/osm/clean/substations.geojson", - lines="data/osm/clean/lines.geojson", + substations=resources("osm/clean/substations.geojson"), + lines=resources("osm/clean/lines.geojson"), log: logs("clean_osm_data.log"), script: "../scripts/clean_osm_data.py" -rule build_osm_network: - input: - substations="data/osm/clean/substations.geojson", - lines="data/osm/clean/lines.geojson", - country_shapes=resources("country_shapes.geojson"), - output: - lines="data/osm/lines.csv", - converters="data/osm/converters.csv", - transformers="data/osm/transformers.csv", - substations="data/osm/buses.csv", - lines_geojson="data/osm/lines.geojson", - converters_geojson="data/osm/converters.geojson", - transformers_geojson="data/osm/transformers.geojson", - substations_geojson="data/osm/buses.geojson", - log: - logs("build_osm_network.log"), - benchmark: - benchmarks("build_osm_network") - script: - "../scripts/build_osm_network.py" \ No newline at end of file +if config["electricity_network"]["build_osm_network"] == True: + rule build_osm_network: + input: + substations=resources("osm/clean/substations.geojson"), + lines=resources("osm/clean/lines.geojson"), + country_shapes=resources("country_shapes.geojson"), + output: + lines=resources("osm/lines.csv"), + converters=resources("osm/converters.csv"), + transformers=resources("osm/transformers.csv"), + substations=resources("osm/buses.csv"), + lines_geojson=resources("osm/lines.geojson"), + converters_geojson=resources("osm/converters.geojson"), + 
transformers_geojson=resources("osm/transformers.geojson"), + substations_geojson=resources("osm/buses.geojson"), + log: + logs("build_osm_network.log"), + benchmark: + benchmarks("build_osm_network") + script: + "../scripts/build_osm_network.py" + +if config["electricity_network"]["build_osm_network"] == False: + print("Use prebuilt.") \ No newline at end of file diff --git a/scripts/base_network_osm.py b/scripts/base_network_osm.py index 64e059b59..44b2636d6 100644 --- a/scripts/base_network_osm.py +++ b/scripts/base_network_osm.py @@ -836,33 +836,29 @@ def base_network_osm( n.import_components_from_dataframe(buses, "Bus") - if config["base_network_osm"]["hvdc_as_lines"]: - lines = pd.concat([lines_ac, lines_dc]) - n.import_components_from_dataframe(lines, "Line") - else: - lines_dc = _set_electrical_parameters_links(lines_dc, config, links_p_nom) - # parse line information into p_nom required for converters - lines_dc["p_nom"] = lines_dc.apply( - lambda x: x["v_nom"] * n.line_types.i_nom[x["type"]], - axis=1, - result_type="reduce", - ) - n.import_components_from_dataframe(lines_ac, "Line") - # The columns which names starts with "bus" are mixed up with the third-bus specification - # when executing additional_linkports() - # lines_dc.drop( - # labels=[ - # "bus0_lon", - # "bus0_lat", - # "bus1_lon", - # "bus1_lat", - # "bus_0_coors", - # "bus_1_coors", - # ], - # axis=1, - # inplace=True, - # ) - n.import_components_from_dataframe(lines_dc, "Link") + lines_dc = _set_electrical_parameters_links(lines_dc, config, links_p_nom) + # parse line information into p_nom required for converters + lines_dc["p_nom"] = lines_dc.apply( + lambda x: x["v_nom"] * n.line_types.i_nom[x["type"]], + axis=1, + result_type="reduce", + ) + n.import_components_from_dataframe(lines_ac, "Line") + # The columns which names starts with "bus" are mixed up with the third-bus specification + # when executing additional_linkports() + # lines_dc.drop( + # labels=[ + # "bus0_lon", + # 
"bus0_lat", + # "bus1_lon", + # "bus1_lat", + # "bus_0_coors", + # "bus_1_coors", + # ], + # axis=1, + # inplace=True, + # ) + n.import_components_from_dataframe(lines_dc, "Link") # n.import_components_from_dataframe(lines, "Line") n.import_components_from_dataframe(transformers, "Transformer") @@ -1084,22 +1080,6 @@ def append_bus_shapes(n, shapes, type): configure_logging(snakemake) set_scenario_config(snakemake) - #TODO pypsa-eur add this - # n = base_network( - # snakemake.input.eg_buses, - # snakemake.input.eg_converters, - # snakemake.input.eg_transformers, - # snakemake.input.eg_lines, - # snakemake.input.eg_links, - # snakemake.input.links_p_nom, - # snakemake.input.links_tyndp, - # snakemake.input.europe_shape, - # snakemake.input.country_shapes, - # snakemake.input.offshore_shapes, - # snakemake.input.parameter_corrections, - # snakemake.config, - # ) - n = base_network_osm( snakemake.input.eg_buses, snakemake.input.eg_converters, @@ -1112,6 +1092,8 @@ def append_bus_shapes(n, shapes, type): snakemake.config, ) + logger.info("Base network created using OSM.") + onshore_regions, offshore_regions, shapes = build_bus_shapes( n, snakemake.input.country_shapes, diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index c64a23706..5eccfad4e 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -993,16 +993,13 @@ def build_network( buses = add_buses_to_empty_countries(countries_config, inputs.country_shapes, buses) # METHOD to merge buses with same voltage and within tolerance Step 4/5 - if build_osm_network_config.get("group_close_buses", False): - tol = build_osm_network_config.get("group_tolerance_buses", 5000) - logger.info( - f"Stage 4/5: Aggregate close substations: enabled with tolerance {tol} m" - ) - lines, buses = merge_stations_lines_by_station_id_and_voltage( - lines, buses, geo_crs, distance_crs, tol=tol - ) - else: - logger.info("Stage 4/5: Aggregate close substations: disabled") + tol = 
build_osm_network_config.get("group_tolerance_buses", 5000) + logger.info( + f"Stage 4/5: Aggregate close substations: enabled with tolerance {tol} m" + ) + lines, buses = merge_stations_lines_by_station_id_and_voltage( + lines, buses, geo_crs, distance_crs, tol=tol + ) logger.info("Stage 5/5: Add augmented substation to country with no data") @@ -1133,8 +1130,8 @@ def are_almost_equal(point1, point2, tolerance=1e-6): configure_logging(snakemake) # load default crs - geo_crs = snakemake.config["crs"]["geo_crs"] - distance_crs = snakemake.config["crs"]["distance_crs"] + geo_crs = "EPSG:4326" + distance_crs = "EPSG:3035" build_osm_network = snakemake.config["build_osm_network"] countries = snakemake.config["countries"] diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index ae51cfe19..eb03a8e28 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -6,8 +6,6 @@ TODO To fill later """ -from branca.element import Figure -import folium import geopandas as gpd import json import logging @@ -15,7 +13,7 @@ import numpy as np import pandas as pd import re -from shapely.geometry import LineString, Point, Polygon +from shapely.geometry import LineString, Polygon from shapely.ops import linemerge from tqdm import tqdm @@ -37,7 +35,8 @@ def _create_polygon(row): Create a Shapely Polygon from a list of coordinate dictionaries. Parameters: - coords (list): List of dictionaries with 'lat' and 'lon' keys representing coordinates. + coords (list): List of dictionaries with 'lat' and 'lon' keys + representing coordinates. Returns: shapely.geometry.Polygon: The constructed polygon object. 
@@ -108,7 +107,8 @@ def _clean_voltage(column): def _clean_circuits(column): """ - Function to clean the raw circuits column: manual fixing and drop nan values + Function to clean the raw circuits column: manual fixing and drop nan + values Args: - column: pandas Series, the column to be cleaned @@ -123,7 +123,7 @@ def _clean_circuits(column): .str.replace("partial", "") .str.replace("1operator=RTE operator:wikidata=Q2178795", "") .str.lower() - .str.replace("1,5", "3") # (way 998005838, should be corrected in OSM soon) + .str.replace("1,5", "3") .str.replace("1/3", "1") .str.replace("", "") .str.replace("nan", "") @@ -221,7 +221,8 @@ def _check_voltage(voltage, list_voltages): def _clean_frequency(column): column = column.copy() """ - Function to clean the raw frequency column: manual fixing and drop nan values + Function to clean the raw frequency column: manual fixing and drop nan + values Args: - column: pandas Series, the column to be cleaned @@ -787,16 +788,6 @@ def add_line_endings_tosubstations(substations, lines): # Add the results to 'gdf_lines' gdf_lines['within_substation'] = results - # gdf_sub = gpd.GeoDataFrame(df_substations[["id", "polygon"]], geometry = "polygon", crs = "EPSG:4326") - # fig = Figure(width = "70%", height = 600) - - # m = gdf_sub.explore(name = "Subs", color = "red") - # m = gdf_lines.explore(m = m, name = "lines") - - # folium.LayerControl(collapsed = False).add_to(m) - - # fig.add_child(m) - # m gdf_lines = gdf_lines[~gdf_lines["within_substation"]] logger.info(f"Removed {sum(results)} lines within substations.") diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 91bf1782e..77cc398c5 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -2,13 +2,16 @@ # SPDX-FileCopyrightText: : 2020-2024 The PyPSA-Eur Authors # # SPDX-License-Identifier: MIT + """ -TODO To fill later +Retrieve OSM data for the specified country using the overpass API and save it +to the specified output 
files. Note that overpass requests are based on a fair +use policy. `retrieve_osm_data` is meant to be used in a way that respects this +policy by fetching the needed data once, only. """ import json import logging -# import overpass as op import os import requests import time @@ -17,6 +20,8 @@ logger = logging.getLogger(__name__) +# Function currently not needed - Kept for backup purposes to retrieve the OSM +# area code if needed in the future def _get_overpass_areas(countries): """ Retrieve the OSM area codes for the specified country codes. @@ -24,12 +29,14 @@ def _get_overpass_areas(countries): Parameters ---------- countries : str or list - A single country code or a list of country codes for which the OSM area codes should be retrieved. + A single country code or a list of country codes for which the OSM area + codes should be retrieved. Returns ------- dict - A dictionary mapping country codes to their corresponding OSM area codes. + A dictionary mapping country codes to their corresponding OSM area + codes. 
""" # If a single country code is provided, convert it to a list @@ -51,22 +58,28 @@ def _get_overpass_areas(countries): # Send the request to Overpass API response = requests.post(overpass_url, data=overpass_query) - # Parse the response - data = response.json() - - # Check if the response contains any results - if "elements" in data and len(data["elements"]) > 0: - # Extract the area ID from the relation - if c == "FR": # take second one for France - osm_area_id = data["elements"][1]["id"] + try: + # Parse the response + data = response.json() + + # Check if the response contains any results + if "elements" in data and len(data["elements"]) > 0: + # Extract the area ID from the relation + if c == "FR": # take second one for France + osm_area_id = data["elements"][1]["id"] + else: + osm_area_id = data["elements"][0]["id"] + osm_areas.append(f"area({osm_area_id})") else: - osm_area_id = data["elements"][0]["id"] - osm_areas.append(f"area({osm_area_id})") - else: - # Print a warning if no results are found for the country code - logger.info(f"No area code found for the specified country code: {c}. Ommitted from the list.") + # Print a warning if no results are found for the country code + logger.info(f"No area code found for the specified country " + f"code: {c}. Omitted from the list.") + except json.JSONDecodeError as e: + logger.error(f"JSON decode error for country {c}: {e}") + logger.debug(f"Response text: {response.text}") - # Create a dictionary mapping country codes to their corresponding OSM area codes + # Create a dictionary mapping country codes to their corresponding OSM area + # codes op_areas_dict = dict(zip(countries, osm_areas)) return op_areas_dict @@ -83,14 +96,16 @@ def retrieve_osm_data( "substations_relation", ]): """ - Retrieve OSM data for the specified country and save it to the specified output files. + Retrieve OSM data for the specified country and save it to the specified + output files. 
Parameters ---------- country : str The country code for which the OSM data should be retrieved. output : dict - A dictionary mapping feature names to the corresponding output file paths. Saving the OSM data to .json files. + A dictionary mapping feature names to the corresponding output file + paths. Saving the OSM data to .json files. features : list, optional A list of OSM features to retrieve. The default is [ "cables_way", @@ -100,13 +115,13 @@ def retrieve_osm_data( "substations_relation", ]. """ - - - op_area = _get_overpass_areas(country) - # Overpass API endpoint URL overpass_url = "https://overpass-api.de/api/interpreter" + # More features can in theory be retrieved that are currently not needed + # to build a functioning network. The following power-related + # features are supported: + # features_dict= { # 'cables_way': 'way["power"="cable"]', # 'lines_way': 'way["power"="line"]', @@ -125,41 +140,68 @@ def retrieve_osm_data( 'substations_relation': 'relation["power"="substation"]', } + wait_time = 5 + for f in features: if f not in features_dict: - raise ValueError(f"Invalid feature: {f}. Supported features: {list(features_dict.keys())}") logger.info(f"Invalid feature: {f}. Supported features: {list(features_dict.keys())}") + raise ValueError(f"Invalid feature: {f}. Supported features: {list(features_dict.keys())}") - logger.info(f" - Fetching OSM data for feature '{f}' in {country}...") - # Build the overpass query - op_query = f''' - [out:json]; - {op_area[country]}->.searchArea; - ( - {features_dict[f]}(area.searchArea); - ); - out body geom; - ''' - - # Send the request - response = requests.post(overpass_url, data = op_query) - # response = op.API(timeout=300).get(op_query) # returns data in geojson format. Timeout (max.) 
set to 300s - - filepath = output[f] - parentfolder = os.path.dirname(filepath) - if not os.path.exists(parentfolder): - # Create the folder and its parent directories if they don't exist - os.makedirs(parentfolder) - - with open(filepath, mode = "w") as f: - # geojson.dump(response,f,indent=2) - json.dump(response.json(),f,indent=2) - logger.info(" - Done.") - # time.sleep(5) + retries = 3 + for attempt in range(retries): + logger.info(f" - Fetching OSM data for feature '{f}' in {country} (Attempt {attempt+1})...") + + # Build the overpass query + op_area = f'area["ISO3166-1"="{country}"]' + op_query = f''' + [out:json]; + {op_area}->.searchArea; + ( + {features_dict[f]}(area.searchArea); + ); + out body geom; + ''' + try: + # Send the request + response = requests.post(overpass_url, data = op_query) + response.raise_for_status() # Raise HTTPError for bad responses + data = response.json() + + filepath = output[f] + parentfolder = os.path.dirname(filepath) + if not os.path.exists(parentfolder): + os.makedirs(parentfolder) + + with open(filepath, mode = "w") as f: + json.dump(response.json(),f,indent=2) + logger.info(" - Done.") + break # Exit the retry loop on success + except (json.JSONDecodeError, requests.exceptions.RequestException) as e: + logger.error(f"Error for feature '{f}' in country {country}: {e}") + logger.debug(f"Response text: {response.text if response else 'No response'}") + if attempt < retries - 1: + wait_time += 10 + logger.info(f"Waiting {wait_time} seconds before retrying...") + time.sleep(wait_time) + else: + logger.error( + f"Failed to retrieve data for feature '{f}' in country {country} after {retries} attempts." + ) + except Exception as e: + # For now, catch any other exceptions and log them. Treat this + # the same as a RequestException and try to run again two times. 
+ logger.error(f"Unexpected error for feature '{f}' in country {country}: {e}") + if attempt < retries - 1: + wait_time += 10 + logger.info(f"Waiting {wait_time} seconds before retrying...") + time.sleep(wait_time) + else: + logger.error( + f"Failed to retrieve data for feature '{f}' in country {country} after {retries} attempts." + ) if __name__ == "__main__": - # Detect running outside of snakemake and mock snakemake for testing if "snakemake" not in globals(): from _helpers import mock_snakemake @@ -171,8 +213,4 @@ def retrieve_osm_data( country = snakemake.wildcards.country output = snakemake.output - # Wait 5 seconds before fetching the OSM data to prevent too many requests error - # TODO pypsa-eur: Add try catch to implement this only when needed - logger.info(f"Waiting 5 seconds... Retrieving OSM data for {country}:") - time.sleep(5) retrieve_osm_data(country, output) \ No newline at end of file From 7bb153bbd8e08d32f23f330be4ddaf7f64d220eb Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 15 May 2024 22:26:56 +0200 Subject: [PATCH 013/100] Updated cleaning process. 
--- rules/build_electricity.smk | 1 + scripts/clean_osm_data.py | 54 +++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index b4b53b621..a5f767b09 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -631,6 +631,7 @@ rule clean_osm_data: substations_relation=[f"data/osm/raw/{country}/substations_relation.json" for country in config["countries"]], output: substations=resources("osm/clean/substations.geojson"), + substations_polygon=resources("osm/clean/substations_polygon.geojson"), lines=resources("osm/clean/lines.geojson"), log: logs("clean_osm_data.log"), diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index eb03a8e28..71bad62cb 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -775,21 +775,53 @@ def add_line_endings_tosubstations(substations, lines): df_substations["tag_area"] = None df_substations["tag_source"] = df_substations["id"] - # Create an empty list to store the results - results = [] + gdf_substations_polygon = gpd.GeoDataFrame( + df_substations[["id", "polygon"]], + geometry = "polygon", + crs = "EPSG:4326" + ) + + filepath_substations_polygon = snakemake.output["substations_polygon"] + # save substations output + logger.info(f"Exporting clean substations with polygon shapes to {filepath_substations_polygon}") + parentfolder_substations_polygon = os.path.dirname(filepath_substations_polygon) + if not os.path.exists(parentfolder_substations_polygon): + # Create the folder and its parent directories if they don't exist + os.makedirs(parentfolder_substations_polygon) + + logger.info(f"Exporting clean substations to {filepath_substations_polygon}") + gdf_substations_polygon.to_file(filepath_substations_polygon, driver="GeoJSON") + logger.info("Identifying and removing lines within substation polygons...") - for index, row in tqdm(gdf_lines.iterrows(), total=len(gdf_lines)): - line 
= row['geometry'] - # Check if the LineString is within any Polygon in 'substations_df' - is_within_any_substation = any(line.within(substation_polygon) for substation_polygon in df_substations["polygon"]) - results.append(is_within_any_substation) + lines_within_substations = gpd.sjoin( + gdf_lines[["line_id", "geometry"]], + gdf_substations_polygon, + how = "inner", + predicate = "within" + )["line_id"] + + logger.info(f"Removed {len(lines_within_substations)}/{len(gdf_lines)} lines within substations.") + gdf_lines = gdf_lines[~gdf_lines["line_id"].isin(lines_within_substations)] + + # # Create an empty list to store the results + # results = [] + + # subset a to find only country equal to "BE" + # a[a["country"] == "BE"] + + # logger.info("Identifying and removing lines within substation polygons...") + # for index, row in tqdm(gdf_lines.iterrows(), total=len(gdf_lines)): + # line = row['geometry'] + # # Check if the LineString is within any Polygon in 'substations_df' + # is_within_any_substation = any(line.within(substation_polygon) for substation_polygon in df_substations["polygon"]) + # results.append(is_within_any_substation) - # Add the results to 'gdf_lines' - gdf_lines['within_substation'] = results + # # Add the results to 'gdf_lines' + # gdf_lines['within_substation'] = results - gdf_lines = gdf_lines[~gdf_lines["within_substation"]] - logger.info(f"Removed {sum(results)} lines within substations.") + # gdf_lines = gdf_lines[~gdf_lines["within_substation"]] + # logger.info(f"Removed {sum(results)} lines within substations.") filepath_lines = snakemake.output["lines"] # save substations output From 5ef4d71c9395178e26ad79558ec2f3e321ff421c Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 15 May 2024 23:02:02 +0200 Subject: [PATCH 014/100] Set maximum number of threads for retrieving to 4, wrt. fair usage policy and potential request errors. 
--- rules/build_electricity.smk | 1 + 1 file changed, 1 insertion(+) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index a5f767b09..dce396119 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -618,6 +618,7 @@ rule retrieve_osm_data: substations_relation="data/osm/raw/{country}/substations_relation.json", log: logs("retrieve_osm_data_{country}.log"), + threads: 4 script: "../scripts/retrieve_osm_data.py" From f961ab8ed5932dd8063c7debfc287518dc29a26f Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Thu, 16 May 2024 14:21:32 +0200 Subject: [PATCH 015/100] Intermediate update on clean_osm_data.py. Added docstrings. --- rules/build_electricity.smk | 3 +- scripts/clean_osm_data.py | 345 +++++++++++++++++++++++++++-------- scripts/retrieve_osm_data.py | 2 +- 3 files changed, 272 insertions(+), 78 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index dce396119..249c9d843 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -618,7 +618,8 @@ rule retrieve_osm_data: substations_relation="data/osm/raw/{country}/substations_relation.json", log: logs("retrieve_osm_data_{country}.log"), - threads: 4 + resources: + cores = 2, threads: 1 script: "../scripts/retrieve_osm_data.py" diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 71bad62cb..f3687995f 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -3,7 +3,26 @@ # # SPDX-License-Identifier: MIT """ -TODO To fill later +This script is used to clean OpenStreetMap (OSM) data for the PyPSA-Eur +project. + +The script performs various cleaning operations on the OSM data, including: +- Cleaning voltage, circuits, cables, wires, and frequency columns +- Splitting semicolon-separated cells into new rows +- Distributing values to circuits based on the number of splits +- Adding line endings to substations based on line data + +The cleaned data is then written to an output file. 
+ +Usage: + python clean_osm_data.py + +Arguments: + output_file (str): The path to the output file where the cleaned data will + be written. + +Example: + python clean_osm_data.py cleaned_data.csv """ import geopandas as gpd @@ -15,17 +34,22 @@ import re from shapely.geometry import LineString, Polygon from shapely.ops import linemerge -from tqdm import tqdm from _helpers import configure_logging logger = logging.getLogger(__name__) -def clean_osm_data(output): - with open(output, "w") as file: - file.write("Hello, world!\n") - def _create_linestring(row): + """ + Create a LineString object from the given row. + + Args: + row (dict): A dictionary containing the row data. + + Returns: + LineString: A LineString object representing the geometry. + + """ coords = [(coord['lon'], coord['lat']) for coord in row["geometry"]] return LineString(coords) @@ -64,6 +88,7 @@ def _clean_voltage(column): Returns: - column: pandas Series, the cleaned column """ + logger.info("Cleaning voltages.") column = column.copy() column = ( @@ -116,6 +141,7 @@ def _clean_circuits(column): Returns: - column: pandas Series, the cleaned column """ + logger.info("Cleaning circuits.") column = column.copy() column = ( column @@ -146,6 +172,7 @@ def _clean_cables(column): Returns: - column: pandas Series, the cleaned column """ + logger.info("Cleaning cables.") column = column.copy() column = ( column @@ -174,6 +201,7 @@ def _clean_wires(column): Returns: - column: pandas Series, the cleaned column """ + logger.info("Cleaning wires.") column = column.copy() column = ( column @@ -202,15 +230,18 @@ def _clean_wires(column): return column.astype(str) -def _set_frequency(column): - column = column.copy() - to_fifty = column.astype(str) != "0" - column[to_fifty] = "50" - - return column +def _check_voltage(voltage, list_voltages): + """ + Check if the given voltage is present in the list of allowed voltages. + Parameters: + voltage (str): The voltage to check. 
+ list_voltages (list): A list of allowed voltages. -def _check_voltage(voltage, list_voltages): + Returns: + bool: True if the voltage is present in the list of allowed voltages, + False otherwise. + """ voltages = voltage.split(';') for v in voltages: if v in list_voltages: @@ -219,7 +250,6 @@ def _check_voltage(voltage, list_voltages): def _clean_frequency(column): - column = column.copy() """ Function to clean the raw frequency column: manual fixing and drop nan values @@ -230,6 +260,7 @@ def _clean_frequency(column): Returns: - column: pandas Series, the cleaned column """ + logger.info("Cleaning frequencies.") column = column.copy() column = ( column @@ -277,7 +308,8 @@ def _split_cells(df, cols=["voltage"]): # Create a dictionary to store the suffix count for each original ID suffix_counts = {} - # Create a dictionary to store the number of splits associated with each original ID + # Create a dictionary to store the number of splits associated with each + # original ID num_splits = {} # Split cells and create new rows @@ -290,7 +322,8 @@ def _split_cells(df, cols=["voltage"]): # Update the 'split_elements' column x["split_elements"] = x["id"].map(num_splits) - # Function to generate the new ID with suffix and update the number of splits + # Function to generate the new ID with suffix and update the number of + # splits def generate_new_id(row): original_id = row["id"] if row["split_elements"] == 1: @@ -306,6 +339,19 @@ def generate_new_id(row): def _distribute_to_circuits(row): + """ + Distributes the number of circuits or cables to individual circuits based + on the given row data. + + Parameters: + - row: A dictionary representing a row of data containing information about + circuits and cables. + + Returns: + - single_circuit: The number of circuits to be assigned to each individual + circuit. 
+ + """ if row["circuits"] != "": circuits = int(row["circuits"]) else: @@ -318,13 +364,24 @@ def _distribute_to_circuits(row): return single_circuit -# Function to check if any substring is in valid_strings -def _any_substring_in_list(s, list_strings): - substrings = s.split(';') - return any(sub in list_strings for sub in substrings) +def add_line_endings_tosubstations(substations, lines): + """ + Add line endings to substations. + This function takes two pandas DataFrames, `substations` and `lines`, and + adds line endings to the substations based on the information from the + lines DataFrame. -def add_line_endings_tosubstations(substations, lines): + Parameters: + - substations (pandas DataFrame): DataFrame containing information about + substations. + - lines (pandas DataFrame): DataFrame containing information about lines. + + Returns: + - buses (pandas DataFrame): DataFrame containing the updated information + about substations with line endings. + + """ if lines.empty: return substations @@ -379,27 +436,19 @@ def add_line_endings_tosubstations(substations, lines): return buses -if __name__ == "__main__": - # Detect running outside of snakemake and mock snakemake for testing - if "snakemake" not in globals(): - from _helpers import mock_snakemake - - snakemake = mock_snakemake("clean_osm_data") - - configure_logging(snakemake) +def _import_lines_and_cables(input_path_lines_cables): + """ + Import lines and cables from the given input paths. - ############# LINES AND CABLES ###################### + Parameters: + - input_path_lines_cables (dict): A dictionary containing the input paths for lines and cables data. - input_path_lines_cables = { - "lines": snakemake.input.lines_way, - "cables": snakemake.input.cables_way, - } + Returns: + - df_lines (DataFrame): A DataFrame containing the imported lines and cables data. 
+ """ columns = ["id", "bounds", "nodes", "geometry", "country", "power", "cables", "circuits", "frequency", "voltage", "wires"] df_lines = pd.DataFrame(columns=columns) - crs = "EPSG:4326" - - # using tqdm loop over input path logger.info("Importing lines and cables") for key in input_path_lines_cables: @@ -436,11 +485,32 @@ def add_line_endings_tosubstations(substations, lines): logger.info(f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(input_path_lines_cables[key])).zfill(2)} (empty): {ip}") continue logger.info("---") + + return df_lines + + +def _drop_duplicate_lines(df_lines): + """ + Drop duplicate lines from the given dataframe. Duplicates are usually lines + cross-border lines or slightly outside the country border of focus. + + Parameters: + - df_lines (pandas.DataFrame): The dataframe containing lines data. - logger.info("Cleaning lines and cables") - # Find duplicates based on id column + Returns: + - df_lines (pandas.DataFrame): The dataframe with duplicate lines removed + and cleaned data. + + This function drops duplicate lines from the given dataframe based on the + 'id' column. It groups the duplicate rows by 'id' and aggregates the + 'country' column to a string split by semicolon, as they appear in multiple + country datasets. One example of the duplicates is kept, accordingly. + Finally, the updated dataframe without multiple duplicates is returned. 
+ """ + logger.info("Dropping duplicate lines.") duplicate_rows = df_lines[df_lines.duplicated(subset=['id'], keep=False)].copy() - # group rows by id and aggregate the country column to a string split by semicolon + + # Group rows by id and aggregate the country column to a string split by semicolon grouped_duplicates = duplicate_rows.groupby('id')["country"].agg(lambda x: ';'.join(x)).reset_index() duplicate_rows.drop_duplicates(subset="id", inplace=True) duplicate_rows.drop(columns=["country"], inplace=True) @@ -450,22 +520,78 @@ def add_line_endings_tosubstations(substations, lines): df_lines = df_lines[~df_lines["id"].isin(duplicate_rows["id"])] df_lines = pd.concat([df_lines, duplicate_rows], axis="rows") - # Initiate boolean with False, only set to true if all cleaning steps are passed - df_lines["cleaned"] = False - df_lines["voltage"] = _clean_voltage(df_lines["voltage"]) + return df_lines + + +def _filter_lines_by_voltage(df_lines, voltage_min=200000): + """ + Filter lines in the DataFrame `df_lines` based on the voltage in V. + Parameters: + - df_lines (pandas.DataFrame): The DataFrame containing the lines data. + - voltage_min (int, optional): The minimum voltage value to filter the + lines. Defaults to 200000 [unit: V]. + + Returns: + - filtered df_lines (pandas.DataFrame): The filtered DataFrame containing + the lines data above voltage_min. + - list_voltages (list): A list of unique voltage values above voltage_min. + The type of the list elements is string. + """ + logger.info(f"Filtering lines by voltage. 
Only keeping lines above and including {voltage_min} V.") list_voltages = df_lines["voltage"].str.split(";").explode().unique().astype(str) - list_voltages = list_voltages[np.vectorize(len)(list_voltages) >= 6] - list_voltages = list_voltages[~np.char.startswith(list_voltages, '1')] + # Keep numeric strings + list_voltages = list_voltages[np.vectorize(str.isnumeric)(list_voltages)] + list_voltages = list_voltages.astype(int) + list_voltages = list_voltages[list_voltages >= int(voltage_min)] + list_voltages = list_voltages.astype(str) bool_voltages = df_lines["voltage"].apply(_check_voltage, list_voltages=list_voltages) df_lines = df_lines[bool_voltages] - # Additional cleaning - df_lines["circuits"] = _clean_circuits(df_lines["circuits"]) - df_lines["cables"] = _clean_cables(df_lines["cables"]) - df_lines["frequency"] = _clean_frequency(df_lines["frequency"]) - df_lines["wires"] = _clean_wires(df_lines["wires"]) + return df_lines, list_voltages + + +def _clean_lines(df_lines): + """ + Cleans and processes the `df_lines` DataFrame heuristically based on the + information available per respective line and cable. + Further checks to ensure data consistency and completeness. + + Parameters + ---------- + df_lines : pandas.DataFrame + The input DataFrame containing line information with columns such as + 'voltage', 'circuits', 'frequency', 'cables', 'split_elements', 'id', + etc. + + Returns + ------- + df_lines : pandas.DataFrame + The cleaned DataFrame with updated columns 'circuits', 'frequency', and + 'cleaned' to reflect the applied transformations. + + Description + ----------- + This function performs the following operations: + + - Initializes a 'cleaned' column with False, step-wise updates to True + following the respective cleaning step. + - Splits the voltage cells in the DataFrame at semicolons using a helper + function `_split_cells`. + - Filters the DataFrame to only include rows with valid voltages. 
+ - Sets circuits of remaining lines without any applicable heuristic equal + to 1. + + The function ensures that the resulting DataFrame has consistent and + complete information for further processing or analysis while maintaining + the data of the original OSM data set wherever possible. + """ + logger.info("Cleaning lines and determining circuits.") + # Initiate boolean with False, only set to true if all cleaning steps are + # passed + df_lines = df_lines.copy() + df_lines["cleaned"] = False df_lines["voltage_original"] = df_lines["voltage"] df_lines["circuits_original"] = df_lines["circuits"] @@ -476,10 +602,10 @@ def add_line_endings_tosubstations(substations, lines): bool_ac = df_lines["frequency"] != "0" bool_dc = ~bool_ac - bool_noinfo = (df_lines["cables"] == "") & (df_lines["circuits"] == "") valid_frequency = ["50", "0"] bool_invalid_frequency = df_lines["frequency"].apply(lambda x: x not in valid_frequency) + bool_noinfo = (df_lines["cables"] == "") & (df_lines["circuits"] == "") # Fill in all values where cables info and circuits does not exist. 
Assuming 1 circuit df_lines.loc[bool_noinfo, "circuits"] = "1" df_lines.loc[bool_noinfo & bool_invalid_frequency, "frequency"] = "50" @@ -582,7 +708,12 @@ def add_line_endings_tosubstations(substations, lines): df_lines.loc[bool_leftover & bool_dc, "frequency"] = "0" df_lines.loc[bool_leftover, "cleaned"] = True - # rename columns + return df_lines + + +def _finalise_lines(df_lines): + logger.info("Finalising lines column types.") + # Rename columns df_lines.rename( columns={ "id": "line_id", @@ -590,17 +721,19 @@ def add_line_endings_tosubstations(substations, lines): "frequency":"tag_frequency", }, inplace=True) - df_lines["bus0"] = None - df_lines["bus1"] = None - df_lines["length"] = None - df_lines["underground"] = False + # Initiate new columns for subsequent build_osm_network step + df_lines.loc[:, "bus0"] = None + df_lines.loc[:, "bus1"] = None + df_lines.loc[:, "length"] = None + df_lines.loc[:, "underground"] = False df_lines.loc[df_lines["tag_type"] == "line", "underground"] = False df_lines.loc[df_lines["tag_type"] == "cable", "underground"] = True - df_lines["under_construction"] = False - df_lines["dc"] = False + df_lines.loc[:, "under_construction"] = False + df_lines.loc[:, "dc"] = False df_lines.loc[df_lines["tag_frequency"] == "50", "dc"] = False df_lines.loc[df_lines["tag_frequency"] == "0", "dc"] = True + # Only include needed columns df_lines = df_lines[[ "line_id", "circuits", @@ -617,31 +750,37 @@ def add_line_endings_tosubstations(substations, lines): "geometry", ]] - df_lines["geometry"] = df_lines.apply(_create_linestring, axis=1) + # Set lines data types + df_lines.loc[:, "circuits"] = df_lines["circuits"].astype(int) + df_lines.loc[:, "voltage"] = df_lines["voltage"].astype(int) + df_lines.loc[:, "tag_frequency"] = df_lines["tag_frequency"].astype(int) + + # Create shapely linestrings from geometries + df_lines.loc[:, "geometry"] = df_lines.apply(_create_linestring, axis=1) + # Drop all rows where the geometry has equal start and end 
point + # These are usually not lines, but outlines of areas. bool_circle = df_lines["geometry"].apply(lambda x: x.coords[0] == x.coords[-1]) - df_lines = df_lines[~bool_circle] + df_lines = df_lines[~bool_circle] - # TODO pypsa-eur: Temporary solution as one AC line between converters will create an error in simplify_network - # As this case is not considered there: - lines_to_drop = ["775580659"] - if lines_to_drop in df_lines["line_id"].values: - df_lines.drop(df_lines[df_lines["line_id"].isin(lines_to_drop)].index, inplace=True) - - gdf_lines = gpd.GeoDataFrame(df_lines, geometry = "geometry", crs = "EPSG:4326") + return df_lines - # Lines data types - gdf_lines["circuits"] = gdf_lines["circuits"].astype(int) - gdf_lines["voltage"] = gdf_lines["voltage"].astype(int) - gdf_lines["tag_frequency"] = gdf_lines["tag_frequency"].astype(int) +def _import_substations(input_path_substations): + """ + Import substations from the given input paths. This function imports both + substations from OSM ways as well as relations that contain nested + information on the substations shape and electrical parameters. Ways and + relations are subsequently concatenated to form a single DataFrame + containing unique bus ids. - ############# BUSES / SUBSTATIONS ###################### - input_path_substations = { - "substations_way": snakemake.input.substations_way, - "substations_relation": snakemake.input.substations_relation, - } + Args: + input_path_substations (dict): A dictionary containing input paths for + substations. + Returns: + pd.DataFrame: A DataFrame containing the imported substations data. 
+ """ cols_substations_way = ["id", "geometry", "country", "power", "substation", "voltage", "frequency"] cols_substations_relation = ["id", "country", "power", "substation", "voltage", "frequency"] df_substations_way = pd.DataFrame(columns = cols_substations_way) @@ -729,6 +868,60 @@ def add_line_endings_tosubstations(substations, lines): df_substations_relation = df_substations_relation[cols_substations_way] df_substations = pd.concat([df_substations_way, df_substations_relation], axis="rows") + return df_substations + +if __name__ == "__main__": + if "snakemake" not in globals(): + from _helpers import mock_snakemake + + snakemake = mock_snakemake("clean_osm_data") + + configure_logging(snakemake) + + # Parameters + crs = "EPSG:4326" # Correct crs for OSM data + voltage_min = 200000 # [unit: V] Minimum voltage value to filter lines. + + # TODO pypsa-eur: Temporary solution as one AC line between converters will + # create an error in simplify_network: + lines_to_drop = ["775580659"] + + # Input + input_path_substations = { + "substations_way": snakemake.input.substations_way, + "substations_relation": snakemake.input.substations_relation, + } + + input_path_lines_cables = { + "lines": snakemake.input.lines_way, + "cables": snakemake.input.cables_way, + } + + # Cleaning process + df_lines = _import_lines_and_cables(input_path_lines_cables) + df_lines = _drop_duplicate_lines(df_lines) + df_lines.loc[:, "voltage"] = _clean_voltage(df_lines["voltage"]) + df_lines, list_voltages = _filter_lines_by_voltage(df_lines, voltage_min=voltage_min) + + df_lines.loc[:, "circuits"] = _clean_circuits(df_lines["circuits"]) + df_lines.loc[:, "cables"] = _clean_cables(df_lines["cables"]) + df_lines.loc[:, "frequency"] = _clean_frequency(df_lines["frequency"]) + df_lines.loc[:, "wires"] = _clean_wires(df_lines["wires"]) + + df_lines = _clean_lines(df_lines) + df_lines = _finalise_lines(df_lines) + + # Dropping specific lines, manually + if lines_to_drop in 
df_lines["line_id"].values: + df_lines.drop(df_lines[df_lines["line_id"].isin(lines_to_drop)].index, inplace=True) + + # Create GeoDataFrame + gdf_lines = gpd.GeoDataFrame(df_lines, geometry = "geometry", crs = crs) + + ############# BUSES / SUBSTATIONS ###################### + df_substations = _import_substations(input_path_substations) + + # Create centroids from geometries df_substations.loc[:, "polygon"] = df_substations["geometry"] df_substations.loc[:, "geometry"] = df_substations["geometry"].apply(lambda x: x.centroid) diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 77cc398c5..bab645a48 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -180,7 +180,7 @@ def retrieve_osm_data( logger.error(f"Error for feature '{f}' in country {country}: {e}") logger.debug(f"Response text: {response.text if response else 'No response'}") if attempt < retries - 1: - wait_time += 10 + wait_time += 15 logger.info(f"Waiting {wait_time} seconds before retrying...") time.sleep(wait_time) else: From 3b9076571bdc2786467ba2ac2071ed7125562959 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Thu, 16 May 2024 14:22:41 +0200 Subject: [PATCH 016/100] Bug fix. --- rules/build_electricity.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 249c9d843..2c5364066 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -619,7 +619,7 @@ rule retrieve_osm_data: log: logs("retrieve_osm_data_{country}.log"), resources: - cores = 2, threads: 1 + cores = 2, threads= 1 script: "../scripts/retrieve_osm_data.py" From 93f09a508dd0940ed6f9192cfd61531e15445ee9 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Thu, 16 May 2024 14:23:09 +0200 Subject: [PATCH 017/100] Bug fix. 
--- rules/build_electricity.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 2c5364066..5c2346b1c 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -619,7 +619,7 @@ rule retrieve_osm_data: log: logs("retrieve_osm_data_{country}.log"), resources: - cores = 2, threads= 1 + cores = 2, threads= 1, script: "../scripts/retrieve_osm_data.py" From 98f50acf17a23f93da89544e9f2fccb870c26a55 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Thu, 16 May 2024 19:08:57 +0200 Subject: [PATCH 018/100] Bug fixes in data types out of clean_osm_data --- scripts/build_osm_network.py | 6 +++--- scripts/clean_osm_data.py | 36 +++++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 5eccfad4e..3cecf50c5 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -596,7 +596,7 @@ def merge_stations_lines_by_station_id_and_voltage( """ logger.info( - "Stage 3a/4: Set substation ids with tolerance of %.2f km" % (tol / 1000) + "Stage 4a/5: Set substation ids with tolerance of %.2f km" % (tol / 1000) ) # TODO pypsa-eur: Add this fix to pypsa-earth: Buses should not be clustered geographically if they are different @@ -656,7 +656,7 @@ def merge_stations_lines_by_station_id_and_voltage( lambda p: any([p.within(l) for l in all_dc_boundary_points]) ) - logger.info("Stage 3b/4: Merge substations with the same id") + logger.info("Stage 4b/5: Merge substations with the same id") # merge buses with same station id and voltage if not buses.empty: @@ -666,7 +666,7 @@ def merge_stations_lines_by_station_id_and_voltage( buses = pd.concat([buses_ac, buses_dc], ignore_index=True) set_substations_ids(buses, distance_crs, tol=tol) - logger.info("Stage 3c/4: Specify the bus ids of the line endings") + logger.info("Stage 4c/5: Specify the bus ids of the line endings") # set 
the bus ids to the line dataset lines, buses = set_lines_ids(lines, buses, distance_crs) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index f3687995f..915f3c770 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -712,7 +712,19 @@ def _clean_lines(df_lines): def _finalise_lines(df_lines): + """ + Finalises the lines column types and creates geometries. + + Args: + df_lines (pandas.DataFrame): The input DataFrame containing lines data. + + Returns: + df_lines (pandas.DataFrame(): The DataFrame with finalised column types + and transformed data. + + """ logger.info("Finalising lines column types.") + df_lines = df_lines.copy() # Rename columns df_lines.rename( columns={ @@ -750,18 +762,28 @@ def _finalise_lines(df_lines): "geometry", ]] - # Set lines data types - df_lines.loc[:, "circuits"] = df_lines["circuits"].astype(int) - df_lines.loc[:, "voltage"] = df_lines["voltage"].astype(int) - df_lines.loc[:, "tag_frequency"] = df_lines["tag_frequency"].astype(int) - + # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) + # This workaround is needed as otherwise the column dtypes remain "objects" + df_lines.loc[:, "circuits_num"] = df_lines["circuits"].astype(int) + df_lines.loc[:, "voltage_num"] = df_lines["voltage"].astype(int) + df_lines.loc[:, "tag_frequency_num"] = df_lines["tag_frequency"].astype(int) + df_lines.drop(columns=["circuits", "voltage", "tag_frequency"], inplace=True) + + col_rename_dict = { + "circuits_num": "circuits", + "voltage_num": "voltage", + "tag_frequency_num": "tag_frequency" + } + + df_lines.rename(columns=col_rename_dict, inplace=True) + # Create shapely linestrings from geometries df_lines.loc[:, "geometry"] = df_lines.apply(_create_linestring, axis=1) # Drop all rows where the geometry has equal start and end point # These are usually not lines, but outlines of areas. 
bool_circle = df_lines["geometry"].apply(lambda x: x.coords[0] == x.coords[-1]) - df_lines = df_lines[~bool_circle] + df_lines = df_lines[~bool_circle] return df_lines @@ -1059,7 +1081,7 @@ def _import_substations(input_path_substations): ) #group gdf_substations by voltage and and geometry (dropping duplicates) - df_substations = df_substations.groupby(["voltage", "lon", "lat", "tag_source"]).first().reset_index() + df_substations = df_substations.groupby(["voltage", "lon", "lat", "dc", "tag_source"]).first().reset_index() df_substations["bus_id"] = df_substations.index gdf_substations = gpd.GeoDataFrame(df_substations, geometry = "geometry", crs = "EPSG:4326") From 4f2308d4b888e3efff225e3aebf50c4f5a38e747 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Fri, 17 May 2024 13:47:47 +0200 Subject: [PATCH 019/100] Significant improvements to retrieve_osm_data, clean_osm_data. Cleaned code. Speed improvements --- config/config.default.yaml | 146 +++++---- rules/build_electricity.smk | 8 +- scripts/build_osm_network.py | 6 +- scripts/clean_osm_data.py | 605 ++++++++++++++++++++--------------- scripts/retrieve_osm_data.py | 3 - 5 files changed, 439 insertions(+), 329 deletions(-) diff --git a/config/config.default.yaml b/config/config.default.yaml index 092c9c9ce..62c3cfdfa 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -15,13 +15,13 @@ private: entsoe_api: remote: - ssh: "" - path: "" + ssh: "z1" + path: "~/scratch/projects/pypsa-eur" # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#run run: prefix: "" - name: "" + name: "europe-osm-update-hydro" scenarios: enable: false file: config/scenarios.yaml @@ -40,13 +40,15 @@ scenario: simpl: - '' ll: - - vopt + - v1.0 # TODO mit und ohne Netzausbau v1.0 clusters: - - 37 + - 50 - 128 - 256 + - 512 + # - 1024 opts: - - '' + - 'Co2L0-25H' sector_opts: - '' planning_horizons: @@ -56,7 +58,20 @@ scenario: - 2050 # docs in 
https://pypsa-eur.readthedocs.io/en/latest/configuration.html#countries
+# countries: ["NO"]
 countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK']
+# countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MD', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK', 'UA']
+
+# Settings related to the high-voltage electricity grid
+electricity_network:
+  base_network: "osm" # "osm" or "gridkit"
+  build_osm_network: true # If 'true', the network will be built from scratch (retrieving OSM data, cleaning, and building) and stored under resources, 'false' will use snapshots in data/osm
+
+build_osm_network: # Options of the build_osm_network script; osm = OpenStreetMap
+  group_tolerance_buses: 5000 # [m] (default 5000) Tolerance in meters of the close buses to merge
+  split_overpassing_lines: false # When True, lines overpassing buses are split and connected to the buses
+  overpassing_lines_tolerance: 1 # [m] (default 1) Tolerance to identify lines overpassing buses
+  force_ac: false # When true, it forces all components (lines and substation) to be AC-only. To be used if DC assets create problems.
# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#snapshots snapshots: @@ -64,18 +79,18 @@ snapshots: end: "2014-01-01" inclusive: 'left' -osm: - retrieve: true - use-prebuilt: false - # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#enable enable: - retrieve: auto + retrieve: true prepare_links_p_nom: false retrieve_databundle: true + retrieve_sector_databundle: true retrieve_cost_data: true build_cutout: false + retrieve_irena: false retrieve_cutout: true + build_natura_raster: false + retrieve_natura_raster: true custom_busmap: false drop_leap_day: true @@ -91,7 +106,7 @@ co2_budget: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity electricity: - voltages: [220., 300., 380., 500., 750.] + voltages: [200., 220., 300., 380., 400., 500., 750.] gaslimit_enable: false gaslimit: false co2limit_enable: false @@ -110,7 +125,7 @@ electricity: H2: 168 extendable_carriers: - Generator: [solar, onwind, offwind-ac, offwind-dc, offwind-float, OCGT] + Generator: [solar, onwind, offwind-ac, offwind-dc, OCGT] StorageUnit: [] # battery, H2 Store: [battery, H2] Link: [] # H2 pipeline @@ -120,7 +135,7 @@ electricity: everywhere_powerplants: [nuclear, oil, OCGT, CCGT, coal, lignite, geothermal, biomass] conventional_carriers: [nuclear, oil, OCGT, CCGT, coal, lignite, geothermal, biomass] - renewable_carriers: [solar, onwind, offwind-ac, offwind-dc, offwind-float, hydro] + renewable_carriers: [solar, onwind, offwind-ac, offwind-dc, hydro] # hydro removed estimate_renewable_capacities: enable: true @@ -128,7 +143,7 @@ electricity: year: 2020 expansion_limit: false technology_mapping: - Offshore: [offwind-ac, offwind-dc, offwind-float] + Offshore: [offwind-ac, offwind-dc] Onshore: [onwind] PV: [solar] @@ -196,7 +211,7 @@ renewable: luisa: false # [0, 5230] natura: true ship_threshold: 400 - max_depth: 60 + max_depth: 50 max_shore_distance: 30000 excluder_resolution: 200 clip_p_max_pu: 1.e-2 @@ -212,28 
+227,10 @@ renewable: luisa: false # [0, 5230] natura: true ship_threshold: 400 - max_depth: 60 + max_depth: 50 min_shore_distance: 30000 excluder_resolution: 200 clip_p_max_pu: 1.e-2 - offwind-float: - cutout: europe-2013-era5 - resource: - method: wind - turbine: NREL_ReferenceTurbine_5MW_offshore - # ScholzPhd Tab 4.3.1: 10MW/km^2 - capacity_per_sqkm: 2 - correction_factor: 0.8855 - # proxy for wake losses - # from 10.1016/j.energy.2018.08.153 - # until done more rigorously in #153 - corine: [44, 255] - natura: true - ship_threshold: 400 - excluder_resolution: 200 - min_depth: 60 - max_depth: 1000 - clip_p_max_pu: 1.e-2 solar: cutout: europe-2013-sarah resource: @@ -271,17 +268,27 @@ conventional: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines lines: types: + 200.: "Al/St 240/40 2-bundle 220.0" 220.: "Al/St 240/40 2-bundle 220.0" 300.: "Al/St 240/40 3-bundle 300.0" 380.: "Al/St 240/40 4-bundle 380.0" + 400.: "Al/St 240/40 4-bundle 380.0" 500.: "Al/St 240/40 4-bundle 380.0" 750.: "Al/St 560/50 4-bundle 750.0" + dc_types: # setting only for osm + 200.: "HVDC XLPE 1000" + 220.: "HVDC XLPE 1000" + 300.: "HVDC XLPE 1000" + 750.: "HVDC XLPE 1000" + 380.: "HVDC XLPE 1000" + 400.: "HVDC XLPE 1000" + 500.: "HVDC XLPE 1000" s_max_pu: 0.7 s_nom_max: .inf max_extension: 20000 #MW length_factor: 1.25 reconnect_crimea: true - under_construction: 'keep' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity + under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity dynamic_line_rating: activate: false cutout: europe-2013-era5 @@ -294,7 +301,7 @@ links: p_max_pu: 1.0 p_nom_max: .inf max_extension: 30000 #MW - include_tyndp: true + include_tyndp: false under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#transformers @@ -327,7 +334,6 @@ pypsa_eur: - onwind - 
offwind-ac - offwind-dc - - offwind-float - solar - ror - nuclear @@ -556,7 +562,7 @@ sector: - nearshore # within 50 km of sea # - offshore ammonia: false - min_part_load_fischer_tropsch: 0.5 + min_part_load_fischer_tropsch: 0.7 min_part_load_methanolisation: 0.3 min_part_load_methanation: 0.3 use_fischer_tropsch_waste_heat: true @@ -672,9 +678,6 @@ industry: 2040: 0.12 2045: 0.16 2050: 0.20 - HVC_environment_sequestration_fraction: 0. - waste_to_energy: false - waste_to_energy_cc: false sector_ratios_fraction_future: 2020: 0.0 2025: 0.1 @@ -700,7 +703,7 @@ industry: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#costs costs: year: 2030 - version: v0.9.0 + version: v0.8.1 rooftop_share: 0.14 # based on the potentials, assuming (0.1 kW/m2 and 10 m2/person) social_discountrate: 0.02 fill_values: @@ -791,7 +794,7 @@ solving: solver_options: highs-default: - # refer to https://ergo-code.github.io/HiGHS/dev/options/definitions/ + # refer to https://ergo-code.github.io/HiGHS/options/definitions.html#solver threads: 4 solver: "ipm" run_crossover: "off" @@ -844,17 +847,23 @@ solving: cbc-default: {} # Used in CI glpk-default: {} # Used in CI - mem_mb: 30000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 - runtime: 6h #runtime in humanfriendly style https://humanfriendly.readthedocs.io/en/latest/ + mem_mb: 100000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 + runtime: 12h #runtime in humanfriendly style https://humanfriendly.readthedocs.io/en/latest/ # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#plotting + plotting: map: - boundaries: [-11, 30, 34, 71] - color_geomap: - ocean: white - land: white + boundaries: + eu_node_location: + x: -5.5 + y: 46. 
+ # costs_max: 1000 + # costs_threshold: 0.0000001 + # energy_max: + # energy_min: + # energy_threshold: 0.000001 projection: name: "EqualEarth" # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: @@ -862,21 +871,34 @@ plotting: # central_longitude: 10. # central_latitude: 50. # standard_parallels: [35, 65] - eu_node_location: - x: -5.5 - y: 46. - costs_max: 1000 - costs_threshold: 1 - energy_max: 20000 - energy_min: -20000 - energy_threshold: 50. + +# plotting: +# map: +# boundaries: [-11, 30, 34, 71] +# color_geomap: +# ocean: white +# land: white +# projection: +# name: "EqualEarth" +# # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: +# # name: "LambertConformal" +# # central_longitude: 10. +# # central_latitude: 50. +# # standard_parallels: [35, 65] +# eu_node_location: +# x: -5.5 +# y: 46. +# costs_max: 1000 +# costs_threshold: 1 +# energy_max: 20000 +# energy_min: -20000 +# energy_threshold: 50. 
nice_names: OCGT: "Open-Cycle Gas" CCGT: "Combined-Cycle Gas" offwind-ac: "Offshore Wind (AC)" offwind-dc: "Offshore Wind (DC)" - offwind-float: "Offshore Wind (Floating)" onwind: "Onshore Wind" solar: "Solar" PHS: "Pumped Hydro Storage" @@ -901,9 +923,6 @@ plotting: offwind-dc: "#74c6f2" offshore wind (DC): "#74c6f2" offshore wind dc: "#74c6f2" - offwind-float: "#b5e2fa" - offshore wind (Float): "#b5e2fa" - offshore wind float: "#b5e2fa" # water hydro: '#298c81' hydro reservoir: '#298c81' @@ -1159,6 +1178,3 @@ plotting: DC-DC: "#8a1caf" DC link: "#8a1caf" load: "#dd2e23" - waste CHP: '#e3d37d' - waste CHP CC: '#e3d3ff' - HVC to air: 'k' diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 5c2346b1c..e4b5711f7 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -494,7 +494,7 @@ rule simplify_network: benchmarks("simplify_network/elec_s{simpl}") threads: 1 resources: - mem_mb=12000, + mem_mb=40000, conda: "../envs/environment.yaml" script: @@ -541,7 +541,7 @@ rule cluster_network: benchmarks("cluster_network/elec_s{simpl}_{clusters}") threads: 1 resources: - mem_mb=10000, + mem_mb=40000, conda: "../envs/environment.yaml" script: @@ -614,7 +614,6 @@ rule retrieve_osm_data: cables_way="data/osm/raw/{country}/cables_way.json", lines_way="data/osm/raw/{country}/lines_way.json", substations_way="data/osm/raw/{country}/substations_way.json", - substations_node="data/osm/raw/{country}/substations_node.json", substations_relation="data/osm/raw/{country}/substations_relation.json", log: logs("retrieve_osm_data_{country}.log"), @@ -629,8 +628,9 @@ rule clean_osm_data: cables_way=[f"data/osm/raw/{country}/cables_way.json" for country in config["countries"]], lines_way=[f"data/osm/raw/{country}/lines_way.json" for country in config["countries"]], substations_way=[f"data/osm/raw/{country}/substations_way.json" for country in config["countries"]], - substations_node=[f"data/osm/raw/{country}/substations_node.json" for country in 
config["countries"]], substations_relation=[f"data/osm/raw/{country}/substations_relation.json" for country in config["countries"]], + offshore_shapes=resources("offshore_shapes.geojson"), + country_shapes=resources("country_shapes.geojson"), output: substations=resources("osm/clean/substations.geojson"), substations_polygon=resources("osm/clean/substations_polygon.geojson"), diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 3cecf50c5..4cd5dd315 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -13,13 +13,12 @@ import pandas as pd from _helpers import ( configure_logging, + set_scenario_config, ) from shapely.geometry import LineString, Point from shapely.ops import linemerge, split -from shapely import wkt from tqdm import tqdm -from _benchmark import memory_logger -import yaml +from _benchmark import memory_logger logger = logging.getLogger(__name__) @@ -1128,6 +1127,7 @@ def are_almost_equal(point1, point2, tolerance=1e-6): snakemake = mock_snakemake("build_osm_network") configure_logging(snakemake) + set_scenario_config(snakemake) # load default crs geo_crs = "EPSG:4326" diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 915f3c770..c4f99858c 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -35,7 +35,7 @@ from shapely.geometry import LineString, Polygon from shapely.ops import linemerge -from _helpers import configure_logging +from _helpers import configure_logging, set_scenario_config logger = logging.getLogger(__name__) @@ -98,6 +98,7 @@ def _clean_voltage(column): .str.replace("400/220/110 kV'", "400000;220000;110000") .str.replace("400/220/110/20_kv", "400000;220000;110000;20000") .str.replace("2x25000", "25000;25000") + .str.replace("Ă©", ";") ) column = ( @@ -364,7 +365,12 @@ def _distribute_to_circuits(row): return single_circuit -def add_line_endings_tosubstations(substations, lines): +def _add_line_endings_to_substations( + df_substations, + 
gdf_lines, + path_country_shapes, + path_offshore_shapes, + ): """ Add line endings to substations. @@ -382,82 +388,106 @@ def add_line_endings_tosubstations(substations, lines): about substations with line endings. """ - if lines.empty: - return substations - - # extract columns from substation df - bus_s = pd.DataFrame(columns=substations.columns) - bus_e = pd.DataFrame(columns=substations.columns) - - # Read information from line.csv - bus_s[["voltage", "country"]] = lines[["voltage", "country"]].astype(str) - bus_s["geometry"] = lines.geometry.boundary.map( + if gdf_lines.empty: + return df_substations + + logger.info("Adding line endings to substations") + # extract columns from df_substations + bus_s = pd.DataFrame(columns=df_substations.columns) + bus_e = pd.DataFrame(columns=df_substations.columns) + + # TODO pypsa-eur: fix country code to contain single country code + # Read information from gdf_lines + bus_s[["voltage", "country"]] = gdf_lines[["voltage", "country"]] + bus_s.loc[:, "geometry"] = gdf_lines.geometry.boundary.map( lambda p: p.geoms[0] if len(p.geoms) >= 2 else None ) - bus_s["lon"] = bus_s["geometry"].map(lambda p: p.x if p != None else None) - bus_s["lat"] = bus_s["geometry"].map(lambda p: p.y if p != None else None) - bus_s["bus_id"] = ( - (substations["bus_id"].max() if "bus_id" in substations else 0) - + 1 - + bus_s.index - ) - bus_s["dc"] = lines["dc"] + bus_s.loc[:, "lon"] = bus_s["geometry"].map(lambda p: p.x if p != None else None) + bus_s.loc[:, "lat"] = bus_s["geometry"].map(lambda p: p.y if p != None else None) + bus_s.loc[:, "dc"] = gdf_lines["dc"] - bus_e[["voltage", "country"]] = lines[["voltage", "country"]].astype(str) - bus_e["geometry"] = lines.geometry.boundary.map( + bus_e[["voltage", "country"]] = gdf_lines[["voltage", "country"]] + bus_e.loc[:, "geometry"] = gdf_lines.geometry.boundary.map( lambda p: p.geoms[1] if len(p.geoms) >= 2 else None ) - bus_e["lon"] = bus_e["geometry"].map(lambda p: p.x if p != None else None) 
- bus_e["lat"] = bus_e["geometry"].map(lambda p: p.y if p != None else None) - bus_e["bus_id"] = bus_s["bus_id"].max() + 1 + bus_e.index - bus_e["dc"] = lines["dc"] + bus_e.loc[:, "lon"] = bus_e["geometry"].map(lambda p: p.x if p != None else None) + bus_e.loc[:, "lat"] = bus_e["geometry"].map(lambda p: p.y if p != None else None) + bus_e.loc[:, "dc"] = gdf_lines["dc"] bus_all = pd.concat([bus_s, bus_e], ignore_index=True) + # Group gdf_substations by voltage and and geometry (dropping duplicates) + bus_all = bus_all.groupby(["voltage", "lon", "lat", "dc"]).first().reset_index() + bus_all = bus_all[df_substations.columns] + bus_all.loc[:, "bus_id"] = bus_all.apply(lambda row: f"line-end/{row.name + 1}", axis=1) + # Initialize default values bus_all["station_id"] = np.nan # Assuming substations completed for installed lines bus_all["under_construction"] = False - bus_all["tag_area"] = 0.0 + bus_all["tag_area"] = None bus_all["symbol"] = "substation" # TODO: this tag may be improved, maybe depending on voltage levels bus_all["tag_substation"] = "transmission" - bus_all["tag_source"] = "line_ending" - - buses = pd.concat([substations, bus_all], ignore_index=True) + bus_all["tag_source"] = "line-end" - # # Assign index to bus_id - buses["bus_id"] = buses.index + buses = pd.concat([df_substations, bus_all], ignore_index=True) + buses.set_index("bus_id", inplace=True) - # TODO: pypsa-eur: change this later to improve country assignment + # Fix country codes + # TODO pypsa-eur: Temporary solution as long as the shapes have a low, + # incomplete resolution (cf. 
2500 meters for buffering) bool_multiple_countries = buses["country"].str.contains(";") - buses.loc[bool_multiple_countries, "country"] = buses.loc[bool_multiple_countries, "country"].str.split(";").str[0] + gdf_offshore = gpd.read_file(path_offshore_shapes).set_index("name")["geometry"] + gdf_offshore = gpd.GeoDataFrame(gdf_offshore, geometry=gdf_offshore, crs = gdf_offshore.crs) + gdf_countries = gpd.read_file(path_country_shapes).set_index("name")["geometry"] + # reproject to enable buffer + gdf_countries = gpd.GeoDataFrame(geometry=gdf_countries, crs = gdf_countries.crs) + gdf_union = gdf_countries.merge(gdf_offshore, how="outer", left_index=True, right_index=True) + gdf_union["geometry"] = gdf_union.apply(lambda row: gpd.GeoSeries([row["geometry_x"], row["geometry_y"]]) \ + .unary_union, axis=1) + gdf_union = gpd.GeoDataFrame(geometry=gdf_union["geometry"], crs = crs) + utm = gdf_union.estimate_utm_crs(datum_name = "WGS 84") + gdf_union = gdf_union.to_crs(utm) + gdf_union = gdf_union.buffer(2500) # meters + gdf_union = gdf_union.to_crs(crs) + gdf_union = gpd.GeoDataFrame(geometry=gdf_union, crs = crs) + gdf_buses_tofix = gpd.GeoDataFrame(buses[bool_multiple_countries], geometry="geometry", crs = crs) + joined = gpd.sjoin(gdf_buses_tofix, gdf_union, how="left", predicate="within") + joined.reset_index(inplace=True) + joined = joined.drop_duplicates(subset="bus_id") + joined.set_index("bus_id", inplace=True) + + buses.loc[bool_multiple_countries, "country"] = joined.loc[bool_multiple_countries, "index_right"] return buses -def _import_lines_and_cables(input_path_lines_cables): +def _import_lines_and_cables(path_lines): """ Import lines and cables from the given input paths. Parameters: - - input_path_lines_cables (dict): A dictionary containing the input paths for lines and cables data. + - path_lines (dict): A dictionary containing the input paths for lines and cables data. 
Returns: - df_lines (DataFrame): A DataFrame containing the imported lines and cables data. """ - columns = ["id", "bounds", "nodes", "geometry", "country", "power", "cables", "circuits", "frequency", "voltage", "wires"] + columns = ["id", "bounds", "nodes", "geometry", "country", "power", "cables", "circuits", "frequency", "voltage", + "wires"] df_lines = pd.DataFrame(columns=columns) logger.info("Importing lines and cables") - for key in input_path_lines_cables: + for key in path_lines: logger.info(f"Processing {key}...") - for idx, ip in enumerate(input_path_lines_cables[key]): + for idx, ip in enumerate(path_lines[key]): if os.path.exists(ip) and os.path.getsize(ip) > 400: # unpopulated OSM json is about 51 bytes - country = os.path.basename(os.path.dirname(input_path_lines_cables[key][idx])) + country = os.path.basename(os.path.dirname(path_lines[key][idx])) - logger.info(f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(input_path_lines_cables[key])).zfill(2)}: {ip}") + logger.info( + f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_lines[key])).zfill(2)}: {ip}" + ) with open(ip, "r") as f: data = json.load(f) @@ -482,7 +512,9 @@ def _import_lines_and_cables(input_path_lines_cables): df_lines = pd.concat([df_lines, df], axis="rows") else: - logger.info(f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(input_path_lines_cables[key])).zfill(2)} (empty): {ip}") + logger.info( + f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_lines[key])).zfill(2)} (empty): {ip}" + ) continue logger.info("---") @@ -523,36 +555,80 @@ def _drop_duplicate_lines(df_lines): return df_lines -def _filter_lines_by_voltage(df_lines, voltage_min=200000): +def _filter_by_voltage(df, voltage_min=200000): """ - Filter lines in the DataFrame `df_lines` based on the voltage in V. + Filter rows in the DataFrame based on the voltage in V. Parameters: - - df_lines (pandas.DataFrame): The DataFrame containing the lines data. 
+ - df (pandas.DataFrame): The DataFrame containing the substations or lines data. - voltage_min (int, optional): The minimum voltage value to filter the - lines. Defaults to 200000 [unit: V]. + rows. Defaults to 200000 [unit: V]. Returns: - - filtered df_lines (pandas.DataFrame): The filtered DataFrame containing - the lines data above voltage_min. + - filtered df (pandas.DataFrame): The filtered DataFrame containing + the lines or substations above voltage_min. - list_voltages (list): A list of unique voltage values above voltage_min. The type of the list elements is string. """ - logger.info(f"Filtering lines by voltage. Only keeping lines above and including {voltage_min} V.") - list_voltages = df_lines["voltage"].str.split(";").explode().unique().astype(str) + logger.info(f"Filtering dataframe by voltage. Only keeping rows above and including {voltage_min} V.") + list_voltages = df["voltage"].str.split(";").explode().unique().astype(str) # Keep numeric strings list_voltages = list_voltages[np.vectorize(str.isnumeric)(list_voltages)] list_voltages = list_voltages.astype(int) list_voltages = list_voltages[list_voltages >= int(voltage_min)] list_voltages = list_voltages.astype(str) - bool_voltages = df_lines["voltage"].apply(_check_voltage, list_voltages=list_voltages) - df_lines = df_lines[bool_voltages] + bool_voltages = df["voltage"].apply(_check_voltage, list_voltages=list_voltages) + df = df[bool_voltages] + + return df, list_voltages + + +def _clean_substations(df_substations, list_voltages): + """ + Clean the substation data by performing the following steps: + - Split cells in the dataframe. + - Filter substation data based on specified voltages. + - Update the frequency values based on the split count. + - Split cells in the 'frequency' column. + - Set remaining invalid frequency values that are not in ['0', '50'] + to '50'. + + Parameters: + - df_substations (pandas.DataFrame): The input dataframe containing + substation data. 
+ - list_voltages (list): A list of voltages above voltage_min to filter the + substation data. + + Returns: + - df_substations (pandas.DataFrame): The cleaned substation dataframe. + """ + df_substations = df_substations.copy() + + df_substations = _split_cells(df_substations) + + bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) + df_substations = df_substations[bool_voltages] + df_substations.loc[:, "split_count"] = df_substations["id"].apply(lambda x: x.split("-")[1] if "-" in x else "0") + df_substations.loc[:, "split_count"] = df_substations["split_count"].astype(int) + + bool_split = df_substations["split_elements"] > 1 + bool_frequency_len = df_substations["frequency"] \ + .apply(lambda x: len(x.split(";"))) == df_substations["split_elements"] + + op_freq = lambda row: row["frequency"].split(";")[row["split_count"]-1] + + df_substations.loc[bool_frequency_len & bool_split, "frequency"] = df_substations \ + .loc[bool_frequency_len & bool_split, ].apply(op_freq, axis=1) + + df_substations = _split_cells(df_substations, cols=["frequency"]) + bool_invalid_frequency = df_substations["frequency"].apply(lambda x: x not in ["50", "0"]) + df_substations.loc[bool_invalid_frequency, "frequency"] = "50" - return df_lines, list_voltages + return df_substations -def _clean_lines(df_lines): +def _clean_lines(df_lines, list_voltages): """ Cleans and processes the `df_lines` DataFrame heuristically based on the information available per respective line and cable. @@ -564,6 +640,8 @@ def _clean_lines(df_lines): The input DataFrame containing line information with columns such as 'voltage', 'circuits', 'frequency', 'cables', 'split_elements', 'id', etc. + list_voltages : list + A list of unique voltage values above a certain threshold. 
(type: str) Returns ------- @@ -651,7 +729,8 @@ def _clean_lines(df_lines): df_lines.loc[bool_lines & bool_dc, "frequency"] = "0" df_lines.loc[bool_lines, "cleaned"] = True - # Clean those values where number of voltages split by semicolon is larger than no cables or no circuits + # Clean those values where number of voltages split by semicolon is larger + # than no cables or no circuits bool_cables = (df_lines["voltage_original"].apply(lambda x: len(x.split(";")) > 1)) & \ (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & \ (df_lines["circuits"].apply(lambda x: len(x.split(";")) == 1)) & \ @@ -663,7 +742,8 @@ def _clean_lines(df_lines): df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" df_lines.loc[bool_cables, "cleaned"] = True - # Clean those values where multiple circuit values are present, divided by semicolon + # Clean those values where multiple circuit values are present, divided by + # semicolon bool_cables = (df_lines["circuits"].apply(lambda x: len(x.split(";")) > 1)) & \ (df_lines.apply(lambda row: len(row["circuits"].split(";")) == row["split_elements"], axis=1)) & \ (df_lines["cleaned"] == False) @@ -677,7 +757,8 @@ def _clean_lines(df_lines): df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" df_lines.loc[bool_cables, "cleaned"] = True - # Clean those values where multiple cables values are present, divided by semicolon + # Clean those values where multiple cables values are present, divided by + # semicolon bool_cables = (df_lines["cables"].apply(lambda x: len(x.split(";")) > 1)) & \ (df_lines.apply(lambda row: len(row["cables"].split(";")) == row["split_elements"], axis=1)) & \ (df_lines["cleaned"] == False) @@ -711,9 +792,117 @@ def _clean_lines(df_lines): return df_lines +def _create_substations_geometry(df_substations): + """ + Creates centroids from geometries and keeps the original polygons. + + Parameters: + df_substations (DataFrame): The input DataFrame containing the substations + data. 
+
+    Returns:
+    df_substations (DataFrame): A new DataFrame with the centroids ["geometry"]
+    and polygons ["polygon"] of the substations geometries.
+
+    """
+    logger.info("Creating substations geometry.")
+    df_substations = df_substations.copy()
+
+    # Create centroids from geometries and keep the original polygons
+    df_substations.loc[:, "polygon"] = df_substations["geometry"]
+    df_substations.loc[:, "geometry"] = df_substations["geometry"].apply(lambda x: x.centroid)
+    df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x)
+    df_substations.loc[:, "lat"] = df_substations["geometry"].apply(lambda x: x.y)
+
+    return df_substations
+
+
+def _create_lines_geometry(df_lines):
+    """
+    Create line geometry for the given DataFrame of lines.
+
+    Parameters:
+    - df_lines (pandas.DataFrame): DataFrame containing lines data.
+
+    Returns:
+    - df_lines (pandas.DataFrame): DataFrame with transformed 'geometry'
+      column (type: shapely LineString).
+
+    Notes:
+    - This function transforms 'geometry' column in the input DataFrame by
+      applying the '_create_linestring' function to each row.
+    - It then drops rows where the geometry has equal start and end points,
+      as these are usually not lines but outlines of areas.
+    """
+    logger.info("Creating lines geometry.")
+    df_lines = df_lines.copy()
+    df_lines.loc[:, "geometry"] = df_lines.apply(_create_linestring, axis=1)
+
+    bool_circle = df_lines["geometry"].apply(lambda x: x.coords[0] == x.coords[-1])
+    df_lines = df_lines[~bool_circle]
+
+    return df_lines
+
+
+def _finalise_substations(df_substations):
+    """
+    Finalises the substations column types.
+
+    Args:
+    df_substations (pandas.DataFrame): The input DataFrame
+    containing substations data.
+
+    Returns:
+    df_substations (pandas.DataFrame): The DataFrame with finalised column
+    types and transformed data.
+    """
+    logger.info("Finalising substations column types.")
+    df_substations = df_substations.copy()
+    # rename columns
+    df_substations.rename(
+        columns={
+            "id": "bus_id",
+            "power": "symbol",
+            "substation":"tag_substation",
+        }, inplace=True)
+
+    # Initiate new columns for subsequent build_osm_network step
+    df_substations.loc[:, "symbol"] = "substation"
+    df_substations.loc[:, "tag_substation"] = "transmission"
+    df_substations.loc[:, "dc"] = False
+    df_substations.loc[df_substations["frequency"] == "0", "dc"] = True
+    df_substations.loc[:, "under_construction"] = False
+    df_substations.loc[:, "station_id"] = None
+    df_substations.loc[:, "tag_area"] = None
+    df_substations.loc[:, "tag_source"] = df_substations["bus_id"]
+
+    # Only include needed columns
+    df_substations = df_substations[[
+        "bus_id",
+        "symbol",
+        "tag_substation",
+        "voltage",
+        "lon",
+        "lat",
+        "dc",
+        "under_construction",
+        "station_id",
+        "tag_area",
+        "country",
+        "geometry",
+        "polygon",
+        "tag_source",
+    ]]
+
+    # Substation data types
+    df_substations["voltage"] = df_substations["voltage"].astype(int)
+
+    return df_substations
+
+
 def _finalise_lines(df_lines):
     """
-    Finalises the lines column types and creates geometries.
+    Finalises the lines column types.
 
     Args:
     df_lines (pandas.DataFrame): The input DataFrame containing lines data.
@@ -721,7 +910,6 @@ def _finalise_lines(df_lines):
     Returns:
     df_lines (pandas.DataFrame(): The DataFrame with finalised column
     types and transformed data.
- """ logger.info("Finalising lines column types.") df_lines = df_lines.copy() @@ -764,31 +952,14 @@ def _finalise_lines(df_lines): # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) # This workaround is needed as otherwise the column dtypes remain "objects" - df_lines.loc[:, "circuits_num"] = df_lines["circuits"].astype(int) - df_lines.loc[:, "voltage_num"] = df_lines["voltage"].astype(int) - df_lines.loc[:, "tag_frequency_num"] = df_lines["tag_frequency"].astype(int) - df_lines.drop(columns=["circuits", "voltage", "tag_frequency"], inplace=True) - - col_rename_dict = { - "circuits_num": "circuits", - "voltage_num": "voltage", - "tag_frequency_num": "tag_frequency" - } - - df_lines.rename(columns=col_rename_dict, inplace=True) - - # Create shapely linestrings from geometries - df_lines.loc[:, "geometry"] = df_lines.apply(_create_linestring, axis=1) - - # Drop all rows where the geometry has equal start and end point - # These are usually not lines, but outlines of areas. - bool_circle = df_lines["geometry"].apply(lambda x: x.coords[0] == x.coords[-1]) - df_lines = df_lines[~bool_circle] + df_lines["circuits"] = df_lines["circuits"].astype(int) + df_lines["voltage"] = df_lines["voltage"].astype(int) + df_lines["tag_frequency"] = df_lines["tag_frequency"].astype(int) return df_lines -def _import_substations(input_path_substations): +def _import_substations(path_substations): """ Import substations from the given input paths. This function imports both substations from OSM ways as well as relations that contain nested @@ -797,7 +968,7 @@ def _import_substations(input_path_substations): containing unique bus ids. Args: - input_path_substations (dict): A dictionary containing input paths for + path_substations (dict): A dictionary containing input paths for substations. 
Returns: @@ -809,12 +980,14 @@ def _import_substations(input_path_substations): df_substations_relation = pd.DataFrame(columns = cols_substations_relation) logger.info("Importing substations") - for key in input_path_substations: + for key in path_substations: logger.info(f"Processing {key}...") - for idx, ip in enumerate(input_path_substations[key]): + for idx, ip in enumerate(path_substations[key]): if os.path.exists(ip) and os.path.getsize(ip) > 400: # unpopulated OSM json is about 51 bytes - country = os.path.basename(os.path.dirname(input_path_substations[key][idx])) - logger.info(f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(input_path_substations[key])).zfill(2)}: {ip}") + country = os.path.basename(os.path.dirname(path_substations[key][idx])) + logger.info( + f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_substations[key])).zfill(2)}: {ip}" + ) with open(ip, "r") as f: data = json.load(f) @@ -845,7 +1018,9 @@ def _import_substations(input_path_substations): df_substations_relation = pd.concat([df_substations_relation, df], axis="rows") else: - logger.info(f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(input_path_substations[key])).zfill(2)} (empty): {ip}") + logger.info( + f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_substations[key])).zfill(2)} (empty): {ip}" + ) continue logger.info("---") @@ -878,7 +1053,8 @@ def _import_substations(input_path_substations): df_substations_relation_members["linestring"] = df_substations_relation_members.apply(_create_linestring, axis=1) df_substations_relation_members_grouped = df_substations_relation_members.groupby('id')['linestring'] \ .apply(lambda x: linemerge(x.tolist())).reset_index() - df_substations_relation_members_grouped["geometry"] = df_substations_relation_members_grouped["linestring"].apply(lambda x: x.convex_hull) + df_substations_relation_members_grouped["geometry"] = df_substations_relation_members_grouped["linestring"] \ + .apply(lambda x: x.convex_hull) 
df_substations_relation = df_substations_relation.join( df_substations_relation_members_grouped.set_index('id'), @@ -892,6 +1068,36 @@ def _import_substations(input_path_substations): return df_substations + +def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): + """ + Removes lines that are within substation polygons from the given + GeoDataFrame of lines. These are not needed to create network (e.g. bus + bars, switchgear, etc.) + + Parameters: + - gdf_lines (GeoDataFrame): A GeoDataFrame containing lines with 'line_id' + and 'geometry' columns. + - gdf_substations_polygon (GeoDataFrame): A GeoDataFrame containing substation + polygons. + + Returns: + GeoDataFrame: A new GeoDataFrame without lines within substation polygons. + """ + logger.info("Identifying and removing lines within substation polygons...") + gdf = gpd.sjoin( + gdf_lines[["line_id", "geometry"]], + gdf_substations_polygon, + how="inner", + predicate="within" + )["line_id"] + + logger.info(f"Removed {len(gdf)} lines within substations of original {len(gdf_lines)} lines.") + gdf_lines = gdf_lines[~gdf_lines["line_id"].isin(gdf)] + + return gdf_lines + + if __name__ == "__main__": if "snakemake" not in globals(): from _helpers import mock_snakemake @@ -899,38 +1105,58 @@ def _import_substations(input_path_substations): snakemake = mock_snakemake("clean_osm_data") configure_logging(snakemake) + set_scenario_config(snakemake) # Parameters crs = "EPSG:4326" # Correct crs for OSM data voltage_min = 200000 # [unit: V] Minimum voltage value to filter lines. 
- + # TODO pypsa-eur: Temporary solution as one AC line between converters will # create an error in simplify_network: lines_to_drop = ["775580659"] + logger.info("---") + logger.info("SUBSTATIONS") # Input - input_path_substations = { + path_substations = { "substations_way": snakemake.input.substations_way, "substations_relation": snakemake.input.substations_relation, } - input_path_lines_cables = { + # Cleaning process + df_substations = _import_substations(path_substations) + df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) + df_substations, list_voltages = _filter_by_voltage(df_substations, voltage_min=voltage_min) + df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) + df_substations = _clean_substations(df_substations, list_voltages) + df_substations = _create_substations_geometry(df_substations) + df_substations = _finalise_substations(df_substations) + + # Create polygon GeoDataFrame to remove lines within substations + gdf_substations_polygon = gpd.GeoDataFrame( + df_substations[["bus_id", "polygon", "voltage"]], + geometry = "polygon", + crs = crs, + ) + + logger.info("---") + logger.info("LINES AND CABLES") + path_lines = { "lines": snakemake.input.lines_way, "cables": snakemake.input.cables_way, } # Cleaning process - df_lines = _import_lines_and_cables(input_path_lines_cables) + df_lines = _import_lines_and_cables(path_lines) df_lines = _drop_duplicate_lines(df_lines) df_lines.loc[:, "voltage"] = _clean_voltage(df_lines["voltage"]) - df_lines, list_voltages = _filter_lines_by_voltage(df_lines, voltage_min=voltage_min) - + df_lines, list_voltages = _filter_by_voltage(df_lines, voltage_min=voltage_min) df_lines.loc[:, "circuits"] = _clean_circuits(df_lines["circuits"]) df_lines.loc[:, "cables"] = _clean_cables(df_lines["cables"]) df_lines.loc[:, "frequency"] = _clean_frequency(df_lines["frequency"]) df_lines.loc[:, "wires"] = _clean_wires(df_lines["wires"]) - - df_lines = _clean_lines(df_lines) + df_lines = 
_clean_lines(df_lines, list_voltages) + df_lines = _create_lines_geometry(df_lines) df_lines = _finalise_lines(df_lines) # Dropping specific lines, manually @@ -939,165 +1165,36 @@ def _import_substations(input_path_substations): # Create GeoDataFrame gdf_lines = gpd.GeoDataFrame(df_lines, geometry = "geometry", crs = crs) - - ############# BUSES / SUBSTATIONS ###################### - df_substations = _import_substations(input_path_substations) - - - # Create centroids from geometries - df_substations.loc[:, "polygon"] = df_substations["geometry"] - df_substations.loc[:, "geometry"] = df_substations["geometry"].apply(lambda x: x.centroid) - df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x) - df_substations.loc[:, "lat"] = df_substations["geometry"].apply(lambda x: x.y) - - logger.info("Cleaning substations") - # Clean columns - df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) - df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) - df_substations["frequency"] = df_substations["frequency"].astype(str, errors="ignore") - - list_voltages = df_substations["voltage"].str.split(";").explode().unique().astype(str) - list_voltages = list_voltages[np.vectorize(len)(list_voltages) >= 6] - list_voltages = list_voltages[~np.char.startswith(list_voltages, '1')] - - bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) - df_substations = df_substations[bool_voltages] - - df_substations = _split_cells(df_substations) - bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) - df_substations = df_substations[bool_voltages] - df_substations["split_count"] = df_substations["id"].apply(lambda x: x.split("-")[1] if "-" in x else "0") - df_substations["split_count"] = df_substations["split_count"].astype(int) - - bool_split = df_substations["split_elements"] > 1 - bool_frequency_len = df_substations["frequency"].apply(lambda x: 
len(x.split(";"))) == df_substations["split_elements"] - df_substations.loc[bool_frequency_len & bool_split, "frequency"] = df_substations.loc[bool_frequency_len & bool_split, "frequency"] \ - - op_freq = lambda row: row["frequency"].split(";")[row["split_count"]-1] - - df_substations.loc[bool_frequency_len & bool_split, ["frequency"]] = df_substations.loc[bool_frequency_len & bool_split, ] \ - .apply(op_freq, axis=1) - - df_substations = _split_cells(df_substations, cols=["frequency"]) - bool_invalid_frequency = df_substations["frequency"].apply(lambda x: x not in ["50", "0"]) - df_substations.loc[bool_invalid_frequency, "frequency"] = "50" - df_substations["power"] = "substation" - df_substations["substation"] = "transmission" - df_substations["dc"] = False - df_substations.loc[df_substations["frequency"] == "0", "dc"] = True - df_substations["under_construction"] = False - df_substations["station_id"] = None - df_substations["tag_area"] = None - df_substations["tag_source"] = df_substations["id"] - - gdf_substations_polygon = gpd.GeoDataFrame( - df_substations[["id", "polygon"]], - geometry = "polygon", - crs = "EPSG:4326" + gdf_lines = _remove_lines_within_substations(gdf_lines, gdf_substations_polygon) + + # Add line endings to substations + path_country_shapes = snakemake.input.country_shapes + path_offshore_shapes = snakemake.input.offshore_shapes + df_substations = _add_line_endings_to_substations( + df_substations, + gdf_lines, + path_country_shapes, + path_offshore_shapes, ) - filepath_substations_polygon = snakemake.output["substations_polygon"] - # save substations output - logger.info(f"Exporting clean substations with polygon shapes to {filepath_substations_polygon}") - parentfolder_substations_polygon = os.path.dirname(filepath_substations_polygon) - if not os.path.exists(parentfolder_substations_polygon): - # Create the folder and its parent directories if they don't exist - os.makedirs(parentfolder_substations_polygon) - - logger.info(f"Exporting 
clean substations to {filepath_substations_polygon}") - gdf_substations_polygon.to_file(filepath_substations_polygon, driver="GeoJSON") - - - logger.info("Identifying and removing lines within substation polygons...") - lines_within_substations = gpd.sjoin( - gdf_lines[["line_id", "geometry"]], - gdf_substations_polygon, - how = "inner", - predicate = "within" - )["line_id"] - - logger.info(f"Removed {len(lines_within_substations)}/{len(gdf_lines)} lines within substations.") - gdf_lines = gdf_lines[~gdf_lines["line_id"].isin(lines_within_substations)] - - # # Create an empty list to store the results - # results = [] - - # subset a to find only country equal to "BE" - # a[a["country"] == "BE"] - - # logger.info("Identifying and removing lines within substation polygons...") - # for index, row in tqdm(gdf_lines.iterrows(), total=len(gdf_lines)): - # line = row['geometry'] - # # Check if the LineString is within any Polygon in 'substations_df' - # is_within_any_substation = any(line.within(substation_polygon) for substation_polygon in df_substations["polygon"]) - # results.append(is_within_any_substation) - - # # Add the results to 'gdf_lines' - # gdf_lines['within_substation'] = results - - # gdf_lines = gdf_lines[~gdf_lines["within_substation"]] - # logger.info(f"Removed {sum(results)} lines within substations.") - - filepath_lines = snakemake.output["lines"] - # save substations output - logger.info(f"Exporting clean lines to {filepath_lines}") - parentfolder_lines = os.path.dirname(filepath_lines) - if not os.path.exists(parentfolder_lines): - # Create the folder and its parent directories if they don't exist - os.makedirs(parentfolder_lines) - - logger.info(f"Exporting clean lines to {filepath_lines}") - gdf_lines.to_file(filepath_lines, driver="GeoJSON") - - # rename columns - df_substations.rename( - columns={ - "id": "bus_id", - "power": "symbol", - "substation":"tag_substation", - }, inplace=True) - - df_substations = df_substations[[ - "bus_id", - 
"symbol", - "tag_substation", - "voltage", - "lon", - "lat", - "dc", - "under_construction", - "station_id", - "tag_area", - "country", - "geometry", - "tag_source", - ]] - - df_substations["bus_id"] = df_substations.index - - logger.info("Adding line endings to substations") - df_substations = add_line_endings_tosubstations( - df_substations, gdf_lines - ) - - #group gdf_substations by voltage and and geometry (dropping duplicates) - df_substations = df_substations.groupby(["voltage", "lon", "lat", "dc", "tag_source"]).first().reset_index() - df_substations["bus_id"] = df_substations.index - - gdf_substations = gpd.GeoDataFrame(df_substations, geometry = "geometry", crs = "EPSG:4326") - - # Substation data types - gdf_substations["bus_id"] = gdf_substations["bus_id"].astype(int) - gdf_substations["voltage"] = gdf_substations["voltage"].astype(int) - - filepath_substations = snakemake.output["substations"] - # save substations output - logger.info(f"Exporting clean substations to {filepath_substations}") - parentfolder_substations = os.path.dirname(filepath_substations) - if not os.path.exists(parentfolder_substations): - # Create the folder and its parent directories if they don't exist - os.makedirs(parentfolder_substations) - - logger.info(f"Exporting clean substations to {filepath_substations}") - gdf_substations.to_file(filepath_substations, driver="GeoJSON") + # Drop polygons and create GDF + gdf_substations = gpd.GeoDataFrame(df_substations.drop(columns=["polygon"]), + geometry = "geometry", crs = crs) + + # Export GeoDataFrames to GeoJSON in specified output paths + parentfolder = os.path.dirname(snakemake.output.substations) + if not os.path.exists(parentfolder): + os.makedirs(parentfolder) + output_substations_polygon = snakemake.output["substations_polygon"] + output_substations = snakemake.output["substations"] + output_lines = snakemake.output["lines"] + + logger.info(f"Exporting clean substations with polygon shapes to {output_substations_polygon}") + 
gdf_substations_polygon.to_file(output_substations_polygon, driver="GeoJSON") + logger.info(f"Exporting clean substations to {output_substations}") + gdf_substations.to_file(output_substations, driver="GeoJSON") + logger.info(f"Exporting clean lines to {output_lines}") + gdf_lines.to_file(output_lines, driver="GeoJSON") + + logger.info("Cleaning OSM data completed.") \ No newline at end of file diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index bab645a48..0ad9743e4 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -92,7 +92,6 @@ def retrieve_osm_data( "cables_way", "lines_way", "substations_way", - "substations_node", "substations_relation", ]): """ @@ -111,7 +110,6 @@ def retrieve_osm_data( "cables_way", "lines_way", "substations_way", - "substations_node", "substations_relation", ]. """ @@ -136,7 +134,6 @@ def retrieve_osm_data( 'cables_way': 'way["power"="cable"]', 'lines_way': 'way["power"="line"]', 'substations_way': 'way["power"="substation"]', - 'substations_node': 'node["power"="substation"]', 'substations_relation': 'relation["power"="substation"]', } From da94a964c74ac266d3502f7956f2c27f60f7484b Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 22 May 2024 09:12:28 +0200 Subject: [PATCH 020/100] Cleaned config. 
--- config/config_backup.yaml | 1181 ++++++++++++++++++++++++++++++++++ rules/build_electricity.smk | 44 +- scripts/build_osm_network.py | 15 +- scripts/solve_network.py | 6 +- 4 files changed, 1214 insertions(+), 32 deletions(-) create mode 100644 config/config_backup.yaml diff --git a/config/config_backup.yaml b/config/config_backup.yaml new file mode 100644 index 000000000..2bcaf173c --- /dev/null +++ b/config/config_backup.yaml @@ -0,0 +1,1181 @@ +# SPDX-FileCopyrightText: : 2017-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: CC0-1.0 + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#top-level-configuration +version: 0.10.0 +tutorial: false + +logging: + level: INFO + format: '%(levelname)s:%(name)s:%(message)s' + +private: + keys: + entsoe_api: + +remote: + ssh: "z1" + path: "~/scratch/projects/pypsa-eur" + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#run +run: + prefix: "" + # name: "test-europe1-gridkit" + name: "test-begb-gridkit" + scenarios: + enable: false + file: config/scenarios.yaml + disable_progressbar: false + shared_resources: + policy: false + exclude: [] + shared_cutouts: true + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#foresight +foresight: overnight + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#scenario +# Wildcard docs in https://pypsa-eur.readthedocs.io/en/latest/wildcards.html +scenario: + simpl: + - '' + ll: + - v1.0 # TODO mit und ohne Netzausbau v1.0 + clusters: + - 40 + # - 128 + # - 256 + # - 512 + # # - 1024 + opts: + - 'Co2L0-169H' + sector_opts: + - '' + planning_horizons: + # - 2020 + # - 2030 + # - 2040 + - 2050 + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#countries +countries: ["BE", "GB"] +# countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 
'RS', 'SE', 'SI', 'SK']
+# countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MD', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK', 'UA']
+
+# Settings related to the high-voltage electricity grid
+electricity_network:
+  base_network: "gridkit" # "osm" or "gridkit"
+  build_osm_network: true # If 'true', the network will be built from scratch (retrieving OSM data, cleaning, and building) and stored under resources, 'false' will use snapshots in data/osm
+
+build_osm_network: # Options of the build_osm_network script; osm = OpenStreetMap
+  group_tolerance_buses: 5000 # [m] (default 5000) Tolerance in meters of the close buses to merge
+  split_overpassing_lines: false # When True, lines overpassing buses are split and connected to the buses
+  overpassing_lines_tolerance: 1 # [m] (default 1) Tolerance to identify lines overpassing buses
+  force_ac: false # When true, it forces all components (lines and substation) to be AC-only. To be used if DC assets create problems.
+ +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#snapshots +snapshots: + start: "2013-01-01" + end: "2014-01-01" + inclusive: 'left' + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#enable +enable: + retrieve: true + prepare_links_p_nom: false + retrieve_databundle: true + retrieve_sector_databundle: true + retrieve_cost_data: true + build_cutout: false + retrieve_irena: false + retrieve_cutout: true + build_natura_raster: false + retrieve_natura_raster: true + custom_busmap: false + drop_leap_day: true + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget +co2_budget: + 2020: 0.701 + 2025: 0.524 + 2030: 0.297 + 2035: 0.150 + 2040: 0.071 + 2045: 0.032 + 2050: 0.000 + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity +electricity: + voltages: [200., 220., 300., 380., 400., 500., 750.] + gaslimit_enable: false + gaslimit: false + co2limit_enable: false + co2limit: 7.75e+7 + co2base: 1.487e+9 + agg_p_nom_limits: data/agg_p_nom_minmax.csv + + operational_reserve: + activate: false + epsilon_load: 0.02 + epsilon_vres: 0.02 + contingency: 4000 + + max_hours: + battery: 6 + H2: 168 + + extendable_carriers: + Generator: [solar, onwind, offwind-ac, offwind-dc, OCGT] + StorageUnit: [] # battery, H2 + Store: [battery, H2] + Link: [] # H2 pipeline + + powerplants_filter: (DateOut >= 2023 or DateOut != DateOut) and not (Country == 'Germany' and Fueltype == 'Nuclear') + custom_powerplants: false + everywhere_powerplants: [nuclear, oil, OCGT, CCGT, coal, lignite, geothermal, biomass] + + conventional_carriers: [nuclear, oil, OCGT, CCGT, coal, lignite, geothermal, biomass] + renewable_carriers: [solar, onwind, offwind-ac, offwind-dc, hydro] # hydro removed + + estimate_renewable_capacities: + enable: true + from_opsd: true + year: 2020 + expansion_limit: false + technology_mapping: + Offshore: [offwind-ac, offwind-dc] + Onshore: [onwind] + PV: [solar] + + autarky: 
+ enable: false + by_country: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#atlite +atlite: + default_cutout: europe-2013-era5 + nprocesses: 4 + show_progress: false + cutouts: + # use 'base' to determine geographical bounds and time span from config + # base: + # module: era5 + europe-2013-era5: + module: era5 # in priority order + x: [-12., 42.] + y: [33., 72] + dx: 0.3 + dy: 0.3 + time: ['2013', '2013'] + europe-2013-sarah: + module: [sarah, era5] # in priority order + x: [-12., 42.] + y: [33., 65] + dx: 0.2 + dy: 0.2 + time: ['2013', '2013'] + sarah_interpolate: false + sarah_dir: + features: [influx, temperature] + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#renewable +renewable: + onwind: + cutout: europe-2013-era5 + resource: + method: wind + turbine: Vestas_V112_3MW + add_cutout_windspeed: true + capacity_per_sqkm: 3 + # correction_factor: 0.93 + corine: + grid_codes: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32] + distance: 1000 + distance_grid_codes: [1, 2, 3, 4, 5, 6] + luisa: false + # grid_codes: [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242] + # distance: 1000 + # distance_grid_codes: [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242] + natura: true + excluder_resolution: 100 + clip_p_max_pu: 1.e-2 + offwind-ac: + cutout: europe-2013-era5 + resource: + method: wind + turbine: NREL_ReferenceTurbine_2020ATB_5.5MW + add_cutout_windspeed: true + capacity_per_sqkm: 2 + correction_factor: 0.8855 + corine: [44, 255] + luisa: false # [0, 5230] + natura: true + ship_threshold: 400 + max_depth: 50 + max_shore_distance: 30000 + excluder_resolution: 200 + clip_p_max_pu: 1.e-2 + offwind-dc: + cutout: europe-2013-era5 + resource: + method: wind + turbine: NREL_ReferenceTurbine_2020ATB_5.5MW + add_cutout_windspeed: true + capacity_per_sqkm: 2 + correction_factor: 0.8855 + corine: [44, 255] + luisa: false # [0, 5230] + natura: true + 
ship_threshold: 400 + max_depth: 50 + min_shore_distance: 30000 + excluder_resolution: 200 + clip_p_max_pu: 1.e-2 + solar: + cutout: europe-2013-sarah + resource: + method: pv + panel: CSi + orientation: + slope: 35. + azimuth: 180. + capacity_per_sqkm: 5.1 + # correction_factor: 0.854337 + corine: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 26, 31, 32] + luisa: false # [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242, 1310, 1320, 1330, 1410, 1421, 1422, 2110, 2120, 2130, 2210, 2220, 2230, 2310, 2410, 2420, 3210, 3320, 3330] + natura: true + excluder_resolution: 100 + clip_p_max_pu: 1.e-2 + hydro: + cutout: europe-2013-era5 + carriers: [ror, PHS, hydro] + PHS_max_hours: 6 + hydro_max_hours: "energy_capacity_totals_by_country" # one of energy_capacity_totals_by_country, estimate_by_large_installations or a float + flatten_dispatch: false + flatten_dispatch_buffer: 0.2 + clip_min_inflow: 1.0 + eia_norm_year: false + eia_correct_by_capacity: false + eia_approximate_missing: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#conventional +conventional: + unit_commitment: false + dynamic_fuel_price: false + nuclear: + p_max_pu: "data/nuclear_p_max_pu.csv" # float of file name + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines +lines: + types: + 200.: "Al/St 240/40 2-bundle 220.0" + 220.: "Al/St 240/40 2-bundle 220.0" + 300.: "Al/St 240/40 3-bundle 300.0" + 380.: "Al/St 240/40 4-bundle 380.0" + 400.: "Al/St 240/40 4-bundle 380.0" + 500.: "Al/St 240/40 4-bundle 380.0" + 750.: "Al/St 560/50 4-bundle 750.0" + dc_types: # setting only for osm + 200.: "HVDC XLPE 1000" + 220.: "HVDC XLPE 1000" + 300.: "HVDC XLPE 1000" + 750.: "HVDC XLPE 1000" + 380.: "HVDC XLPE 1000" + 400.: "HVDC XLPE 1000" + 500.: "HVDC XLPE 1000" + s_max_pu: 0.7 + s_nom_max: .inf + max_extension: 20000 #MW + length_factor: 1.25 + reconnect_crimea: true + under_construction: 'zero' # 'zero': set capacity 
to zero, 'remove': remove, 'keep': with full capacity + dynamic_line_rating: + activate: false + cutout: europe-2013-era5 + correction_factor: 0.95 + max_voltage_difference: false + max_line_rating: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#links +links: + p_max_pu: 1.0 + p_nom_max: .inf + max_extension: 30000 #MW + include_tyndp: false + under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#transformers +transformers: + x: 0.1 + s_nom: 2000. + type: '' + +# docs-load in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#load +load: + interpolate_limit: 3 + time_shift_for_large_gaps: 1w + manual_adjustments: true # false + scaling_factor: 1.0 + fixed_year: false # false or year (e.g. 2013) + supplement_synthetic: true + +# docs +# TODO: PyPSA-Eur merge issue in prepare_sector_network.py +# regulate what components with which carriers are kept from PyPSA-Eur; +# some technologies are removed because they are implemented differently +# (e.g. 
battery or H2 storage) or have different year-dependent costs +# in PyPSA-Eur-Sec +pypsa_eur: + Bus: + - AC + Link: + - DC + Generator: + - onwind + - offwind-ac + - offwind-dc + - solar + - ror + - nuclear + StorageUnit: + - PHS + - hydro + Store: [] + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#energy +energy: + energy_totals_year: 2019 + base_emissions_year: 1990 + emissions: CO2 + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#biomass +biomass: + year: 2030 + scenario: ENS_Med + classes: + solid biomass: + - Agricultural waste + - Fuelwood residues + - Secondary Forestry residues - woodchips + - Sawdust + - Residues from landscape care + - Municipal waste + not included: + - Sugar from sugar beet + - Rape seed + - "Sunflower, soya seed " + - Bioethanol barley, wheat, grain maize, oats, other cereals and rye + - Miscanthus, switchgrass, RCG + - Willow + - Poplar + - FuelwoodRW + - C&P_RW + biogas: + - Manure solid, liquid + - Sludge + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#solar-thermal +solar_thermal: + clearsky_model: simple # should be "simple" or "enhanced"? + orientation: + slope: 45. + azimuth: 180. 
+ cutout: default + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#existing-capacities +existing_capacities: + grouping_years_power: [1895, 1920, 1950, 1955, 1960, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2025, 2030] + grouping_years_heat: [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020] # heat grouping years >= baseyear will be ignored + threshold_capacity: 10 + default_heating_lifetime: 20 + conventional_carriers: + - lignite + - coal + - oil + - uranium + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#sector +sector: + transport: true + heating: true + biomass: true + industry: true + agriculture: true + district_heating: + potential: 0.6 + progress: + 2020: 0.0 + 2025: 0.15 + 2030: 0.3 + 2035: 0.45 + 2040: 0.6 + 2045: 0.8 + 2050: 1.0 + district_heating_loss: 0.15 + cluster_heat_buses: true + heat_demand_cutout: default + bev_dsm_restriction_value: 0.75 + bev_dsm_restriction_time: 7 + transport_heating_deadband_upper: 20. + transport_heating_deadband_lower: 15. 
+ ICE_lower_degree_factor: 0.375 + ICE_upper_degree_factor: 1.6 + EV_lower_degree_factor: 0.98 + EV_upper_degree_factor: 0.63 + bev_dsm: true + bev_availability: 0.5 + bev_energy: 0.05 + bev_charge_efficiency: 0.9 + bev_plug_to_wheel_efficiency: 0.2 + bev_charge_rate: 0.011 + bev_avail_max: 0.95 + bev_avail_mean: 0.8 + v2g: true + land_transport_fuel_cell_share: + 2020: 0 + 2025: 0 + 2030: 0 + 2035: 0 + 2040: 0 + 2045: 0 + 2050: 0 + land_transport_electric_share: + 2020: 0 + 2025: 0.15 + 2030: 0.3 + 2035: 0.45 + 2040: 0.7 + 2045: 0.85 + 2050: 1 + land_transport_ice_share: + 2020: 1 + 2025: 0.85 + 2030: 0.7 + 2035: 0.55 + 2040: 0.3 + 2045: 0.15 + 2050: 0 + transport_fuel_cell_efficiency: 0.5 + transport_internal_combustion_efficiency: 0.3 + agriculture_machinery_electric_share: 0 + agriculture_machinery_oil_share: 1 + agriculture_machinery_fuel_efficiency: 0.7 + agriculture_machinery_electric_efficiency: 0.3 + MWh_MeOH_per_MWh_H2: 0.8787 + MWh_MeOH_per_tCO2: 4.0321 + MWh_MeOH_per_MWh_e: 3.6907 + shipping_hydrogen_liquefaction: false + shipping_hydrogen_share: + 2020: 0 + 2025: 0 + 2030: 0 + 2035: 0 + 2040: 0 + 2045: 0 + 2050: 0 + shipping_methanol_share: + 2020: 0 + 2025: 0.15 + 2030: 0.3 + 2035: 0.5 + 2040: 0.7 + 2045: 0.85 + 2050: 1 + shipping_oil_share: + 2020: 1 + 2025: 0.85 + 2030: 0.7 + 2035: 0.5 + 2040: 0.3 + 2045: 0.15 + 2050: 0 + shipping_methanol_efficiency: 0.46 + shipping_oil_efficiency: 0.40 + aviation_demand_factor: 1. + HVC_demand_factor: 1. + time_dep_hp_cop: true + heat_pump_sink_T: 55. 
+ reduce_space_heat_exogenously: true + reduce_space_heat_exogenously_factor: + 2020: 0.10 # this results in a space heat demand reduction of 10% + 2025: 0.09 # first heat demand increases compared to 2020 because of larger floor area per capita + 2030: 0.09 + 2035: 0.11 + 2040: 0.16 + 2045: 0.21 + 2050: 0.29 + retrofitting: + retro_endogen: false + cost_factor: 1.0 + interest_rate: 0.04 + annualise_cost: true + tax_weighting: false + construction_index: true + tes: true + tes_tau: + decentral: 3 + central: 180 + boilers: true + resistive_heaters: true + oil_boilers: false + biomass_boiler: true + overdimension_individual_heating: 1.1 #to cover demand peaks bigger than data + chp: true + micro_chp: false + solar_thermal: true + solar_cf_correction: 0.788457 # = >>> 1/1.2683 + marginal_cost_storage: 0. #1e-4 + methanation: true + coal_cc: false + dac: true + co2_vent: false + central_heat_vent: false + allam_cycle: false + hydrogen_fuel_cell: true + hydrogen_turbine: false + SMR: true + SMR_cc: true + regional_methanol_demand: false + regional_oil_demand: false + regional_coal_demand: false + regional_co2_sequestration_potential: + enable: false + attribute: + - conservative estimate Mt + - conservative estimate GAS Mt + - conservative estimate OIL Mt + - conservative estimate aquifer Mt + include_onshore: false + min_size: 3 + max_size: 25 + years_of_storage: 25 + co2_sequestration_potential: 200 + co2_sequestration_cost: 10 + co2_sequestration_lifetime: 50 + co2_spatial: false + co2network: false + co2_network_cost_factor: 1 + cc_fraction: 0.9 + hydrogen_underground_storage: true + hydrogen_underground_storage_locations: + # - onshore # more than 50 km from sea + - nearshore # within 50 km of sea + # - offshore + ammonia: false + min_part_load_fischer_tropsch: 0.7 + min_part_load_methanolisation: 0.3 + min_part_load_methanation: 0.3 + use_fischer_tropsch_waste_heat: true + use_haber_bosch_waste_heat: true + use_methanolisation_waste_heat: true + 
use_methanation_waste_heat: true + use_fuel_cell_waste_heat: true + use_electrolysis_waste_heat: true + electricity_transmission_grid: true + electricity_distribution_grid: true + electricity_distribution_grid_cost_factor: 1.0 + electricity_grid_connection: true + transmission_efficiency: + DC: + efficiency_static: 0.98 + efficiency_per_1000km: 0.977 + H2 pipeline: + efficiency_per_1000km: 1 # 0.982 + compression_per_1000km: 0.018 + gas pipeline: + efficiency_per_1000km: 1 #0.977 + compression_per_1000km: 0.01 + H2_network: true + gas_network: false + H2_retrofit: false + H2_retrofit_capacity_per_CH4: 0.6 + gas_network_connectivity_upgrade: 1 + gas_distribution_grid: true + gas_distribution_grid_cost_factor: 1.0 + biomass_spatial: false + biomass_transport: false + biogas_upgrading_cc: false + conventional_generation: + OCGT: gas + biomass_to_liquid: false + biosng: false + limit_max_growth: + enable: false + # allowing 30% larger than max historic growth + factor: 1.3 + max_growth: # unit GW + onwind: 16 # onshore max grow so far 16 GW in Europe https://www.iea.org/reports/renewables-2020/wind + solar: 28 # solar max grow so far 28 GW in Europe https://www.iea.org/reports/renewables-2020/solar-pv + offwind-ac: 35 # offshore max grow so far 3.5 GW in Europe https://windeurope.org/about-wind/statistics/offshore/european-offshore-wind-industry-key-trends-statistics-2019/ + offwind-dc: 35 + max_relative_growth: + onwind: 3 + solar: 3 + offwind-ac: 3 + offwind-dc: 3 + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#industry +industry: + St_primary_fraction: + 2020: 0.6 + 2025: 0.55 + 2030: 0.5 + 2035: 0.45 + 2040: 0.4 + 2045: 0.35 + 2050: 0.3 + DRI_fraction: + 2020: 0 + 2025: 0 + 2030: 0.05 + 2035: 0.2 + 2040: 0.4 + 2045: 0.7 + 2050: 1 + H2_DRI: 1.7 + elec_DRI: 0.322 + Al_primary_fraction: + 2020: 0.4 + 2025: 0.375 + 2030: 0.35 + 2035: 0.325 + 2040: 0.3 + 2045: 0.25 + 2050: 0.2 + MWh_NH3_per_tNH3: 5.166 + MWh_CH4_per_tNH3_SMR: 10.8 + 
MWh_elec_per_tNH3_SMR: 0.7 + MWh_H2_per_tNH3_electrolysis: 5.93 + MWh_elec_per_tNH3_electrolysis: 0.2473 + MWh_NH3_per_MWh_H2_cracker: 1.46 # https://github.com/euronion/trace/blob/44a5ff8401762edbef80eff9cfe5a47c8d3c8be4/data/efficiencies.csv + NH3_process_emissions: 24.5 + petrochemical_process_emissions: 25.5 + #HVC primary/recycling based on values used in Neumann et al https://doi.org/10.1016/j.joule.2023.06.016, linearly interpolated between 2020 and 2050 + #2020 recycling rates based on Agora https://static.agora-energiewende.de/fileadmin/Projekte/2021/2021_02_EU_CEAP/A-EW_254_Mobilising-circular-economy_study_WEB.pdf + #fractions refer to the total primary HVC production in 2020 + #assumes 6.7 Mtplastics produced from recycling in 2020 + HVC_primary_fraction: + 2020: 1.0 + 2025: 0.9 + 2030: 0.8 + 2035: 0.7 + 2040: 0.6 + 2045: 0.5 + 2050: 0.4 + HVC_mechanical_recycling_fraction: + 2020: 0.12 + 2025: 0.15 + 2030: 0.18 + 2035: 0.21 + 2040: 0.24 + 2045: 0.27 + 2050: 0.30 + HVC_chemical_recycling_fraction: + 2020: 0.0 + 2025: 0.0 + 2030: 0.04 + 2035: 0.08 + 2040: 0.12 + 2045: 0.16 + 2050: 0.20 + sector_ratios_fraction_future: + 2020: 0.0 + 2025: 0.1 + 2030: 0.3 + 2035: 0.5 + 2040: 0.7 + 2045: 0.9 + 2050: 1.0 + basic_chemicals_without_NH3_production_today: 69. #Mt/a, = 86 Mtethylene-equiv - 17 MtNH3 + HVC_production_today: 52. 
+ MWh_elec_per_tHVC_mechanical_recycling: 0.547 + MWh_elec_per_tHVC_chemical_recycling: 6.9 + chlorine_production_today: 9.58 + MWh_elec_per_tCl: 3.6 + MWh_H2_per_tCl: -0.9372 + methanol_production_today: 1.5 + MWh_elec_per_tMeOH: 0.167 + MWh_CH4_per_tMeOH: 10.25 + hotmaps_locate_missing: false + reference_year: 2015 + + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#costs +costs: + year: 2030 + version: v0.8.1 + rooftop_share: 0.14 # based on the potentials, assuming (0.1 kW/m2 and 10 m2/person) + social_discountrate: 0.02 + fill_values: + FOM: 0 + VOM: 0 + efficiency: 1 + fuel: 0 + investment: 0 + lifetime: 25 + "CO2 intensity": 0 + "discount rate": 0.07 + # Marginal and capital costs can be overwritten + # capital_cost: + # onwind: 500 + marginal_cost: + solar: 0.01 + onwind: 0.015 + offwind: 0.015 + hydro: 0. + H2: 0. + electrolysis: 0. + fuel cell: 0. + battery: 0. + battery inverter: 0. + emission_prices: + enable: false + co2: 0. + co2_monthly_prices: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#clustering +clustering: + focus_weights: false + simplify_network: + to_substations: false + algorithm: kmeans # choose from: [hac, kmeans] + feature: solar+onwind-time + exclude_carriers: [] + remove_stubs: true + remove_stubs_across_borders: true + cluster_network: + algorithm: kmeans + feature: solar+onwind-time + exclude_carriers: [] + consider_efficiency_classes: false + aggregation_strategies: + generators: + committable: any + ramp_limit_up: max + ramp_limit_down: max + temporal: + resolution_elec: 169H + resolution_sector: 169H + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#adjustments +adjustments: + electricity: false + sector: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#solving +solving: + #tmpdir: "path/to/tmp" + options: + clip_p_max_pu: 1.e-2 + load_shedding: false + noisy_costs: true + skip_iterations: true + rolling_horizon: 
false + seed: 123 + custom_extra_functionality: "../data/custom_extra_functionality.py" + # io_api: "direct" # Increases performance but only supported for the highs and gurobi solvers + # options that go into the optimize function + track_iterations: false + min_iterations: 4 + max_iterations: 6 + transmission_losses: 2 + linearized_unit_commitment: true + horizon: 365 + + constraints: + CCL: false + EQ: false + BAU: false + SAFE: false + + solver: + name: gurobi + options: gurobi-default + + solver_options: + highs-default: + # refer to https://ergo-code.github.io/HiGHS/options/definitions.html#solver + threads: 4 + solver: "ipm" + run_crossover: "off" + small_matrix_value: 1e-6 + large_matrix_value: 1e9 + primal_feasibility_tolerance: 1e-5 + dual_feasibility_tolerance: 1e-5 + ipm_optimality_tolerance: 1e-4 + parallel: "on" + random_seed: 123 + gurobi-default: + threads: 4 + method: 2 # barrier + crossover: 0 + BarConvTol: 1.e-6 + Seed: 123 + AggFill: 0 + PreDual: 0 + GURO_PAR_BARDENSETHRESH: 200 + gurobi-numeric-focus: + NumericFocus: 3 # Favour numeric stability over speed + method: 2 # barrier + crossover: 0 # do not use crossover + BarHomogeneous: 1 # Use homogeneous barrier if standard does not converge + BarConvTol: 1.e-5 + FeasibilityTol: 1.e-4 + OptimalityTol: 1.e-4 + ObjScale: -0.5 + threads: 8 + Seed: 123 + gurobi-fallback: # Use gurobi defaults + crossover: 0 + method: 2 # barrier + BarHomogeneous: 1 # Use homogeneous barrier if standard does not converge + BarConvTol: 1.e-5 + FeasibilityTol: 1.e-5 + OptimalityTol: 1.e-5 + Seed: 123 + threads: 8 + cplex-default: + threads: 4 + lpmethod: 4 # barrier + solutiontype: 2 # non basic solution, ie no crossover + barrier.convergetol: 1.e-5 + feasopt.tolerance: 1.e-6 + copt-default: + Threads: 8 + LpMethod: 2 + Crossover: 0 + cbc-default: {} # Used in CI + glpk-default: {} # Used in CI + + mem_mb: 100000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 + runtime: 12h #runtime in humanfriendly 
style https://humanfriendly.readthedocs.io/en/latest/ + + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#plotting + +plotting: + map: + boundaries: + eu_node_location: + x: -5.5 + y: 46. + # costs_max: 1000 + # costs_threshold: 0.0000001 + # energy_max: + # energy_min: + # energy_threshold: 0.000001 + projection: + name: "EqualEarth" + # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: + # name: "LambertConformal" + # central_longitude: 10. + # central_latitude: 50. + # standard_parallels: [35, 65] + +# plotting: +# map: +# boundaries: [-11, 30, 34, 71] +# color_geomap: +# ocean: white +# land: white +# projection: +# name: "EqualEarth" +# # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: +# # name: "LambertConformal" +# # central_longitude: 10. +# # central_latitude: 50. +# # standard_parallels: [35, 65] +# eu_node_location: +# x: -5.5 +# y: 46. +# costs_max: 1000 +# costs_threshold: 1 +# energy_max: 20000 +# energy_min: -20000 +# energy_threshold: 50. 
+ + nice_names: + OCGT: "Open-Cycle Gas" + CCGT: "Combined-Cycle Gas" + offwind-ac: "Offshore Wind (AC)" + offwind-dc: "Offshore Wind (DC)" + onwind: "Onshore Wind" + solar: "Solar" + PHS: "Pumped Hydro Storage" + hydro: "Reservoir & Dam" + battery: "Battery Storage" + H2: "Hydrogen Storage" + lines: "Transmission Lines" + ror: "Run of River" + load: "Load Shedding" + ac: "AC" + dc: "DC" + + tech_colors: + # wind + onwind: "#235ebc" + onshore wind: "#235ebc" + offwind: "#6895dd" + offshore wind: "#6895dd" + offwind-ac: "#6895dd" + offshore wind (AC): "#6895dd" + offshore wind ac: "#6895dd" + offwind-dc: "#74c6f2" + offshore wind (DC): "#74c6f2" + offshore wind dc: "#74c6f2" + # water + hydro: '#298c81' + hydro reservoir: '#298c81' + ror: '#3dbfb0' + run of river: '#3dbfb0' + hydroelectricity: '#298c81' + PHS: '#51dbcc' + hydro+PHS: "#08ad97" + # solar + solar: "#f9d002" + solar PV: "#f9d002" + solar thermal: '#ffbf2b' + residential rural solar thermal: '#f1c069' + services rural solar thermal: '#eabf61' + residential urban decentral solar thermal: '#e5bc5a' + services urban decentral solar thermal: '#dfb953' + urban central solar thermal: '#d7b24c' + solar rooftop: '#ffea80' + # gas + OCGT: '#e0986c' + OCGT marginal: '#e0986c' + OCGT-heat: '#e0986c' + gas boiler: '#db6a25' + gas boilers: '#db6a25' + gas boiler marginal: '#db6a25' + residential rural gas boiler: '#d4722e' + residential urban decentral gas boiler: '#cb7a36' + services rural gas boiler: '#c4813f' + services urban decentral gas boiler: '#ba8947' + urban central gas boiler: '#b0904f' + gas: '#e05b09' + fossil gas: '#e05b09' + natural gas: '#e05b09' + biogas to gas: '#e36311' + biogas to gas CC: '#e51245' + CCGT: '#a85522' + CCGT marginal: '#a85522' + allam: '#B98F76' + gas for industry co2 to atmosphere: '#692e0a' + gas for industry co2 to stored: '#8a3400' + gas for industry: '#853403' + gas for industry CC: '#692e0a' + gas pipeline: '#ebbca0' + gas pipeline new: '#a87c62' + # oil + oil: '#c9c9c9' + 
imported oil: '#a3a3a3' + oil boiler: '#adadad' + residential rural oil boiler: '#a9a9a9' + services rural oil boiler: '#a5a5a5' + residential urban decentral oil boiler: '#a1a1a1' + urban central oil boiler: '#9d9d9d' + services urban decentral oil boiler: '#999999' + agriculture machinery oil: '#949494' + shipping oil: "#808080" + land transport oil: '#afafaf' + # nuclear + Nuclear: '#ff8c00' + Nuclear marginal: '#ff8c00' + nuclear: '#ff8c00' + uranium: '#ff8c00' + # coal + Coal: '#545454' + coal: '#545454' + Coal marginal: '#545454' + coal for industry: '#343434' + solid: '#545454' + Lignite: '#826837' + lignite: '#826837' + Lignite marginal: '#826837' + # biomass + biogas: '#e3d37d' + biomass: '#baa741' + solid biomass: '#baa741' + solid biomass transport: '#baa741' + solid biomass for industry: '#7a6d26' + solid biomass for industry CC: '#47411c' + solid biomass for industry co2 from atmosphere: '#736412' + solid biomass for industry co2 to stored: '#47411c' + urban central solid biomass CHP: '#9d9042' + urban central solid biomass CHP CC: '#6c5d28' + biomass boiler: '#8A9A5B' + residential rural biomass boiler: '#a1a066' + residential urban decentral biomass boiler: '#b0b87b' + services rural biomass boiler: '#c6cf98' + services urban decentral biomass boiler: '#dde5b5' + biomass to liquid: '#32CD32' + BioSNG: '#123456' + # power transmission + lines: '#6c9459' + transmission lines: '#6c9459' + electricity distribution grid: '#97ad8c' + low voltage: '#97ad8c' + # electricity demand + Electric load: '#110d63' + electric demand: '#110d63' + electricity: '#110d63' + industry electricity: '#2d2a66' + industry new electricity: '#2d2a66' + agriculture electricity: '#494778' + # battery + EVs + battery: '#ace37f' + battery storage: '#ace37f' + battery charger: '#88a75b' + battery discharger: '#5d4e29' + home battery: '#80c944' + home battery storage: '#80c944' + home battery charger: '#5e8032' + home battery discharger: '#3c5221' + BEV charger: '#baf238' + V2G: 
'#e5ffa8' + land transport EV: '#baf238' + Li ion: '#baf238' + # hot water storage + water tanks: '#e69487' + residential rural water tanks: '#f7b7a3' + services rural water tanks: '#f3afa3' + residential urban decentral water tanks: '#f2b2a3' + services urban decentral water tanks: '#f1b4a4' + urban central water tanks: '#e9977d' + hot water storage: '#e69487' + hot water charging: '#e8998b' + urban central water tanks charger: '#b57a67' + residential rural water tanks charger: '#b4887c' + residential urban decentral water tanks charger: '#b39995' + services rural water tanks charger: '#b3abb0' + services urban decentral water tanks charger: '#b3becc' + hot water discharging: '#e99c8e' + urban central water tanks discharger: '#b9816e' + residential rural water tanks discharger: '#ba9685' + residential urban decentral water tanks discharger: '#baac9e' + services rural water tanks discharger: '#bbc2b8' + services urban decentral water tanks discharger: '#bdd8d3' + # heat demand + Heat load: '#cc1f1f' + heat: '#cc1f1f' + heat vent: '#aa3344' + heat demand: '#cc1f1f' + rural heat: '#ff5c5c' + residential rural heat: '#ff7c7c' + services rural heat: '#ff9c9c' + central heat: '#cc1f1f' + urban central heat: '#d15959' + urban central heat vent: '#a74747' + decentral heat: '#750606' + residential urban decentral heat: '#a33c3c' + services urban decentral heat: '#cc1f1f' + low-temperature heat for industry: '#8f2727' + process heat: '#ff0000' + agriculture heat: '#d9a5a5' + # heat supply + heat pumps: '#2fb537' + heat pump: '#2fb537' + air heat pump: '#36eb41' + residential urban decentral air heat pump: '#48f74f' + services urban decentral air heat pump: '#5af95d' + services rural air heat pump: '#5af95d' + urban central air heat pump: '#6cfb6b' + ground heat pump: '#2fb537' + residential rural ground heat pump: '#48f74f' + residential rural air heat pump: '#48f74f' + services rural ground heat pump: '#5af95d' + Ambient: '#98eb9d' + CHP: '#8a5751' + urban central gas CHP: 
'#8d5e56' + CHP CC: '#634643' + urban central gas CHP CC: '#6e4e4c' + CHP heat: '#8a5751' + CHP electric: '#8a5751' + district heating: '#e8beac' + resistive heater: '#d8f9b8' + residential rural resistive heater: '#bef5b5' + residential urban decentral resistive heater: '#b2f1a9' + services rural resistive heater: '#a5ed9d' + services urban decentral resistive heater: '#98e991' + urban central resistive heater: '#8cdf85' + retrofitting: '#8487e8' + building retrofitting: '#8487e8' + # hydrogen + H2 for industry: "#f073da" + H2 for shipping: "#ebaee0" + H2: '#bf13a0' + hydrogen: '#bf13a0' + retrofitted H2 boiler: '#e5a0d9' + SMR: '#870c71' + SMR CC: '#4f1745' + H2 liquefaction: '#d647bd' + hydrogen storage: '#bf13a0' + H2 Store: '#bf13a0' + H2 storage: '#bf13a0' + land transport fuel cell: '#6b3161' + H2 pipeline: '#f081dc' + H2 pipeline retrofitted: '#ba99b5' + H2 Fuel Cell: '#c251ae' + H2 fuel cell: '#c251ae' + H2 turbine: '#991f83' + H2 Electrolysis: '#ff29d9' + H2 electrolysis: '#ff29d9' + # ammonia + NH3: '#46caf0' + ammonia: '#46caf0' + ammonia store: '#00ace0' + ammonia cracker: '#87d0e6' + Haber-Bosch: '#076987' + # syngas + Sabatier: '#9850ad' + methanation: '#c44ce6' + methane: '#c44ce6' + # synfuels + Fischer-Tropsch: '#25c49a' + liquid: '#25c49a' + kerosene for aviation: '#a1ffe6' + naphtha for industry: '#57ebc4' + methanolisation: '#83d6d5' + methanol: '#468c8b' + shipping methanol: '#468c8b' + # co2 + CC: '#f29dae' + CCS: '#f29dae' + CO2 sequestration: '#f29dae' + DAC: '#ff5270' + co2 stored: '#f2385a' + co2 sequestered: '#f2682f' + co2: '#f29dae' + co2 vent: '#ffd4dc' + CO2 pipeline: '#f5627f' + # emissions + process emissions CC: '#000000' + process emissions: '#222222' + process emissions to stored: '#444444' + process emissions to atmosphere: '#888888' + oil emissions: '#aaaaaa' + shipping oil emissions: "#555555" + shipping methanol emissions: '#666666' + land transport oil emissions: '#777777' + agriculture machinery oil emissions: '#333333' + 
# other + shipping: '#03a2ff' + power-to-heat: '#2fb537' + power-to-gas: '#c44ce6' + power-to-H2: '#ff29d9' + power-to-liquid: '#25c49a' + gas-to-power/heat: '#ee8340' + waste: '#e3d37d' + other: '#000000' + geothermal: '#ba91b1' + AC: "#70af1d" + AC-AC: "#70af1d" + AC line: "#70af1d" + links: "#8a1caf" + HVDC links: "#8a1caf" + DC: "#8a1caf" + DC-DC: "#8a1caf" + DC link: "#8a1caf" + load: "#dd2e23" diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 0bca9ec5e..f998d958d 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -641,27 +641,23 @@ rule clean_osm_data: "../scripts/clean_osm_data.py" -if config["electricity_network"]["build_osm_network"] == True: - rule build_osm_network: - input: - substations=resources("osm/clean/substations.geojson"), - lines=resources("osm/clean/lines.geojson"), - country_shapes=resources("country_shapes.geojson"), - output: - lines=resources("osm/lines.csv"), - converters=resources("osm/converters.csv"), - transformers=resources("osm/transformers.csv"), - substations=resources("osm/buses.csv"), - lines_geojson=resources("osm/lines.geojson"), - converters_geojson=resources("osm/converters.geojson"), - transformers_geojson=resources("osm/transformers.geojson"), - substations_geojson=resources("osm/buses.geojson"), - log: - logs("build_osm_network.log"), - benchmark: - benchmarks("build_osm_network") - script: - "../scripts/build_osm_network.py" - -if config["electricity_network"]["build_osm_network"] == False: - print("Use prebuilt.") \ No newline at end of file +rule build_osm_network: + input: + substations=resources("osm/clean/substations.geojson"), + lines=resources("osm/clean/lines.geojson"), + country_shapes=resources("country_shapes.geojson"), + output: + lines=resources("osm/lines.csv"), + converters=resources("osm/converters.csv"), + transformers=resources("osm/transformers.csv"), + substations=resources("osm/buses.csv"), + lines_geojson=resources("osm/lines.geojson"), + 
converters_geojson=resources("osm/converters.geojson"), + transformers_geojson=resources("osm/transformers.geojson"), + substations_geojson=resources("osm/buses.geojson"), + log: + logs("build_osm_network.log"), + benchmark: + benchmarks("build_osm_network") + script: + "../scripts/build_osm_network.py" \ No newline at end of file diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 4cd5dd315..ac59548b3 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -979,20 +979,21 @@ def build_network( logger.info("Stage 2/5: AC and DC network: enabled") + # TODO pypsa-eur: Remove entirely after testing, not needed for PyPSA-Eur # Address the overpassing line issue Step 3/5 - if build_osm_network_config.get("split_overpassing_lines", False): - tol = build_osm_network_config.get("overpassing_lines_tolerance", 1) - logger.info("Stage 3/5: Avoid nodes overpassing lines: enabled with tolerance") + # if snakemake.config["electricity_network"]["osm_split_overpassing_lines"]: + # tol = snakemake.config["electricity_network"]["osm_overpassing_lines_tolerance"] + # logger.info("Stage 3/5: Avoid nodes overpassing lines: enabled with tolerance") - lines, buses = fix_overpassing_lines(lines, buses, distance_crs, tol=tol) - else: - logger.info("Stage 3/5: Avoid nodes overpassing lines: disabled") + # lines, buses = fix_overpassing_lines(lines, buses, distance_crs, tol=tol) + # else: + logger.info("Stage 3/5: Avoid nodes overpassing lines: disabled") # Add bus to countries with no buses buses = add_buses_to_empty_countries(countries_config, inputs.country_shapes, buses) # METHOD to merge buses with same voltage and within tolerance Step 4/5 - tol = build_osm_network_config.get("group_tolerance_buses", 5000) + tol = snakemake.config["electricity_network"]["osm_group_tolerance_buses"] logger.info( f"Stage 4/5: Aggregate close substations: enabled with tolerance {tol} m" ) diff --git a/scripts/solve_network.py b/scripts/solve_network.py 
index 67f39d16c..db4dc08b9 100644 --- a/scripts/solve_network.py +++ b/scripts/solve_network.py @@ -998,7 +998,11 @@ def extra_functionality(n, snapshots): if EQ_o := constraints["EQ"]: add_EQ_constraints(n, EQ_o.replace("EQ", "")) - if {"solar-hsat", "solar"}.issubset(config["renewable"].keys()): + if {"solar-hsat", "solar"}.issubset( + config["electricity"]["renewable_carriers"] + ) and {"solar-hsat", "solar"}.issubset( + config["electricity"]["extendable_carriers"]["Generator"] + ): add_solar_potential_constraints(n, config) add_battery_constraints(n) From 095d936c43264388c5c0826724f5efb82474b8d0 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 22 May 2024 10:07:26 +0200 Subject: [PATCH 021/100] Fixes. --- rules/build_electricity.smk | 2 +- scripts/build_osm_network.py | 188 +---------------------------------- scripts/clean_osm_data.py | 13 +-- 3 files changed, 11 insertions(+), 192 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index f998d958d..ec4c56f60 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -119,7 +119,7 @@ if config["electricity_network"]["base_network"] == "osm": eg_converters=resources("osm/converters.csv"), eg_transformers=resources("osm/transformers.csv"), links_p_nom="data/links_p_nom.csv", - links_tyndp="data/links_tyndp_osm.csv", + links_tyndp="data/links_tyndp.csv", country_shapes=resources("country_shapes.geojson"), offshore_shapes=resources("offshore_shapes.geojson"), europe_shape=resources("europe_shape.geojson"), diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index ac59548b3..467a7b795 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -760,167 +760,6 @@ def _split_linestring_by_point(linestring, points): return list_linestrings -def fix_overpassing_lines(lines, buses, distance_crs, tol=1): - """ - Function to avoid buses overpassing lines with no connection when the bus - is within a given tolerance from the 
line. - - Parameters - ---------- - lines : GeoDataFrame - Geodataframe of lines - buses : GeoDataFrame - Geodataframe of substations - tol : float - Tolerance in meters of the distance between the substation and the line - below which the line will be split - """ - - lines_to_add = [] # list of lines to be added - lines_to_split = [] # list of lines that have been split - - lines_epsgmod = lines.to_crs(distance_crs) - buses_epsgmod = buses.to_crs(distance_crs) - - # set tqdm options for substation ids - tqdm_kwargs_substation_ids = dict( - ascii=False, - unit=" lines", - total=lines.shape[0], - desc="Verify lines overpassing nodes ", - ) - - for l in tqdm(lines.index, **tqdm_kwargs_substation_ids): - # bus indices being within tolerance from the line - bus_in_tol_epsg = buses_epsgmod[ - buses_epsgmod.geometry.distance(lines_epsgmod.geometry.loc[l]) <= tol - ] - - # exclude endings of the lines - bus_in_tol_epsg = bus_in_tol_epsg[ - ( - ( - bus_in_tol_epsg.geometry.distance( - lines_epsgmod.geometry.loc[l].boundary.geoms[0] - ) - > tol - ) - | ( - bus_in_tol_epsg.geometry.distance( - lines_epsgmod.geometry.loc[l].boundary.geoms[1] - ) - > tol - ) - ) - ] - - if not bus_in_tol_epsg.empty: - # add index of line to split - lines_to_split.append(l) - - buses_locs = buses.geometry.loc[bus_in_tol_epsg.index] - - # get new line geometries - new_geometries = _split_linestring_by_point(lines.geometry[l], buses_locs) - n_geoms = len(new_geometries) - - # create temporary copies of the line - df_append = gpd.GeoDataFrame([lines.loc[l]] * n_geoms) - # update geometries - df_append["geometry"] = new_geometries - # update name of the line - df_append["line_id"] = [ - str(df_append["line_id"].iloc[0]) + f"_{id}" for id in range(n_geoms) - ] - - lines_to_add.append(df_append) - - if not lines_to_add: - return lines, buses - - df_to_add = gpd.GeoDataFrame(pd.concat(lines_to_add, ignore_index=True)) - df_to_add.set_crs(lines.crs, inplace=True) - df_to_add.set_index(lines.index[-1] + 
df_to_add.index, inplace=True) - - # update length - df_to_add["length"] = df_to_add.to_crs(distance_crs).geometry.length - - # update line endings - df_to_add = line_endings_to_bus_conversion(df_to_add) - - # remove original lines - lines.drop(lines_to_split, inplace=True) - - lines = gpd.GeoDataFrame( - pd.concat([lines, df_to_add], ignore_index=True).reset_index(drop=True), - crs=lines.crs, - ) - - return lines, buses - - -def add_buses_to_empty_countries(country_list, fp_country_shapes, buses): - """ - Function to add a bus for countries missing substation data. - """ - country_shapes = gpd.read_file(fp_country_shapes).set_index("name")["geometry"] - bus_country_list = buses["country"].unique().tolist() - - # it may happen that bus_country_list contains entries not relevant as a country name (e.g. "not found") - # difference can't give negative values; the following will return only relevant country names - no_data_countries = list(set(country_list).difference(set(bus_country_list))) - - if len(no_data_countries) > 0: - logger.info( - f"No buses for the following countries: {no_data_countries}. Adding a node for everyone of them." 
- ) - no_data_countries_shape = ( - country_shapes[country_shapes.index.isin(no_data_countries) == True] - .reset_index() - .to_crs(geo_crs) - ) - length = len(no_data_countries) - df = gpd.GeoDataFrame( - { - "voltage": [220000] * length, - "country": no_data_countries_shape["name"], - "x": no_data_countries_shape["geometry"].centroid.x, - "y": no_data_countries_shape["geometry"].centroid.y, - "bus_id": np.arange(len(buses) + 1, len(buses) + (length + 1), 1), - "station_id": [np.nan] * length, - # All lines for the countries with NA bus data are assumed to be AC - "dc": [False] * length, - "under_construction": [False] * length, - "tag_area": [0.0] * length, - "symbol": ["substation"] * length, - "tag_substation": ["transmission"] * length, - "geometry": no_data_countries_shape["geometry"].centroid, - "substation_lv": [True] * length, - }, - crs=geo_crs, - ).astype( - buses.dtypes.to_dict() - ) # keep the same dtypes as buses - buses = gpd.GeoDataFrame( - pd.concat([buses, df], ignore_index=True).reset_index(drop=True), - crs=buses.crs, - ) - - # update country list by buses dataframe - bus_country_list = buses["country"].unique().tolist() - - non_allocated_countries = list( - set(country_list).symmetric_difference(set(bus_country_list)) - ) - - if len(non_allocated_countries) > 0: - logger.error( - f"There following countries could not be allocated properly: {non_allocated_countries}" - ) - - return buses - - def build_network( inputs, outputs, @@ -962,7 +801,7 @@ def build_network( } } - logger.info("Stage 1/5: Read input data") + logger.info("Read input data.") buses = read_geojson( inputs["substations"], osm_clean_columns["substation"].keys(), @@ -976,33 +815,16 @@ def build_network( ) lines = line_endings_to_bus_conversion(lines) - - logger.info("Stage 2/5: AC and DC network: enabled") - - # TODO pypsa-eur: Remove entirely after testing, not needed for PyPSA-Eur - # Address the overpassing line issue Step 3/5 - # if 
snakemake.config["electricity_network"]["osm_split_overpassing_lines"]: - # tol = snakemake.config["electricity_network"]["osm_overpassing_lines_tolerance"] - # logger.info("Stage 3/5: Avoid nodes overpassing lines: enabled with tolerance") - - # lines, buses = fix_overpassing_lines(lines, buses, distance_crs, tol=tol) - # else: - logger.info("Stage 3/5: Avoid nodes overpassing lines: disabled") - # Add bus to countries with no buses - buses = add_buses_to_empty_countries(countries_config, inputs.country_shapes, buses) - - # METHOD to merge buses with same voltage and within tolerance Step 4/5 + # METHOD to merge buses with same voltage and within tolerance tol = snakemake.config["electricity_network"]["osm_group_tolerance_buses"] logger.info( - f"Stage 4/5: Aggregate close substations: enabled with tolerance {tol} m" + f"Aggregate close substations: Enabled with tolerance {tol} m" ) lines, buses = merge_stations_lines_by_station_id_and_voltage( lines, buses, geo_crs, distance_crs, tol=tol ) - logger.info("Stage 5/5: Add augmented substation to country with no data") - # Recalculate lengths of lines utm = lines.estimate_utm_crs(datum_name = "WGS 84") lines["length"] = lines.to_crs(utm).length @@ -1019,7 +841,6 @@ def build_network( if not os.path.exists(outputs["lines"]): os.makedirs(os.path.dirname(outputs["lines"]), exist_ok=True) - ### Convert output to pypsa-eur friendly format # Rename "substation" in buses["symbol"] to "Substation" buses["symbol"] = buses["symbol"].replace({"substation": "Substation"}) @@ -1030,7 +851,6 @@ def build_network( transformers.set_index("transformer_id", inplace=True) buses.set_index("bus_id", inplace=True) - # Convert voltages from V to kV lines["voltage"] = lines["voltage"] / 1000 transformers["voltage_bus0"], transformers["voltage_bus1"] = transformers["voltage_bus0"] / 1000, \ @@ -1051,8 +871,6 @@ def build_network( lines_csv = lines[cols_lines_csv] lines = lines[cols_lines] - - to_csv_nafix(lines_csv, outputs["lines"], 
quotechar="'") # Generate CSV to_csv_nafix(converters, outputs["converters"], quotechar="'") # Generate CSV to_csv_nafix(transformers, outputs["transformers"], quotechar="'") # Generate CSV diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index c4f99858c..cde44d412 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -468,10 +468,12 @@ def _import_lines_and_cables(path_lines): Import lines and cables from the given input paths. Parameters: - - path_lines (dict): A dictionary containing the input paths for lines and cables data. + - path_lines (dict): A dictionary containing the input paths for lines and + cables data. Returns: - - df_lines (DataFrame): A DataFrame containing the imported lines and cables data. + - df_lines (DataFrame): A DataFrame containing the imported lines and + cables data. """ columns = ["id", "bounds", "nodes", "geometry", "country", "power", "cables", "circuits", "frequency", "voltage", @@ -1078,8 +1080,8 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): Parameters: - gdf_lines (GeoDataFrame): A GeoDataFrame containing lines with 'line_id' and 'geometry' columns. - - gdf_substations_polygon (GeoDataFrame): A GeoDataFrame containing substation - polygons. + - gdf_substations_polygon (GeoDataFrame): A GeoDataFrame containing + substation polygons. Returns: GeoDataFrame: A new GeoDataFrame without lines within substation polygons. @@ -1196,5 +1198,4 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): logger.info(f"Exporting clean lines to {output_lines}") gdf_lines.to_file(output_lines, driver="GeoJSON") - logger.info("Cleaning OSM data completed.") - \ No newline at end of file + logger.info("Cleaning OSM data completed.") \ No newline at end of file From 6032f9676a5a302130e08c7cecae5c1c1a4dec72 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 22 May 2024 15:54:53 +0200 Subject: [PATCH 022/100] Bug fixes. 
--- scripts/base_network_osm.py | 7 +- scripts/build_osm_network.py | 260 ++++++++++++++++++++++------------- scripts/clean_osm_data.py | 1 + 3 files changed, 168 insertions(+), 100 deletions(-) diff --git a/scripts/base_network_osm.py b/scripts/base_network_osm.py index 44b2636d6..81b7339e1 100644 --- a/scripts/base_network_osm.py +++ b/scripts/base_network_osm.py @@ -170,11 +170,14 @@ def _load_buses_from_eg(eg_buses, europe_shape, config_elec): # buses.v_nom.isin(config_elec["voltages"]) | buses.v_nom.isnull() # ) + v_nom_min = min(config_elec["voltages"]) + v_nom_max = max(config_elec["voltages"]) + # Quick fix: - buses_with_v_nom_to_keep_b = (min(config_elec["voltages"]) <= buses.v_nom) & (buses.v_nom <= max(config_elec["voltages"])) + buses_with_v_nom_to_keep_b = (v_nom_min <= buses.v_nom) & (buses.v_nom <= v_nom_max) logger.info( - f'Removing buses with voltages {pd.Index(buses.v_nom.unique()).dropna().difference(config_elec["voltages"])}' + f'Removing buses outside of range {v_nom_min} - {v_nom_max} V' ) return pd.DataFrame(buses.loc[buses_in_europe_b & buses_with_v_nom_to_keep_b]) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 467a7b795..19f9f4ad1 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -39,6 +39,16 @@ def read_csv_nafix(file, **kwargs): def save_to_geojson(df, fn): + """ + Save a (Geo)DataFrame to a GeoJSON file. + + Parameters: + - df: The (Geo)DataFrame to be saved. + - fn: The filename (including the path) of the output GeoJSON file. + + Returns: + None + """ if os.path.exists(fn): os.unlink(fn) # remove file if it exists @@ -82,9 +92,27 @@ def read_geojson(fn, cols=[], dtype=None, crs="EPSG:4326"): def to_csv_nafix(df, path, **kwargs): + """ + Write a pandas DataFrame to a CSV file with NA values replaced. + + Parameters: + - df: pandas DataFrame + The DataFrame to be written to the CSV file. + - path: str + The file path where the CSV file will be saved. 
+ - **kwargs: keyword arguments + Additional arguments to be passed to the `to_csv` function of pandas. + + Returns: + - None + + If the DataFrame is not empty or does not have empty columns, it will be + written to the CSV file with NA values replaced by the first value in the + `NA_VALUES` list. If the DataFrame is empty or has empty columns, an empty + file will be created at the specified path. + """ if "na_rep" in kwargs: del kwargs["na_rep"] - # if len(df) > 0: if not df.empty or not df.columns.empty: return df.to_csv(path, **kwargs, na_rep=NA_VALUES[0]) else: @@ -93,6 +121,20 @@ def to_csv_nafix(df, path, **kwargs): def line_endings_to_bus_conversion(lines): + """ + Converts line endings to bus connections. + + This function takes a df of lines and converts the line endings to bus + connections. It performs the necessary operations to ensure that the line + endings are properly connected to the buses in the network. + + Parameters: + lines (DataFrame) + + Returns: + lines (DataFrame) + + """ # Assign to every line a start and end point lines["bounds"] = lines["geometry"].boundary # create start and end point @@ -414,7 +456,7 @@ def get_transformers(buses, lines): return df_transformers -def get_converters(buses, lines): +def get_converters(buses): """ Function to create fake converter lines that connect buses of the same station_id of different polarities. @@ -578,16 +620,8 @@ def set_lv_substations(buses): return buses -# Note tolerance = 0.01 means around 700m -# TODO: the current tolerance is high to avoid an issue in the Nigeria case where line 565939360-1 -# seems to be interconnected to both ends, but at the eastern one, the node is actually not connected -# another line seems to be exactly touching the node, but from the data point of view it only fly over it. 
-# There may be the need to split a line in several segments in the case the line is within tolerance with -# respect to a node - - def merge_stations_lines_by_station_id_and_voltage( - lines, buses, geo_crs, distance_crs, tol=5000 + lines, buses, distance_crs, tol=5000 ): """ Function to merge close stations and adapt the line datasets to adhere to @@ -595,7 +629,7 @@ def merge_stations_lines_by_station_id_and_voltage( """ logger.info( - "Stage 4a/5: Set substation ids with tolerance of %.2f km" % (tol / 1000) + " - Setting substation ids with tolerance of %.2f m" % (tol) ) # TODO pypsa-eur: Add this fix to pypsa-earth: Buses should not be clustered geographically if they are different @@ -645,7 +679,10 @@ def merge_stations_lines_by_station_id_and_voltage( if dc_boundary_points.empty: all_dc_boundary_points = dc_boundary_points else: - all_dc_boundary_points = pd.concat([all_dc_boundary_points, dc_boundary_points], ignore_index = True) + if all_dc_boundary_points.empty: + all_dc_boundary_points = dc_boundary_points + else: + all_dc_boundary_points = pd.concat([all_dc_boundary_points, dc_boundary_points], ignore_index = True) # TODO pypsa-eur: Add to pypsa-earth for all related entries on is_dclink_boundary_point @@ -655,7 +692,7 @@ def merge_stations_lines_by_station_id_and_voltage( lambda p: any([p.within(l) for l in all_dc_boundary_points]) ) - logger.info("Stage 4b/5: Merge substations with the same id") + logger.info(" - Merging substations with the same id") # merge buses with same station id and voltage if not buses.empty: @@ -665,7 +702,7 @@ def merge_stations_lines_by_station_id_and_voltage( buses = pd.concat([buses_ac, buses_dc], ignore_index=True) set_substations_ids(buses, distance_crs, tol=tol) - logger.info("Stage 4c/5: Specify the bus ids of the line endings") + logger.info(" - Specifying the bus ids of the line endings") # set the bus ids to the line dataset lines, buses = set_lines_ids(lines, buses, distance_crs) @@ -678,7 +715,7 @@ def 
merge_stations_lines_by_station_id_and_voltage( # set substation_lv set_lv_substations(buses) - logger.info("Stage 3d/4: Add converters to lines") + logger.info(" - Adding converters to lines") # append fake converters # lines = pd.concat([lines, converters], ignore_index=True) @@ -691,80 +728,9 @@ def merge_stations_lines_by_station_id_and_voltage( return lines, buses -def create_station_at_equal_bus_locations( - lines, buses, geo_crs, distance_crs, tol=5000 -): - # V1. Create station_id at same bus location - # - We saw that buses are not connected exactly at one point, they are - # usually connected to a substation "area" (analysed on maps) - # - Create station_id at exactly the same location might therefore be not - # always correct - # - Though as you can see below, it might be still sometime the case. - # Examples are **station 4** (2 lines with the same voltage connect at the - # same point) and **station 23** (4 lines with two different voltages connect - # at the same point) - # TODO: Filter out the generator lines - defined as going from generator to - # the next station which is connected to a load. Excluding generator - # lines make probably sense because they are not transmission expansion - # relevant. For now we simplify and include generator lines. - - # If same location/geometry make station - bus_all = buses - - # set substation ids - set_substations_ids(buses, distance_crs, tol=tol) - - # set the bus ids to the line dataset - lines, buses = set_lines_ids(lines, buses, distance_crs) - - # update line endings - lines = line_endings_to_bus_conversion(lines) - - # For each station number with multiple buses make lowest voltage `substation_lv = TRUE` - set_lv_substations(bus_all) - - # TRY: Keep only buses that are not duplicated & lv_substation = True - # TODO: Check if this is necessary. What effect do duplicates have? 
- bus_all = bus_all[bus_all["substation_lv"] == True] - - lines = connect_stations_same_station_id(lines, buses) - - return lines, buses - - -def _split_linestring_by_point(linestring, points): - """ - Function to split a linestring geometry by multiple inner points. - - Parameters - ---------- - lstring : LineString - Linestring of the line to be split - points : list - List of points to split the linestring - - Return - ------ - list_lines : list - List of linestring to split the line - """ - - list_linestrings = [linestring] - - for p in points: - # execute split to all lines and store results - temp_list = [split(l, p) for l in list_linestrings] - # nest all geometries - list_linestrings = [lstring for tval in temp_list for lstring in tval.geoms] - - return list_linestrings - - def build_network( inputs, outputs, - build_osm_network_config, - countries_config, geo_crs, distance_crs, ): @@ -801,7 +767,7 @@ def build_network( } } - logger.info("Read input data.") + logger.info("Reading input data.") buses = read_geojson( inputs["substations"], osm_clean_columns["substation"].keys(), @@ -819,10 +785,10 @@ def build_network( # METHOD to merge buses with same voltage and within tolerance tol = snakemake.config["electricity_network"]["osm_group_tolerance_buses"] logger.info( - f"Aggregate close substations: Enabled with tolerance {tol} m" + f"Aggregating close substations: Enabled with tolerance {tol} m" ) lines, buses = merge_stations_lines_by_station_id_and_voltage( - lines, buses, geo_crs, distance_crs, tol=tol + lines, buses, distance_crs, tol=tol ) # Recalculate lengths of lines @@ -833,9 +799,9 @@ def build_network( transformers = get_transformers(buses, lines) # get converters: currently modelled as links connecting buses with different polarity - converters = get_converters(buses, lines) + converters = get_converters(buses) - logger.info("Save outputs") + logger.info("Saving outputs") # create clean directory if not already exist if not 
os.path.exists(outputs["lines"]): @@ -870,7 +836,7 @@ def build_network( cols_lines_csv = ["bus0", "bus1", "voltage", "circuits", "tag_frequency", "length", "underground", "under_construction", "geometry"] lines_csv = lines[cols_lines_csv] lines = lines[cols_lines] - + to_csv_nafix(lines_csv, outputs["lines"], quotechar="'") # Generate CSV to_csv_nafix(converters, outputs["converters"], quotechar="'") # Generate CSV to_csv_nafix(transformers, outputs["transformers"], quotechar="'") # Generate CSV @@ -894,16 +860,50 @@ def build_network( # Function to check if two lines are connected def are_lines_connected(line1, line2): - # return (line1['geometry'].touches(line2['geometry'])) + """ + Check if two lines are connected. + + Parameters: + line1 (dict): A dictionary representing the first line. + line2 (dict): A dictionary representing the second line. + + Returns: + tuple: A tuple of boolean values indicating the connection status between + the lines. + + The tuple contains four elements: + - True if the first line's bus_0_coors is almost equal to the second line's + bus_0_coors, False otherwise. + - True if the first line's bus_0_coors is almost equal to the second line's + bus_1_coors, False otherwise. + - True if the first line's bus_1_coors is almost equal to the second line's + bus_0_coors, False otherwise. + - True if the first line's bus_1_coors is almost equal to the second line's + bus_1_coors, False otherwise. + """ return ( are_almost_equal(line1["bus_0_coors"], line2["bus_0_coors"]), are_almost_equal(line1["bus_0_coors"], line2["bus_1_coors"]), are_almost_equal(line1["bus_1_coors"], line2["bus_0_coors"]), are_almost_equal(line1["bus_1_coors"], line2["bus_1_coors"]) - ) + ) def _dfs(adj_matrix, visited, current_vertex, path): + """ + Perform a depth-first search (DFS) on a graph represented by an adjacency + matrix. + + Parameters: + - adj_matrix (list of lists): The adjacency matrix representing the graph. 
+ - visited (list of bool): A list to keep track of visited vertices. + - current_vertex (int): The current vertex being visited. + - path (list): The path of vertices visited so far. + + Returns: + - path (list): The path of vertices visited during the DFS. + + """ visited[current_vertex] = True path.append(current_vertex) for neighbor in range(len(adj_matrix)): @@ -914,6 +914,17 @@ def _dfs(adj_matrix, visited, current_vertex, path): # Returns all connected paths as a vector def find_paths(adj_matrix): + """ + Find all paths in a graph represented by an adjacency matrix. + + Parameters: + - adj_matrix (list of lists): The adjacency matrix representing the graph. + + Returns: + - paths (list of lists): A list of lists, where each inner list represents + a path in the graph. + + """ visited = [False] * len(adj_matrix) paths = [] for vertex in range(len(adj_matrix)): @@ -923,6 +934,7 @@ def find_paths(adj_matrix): paths.append(path) return paths + def are_almost_equal(point1, point2, tolerance=1e-6): """ Check if two Shapely points are almost equal with a given tolerance. @@ -938,6 +950,61 @@ def are_almost_equal(point1, point2, tolerance=1e-6): return abs(point1.x - point2.x) < tolerance and abs(point1.y - point2.y) < tolerance +def merge_linestrings(gdf): + """ + Merge LineStrings in a GeoDataFrame wherever the endpoints match. + + Parameters: + gdf (GeoDataFrame): A GeoDataFrame containing LineString geometries. + + Returns: + GeoDataFrame: A GeoDataFrame with merged LineString geometries. 
+ """ + gdf = gdf.copy() + if len(gdf) == 1: + return gdf + + lines = list(gdf.geometry) + merged_lines = [] + while lines: + line = lines.pop(0) + merged_line = line + i = 0 + while i < len(lines): + if are_almost_equal( + Point(merged_line.coords[-1]), + Point(lines[i].coords[0]) + ): + merged_line = LineString(list(merged_line.coords) + list(lines.pop(i).coords[1:])) + i = 0 # Restart the scan after merging + elif are_almost_equal( + Point(merged_line.coords[0]), + Point(lines[i].coords[-1]) + ): + merged_line = LineString(list(lines.pop(i).coords)[:-1] + list(merged_line.coords)) + i = 0 # Restart the scan after merging + elif are_almost_equal( + Point(merged_line.coords[-1]), + Point(lines[i].coords[-1]) + ): + merged_line = LineString(list(merged_line.coords) + list(lines.pop(i).coords[::-1])[1:]) + i = 0 # Restart the scan after merging + elif are_almost_equal( + Point(merged_line.coords[0]), + Point(lines[i].coords[0]) + ): + merged_line = LineString(list(lines.pop(i).coords[::-1])[:-1] + list(merged_line.coords)) + i = 0 # Restart the scan after merging + else: + i += 1 + merged_lines.append(merged_line) + no_coordinates = [len(merged_lines[i].coords) for i in range(len(merged_lines))] + max_index = np.argmax(no_coordinates) + merged_lines = [merged_lines[max_index]] + + return gpd.GeoDataFrame(geometry=merged_lines, crs=gdf.crs) + + if __name__ == "__main__": # Detect running outside of snakemake and mock snakemake for testing if "snakemake" not in globals(): @@ -952,7 +1019,6 @@ def are_almost_equal(point1, point2, tolerance=1e-6): geo_crs = "EPSG:4326" distance_crs = "EPSG:3035" - build_osm_network = snakemake.config["build_osm_network"] countries = snakemake.config["countries"] with memory_logger( @@ -961,8 +1027,6 @@ def are_almost_equal(point1, point2, tolerance=1e-6): build_network( snakemake.input, snakemake.output, - build_osm_network, - countries, geo_crs, distance_crs, ) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 
cde44d412..dadc2c902 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -497,6 +497,7 @@ def _import_lines_and_cables(path_lines): df["id"] = df["id"].astype(str) df["country"] = country + # col_tags = ["power", "cables", "circuits", "frequency", "voltage", "wires", "capacity", "rating"] col_tags = ["power", "cables", "circuits", "frequency", "voltage", "wires"] tags = pd.json_normalize(df["tags"]) \ From 7bd83b4fa14761ba8afedb2fcb61f8ff7d75ac64 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 22 May 2024 15:57:26 +0200 Subject: [PATCH 023/100] Updated default config --- config/config.default.yaml | 146 ++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 73 deletions(-) diff --git a/config/config.default.yaml b/config/config.default.yaml index d742f806f..8ccafab79 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -15,13 +15,13 @@ private: entsoe_api: remote: - ssh: "z1" - path: "~/scratch/projects/pypsa-eur" + ssh: "" + path: "" # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#run run: prefix: "" - name: "europe-osm-update-hydro" + name: "" scenarios: enable: false file: config/scenarios.yaml @@ -40,15 +40,13 @@ scenario: simpl: - '' ll: - - v1.0 # TODO mit und ohne Netzausbau v1.0 + - vopt clusters: - - 50 + - 37 - 128 - 256 - - 512 - # - 1024 opts: - - 'Co2L0-25H' + - '' sector_opts: - '' planning_horizons: @@ -58,20 +56,7 @@ scenario: - 2050 # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#countries -# countries: ["NO"] countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK'] -# countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MD', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK', 
'UA'] - -# Settings related to the high-voltage electricity grid -electricity_network: - base_network: "osm" # "osm" or "gridkit" - build_osm_network: true # If 'true', the network will be built from scratch (retrieving OSM data, cleaning, and building) and stored under resources, 'false' will use snapshots in data/osm - -build_osm_network: # Options of the build_osm_network script; osm = OpenStreetMap - group_tolerance_buses: 5000 # [m] (default 5000) Tolerance in meters of the close buses to merge - split_overpassing_lines: false # When True, lines overpassing buses are splitted and connected to the bueses - overpassing_lines_tolerance: 1 # [m] (default 1) Tolerance to identify lines overpassing buses - force_ac: false # When true, it forces all components (lines and substation) to be AC-only. To be used if DC assets create problem. # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#snapshots snapshots: @@ -81,19 +66,20 @@ snapshots: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#enable enable: - retrieve: true + retrieve: auto prepare_links_p_nom: false retrieve_databundle: true - retrieve_sector_databundle: true retrieve_cost_data: true build_cutout: false - retrieve_irena: false retrieve_cutout: true - build_natura_raster: false - retrieve_natura_raster: true custom_busmap: false drop_leap_day: true +# Settings related to the high-voltage electricity grid +electricity_network: + base_network: "osm" # "osm" or "gridkit" + osm_group_tolerance_buses: 5000 # [m] (default 5000) Tolerance in meters of the close buses to merge + # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget co2_budget: 2020: 0.701 @@ -106,7 +92,7 @@ co2_budget: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity electricity: - voltages: [200., 220., 300., 380., 400., 500., 750.] + voltages: [200., 220., 300., 380., 500., 750.] 
gaslimit_enable: false gaslimit: false co2limit_enable: false @@ -142,7 +128,7 @@ electricity: year: 2020 expansion_limit: false technology_mapping: - Offshore: [offwind-ac, offwind-dc] + Offshore: [offwind-ac, offwind-dc, offwind-float] Onshore: [onwind] PV: [solar] @@ -210,7 +196,7 @@ renewable: luisa: false # [0, 5230] natura: true ship_threshold: 400 - max_depth: 50 + max_depth: 60 max_shore_distance: 30000 excluder_resolution: 200 clip_p_max_pu: 1.e-2 @@ -226,10 +212,28 @@ renewable: luisa: false # [0, 5230] natura: true ship_threshold: 400 - max_depth: 50 + max_depth: 60 min_shore_distance: 30000 excluder_resolution: 200 clip_p_max_pu: 1.e-2 + offwind-float: + cutout: europe-2013-era5 + resource: + method: wind + turbine: NREL_ReferenceTurbine_5MW_offshore + # ScholzPhd Tab 4.3.1: 10MW/km^2 + capacity_per_sqkm: 2 + correction_factor: 0.8855 + # proxy for wake losses + # from 10.1016/j.energy.2018.08.153 + # until done more rigorously in #153 + corine: [44, 255] + natura: true + ship_threshold: 400 + excluder_resolution: 200 + min_depth: 60 + max_depth: 1000 + clip_p_max_pu: 1.e-2 solar: cutout: europe-2013-sarah resource: @@ -289,20 +293,26 @@ lines: 400.: "Al/St 240/40 4-bundle 380.0" 500.: "Al/St 240/40 4-bundle 380.0" 750.: "Al/St 560/50 4-bundle 750.0" - dc_types: # setting only for osm + dc_types: 200.: "HVDC XLPE 1000" - 220.: "HVDC XLPE 1000" + 250.: "HVDC XLPE 1000" + 270.: "HVDC XLPE 1000" + 285.: "HVDC XLPE 1000" 300.: "HVDC XLPE 1000" - 750.: "HVDC XLPE 1000" - 380.: "HVDC XLPE 1000" + 320.: "HVDC XLPE 1000" + 350.: "HVDC XLPE 1000" + 380.: "HVDC Oil filled 1400" 400.: "HVDC XLPE 1000" - 500.: "HVDC XLPE 1000" + 450.: "HVDC XLPE 1000" + 515.: "HVDC XLPE 1000" + 525.: "HVDC XLPE 1000" + 600.: "HVDC XLPE 1000" s_max_pu: 0.7 s_nom_max: .inf max_extension: 20000 #MW length_factor: 1.25 reconnect_crimea: true - under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity + under_construction: 'keep' # 'zero': 
set capacity to zero, 'remove': remove, 'keep': with full capacity dynamic_line_rating: activate: false cutout: europe-2013-era5 @@ -315,7 +325,7 @@ links: p_max_pu: 1.0 p_nom_max: .inf max_extension: 30000 #MW - include_tyndp: false + include_tyndp: true under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#transformers @@ -578,7 +588,7 @@ sector: - nearshore # within 50 km of sea # - offshore ammonia: false - min_part_load_fischer_tropsch: 0.7 + min_part_load_fischer_tropsch: 0.5 min_part_load_methanolisation: 0.3 min_part_load_methanation: 0.3 use_fischer_tropsch_waste_heat: true @@ -696,6 +706,9 @@ industry: 2040: 0.12 2045: 0.16 2050: 0.20 + HVC_environment_sequestration_fraction: 0. + waste_to_energy: false + waste_to_energy_cc: false sector_ratios_fraction_future: 2020: 0.0 2025: 0.1 @@ -829,7 +842,7 @@ solving: solver_options: highs-default: - # refer to https://ergo-code.github.io/HiGHS/options/definitions.html#solver + # refer to https://ergo-code.github.io/HiGHS/dev/options/definitions/ threads: 4 solver: "ipm" run_crossover: "off" @@ -882,23 +895,17 @@ solving: cbc-default: {} # Used in CI glpk-default: {} # Used in CI - mem_mb: 100000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 - runtime: 12h #runtime in humanfriendly style https://humanfriendly.readthedocs.io/en/latest/ + mem_mb: 30000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 + runtime: 6h #runtime in humanfriendly style https://humanfriendly.readthedocs.io/en/latest/ # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#plotting - plotting: map: - boundaries: - eu_node_location: - x: -5.5 - y: 46. 
- # costs_max: 1000 - # costs_threshold: 0.0000001 - # energy_max: - # energy_min: - # energy_threshold: 0.000001 + boundaries: [-11, 30, 34, 71] + color_geomap: + ocean: white + land: white projection: name: "EqualEarth" # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: @@ -906,34 +913,21 @@ plotting: # central_longitude: 10. # central_latitude: 50. # standard_parallels: [35, 65] - -# plotting: -# map: -# boundaries: [-11, 30, 34, 71] -# color_geomap: -# ocean: white -# land: white -# projection: -# name: "EqualEarth" -# # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: -# # name: "LambertConformal" -# # central_longitude: 10. -# # central_latitude: 50. -# # standard_parallels: [35, 65] -# eu_node_location: -# x: -5.5 -# y: 46. -# costs_max: 1000 -# costs_threshold: 1 -# energy_max: 20000 -# energy_min: -20000 -# energy_threshold: 50. + eu_node_location: + x: -5.5 + y: 46. + costs_max: 1000 + costs_threshold: 1 + energy_max: 20000 + energy_min: -20000 + energy_threshold: 50. 
nice_names: OCGT: "Open-Cycle Gas" CCGT: "Combined-Cycle Gas" offwind-ac: "Offshore Wind (AC)" offwind-dc: "Offshore Wind (DC)" + offwind-float: "Offshore Wind (Floating)" onwind: "Onshore Wind" solar: "Solar" PHS: "Pumped Hydro Storage" @@ -958,6 +952,9 @@ plotting: offwind-dc: "#74c6f2" offshore wind (DC): "#74c6f2" offshore wind dc: "#74c6f2" + offwind-float: "#b5e2fa" + offshore wind (Float): "#b5e2fa" + offshore wind float: "#b5e2fa" # water hydro: '#298c81' hydro reservoir: '#298c81' @@ -1216,3 +1213,6 @@ plotting: DC-DC: "#8a1caf" DC link: "#8a1caf" load: "#dd2e23" + waste CHP: '#e3d37d' + waste CHP CC: '#e3d3ff' + HVC to air: 'k' From d30ad7f632b17e58af6d13f75441c8c55c33e0b3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 14:30:21 +0000 Subject: [PATCH 024/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- rules/build_electricity.smk | 26 +- scripts/base_network_osm.py | 61 ++- scripts/build_osm_network.py | 270 +++++++------ scripts/clean_osm_data.py | 752 ++++++++++++++++++++--------------- scripts/retrieve_osm_data.py | 113 +++--- 5 files changed, 710 insertions(+), 512 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index ec4c56f60..80789900a 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -66,6 +66,7 @@ rule build_powerplants: if config["electricity_network"]["base_network"] == "gridkit": + rule base_network: params: countries=config_provider("countries"), @@ -104,6 +105,7 @@ if config["electricity_network"]["base_network"] == "gridkit": if config["electricity_network"]["base_network"] == "osm": + rule base_network: params: countries=config_provider("countries"), @@ -618,17 +620,29 @@ rule retrieve_osm_data: log: logs("retrieve_osm_data_{country}.log"), resources: - cores = 2, threads= 1, + cores=2, + threads=1, script: 
"../scripts/retrieve_osm_data.py" rule clean_osm_data: input: - cables_way=[f"data/osm/raw/{country}/cables_way.json" for country in config["countries"]], - lines_way=[f"data/osm/raw/{country}/lines_way.json" for country in config["countries"]], - substations_way=[f"data/osm/raw/{country}/substations_way.json" for country in config["countries"]], - substations_relation=[f"data/osm/raw/{country}/substations_relation.json" for country in config["countries"]], + cables_way=[ + f"data/osm/raw/{country}/cables_way.json" + for country in config["countries"] + ], + lines_way=[ + f"data/osm/raw/{country}/lines_way.json" for country in config["countries"] + ], + substations_way=[ + f"data/osm/raw/{country}/substations_way.json" + for country in config["countries"] + ], + substations_relation=[ + f"data/osm/raw/{country}/substations_relation.json" + for country in config["countries"] + ], offshore_shapes=resources("offshore_shapes.geojson"), country_shapes=resources("country_shapes.geojson"), output: @@ -660,4 +674,4 @@ rule build_osm_network: benchmark: benchmarks("build_osm_network") script: - "../scripts/build_osm_network.py" \ No newline at end of file + "../scripts/build_osm_network.py" diff --git a/scripts/base_network_osm.py b/scripts/base_network_osm.py index 81b7339e1..beec06f84 100644 --- a/scripts/base_network_osm.py +++ b/scripts/base_network_osm.py @@ -164,7 +164,6 @@ def _load_buses_from_eg(eg_buses, europe_shape, config_elec): lambda p: europe_shape_prepped.contains(Point(p)), axis=1 ) - # TODO pypsa-eur: Find a long-term solution # buses_with_v_nom_to_keep_b = ( # buses.v_nom.isin(config_elec["voltages"]) | buses.v_nom.isnull() @@ -176,9 +175,7 @@ def _load_buses_from_eg(eg_buses, europe_shape, config_elec): # Quick fix: buses_with_v_nom_to_keep_b = (v_nom_min <= buses.v_nom) & (buses.v_nom <= v_nom_max) - logger.info( - f'Removing buses outside of range {v_nom_min} - {v_nom_max} V' - ) + logger.info(f"Removing buses outside of range {v_nom_min} - {v_nom_max} 
V") return pd.DataFrame(buses.loc[buses_in_europe_b & buses_with_v_nom_to_keep_b]) @@ -418,6 +415,7 @@ def _reconnect_crimea(lines): # for v_nom in v_noms: # lines.loc[lines["v_nom"] == v_nom, "type"] = linetypes[v_nom] + def _set_electrical_parameters_lines(lines_config, voltages, lines): if lines.empty: lines["type"] = [] @@ -467,6 +465,7 @@ def _set_electrical_parameters_dc_lines(lines_config, voltages, lines): return lines + # TODO pypsa-eur: Clean/fix this, update list p_noms def _set_electrical_parameters_links(links, config, links_p_nom): if links.empty: @@ -795,9 +794,7 @@ def base_network_osm( ): buses = _load_buses_from_eg(eg_buses, europe_shape, config["electricity"]) - - - #TODO pypsa-eur add this + # TODO pypsa-eur add this # links = _load_links_from_eg(buses, eg_links) # if config["links"].get("include_tyndp"): # buses, links = _add_links_from_tyndp(buses, links, links_tyndp, europe_shape) @@ -814,15 +811,11 @@ def base_network_osm( lines_dc = lines[lines.tag_frequency.astype(float) == 0].copy() lines_ac = _set_electrical_parameters_lines( - config["lines"], - config["electricity"]["voltages"], - lines_ac - ) + config["lines"], config["electricity"]["voltages"], lines_ac + ) lines_dc = _set_electrical_parameters_dc_lines( - config["lines"], - config["electricity"]["voltages"], - lines_dc + config["lines"], config["electricity"]["voltages"], lines_dc ) # lines = _set_electrical_parameters_lines(lines, config) @@ -835,7 +828,9 @@ def base_network_osm( time = get_snapshots(snakemake.params.snapshots, snakemake.params.drop_leap_day) n.set_snapshots(time) - n.madd("Carrier", ["AC", "DC"]) # TODO: fix hard code and check if AC/DC truly exist + n.madd( + "Carrier", ["AC", "DC"] + ) # TODO: fix hard code and check if AC/DC truly exist n.import_components_from_dataframe(buses, "Bus") @@ -870,15 +865,15 @@ def base_network_osm( _set_lines_s_nom_from_linetypes(n) - #TODO pypsa-eur add this - # _apply_parameter_corrections(n, parameter_corrections) + # TODO 
pypsa-eur add this + # _apply_parameter_corrections(n, parameter_corrections) # TODO: what about this? n = _remove_unconnected_components(n) _set_countries_and_substations(n, config, country_shapes, offshore_shapes) - #TODO pypsa-eur add this + # TODO pypsa-eur add this _set_links_underwater_fraction(n, offshore_shapes) _replace_b2b_converter_at_country_border_by_link(n) @@ -889,6 +884,7 @@ def base_network_osm( return n + def _get_linetypes_config(line_types, voltages): """ Return the dictionary of linetypes for selected voltages. The dictionary is @@ -914,6 +910,7 @@ def _get_linetypes_config(line_types, voltages): ) return {k: v for k, v in line_types.items() if k in voltages} + def _get_linetype_by_voltage(v_nom, d_linetypes): """ Return the linetype of a specific line based on its voltage v_nom. @@ -1084,24 +1081,24 @@ def append_bus_shapes(n, shapes, type): set_scenario_config(snakemake) n = base_network_osm( - snakemake.input.eg_buses, - snakemake.input.eg_converters, - snakemake.input.eg_transformers, - snakemake.input.eg_lines, - snakemake.input.links_p_nom, - snakemake.input.europe_shape, - snakemake.input.country_shapes, - snakemake.input.offshore_shapes, - snakemake.config, + snakemake.input.eg_buses, + snakemake.input.eg_converters, + snakemake.input.eg_transformers, + snakemake.input.eg_lines, + snakemake.input.links_p_nom, + snakemake.input.europe_shape, + snakemake.input.country_shapes, + snakemake.input.offshore_shapes, + snakemake.config, ) logger.info("Base network created using OSM.") onshore_regions, offshore_regions, shapes = build_bus_shapes( - n, - snakemake.input.country_shapes, - snakemake.input.offshore_shapes, - snakemake.params.countries, + n, + snakemake.input.country_shapes, + snakemake.input.offshore_shapes, + snakemake.params.countries, ) shapes.to_file(snakemake.output.regions_onshore) @@ -1115,4 +1112,4 @@ def append_bus_shapes(n, shapes, type): offshore_shapes.to_frame().to_file(snakemake.output.regions_offshore) n.meta = 
snakemake.config - n.export_to_netcdf(snakemake.output.base_network) \ No newline at end of file + n.export_to_netcdf(snakemake.output.base_network) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 19f9f4ad1..60576a34d 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -11,20 +11,18 @@ import geopandas as gpd import numpy as np import pandas as pd -from _helpers import ( - configure_logging, - set_scenario_config, -) +from _benchmark import memory_logger +from _helpers import configure_logging, set_scenario_config from shapely.geometry import LineString, Point from shapely.ops import linemerge, split from tqdm import tqdm -from _benchmark import memory_logger logger = logging.getLogger(__name__) # list of recognised nan values (NA and na excluded as may be confused with Namibia 2-letter country code) NA_VALUES = ["NULL", "", "N/A", "NAN", "NaN", "nan", "Nan", "n/a", "null"] + def read_csv_nafix(file, **kwargs): "Function to open a csv as pandas file and standardize the na value" if "keep_default_na" not in kwargs: @@ -106,9 +104,9 @@ def to_csv_nafix(df, path, **kwargs): Returns: - None - If the DataFrame is not empty or does not have empty columns, it will be - written to the CSV file with NA values replaced by the first value in the - `NA_VALUES` list. If the DataFrame is empty or has empty columns, an empty + If the DataFrame is not empty or does not have empty columns, it will be + written to the CSV file with NA values replaced by the first value in the + `NA_VALUES` list. If the DataFrame is empty or has empty columns, an empty file will be created at the specified path. """ if "na_rep" in kwargs: @@ -123,17 +121,16 @@ def to_csv_nafix(df, path, **kwargs): def line_endings_to_bus_conversion(lines): """ Converts line endings to bus connections. - - This function takes a df of lines and converts the line endings to bus - connections. 
It performs the necessary operations to ensure that the line + + This function takes a df of lines and converts the line endings to bus + connections. It performs the necessary operations to ensure that the line endings are properly connected to the buses in the network. - + Parameters: lines (DataFrame) - + Returns: lines (DataFrame) - """ # Assign to every line a start and end point @@ -340,7 +337,7 @@ def merge_stations_same_station_id( lon_bus, # "lon" lat_bus, # "lat" bus_row["country"].iloc[0], # "country", - is_dclink_boundary_point, # check if new bus was formed of at least one DC link boundary point + is_dclink_boundary_point, # check if new bus was formed of at least one DC link boundary point Point( lon_bus, lat_bus, @@ -369,9 +366,9 @@ def merge_stations_same_station_id( "geometry", ] - gdf_buses_clean = gpd.GeoDataFrame(buses_clean, columns=buses_clean_columns).set_crs( - crs=buses.crs, inplace=True - ) + gdf_buses_clean = gpd.GeoDataFrame( + buses_clean, columns=buses_clean_columns + ).set_crs(crs=buses.crs, inplace=True) return gdf_buses_clean @@ -497,7 +494,7 @@ def get_converters(buses): g_value.country.loc[id_0], # "country" geom_conv, # "geometry" ] - ) + ) # name of the columns conv_columns = [ @@ -628,14 +625,12 @@ def merge_stations_lines_by_station_id_and_voltage( the merged dataset. 
""" - logger.info( - " - Setting substation ids with tolerance of %.2f m" % (tol) - ) + logger.info(" - Setting substation ids with tolerance of %.2f m" % (tol)) - # TODO pypsa-eur: Add this fix to pypsa-earth: Buses should not be clustered geographically if they are different + # TODO pypsa-eur: Add this fix to pypsa-earth: Buses should not be clustered geographically if they are different # bus types (AC != DC) buses_ac = buses[buses["dc"] == False].reset_index() - buses_dc = buses[buses["dc"] == True].reset_index() + buses_dc = buses[buses["dc"] == True].reset_index() # set substation ids # set_substations_ids(buses, distance_crs, tol=tol) @@ -646,7 +641,7 @@ def merge_stations_lines_by_station_id_and_voltage( # lines_dc_shape = lines[lines["dc"] == True].unary_union # lines_dc_bounds = lines_dc_shape.boundary # lines_dc_points = [p for p in lines_dc_bounds.geoms] - lines_dc = lines[lines['dc'] == True].reset_index() + lines_dc = lines[lines["dc"] == True].reset_index() lines_dc["adj_idx"] = range(0, len(lines_dc)) # Initialize an empty adjacency matrix @@ -667,12 +662,15 @@ def merge_stations_lines_by_station_id_and_voltage( bus_1_coors = lines_dc.iloc[path]["bus_1_coors"] # Create DataFrame containing all points within a path - dc_points = pd.concat([bus_0_coors, bus_1_coors], ignore_index = True) + dc_points = pd.concat([bus_0_coors, bus_1_coors], ignore_index=True) - # Determine the value counts of individual points. If it occurs more than + # Determine the value counts of individual points. 
If it occurs more than # once, it cannot be an end-point of a path - bool_duplicates = dc_points.apply(lambda p: sum([are_almost_equal(p, s) for s in dc_points])) > 1 - + bool_duplicates = ( + dc_points.apply(lambda p: sum([are_almost_equal(p, s) for s in dc_points])) + > 1 + ) + # Drop all duplicates dc_boundary_points = dc_points[~bool_duplicates] @@ -682,8 +680,9 @@ def merge_stations_lines_by_station_id_and_voltage( if all_dc_boundary_points.empty: all_dc_boundary_points = dc_boundary_points else: - all_dc_boundary_points = pd.concat([all_dc_boundary_points, dc_boundary_points], ignore_index = True) - + all_dc_boundary_points = pd.concat( + [all_dc_boundary_points, dc_boundary_points], ignore_index=True + ) # TODO pypsa-eur: Add to pypsa-earth for all related entries on is_dclink_boundary_point # check for each entry in buses_dc whether it is included in lines_dc_points @@ -733,38 +732,38 @@ def build_network( outputs, geo_crs, distance_crs, -): +): osm_clean_columns = { - 'substation': { - 'bus_id': 'object', - 'station_id': 'float', - 'voltage': 'float', - 'dc': 'bool', - 'symbol': 'object', - 'under_construction': 'bool', - 'tag_substation': 'str', - 'tag_area': 'str', - 'lon': 'float', - 'lat': 'float', - 'country': 'str', - 'geometry': 'object', - 'tag_source': 'str', + "substation": { + "bus_id": "object", + "station_id": "float", + "voltage": "float", + "dc": "bool", + "symbol": "object", + "under_construction": "bool", + "tag_substation": "str", + "tag_area": "str", + "lon": "float", + "lat": "float", + "country": "str", + "geometry": "object", + "tag_source": "str", + }, + "line": { + "line_id": "object", + "bus0": "object", + "bus1": "object", + "voltage": "float", + "circuits": "float", + "length": "float", + "underground": "bool", + "under_construction": "bool", + "tag_type": "str", + "tag_frequency": "float", + "dc": "bool", + "country": "object", + "geometry": "object", }, - 'line': { - 'line_id': 'object', - 'bus0': 'object', - 'bus1': 'object', 
- 'voltage': 'float', - 'circuits': 'float', - 'length': 'float', - 'underground': 'bool', - 'under_construction': 'bool', - 'tag_type': 'str', - 'tag_frequency': 'float', - 'dc': 'bool', - 'country': 'object', - 'geometry': 'object', - } } logger.info("Reading input data.") @@ -781,18 +780,16 @@ def build_network( ) lines = line_endings_to_bus_conversion(lines) - + # METHOD to merge buses with same voltage and within tolerance tol = snakemake.config["electricity_network"]["osm_group_tolerance_buses"] - logger.info( - f"Aggregating close substations: Enabled with tolerance {tol} m" - ) + logger.info(f"Aggregating close substations: Enabled with tolerance {tol} m") lines, buses = merge_stations_lines_by_station_id_and_voltage( lines, buses, distance_crs, tol=tol ) # Recalculate lengths of lines - utm = lines.estimate_utm_crs(datum_name = "WGS 84") + utm = lines.estimate_utm_crs(datum_name="WGS 84") lines["length"] = lines.to_crs(utm).length # get transformers: modelled as lines connecting buses with different voltage @@ -819,21 +816,50 @@ def build_network( # Convert voltages from V to kV lines["voltage"] = lines["voltage"] / 1000 - transformers["voltage_bus0"], transformers["voltage_bus1"] = transformers["voltage_bus0"] / 1000, \ - transformers["voltage_bus1"] / 1000 + transformers["voltage_bus0"], transformers["voltage_bus1"] = ( + transformers["voltage_bus0"] / 1000, + transformers["voltage_bus1"] / 1000, + ) buses["voltage"] = buses["voltage"] / 1000 - # Convert 'true' and 'false' to 't' and 'f' + # Convert 'true' and 'false' to 't' and 'f' lines = lines.replace({True: "t", False: "f"}) converters = converters.replace({True: "t", False: "f"}) buses = buses.replace({True: "t", False: "f"}) - + # Change column orders - cols_lines = ["bus0", "bus1", "voltage", "circuits", "length", "underground", "under_construction", "geometry", - "tag_type", "tag_frequency", "country", "bounds", - "bus_0_coors", "bus_1_coors", "bus0_lon", "bus0_lat", "bus1_lon", "bus1_lat"] - - 
cols_lines_csv = ["bus0", "bus1", "voltage", "circuits", "tag_frequency", "length", "underground", "under_construction", "geometry"] + cols_lines = [ + "bus0", + "bus1", + "voltage", + "circuits", + "length", + "underground", + "under_construction", + "geometry", + "tag_type", + "tag_frequency", + "country", + "bounds", + "bus_0_coors", + "bus_1_coors", + "bus0_lon", + "bus0_lat", + "bus1_lon", + "bus1_lat", + ] + + cols_lines_csv = [ + "bus0", + "bus1", + "voltage", + "circuits", + "tag_frequency", + "length", + "underground", + "under_construction", + "geometry", + ] lines_csv = lines[cols_lines_csv] lines = lines[cols_lines] @@ -844,16 +870,32 @@ def build_network( colstodrop = ["bounds", "bus_0_coors", "bus_1_coors"] # Export to GeoJSON for quick validations - save_to_geojson(gpd.GeoDataFrame(lines.drop(columns = colstodrop), geometry = "geometry", crs = geo_crs), outputs["lines_geojson"]) - save_to_geojson(gpd.GeoDataFrame(converters, geometry = "geometry", crs = geo_crs), outputs["converters_geojson"]) - save_to_geojson(gpd.GeoDataFrame(transformers.drop(columns = colstodrop), geometry = "geometry", crs = geo_crs), outputs["transformers_geojson"]) + save_to_geojson( + gpd.GeoDataFrame( + lines.drop(columns=colstodrop), geometry="geometry", crs=geo_crs + ), + outputs["lines_geojson"], + ) + save_to_geojson( + gpd.GeoDataFrame(converters, geometry="geometry", crs=geo_crs), + outputs["converters_geojson"], + ) + save_to_geojson( + gpd.GeoDataFrame( + transformers.drop(columns=colstodrop), geometry="geometry", crs=geo_crs + ), + outputs["transformers_geojson"], + ) # create clean directory if not already exist if not os.path.exists(outputs["substations"]): os.makedirs(os.path.dirname(outputs["substations"]), exist_ok=True) # Generate CSV to_csv_nafix(buses, outputs["substations"], quotechar="'") - save_to_geojson(gpd.GeoDataFrame(buses, geometry = "geometry", crs = geo_crs), outputs["substations_geojson"]) + save_to_geojson( + gpd.GeoDataFrame(buses, 
geometry="geometry", crs=geo_crs), + outputs["substations_geojson"], + ) return None @@ -868,30 +910,30 @@ def are_lines_connected(line1, line2): line2 (dict): A dictionary representing the second line. Returns: - tuple: A tuple of boolean values indicating the connection status between + tuple: A tuple of boolean values indicating the connection status between the lines. The tuple contains four elements: - - True if the first line's bus_0_coors is almost equal to the second line's + - True if the first line's bus_0_coors is almost equal to the second line's bus_0_coors, False otherwise. - - True if the first line's bus_0_coors is almost equal to the second line's + - True if the first line's bus_0_coors is almost equal to the second line's bus_1_coors, False otherwise. - - True if the first line's bus_1_coors is almost equal to the second line's + - True if the first line's bus_1_coors is almost equal to the second line's bus_0_coors, False otherwise. - - True if the first line's bus_1_coors is almost equal to the second line's + - True if the first line's bus_1_coors is almost equal to the second line's bus_1_coors, False otherwise. """ return ( are_almost_equal(line1["bus_0_coors"], line2["bus_0_coors"]), are_almost_equal(line1["bus_0_coors"], line2["bus_1_coors"]), are_almost_equal(line1["bus_1_coors"], line2["bus_0_coors"]), - are_almost_equal(line1["bus_1_coors"], line2["bus_1_coors"]) + are_almost_equal(line1["bus_1_coors"], line2["bus_1_coors"]), ) def _dfs(adj_matrix, visited, current_vertex, path): """ - Perform a depth-first search (DFS) on a graph represented by an adjacency + Perform a depth-first search (DFS) on a graph represented by an adjacency matrix. Parameters: @@ -902,7 +944,6 @@ def _dfs(adj_matrix, visited, current_vertex, path): Returns: - path (list): The path of vertices visited during the DFS. 
- """ visited[current_vertex] = True path.append(current_vertex) @@ -921,9 +962,8 @@ def find_paths(adj_matrix): - adj_matrix (list of lists): The adjacency matrix representing the graph. Returns: - - paths (list of lists): A list of lists, where each inner list represents + - paths (list of lists): A list of lists, where each inner list represents a path in the graph. - """ visited = [False] * len(adj_matrix) paths = [] @@ -938,12 +978,12 @@ def find_paths(adj_matrix): def are_almost_equal(point1, point2, tolerance=1e-6): """ Check if two Shapely points are almost equal with a given tolerance. - + Args: point1 (Point): First Shapely point. point2 (Point): Second Shapely point. tolerance (float): Tolerance for coordinate deviation. - + Returns: bool: True if the points are almost equal, False otherwise. """ @@ -972,28 +1012,32 @@ def merge_linestrings(gdf): i = 0 while i < len(lines): if are_almost_equal( - Point(merged_line.coords[-1]), - Point(lines[i].coords[0]) - ): - merged_line = LineString(list(merged_line.coords) + list(lines.pop(i).coords[1:])) + Point(merged_line.coords[-1]), Point(lines[i].coords[0]) + ): + merged_line = LineString( + list(merged_line.coords) + list(lines.pop(i).coords[1:]) + ) i = 0 # Restart the scan after merging elif are_almost_equal( - Point(merged_line.coords[0]), - Point(lines[i].coords[-1]) - ): - merged_line = LineString(list(lines.pop(i).coords)[:-1] + list(merged_line.coords)) + Point(merged_line.coords[0]), Point(lines[i].coords[-1]) + ): + merged_line = LineString( + list(lines.pop(i).coords)[:-1] + list(merged_line.coords) + ) i = 0 # Restart the scan after merging elif are_almost_equal( - Point(merged_line.coords[-1]), - Point(lines[i].coords[-1]) - ): - merged_line = LineString(list(merged_line.coords) + list(lines.pop(i).coords[::-1])[1:]) + Point(merged_line.coords[-1]), Point(lines[i].coords[-1]) + ): + merged_line = LineString( + list(merged_line.coords) + list(lines.pop(i).coords[::-1])[1:] + ) i = 0 # Restart the 
scan after merging elif are_almost_equal( - Point(merged_line.coords[0]), - Point(lines[i].coords[0]) - ): - merged_line = LineString(list(lines.pop(i).coords[::-1])[:-1] + list(merged_line.coords)) + Point(merged_line.coords[0]), Point(lines[i].coords[0]) + ): + merged_line = LineString( + list(lines.pop(i).coords[::-1])[:-1] + list(merged_line.coords) + ) i = 0 # Restart the scan after merging else: i += 1 @@ -1011,7 +1055,7 @@ def merge_linestrings(gdf): from _helpers import mock_snakemake snakemake = mock_snakemake("build_osm_network") - + configure_logging(snakemake) set_scenario_config(snakemake) @@ -1025,10 +1069,10 @@ def merge_linestrings(gdf): filename=getattr(snakemake.log, "memory", None), interval=30.0 ) as mem: build_network( - snakemake.input, - snakemake.output, - geo_crs, - distance_crs, + snakemake.input, + snakemake.output, + geo_crs, + distance_crs, ) - logger.info(f"Maximum memory usage: {mem.mem_usage}") \ No newline at end of file + logger.info(f"Maximum memory usage: {mem.mem_usage}") diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index dadc2c902..882c1229e 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -3,7 +3,7 @@ # # SPDX-License-Identifier: MIT """ -This script is used to clean OpenStreetMap (OSM) data for the PyPSA-Eur +This script is used to clean OpenStreetMap (OSM) data for the PyPSA-Eur project. The script performs various cleaning operations on the OSM data, including: @@ -18,24 +18,25 @@ python clean_osm_data.py Arguments: - output_file (str): The path to the output file where the cleaned data will + output_file (str): The path to the output file where the cleaned data will be written. 
Example: python clean_osm_data.py cleaned_data.csv """ -import geopandas as gpd import json import logging import os +import re + +import geopandas as gpd import numpy as np import pandas as pd -import re +from _helpers import configure_logging, set_scenario_config from shapely.geometry import LineString, Polygon from shapely.ops import linemerge -from _helpers import configure_logging, set_scenario_config logger = logging.getLogger(__name__) @@ -48,33 +49,32 @@ def _create_linestring(row): Returns: LineString: A LineString object representing the geometry. - """ - coords = [(coord['lon'], coord['lat']) for coord in row["geometry"]] + coords = [(coord["lon"], coord["lat"]) for coord in row["geometry"]] return LineString(coords) def _create_polygon(row): """ Create a Shapely Polygon from a list of coordinate dictionaries. - + Parameters: - coords (list): List of dictionaries with 'lat' and 'lon' keys + coords (list): List of dictionaries with 'lat' and 'lon' keys representing coordinates. - + Returns: shapely.geometry.Polygon: The constructed polygon object. 
""" # Extract coordinates as tuples - point_coords = [(coord['lon'], coord['lat']) for coord in row["geometry"]] - + point_coords = [(coord["lon"], coord["lat"]) for coord in row["geometry"]] + # Ensure closure by repeating the first coordinate as the last coordinate if point_coords[0] != point_coords[-1]: point_coords.append(point_coords[0]) - + # Create Polygon object polygon = Polygon(point_coords) - + return polygon @@ -92,8 +92,7 @@ def _clean_voltage(column): column = column.copy() column = ( - column - .astype(str) + column.astype(str) .str.lower() .str.replace("400/220/110 kV'", "400000;220000;110000") .str.replace("400/220/110/20_kv", "400000;220000;110000;20000") @@ -102,8 +101,7 @@ def _clean_voltage(column): ) column = ( - column - .astype(str) + column.astype(str) .str.lower() .str.replace("(temp 150000)", "") .str.replace("low", "1000") @@ -119,13 +117,13 @@ def _clean_voltage(column): .str.replace(",", ";") .str.replace("kv", "000") .str.replace("kva", "000") - .str.replace("/", ";") + .str.replace("/", ";") .str.replace("nan", "") .str.replace("", "") ) # Remove all remaining non-numeric characters except for semicolons - column = column.apply(lambda x: re.sub(r'[^0-9;]', '', str(x))) + column = column.apply(lambda x: re.sub(r"[^0-9;]", "", str(x))) column.dropna(inplace=True) return column @@ -133,7 +131,7 @@ def _clean_voltage(column): def _clean_circuits(column): """ - Function to clean the raw circuits column: manual fixing and drop nan + Function to clean the raw circuits column: manual fixing and drop nan values Args: @@ -145,8 +143,7 @@ def _clean_circuits(column): logger.info("Cleaning circuits.") column = column.copy() column = ( - column - .astype(str) + column.astype(str) .str.replace("partial", "") .str.replace("1operator=RTE operator:wikidata=Q2178795", "") .str.lower() @@ -157,7 +154,7 @@ def _clean_circuits(column): ) # Remove all remaining non-numeric characters except for semicolons - column = column.apply(lambda x: 
re.sub(r'[^0-9;]', '', x)) + column = column.apply(lambda x: re.sub(r"[^0-9;]", "", x)) column.dropna(inplace=True) return column.astype(str) @@ -176,8 +173,7 @@ def _clean_cables(column): logger.info("Cleaning cables.") column = column.copy() column = ( - column - .astype(str) + column.astype(str) .str.lower() .str.replace("1/3", "1") .str.replace("3x2;2", "3") @@ -186,7 +182,7 @@ def _clean_cables(column): ) # Remove all remaining non-numeric characters except for semicolons - column = column.apply(lambda x: re.sub(r'[^0-9;]', '', x)) + column = column.apply(lambda x: re.sub(r"[^0-9;]", "", x)) column.dropna(inplace=True) return column.astype(str) @@ -205,8 +201,7 @@ def _clean_wires(column): logger.info("Cleaning wires.") column = column.copy() column = ( - column - .astype(str) + column.astype(str) .str.lower() .str.replace("?", "") .str.replace("trzyprzewodowe", "3") @@ -225,7 +220,7 @@ def _clean_wires(column): ) # Remove all remaining non-numeric characters except for semicolons - column = column.apply(lambda x: re.sub(r'[^0-9;]', '', x)) + column = column.apply(lambda x: re.sub(r"[^0-9;]", "", x)) column.dropna(inplace=True) return column.astype(str) @@ -240,19 +235,19 @@ def _check_voltage(voltage, list_voltages): list_voltages (list): A list of allowed voltages. Returns: - bool: True if the voltage is present in the list of allowed voltages, + bool: True if the voltage is present in the list of allowed voltages, False otherwise. 
""" - voltages = voltage.split(';') + voltages = voltage.split(";") for v in voltages: if v in list_voltages: return True return False -def _clean_frequency(column): +def _clean_frequency(column): """ - Function to clean the raw frequency column: manual fixing and drop nan + Function to clean the raw frequency column: manual fixing and drop nan values Args: @@ -264,8 +259,7 @@ def _clean_frequency(column): logger.info("Cleaning frequencies.") column = column.copy() column = ( - column - .astype(str) + column.astype(str) .str.lower() .str.replace("16.67", "16.7") .str.replace("16,7", "16.7") @@ -277,7 +271,7 @@ def _clean_frequency(column): ) # Remove all remaining non-numeric characters except for semicolons - column = column.apply(lambda x: re.sub(r'[^0-9;.]', '', x)) + column = column.apply(lambda x: re.sub(r"[^0-9;.]", "", x)) column.dropna(inplace=True) return column.astype(str) @@ -309,7 +303,7 @@ def _split_cells(df, cols=["voltage"]): # Create a dictionary to store the suffix count for each original ID suffix_counts = {} - # Create a dictionary to store the number of splits associated with each + # Create a dictionary to store the number of splits associated with each # original ID num_splits = {} @@ -318,12 +312,12 @@ def _split_cells(df, cols=["voltage"]): x = x.explode(cols, ignore_index=True) # Count the number of splits associated with each original ID - num_splits = x.groupby('id').size().to_dict() + num_splits = x.groupby("id").size().to_dict() # Update the 'split_elements' column x["split_elements"] = x["id"].map(num_splits) - # Function to generate the new ID with suffix and update the number of + # Function to generate the new ID with suffix and update the number of # splits def generate_new_id(row): original_id = row["id"] @@ -341,17 +335,16 @@ def generate_new_id(row): def _distribute_to_circuits(row): """ - Distributes the number of circuits or cables to individual circuits based + Distributes the number of circuits or cables to individual 
circuits based on the given row data. Parameters: - - row: A dictionary representing a row of data containing information about + - row: A dictionary representing a row of data containing information about circuits and cables. Returns: - - single_circuit: The number of circuits to be assigned to each individual + - single_circuit: The number of circuits to be assigned to each individual circuit. - """ if row["circuits"] != "": circuits = int(row["circuits"]) @@ -366,31 +359,30 @@ def _distribute_to_circuits(row): def _add_line_endings_to_substations( - df_substations, - gdf_lines, - path_country_shapes, - path_offshore_shapes, - ): + df_substations, + gdf_lines, + path_country_shapes, + path_offshore_shapes, +): """ Add line endings to substations. - This function takes two pandas DataFrames, `substations` and `lines`, and - adds line endings to the substations based on the information from the + This function takes two pandas DataFrames, `substations` and `lines`, and + adds line endings to the substations based on the information from the lines DataFrame. Parameters: - - substations (pandas DataFrame): DataFrame containing information about + - substations (pandas DataFrame): DataFrame containing information about substations. - lines (pandas DataFrame): DataFrame containing information about lines. Returns: - - buses (pandas DataFrame): DataFrame containing the updated information + - buses (pandas DataFrame): DataFrame containing the updated information about substations with line endings. 
- """ if gdf_lines.empty: return df_substations - + logger.info("Adding line endings to substations") # extract columns from df_substations bus_s = pd.DataFrame(columns=df_substations.columns) @@ -419,7 +411,9 @@ def _add_line_endings_to_substations( # Group gdf_substations by voltage and and geometry (dropping duplicates) bus_all = bus_all.groupby(["voltage", "lon", "lat", "dc"]).first().reset_index() bus_all = bus_all[df_substations.columns] - bus_all.loc[:, "bus_id"] = bus_all.apply(lambda row: f"line-end/{row.name + 1}", axis=1) + bus_all.loc[:, "bus_id"] = bus_all.apply( + lambda row: f"line-end/{row.name + 1}", axis=1 + ) # Initialize default values bus_all["station_id"] = np.nan @@ -435,30 +429,40 @@ def _add_line_endings_to_substations( buses.set_index("bus_id", inplace=True) # Fix country codes - # TODO pypsa-eur: Temporary solution as long as the shapes have a low, + # TODO pypsa-eur: Temporary solution as long as the shapes have a low, # incomplete resolution (cf. 2500 meters for buffering) bool_multiple_countries = buses["country"].str.contains(";") gdf_offshore = gpd.read_file(path_offshore_shapes).set_index("name")["geometry"] - gdf_offshore = gpd.GeoDataFrame(gdf_offshore, geometry=gdf_offshore, crs = gdf_offshore.crs) + gdf_offshore = gpd.GeoDataFrame( + gdf_offshore, geometry=gdf_offshore, crs=gdf_offshore.crs + ) gdf_countries = gpd.read_file(path_country_shapes).set_index("name")["geometry"] # reproject to enable buffer - gdf_countries = gpd.GeoDataFrame(geometry=gdf_countries, crs = gdf_countries.crs) - gdf_union = gdf_countries.merge(gdf_offshore, how="outer", left_index=True, right_index=True) - gdf_union["geometry"] = gdf_union.apply(lambda row: gpd.GeoSeries([row["geometry_x"], row["geometry_y"]]) \ - .unary_union, axis=1) - gdf_union = gpd.GeoDataFrame(geometry=gdf_union["geometry"], crs = crs) - utm = gdf_union.estimate_utm_crs(datum_name = "WGS 84") + gdf_countries = gpd.GeoDataFrame(geometry=gdf_countries, crs=gdf_countries.crs) + 
gdf_union = gdf_countries.merge( + gdf_offshore, how="outer", left_index=True, right_index=True + ) + gdf_union["geometry"] = gdf_union.apply( + lambda row: gpd.GeoSeries([row["geometry_x"], row["geometry_y"]]).unary_union, + axis=1, + ) + gdf_union = gpd.GeoDataFrame(geometry=gdf_union["geometry"], crs=crs) + utm = gdf_union.estimate_utm_crs(datum_name="WGS 84") gdf_union = gdf_union.to_crs(utm) - gdf_union = gdf_union.buffer(2500) # meters + gdf_union = gdf_union.buffer(2500) # meters gdf_union = gdf_union.to_crs(crs) - gdf_union = gpd.GeoDataFrame(geometry=gdf_union, crs = crs) - gdf_buses_tofix = gpd.GeoDataFrame(buses[bool_multiple_countries], geometry="geometry", crs = crs) + gdf_union = gpd.GeoDataFrame(geometry=gdf_union, crs=crs) + gdf_buses_tofix = gpd.GeoDataFrame( + buses[bool_multiple_countries], geometry="geometry", crs=crs + ) joined = gpd.sjoin(gdf_buses_tofix, gdf_union, how="left", predicate="within") joined.reset_index(inplace=True) joined = joined.drop_duplicates(subset="bus_id") joined.set_index("bus_id", inplace=True) - - buses.loc[bool_multiple_countries, "country"] = joined.loc[bool_multiple_countries, "index_right"] + + buses.loc[bool_multiple_countries, "country"] = joined.loc[ + bool_multiple_countries, "index_right" + ] return buses @@ -468,88 +472,112 @@ def _import_lines_and_cables(path_lines): Import lines and cables from the given input paths. Parameters: - - path_lines (dict): A dictionary containing the input paths for lines and + - path_lines (dict): A dictionary containing the input paths for lines and cables data. Returns: - - df_lines (DataFrame): A DataFrame containing the imported lines and + - df_lines (DataFrame): A DataFrame containing the imported lines and cables data. 
- """ - columns = ["id", "bounds", "nodes", "geometry", "country", "power", "cables", "circuits", "frequency", "voltage", - "wires"] + columns = [ + "id", + "bounds", + "nodes", + "geometry", + "country", + "power", + "cables", + "circuits", + "frequency", + "voltage", + "wires", + ] df_lines = pd.DataFrame(columns=columns) logger.info("Importing lines and cables") for key in path_lines: logger.info(f"Processing {key}...") for idx, ip in enumerate(path_lines[key]): - if os.path.exists(ip) and os.path.getsize(ip) > 400: # unpopulated OSM json is about 51 bytes + if ( + os.path.exists(ip) and os.path.getsize(ip) > 400 + ): # unpopulated OSM json is about 51 bytes country = os.path.basename(os.path.dirname(path_lines[key][idx])) - + logger.info( f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_lines[key])).zfill(2)}: {ip}" - ) + ) with open(ip, "r") as f: data = json.load(f) - - df = pd.DataFrame(data['elements']) + + df = pd.DataFrame(data["elements"]) df["id"] = df["id"].astype(str) df["country"] = country # col_tags = ["power", "cables", "circuits", "frequency", "voltage", "wires", "capacity", "rating"] - col_tags = ["power", "cables", "circuits", "frequency", "voltage", "wires"] + col_tags = [ + "power", + "cables", + "circuits", + "frequency", + "voltage", + "wires", + ] + + tags = pd.json_normalize(df["tags"]).map( + lambda x: str(x) if pd.notnull(x) else x + ) - tags = pd.json_normalize(df["tags"]) \ - .map(lambda x: str(x) if pd.notnull(x) else x) - for ct in col_tags: if ct not in tags.columns: tags[ct] = pd.NA - + tags = tags.loc[:, col_tags] - df = pd.concat([df, tags], axis="columns") + df = pd.concat([df, tags], axis="columns") df.drop(columns=["type", "tags"], inplace=True) - + df_lines = pd.concat([df_lines, df], axis="rows") else: logger.info( f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_lines[key])).zfill(2)} (empty): {ip}" - ) + ) continue logger.info("---") - + return df_lines def _drop_duplicate_lines(df_lines): """ - Drop 
duplicate lines from the given dataframe. Duplicates are usually lines + Drop duplicate lines from the given dataframe. Duplicates are usually lines cross-border lines or slightly outside the country border of focus. Parameters: - df_lines (pandas.DataFrame): The dataframe containing lines data. Returns: - - df_lines (pandas.DataFrame): The dataframe with duplicate lines removed + - df_lines (pandas.DataFrame): The dataframe with duplicate lines removed and cleaned data. - This function drops duplicate lines from the given dataframe based on the - 'id' column. It groups the duplicate rows by 'id' and aggregates the + This function drops duplicate lines from the given dataframe based on the + 'id' column. It groups the duplicate rows by 'id' and aggregates the 'country' column to a string split by semicolon, as they appear in multiple - country datasets. One example of the duplicates is kept, accordingly. + country datasets. One example of the duplicates is kept, accordingly. Finally, the updated dataframe without multiple duplicates is returned. 
""" logger.info("Dropping duplicate lines.") - duplicate_rows = df_lines[df_lines.duplicated(subset=['id'], keep=False)].copy() + duplicate_rows = df_lines[df_lines.duplicated(subset=["id"], keep=False)].copy() # Group rows by id and aggregate the country column to a string split by semicolon - grouped_duplicates = duplicate_rows.groupby('id')["country"].agg(lambda x: ';'.join(x)).reset_index() + grouped_duplicates = ( + duplicate_rows.groupby("id")["country"].agg(lambda x: ";".join(x)).reset_index() + ) duplicate_rows.drop_duplicates(subset="id", inplace=True) duplicate_rows.drop(columns=["country"], inplace=True) - duplicate_rows = duplicate_rows.join(grouped_duplicates.set_index('id'), on='id', how='left') + duplicate_rows = duplicate_rows.join( + grouped_duplicates.set_index("id"), on="id", how="left" + ) # Drop duplicates and update the df_lines dataframe with the cleaned data df_lines = df_lines[~df_lines["id"].isin(duplicate_rows["id"])] @@ -564,16 +592,18 @@ def _filter_by_voltage(df, voltage_min=200000): Parameters: - df (pandas.DataFrame): The DataFrame containing the substations or lines data. - - voltage_min (int, optional): The minimum voltage value to filter the + - voltage_min (int, optional): The minimum voltage value to filter the rows. Defaults to 200000 [unit: V]. Returns: - - filtered df (pandas.DataFrame): The filtered DataFrame containing + - filtered df (pandas.DataFrame): The filtered DataFrame containing the lines or substations above voltage_min. - list_voltages (list): A list of unique voltage values above voltage_min. The type of the list elements is string. """ - logger.info(f"Filtering dataframe by voltage. Only keeping rows above and including {voltage_min} V.") + logger.info( + f"Filtering dataframe by voltage. Only keeping rows above and including {voltage_min} V." 
+ ) list_voltages = df["voltage"].str.split(";").explode().unique().astype(str) # Keep numeric strings list_voltages = list_voltages[np.vectorize(str.isnumeric)(list_voltages)] @@ -594,13 +624,13 @@ def _clean_substations(df_substations, list_voltages): - Filter substation data based on specified voltages. - Update the frequency values based on the split count. - Split cells in the 'frequency' column. - - Set remaining invalid frequency values that are not in ['0', '50'] + - Set remaining invalid frequency values that are not in ['0', '50'] to '50'. Parameters: - - df_substations (pandas.DataFrame): The input dataframe containing + - df_substations (pandas.DataFrame): The input dataframe containing substation data. - - list_voltages (list): A list of voltages above voltage_min to filter the + - list_voltages (list): A list of voltages above voltage_min to filter the substation data. Returns: @@ -610,22 +640,31 @@ def _clean_substations(df_substations, list_voltages): df_substations = _split_cells(df_substations) - bool_voltages = df_substations["voltage"].apply(_check_voltage, list_voltages=list_voltages) + bool_voltages = df_substations["voltage"].apply( + _check_voltage, list_voltages=list_voltages + ) df_substations = df_substations[bool_voltages] - df_substations.loc[:, "split_count"] = df_substations["id"].apply(lambda x: x.split("-")[1] if "-" in x else "0") + df_substations.loc[:, "split_count"] = df_substations["id"].apply( + lambda x: x.split("-")[1] if "-" in x else "0" + ) df_substations.loc[:, "split_count"] = df_substations["split_count"].astype(int) bool_split = df_substations["split_elements"] > 1 - bool_frequency_len = df_substations["frequency"] \ - .apply(lambda x: len(x.split(";"))) == df_substations["split_elements"] - - op_freq = lambda row: row["frequency"].split(";")[row["split_count"]-1] - - df_substations.loc[bool_frequency_len & bool_split, "frequency"] = df_substations \ - .loc[bool_frequency_len & bool_split, ].apply(op_freq, axis=1) - + 
bool_frequency_len = ( + df_substations["frequency"].apply(lambda x: len(x.split(";"))) + == df_substations["split_elements"] + ) + + op_freq = lambda row: row["frequency"].split(";")[row["split_count"] - 1] + + df_substations.loc[bool_frequency_len & bool_split, "frequency"] = ( + df_substations.loc[bool_frequency_len & bool_split,].apply(op_freq, axis=1) + ) + df_substations = _split_cells(df_substations, cols=["frequency"]) - bool_invalid_frequency = df_substations["frequency"].apply(lambda x: x not in ["50", "0"]) + bool_invalid_frequency = df_substations["frequency"].apply( + lambda x: x not in ["50", "0"] + ) df_substations.loc[bool_invalid_frequency, "frequency"] = "50" return df_substations @@ -633,15 +672,15 @@ def _clean_substations(df_substations, list_voltages): def _clean_lines(df_lines, list_voltages): """ - Cleans and processes the `df_lines` DataFrame heuristically based on the - information available per respective line and cable. - Further checks to ensure data consistency and completeness. + Cleans and processes the `df_lines` DataFrame heuristically based on the + information available per respective line and cable. Further checks to + ensure data consistency and completeness. Parameters ---------- df_lines : pandas.DataFrame - The input DataFrame containing line information with columns such as - 'voltage', 'circuits', 'frequency', 'cables', 'split_elements', 'id', + The input DataFrame containing line information with columns such as + 'voltage', 'circuits', 'frequency', 'cables', 'split_elements', 'id', etc. list_voltages : list A list of unique voltage values above a certain threshold. (type: str) @@ -649,7 +688,7 @@ def _clean_lines(df_lines, list_voltages): Returns ------- df_lines : pandas.DataFrame - The cleaned DataFrame with updated columns 'circuits', 'frequency', and + The cleaned DataFrame with updated columns 'circuits', 'frequency', and 'cleaned' to reflect the applied transformations. 
Description @@ -658,18 +697,18 @@ def _clean_lines(df_lines, list_voltages): - Initializes a 'cleaned' column with False, step-wise updates to True following the respective cleaning step. - - Splits the voltage cells in the DataFrame at semicolons using a helper + - Splits the voltage cells in the DataFrame at semicolons using a helper function `_split_cells`. - Filters the DataFrame to only include rows with valid voltages. - - Sets circuits of remaining lines without any applicable heuristic equal + - Sets circuits of remaining lines without any applicable heuristic equal to 1. - The function ensures that the resulting DataFrame has consistent and - complete information for further processing or analysis while maintaining + The function ensures that the resulting DataFrame has consistent and + complete information for further processing or analysis while maintaining the data of the original OSM data set wherever possible. """ logger.info("Cleaning lines and determining circuits.") - # Initiate boolean with False, only set to true if all cleaning steps are + # Initiate boolean with False, only set to true if all cleaning steps are # passed df_lines = df_lines.copy() df_lines["cleaned"] = False @@ -678,13 +717,17 @@ def _clean_lines(df_lines, list_voltages): df_lines["circuits_original"] = df_lines["circuits"] df_lines = _split_cells(df_lines) - bool_voltages = df_lines["voltage"].apply(_check_voltage, list_voltages=list_voltages) + bool_voltages = df_lines["voltage"].apply( + _check_voltage, list_voltages=list_voltages + ) df_lines = df_lines[bool_voltages] bool_ac = df_lines["frequency"] != "0" bool_dc = ~bool_ac valid_frequency = ["50", "0"] - bool_invalid_frequency = df_lines["frequency"].apply(lambda x: x not in valid_frequency) + bool_invalid_frequency = df_lines["frequency"].apply( + lambda x: x not in valid_frequency + ) bool_noinfo = (df_lines["cables"] == "") & (df_lines["circuits"] == "") # Fill in all values where cables info and circuits does not 
exist. Assuming 1 circuit @@ -693,100 +736,127 @@ def _clean_lines(df_lines, list_voltages): df_lines.loc[bool_noinfo, "cleaned"] = True # Fill in all values where cables info exists and split_elements == 1 - bool_cables_ac = (df_lines["cables"] != "") & \ - (df_lines["split_elements"] == 1) & \ - (df_lines["cables"] != "0") & \ - (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & \ - (df_lines["circuits"] == "") & \ - (df_lines["cleaned"] == False) & \ - bool_ac - - df_lines.loc[bool_cables_ac, "circuits"] = df_lines.loc[bool_cables_ac, "cables"] \ - .apply(lambda x: str(int(max(1, np.floor_divide(int(x),3))))) - + bool_cables_ac = ( + (df_lines["cables"] != "") + & (df_lines["split_elements"] == 1) + & (df_lines["cables"] != "0") + & (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) + & (df_lines["circuits"] == "") + & (df_lines["cleaned"] == False) + & bool_ac + ) + + df_lines.loc[bool_cables_ac, "circuits"] = df_lines.loc[ + bool_cables_ac, "cables" + ].apply(lambda x: str(int(max(1, np.floor_divide(int(x), 3))))) + df_lines.loc[bool_cables_ac, "frequency"] = "50" df_lines.loc[bool_cables_ac, "cleaned"] = True - bool_cables_dc = (df_lines["cables"] != "") & \ - (df_lines["split_elements"] == 1) & \ - (df_lines["cables"] != "0") & \ - (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & \ - (df_lines["circuits"] == "") & \ - (df_lines["cleaned"] == False) & \ - bool_dc - - df_lines.loc[bool_cables_dc, "circuits"] = df_lines.loc[bool_cables_dc, "cables"] \ - .apply(lambda x: str(int(max(1, np.floor_divide(int(x),2))))) - + bool_cables_dc = ( + (df_lines["cables"] != "") + & (df_lines["split_elements"] == 1) + & (df_lines["cables"] != "0") + & (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) + & (df_lines["circuits"] == "") + & (df_lines["cleaned"] == False) + & bool_dc + ) + + df_lines.loc[bool_cables_dc, "circuits"] = df_lines.loc[ + bool_cables_dc, "cables" + ].apply(lambda x: str(int(max(1, np.floor_divide(int(x), 
2))))) + df_lines.loc[bool_cables_dc, "frequency"] = "0" df_lines.loc[bool_cables_dc, "cleaned"] = True # Fill in all values where circuits info exists and split_elements == 1 - bool_lines = (df_lines["circuits"] != "") & \ - (df_lines["split_elements"] == 1) & \ - (df_lines["circuits"] != "0") & \ - (df_lines["circuits"].apply(lambda x: len(x.split(";")) == 1)) & \ - (df_lines["cleaned"] == False) - + bool_lines = ( + (df_lines["circuits"] != "") + & (df_lines["split_elements"] == 1) + & (df_lines["circuits"] != "0") + & (df_lines["circuits"].apply(lambda x: len(x.split(";")) == 1)) + & (df_lines["cleaned"] == False) + ) + df_lines.loc[bool_lines & bool_ac, "frequency"] = "50" df_lines.loc[bool_lines & bool_dc, "frequency"] = "0" df_lines.loc[bool_lines, "cleaned"] = True - # Clean those values where number of voltages split by semicolon is larger + # Clean those values where number of voltages split by semicolon is larger # than no cables or no circuits - bool_cables = (df_lines["voltage_original"].apply(lambda x: len(x.split(";")) > 1)) & \ - (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) & \ - (df_lines["circuits"].apply(lambda x: len(x.split(";")) == 1)) & \ - (df_lines["cleaned"] == False) - - df_lines.loc[bool_cables, "circuits"] = df_lines[bool_cables] \ - .apply(_distribute_to_circuits, axis=1) + bool_cables = ( + (df_lines["voltage_original"].apply(lambda x: len(x.split(";")) > 1)) + & (df_lines["cables"].apply(lambda x: len(x.split(";")) == 1)) + & (df_lines["circuits"].apply(lambda x: len(x.split(";")) == 1)) + & (df_lines["cleaned"] == False) + ) + + df_lines.loc[bool_cables, "circuits"] = df_lines[bool_cables].apply( + _distribute_to_circuits, axis=1 + ) df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" df_lines.loc[bool_cables, "cleaned"] = True - # Clean those values where multiple circuit values are present, divided by + # Clean those values where multiple circuit values are 
present, divided by # semicolon - bool_cables = (df_lines["circuits"].apply(lambda x: len(x.split(";")) > 1)) & \ - (df_lines.apply(lambda row: len(row["circuits"].split(";")) == row["split_elements"], axis=1)) & \ - (df_lines["cleaned"] == False) - - df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables] \ - .apply(lambda row: str(row["circuits"].split(";")[ - int(row["id"].split("-")[-1])-1 - ]), axis=1) - + bool_cables = ( + (df_lines["circuits"].apply(lambda x: len(x.split(";")) > 1)) + & ( + df_lines.apply( + lambda row: len(row["circuits"].split(";")) == row["split_elements"], + axis=1, + ) + ) + & (df_lines["cleaned"] == False) + ) + + df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables].apply( + lambda row: str(row["circuits"].split(";")[int(row["id"].split("-")[-1]) - 1]), + axis=1, + ) + df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" df_lines.loc[bool_cables, "cleaned"] = True - # Clean those values where multiple cables values are present, divided by + # Clean those values where multiple cables values are present, divided by # semicolon - bool_cables = (df_lines["cables"].apply(lambda x: len(x.split(";")) > 1)) & \ - (df_lines.apply(lambda row: len(row["cables"].split(";")) == row["split_elements"], axis=1)) & \ - (df_lines["cleaned"] == False) + bool_cables = ( + (df_lines["cables"].apply(lambda x: len(x.split(";")) > 1)) + & ( + df_lines.apply( + lambda row: len(row["cables"].split(";")) == row["split_elements"], + axis=1, + ) + ) + & (df_lines["cleaned"] == False) + ) - df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables] \ - .apply(lambda row: - str(max(1, + df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables].apply( + lambda row: str( + max( + 1, np.floor_divide( - int(row["cables"].split(";")[int(row["id"].split("-")[-1])-1]), - 3 - ) - )), - axis=1) - + int(row["cables"].split(";")[int(row["id"].split("-")[-1]) - 1]), 3 + ), + ) + ), + 
axis=1, + ) + df_lines.loc[bool_cables & bool_ac, "frequency"] = "50" df_lines.loc[bool_cables & bool_dc, "frequency"] = "0" df_lines.loc[bool_cables, "cleaned"] = True # All remaining lines to circuits == 1 - bool_leftover = (df_lines["cleaned"] == False) + bool_leftover = df_lines["cleaned"] == False if sum(bool_leftover) > 0: str_id = "; ".join(str(id) for id in df_lines.loc[bool_leftover, "id"]) logger.info(f"Setting circuits of remaining {sum(bool_leftover)} lines to 1...") logger.info(f"Lines affected: {str_id}") - + df_lines.loc[bool_leftover, "circuits"] = "1" df_lines.loc[bool_leftover & bool_ac, "frequency"] = "50" df_lines.loc[bool_leftover & bool_dc, "frequency"] = "0" @@ -800,20 +870,21 @@ def _create_substations_geometry(df_substations): Creates centroids from geometries and keeps the original polygons. Parameters: - df_substations (DataFrame): The input DataFrame containing the substations + df_substations (DataFrame): The input DataFrame containing the substations data. Returns: - df_substations (DataFrame): A new DataFrame with the centroids ["geometry"] + df_substations (DataFrame): A new DataFrame with the centroids ["geometry"] and polygons ["polygon"] of the substations geometries. - """ logger.info("Creating substations geometry.") df_substations = df_substations.copy() - + # Create centroids from geometries and keep the original polygons df_substations.loc[:, "polygon"] = df_substations["geometry"] - df_substations.loc[:, "geometry"] = df_substations["geometry"].apply(lambda x: x.centroid) + df_substations.loc[:, "geometry"] = df_substations["geometry"].apply( + lambda x: x.centroid + ) df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x) df_substations.loc[:, "lat"] = df_substations["geometry"].apply(lambda x: x.y) @@ -828,21 +899,21 @@ def _create_lines_geometry(df_lines): - df_lines (pandas.DataFrame): DataFrame containing lines data. 
Returns: - - df_lines (pandas.DataFrame): DataFrame with transformed 'geometry' + - df_lines (pandas.DataFrame): DataFrame with transformed 'geometry' column (type: shapely LineString). Notes: - - This function transforms 'geometry' column in the input DataFrame by + - This function transforms 'geometry' column in the input DataFrame by applying the '_create_linestring' function to each row. - - It then drops rows where the geometry has equal start and end points, + - It then drops rows where the geometry has equal start and end points, as these are usually not lines but outlines of areas. """ logger.info("Creating lines geometry.") df_lines = df_lines.copy() - df_lines.loc[:, "geometry"] = df_lines.apply(_create_linestring, axis=1) + df_lines.loc[:, "geometry"] = df_lines.apply(_create_linestring, axis=1) - bool_circle = df_lines["geometry"].apply(lambda x: x.coords[0] == x.coords[-1]) - df_lines = df_lines[~bool_circle] + bool_circle = df_lines["geometry"].apply(lambda x: x.coords[0] == x.coords[-1]) + df_lines = df_lines[~bool_circle] return df_lines @@ -852,11 +923,11 @@ def _finalise_substations(df_substations): Finalises the substations column types. Args: - df_substations (pandas.DataFrame): The input DataFrame + df_substations (pandas.DataFrame): The input DataFrame containing substations data. Returns: - df_substations (pandas.DataFrame(): The DataFrame with finalised column + df_substations (pandas.DataFrame(): The DataFrame with finalised column types and transformed data. 
""" logger.info("Finalising substations column types.") @@ -864,12 +935,14 @@ def _finalise_substations(df_substations): # rename columns df_substations.rename( columns={ - "id": "bus_id", + "id": "bus_id", "power": "symbol", - "substation":"tag_substation", - }, inplace=True) - - # Initiate new columns for subsequent build_osm_network step + "substation": "tag_substation", + }, + inplace=True, + ) + + # Initiate new columns for subsequent build_osm_network step df_substations.loc[:, "symbol"] = "substation" df_substations.loc[:, "tag_substation"] = "transmission" df_substations.loc[:, "dc"] = False @@ -880,23 +953,25 @@ def _finalise_substations(df_substations): df_substations.loc[:, "tag_source"] = df_substations["bus_id"] # Only included needed columns - df_substations = df_substations[[ - "bus_id", - "symbol", - "tag_substation", - "voltage", - "lon", - "lat", - "dc", - "under_construction", - "station_id", - "tag_area", - "country", - "geometry", - "polygon", - "tag_source", - ]] - + df_substations = df_substations[ + [ + "bus_id", + "symbol", + "tag_substation", + "voltage", + "lon", + "lat", + "dc", + "under_construction", + "station_id", + "tag_area", + "country", + "geometry", + "polygon", + "tag_source", + ] + ] + # Substation data types df_substations["voltage"] = df_substations["voltage"].astype(int) @@ -911,7 +986,7 @@ def _finalise_lines(df_lines): df_lines (pandas.DataFrame): The input DataFrame containing lines data. Returns: - df_lines (pandas.DataFrame(): The DataFrame with finalised column types + df_lines (pandas.DataFrame(): The DataFrame with finalised column types and transformed data. 
""" logger.info("Finalising lines column types.") @@ -919,11 +994,13 @@ def _finalise_lines(df_lines): # Rename columns df_lines.rename( columns={ - "id": "line_id", + "id": "line_id", "power": "tag_type", - "frequency":"tag_frequency", - }, inplace=True) - + "frequency": "tag_frequency", + }, + inplace=True, + ) + # Initiate new columns for subsequent build_osm_network step df_lines.loc[:, "bus0"] = None df_lines.loc[:, "bus1"] = None @@ -937,22 +1014,24 @@ def _finalise_lines(df_lines): df_lines.loc[df_lines["tag_frequency"] == "0", "dc"] = True # Only include needed columns - df_lines = df_lines[[ - "line_id", - "circuits", - "tag_type", - "voltage", - "tag_frequency", - "bus0", - "bus1", - "length", - "underground", - "under_construction", - "dc", - "country", - "geometry", - ]] - + df_lines = df_lines[ + [ + "line_id", + "circuits", + "tag_type", + "voltage", + "tag_frequency", + "bus0", + "bus1", + "length", + "underground", + "under_construction", + "dc", + "country", + "geometry", + ] + ] + # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) # This workaround is needed as otherwise the column dtypes remain "objects" df_lines["circuits"] = df_lines["circuits"].astype(int) @@ -965,81 +1044,107 @@ def _finalise_lines(df_lines): def _import_substations(path_substations): """ Import substations from the given input paths. This function imports both - substations from OSM ways as well as relations that contain nested + substations from OSM ways as well as relations that contain nested information on the substations shape and electrical parameters. Ways and - relations are subsequently concatenated to form a single DataFrame + relations are subsequently concatenated to form a single DataFrame containing unique bus ids. Args: - path_substations (dict): A dictionary containing input paths for + path_substations (dict): A dictionary containing input paths for substations. Returns: pd.DataFrame: A DataFrame containing the imported substations data. 
""" - cols_substations_way = ["id", "geometry", "country", "power", "substation", "voltage", "frequency"] - cols_substations_relation = ["id", "country", "power", "substation", "voltage", "frequency"] - df_substations_way = pd.DataFrame(columns = cols_substations_way) - df_substations_relation = pd.DataFrame(columns = cols_substations_relation) + cols_substations_way = [ + "id", + "geometry", + "country", + "power", + "substation", + "voltage", + "frequency", + ] + cols_substations_relation = [ + "id", + "country", + "power", + "substation", + "voltage", + "frequency", + ] + df_substations_way = pd.DataFrame(columns=cols_substations_way) + df_substations_relation = pd.DataFrame(columns=cols_substations_relation) logger.info("Importing substations") for key in path_substations: logger.info(f"Processing {key}...") for idx, ip in enumerate(path_substations[key]): - if os.path.exists(ip) and os.path.getsize(ip) > 400: # unpopulated OSM json is about 51 bytes - country = os.path.basename(os.path.dirname(path_substations[key][idx])) + if ( + os.path.exists(ip) and os.path.getsize(ip) > 400 + ): # unpopulated OSM json is about 51 bytes + country = os.path.basename(os.path.dirname(path_substations[key][idx])) logger.info( f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_substations[key])).zfill(2)}: {ip}" - ) + ) with open(ip, "r") as f: data = json.load(f) - - df = pd.DataFrame(data['elements']) + + df = pd.DataFrame(data["elements"]) df["id"] = df["id"].astype(str) # new string that adds "way/" to id - df["id"] = df["id"].apply(lambda x: f"way/{x}" if key == "substations_way" else f"relation/{x}") + df["id"] = df["id"].apply( + lambda x: ( + f"way/{x}" if key == "substations_way" else f"relation/{x}" + ) + ) df["country"] = country col_tags = ["power", "substation", "voltage", "frequency"] - tags = pd.json_normalize(df["tags"]) \ - .map(lambda x: str(x) if pd.notnull(x) else x) - + tags = pd.json_normalize(df["tags"]).map( + lambda x: str(x) if pd.notnull(x) else 
x + ) + for ct in col_tags: if ct not in tags.columns: tags[ct] = pd.NA - + tags = tags.loc[:, col_tags] - df = pd.concat([df, tags], axis="columns") + df = pd.concat([df, tags], axis="columns") if key == "substations_way": df.drop(columns=["type", "tags", "bounds", "nodes"], inplace=True) - df_substations_way = pd.concat([df_substations_way, df], axis="rows") + df_substations_way = pd.concat( + [df_substations_way, df], axis="rows" + ) elif key == "substations_relation": df.drop(columns=["type", "tags", "bounds"], inplace=True) - df_substations_relation = pd.concat([df_substations_relation, df], axis="rows") + df_substations_relation = pd.concat( + [df_substations_relation, df], axis="rows" + ) else: logger.info( f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_substations[key])).zfill(2)} (empty): {ip}" - ) + ) continue logger.info("---") - df_substations_way.drop_duplicates(subset='id', keep='first', inplace=True) - df_substations_relation.drop_duplicates(subset='id', keep='first', inplace=True) + df_substations_way.drop_duplicates(subset="id", keep="first", inplace=True) + df_substations_relation.drop_duplicates(subset="id", keep="first", inplace=True) df_substations_way["geometry"] = df_substations_way.apply(_create_polygon, axis=1) # Normalise the members column of df_substations_relation cols_members = ["id", "type", "ref", "role", "geometry"] - df_substations_relation_members = pd.DataFrame(columns = cols_members) + df_substations_relation_members = pd.DataFrame(columns=cols_members) for index, row in df_substations_relation.iterrows(): col_members = ["type", "ref", "role", "geometry"] - df = pd.json_normalize(row["members"]) - + df = pd.json_normalize(row["members"]) + for cm in col_members: if cm not in df.columns: df[cm] = pd.NA @@ -1050,38 +1155,52 @@ def _import_substations(path_substations): df = df[df["type"] != "node"] df = df.dropna(subset=["geometry"]) df = df[~df["role"].isin(["", "incoming_line", "substation", "inner"])] - 
df_substations_relation_members = pd.concat([df_substations_relation_members, df], axis="rows") - + df_substations_relation_members = pd.concat( + [df_substations_relation_members, df], axis="rows" + ) + df_substations_relation_members.reset_index(inplace=True) - df_substations_relation_members["linestring"] = df_substations_relation_members.apply(_create_linestring, axis=1) - df_substations_relation_members_grouped = df_substations_relation_members.groupby('id')['linestring'] \ - .apply(lambda x: linemerge(x.tolist())).reset_index() - df_substations_relation_members_grouped["geometry"] = df_substations_relation_members_grouped["linestring"] \ - .apply(lambda x: x.convex_hull) - - df_substations_relation = df_substations_relation.join( - df_substations_relation_members_grouped.set_index('id'), - on='id', how='left' - ).drop(columns=["members", "linestring"]) \ + df_substations_relation_members["linestring"] = ( + df_substations_relation_members.apply(_create_linestring, axis=1) + ) + df_substations_relation_members_grouped = ( + df_substations_relation_members.groupby("id")["linestring"] + .apply(lambda x: linemerge(x.tolist())) + .reset_index() + ) + df_substations_relation_members_grouped["geometry"] = ( + df_substations_relation_members_grouped["linestring"].apply( + lambda x: x.convex_hull + ) + ) + + df_substations_relation = ( + df_substations_relation.join( + df_substations_relation_members_grouped.set_index("id"), on="id", how="left" + ) + .drop(columns=["members", "linestring"]) .dropna(subset=["geometry"]) - + ) + # reorder columns and concatenate df_substations_relation = df_substations_relation[cols_substations_way] - df_substations = pd.concat([df_substations_way, df_substations_relation], axis="rows") + df_substations = pd.concat( + [df_substations_way, df_substations_relation], axis="rows" + ) return df_substations def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): """ - Removes lines that are within substation polygons from 
the given - GeoDataFrame of lines. These are not needed to create network (e.g. bus + Removes lines that are within substation polygons from the given + GeoDataFrame of lines. These are not needed to create network (e.g. bus bars, switchgear, etc.) Parameters: - - gdf_lines (GeoDataFrame): A GeoDataFrame containing lines with 'line_id' + - gdf_lines (GeoDataFrame): A GeoDataFrame containing lines with 'line_id' and 'geometry' columns. - - gdf_substations_polygon (GeoDataFrame): A GeoDataFrame containing + - gdf_substations_polygon (GeoDataFrame): A GeoDataFrame containing substation polygons. Returns: @@ -1089,13 +1208,15 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): """ logger.info("Identifying and removing lines within substation polygons...") gdf = gpd.sjoin( - gdf_lines[["line_id", "geometry"]], - gdf_substations_polygon, + gdf_lines[["line_id", "geometry"]], + gdf_substations_polygon, how="inner", - predicate="within" + predicate="within", )["line_id"] - logger.info(f"Removed {len(gdf)} lines within substations of original {len(gdf_lines)} lines.") + logger.info( + f"Removed {len(gdf)} lines within substations of original {len(gdf_lines)} lines." + ) gdf_lines = gdf_lines[~gdf_lines["line_id"].isin(gdf)] return gdf_lines @@ -1106,15 +1227,15 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): from _helpers import mock_snakemake snakemake = mock_snakemake("clean_osm_data") - + configure_logging(snakemake) set_scenario_config(snakemake) - + # Parameters - crs = "EPSG:4326" # Correct crs for OSM data - voltage_min = 200000 # [unit: V] Minimum voltage value to filter lines. + crs = "EPSG:4326" # Correct crs for OSM data + voltage_min = 200000 # [unit: V] Minimum voltage value to filter lines. 
- # TODO pypsa-eur: Temporary solution as one AC line between converters will + # TODO pypsa-eur: Temporary solution as one AC line between converters will # create an error in simplify_network: lines_to_drop = ["775580659"] @@ -1129,7 +1250,9 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): # Cleaning process df_substations = _import_substations(path_substations) df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) - df_substations, list_voltages = _filter_by_voltage(df_substations, voltage_min=voltage_min) + df_substations, list_voltages = _filter_by_voltage( + df_substations, voltage_min=voltage_min + ) df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) df_substations = _clean_substations(df_substations, list_voltages) df_substations = _create_substations_geometry(df_substations) @@ -1138,8 +1261,8 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): # Create polygon GeoDataFrame to remove lines within substations gdf_substations_polygon = gpd.GeoDataFrame( df_substations[["bus_id", "polygon", "voltage"]], - geometry = "polygon", - crs = crs, + geometry="polygon", + crs=crs, ) logger.info("---") @@ -1161,28 +1284,31 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): df_lines = _clean_lines(df_lines, list_voltages) df_lines = _create_lines_geometry(df_lines) df_lines = _finalise_lines(df_lines) - + # Dropping specific lines, manually if lines_to_drop in df_lines["line_id"].values: - df_lines.drop(df_lines[df_lines["line_id"].isin(lines_to_drop)].index, inplace=True) - + df_lines.drop( + df_lines[df_lines["line_id"].isin(lines_to_drop)].index, inplace=True + ) + # Create GeoDataFrame - gdf_lines = gpd.GeoDataFrame(df_lines, geometry = "geometry", crs = crs) + gdf_lines = gpd.GeoDataFrame(df_lines, geometry="geometry", crs=crs) gdf_lines = _remove_lines_within_substations(gdf_lines, gdf_substations_polygon) # Add line endings to substations 
path_country_shapes = snakemake.input.country_shapes path_offshore_shapes = snakemake.input.offshore_shapes df_substations = _add_line_endings_to_substations( - df_substations, + df_substations, gdf_lines, path_country_shapes, path_offshore_shapes, - ) - + ) + # Drop polygons and create GDF - gdf_substations = gpd.GeoDataFrame(df_substations.drop(columns=["polygon"]), - geometry = "geometry", crs = crs) + gdf_substations = gpd.GeoDataFrame( + df_substations.drop(columns=["polygon"]), geometry="geometry", crs=crs + ) # Export GeoDataFrames to GeoJSON in specified output paths parentfolder = os.path.dirname(snakemake.output.substations) @@ -1192,11 +1318,13 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): output_substations = snakemake.output["substations"] output_lines = snakemake.output["lines"] - logger.info(f"Exporting clean substations with polygon shapes to {output_substations_polygon}") - gdf_substations_polygon.to_file(output_substations_polygon, driver="GeoJSON") + logger.info( + f"Exporting clean substations with polygon shapes to {output_substations_polygon}" + ) + gdf_substations_polygon.to_file(output_substations_polygon, driver="GeoJSON") logger.info(f"Exporting clean substations to {output_substations}") - gdf_substations.to_file(output_substations, driver="GeoJSON") + gdf_substations.to_file(output_substations, driver="GeoJSON") logger.info(f"Exporting clean lines to {output_lines}") gdf_lines.to_file(output_lines, driver="GeoJSON") - logger.info("Cleaning OSM data completed.") \ No newline at end of file + logger.info("Cleaning OSM data completed.") diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 0ad9743e4..901145728 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -2,40 +2,42 @@ # SPDX-FileCopyrightText: : 2020-2024 The PyPSA-Eur Authors # # SPDX-License-Identifier: MIT - """ -Retrieve OSM data for the specified country using the overpass API and save it -to 
the specified output files. Note that overpass requests are based on a fair -use policy. `retrieve_osm_data` is meant to be used in a way that respects this -policy by fetching the needed data once, only. +Retrieve OSM data for the specified country using the overpass API and save it +to the specified output files. + +Note that overpass requests are based on a fair +use policy. `retrieve_osm_data` is meant to be used in a way that respects this +policy by fetching the needed data once, only. """ import json import logging import os -import requests import time +import requests from _helpers import configure_logging + logger = logging.getLogger(__name__) -# Function currently not needed - Kept for backup purposes to retrieve the OSM +# Function currently not needed - Kept for backup purposes to retrieve the OSM # area code if needed in the future def _get_overpass_areas(countries): """ Retrieve the OSM area codes for the specified country codes. - + Parameters ---------- countries : str or list - A single country code or a list of country codes for which the OSM area + A single country code or a list of country codes for which the OSM area codes should be retrieved. Returns ------- dict - A dictionary mapping country codes to their corresponding OSM area + A dictionary mapping country codes to their corresponding OSM area codes. """ @@ -65,37 +67,40 @@ def _get_overpass_areas(countries): # Check if the response contains any results if "elements" in data and len(data["elements"]) > 0: # Extract the area ID from the relation - if c == "FR": # take second one for France + if c == "FR": # take second one for France osm_area_id = data["elements"][1]["id"] else: osm_area_id = data["elements"][0]["id"] osm_areas.append(f"area({osm_area_id})") else: # Print a warning if no results are found for the country code - logger.info(f"No area code found for the specified country " - f"code: {c}. 
Omitted from the list.") + logger.info( + f"No area code found for the specified country " + f"code: {c}. Omitted from the list." + ) except json.JSONDecodeError as e: logger.error(f"JSON decode error for country {c}: {e}") logger.debug(f"Response text: {response.text}") - - # Create a dictionary mapping country codes to their corresponding OSM area + + # Create a dictionary mapping country codes to their corresponding OSM area # codes op_areas_dict = dict(zip(countries, osm_areas)) - + return op_areas_dict - + def retrieve_osm_data( - country, - output, - features=[ - "cables_way", - "lines_way", - "substations_way", - "substations_relation", - ]): + country, + output, + features=[ + "cables_way", + "lines_way", + "substations_way", + "substations_relation", + ], +): """ - Retrieve OSM data for the specified country and save it to the specified + Retrieve OSM data for the specified country and save it to the specified output files. Parameters @@ -103,7 +108,7 @@ def retrieve_osm_data( country : str The country code for which the OSM data should be retrieved. output : dict - A dictionary mapping feature names to the corresponding output file + A dictionary mapping feature names to the corresponding output file paths. Saving the OSM data to .json files. features : list, optional A list of OSM features to retrieve. The default is [ @@ -119,7 +124,7 @@ def retrieve_osm_data( # More features can in theory be retrieved that are currently not needed # to build a functioning network. 
The following power-related # features are supported: - + # features_dict= { # 'cables_way': 'way["power"="cable"]', # 'lines_way': 'way["power"="line"]', @@ -130,38 +135,44 @@ def retrieve_osm_data( # 'route_relations': 'rel["route"="power"]["type"="route"]' # } - features_dict= { - 'cables_way': 'way["power"="cable"]', - 'lines_way': 'way["power"="line"]', - 'substations_way': 'way["power"="substation"]', - 'substations_relation': 'relation["power"="substation"]', + features_dict = { + "cables_way": 'way["power"="cable"]', + "lines_way": 'way["power"="line"]', + "substations_way": 'way["power"="substation"]', + "substations_relation": 'relation["power"="substation"]', } wait_time = 5 for f in features: if f not in features_dict: - logger.info(f"Invalid feature: {f}. Supported features: {list(features_dict.keys())}") - raise ValueError(f"Invalid feature: {f}. Supported features: {list(features_dict.keys())}") + logger.info( + f"Invalid feature: {f}. Supported features: {list(features_dict.keys())}" + ) + raise ValueError( + f"Invalid feature: {f}. Supported features: {list(features_dict.keys())}" + ) retries = 3 for attempt in range(retries): - logger.info(f" - Fetching OSM data for feature '{f}' in {country} (Attempt {attempt+1})...") + logger.info( + f" - Fetching OSM data for feature '{f}' in {country} (Attempt {attempt+1})..." 
+ ) # Build the overpass query op_area = f'area["ISO3166-1"="{country}"]' - op_query = f''' + op_query = f""" [out:json]; {op_area}->.searchArea; ( {features_dict[f]}(area.searchArea); ); out body geom; - ''' + """ try: # Send the request - response = requests.post(overpass_url, data = op_query) - response.raise_for_status() # Raise HTTPError for bad responses + response = requests.post(overpass_url, data=op_query) + response.raise_for_status() # Raise HTTPError for bad responses data = response.json() filepath = output[f] @@ -169,13 +180,15 @@ def retrieve_osm_data( if not os.path.exists(parentfolder): os.makedirs(parentfolder) - with open(filepath, mode = "w") as f: - json.dump(response.json(),f,indent=2) + with open(filepath, mode="w") as f: + json.dump(response.json(), f, indent=2) logger.info(" - Done.") break # Exit the retry loop on success except (json.JSONDecodeError, requests.exceptions.RequestException) as e: logger.error(f"Error for feature '{f}' in country {country}: {e}") - logger.debug(f"Response text: {response.text if response else 'No response'}") + logger.debug( + f"Response text: {response.text if response else 'No response'}" + ) if attempt < retries - 1: wait_time += 15 logger.info(f"Waiting {wait_time} seconds before retrying...") @@ -183,11 +196,13 @@ def retrieve_osm_data( else: logger.error( f"Failed to retrieve data for feature '{f}' in country {country} after {retries} attempts." - ) + ) except Exception as e: - # For now, catch any other exceptions and log them. Treat this + # For now, catch any other exceptions and log them. Treat this # the same as a RequestException and try to run again two times. 
- logger.error(f"Unexpected error for feature '{f}' in country {country}: {e}") + logger.error( + f"Unexpected error for feature '{f}' in country {country}: {e}" + ) if attempt < retries - 1: wait_time += 10 logger.info(f"Waiting {wait_time} seconds before retrying...") @@ -195,7 +210,7 @@ def retrieve_osm_data( else: logger.error( f"Failed to retrieve data for feature '{f}' in country {country} after {retries} attempts." - ) + ) if __name__ == "__main__": @@ -203,11 +218,11 @@ def retrieve_osm_data( from _helpers import mock_snakemake snakemake = mock_snakemake("retrieve_osm_data", country="BE") - + configure_logging(snakemake) # Retrieve the OSM data country = snakemake.wildcards.country output = snakemake.output - retrieve_osm_data(country, output) \ No newline at end of file + retrieve_osm_data(country, output) From f2761a2bb10fabc3f621be18c80cd6708b0deece Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Fri, 24 May 2024 10:09:17 +0200 Subject: [PATCH 025/100] Removed overpass from required packages. Not needed anymore. 
--- envs/environment.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/envs/environment.yaml b/envs/environment.yaml index 2b771f0b9..dd5df3250 100644 --- a/envs/environment.yaml +++ b/envs/environment.yaml @@ -64,5 +64,4 @@ dependencies: - snakemake-storage-plugin-http - snakemake-executor-plugin-slurm - snakemake-executor-plugin-cluster-generic - - highspy - - overpass + - highspy \ No newline at end of file From c71a3b934b2e50ed2c3c8572887ebe329cb77bd6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 08:10:45 +0000 Subject: [PATCH 026/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- envs/environment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/environment.yaml b/envs/environment.yaml index dd5df3250..fbc61d367 100644 --- a/envs/environment.yaml +++ b/envs/environment.yaml @@ -64,4 +64,4 @@ dependencies: - snakemake-storage-plugin-http - snakemake-executor-plugin-slurm - snakemake-executor-plugin-cluster-generic - - highspy \ No newline at end of file + - highspy From ff882e69f9aedb4e5aa9f3ecb399e0c85f654355 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Fri, 24 May 2024 14:14:26 +0200 Subject: [PATCH 027/100] Added links_relations (route = power, frequency = 0) to retrieval. This will change how HVDC links are extracted in the near future. 
--- rules/build_electricity.smk | 13 +++--- scripts/retrieve_osm_data.py | 83 +----------------------------------- 2 files changed, 9 insertions(+), 87 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index a1e44f5fa..683b75d57 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -616,6 +616,7 @@ rule retrieve_osm_data: output: cables_way="data/osm/raw/{country}/cables_way.json", lines_way="data/osm/raw/{country}/lines_way.json", + links_relation="data/osm/raw/{country}/links_relation.json", substations_way="data/osm/raw/{country}/substations_way.json", substations_relation="data/osm/raw/{country}/substations_relation.json", log: @@ -630,19 +631,19 @@ rule retrieve_osm_data: rule clean_osm_data: input: cables_way=[ - f"data/osm/raw/{country}/cables_way.json" - for country in config["countries"] + f"data/osm/raw/{country}/cables_way.json" for country in config["countries"] ], lines_way=[ f"data/osm/raw/{country}/lines_way.json" for country in config["countries"] ], + links_relation=[ + f"data/osm/raw/{country}/links_relation.json" for country in config["countries"] + ], substations_way=[ - f"data/osm/raw/{country}/substations_way.json" - for country in config["countries"] + f"data/osm/raw/{country}/substations_way.json" for country in config["countries"] ], substations_relation=[ - f"data/osm/raw/{country}/substations_relation.json" - for country in config["countries"] + f"data/osm/raw/{country}/substations_relation.json" for country in config["countries"] ], offshore_shapes=resources("offshore_shapes.geojson"), country_shapes=resources("country_shapes.geojson"), diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 901145728..bad99df3a 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -22,79 +22,13 @@ logger = logging.getLogger(__name__) -# Function currently not needed - Kept for backup purposes to retrieve the OSM -# area code if needed in the future 
-def _get_overpass_areas(countries): - """ - Retrieve the OSM area codes for the specified country codes. - - Parameters - ---------- - countries : str or list - A single country code or a list of country codes for which the OSM area - codes should be retrieved. - - Returns - ------- - dict - A dictionary mapping country codes to their corresponding OSM area - codes. - """ - - # If a single country code is provided, convert it to a list - if not isinstance(countries, list): - countries = [countries] - - # Overpass API endpoint URL - overpass_url = "https://overpass-api.de/api/interpreter" - - osm_areas = [] - for c in countries: - # Overpass query to fetch the relation for the specified country code - overpass_query = f""" - [out:json]; - area["ISO3166-1"="{c}"]; - out; - """ - - # Send the request to Overpass API - response = requests.post(overpass_url, data=overpass_query) - - try: - # Parse the response - data = response.json() - - # Check if the response contains any results - if "elements" in data and len(data["elements"]) > 0: - # Extract the area ID from the relation - if c == "FR": # take second one for France - osm_area_id = data["elements"][1]["id"] - else: - osm_area_id = data["elements"][0]["id"] - osm_areas.append(f"area({osm_area_id})") - else: - # Print a warning if no results are found for the country code - logger.info( - f"No area code found for the specified country " - f"code: {c}. Omitted from the list." 
- ) - except json.JSONDecodeError as e: - logger.error(f"JSON decode error for country {c}: {e}") - logger.debug(f"Response text: {response.text}") - - # Create a dictionary mapping country codes to their corresponding OSM area - # codes - op_areas_dict = dict(zip(countries, osm_areas)) - - return op_areas_dict - - def retrieve_osm_data( country, output, features=[ "cables_way", "lines_way", + "links_relation", "substations_way", "substations_relation", ], @@ -121,23 +55,10 @@ def retrieve_osm_data( # Overpass API endpoint URL overpass_url = "https://overpass-api.de/api/interpreter" - # More features can in theory be retrieved that are currently not needed - # to build a functioning network. The following power-related - # features are supported: - - # features_dict= { - # 'cables_way': 'way["power"="cable"]', - # 'lines_way': 'way["power"="line"]', - # 'substations_way': 'way["power"="substation"]', - # 'substations_node': 'node["power"="substation"]', - # 'transformers_way': 'way["power"="transformer"]', - # 'transformers_node': 'node["power"="transformer"]', - # 'route_relations': 'rel["route"="power"]["type"="route"]' - # } - features_dict = { "cables_way": 'way["power"="cable"]', "lines_way": 'way["power"="line"]', + "links_relation": 'relation["route"="power"]["frequency"="0"]', "substations_way": 'way["power"="substation"]', "substations_relation": 'relation["power"="substation"]', } From 0c0aff7cc888b62611ec4cd85b9586fac5f71bff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 12:15:10 +0000 Subject: [PATCH 028/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- rules/build_electricity.smk | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 683b75d57..680dedd89 100644 --- a/rules/build_electricity.smk +++ 
b/rules/build_electricity.smk @@ -631,19 +631,23 @@ rule retrieve_osm_data: rule clean_osm_data: input: cables_way=[ - f"data/osm/raw/{country}/cables_way.json" for country in config["countries"] + f"data/osm/raw/{country}/cables_way.json" + for country in config["countries"] ], lines_way=[ f"data/osm/raw/{country}/lines_way.json" for country in config["countries"] ], links_relation=[ - f"data/osm/raw/{country}/links_relation.json" for country in config["countries"] + f"data/osm/raw/{country}/links_relation.json" + for country in config["countries"] ], substations_way=[ - f"data/osm/raw/{country}/substations_way.json" for country in config["countries"] + f"data/osm/raw/{country}/substations_way.json" + for country in config["countries"] ], substations_relation=[ - f"data/osm/raw/{country}/substations_relation.json" for country in config["countries"] + f"data/osm/raw/{country}/substations_relation.json" + for country in config["countries"] ], offshore_shapes=resources("offshore_shapes.geojson"), country_shapes=resources("country_shapes.geojson"), From ea9b3830b2beac8fd6c21468565bc0665766d549 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 28 May 2024 10:00:32 +0200 Subject: [PATCH 029/100] Work-in-progress clean_osm_data --- scripts/clean_osm_data.py | 122 ++++++++++++++++++++++++++++++++++---- 1 file changed, 111 insertions(+), 11 deletions(-) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 882c1229e..e40bd4234 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -513,7 +513,6 @@ def _import_lines_and_cables(path_lines): df["id"] = df["id"].astype(str) df["country"] = country - # col_tags = ["power", "cables", "circuits", "frequency", "voltage", "wires", "capacity", "rating"] col_tags = [ "power", "cables", @@ -548,6 +547,82 @@ def _import_lines_and_cables(path_lines): return df_lines +def _import_links(path_links): + """ + Import links from the given input paths. 
+ + Parameters: + - path_links (dict): A dictionary containing the input paths for links. + + Returns: + - df_links (DataFrame): A DataFrame containing the imported links data. + """ + columns = [ + "id", + "bounds", + "nodes", + "geometry", + "country", + "circuits", + "frequency", + "rating", + "voltage", + "wires", + ] + df_links = pd.DataFrame(columns=columns) + + logger.info("Importing links") + for key in path_links: + logger.info(f"Processing {key}...") + for idx, ip in enumerate(path_links[key]): + if ( + os.path.exists(ip) and os.path.getsize(ip) > 400 + ): # unpopulated OSM json is about 51 bytes + country = os.path.basename(os.path.dirname(path_links[key][idx])) + + logger.info( + f" - Importing {key} {str(idx+1).zfill(2)}/{str(len(path_links[key])).zfill(2)}: {ip}" + ) + with open(ip, "r") as f: + data = json.load(f) + + df = pd.DataFrame(data["elements"]) + df["id"] = df["id"].astype(str) + df["country"] = country + + col_tags = [ + "circuits", + "frequency", + "rating", + "voltage", + "wires", + ] + + tags = pd.json_normalize(df["tags"]).map( + lambda x: str(x) if pd.notnull(x) else x + ) + + for ct in col_tags: + if ct not in tags.columns: + tags[ct] = pd.NA + + tags = tags.loc[:, col_tags] + + df = pd.concat([df, tags], axis="columns") + df.drop(columns=["type", "tags"], inplace=True) + + df_links = pd.concat([df_links, df], axis="rows") + + else: + logger.info( + f" - Skipping {key} {str(idx+1).zfill(2)}/{str(len(path_links[key])).zfill(2)} (empty): {ip}" + ) + continue + logger.info("---") + + return df_links + + def _drop_duplicate_lines(df_lines): """ Drop duplicate lines from the given dataframe. Duplicates are usually lines @@ -586,29 +661,29 @@ def _drop_duplicate_lines(df_lines): return df_lines -def _filter_by_voltage(df, voltage_min=200000): +def _filter_by_voltage(df, min_voltage=200000): """ Filter rows in the DataFrame based on the voltage in V. Parameters: - df (pandas.DataFrame): The DataFrame containing the substations or lines data. 
- - voltage_min (int, optional): The minimum voltage value to filter the + - min_voltage (int, optional): The minimum voltage value to filter the rows. Defaults to 200000 [unit: V]. Returns: - filtered df (pandas.DataFrame): The filtered DataFrame containing - the lines or substations above voltage_min. - - list_voltages (list): A list of unique voltage values above voltage_min. + the lines or substations above min_voltage. + - list_voltages (list): A list of unique voltage values above min_voltage. The type of the list elements is string. """ logger.info( - f"Filtering dataframe by voltage. Only keeping rows above and including {voltage_min} V." + f"Filtering dataframe by voltage. Only keeping rows above and including {min_voltage} V." ) list_voltages = df["voltage"].str.split(";").explode().unique().astype(str) # Keep numeric strings list_voltages = list_voltages[np.vectorize(str.isnumeric)(list_voltages)] list_voltages = list_voltages.astype(int) - list_voltages = list_voltages[list_voltages >= int(voltage_min)] + list_voltages = list_voltages[list_voltages >= int(min_voltage_ac)] list_voltages = list_voltages.astype(str) bool_voltages = df["voltage"].apply(_check_voltage, list_voltages=list_voltages) @@ -630,7 +705,7 @@ def _clean_substations(df_substations, list_voltages): Parameters: - df_substations (pandas.DataFrame): The input dataframe containing substation data. - - list_voltages (list): A list of voltages above voltage_min to filter the + - list_voltages (list): A list of voltages above min_voltage to filter the substation data. Returns: @@ -1233,7 +1308,8 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): # Parameters crs = "EPSG:4326" # Correct crs for OSM data - voltage_min = 200000 # [unit: V] Minimum voltage value to filter lines. + min_voltage_ac = 200000 # [unit: V] Minimum voltage value to filter AC lines. + min_voltage_dc = 150000 # [unit: V] Minimum voltage value to filter DC links. 
# TODO pypsa-eur: Temporary solution as one AC line between converters will # create an error in simplify_network: @@ -1251,7 +1327,7 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): df_substations = _import_substations(path_substations) df_substations["voltage"] = _clean_voltage(df_substations["voltage"]) df_substations, list_voltages = _filter_by_voltage( - df_substations, voltage_min=voltage_min + df_substations, min_voltage=min_voltage_ac ) df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) df_substations = _clean_substations(df_substations, list_voltages) @@ -1276,7 +1352,7 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): df_lines = _import_lines_and_cables(path_lines) df_lines = _drop_duplicate_lines(df_lines) df_lines.loc[:, "voltage"] = _clean_voltage(df_lines["voltage"]) - df_lines, list_voltages = _filter_by_voltage(df_lines, voltage_min=voltage_min) + df_lines, list_voltages = _filter_by_voltage(df_lines, min_voltage=min_voltage_ac) df_lines.loc[:, "circuits"] = _clean_circuits(df_lines["circuits"]) df_lines.loc[:, "cables"] = _clean_cables(df_lines["cables"]) df_lines.loc[:, "frequency"] = _clean_frequency(df_lines["frequency"]) @@ -1327,4 +1403,28 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): logger.info(f"Exporting clean lines to {output_lines}") gdf_lines.to_file(output_lines, driver="GeoJSON") + logger.info("---") + logger.info("HVDC LINKS") + path_links = { + "links": snakemake.input.links_relation, + } + + + ### CONTINUE HERE + # Cleaning process + df_links = _import_links(path_links) + df_links = _drop_duplicate_lines(df_links) + df_links.loc[:, "voltage"] = _clean_voltage(df_links["voltage"]) + df_links, list_voltages = _filter_by_voltage(df_links, min_voltage=min_voltage_dc) + + + df_lines.loc[:, "circuits"] = _clean_circuits(df_lines["circuits"]) + df_lines.loc[:, "cables"] = _clean_cables(df_lines["cables"]) + df_lines.loc[:, 
"frequency"] = _clean_frequency(df_lines["frequency"]) + df_lines.loc[:, "wires"] = _clean_wires(df_lines["wires"]) + df_lines = _clean_lines(df_lines, list_voltages) + df_lines = _create_lines_geometry(df_lines) + df_lines = _finalise_lines(df_lines) + + logger.info("Cleaning OSM data completed.") From 055699ba170d1c73310016d8f84732ac13076b45 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 May 2024 08:02:14 +0000 Subject: [PATCH 030/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/clean_osm_data.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index e40bd4234..531169cdb 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -1309,7 +1309,7 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): # Parameters crs = "EPSG:4326" # Correct crs for OSM data min_voltage_ac = 200000 # [unit: V] Minimum voltage value to filter AC lines. - min_voltage_dc = 150000 # [unit: V] Minimum voltage value to filter DC links. + min_voltage_dc = 150000 # [unit: V] Minimum voltage value to filter DC links. 
# TODO pypsa-eur: Temporary solution as one AC line between converters will # create an error in simplify_network: @@ -1409,15 +1409,13 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): "links": snakemake.input.links_relation, } - ### CONTINUE HERE # Cleaning process df_links = _import_links(path_links) df_links = _drop_duplicate_lines(df_links) df_links.loc[:, "voltage"] = _clean_voltage(df_links["voltage"]) df_links, list_voltages = _filter_by_voltage(df_links, min_voltage=min_voltage_dc) - - + df_lines.loc[:, "circuits"] = _clean_circuits(df_lines["circuits"]) df_lines.loc[:, "cables"] = _clean_cables(df_lines["cables"]) df_lines.loc[:, "frequency"] = _clean_frequency(df_lines["frequency"]) @@ -1426,5 +1424,4 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): df_lines = _create_lines_geometry(df_lines) df_lines = _finalise_lines(df_lines) - logger.info("Cleaning OSM data completed.") From 2b9d6982cbdc57983861f7c43123d3a379fb441e Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Fri, 31 May 2024 18:34:44 +0200 Subject: [PATCH 031/100] Added clean links output to clean_osm_data. Script uses OSM relations to retrieve clean HVDC links. 
--- rules/build_electricity.smk | 1 + scripts/clean_osm_data.py | 164 +++++++++++++++++++++++++++++++++--- 2 files changed, 154 insertions(+), 11 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 680dedd89..b9161423c 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -655,6 +655,7 @@ rule clean_osm_data: substations=resources("osm/clean/substations.geojson"), substations_polygon=resources("osm/clean/substations_polygon.geojson"), lines=resources("osm/clean/lines.geojson"), + links=resources("osm/clean/links.geojson"), log: logs("clean_osm_data.log"), script: diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 531169cdb..5674f53f3 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -34,7 +34,7 @@ import numpy as np import pandas as pd from _helpers import configure_logging, set_scenario_config -from shapely.geometry import LineString, Polygon +from shapely.geometry import LineString, Polygon, MultiLineString from shapely.ops import linemerge logger = logging.getLogger(__name__) @@ -277,6 +277,34 @@ def _clean_frequency(column): return column.astype(str) +def _clean_rating(column): + """ + Function to clean and sum the rating columns: + + Args: + - column: pandas Series, the column to be cleaned + + Returns: + - column: pandas Series, the cleaned column + """ + logger.info("Cleaning ratings.") + column = column.copy() + column = ( + column.astype(str) + .str.replace("MW", "") + ) + + # Remove all remaining non-numeric characters except for semicolons + column = column.apply(lambda x: re.sub(r"[^0-9;]", "", x)) + + # Sum up all ratings if there are multiple entries + column = column.str.split(";").apply(lambda x: sum([int(i) for i in x])) + + + column.dropna(inplace=True) + return column.astype(str) + + def _split_cells(df, cols=["voltage"]): """ Split semicolon separated cells i.e. 
[66000;220000] and create new @@ -567,7 +595,6 @@ def _import_links(path_links): "frequency", "rating", "voltage", - "wires", ] df_links = pd.DataFrame(columns=columns) @@ -595,7 +622,6 @@ def _import_links(path_links): "frequency", "rating", "voltage", - "wires", ] tags = pd.json_normalize(df["tags"]).map( @@ -619,10 +645,66 @@ def _import_links(path_links): ) continue logger.info("---") + logger.info("Dropping lines without rating.") + len_before = len(df_links) + df_links = df_links.dropna(subset=["rating"]) + len_after = len(df_links) + logger.info(f"Dropped {len_before-len_after} elements without rating. " + + f"Imported {len_after} elements.") return df_links +def _create_single_link(row): + """ + Create a single link from multiple rows within a OSM link relation. + + Parameters: + - row: A row of OSM data containing information about the link. + + Returns: + - single_link: A single LineString representing the link. + + This function takes a row of OSM data and extracts the relevant information + to create a single link. It filters out elements (substations, electrodes) + with invalid roles and finds the longest link based on its endpoints. + If the longest link is a MultiLineString, it extracts the longest + linestring from it. The resulting single link is returned. 
+ """ + valid_roles = ["line", "cable"] + df = pd.json_normalize(row["members"]) + df = df[df["role"].isin(valid_roles)] + df.loc[:, "geometry"] = df.apply(_create_linestring, axis=1) + df.loc[:, "length"] = df["geometry"].apply(lambda x: x.length) + + list_endpoints = [] + for idx, row in df.iterrows(): + tuple = sorted([row["geometry"].coords[0], row["geometry"].coords[-1]]) + # round tuple to 3 decimals + tuple = ( + round(tuple[0][0], 2), + round(tuple[0][1], 2), + round(tuple[1][0], 2), + round(tuple[1][1], 2) + ) + list_endpoints.append(tuple) + + df.loc[:, "endpoints"] = list_endpoints + df_longest = df.loc[df.groupby("endpoints")["length"].idxmax()] + + single_link = linemerge(df_longest["geometry"].values.tolist()) + + # If the longest component is a MultiLineString, extract the longest linestring from it + if isinstance(single_link, MultiLineString): + # Find connected components + components = list(single_link.geoms) + + # Find the longest connected linestring + single_link = max(components, key=lambda x: x.length) + + return single_link + + def _drop_duplicate_lines(df_lines): """ Drop duplicate lines from the given dataframe. Duplicates are usually lines @@ -654,9 +736,14 @@ def _drop_duplicate_lines(df_lines): grouped_duplicates.set_index("id"), on="id", how="left" ) + len_before = len(df_lines) # Drop duplicates and update the df_lines dataframe with the cleaned data df_lines = df_lines[~df_lines["id"].isin(duplicate_rows["id"])] df_lines = pd.concat([df_lines, duplicate_rows], axis="rows") + len_after = len(df_lines) + + logger.info(f"Dropped {len_before - len_after} duplicate elements. " + + f"Keeping {len_after} elements." 
) return df_lines @@ -687,7 +774,11 @@ def _filter_by_voltage(df, min_voltage=200000): list_voltages = list_voltages.astype(str) bool_voltages = df["voltage"].apply(_check_voltage, list_voltages=list_voltages) + len_before = len(df) df = df[bool_voltages] + len_after = len(df) + logger.info(f"Dropped {len_before - len_after} elements with voltage below {min_voltage}. " + + f"Keeping {len_after} elements." ) return df, list_voltages @@ -1116,6 +1207,54 @@ def _finalise_lines(df_lines): return df_lines +def _finalise_links(df_links): + """ + Finalises the links column types. + + Args: + df_links (pandas.DataFrame): The input DataFrame containing links data. + + Returns: + df_links (pandas.DataFrame(): The DataFrame with finalised column types + and transformed data. + """ + logger.info("Finalising links column types.") + df_links = df_links.copy() + # Rename columns + df_links.rename( + columns={ + "id": "link_id", + "rating": "p_nom", + }, + inplace=True, + ) + + # Initiate new columns for subsequent build_osm_network step + df_links.loc[:, "bus0"] = None + df_links.loc[:, "bus1"] = None + df_links.loc[:, "length"] = None + + # Only include needed columns + df_links = df_links[ + [ + "link_id", + "voltage", + "p_nom", + "bus0", + "bus1", + "length", + "country", + "geometry", + ] + ] + + # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) + # This workaround is needed as otherwise the column dtypes remain "objects" + df_links["p_nom"] = df_links["p_nom"].astype(int) + + return df_links + + def _import_substations(path_substations): """ Import substations from the given input paths. 
This function imports both @@ -1393,6 +1532,7 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): output_substations_polygon = snakemake.output["substations_polygon"] output_substations = snakemake.output["substations"] output_lines = snakemake.output["lines"] + output_links = snakemake.output["links"] logger.info( f"Exporting clean substations with polygon shapes to {output_substations_polygon}" @@ -1412,16 +1552,18 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): ### CONTINUE HERE # Cleaning process df_links = _import_links(path_links) + df_links = _drop_duplicate_lines(df_links) df_links.loc[:, "voltage"] = _clean_voltage(df_links["voltage"]) df_links, list_voltages = _filter_by_voltage(df_links, min_voltage=min_voltage_dc) - - df_lines.loc[:, "circuits"] = _clean_circuits(df_lines["circuits"]) - df_lines.loc[:, "cables"] = _clean_cables(df_lines["cables"]) - df_lines.loc[:, "frequency"] = _clean_frequency(df_lines["frequency"]) - df_lines.loc[:, "wires"] = _clean_wires(df_lines["wires"]) - df_lines = _clean_lines(df_lines, list_voltages) - df_lines = _create_lines_geometry(df_lines) - df_lines = _finalise_lines(df_lines) + df_links.loc[:, "frequency"] = _clean_frequency(df_links["frequency"]) + df_links.loc[:, "rating"] = _clean_rating(df_links["rating"]) + df_links.loc[:, "geometry"] = df_links.apply(_create_single_link, axis=1) + df_links = _finalise_links(df_links) + gdf_links = gpd.GeoDataFrame(df_links, geometry="geometry", crs=crs) + + logger.info(f"Exporting clean links to {output_links}") + gdf_links.to_file(output_links, driver="GeoJSON") + logger.info("Cleaning OSM data completed.") From ff2b8390dc837d6cd3f12d109a209482a80ae963 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 16:35:14 +0000 Subject: [PATCH 032/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- 
scripts/clean_osm_data.py | 51 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 5674f53f3..2bd6454fe 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -34,7 +34,7 @@ import numpy as np import pandas as pd from _helpers import configure_logging, set_scenario_config -from shapely.geometry import LineString, Polygon, MultiLineString +from shapely.geometry import LineString, MultiLineString, Polygon from shapely.ops import linemerge logger = logging.getLogger(__name__) @@ -279,7 +279,7 @@ def _clean_frequency(column): def _clean_rating(column): """ - Function to clean and sum the rating columns: + Function to clean and sum the rating columns: Args: - column: pandas Series, the column to be cleaned @@ -289,17 +289,13 @@ def _clean_rating(column): """ logger.info("Cleaning ratings.") column = column.copy() - column = ( - column.astype(str) - .str.replace("MW", "") - ) + column = column.astype(str).str.replace("MW", "") # Remove all remaining non-numeric characters except for semicolons column = column.apply(lambda x: re.sub(r"[^0-9;]", "", x)) # Sum up all ratings if there are multiple entries column = column.str.split(";").apply(lambda x: sum([int(i) for i in x])) - column.dropna(inplace=True) return column.astype(str) @@ -649,8 +645,10 @@ def _import_links(path_links): len_before = len(df_links) df_links = df_links.dropna(subset=["rating"]) len_after = len(df_links) - logger.info(f"Dropped {len_before-len_after} elements without rating. " + - f"Imported {len_after} elements.") + logger.info( + f"Dropped {len_before-len_after} elements without rating. " + + f"Imported {len_after} elements." + ) return df_links @@ -665,13 +663,13 @@ def _create_single_link(row): Returns: - single_link: A single LineString representing the link. - This function takes a row of OSM data and extracts the relevant information - to create a single link. 
It filters out elements (substations, electrodes) - with invalid roles and finds the longest link based on its endpoints. - If the longest link is a MultiLineString, it extracts the longest + This function takes a row of OSM data and extracts the relevant information + to create a single link. It filters out elements (substations, electrodes) + with invalid roles and finds the longest link based on its endpoints. + If the longest link is a MultiLineString, it extracts the longest linestring from it. The resulting single link is returned. """ - valid_roles = ["line", "cable"] + valid_roles = ["line", "cable"] df = pd.json_normalize(row["members"]) df = df[df["role"].isin(valid_roles)] df.loc[:, "geometry"] = df.apply(_create_linestring, axis=1) @@ -682,16 +680,16 @@ def _create_single_link(row): tuple = sorted([row["geometry"].coords[0], row["geometry"].coords[-1]]) # round tuple to 3 decimals tuple = ( - round(tuple[0][0], 2), - round(tuple[0][1], 2), - round(tuple[1][0], 2), - round(tuple[1][1], 2) - ) + round(tuple[0][0], 2), + round(tuple[0][1], 2), + round(tuple[1][0], 2), + round(tuple[1][1], 2), + ) list_endpoints.append(tuple) df.loc[:, "endpoints"] = list_endpoints df_longest = df.loc[df.groupby("endpoints")["length"].idxmax()] - + single_link = linemerge(df_longest["geometry"].values.tolist()) # If the longest component is a MultiLineString, extract the longest linestring from it @@ -742,8 +740,10 @@ def _drop_duplicate_lines(df_lines): df_lines = pd.concat([df_lines, duplicate_rows], axis="rows") len_after = len(df_lines) - logger.info(f"Dropped {len_before - len_after} duplicate elements. " + - f"Keeping {len_after} elements." ) + logger.info( + f"Dropped {len_before - len_after} duplicate elements. " + + f"Keeping {len_after} elements." 
+ ) return df_lines @@ -777,8 +777,10 @@ def _filter_by_voltage(df, min_voltage=200000): len_before = len(df) df = df[bool_voltages] len_after = len(df) - logger.info(f"Dropped {len_before - len_after} elements with voltage below {min_voltage}. " + - f"Keeping {len_after} elements." ) + logger.info( + f"Dropped {len_before - len_after} elements with voltage below {min_voltage}. " + + f"Keeping {len_after} elements." + ) return df, list_voltages @@ -1564,6 +1566,5 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): logger.info(f"Exporting clean links to {output_links}") gdf_links.to_file(output_links, driver="GeoJSON") - logger.info("Cleaning OSM data completed.") From 24aa2e0a22bd1baeb7f9ac3622540db08d76f3d7 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Mon, 3 Jun 2024 17:21:48 +0200 Subject: [PATCH 033/100] New code for integrating HVDC links. Using relations. Base network implementation functioning. --- rules/build_electricity.smk | 7 +- scripts/base_network_osm.py | 90 +++----------- scripts/build_osm_network.py | 229 +++++++++++++++++++---------------- scripts/clean_osm_data.py | 164 ++++++++++++++++++++----- 4 files changed, 278 insertions(+), 212 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index b9161423c..324dc6410 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -117,11 +117,9 @@ if config["electricity_network"]["base_network"] == "osm": input: eg_buses=resources("osm/buses.csv"), eg_lines=resources("osm/lines.csv"), - # eg_links="data/entsoegridkit/links.csv", + eg_links=resources("osm/links.csv"), eg_converters=resources("osm/converters.csv"), eg_transformers=resources("osm/transformers.csv"), - links_p_nom="data/links_p_nom.csv", - links_tyndp="data/links_tyndp.csv", country_shapes=resources("country_shapes.geojson"), offshore_shapes=resources("offshore_shapes.geojson"), europe_shape=resources("europe_shape.geojson"), @@ -666,13 +664,16 @@ rule 
build_osm_network: input: substations=resources("osm/clean/substations.geojson"), lines=resources("osm/clean/lines.geojson"), + links=resources("osm/clean/links.geojson"), country_shapes=resources("country_shapes.geojson"), output: lines=resources("osm/lines.csv"), + links=resources("osm/links.csv"), converters=resources("osm/converters.csv"), transformers=resources("osm/transformers.csv"), substations=resources("osm/buses.csv"), lines_geojson=resources("osm/lines.geojson"), + links_geojson=resources("osm/links.geojson"), converters_geojson=resources("osm/converters.geojson"), transformers_geojson=resources("osm/transformers.geojson"), substations_geojson=resources("osm/buses.geojson"), diff --git a/scripts/base_network_osm.py b/scripts/base_network_osm.py index beec06f84..68d3232e5 100644 --- a/scripts/base_network_osm.py +++ b/scripts/base_network_osm.py @@ -215,15 +215,17 @@ def _load_links_from_eg(buses, eg_links): quotechar="'", true_values=["t"], false_values=["f"], - dtype=dict(link_id="str", bus0="str", bus1="str", under_construction="bool"), + dtype=dict( + link_id="str", + bus0="str", + bus1="str", + voltage="int", + p_nom="float", + ), ).set_index("link_id") links["length"] /= 1e3 - # Skagerrak Link is connected to 132kV bus which is removed in _load_buses_from_eg. 
- # Connect to neighboring 380kV bus - links.loc[links.bus1 == "6396", "bus1"] = "6398" - links = _remove_dangling_branches(links, buses) # Add DC line parameters @@ -448,54 +450,14 @@ def _set_lines_s_nom_from_linetypes(n): ) * n.lines.eval("v_nom * num_parallel") -def _set_electrical_parameters_dc_lines(lines_config, voltages, lines): - if lines.empty: - lines["type"] = [] - return lines - - linetypes = _get_linetypes_config(lines_config["dc_types"], voltages) - - lines["carrier"] = "DC" - lines["dc"] = True - lines.loc[:, "type"] = lines.v_nom.apply( - lambda x: _get_linetype_by_voltage(x, linetypes) - ) - - lines["s_max_pu"] = lines_config["s_max_pu"] - - return lines - - # TODO pypsa-eur: Clean/fix this, update list p_noms -def _set_electrical_parameters_links(links, config, links_p_nom): +def _set_electrical_parameters_links(links, config): if links.empty: return links p_max_pu = config["links"].get("p_max_pu", 1.0) links["p_max_pu"] = p_max_pu links["p_min_pu"] = -p_max_pu - - links_p_nom = pd.read_csv(links_p_nom) - - # filter links that are not in operation anymore - removed_b = links_p_nom.Remarks.str.contains("Shut down|Replaced", na=False) - links_p_nom = links_p_nom[~removed_b] - - # find closest link for all links in links_p_nom - links_p_nom["j"] = _find_closest_links(links, links_p_nom) - - links_p_nom = links_p_nom.groupby(["j"], as_index=False).agg({"Power (MW)": "sum"}) - - p_nom = links_p_nom.dropna(subset=["j"]).set_index("j")["Power (MW)"] - - # Don't update p_nom if it's already set - p_nom_unset = ( - p_nom.drop(links.index[links.p_nom.notnull()], errors="ignore") - if "p_nom" in links - else p_nom - ) - links.loc[p_nom_unset.index, "p_nom"] = p_nom_unset - links["carrier"] = "DC" links["dc"] = True @@ -786,7 +748,7 @@ def base_network_osm( eg_converters, eg_transformers, eg_lines, - links_p_nom, + eg_links, europe_shape, country_shapes, offshore_shapes, @@ -795,7 +757,7 @@ def base_network_osm( buses = _load_buses_from_eg(eg_buses, 
europe_shape, config["electricity"]) # TODO pypsa-eur add this - # links = _load_links_from_eg(buses, eg_links) + links = _load_links_from_eg(buses, eg_links) # if config["links"].get("include_tyndp"): # buses, links = _add_links_from_tyndp(buses, links, links_tyndp, europe_shape) @@ -807,20 +769,13 @@ def base_network_osm( if config["lines"].get("reconnect_crimea", True) and "UA" in config["countries"]: lines = _reconnect_crimea(lines) - lines_ac = lines[lines.tag_frequency.astype(float) != 0].copy() - lines_dc = lines[lines.tag_frequency.astype(float) == 0].copy() - - lines_ac = _set_electrical_parameters_lines( - config["lines"], config["electricity"]["voltages"], lines_ac + lines = _set_electrical_parameters_lines( + config["lines"], config["electricity"]["voltages"], lines ) - lines_dc = _set_electrical_parameters_dc_lines( - config["lines"], config["electricity"]["voltages"], lines_dc - ) + links = _set_electrical_parameters_links(links, config) - # lines = _set_electrical_parameters_lines(lines, config) transformers = _set_electrical_parameters_transformers(transformers, config) - # links = _set_electrical_parameters_links(links, config, links_p_nom) converters = _set_electrical_parameters_converters(converters, config) n = pypsa.Network() @@ -833,15 +788,7 @@ def base_network_osm( ) # TODO: fix hard code and check if AC/DC truly exist n.import_components_from_dataframe(buses, "Bus") - - lines_dc = _set_electrical_parameters_links(lines_dc, config, links_p_nom) - # parse line information into p_nom required for converters - lines_dc["p_nom"] = lines_dc.apply( - lambda x: x["v_nom"] * n.line_types.i_nom[x["type"]], - axis=1, - result_type="reduce", - ) - n.import_components_from_dataframe(lines_ac, "Line") + n.import_components_from_dataframe(lines, "Line") # The columns which names starts with "bus" are mixed up with the third-bus specification # when executing additional_linkports() # lines_dc.drop( @@ -856,17 +803,12 @@ def base_network_osm( # axis=1, # 
inplace=True, # ) - n.import_components_from_dataframe(lines_dc, "Link") - - # n.import_components_from_dataframe(lines, "Line") + n.import_components_from_dataframe(links, "Link") n.import_components_from_dataframe(transformers, "Transformer") - # n.import_components_from_dataframe(links, "Link") n.import_components_from_dataframe(converters, "Link") _set_lines_s_nom_from_linetypes(n) - # TODO pypsa-eur add this - # _apply_parameter_corrections(n, parameter_corrections) # TODO: what about this? n = _remove_unconnected_components(n) @@ -1085,7 +1027,7 @@ def append_bus_shapes(n, shapes, type): snakemake.input.eg_converters, snakemake.input.eg_transformers, snakemake.input.eg_lines, - snakemake.input.links_p_nom, + snakemake.input.eg_links, snakemake.input.europe_shape, snakemake.input.country_shapes, snakemake.input.offshore_shapes, diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 60576a34d..8c47cbc91 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -313,7 +313,7 @@ def merge_stations_same_station_id( # average location of the buses having the same station_id station_point_x = np.round(g_value.geometry.x.mean(), precision) station_point_y = np.round(g_value.geometry.y.mean(), precision) - is_dclink_boundary_point = any(g_value["is_dclink_boundary_point"]) + # is_dclink_boundary_point = any(g_value["is_dclink_boundary_point"]) # loop for every voltage level in the bus # The location of the buses is averaged; in the case of multiple voltage levels for the same station_id, @@ -337,7 +337,7 @@ def merge_stations_same_station_id( lon_bus, # "lon" lat_bus, # "lat" bus_row["country"].iloc[0], # "country", - is_dclink_boundary_point, # check if new bus was formed of at least one DC link boundary point + # is_dclink_boundary_point, # check if new bus was formed of at least one DC link boundary point Point( lon_bus, lat_bus, @@ -362,7 +362,7 @@ def merge_stations_same_station_id( "x", "y", "country", - 
"is_dclink_boundary_point", + # "is_dclink_boundary_point", "geometry", ] @@ -483,18 +483,17 @@ def get_converters(buses): ) # check if bus is a dclink boundary point, only then add converter - if g_value["is_dclink_boundary_point"].loc[id_0]: - df_converters.append( - [ - f"convert_{g_name}_{id_0}", # "line_id" - g_value["bus_id"].loc[id_0], # "bus0" - g_value["bus_id"].loc[id_1], # "bus1" - False, # "underground" - False, # "under_construction" - g_value.country.loc[id_0], # "country" - geom_conv, # "geometry" - ] - ) + df_converters.append( + [ + f"convert_{g_name}_{id_0}", # "line_id" + g_value["bus_id"].loc[id_0], # "bus0" + g_value["bus_id"].loc[id_1], # "bus1" + False, # "underground" + False, # "under_construction" + g_value.country.loc[id_0], # "country" + geom_conv, # "geometry" + ] + ) # name of the columns conv_columns = [ @@ -618,7 +617,7 @@ def set_lv_substations(buses): def merge_stations_lines_by_station_id_and_voltage( - lines, buses, distance_crs, tol=5000 + lines, links, buses, distance_crs, tol=5000 ): """ Function to merge close stations and adapt the line datasets to adhere to @@ -637,59 +636,59 @@ def merge_stations_lines_by_station_id_and_voltage( set_substations_ids(buses_ac, distance_crs, tol=tol) set_substations_ids(buses_dc, distance_crs, tol=tol) - # Find boundary points of DC links - # lines_dc_shape = lines[lines["dc"] == True].unary_union - # lines_dc_bounds = lines_dc_shape.boundary - # lines_dc_points = [p for p in lines_dc_bounds.geoms] - lines_dc = lines[lines["dc"] == True].reset_index() - lines_dc["adj_idx"] = range(0, len(lines_dc)) - - # Initialize an empty adjacency matrix - dc_adj_matrix = np.zeros((len(lines_dc), len(lines_dc)), dtype=int) - - # Fill the adjacency matrix - for i in range(len(lines_dc)): - for j in range(len(lines_dc)): - if are_lines_connected(lines_dc.iloc[i], lines_dc.iloc[j]): - dc_adj_matrix[i, j] = 1 - - dc_paths = find_paths(dc_adj_matrix) - - all_dc_boundary_points = pd.Series() - - for path in 
dc_paths: - bus_0_coors = lines_dc.iloc[path]["bus_0_coors"] - bus_1_coors = lines_dc.iloc[path]["bus_1_coors"] - - # Create DataFrame containing all points within a path - dc_points = pd.concat([bus_0_coors, bus_1_coors], ignore_index=True) - - # Determine the value counts of individual points. If it occurs more than - # once, it cannot be an end-point of a path - bool_duplicates = ( - dc_points.apply(lambda p: sum([are_almost_equal(p, s) for s in dc_points])) - > 1 - ) - - # Drop all duplicates - dc_boundary_points = dc_points[~bool_duplicates] - - if dc_boundary_points.empty: - all_dc_boundary_points = dc_boundary_points - else: - if all_dc_boundary_points.empty: - all_dc_boundary_points = dc_boundary_points - else: - all_dc_boundary_points = pd.concat( - [all_dc_boundary_points, dc_boundary_points], ignore_index=True - ) - - # TODO pypsa-eur: Add to pypsa-earth for all related entries on is_dclink_boundary_point - # check for each entry in buses_dc whether it is included in lines_dc_points - buses_ac["is_dclink_boundary_point"] = False - buses_dc["is_dclink_boundary_point"] = buses_dc.geometry.apply( - lambda p: any([p.within(l) for l in all_dc_boundary_points]) - ) + # # Find boundary points of DC links + # # lines_dc_shape = lines[lines["dc"] == True].unary_union + # # lines_dc_bounds = lines_dc_shape.boundary + # # lines_dc_points = [p for p in lines_dc_bounds.geoms] + # lines_dc = lines[lines["dc"] == True].reset_index() + # lines_dc["adj_idx"] = range(0, len(lines_dc)) + + # # Initialize an empty adjacency matrix + # dc_adj_matrix = np.zeros((len(lines_dc), len(lines_dc)), dtype=int) + + # # Fill the adjacency matrix + # for i in range(len(lines_dc)): + # for j in range(len(lines_dc)): + # if are_lines_connected(lines_dc.iloc[i], lines_dc.iloc[j]): + # dc_adj_matrix[i, j] = 1 + + # dc_paths = find_paths(dc_adj_matrix) + + # all_dc_boundary_points = pd.Series() + + # for path in dc_paths: + # bus_0_coors = lines_dc.iloc[path]["bus_0_coors"] + # bus_1_coors 
= lines_dc.iloc[path]["bus_1_coors"] + + # # Create DataFrame containing all points within a path + # dc_points = pd.concat([bus_0_coors, bus_1_coors], ignore_index=True) + + # # Determine the value counts of individual points. If it occurs more than + # # once, it cannot be an end-point of a path + # bool_duplicates = ( + # dc_points.apply(lambda p: sum([are_almost_equal(p, s) for s in dc_points])) + # > 1 + # ) + + # # Drop all duplicates + # dc_boundary_points = dc_points[~bool_duplicates] + + # if dc_boundary_points.empty: + # all_dc_boundary_points = dc_boundary_points + # else: + # if all_dc_boundary_points.empty: + # all_dc_boundary_points = dc_boundary_points + # else: + # all_dc_boundary_points = pd.concat( + # [all_dc_boundary_points, dc_boundary_points], ignore_index=True + # ) + + # # TODO pypsa-eur: Add to pypsa-earth for all related entries on is_dclink_boundary_point + # # check for each entry in buses_dc whether it is included in lines_dc_points + # buses_ac["is_dclink_boundary_point"] = False + # buses_dc["is_dclink_boundary_point"] = buses_dc.geometry.apply( + # lambda p: any([p.within(l) for l in all_dc_boundary_points]) + # ) logger.info(" - Merging substations with the same id") @@ -705,26 +704,25 @@ def merge_stations_lines_by_station_id_and_voltage( # set the bus ids to the line dataset lines, buses = set_lines_ids(lines, buses, distance_crs) + links, buses = set_lines_ids(links, buses, distance_crs) # drop lines starting and ending in the same node lines.drop(lines[lines["bus0"] == lines["bus1"]].index, inplace=True) + links.drop(links[links["bus0"] == links["bus1"]].index, inplace=True) # update line endings lines = line_endings_to_bus_conversion(lines) + links = line_endings_to_bus_conversion(links) # set substation_lv set_lv_substations(buses) - logger.info(" - Adding converters to lines") - - # append fake converters - # lines = pd.concat([lines, converters], ignore_index=True) - # reset index lines.reset_index(drop=True, inplace=True) + 
links.reset_index(drop=True, inplace=True) # if len(links) > 0: # links.reset_index(drop=True, inplace=True) - return lines, buses + return lines, links, buses def build_network( @@ -764,6 +762,17 @@ def build_network( "country": "object", "geometry": "object", }, + "link": { + "link_id": "object", + "bus0": "object", + "bus1": "object", + "voltage": "float", + "length": "float", + "under_construction": "bool", + "dc": "bool", + "country": "object", + "geometry": "object", + }, } logger.info("Reading input data.") @@ -779,19 +788,29 @@ def build_network( dtype=osm_clean_columns["line"], ) + links = read_geojson( + inputs["links"], + osm_clean_columns["link"].keys(), + dtype=osm_clean_columns["link"], + ) + lines = line_endings_to_bus_conversion(lines) + links = line_endings_to_bus_conversion(links) # METHOD to merge buses with same voltage and within tolerance tol = snakemake.config["electricity_network"]["osm_group_tolerance_buses"] logger.info(f"Aggregating close substations: Enabled with tolerance {tol} m") - lines, buses = merge_stations_lines_by_station_id_and_voltage( - lines, buses, distance_crs, tol=tol + + lines, links, buses = merge_stations_lines_by_station_id_and_voltage( + lines, links, buses, distance_crs, tol=tol ) # Recalculate lengths of lines utm = lines.estimate_utm_crs(datum_name="WGS 84") lines["length"] = lines.to_crs(utm).length + links["length"] = links.to_crs(utm).length + # TODO pypsa-eur: check if needed for updated links scripts # get transformers: modelled as lines connecting buses with different voltage transformers = get_transformers(buses, lines) @@ -810,12 +829,14 @@ def build_network( # Drop unncessary index column and set respective element ids as index lines.set_index("line_id", inplace=True) + links.set_index("link_id", inplace=True) converters.set_index("converter_id", inplace=True) transformers.set_index("transformer_id", inplace=True) buses.set_index("bus_id", inplace=True) # Convert voltages from V to kV lines["voltage"] = 
lines["voltage"] / 1000 + links["voltage"] = links["voltage"] / 1000 transformers["voltage_bus0"], transformers["voltage_bus1"] = ( transformers["voltage_bus0"] / 1000, transformers["voltage_bus1"] / 1000, @@ -824,66 +845,68 @@ def build_network( # Convert 'true' and 'false' to 't' and 'f' lines = lines.replace({True: "t", False: "f"}) + links = links.replace({True: "t", False: "f"}) converters = converters.replace({True: "t", False: "f"}) buses = buses.replace({True: "t", False: "f"}) # Change column orders - cols_lines = [ + cols_lines= [ "bus0", "bus1", "voltage", "circuits", + "tag_frequency", "length", "underground", "under_construction", "geometry", - "tag_type", - "tag_frequency", - "country", - "bounds", - "bus_0_coors", - "bus_1_coors", - "bus0_lon", - "bus0_lat", - "bus1_lon", - "bus1_lat", ] - cols_lines_csv = [ + lines = lines[cols_lines] + + cols_links = [ "bus0", "bus1", "voltage", - "circuits", - "tag_frequency", + "p_nom", "length", - "underground", "under_construction", "geometry", ] - lines_csv = lines[cols_lines_csv] - lines = lines[cols_lines] - to_csv_nafix(lines_csv, outputs["lines"], quotechar="'") # Generate CSV + links = links[cols_links] + + cols_transformers = [ + "bus0", + "bus1", + "voltage_bus0", + "voltage_bus1", + "country", + "geometry", + ] + + transformers = transformers[cols_transformers] + + to_csv_nafix(lines, outputs["lines"], quotechar="'") # Generate CSV + to_csv_nafix(links, outputs["links"], quotechar="'") # Generate CSV to_csv_nafix(converters, outputs["converters"], quotechar="'") # Generate CSV to_csv_nafix(transformers, outputs["transformers"], quotechar="'") # Generate CSV - colstodrop = ["bounds", "bus_0_coors", "bus_1_coors"] - # Export to GeoJSON for quick validations save_to_geojson( - gpd.GeoDataFrame( - lines.drop(columns=colstodrop), geometry="geometry", crs=geo_crs - ), + gpd.GeoDataFrame(lines), outputs["lines_geojson"], ) + save_to_geojson( + gpd.GeoDataFrame(links), + outputs["links_geojson"], + ) 
save_to_geojson( gpd.GeoDataFrame(converters, geometry="geometry", crs=geo_crs), outputs["converters_geojson"], ) save_to_geojson( - gpd.GeoDataFrame( - transformers.drop(columns=colstodrop), geometry="geometry", crs=geo_crs - ), + gpd.GeoDataFrame(transformers, geometry="geometry", crs=geo_crs), outputs["transformers_geojson"], ) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 5674f53f3..e80f0b709 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -78,6 +78,72 @@ def _create_polygon(row): return polygon +def _extended_linemerge(lines): + """ + Merges a list of LineStrings into a single LineString by finding the + closest pair of points between all pairs of LineStrings. + + Parameters: + lines (list): A list of LineStrings to be merged. + + Returns: + merged_line (LineString): The merged LineString. + + Raises: + TypeError: If the input is not a list of LineStrings. + + """ + # Ensure we have a list of LineStrings + if not isinstance(lines, list): + raise TypeError("Input should be a list of LineStrings") + if any(not isinstance(line, LineString) for line in lines): + raise TypeError("All elements in the list should be LineStrings") + + if len(lines) == 1: + return lines[0] + + merged_linestring = linemerge(lines) + + if isinstance(merged_linestring, LineString): + return merged_linestring + else: + def find_closest_points(line1, line2): + min_dist = np.inf + closest_points = (None, None) + for point1 in line1.coords: + for point2 in line2.coords: + dist = np.linalg.norm(np.array(point1) - np.array(point2)) + if dist < min_dist: + min_dist = dist + closest_points = (point1, point2) + return closest_points + + def merge_lines(lines): + while len(lines) > 1: + min_distance = np.inf + closest_pair = (None, None) + pair_indices = (None, None) + for i in range(len(lines)): + for j in range(i + 1, len(lines)): + point1, point2 = find_closest_points(lines[i], lines[j]) + distance = np.linalg.norm(np.array(point1) - 
np.array(point2)) + if distance < min_distance: + min_distance = distance + closest_pair = (point1, point2) + pair_indices = (i, j) + + connecting_line = LineString([closest_pair[0], closest_pair[1]]) + combined_line = linemerge(MultiLineString([lines[pair_indices[0]], lines[pair_indices[1]], connecting_line])) + + new_lines = [line for k, line in enumerate(lines) if k not in pair_indices] + new_lines.append(combined_line) + lines = new_lines + + return lines[0] + lines = list(merged_linestring.geoms) + return merge_lines(lines) + + def _clean_voltage(column): """ Function to clean the raw voltage column: manual fixing and drop nan values @@ -391,6 +457,7 @@ def _add_line_endings_to_substations( gdf_lines, path_country_shapes, path_offshore_shapes, + prefix, ): """ Add line endings to substations. @@ -440,18 +507,18 @@ def _add_line_endings_to_substations( bus_all = bus_all.groupby(["voltage", "lon", "lat", "dc"]).first().reset_index() bus_all = bus_all[df_substations.columns] bus_all.loc[:, "bus_id"] = bus_all.apply( - lambda row: f"line-end/{row.name + 1}", axis=1 + lambda row: f"{prefix}/{row.name + 1}", axis=1 ) # Initialize default values - bus_all["station_id"] = np.nan + bus_all["station_id"] = None # Assuming substations completed for installed lines bus_all["under_construction"] = False bus_all["tag_area"] = None bus_all["symbol"] = "substation" # TODO: this tag may be improved, maybe depending on voltage levels bus_all["tag_substation"] = "transmission" - bus_all["tag_source"] = "line-end" + bus_all["tag_source"] = prefix buses = pd.concat([df_substations, bus_all], ignore_index=True) buses.set_index("bus_id", inplace=True) @@ -492,7 +559,7 @@ def _add_line_endings_to_substations( bool_multiple_countries, "index_right" ] - return buses + return buses.reset_index() def _import_lines_and_cables(path_lines): @@ -615,6 +682,11 @@ def _import_links(path_links): df = pd.DataFrame(data["elements"]) df["id"] = df["id"].astype(str) + df["id"] = df["id"].apply( + 
lambda x: ( + f"relation/{x}" + ) + ) df["country"] = country col_tags = [ @@ -682,18 +754,18 @@ def _create_single_link(row): tuple = sorted([row["geometry"].coords[0], row["geometry"].coords[-1]]) # round tuple to 3 decimals tuple = ( - round(tuple[0][0], 2), - round(tuple[0][1], 2), - round(tuple[1][0], 2), - round(tuple[1][1], 2) + round(tuple[0][0], 3), + round(tuple[0][1], 3), + round(tuple[1][0], 3), + round(tuple[1][1], 3) ) list_endpoints.append(tuple) df.loc[:, "endpoints"] = list_endpoints - df_longest = df.loc[df.groupby("endpoints")["length"].idxmax()] + df_longest = df.loc[df.groupby("endpoints")["length"].idxmin()] single_link = linemerge(df_longest["geometry"].values.tolist()) - + # If the longest component is a MultiLineString, extract the longest linestring from it if isinstance(single_link, MultiLineString): # Find connected components @@ -1233,6 +1305,8 @@ def _finalise_links(df_links): df_links.loc[:, "bus0"] = None df_links.loc[:, "bus1"] = None df_links.loc[:, "length"] = None + df_links.loc[:, "under_construction"] = False + df_links.loc[:, "dc"] = True # Only include needed columns df_links = df_links[ @@ -1243,6 +1317,8 @@ def _finalise_links(df_links): "bus0", "bus1", "length", + "under_construction", + "dc", "country", "geometry", ] @@ -1251,6 +1327,9 @@ def _finalise_links(df_links): # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) # This workaround is needed as otherwise the column dtypes remain "objects" df_links["p_nom"] = df_links["p_nom"].astype(int) + # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) + # This workaround is needed as otherwise the column dtypes remain "objects" + df_links["voltage"] = df_links["voltage"].astype(int) return df_links @@ -1497,6 +1576,13 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): df_lines.loc[:, "frequency"] = _clean_frequency(df_lines["frequency"]) df_lines.loc[:, "wires"] = _clean_wires(df_lines["wires"]) df_lines = 
_clean_lines(df_lines, list_voltages) + + # Drop DC lines, will be added through relations later + len_before = len(df_lines) + df_lines = df_lines[df_lines["frequency"] == "50"] + len_after = len(df_lines) + logger.info(f"Dropped {len_before - len_after} DC lines. Keeping {len_after} AC lines.") + df_lines = _create_lines_geometry(df_lines) df_lines = _finalise_lines(df_lines) @@ -1510,14 +1596,49 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): gdf_lines = gpd.GeoDataFrame(df_lines, geometry="geometry", crs=crs) gdf_lines = _remove_lines_within_substations(gdf_lines, gdf_substations_polygon) + logger.info("---") + logger.info("HVDC LINKS") + path_links = { + "links": snakemake.input.links_relation, + } + + ### CONTINUE HERE + # Cleaning process + df_links = _import_links(path_links) + + df_links = _drop_duplicate_lines(df_links) + df_links.loc[:, "voltage"] = _clean_voltage(df_links["voltage"]) + df_links, list_voltages = _filter_by_voltage(df_links, min_voltage=min_voltage_dc) + # Keep only highest voltage of split string + df_links.loc[:, "voltage"] = df_links["voltage"].apply( + lambda x: str(max(map(int, x.split(";")))) + ) + df_links.loc[:, "frequency"] = _clean_frequency(df_links["frequency"]) + df_links.loc[:, "rating"] = _clean_rating(df_links["rating"]) + + df_links.loc[:, "geometry"] = df_links.apply(_create_single_link, axis=1) + df_links = _finalise_links(df_links) + gdf_links = gpd.GeoDataFrame(df_links, geometry="geometry", crs=crs) + + # Add line endings to substations path_country_shapes = snakemake.input.country_shapes path_offshore_shapes = snakemake.input.offshore_shapes + df_substations = _add_line_endings_to_substations( df_substations, gdf_lines, path_country_shapes, path_offshore_shapes, + prefix="line-end", + ) + + df_substations = _add_line_endings_to_substations( + df_substations, + gdf_links, + path_country_shapes, + path_offshore_shapes, + prefix="link-end", ) # Drop polygons and create GDF @@ -1542,28 
+1663,7 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): gdf_substations.to_file(output_substations, driver="GeoJSON") logger.info(f"Exporting clean lines to {output_lines}") gdf_lines.to_file(output_lines, driver="GeoJSON") - - logger.info("---") - logger.info("HVDC LINKS") - path_links = { - "links": snakemake.input.links_relation, - } - - ### CONTINUE HERE - # Cleaning process - df_links = _import_links(path_links) - - df_links = _drop_duplicate_lines(df_links) - df_links.loc[:, "voltage"] = _clean_voltage(df_links["voltage"]) - df_links, list_voltages = _filter_by_voltage(df_links, min_voltage=min_voltage_dc) - df_links.loc[:, "frequency"] = _clean_frequency(df_links["frequency"]) - df_links.loc[:, "rating"] = _clean_rating(df_links["rating"]) - df_links.loc[:, "geometry"] = df_links.apply(_create_single_link, axis=1) - df_links = _finalise_links(df_links) - gdf_links = gpd.GeoDataFrame(df_links, geometry="geometry", crs=crs) - logger.info(f"Exporting clean links to {output_links}") gdf_links.to_file(output_links, driver="GeoJSON") - - logger.info("Cleaning OSM data completed.") + logger.info("Cleaning OSM data completed.") \ No newline at end of file From f9e3eec81990d2fc0d7fd4df4c5aa11c9ad84b3c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 3 Jun 2024 15:24:00 +0000 Subject: [PATCH 034/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/base_network_osm.py | 7 ++-- scripts/build_osm_network.py | 4 +-- scripts/clean_osm_data.py | 64 ++++++++++++++++++++---------------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/scripts/base_network_osm.py b/scripts/base_network_osm.py index 68d3232e5..6e78f9454 100644 --- a/scripts/base_network_osm.py +++ b/scripts/base_network_osm.py @@ -216,12 +216,12 @@ def _load_links_from_eg(buses, eg_links): true_values=["t"], false_values=["f"], 
dtype=dict( - link_id="str", - bus0="str", + link_id="str", + bus0="str", bus1="str", voltage="int", p_nom="float", - ), + ), ).set_index("link_id") links["length"] /= 1e3 @@ -809,7 +809,6 @@ def base_network_osm( _set_lines_s_nom_from_linetypes(n) - # TODO: what about this? n = _remove_unconnected_components(n) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 8c47cbc91..66ba5cd24 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -682,7 +682,7 @@ def merge_stations_lines_by_station_id_and_voltage( # all_dc_boundary_points = pd.concat( # [all_dc_boundary_points, dc_boundary_points], ignore_index=True # ) - + # # TODO pypsa-eur: Add to pypsa-earth for all related entries on is_dclink_boundary_point # # check for each entry in buses_dc whether it is included in lines_dc_points # buses_ac["is_dclink_boundary_point"] = False @@ -850,7 +850,7 @@ def build_network( buses = buses.replace({True: "t", False: "f"}) # Change column orders - cols_lines= [ + cols_lines = [ "bus0", "bus1", "voltage", diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index a8cd57ace..14e8005c1 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -80,7 +80,7 @@ def _create_polygon(row): def _extended_linemerge(lines): """ - Merges a list of LineStrings into a single LineString by finding the + Merges a list of LineStrings into a single LineString by finding the closest pair of points between all pairs of LineStrings. Parameters: @@ -91,22 +91,22 @@ def _extended_linemerge(lines): Raises: TypeError: If the input is not a list of LineStrings. 
- """ # Ensure we have a list of LineStrings if not isinstance(lines, list): raise TypeError("Input should be a list of LineStrings") if any(not isinstance(line, LineString) for line in lines): raise TypeError("All elements in the list should be LineStrings") - + if len(lines) == 1: return lines[0] - + merged_linestring = linemerge(lines) if isinstance(merged_linestring, LineString): return merged_linestring else: + def find_closest_points(line1, line2): min_dist = np.inf closest_points = (None, None) @@ -117,7 +117,7 @@ def find_closest_points(line1, line2): min_dist = dist closest_points = (point1, point2) return closest_points - + def merge_lines(lines): while len(lines) > 1: min_distance = np.inf @@ -131,15 +131,26 @@ def merge_lines(lines): min_distance = distance closest_pair = (point1, point2) pair_indices = (i, j) - + connecting_line = LineString([closest_pair[0], closest_pair[1]]) - combined_line = linemerge(MultiLineString([lines[pair_indices[0]], lines[pair_indices[1]], connecting_line])) - - new_lines = [line for k, line in enumerate(lines) if k not in pair_indices] + combined_line = linemerge( + MultiLineString( + [ + lines[pair_indices[0]], + lines[pair_indices[1]], + connecting_line, + ] + ) + ) + + new_lines = [ + line for k, line in enumerate(lines) if k not in pair_indices + ] new_lines.append(combined_line) lines = new_lines - + return lines[0] + lines = list(merged_linestring.geoms) return merge_lines(lines) @@ -678,11 +689,7 @@ def _import_links(path_links): df = pd.DataFrame(data["elements"]) df["id"] = df["id"].astype(str) - df["id"] = df["id"].apply( - lambda x: ( - f"relation/{x}" - ) - ) + df["id"] = df["id"].apply(lambda x: (f"relation/{x}")) df["country"] = country col_tags = [ @@ -752,18 +759,18 @@ def _create_single_link(row): tuple = sorted([row["geometry"].coords[0], row["geometry"].coords[-1]]) # round tuple to 3 decimals tuple = ( - round(tuple[0][0], 3), - round(tuple[0][1], 3), - round(tuple[1][0], 3), - round(tuple[1][1], 3) - ) 
+ round(tuple[0][0], 3), + round(tuple[0][1], 3), + round(tuple[1][0], 3), + round(tuple[1][1], 3), + ) list_endpoints.append(tuple) df.loc[:, "endpoints"] = list_endpoints df_longest = df.loc[df.groupby("endpoints")["length"].idxmin()] - + single_link = linemerge(df_longest["geometry"].values.tolist()) - + # If the longest component is a MultiLineString, extract the longest linestring from it if isinstance(single_link, MultiLineString): # Find connected components @@ -1329,7 +1336,7 @@ def _finalise_links(df_links): # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) # This workaround is needed as otherwise the column dtypes remain "objects" df_links["p_nom"] = df_links["p_nom"].astype(int) - # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) + # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) # This workaround is needed as otherwise the column dtypes remain "objects" df_links["voltage"] = df_links["voltage"].astype(int) @@ -1583,7 +1590,9 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): len_before = len(df_lines) df_lines = df_lines[df_lines["frequency"] == "50"] len_after = len(df_lines) - logger.info(f"Dropped {len_before - len_after} DC lines. Keeping {len_after} AC lines.") + logger.info( + f"Dropped {len_before - len_after} DC lines. Keeping {len_after} AC lines." 
+ ) df_lines = _create_lines_geometry(df_lines) df_lines = _finalise_lines(df_lines) @@ -1622,11 +1631,10 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): df_links = _finalise_links(df_links) gdf_links = gpd.GeoDataFrame(df_links, geometry="geometry", crs=crs) - # Add line endings to substations path_country_shapes = snakemake.input.country_shapes path_offshore_shapes = snakemake.input.offshore_shapes - + df_substations = _add_line_endings_to_substations( df_substations, gdf_lines, @@ -1667,5 +1675,5 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): gdf_lines.to_file(output_lines, driver="GeoJSON") logger.info(f"Exporting clean links to {output_links}") gdf_links.to_file(output_links, driver="GeoJSON") - - logger.info("Cleaning OSM data completed.") \ No newline at end of file + + logger.info("Cleaning OSM data completed.") From aeb80a26495e690d8d8485a7f28d2e570f465ded Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 4 Jun 2024 16:06:46 +0200 Subject: [PATCH 035/100] removed manual line dropping. --- Snakefile | 1 + scripts/clean_osm_data.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 412d520d7..318deb980 100644 --- a/Snakefile +++ b/Snakefile @@ -137,4 +137,5 @@ rule sync: rsync -uvarh --no-g {params.cluster}/resources . || echo "No resources directory, skipping rsync" rsync -uvarh --no-g {params.cluster}/results . || echo "No results directory, skipping rsync" rsync -uvarh --no-g {params.cluster}/logs . || echo "No logs directory, skipping rsync" + rsync -uvarh --no-g {params.cluster}/data/osm . 
|| echo "No data directory, skipping rsync" """ diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 14e8005c1..b38b3c1a4 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -1540,7 +1540,8 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): # TODO pypsa-eur: Temporary solution as one AC line between converters will # create an error in simplify_network: - lines_to_drop = ["775580659"] + # lines_to_drop = ["775580659"] + lines_to_drop = [""] logger.info("---") logger.info("SUBSTATIONS") From 637d28c755988aec814e19ddb82096dbfe238b18 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Thu, 6 Jun 2024 10:27:17 +0200 Subject: [PATCH 036/100] Updated clean script --- rules/build_electricity.smk | 2 +- scripts/build_osm_network.py | 54 ------------------------------------ scripts/clean_osm_data.py | 31 +++++++++++++-------- 3 files changed, 21 insertions(+), 66 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 324dc6410..99f39423a 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -317,7 +317,7 @@ rule build_renewable_profiles: benchmarks("build_renewable_profiles_{technology}") threads: config["atlite"].get("nprocesses", 4) resources: - mem_mb=config["atlite"].get("nprocesses", 4) * 5000, + mem_mb=config["atlite"].get("nprocesses", 4) * 10000, wildcard_constraints: technology="(?!hydro).*", # Any technology other than hydro conda: diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 66ba5cd24..0372692f9 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -636,60 +636,6 @@ def merge_stations_lines_by_station_id_and_voltage( set_substations_ids(buses_ac, distance_crs, tol=tol) set_substations_ids(buses_dc, distance_crs, tol=tol) - # # Find boundary points of DC links - # # lines_dc_shape = lines[lines["dc"] == True].unary_union - # # lines_dc_bounds = lines_dc_shape.boundary - # # 
lines_dc_points = [p for p in lines_dc_bounds.geoms] - # lines_dc = lines[lines["dc"] == True].reset_index() - # lines_dc["adj_idx"] = range(0, len(lines_dc)) - - # # Initialize an empty adjacency matrix - # dc_adj_matrix = np.zeros((len(lines_dc), len(lines_dc)), dtype=int) - - # # Fill the adjacency matrix - # for i in range(len(lines_dc)): - # for j in range(len(lines_dc)): - # if are_lines_connected(lines_dc.iloc[i], lines_dc.iloc[j]): - # dc_adj_matrix[i, j] = 1 - - # dc_paths = find_paths(dc_adj_matrix) - - # all_dc_boundary_points = pd.Series() - - # for path in dc_paths: - # bus_0_coors = lines_dc.iloc[path]["bus_0_coors"] - # bus_1_coors = lines_dc.iloc[path]["bus_1_coors"] - - # # Create DataFrame containing all points within a path - # dc_points = pd.concat([bus_0_coors, bus_1_coors], ignore_index=True) - - # # Determine the value counts of individual points. If it occurs more than - # # once, it cannot be an end-point of a path - # bool_duplicates = ( - # dc_points.apply(lambda p: sum([are_almost_equal(p, s) for s in dc_points])) - # > 1 - # ) - - # # Drop all duplicates - # dc_boundary_points = dc_points[~bool_duplicates] - - # if dc_boundary_points.empty: - # all_dc_boundary_points = dc_boundary_points - # else: - # if all_dc_boundary_points.empty: - # all_dc_boundary_points = dc_boundary_points - # else: - # all_dc_boundary_points = pd.concat( - # [all_dc_boundary_points, dc_boundary_points], ignore_index=True - # ) - - # # TODO pypsa-eur: Add to pypsa-earth for all related entries on is_dclink_boundary_point - # # check for each entry in buses_dc whether it is included in lines_dc_points - # buses_ac["is_dclink_boundary_point"] = False - # buses_dc["is_dclink_boundary_point"] = buses_dc.geometry.apply( - # lambda p: any([p.within(l) for l in all_dc_boundary_points]) - # ) - logger.info(" - Merging substations with the same id") # merge buses with same station id and voltage diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 
b38b3c1a4..ce1639667 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -1279,8 +1279,6 @@ def _finalise_lines(df_lines): ] ] - # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) - # This workaround is needed as otherwise the column dtypes remain "objects" df_lines["circuits"] = df_lines["circuits"].astype(int) df_lines["voltage"] = df_lines["voltage"].astype(int) df_lines["tag_frequency"] = df_lines["tag_frequency"].astype(int) @@ -1333,11 +1331,7 @@ def _finalise_links(df_links): ] ] - # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) - # This workaround is needed as otherwise the column dtypes remain "objects" df_links["p_nom"] = df_links["p_nom"].astype(int) - # Set lines data types df.apply(pd.to_numeric, args=('coerce',)) - # This workaround is needed as otherwise the column dtypes remain "objects" df_links["voltage"] = df_links["voltage"].astype(int) return df_links @@ -1524,6 +1518,11 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): return gdf_lines +# Define a function to check if a polygon intersects any line in the lines GeoDataFrame +def intersects_any_line(polygon, lines): + return lines.intersects(polygon).any() + + if __name__ == "__main__": if "snakemake" not in globals(): from _helpers import mock_snakemake @@ -1538,9 +1537,6 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): min_voltage_ac = 200000 # [unit: V] Minimum voltage value to filter AC lines. min_voltage_dc = 150000 # [unit: V] Minimum voltage value to filter DC links. 
- # TODO pypsa-eur: Temporary solution as one AC line between converters will - # create an error in simplify_network: - # lines_to_drop = ["775580659"] lines_to_drop = [""] logger.info("---") @@ -1614,8 +1610,6 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): "links": snakemake.input.links_relation, } - ### CONTINUE HERE - # Cleaning process df_links = _import_links(path_links) df_links = _drop_duplicate_lines(df_links) @@ -1652,6 +1646,21 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): prefix="link-end", ) + # # Drop df_substations.dc == True and tag_source != "link-end" + # df_substations = df_substations[ + # ~((df_substations.dc == True) & (df_substations.tag_source != "link-end")) + # ] + + # # Apply the function to each polygon in the substations GeoDataFrame + # gdf_substations_polygon["connected"] = False + # gdf_substations_polygon['connected'] = gdf_substations_polygon['polygon'].apply(intersects_any_line, lines=gdf_lines) + + # list_buses_disconnected = gdf_substations_polygon[gdf_substations_polygon['connected'] == False]['bus_id'].tolist() + + # # Drop islanded substations + # gdf_substations_polygon = gdf_substations_polygon[~gdf_substations_polygon['bus_id'].isin(list_buses_disconnected)] + # df_substations = df_substations[~df_substations['bus_id'].isin(list_buses_disconnected)] + # Drop polygons and create GDF gdf_substations = gpd.GeoDataFrame( df_substations.drop(columns=["polygon"]), geometry="geometry", crs=crs From f5e51711bad66f6c79ca6fefe8c3aac4dfee5f8e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 08:30:05 +0000 Subject: [PATCH 037/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/clean_osm_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 
ce1639667..d64bcec97 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -1656,7 +1656,7 @@ def intersects_any_line(polygon, lines): # gdf_substations_polygon['connected'] = gdf_substations_polygon['polygon'].apply(intersects_any_line, lines=gdf_lines) # list_buses_disconnected = gdf_substations_polygon[gdf_substations_polygon['connected'] == False]['bus_id'].tolist() - + # # Drop islanded substations # gdf_substations_polygon = gdf_substations_polygon[~gdf_substations_polygon['bus_id'].isin(list_buses_disconnected)] # df_substations = df_substations[~df_substations['bus_id'].isin(list_buses_disconnected)] From 3bebcc0204eb784720dd73f0b0990a082b71dfd6 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Thu, 6 Jun 2024 10:43:58 +0200 Subject: [PATCH 038/100] reverted Snakefile to default: sync settings --- Snakefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Snakefile b/Snakefile index 318deb980..726b8d2ba 100644 --- a/Snakefile +++ b/Snakefile @@ -136,6 +136,4 @@ rule sync: rsync -uvarh --ignore-missing-args --files-from=.sync-send . {params.cluster} rsync -uvarh --no-g {params.cluster}/resources . || echo "No resources directory, skipping rsync" rsync -uvarh --no-g {params.cluster}/results . || echo "No results directory, skipping rsync" - rsync -uvarh --no-g {params.cluster}/logs . || echo "No logs directory, skipping rsync" - rsync -uvarh --no-g {params.cluster}/data/osm . || echo "No data directory, skipping rsync" """ From a2ee16f82490eb838d0b98a62072b7ecd0a3b212 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Thu, 6 Jun 2024 18:23:52 +0200 Subject: [PATCH 039/100] added prebuilt functionality. 
--- rules/build_electricity.smk | 10 +++++----- scripts/base_network_osm.py | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 99f39423a..80b694323 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -115,11 +115,11 @@ if config["electricity_network"]["base_network"] == "osm": links=config_provider("links"), transformers=config_provider("transformers"), input: - eg_buses=resources("osm/buses.csv"), - eg_lines=resources("osm/lines.csv"), - eg_links=resources("osm/links.csv"), - eg_converters=resources("osm/converters.csv"), - eg_transformers=resources("osm/transformers.csv"), + eg_buses="data/osm/prebuilt/buses.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/buses.csv"), + eg_lines="data/osm/prebuilt/lines.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/lines.csv"), + eg_links="data/osm/prebuilt/links.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/links.csv"), + eg_converters="data/osm/prebuilt/converters.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/converters.csv"), + eg_transformers="data/osm/prebuilt/transformers.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/transformers.csv"), country_shapes=resources("country_shapes.geojson"), offshore_shapes=resources("offshore_shapes.geojson"), europe_shape=resources("europe_shape.geojson"), diff --git a/scripts/base_network_osm.py b/scripts/base_network_osm.py index 6e78f9454..f9b0daf57 100644 --- a/scripts/base_network_osm.py +++ b/scripts/base_network_osm.py @@ -145,11 +145,13 @@ def _load_buses_from_eg(eg_buses, europe_shape, config_elec): dtype=dict(bus_id="str"), ) .set_index("bus_id") - .drop(["station_id"], axis=1) .rename(columns=dict(voltage="v_nom")) ) - buses["carrier"] = buses.pop("dc").map({True: "DC", False: 
"AC"}) + if "station_id" in buses.columns: + buses.drop("station_id", axis=1, inplace=True) + + # buses["carrier"] = buses.pop("dc").map({True: "DC", False: "AC"}) buses["under_construction"] = buses.under_construction.where( lambda s: s.notnull(), False ).astype(bool) From 10b51465c62c5b01331230f0040e9023945128f5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:24:21 +0000 Subject: [PATCH 040/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- rules/build_electricity.smk | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 80b694323..fe3194141 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -115,11 +115,31 @@ if config["electricity_network"]["base_network"] == "osm": links=config_provider("links"), transformers=config_provider("transformers"), input: - eg_buses="data/osm/prebuilt/buses.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/buses.csv"), - eg_lines="data/osm/prebuilt/lines.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/lines.csv"), - eg_links="data/osm/prebuilt/links.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/links.csv"), - eg_converters="data/osm/prebuilt/converters.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/converters.csv"), - eg_transformers="data/osm/prebuilt/transformers.csv" if config["electricity_network"]["osm_use_prebuilt"] == True else resources("osm/transformers.csv"), + eg_buses=( + "data/osm/prebuilt/buses.csv" + if config["electricity_network"]["osm_use_prebuilt"] == True + else resources("osm/buses.csv") + ), + eg_lines=( + "data/osm/prebuilt/lines.csv" + if config["electricity_network"]["osm_use_prebuilt"] 
== True + else resources("osm/lines.csv") + ), + eg_links=( + "data/osm/prebuilt/links.csv" + if config["electricity_network"]["osm_use_prebuilt"] == True + else resources("osm/links.csv") + ), + eg_converters=( + "data/osm/prebuilt/converters.csv" + if config["electricity_network"]["osm_use_prebuilt"] == True + else resources("osm/converters.csv") + ), + eg_transformers=( + "data/osm/prebuilt/transformers.csv" + if config["electricity_network"]["osm_use_prebuilt"] == True + else resources("osm/transformers.csv") + ), country_shapes=resources("country_shapes.geojson"), offshore_shapes=resources("offshore_shapes.geojson"), europe_shape=resources("europe_shape.geojson"), From 38de271d05da52a377d8aa73c4b6417e1dcca063 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Fri, 7 Jun 2024 15:29:07 +0200 Subject: [PATCH 041/100] Updated build_electricity.smk to work with scenario management. --- rules/build_electricity.smk | 86 +++++++++++++++++++++++++++--------- scripts/retrieve_osm_data.py | 6 ++- 2 files changed, 70 insertions(+), 22 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index fe3194141..7849031a8 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -104,7 +104,7 @@ if config["electricity_network"]["base_network"] == "gridkit": "../scripts/base_network.py" -if config["electricity_network"]["base_network"] == "osm": +if config["electricity_network"]["base_network"] == "osm": rule base_network: params: @@ -638,7 +638,7 @@ rule retrieve_osm_data: substations_way="data/osm/raw/{country}/substations_way.json", substations_relation="data/osm/raw/{country}/substations_relation.json", log: - logs("retrieve_osm_data_{country}.log"), + "logs/retrieve_osm_data_{country}.log", resources: cores=2, threads=1, @@ -646,27 +646,37 @@ rule retrieve_osm_data: "../scripts/retrieve_osm_data.py" +rule retrieve_osm_data_all: + input: + expand("data/osm/raw/{country}/cables_way.json", country=config_provider("countries")), + 
expand("data/osm/raw/{country}/lines_way.json", country=config_provider("countries")), + expand("data/osm/raw/{country}/links_relation.json", country=config_provider("countries")), + expand("data/osm/raw/{country}/substations_way.json", country=config_provider("countries")), + expand("data/osm/raw/{country}/substations_relation.json", country=config_provider("countries")), + + rule clean_osm_data: input: - cables_way=[ - f"data/osm/raw/{country}/cables_way.json" - for country in config["countries"] - ], - lines_way=[ - f"data/osm/raw/{country}/lines_way.json" for country in config["countries"] - ], - links_relation=[ - f"data/osm/raw/{country}/links_relation.json" - for country in config["countries"] - ], - substations_way=[ - f"data/osm/raw/{country}/substations_way.json" - for country in config["countries"] - ], - substations_relation=[ - f"data/osm/raw/{country}/substations_relation.json" - for country in config["countries"] - ], + cables_way=expand( + "data/osm/raw/{country}/cables_way.json", + country = config_provider("countries") + ), + lines_way=expand( + "data/osm/raw/{country}/lines_way.json", + country = config_provider("countries") + ), + links_relation=expand( + "data/osm/raw/{country}/links_relation.json", + country = config_provider("countries") + ), + substations_way=expand( + "data/osm/raw/{country}/substations_way.json", + country = config_provider("countries") + ), + substations_relation=expand( + "data/osm/raw/{country}/substations_relation.json", + country = config_provider("countries") + ), offshore_shapes=resources("offshore_shapes.geojson"), country_shapes=resources("country_shapes.geojson"), output: @@ -680,6 +690,40 @@ rule clean_osm_data: "../scripts/clean_osm_data.py" +# rule clean_osm_data: +# input: +# cables_way=[ +# f"data/osm/raw/{country}/cables_way.json" +# for country in config["countries"] +# ], +# lines_way=[ +# f"data/osm/raw/{country}/lines_way.json" for country in config["countries"] +# ], +# links_relation=[ +# 
f"data/osm/raw/{country}/links_relation.json" +# for country in config["countries"] +# ], +# substations_way=[ +# f"data/osm/raw/{country}/substations_way.json" +# for country in config["countries"] +# ], +# substations_relation=[ +# f"data/osm/raw/{country}/substations_relation.json" +# for country in config["countries"] +# ], +# offshore_shapes=resources("offshore_shapes.geojson"), +# country_shapes=resources("country_shapes.geojson"), +# output: +# substations=resources("osm/clean/substations.geojson"), +# substations_polygon=resources("osm/clean/substations_polygon.geojson"), +# lines=resources("osm/clean/lines.geojson"), +# links=resources("osm/clean/links.geojson"), +# log: +# logs("clean_osm_data.log"), +# script: +# "../scripts/clean_osm_data.py" + + rule build_osm_network: input: substations=resources("osm/clean/substations.geojson"), diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index bad99df3a..899337f89 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -17,7 +17,11 @@ import time import requests -from _helpers import configure_logging +from _helpers import ( + configure_logging, + # set_scenario_config, + # update_config_from_wildcards, +) logger = logging.getLogger(__name__) From bb55ad70df4d4a503404c736b7ebd55467ee6642 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Fri, 7 Jun 2024 15:31:46 +0200 Subject: [PATCH 042/100] removed commented-out code. 
--- rules/build_electricity.smk | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 7849031a8..185f5692a 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -690,40 +690,6 @@ rule clean_osm_data: "../scripts/clean_osm_data.py" -# rule clean_osm_data: -# input: -# cables_way=[ -# f"data/osm/raw/{country}/cables_way.json" -# for country in config["countries"] -# ], -# lines_way=[ -# f"data/osm/raw/{country}/lines_way.json" for country in config["countries"] -# ], -# links_relation=[ -# f"data/osm/raw/{country}/links_relation.json" -# for country in config["countries"] -# ], -# substations_way=[ -# f"data/osm/raw/{country}/substations_way.json" -# for country in config["countries"] -# ], -# substations_relation=[ -# f"data/osm/raw/{country}/substations_relation.json" -# for country in config["countries"] -# ], -# offshore_shapes=resources("offshore_shapes.geojson"), -# country_shapes=resources("country_shapes.geojson"), -# output: -# substations=resources("osm/clean/substations.geojson"), -# substations_polygon=resources("osm/clean/substations_polygon.geojson"), -# lines=resources("osm/clean/lines.geojson"), -# links=resources("osm/clean/links.geojson"), -# log: -# logs("clean_osm_data.log"), -# script: -# "../scripts/clean_osm_data.py" - - rule build_osm_network: input: substations=resources("osm/clean/substations.geojson"), From a629dbad4e0748adf791b10c93cf58db8af51d76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 7 Jun 2024 13:33:35 +0000 Subject: [PATCH 043/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- rules/build_electricity.smk | 37 +++++++++++++++++++++++++----------- scripts/retrieve_osm_data.py | 4 +--- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/rules/build_electricity.smk 
b/rules/build_electricity.smk index 185f5692a..5b3b432f6 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -104,7 +104,7 @@ if config["electricity_network"]["base_network"] == "gridkit": "../scripts/base_network.py" -if config["electricity_network"]["base_network"] == "osm": +if config["electricity_network"]["base_network"] == "osm": rule base_network: params: @@ -648,34 +648,49 @@ rule retrieve_osm_data: rule retrieve_osm_data_all: input: - expand("data/osm/raw/{country}/cables_way.json", country=config_provider("countries")), - expand("data/osm/raw/{country}/lines_way.json", country=config_provider("countries")), - expand("data/osm/raw/{country}/links_relation.json", country=config_provider("countries")), - expand("data/osm/raw/{country}/substations_way.json", country=config_provider("countries")), - expand("data/osm/raw/{country}/substations_relation.json", country=config_provider("countries")), + expand( + "data/osm/raw/{country}/cables_way.json", + country=config_provider("countries"), + ), + expand( + "data/osm/raw/{country}/lines_way.json", + country=config_provider("countries"), + ), + expand( + "data/osm/raw/{country}/links_relation.json", + country=config_provider("countries"), + ), + expand( + "data/osm/raw/{country}/substations_way.json", + country=config_provider("countries"), + ), + expand( + "data/osm/raw/{country}/substations_relation.json", + country=config_provider("countries"), + ), rule clean_osm_data: input: cables_way=expand( "data/osm/raw/{country}/cables_way.json", - country = config_provider("countries") + country=config_provider("countries"), ), lines_way=expand( "data/osm/raw/{country}/lines_way.json", - country = config_provider("countries") + country=config_provider("countries"), ), links_relation=expand( "data/osm/raw/{country}/links_relation.json", - country = config_provider("countries") + country=config_provider("countries"), ), substations_way=expand( "data/osm/raw/{country}/substations_way.json", - 
country = config_provider("countries") + country=config_provider("countries"), ), substations_relation=expand( "data/osm/raw/{country}/substations_relation.json", - country = config_provider("countries") + country=config_provider("countries"), ), offshore_shapes=resources("offshore_shapes.geojson"), country_shapes=resources("country_shapes.geojson"), diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 899337f89..67fc810ef 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -17,10 +17,8 @@ import time import requests -from _helpers import ( +from _helpers import ( # set_scenario_config,; update_config_from_wildcards, configure_logging, - # set_scenario_config, - # update_config_from_wildcards, ) logger = logging.getLogger(__name__) From 3e2f7d3d2278bed44b6237b2c0756e2a94b66e85 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Mon, 10 Jun 2024 13:36:29 +0200 Subject: [PATCH 044/100] removed commented-out code. --- config/config.default.yaml | 4 +- rules/build_electricity.smk | 157 +++-- scripts/base_network.py | 195 ++++++- scripts/base_network_osm.py | 1058 ---------------------------------- scripts/retrieve_osm_data.py | 4 +- 5 files changed, 244 insertions(+), 1174 deletions(-) delete mode 100644 scripts/base_network_osm.py diff --git a/config/config.default.yaml b/config/config.default.yaml index d7ea86dab..7de26d9df 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -77,8 +77,8 @@ enable: # Settings related to the high-voltage electricity grid electricity_network: - base_network: "osm" # "osm" or "gridkit" - osm_group_tolerance_buses: 5000 # [m] (default 5000) Tolerance in meters of the close buses to merge + base_network: "gridkit" # "gridkit", "osm-prebuilt" (prebuilt network from OSM data), "osm-raw" (retrieve and build network from raw OSM data, takes longer) + osm_group_tolerance_buses: 5000 # only relevant for "osm-raw" setting: [m] (default 5000) Tolerance in meters of the 
close buses to merge # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget co2_budget: diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 185f5692a..c1c2dc0b9 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -65,99 +65,80 @@ rule build_powerplants: "../scripts/build_powerplants.py" -if config["electricity_network"]["base_network"] == "gridkit": - - rule base_network: - params: - countries=config_provider("countries"), - snapshots=config_provider("snapshots"), - drop_leap_day=config_provider("enable", "drop_leap_day"), - lines=config_provider("lines"), - links=config_provider("links"), - transformers=config_provider("transformers"), - input: - eg_buses="data/entsoegridkit/buses.csv", - eg_lines="data/entsoegridkit/lines.csv", - eg_links="data/entsoegridkit/links.csv", - eg_converters="data/entsoegridkit/converters.csv", - eg_transformers="data/entsoegridkit/transformers.csv", - parameter_corrections="data/parameter_corrections.yaml", - links_p_nom="data/links_p_nom.csv", - links_tyndp="data/links_tyndp.csv", - country_shapes=resources("country_shapes.geojson"), - offshore_shapes=resources("offshore_shapes.geojson"), - europe_shape=resources("europe_shape.geojson"), - output: - base_network=resources("networks/base.nc"), - regions_onshore=resources("regions_onshore.geojson"), - regions_offshore=resources("regions_offshore.geojson"), - log: - logs("base_network.log"), - benchmark: - benchmarks("base_network") - threads: 1 - resources: - mem_mb=1500, - conda: - "../envs/environment.yaml" - script: - "../scripts/base_network.py" - - -if config["electricity_network"]["base_network"] == "osm": - - rule base_network: - params: - countries=config_provider("countries"), - snapshots=config_provider("snapshots"), - drop_leap_day=config_provider("enable", "drop_leap_day"), - lines=config_provider("lines"), - links=config_provider("links"), - transformers=config_provider("transformers"), - input: 
- eg_buses=( - "data/osm/prebuilt/buses.csv" - if config["electricity_network"]["osm_use_prebuilt"] == True +rule base_network: + params: + countries=config_provider("countries"), + snapshots=config_provider("snapshots"), + drop_leap_day=config_provider("enable", "drop_leap_day"), + lines=config_provider("lines"), + links=config_provider("links"), + transformers=config_provider("transformers"), + input: + eg_buses=lambda w: ( + "data/entsoegridkit/buses.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + else ( + "data/osm/prebuilt/buses.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" else resources("osm/buses.csv") - ), - eg_lines=( - "data/osm/prebuilt/lines.csv" - if config["electricity_network"]["osm_use_prebuilt"] == True + ) + ), + eg_lines=lambda w: ( + "data/entsoegridkit/lines.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + else ( + "data/osm/prebuilt/lines.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" else resources("osm/lines.csv") - ), - eg_links=( - "data/osm/prebuilt/links.csv" - if config["electricity_network"]["osm_use_prebuilt"] == True + ) + ), + eg_links=lambda w: ( + "data/entsoegridkit/links.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + else ( + "data/osm/prebuilt/links.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" else resources("osm/links.csv") - ), - eg_converters=( - "data/osm/prebuilt/converters.csv" - if config["electricity_network"]["osm_use_prebuilt"] == True + ) + ), + eg_converters=lambda w: ( + "data/entsoegridkit/converters.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + else ( + "data/osm/prebuilt/converters.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" else resources("osm/converters.csv") - ), - eg_transformers=( - "data/osm/prebuilt/transformers.csv" - if 
config["electricity_network"]["osm_use_prebuilt"] == True + ) + ), + eg_transformers=lambda w: ( + "data/entsoegridkit/transformers.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + else ( + "data/osm/prebuilt/transformers.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" else resources("osm/transformers.csv") - ), - country_shapes=resources("country_shapes.geojson"), - offshore_shapes=resources("offshore_shapes.geojson"), - europe_shape=resources("europe_shape.geojson"), - output: - base_network=resources("networks/base.nc"), - regions_onshore=resources("regions_onshore.geojson"), - regions_offshore=resources("regions_offshore.geojson"), - log: - logs("base_network.log"), - benchmark: - benchmarks("base_network") - threads: 1 - resources: - mem_mb=1500, - conda: - "../envs/environment.yaml" - script: - "../scripts/base_network_osm.py" + ) + ), + parameter_corrections=lambda w: ( + "data/parameter_corrections.yaml" if config_provider("electricity_network", "base_network")(w) == "gridkit" + else [] + ), + links_p_nom=lambda w: ( + "data/links_p_nom.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + else [] + ), + links_tyndp=lambda w: ( + "data/links_tyndp.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + else [] + ), + country_shapes=resources("country_shapes.geojson"), + offshore_shapes=resources("offshore_shapes.geojson"), + europe_shape=resources("europe_shape.geojson"), + output: + base_network=resources("networks/base.nc"), + regions_onshore=resources("regions_onshore.geojson"), + regions_offshore=resources("regions_offshore.geojson"), + log: + logs("base_network.log"), + benchmark: + benchmarks("base_network") + threads: 1 + resources: + mem_mb=1500, + conda: + "../envs/environment.yaml" + script: + "../scripts/base_network.py" rule build_shapes: diff --git a/scripts/base_network.py b/scripts/base_network.py index df3bc2b2c..963234a50 
100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -5,7 +5,10 @@ # coding: utf-8 """ -Creates the network topology from an `ENTSO-E map extract `_ (March 2022) as a PyPSA network. +Creates the network topology from a `ENTSO-E map extract. + +`_ (March 2022) as a PyPSA +network. Relevant Settings ----------------- @@ -142,29 +145,39 @@ def _load_buses_from_eg(eg_buses, europe_shape, config_elec): dtype=dict(bus_id="str"), ) .set_index("bus_id") - .drop(["station_id"], axis=1) .rename(columns=dict(voltage="v_nom")) ) - buses["carrier"] = buses.pop("dc").map({True: "DC", False: "AC"}) + if "station_id" in buses.columns: + buses.drop("station_id", axis=1, inplace=True) + + # buses["carrier"] = buses.pop("dc").map({True: "DC", False: "AC"}) buses["under_construction"] = buses.under_construction.where( lambda s: s.notnull(), False ).astype(bool) # remove all buses outside of all countries including exclusive economic zones (offshore) europe_shape = gpd.read_file(europe_shape).loc[0, "geometry"] + # TODO pypsa-eur: Temporary fix: Convex hull, this is important when nodes are between countries + # europe_shape = europe_shape.convex_hull + europe_shape_prepped = shapely.prepared.prep(europe_shape) buses_in_europe_b = buses[["x", "y"]].apply( lambda p: europe_shape_prepped.contains(Point(p)), axis=1 ) - buses_with_v_nom_to_keep_b = ( - buses.v_nom.isin(config_elec["voltages"]) | buses.v_nom.isnull() - ) - logger.info( - f'Removing buses with voltages {pd.Index(buses.v_nom.unique()).dropna().difference(config_elec["voltages"])}' - ) + # TODO pypsa-eur: Find a long-term solution + # buses_with_v_nom_to_keep_b = ( + # buses.v_nom.isin(config_elec["voltages"]) | buses.v_nom.isnull() + # ) + v_nom_min = min(config_elec["voltages"]) + v_nom_max = max(config_elec["voltages"]) + + # Quick fix: + buses_with_v_nom_to_keep_b = (v_nom_min <= buses.v_nom) & (buses.v_nom <= v_nom_max) + + logger.info(f"Removing buses outside of range {v_nom_min} - {v_nom_max} V") return 
pd.DataFrame(buses.loc[buses_in_europe_b & buses_with_v_nom_to_keep_b]) @@ -221,6 +234,31 @@ def _load_links_from_eg(buses, eg_links): return links +def _load_links_from_osm(buses, eg_links): + links = pd.read_csv( + eg_links, + quotechar="'", + true_values=["t"], + false_values=["f"], + dtype=dict( + link_id="str", + bus0="str", + bus1="str", + voltage="int", + p_nom="float", + ), + ).set_index("link_id") + + links["length"] /= 1e3 + + links = _remove_dangling_branches(links, buses) + + # Add DC line parameters + links["carrier"] = "DC" + + return links + + def _add_links_from_tyndp(buses, links, links_tyndp, europe_shape): links_tyndp = pd.read_csv(links_tyndp) @@ -347,7 +385,8 @@ def _load_lines_from_eg(buses, eg_lines): ) lines["length"] /= 1e3 - lines["carrier"] = "AC" + + lines["carrier"] = "AC" #TODO pypsa-eur check lines = _remove_dangling_branches(lines, buses) return lines @@ -397,7 +436,7 @@ def _reconnect_crimea(lines): return pd.concat([lines, lines_to_crimea]) -def _set_electrical_parameters_lines(lines, config): +def _set_electrical_parameters_lines_eg(lines, config): v_noms = config["electricity"]["voltages"] linetypes = config["lines"]["types"] @@ -409,16 +448,35 @@ def _set_electrical_parameters_lines(lines, config): return lines +def _set_electrical_parameters_lines_osm(lines_config, voltages, lines): + if lines.empty: + lines["type"] = [] + return lines + + linetypes = _get_linetypes_config(lines_config["types"], voltages) + + lines["carrier"] = "AC" + lines["dc"] = False + + lines.loc[:, "type"] = lines.v_nom.apply( + lambda x: _get_linetype_by_voltage(x, linetypes) + ) + + lines["s_max_pu"] = lines_config["s_max_pu"] + + return lines + + def _set_lines_s_nom_from_linetypes(n): n.lines["s_nom"] = ( np.sqrt(3) * n.lines["type"].map(n.line_types.i_nom) * n.lines["v_nom"] - * n.lines.num_parallel + * n.lines["num_parallel"] ) -def _set_electrical_parameters_links(links, config, links_p_nom): +def _set_electrical_parameters_links_eg(links, config, 
links_p_nom): if links.empty: return links @@ -450,6 +508,19 @@ def _set_electrical_parameters_links(links, config, links_p_nom): return links +def _set_electrical_parameters_links_osm(links, config): + if links.empty: + return links + + p_max_pu = config["links"].get("p_max_pu", 1.0) + links["p_max_pu"] = p_max_pu + links["p_min_pu"] = -p_max_pu + links["carrier"] = "DC" + links["dc"] = True + + return links + + def _set_electrical_parameters_converters(converters, config): p_max_pu = config["links"].get("p_max_pu", 1.0) converters["p_max_pu"] = p_max_pu @@ -570,7 +641,7 @@ def prefer_voltage(x, which): buses["substation_lv"] = ( lv_b & onshore_b & (~buses["under_construction"]) & has_connections_b ) - buses["substation_off"] = ((hv_b & offshore_b) | (hv_b & onshore_b)) & ( + buses["substation_off"] = (offshore_b | (hv_b & onshore_b)) & ( ~buses["under_construction"] ) @@ -737,31 +808,55 @@ def base_network( parameter_corrections, config, ): + buses = _load_buses_from_eg(eg_buses, europe_shape, config["electricity"]) - links = _load_links_from_eg(buses, eg_links) - if config["links"].get("include_tyndp"): + if config["electricity_network"].get("base_network") == "gridkit": + links = _load_links_from_eg(buses, eg_links) + elif "osm" in config["electricity_network"].get("base_network"): + links = _load_links_from_osm(buses, eg_links) + else: + raise ValueError("base_network must be either 'gridkit' or 'osm'") + + if (config["links"].get("include_tyndp") & (config["electricity_network"].get("base_network") == "gridkit")): buses, links = _add_links_from_tyndp(buses, links, links_tyndp, europe_shape) converters = _load_converters_from_eg(buses, eg_converters) + transformers = _load_transformers_from_eg(buses, eg_transformers) lines = _load_lines_from_eg(buses, eg_lines) - transformers = _load_transformers_from_eg(buses, eg_transformers) if config["lines"].get("reconnect_crimea", True) and "UA" in config["countries"]: lines = _reconnect_crimea(lines) - lines = 
_set_electrical_parameters_lines(lines, config) + if config["electricity_network"].get("base_network") == "gridkit": + lines = _set_electrical_parameters_lines_eg(lines, config) + links = _set_electrical_parameters_links_eg(links, config, links_p_nom) + elif "osm" in config["electricity_network"].get("base_network"): + lines = _set_electrical_parameters_lines_osm( + config["lines"], config["electricity"]["voltages"], lines + ) + links = _set_electrical_parameters_links_osm(links, config) + else: + raise ValueError("base_network must be either 'gridkit' or 'osm'") + transformers = _set_electrical_parameters_transformers(transformers, config) - links = _set_electrical_parameters_links(links, config, links_p_nom) converters = _set_electrical_parameters_converters(converters, config) n = pypsa.Network() - n.name = "PyPSA-Eur" + + if config["electricity_network"].get("base_network") == "gridkit": + n.name = "PyPSA-Eur (GridKit)" + elif "osm" in config["electricity_network"].get("base_network"): + n.name = "PyPSA-Eur (OSM)" + else: + raise ValueError("base_network must be either 'gridkit' or 'osm'") time = get_snapshots(snakemake.params.snapshots, snakemake.params.drop_leap_day) n.set_snapshots(time) - n.madd("Carrier", ["AC", "DC"]) + n.madd( + "Carrier", ["AC", "DC"] + ) # TODO: fix hard code and check if AC/DC truly exist n.import_components_from_dataframe(buses, "Bus") n.import_components_from_dataframe(lines, "Line") @@ -770,13 +865,15 @@ def base_network( n.import_components_from_dataframe(converters, "Link") _set_lines_s_nom_from_linetypes(n) + if config["electricity_network"].get("base_network") == "gridkit": + _apply_parameter_corrections(n, parameter_corrections) - _apply_parameter_corrections(n, parameter_corrections) - + # TODO: what about this? 
n = _remove_unconnected_components(n) _set_countries_and_substations(n, config, country_shapes, offshore_shapes) + # TODO pypsa-eur add this _set_links_underwater_fraction(n, offshore_shapes) _replace_b2b_converter_at_country_border_by_link(n) @@ -785,9 +882,59 @@ def base_network( _set_shapes(n, country_shapes, offshore_shapes) + logger.info(f"Base network created using {config['electricity_network'].get('base_network')}.") + return n +def _get_linetypes_config(line_types, voltages): + """ + Return the dictionary of linetypes for selected voltages. The dictionary is + a subset of the dictionary line_types, whose keys match the selected + voltages. + + Parameters + ---------- + line_types : dict + Dictionary of linetypes: keys are nominal voltages and values are linetypes. + voltages : list + List of selected voltages. + + Returns + ------- + Dictionary of linetypes for selected voltages. + """ + # get voltages value that are not availabile in the line types + vnoms_diff = set(voltages).symmetric_difference(set(line_types.keys())) + if vnoms_diff: + logger.warning( + f"Voltages {vnoms_diff} not in the {line_types} or {voltages} list." + ) + return {k: v for k, v in line_types.items() if k in voltages} + + +def _get_linetype_by_voltage(v_nom, d_linetypes): + """ + Return the linetype of a specific line based on its voltage v_nom. + + Parameters + ---------- + v_nom : float + The voltage of the line. + d_linetypes : dict + Dictionary of linetypes: keys are nominal voltages and values are linetypes. + + Returns + ------- + The linetype of the line whose nominal voltage is closest to the line voltage. 
+ """ + v_nom_min, line_type_min = min( + d_linetypes.items(), + key=lambda x: abs(x[0] - v_nom), + ) + return line_type_min + + def voronoi_partition_pts(points, outline): """ Compute the polygons of a voronoi partition of `points` within the polygon @@ -968,4 +1115,4 @@ def append_bus_shapes(n, shapes, type): offshore_shapes.to_frame().to_file(snakemake.output.regions_offshore) n.meta = snakemake.config - n.export_to_netcdf(snakemake.output.base_network) + n.export_to_netcdf(snakemake.output.base_network) \ No newline at end of file diff --git a/scripts/base_network_osm.py b/scripts/base_network_osm.py deleted file mode 100644 index f9b0daf57..000000000 --- a/scripts/base_network_osm.py +++ /dev/null @@ -1,1058 +0,0 @@ -# -*- coding: utf-8 -*- -# SPDX-FileCopyrightText: : 2017-2024 The PyPSA-Eur Authors -# -# SPDX-License-Identifier: MIT - -# coding: utf-8 -""" -Creates the network topology from a `ENTSO-E map extract. - -`_ (March 2022) as a PyPSA -network. - -Relevant Settings ------------------ - -.. code:: yaml - - countries: - - electricity: - voltages: - - lines: - types: - s_max_pu: - under_construction: - - links: - p_max_pu: - under_construction: - include_tyndp: - - transformers: - x: - s_nom: - type: - -.. seealso:: - Documentation of the configuration file ``config/config.yaml`` at - :ref:`snapshots_cf`, :ref:`toplevel_cf`, :ref:`electricity_cf`, :ref:`load_cf`, - :ref:`lines_cf`, :ref:`links_cf`, :ref:`transformers_cf` - -Inputs ------- - -- ``data/entsoegridkit``: Extract from the geographical vector data of the online `ENTSO-E Interactive Map `_ by the `GridKit `_ toolkit dating back to March 2022. -- ``data/parameter_corrections.yaml``: Corrections for ``data/entsoegridkit`` -- ``data/links_p_nom.csv``: confer :ref:`links` -- ``data/links_tyndp.csv``: List of projects in the `TYNDP 2018 `_ that are at least *in permitting* with fields for start- and endpoint (names and coordinates), length, capacity, construction status, and project reference ID. 
-- ``resources/country_shapes.geojson``: confer :ref:`shapes` -- ``resources/offshore_shapes.geojson``: confer :ref:`shapes` -- ``resources/europe_shape.geojson``: confer :ref:`shapes` - -Outputs -------- - -- ``networks/base.nc`` - - .. image:: img/base.png - :scale: 33 % - -- ``resources/regions_onshore.geojson``: - - .. image:: img/regions_onshore.png - :scale: 33 % - -- ``resources/regions_offshore.geojson``: - - .. image:: img/regions_offshore.png - :scale: 33 % - -Description ------------ -Creates the network topology from an ENTSO-E map extract, and create Voronoi shapes for each bus representing both onshore and offshore regions. -""" - -import logging -from itertools import product - -import geopandas as gpd -import networkx as nx -import numpy as np -import pandas as pd -import pypsa -import shapely -import shapely.prepared -import shapely.wkt -import yaml -from _helpers import REGION_COLS, configure_logging, get_snapshots, set_scenario_config -from packaging.version import Version, parse -from scipy import spatial -from scipy.sparse import csgraph -from shapely.geometry import LineString, Point, Polygon - -PD_GE_2_2 = parse(pd.__version__) >= Version("2.2") - -logger = logging.getLogger(__name__) - - -def _get_oid(df): - if "tags" in df.columns: - return df.tags.str.extract('"oid"=>"(\d+)"', expand=False) - else: - return pd.Series(np.nan, df.index) - - -def _get_country(df): - if "tags" in df.columns: - return df.tags.str.extract('"country"=>"([A-Z]{2})"', expand=False) - else: - return pd.Series(np.nan, df.index) - - -def _find_closest_links(links, new_links, distance_upper_bound=1.5): - treecoords = np.asarray( - [ - np.asarray(shapely.wkt.loads(s).coords)[[0, -1]].flatten() - for s in links.geometry - ] - ) - querycoords = np.vstack( - [new_links[["x1", "y1", "x2", "y2"]], new_links[["x2", "y2", "x1", "y1"]]] - ) - tree = spatial.KDTree(treecoords) - dist, ind = tree.query(querycoords, distance_upper_bound=distance_upper_bound) - found_b = ind < 
len(links) - found_i = np.arange(len(new_links) * 2)[found_b] % len(new_links) - return ( - pd.DataFrame( - dict(D=dist[found_b], i=links.index[ind[found_b] % len(links)]), - index=new_links.index[found_i], - ) - .sort_values(by="D")[lambda ds: ~ds.index.duplicated(keep="first")] - .sort_index()["i"] - ) - - -def _load_buses_from_eg(eg_buses, europe_shape, config_elec): - buses = ( - pd.read_csv( - eg_buses, - quotechar="'", - true_values=["t"], - false_values=["f"], - dtype=dict(bus_id="str"), - ) - .set_index("bus_id") - .rename(columns=dict(voltage="v_nom")) - ) - - if "station_id" in buses.columns: - buses.drop("station_id", axis=1, inplace=True) - - # buses["carrier"] = buses.pop("dc").map({True: "DC", False: "AC"}) - buses["under_construction"] = buses.under_construction.where( - lambda s: s.notnull(), False - ).astype(bool) - - # remove all buses outside of all countries including exclusive economic zones (offshore) - europe_shape = gpd.read_file(europe_shape).loc[0, "geometry"] - # TODO pypsa-eur: Temporary fix: Convex hull, this is important when nodes are between countries - # europe_shape = europe_shape.convex_hull - - europe_shape_prepped = shapely.prepared.prep(europe_shape) - buses_in_europe_b = buses[["x", "y"]].apply( - lambda p: europe_shape_prepped.contains(Point(p)), axis=1 - ) - - # TODO pypsa-eur: Find a long-term solution - # buses_with_v_nom_to_keep_b = ( - # buses.v_nom.isin(config_elec["voltages"]) | buses.v_nom.isnull() - # ) - - v_nom_min = min(config_elec["voltages"]) - v_nom_max = max(config_elec["voltages"]) - - # Quick fix: - buses_with_v_nom_to_keep_b = (v_nom_min <= buses.v_nom) & (buses.v_nom <= v_nom_max) - - logger.info(f"Removing buses outside of range {v_nom_min} - {v_nom_max} V") - return pd.DataFrame(buses.loc[buses_in_europe_b & buses_with_v_nom_to_keep_b]) - - -def _load_transformers_from_eg(buses, eg_transformers): - transformers = pd.read_csv( - eg_transformers, - quotechar="'", - true_values=["t"], - false_values=["f"], 
- dtype=dict(transformer_id="str", bus0="str", bus1="str"), - ).set_index("transformer_id") - - transformers = _remove_dangling_branches(transformers, buses) - - return transformers - - -def _load_converters_from_eg(buses, eg_converters): - converters = pd.read_csv( - eg_converters, - quotechar="'", - true_values=["t"], - false_values=["f"], - dtype=dict(converter_id="str", bus0="str", bus1="str"), - ).set_index("converter_id") - - converters = _remove_dangling_branches(converters, buses) - - converters["carrier"] = "B2B" - - return converters - - -def _load_links_from_eg(buses, eg_links): - links = pd.read_csv( - eg_links, - quotechar="'", - true_values=["t"], - false_values=["f"], - dtype=dict( - link_id="str", - bus0="str", - bus1="str", - voltage="int", - p_nom="float", - ), - ).set_index("link_id") - - links["length"] /= 1e3 - - links = _remove_dangling_branches(links, buses) - - # Add DC line parameters - links["carrier"] = "DC" - - return links - - -def _add_links_from_tyndp(buses, links, links_tyndp, europe_shape): - links_tyndp = pd.read_csv(links_tyndp) - - # remove all links from list which lie outside all of the desired countries - europe_shape = gpd.read_file(europe_shape).loc[0, "geometry"] - europe_shape_prepped = shapely.prepared.prep(europe_shape) - x1y1_in_europe_b = links_tyndp[["x1", "y1"]].apply( - lambda p: europe_shape_prepped.contains(Point(p)), axis=1 - ) - x2y2_in_europe_b = links_tyndp[["x2", "y2"]].apply( - lambda p: europe_shape_prepped.contains(Point(p)), axis=1 - ) - is_within_covered_countries_b = x1y1_in_europe_b & x2y2_in_europe_b - - if not is_within_covered_countries_b.all(): - logger.info( - "TYNDP links outside of the covered area (skipping): " - + ", ".join(links_tyndp.loc[~is_within_covered_countries_b, "Name"]) - ) - - links_tyndp = links_tyndp.loc[is_within_covered_countries_b] - if links_tyndp.empty: - return buses, links - - has_replaces_b = links_tyndp.replaces.notnull() - oids = dict(Bus=_get_oid(buses), 
Link=_get_oid(links)) - keep_b = dict( - Bus=pd.Series(True, index=buses.index), Link=pd.Series(True, index=links.index) - ) - for reps in links_tyndp.loc[has_replaces_b, "replaces"]: - for comps in reps.split(":"): - oids_to_remove = comps.split(".") - c = oids_to_remove.pop(0) - keep_b[c] &= ~oids[c].isin(oids_to_remove) - buses = buses.loc[keep_b["Bus"]] - links = links.loc[keep_b["Link"]] - - links_tyndp["j"] = _find_closest_links( - links, links_tyndp, distance_upper_bound=0.20 - ) - # Corresponds approximately to 20km tolerances - - if links_tyndp["j"].notnull().any(): - logger.info( - "TYNDP links already in the dataset (skipping): " - + ", ".join(links_tyndp.loc[links_tyndp["j"].notnull(), "Name"]) - ) - links_tyndp = links_tyndp.loc[links_tyndp["j"].isnull()] - if links_tyndp.empty: - return buses, links - - tree = spatial.KDTree(buses[["x", "y"]]) - _, ind0 = tree.query(links_tyndp[["x1", "y1"]]) - ind0_b = ind0 < len(buses) - links_tyndp.loc[ind0_b, "bus0"] = buses.index[ind0[ind0_b]] - - _, ind1 = tree.query(links_tyndp[["x2", "y2"]]) - ind1_b = ind1 < len(buses) - links_tyndp.loc[ind1_b, "bus1"] = buses.index[ind1[ind1_b]] - - links_tyndp_located_b = ( - links_tyndp["bus0"].notnull() & links_tyndp["bus1"].notnull() - ) - if not links_tyndp_located_b.all(): - logger.warning( - "Did not find connected buses for TYNDP links (skipping): " - + ", ".join(links_tyndp.loc[~links_tyndp_located_b, "Name"]) - ) - links_tyndp = links_tyndp.loc[links_tyndp_located_b] - - logger.info("Adding the following TYNDP links: " + ", ".join(links_tyndp["Name"])) - - links_tyndp = links_tyndp[["bus0", "bus1"]].assign( - carrier="DC", - p_nom=links_tyndp["Power (MW)"], - length=links_tyndp["Length (given) (km)"].fillna( - links_tyndp["Length (distance*1.2) (km)"] - ), - under_construction=True, - underground=False, - geometry=( - links_tyndp[["x1", "y1", "x2", "y2"]].apply( - lambda s: str(LineString([[s.x1, s.y1], [s.x2, s.y2]])), axis=1 - ) - ), - tags=( - '"name"=>"' - + 
links_tyndp["Name"] - + '", ' - + '"ref"=>"' - + links_tyndp["Ref"] - + '", ' - + '"status"=>"' - + links_tyndp["status"] - + '"' - ), - ) - - links_tyndp.index = "T" + links_tyndp.index.astype(str) - - links = pd.concat([links, links_tyndp], sort=True) - - return buses, links - - -def _load_lines_from_eg(buses, eg_lines): - lines = ( - pd.read_csv( - eg_lines, - quotechar="'", - true_values=["t"], - false_values=["f"], - dtype=dict( - line_id="str", - bus0="str", - bus1="str", - underground="bool", - under_construction="bool", - ), - ) - .set_index("line_id") - .rename(columns=dict(voltage="v_nom", circuits="num_parallel")) - ) - - lines["length"] /= 1e3 - - # lines["carrier"] = "AC" #TODO pypsa-eur clean/remove this - lines = _remove_dangling_branches(lines, buses) - - return lines - - -def _apply_parameter_corrections(n, parameter_corrections): - with open(parameter_corrections) as f: - corrections = yaml.safe_load(f) - - if corrections is None: - return - - for component, attrs in corrections.items(): - df = n.df(component) - oid = _get_oid(df) - if attrs is None: - continue - - for attr, repls in attrs.items(): - for i, r in repls.items(): - if i == "oid": - r = oid.map(repls["oid"]).dropna() - elif i == "index": - r = pd.Series(repls["index"]) - else: - raise NotImplementedError() - inds = r.index.intersection(df.index) - df.loc[inds, attr] = r[inds].astype(df[attr].dtype) - - -def _reconnect_crimea(lines): - logger.info("Reconnecting Crimea to the Ukrainian grid.") - lines_to_crimea = pd.DataFrame( - { - "bus0": ["3065", "3181", "3181"], - "bus1": ["3057", "3055", "3057"], - "v_nom": [300, 300, 300], - "num_parallel": [1, 1, 1], - "length": [140, 120, 140], - "carrier": ["AC", "AC", "AC"], - "underground": [False, False, False], - "under_construction": [False, False, False], - }, - index=["Melitopol", "Liubymivka left", "Luibymivka right"], - ) - - return pd.concat([lines, lines_to_crimea]) - - -# def _set_electrical_parameters_lines(lines, config): -# 
v_noms = config["electricity"]["voltages"] -# linetypes = config["lines"]["types"] - -# for v_nom in v_noms: -# lines.loc[lines["v_nom"] == v_nom, "type"] = linetypes[v_nom] - - -def _set_electrical_parameters_lines(lines_config, voltages, lines): - if lines.empty: - lines["type"] = [] - return lines - - linetypes = _get_linetypes_config(lines_config["types"], voltages) - - lines["carrier"] = "AC" - lines["dc"] = False - - lines.loc[:, "type"] = lines.v_nom.apply( - lambda x: _get_linetype_by_voltage(x, linetypes) - ) - - lines["s_max_pu"] = lines_config["s_max_pu"] - - return lines - - -def _set_lines_s_nom_from_linetypes(n): - n.lines["s_nom"] = ( - np.sqrt(3) - * n.lines["type"].map(n.line_types.i_nom) - * n.lines["v_nom"] - * n.lines["num_parallel"] - ) - # Re-define s_nom for DC lines - n.lines.loc[n.lines["carrier"] == "DC", "s_nom"] = n.lines["type"].map( - n.line_types.i_nom - ) * n.lines.eval("v_nom * num_parallel") - - -# TODO pypsa-eur: Clean/fix this, update list p_noms -def _set_electrical_parameters_links(links, config): - if links.empty: - return links - - p_max_pu = config["links"].get("p_max_pu", 1.0) - links["p_max_pu"] = p_max_pu - links["p_min_pu"] = -p_max_pu - links["carrier"] = "DC" - links["dc"] = True - - return links - - -def _set_electrical_parameters_converters(converters, config): - p_max_pu = config["links"].get("p_max_pu", 1.0) - converters["p_max_pu"] = p_max_pu - converters["p_min_pu"] = -p_max_pu - - converters["p_nom"] = 2000 - - # Converters are combined with links - converters["under_construction"] = False - converters["underground"] = False - - return converters - - -def _set_electrical_parameters_transformers(transformers, config): - config = config["transformers"] - - ## Add transformer parameters - transformers["x"] = config.get("x", 0.1) - transformers["s_nom"] = config.get("s_nom", 2000) - transformers["type"] = config.get("type", "") - - return transformers - - -def _remove_dangling_branches(branches, buses): - return 
pd.DataFrame( - branches.loc[branches.bus0.isin(buses.index) & branches.bus1.isin(buses.index)] - ) - - -def _remove_unconnected_components(network, threshold=6): - _, labels = csgraph.connected_components(network.adjacency_matrix(), directed=False) - component = pd.Series(labels, index=network.buses.index) - - component_sizes = component.value_counts() - components_to_remove = component_sizes.loc[component_sizes < threshold] - - logger.info( - f"Removing {len(components_to_remove)} unconnected network components with less than {components_to_remove.max()} buses. In total {components_to_remove.sum()} buses." - ) - - return network[component == component_sizes.index[0]] - - -def _set_countries_and_substations(n, config, country_shapes, offshore_shapes): - buses = n.buses - - def buses_in_shape(shape): - shape = shapely.prepared.prep(shape) - return pd.Series( - np.fromiter( - ( - shape.contains(Point(x, y)) - for x, y in buses.loc[:, ["x", "y"]].values - ), - dtype=bool, - count=len(buses), - ), - index=buses.index, - ) - - countries = config["countries"] - country_shapes = gpd.read_file(country_shapes).set_index("name")["geometry"] - # reindexing necessary for supporting empty geo-dataframes - offshore_shapes = gpd.read_file(offshore_shapes) - offshore_shapes = offshore_shapes.reindex(columns=["name", "geometry"]).set_index( - "name" - )["geometry"] - substation_b = buses["symbol"].str.contains( - "substation|converter station", case=False - ) - - def prefer_voltage(x, which): - index = x.index - if len(index) == 1: - return pd.Series(index, index) - key = ( - x.index[0] - if x["v_nom"].isnull().all() - else getattr(x["v_nom"], "idx" + which)() - ) - return pd.Series(key, index) - - compat_kws = dict(include_groups=False) if PD_GE_2_2 else {} - gb = buses.loc[substation_b].groupby( - ["x", "y"], as_index=False, group_keys=False, sort=False - ) - bus_map_low = gb.apply(prefer_voltage, "min", **compat_kws) - lv_b = (bus_map_low == 
bus_map_low.index).reindex(buses.index, fill_value=False) - bus_map_high = gb.apply(prefer_voltage, "max", **compat_kws) - hv_b = (bus_map_high == bus_map_high.index).reindex(buses.index, fill_value=False) - - onshore_b = pd.Series(False, buses.index) - offshore_b = pd.Series(False, buses.index) - - for country in countries: - onshore_shape = country_shapes[country] - onshore_country_b = buses_in_shape(onshore_shape) - onshore_b |= onshore_country_b - - buses.loc[onshore_country_b, "country"] = country - - if country not in offshore_shapes.index: - continue - offshore_country_b = buses_in_shape(offshore_shapes[country]) - offshore_b |= offshore_country_b - - buses.loc[offshore_country_b, "country"] = country - - # Only accept buses as low-voltage substations (where load is attached), if - # they have at least one connection which is not under_construction - has_connections_b = pd.Series(False, index=buses.index) - for b, df in product(("bus0", "bus1"), (n.lines, n.links)): - has_connections_b |= ~df.groupby(b).under_construction.min() - - buses["onshore_bus"] = onshore_b - buses["substation_lv"] = ( - lv_b & onshore_b & (~buses["under_construction"]) & has_connections_b - ) - - # TODO: fix this in pypsa-eur master branch - # buses["substation_off"] = offshore_b & ( - # ~buses["under_construction"] - # ) - - buses["substation_off"] = (offshore_b | (hv_b & onshore_b)) & ( - ~buses["under_construction"] - ) - - c_nan_b = buses.country.fillna("na") == "na" - if c_nan_b.sum() > 0: - c_tag = _get_country(buses.loc[c_nan_b]) - c_tag.loc[~c_tag.isin(countries)] = np.nan - n.buses.loc[c_nan_b, "country"] = c_tag - - c_tag_nan_b = n.buses.country.isnull() - - # Nearest country in path length defines country of still homeless buses - # Work-around until commit 705119 lands in pypsa release - n.transformers["length"] = 0.0 - graph = n.graph(weight="length") - n.transformers.drop("length", axis=1, inplace=True) - - for b in n.buses.index[c_tag_nan_b]: - df = ( - pd.DataFrame( - 
dict( - pathlength=nx.single_source_dijkstra_path_length( - graph, b, cutoff=200 - ) - ) - ) - .join(n.buses.country) - .dropna() - ) - assert ( - not df.empty - ), "No buses with defined country within 200km of bus `{}`".format(b) - n.buses.at[b, "country"] = df.loc[df.pathlength.idxmin(), "country"] - - logger.warning( - "{} buses are not in any country or offshore shape," - " {} have been assigned from the tag of the entsoe map," - " the rest from the next bus in terms of pathlength.".format( - c_nan_b.sum(), c_nan_b.sum() - c_tag_nan_b.sum() - ) - ) - - return buses - - -def _replace_b2b_converter_at_country_border_by_link(n): - # Affects only the B2B converter in Lithuania at the Polish border at the moment - buscntry = n.buses.country - linkcntry = n.links.bus0.map(buscntry) - converters_i = n.links.index[ - (n.links.carrier == "B2B") & (linkcntry == n.links.bus1.map(buscntry)) - ] - - def findforeignbus(G, i): - cntry = linkcntry.at[i] - for busattr in ("bus0", "bus1"): - b0 = n.links.at[i, busattr] - for b1 in G[b0]: - if buscntry[b1] != cntry: - return busattr, b0, b1 - return None, None, None - - for i in converters_i: - G = n.graph() - busattr, b0, b1 = findforeignbus(G, i) - if busattr is not None: - comp, line = next(iter(G[b0][b1])) - if comp != "Line": - logger.warning( - "Unable to replace B2B `{}` expected a Line, but found a {}".format( - i, comp - ) - ) - continue - - n.links.at[i, busattr] = b1 - n.links.at[i, "p_nom"] = min( - n.links.at[i, "p_nom"], n.lines.at[line, "s_nom"] - ) - n.links.at[i, "carrier"] = "DC" - n.links.at[i, "underwater_fraction"] = 0.0 - n.links.at[i, "length"] = n.lines.at[line, "length"] - - n.remove("Line", line) - n.remove("Bus", b0) - - logger.info( - "Replacing B2B converter `{}` together with bus `{}` and line `{}` by an HVDC tie-line {}-{}".format( - i, b0, line, linkcntry.at[i], buscntry.at[b1] - ) - ) - - -def _set_links_underwater_fraction(n, offshore_shapes): - if n.links.empty: - return - - if not 
hasattr(n.links, "geometry"): - n.links["underwater_fraction"] = 0.0 - else: - offshore_shape = gpd.read_file(offshore_shapes).unary_union - links = gpd.GeoSeries(n.links.geometry.dropna().map(shapely.wkt.loads)) - n.links["underwater_fraction"] = ( - links.intersection(offshore_shape).length / links.length - ) - - -def _adjust_capacities_of_under_construction_branches(n, config): - lines_mode = config["lines"].get("under_construction", "undef") - if lines_mode == "zero": - n.lines.loc[n.lines.under_construction, "num_parallel"] = 0.0 - n.lines.loc[n.lines.under_construction, "s_nom"] = 0.0 - elif lines_mode == "remove": - n.mremove("Line", n.lines.index[n.lines.under_construction]) - elif lines_mode != "keep": - logger.warning( - "Unrecognized configuration for `lines: under_construction` = `{}`. Keeping under construction lines." - ) - - links_mode = config["links"].get("under_construction", "undef") - if links_mode == "zero": - n.links.loc[n.links.under_construction, "p_nom"] = 0.0 - elif links_mode == "remove": - n.mremove("Link", n.links.index[n.links.under_construction]) - elif links_mode != "keep": - logger.warning( - "Unrecognized configuration for `links: under_construction` = `{}`. Keeping under construction links." 
- ) - - if lines_mode == "remove" or links_mode == "remove": - # We might need to remove further unconnected components - n = _remove_unconnected_components(n) - - return n - - -def _set_shapes(n, country_shapes, offshore_shapes): - # Write the geodataframes country_shapes and offshore_shapes to the network.shapes component - country_shapes = gpd.read_file(country_shapes).rename(columns={"name": "idx"}) - country_shapes["type"] = "country" - offshore_shapes = gpd.read_file(offshore_shapes).rename(columns={"name": "idx"}) - offshore_shapes["type"] = "offshore" - all_shapes = pd.concat([country_shapes, offshore_shapes], ignore_index=True) - n.madd( - "Shape", - all_shapes.index, - geometry=all_shapes.geometry, - idx=all_shapes.idx, - type=all_shapes["type"], - ) - - -def base_network_osm( - eg_buses, - eg_converters, - eg_transformers, - eg_lines, - eg_links, - europe_shape, - country_shapes, - offshore_shapes, - config, -): - buses = _load_buses_from_eg(eg_buses, europe_shape, config["electricity"]) - - # TODO pypsa-eur add this - links = _load_links_from_eg(buses, eg_links) - # if config["links"].get("include_tyndp"): - # buses, links = _add_links_from_tyndp(buses, links, links_tyndp, europe_shape) - - converters = _load_converters_from_eg(buses, eg_converters) - - lines = _load_lines_from_eg(buses, eg_lines) - transformers = _load_transformers_from_eg(buses, eg_transformers) - - if config["lines"].get("reconnect_crimea", True) and "UA" in config["countries"]: - lines = _reconnect_crimea(lines) - - lines = _set_electrical_parameters_lines( - config["lines"], config["electricity"]["voltages"], lines - ) - - links = _set_electrical_parameters_links(links, config) - - transformers = _set_electrical_parameters_transformers(transformers, config) - converters = _set_electrical_parameters_converters(converters, config) - - n = pypsa.Network() - n.name = "PyPSA-Eur (OSM)" - - time = get_snapshots(snakemake.params.snapshots, snakemake.params.drop_leap_day) - 
n.set_snapshots(time) - n.madd( - "Carrier", ["AC", "DC"] - ) # TODO: fix hard code and check if AC/DC truly exist - - n.import_components_from_dataframe(buses, "Bus") - n.import_components_from_dataframe(lines, "Line") - # The columns which names starts with "bus" are mixed up with the third-bus specification - # when executing additional_linkports() - # lines_dc.drop( - # labels=[ - # "bus0_lon", - # "bus0_lat", - # "bus1_lon", - # "bus1_lat", - # "bus_0_coors", - # "bus_1_coors", - # ], - # axis=1, - # inplace=True, - # ) - n.import_components_from_dataframe(links, "Link") - n.import_components_from_dataframe(transformers, "Transformer") - n.import_components_from_dataframe(converters, "Link") - - _set_lines_s_nom_from_linetypes(n) - - # TODO: what about this? - n = _remove_unconnected_components(n) - - _set_countries_and_substations(n, config, country_shapes, offshore_shapes) - - # TODO pypsa-eur add this - _set_links_underwater_fraction(n, offshore_shapes) - - _replace_b2b_converter_at_country_border_by_link(n) - - n = _adjust_capacities_of_under_construction_branches(n, config) - - _set_shapes(n, country_shapes, offshore_shapes) - - return n - - -def _get_linetypes_config(line_types, voltages): - """ - Return the dictionary of linetypes for selected voltages. The dictionary is - a subset of the dictionary line_types, whose keys match the selected - voltages. - - Parameters - ---------- - line_types : dict - Dictionary of linetypes: keys are nominal voltages and values are linetypes. - voltages : list - List of selected voltages. - - Returns - ------- - Dictionary of linetypes for selected voltages. - """ - # get voltages value that are not availabile in the line types - vnoms_diff = set(voltages).symmetric_difference(set(line_types.keys())) - if vnoms_diff: - logger.warning( - f"Voltages {vnoms_diff} not in the {line_types} or {voltages} list." 
- ) - return {k: v for k, v in line_types.items() if k in voltages} - - -def _get_linetype_by_voltage(v_nom, d_linetypes): - """ - Return the linetype of a specific line based on its voltage v_nom. - - Parameters - ---------- - v_nom : float - The voltage of the line. - d_linetypes : dict - Dictionary of linetypes: keys are nominal voltages and values are linetypes. - - Returns - ------- - The linetype of the line whose nominal voltage is closest to the line voltage. - """ - v_nom_min, line_type_min = min( - d_linetypes.items(), - key=lambda x: abs(x[0] - v_nom), - ) - return line_type_min - - -def voronoi_partition_pts(points, outline): - """ - Compute the polygons of a voronoi partition of `points` within the polygon - `outline`. Taken from - https://github.com/FRESNA/vresutils/blob/master/vresutils/graph.py. - - Attributes - ---------- - points : Nx2 - ndarray[dtype=float] - outline : Polygon - Returns - ------- - polygons : N - ndarray[dtype=Polygon|MultiPolygon] - """ - points = np.asarray(points) - - if len(points) == 1: - polygons = [outline] - else: - xmin, ymin = np.amin(points, axis=0) - xmax, ymax = np.amax(points, axis=0) - xspan = xmax - xmin - yspan = ymax - ymin - - # to avoid any network positions outside all Voronoi cells, append - # the corners of a rectangle framing these points - vor = spatial.Voronoi( - np.vstack( - ( - points, - [ - [xmin - 3.0 * xspan, ymin - 3.0 * yspan], - [xmin - 3.0 * xspan, ymax + 3.0 * yspan], - [xmax + 3.0 * xspan, ymin - 3.0 * yspan], - [xmax + 3.0 * xspan, ymax + 3.0 * yspan], - ], - ) - ) - ) - - polygons = [] - for i in range(len(points)): - poly = Polygon(vor.vertices[vor.regions[vor.point_region[i]]]) - - if not poly.is_valid: - poly = poly.buffer(0) - - with np.errstate(invalid="ignore"): - poly = poly.intersection(outline) - - polygons.append(poly) - - return polygons - - -def build_bus_shapes(n, country_shapes, offshore_shapes, countries): - country_shapes = 
gpd.read_file(country_shapes).set_index("name")["geometry"] - offshore_shapes = gpd.read_file(offshore_shapes) - offshore_shapes = offshore_shapes.reindex(columns=REGION_COLS).set_index("name")[ - "geometry" - ] - - onshore_regions = [] - offshore_regions = [] - - for country in countries: - c_b = n.buses.country == country - - onshore_shape = country_shapes[country] - onshore_locs = ( - n.buses.loc[c_b & n.buses.onshore_bus] - .sort_values( - by="substation_lv", ascending=False - ) # preference for substations - .drop_duplicates(subset=["x", "y"], keep="first")[["x", "y"]] - ) - onshore_regions.append( - gpd.GeoDataFrame( - { - "name": onshore_locs.index, - "x": onshore_locs["x"], - "y": onshore_locs["y"], - "geometry": voronoi_partition_pts( - onshore_locs.values, onshore_shape - ), - "country": country, - } - ) - ) - - if country not in offshore_shapes.index: - continue - offshore_shape = offshore_shapes[country] - offshore_locs = n.buses.loc[c_b & n.buses.substation_off, ["x", "y"]] - offshore_regions_c = gpd.GeoDataFrame( - { - "name": offshore_locs.index, - "x": offshore_locs["x"], - "y": offshore_locs["y"], - "geometry": voronoi_partition_pts(offshore_locs.values, offshore_shape), - "country": country, - } - ) - offshore_regions_c = offshore_regions_c.loc[offshore_regions_c.area > 1e-2] - offshore_regions.append(offshore_regions_c) - - shapes = pd.concat(onshore_regions, ignore_index=True) - - return onshore_regions, offshore_regions, shapes - - -def append_bus_shapes(n, shapes, type): - """ - Append shapes to the network. If shapes with the same component and type - already exist, they will be removed. - - Parameters: - n (pypsa.Network): The network to which the shapes will be appended. - shapes (geopandas.GeoDataFrame): The shapes to be appended. - **kwargs: Additional keyword arguments used in `n.madd`. 
- - Returns: - None - """ - remove = n.shapes.query("component == 'Bus' and type == @type").index - n.mremove("Shape", remove) - - offset = n.shapes.index.astype(int).max() + 1 if not n.shapes.empty else 0 - shapes = shapes.rename(lambda x: int(x) + offset) - n.madd( - "Shape", - shapes.index, - geometry=shapes.geometry, - idx=shapes.name, - component="Bus", - type=type, - ) - - -if __name__ == "__main__": - if "snakemake" not in globals(): - from _helpers import mock_snakemake - - snakemake = mock_snakemake("base_network") - configure_logging(snakemake) - set_scenario_config(snakemake) - - n = base_network_osm( - snakemake.input.eg_buses, - snakemake.input.eg_converters, - snakemake.input.eg_transformers, - snakemake.input.eg_lines, - snakemake.input.eg_links, - snakemake.input.europe_shape, - snakemake.input.country_shapes, - snakemake.input.offshore_shapes, - snakemake.config, - ) - - logger.info("Base network created using OSM.") - - onshore_regions, offshore_regions, shapes = build_bus_shapes( - n, - snakemake.input.country_shapes, - snakemake.input.offshore_shapes, - snakemake.params.countries, - ) - - shapes.to_file(snakemake.output.regions_onshore) - append_bus_shapes(n, shapes, "onshore") - - if offshore_regions: - shapes = pd.concat(offshore_regions, ignore_index=True) - shapes.to_file(snakemake.output.regions_offshore) - append_bus_shapes(n, shapes, "offshore") - else: - offshore_shapes.to_frame().to_file(snakemake.output.regions_offshore) - - n.meta = snakemake.config - n.export_to_netcdf(snakemake.output.base_network) diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index 899337f89..b36ffd176 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -19,7 +19,7 @@ import requests from _helpers import ( configure_logging, - # set_scenario_config, + set_scenario_config, # update_config_from_wildcards, ) @@ -143,8 +143,8 @@ def retrieve_osm_data( from _helpers import mock_snakemake snakemake = 
mock_snakemake("retrieve_osm_data", country="BE") - configure_logging(snakemake) + set_scenario_config(snakemake) # Retrieve the OSM data country = snakemake.wildcards.country From 6469ff47d67a7c45a022c97ef77f3050aa2b69fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 11:39:16 +0000 Subject: [PATCH 045/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- rules/build_electricity.smk | 44 +++++++++++++++++++++++++----------- scripts/base_network.py | 14 ++++++++---- scripts/retrieve_osm_data.py | 3 +-- 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index b51913e94..8aaf16a83 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -75,50 +75,68 @@ rule base_network: transformers=config_provider("transformers"), input: eg_buses=lambda w: ( - "data/entsoegridkit/buses.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + "data/entsoegridkit/buses.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" else ( - "data/osm/prebuilt/buses.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" + "data/osm/prebuilt/buses.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" else resources("osm/buses.csv") ) ), eg_lines=lambda w: ( - "data/entsoegridkit/lines.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + "data/entsoegridkit/lines.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" else ( - "data/osm/prebuilt/lines.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" + "data/osm/prebuilt/lines.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" else resources("osm/lines.csv") ) ), eg_links=lambda w: ( - 
"data/entsoegridkit/links.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + "data/entsoegridkit/links.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" else ( - "data/osm/prebuilt/links.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" + "data/osm/prebuilt/links.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" else resources("osm/links.csv") ) ), eg_converters=lambda w: ( - "data/entsoegridkit/converters.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + "data/entsoegridkit/converters.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" else ( - "data/osm/prebuilt/converters.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" + "data/osm/prebuilt/converters.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" else resources("osm/converters.csv") ) ), eg_transformers=lambda w: ( - "data/entsoegridkit/transformers.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + "data/entsoegridkit/transformers.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" else ( - "data/osm/prebuilt/transformers.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" + "data/osm/prebuilt/transformers.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" else resources("osm/transformers.csv") ) ), parameter_corrections=lambda w: ( - "data/parameter_corrections.yaml" if config_provider("electricity_network", "base_network")(w) == "gridkit" + "data/parameter_corrections.yaml" + if config_provider("electricity_network", "base_network")(w) == "gridkit" else [] ), links_p_nom=lambda w: ( - "data/links_p_nom.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + "data/links_p_nom.csv" + if 
config_provider("electricity_network", "base_network")(w) == "gridkit" else [] ), links_tyndp=lambda w: ( - "data/links_tyndp.csv" if config_provider("electricity_network", "base_network")(w) == "gridkit" + "data/links_tyndp.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" else [] ), country_shapes=resources("country_shapes.geojson"), diff --git a/scripts/base_network.py b/scripts/base_network.py index 963234a50..393b3e0d6 100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -386,7 +386,7 @@ def _load_lines_from_eg(buses, eg_lines): lines["length"] /= 1e3 - lines["carrier"] = "AC" #TODO pypsa-eur check + lines["carrier"] = "AC" # TODO pypsa-eur check lines = _remove_dangling_branches(lines, buses) return lines @@ -808,7 +808,7 @@ def base_network( parameter_corrections, config, ): - + buses = _load_buses_from_eg(eg_buses, europe_shape, config["electricity"]) if config["electricity_network"].get("base_network") == "gridkit": @@ -818,7 +818,9 @@ def base_network( else: raise ValueError("base_network must be either 'gridkit' or 'osm'") - if (config["links"].get("include_tyndp") & (config["electricity_network"].get("base_network") == "gridkit")): + if config["links"].get("include_tyndp") & ( + config["electricity_network"].get("base_network") == "gridkit" + ): buses, links = _add_links_from_tyndp(buses, links, links_tyndp, europe_shape) converters = _load_converters_from_eg(buses, eg_converters) @@ -882,7 +884,9 @@ def base_network( _set_shapes(n, country_shapes, offshore_shapes) - logger.info(f"Base network created using {config['electricity_network'].get('base_network')}.") + logger.info( + f"Base network created using {config['electricity_network'].get('base_network')}." 
+ ) return n @@ -1115,4 +1119,4 @@ def append_bus_shapes(n, shapes, type): offshore_shapes.to_frame().to_file(snakemake.output.regions_offshore) n.meta = snakemake.config - n.export_to_netcdf(snakemake.output.base_network) \ No newline at end of file + n.export_to_netcdf(snakemake.output.base_network) diff --git a/scripts/retrieve_osm_data.py b/scripts/retrieve_osm_data.py index e3f21443e..745533cff 100644 --- a/scripts/retrieve_osm_data.py +++ b/scripts/retrieve_osm_data.py @@ -17,10 +17,9 @@ import time import requests -from _helpers import ( # set_scenario_config,; update_config_from_wildcards, +from _helpers import ( # set_scenario_config,; update_config_from_wildcards,; update_config_from_wildcards, configure_logging, set_scenario_config, - # update_config_from_wildcards, ) logger = logging.getLogger(__name__) From 221b1656c7422975d7947fd4e8a7abd0d2cab8da Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 11 Jun 2024 14:40:48 +0200 Subject: [PATCH 046/100] Fixed bug in pdf export by substituting pdf export with svg. 
--- config/config.default.yaml | 34 +++++++--------------------------- rules/postprocess.smk | 6 +++--- scripts/plot_summary.py | 4 ++-- 3 files changed, 12 insertions(+), 32 deletions(-) diff --git a/config/config.default.yaml b/config/config.default.yaml index 7de26d9df..83801c611 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -75,10 +75,6 @@ enable: custom_busmap: false drop_leap_day: true -# Settings related to the high-voltage electricity grid -electricity_network: - base_network: "gridkit" # "gridkit", "osm-prebuilt" (prebuilt network from OSM data), "osm-raw" (retrieve and build network from raw OSM data, takes longer) - osm_group_tolerance_buses: 5000 # only relevant for "osm-raw" setting: [m] (default 5000) Tolerance in meters of the close buses to merge # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget co2_budget: @@ -92,7 +88,7 @@ co2_budget: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity electricity: - voltages: [200., 220., 300., 380., 500., 750.] + voltages: [220., 300., 380., 500., 750.] 
gaslimit_enable: false gaslimit: false co2limit_enable: false @@ -286,27 +282,11 @@ conventional: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines lines: types: - 200.: "Al/St 240/40 2-bundle 220.0" 220.: "Al/St 240/40 2-bundle 220.0" 300.: "Al/St 240/40 3-bundle 300.0" 380.: "Al/St 240/40 4-bundle 380.0" - 400.: "Al/St 240/40 4-bundle 380.0" 500.: "Al/St 240/40 4-bundle 380.0" 750.: "Al/St 560/50 4-bundle 750.0" - dc_types: - 200.: "HVDC XLPE 1000" - 250.: "HVDC XLPE 1000" - 270.: "HVDC XLPE 1000" - 285.: "HVDC XLPE 1000" - 300.: "HVDC XLPE 1000" - 320.: "HVDC XLPE 1000" - 350.: "HVDC XLPE 1000" - 380.: "HVDC Oil filled 1400" - 400.: "HVDC XLPE 1000" - 450.: "HVDC XLPE 1000" - 515.: "HVDC XLPE 1000" - 525.: "HVDC XLPE 1000" - 600.: "HVDC XLPE 1000" s_max_pu: 0.7 s_nom_max: .inf max_extension: 20000 #MW @@ -923,11 +903,11 @@ plotting: eu_node_location: x: -5.5 y: 46. - costs_max: 1000 - costs_threshold: 1 - energy_max: 20000 - energy_min: -20000 - energy_threshold: 50. + # costs_max: 1000 + # costs_threshold: 1 + # energy_max: 20000 + # energy_min: -20000 + # energy_threshold: 50. 
nice_names: OCGT: "Open-Cycle Gas" @@ -1225,4 +1205,4 @@ plotting: load: "#dd2e23" waste CHP: '#e3d37d' waste CHP CC: '#e3d3ff' - HVC to air: 'k' + HVC to air: 'k' \ No newline at end of file diff --git a/rules/postprocess.smk b/rules/postprocess.smk index 39fd46c9d..edeff1ef4 100644 --- a/rules/postprocess.smk +++ b/rules/postprocess.smk @@ -233,9 +233,9 @@ rule plot_summary: eurostat="data/eurostat/Balances-April2023", co2="data/bundle/eea/UNFCCC_v23.csv", output: - costs=RESULTS + "graphs/costs.pdf", - energy=RESULTS + "graphs/energy.pdf", - balances=RESULTS + "graphs/balances-energy.pdf", + costs=RESULTS + "graphs/costs.svg", + energy=RESULTS + "graphs/energy.svg", + balances=RESULTS + "graphs/balances-energy.svg", threads: 2 resources: mem_mb=10000, diff --git a/scripts/plot_summary.py b/scripts/plot_summary.py index 39fbba030..d131e9378 100644 --- a/scripts/plot_summary.py +++ b/scripts/plot_summary.py @@ -353,7 +353,7 @@ def plot_balances(): frameon=False, ) - fig.savefig(snakemake.output.balances[:-10] + k + ".pdf", bbox_inches="tight") + fig.savefig(snakemake.output.balances[:-10] + k + ".svg", bbox_inches="tight") def historical_emissions(countries): @@ -563,7 +563,7 @@ def plot_carbon_budget_distribution(input_eurostat, options): ) plt.grid(axis="y") - path = snakemake.output.balances.split("balances")[0] + "carbon_budget.pdf" + path = snakemake.output.balances.split("balances")[0] + "carbon_budget.svg" plt.savefig(path, bbox_inches="tight") From 9bcdbc0ed86748ec100851b38f2e11ad2c43ee86 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Jun 2024 12:41:47 +0000 Subject: [PATCH 047/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- config/config.default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.default.yaml b/config/config.default.yaml index 83801c611..40fe65ed3 100644 --- 
a/config/config.default.yaml +++ b/config/config.default.yaml @@ -1205,4 +1205,4 @@ plotting: load: "#dd2e23" waste CHP: '#e3d37d' waste CHP CC: '#e3d3ff' - HVC to air: 'k' \ No newline at end of file + HVC to air: 'k' From afb9e52b530e1ab55e92fba5838599deaa548197 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 11 Jun 2024 15:35:11 +0200 Subject: [PATCH 048/100] Bug-fix Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 726b8d2ba..4a3d3eebc 100644 --- a/Snakefile +++ b/Snakefile @@ -73,7 +73,7 @@ if config["foresight"] == "perfect": rule all: input: - expand(RESULTS + "graphs/costs.pdf", run=config["run"]["name"]), + expand(RESULTS + "graphs/costs.svg", run=config["run"]["name"]), default_target: True From 3d2169a39c880f196a879a0a40596aca96a372b1 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Mon, 17 Jun 2024 18:28:39 +0200 Subject: [PATCH 049/100] dropped not needed columns from build_osm_network. --- scripts/build_osm_network.py | 1 - scripts/build_shapes.py | 1 - 2 files changed, 2 deletions(-) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 0372692f9..b052cd173 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -801,7 +801,6 @@ def build_network( "bus1", "voltage", "circuits", - "tag_frequency", "length", "underground", "under_construction", diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 85afdaea4..402b6e6ed 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -108,7 +108,6 @@ def _simplify_polys(polys, minarea=0.1, tolerance=0.01, filterremote=True): polys = mainpoly return polys.simplify(tolerance=tolerance) - def countries(naturalearth, country_list): if "RS" in country_list: country_list.append("KV") From f2bd9bf32855c626e6e732ebece00f89a877cbb9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 16:33:29 
+0000 Subject: [PATCH 050/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/build_shapes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 402b6e6ed..85afdaea4 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -108,6 +108,7 @@ def _simplify_polys(polys, minarea=0.1, tolerance=0.01, filterremote=True): polys = mainpoly return polys.simplify(tolerance=tolerance) + def countries(naturalearth, country_list): if "RS" in country_list: country_list.append("KV") From acda4c41c6da369204679eca26d643e1de5bf2ad Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 18 Jun 2024 14:28:32 +0200 Subject: [PATCH 051/100] Updated build_shapes, config.default and clean_osm_data. --- config/config.default.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/config/config.default.yaml b/config/config.default.yaml index d8dcff2ed..cc8ef4b63 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -86,6 +86,10 @@ co2_budget: 2045: 0.032 2050: 0.000 +electricity_network: + base_network: gridkit # Options: gridkit, osm-prebuilt, osm-raw (built from scratch using OSM data, takes longer) + osm_group_tolerance_buses: 5000 # unit: meters, default 5000 - Buses within this distance are grouped together + # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity electricity: voltages: [220., 300., 380., 500., 750.] From 777f7eafd247934ffa05af09502503038541fd78 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Tue, 18 Jun 2024 17:14:17 +0200 Subject: [PATCH 052/100] pre-commit changes. 
--- scripts/build_shapes.py | 4 ++-- scripts/clean_osm_data.py | 25 ++++++++++++++++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/scripts/build_shapes.py b/scripts/build_shapes.py index 85afdaea4..74d138800 100644 --- a/scripts/build_shapes.py +++ b/scripts/build_shapes.py @@ -91,7 +91,7 @@ def _get_country(target, **keys): return np.nan -def _simplify_polys(polys, minarea=0.1, tolerance=0.01, filterremote=True): +def _simplify_polys(polys, minarea=0.1, filterremote=True): if isinstance(polys, MultiPolygon): polys = sorted(polys.geoms, key=attrgetter("area"), reverse=True) mainpoly = polys[0] @@ -106,7 +106,7 @@ def _simplify_polys(polys, minarea=0.1, tolerance=0.01, filterremote=True): ) else: polys = mainpoly - return polys.simplify(tolerance=tolerance) + return polys def countries(naturalearth, country_list): diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index d64bcec97..917071954 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -78,6 +78,19 @@ def _create_polygon(row): return polygon +def find_closest_polygon(gdf, point): + # Compute the distance to each polygon + gdf["distance"] = gdf["geometry"].apply(lambda geom: point.distance(geom)) + + # Find the index of the closest polygon + closest_idx = gdf["distance"].idxmin() + + # Get the closest polygon's row + closest_polygon = gdf.loc[closest_idx] + + return closest_idx + + def _extended_linemerge(lines): """ Merges a list of LineStrings into a single LineString by finding the @@ -549,15 +562,17 @@ def _add_line_endings_to_substations( axis=1, ) gdf_union = gpd.GeoDataFrame(geometry=gdf_union["geometry"], crs=crs) - utm = gdf_union.estimate_utm_crs(datum_name="WGS 84") - gdf_union = gdf_union.to_crs(utm) - gdf_union = gdf_union.buffer(2500) # meters - gdf_union = gdf_union.to_crs(crs) - gdf_union = gpd.GeoDataFrame(geometry=gdf_union, crs=crs) gdf_buses_tofix = gpd.GeoDataFrame( buses[bool_multiple_countries], geometry="geometry", crs=crs ) 
joined = gpd.sjoin(gdf_buses_tofix, gdf_union, how="left", predicate="within") + + # For all remaining rows where the country/index_right column is NaN, + # find the closest polygon index + joined.loc[joined["index_right"].isna(), "index_right"] = joined.loc[ + joined["index_right"].isna(), "geometry" + ].apply(lambda x: find_closest_polygon(gdf_union, x)) + joined.reset_index(inplace=True) joined = joined.drop_duplicates(subset="bus_id") joined.set_index("bus_id", inplace=True) From 58e5129ed48fc1bc40ec82804c4cac5b72fc8b65 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 19 Jun 2024 13:40:19 +0200 Subject: [PATCH 053/100] test --- scripts/build_osm_network.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index b052cd173..5ce5a804a 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -1,9 +1,7 @@ # -*- coding: utf-8 -*- -# SPDX-FileCopyrightText: PyPSA-Earth and PyPSA-Eur Authors +# SPDX-FileCopyrightText: : 2020-2024 The PyPSA-Eur and PyPSA-Earth Authors # -# SPDX-License-Identifier: AGPL-3.0-or-later - -# -*- coding: utf-8 -*- +# SPDX-License-Identifier: MIT import logging import os @@ -773,7 +771,7 @@ def build_network( # Rename "substation" in buses["symbol"] to "Substation" buses["symbol"] = buses["symbol"].replace({"substation": "Substation"}) - # Drop unncessary index column and set respective element ids as index + # Drop unnecessary index column and set respective element ids as index lines.set_index("line_id", inplace=True) links.set_index("link_id", inplace=True) converters.set_index("converter_id", inplace=True) From 917c52c8597c9a7a4b2f1126bd46e02a9729e281 Mon Sep 17 00:00:00 2001 From: Bobby Xiong Date: Wed, 19 Jun 2024 23:33:13 +0200 Subject: [PATCH 054/100] Added initial prepare_osm_network_release.py script --- Snakefile | 1 + rules/build_electricity.smk | 30 +++++------ rules/development.smk | 20 +++++++ 
scripts/base_network.py | 30 +++++++++-- scripts/build_osm_network.py | 2 +- scripts/prepare_osm_network_release.py | 74 ++++++++++++++++++++++++++ 6 files changed, 138 insertions(+), 19 deletions(-) create mode 100644 rules/development.smk create mode 100644 scripts/prepare_osm_network_release.py diff --git a/Snakefile b/Snakefile index 4a3d3eebc..56a704dec 100644 --- a/Snakefile +++ b/Snakefile @@ -54,6 +54,7 @@ include: "rules/build_sector.smk" include: "rules/solve_electricity.smk" include: "rules/postprocess.smk" include: "rules/validate.smk" +include: "rules/development.smk" if config["foresight"] == "overnight": diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 8aaf16a83..2b5437029 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -81,7 +81,7 @@ rule base_network: "data/osm/prebuilt/buses.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" - else resources("osm/buses.csv") + else resources("osm/pre-base/buses.csv") ) ), eg_lines=lambda w: ( @@ -91,7 +91,7 @@ rule base_network: "data/osm/prebuilt/lines.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" - else resources("osm/lines.csv") + else resources("osm/pre-base/lines.csv") ) ), eg_links=lambda w: ( @@ -101,7 +101,7 @@ rule base_network: "data/osm/prebuilt/links.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" - else resources("osm/links.csv") + else resources("osm/pre-base/links.csv") ) ), eg_converters=lambda w: ( @@ -111,7 +111,7 @@ rule base_network: "data/osm/prebuilt/converters.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" - else resources("osm/converters.csv") + else resources("osm/pre-base/converters.csv") ) ), eg_transformers=lambda w: ( @@ -121,7 +121,7 @@ rule base_network: "data/osm/prebuilt/transformers.csv" if config_provider("electricity_network", "base_network")(w) == "osm-prebuilt" - else 
resources("osm/transformers.csv") + else resources("osm/pre-base/transformers.csv") ) ), parameter_corrections=lambda w: ( @@ -711,16 +711,16 @@ rule build_osm_network: links=resources("osm/clean/links.geojson"), country_shapes=resources("country_shapes.geojson"), output: - lines=resources("osm/lines.csv"), - links=resources("osm/links.csv"), - converters=resources("osm/converters.csv"), - transformers=resources("osm/transformers.csv"), - substations=resources("osm/buses.csv"), - lines_geojson=resources("osm/lines.geojson"), - links_geojson=resources("osm/links.geojson"), - converters_geojson=resources("osm/converters.geojson"), - transformers_geojson=resources("osm/transformers.geojson"), - substations_geojson=resources("osm/buses.geojson"), + lines=resources("osm/pre-base/lines.csv"), + links=resources("osm/pre-base/links.csv"), + converters=resources("osm/pre-base/converters.csv"), + transformers=resources("osm/pre-base/transformers.csv"), + substations=resources("osm/pre-base/buses.csv"), + lines_geojson=resources("osm/pre-base/lines.geojson"), + links_geojson=resources("osm/pre-base/links.geojson"), + converters_geojson=resources("osm/pre-base/converters.geojson"), + transformers_geojson=resources("osm/pre-base/transformers.geojson"), + substations_geojson=resources("osm/pre-base/buses.geojson"), log: logs("build_osm_network.log"), benchmark: diff --git a/rules/development.smk b/rules/development.smk new file mode 100644 index 000000000..2316428a6 --- /dev/null +++ b/rules/development.smk @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: : 2023-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: MIT + + +rule prepare_osm_network_release: + input: + base_network=resources("networks/base.nc"), + output: + lines=resources("osm/release/lines.csv"), + links=resources("osm/release/links.csv"), + converters=resources("osm/release/converters.csv"), + transformers=resources("osm/release/transformers.csv"), + buses=resources("osm/release/buses.csv"), + log: + 
logs("prepare_osm_network_release.log"), + benchmark: + benchmarks("prepare_osm_network_release") + script: + "../scripts/prepare_osm_network_release.py" diff --git a/scripts/base_network.py b/scripts/base_network.py index 393b3e0d6..d97ecc219 100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -211,6 +211,22 @@ def _load_converters_from_eg(buses, eg_converters): return converters +def _load_converters_from_osm(buses, eg_converters): + converters = pd.read_csv( + eg_converters, + quotechar="'", + true_values=["t"], + false_values=["f"], + dtype=dict(converter_id="str", bus0="str", bus1="str"), + ).set_index("converter_id") + + converters = _remove_dangling_branches(converters, buses) + + converters["carrier"] = "" + + return converters + + def _load_links_from_eg(buses, eg_links): links = pd.read_csv( eg_links, @@ -823,12 +839,20 @@ def base_network( ): buses, links = _add_links_from_tyndp(buses, links, links_tyndp, europe_shape) - converters = _load_converters_from_eg(buses, eg_converters) + if config["electricity_network"].get("base_network") == "gridkit": + converters = _load_converters_from_eg(buses, eg_converters) + elif "osm" in config["electricity_network"].get("base_network"): + converters = _load_converters_from_osm(buses, eg_converters) + transformers = _load_transformers_from_eg(buses, eg_transformers) lines = _load_lines_from_eg(buses, eg_lines) - if config["lines"].get("reconnect_crimea", True) and "UA" in config["countries"]: + if ( + (config["electricity_network"].get("base_network") == "gridkit") + & (config["lines"].get("reconnect_crimea", True)) + & ("UA" in config["countries"]) + ): lines = _reconnect_crimea(lines) if config["electricity_network"].get("base_network") == "gridkit": @@ -908,7 +932,7 @@ def _get_linetypes_config(line_types, voltages): ------- Dictionary of linetypes for selected voltages. 
""" - # get voltages value that are not availabile in the line types + # get voltages value that are not available in the line types vnoms_diff = set(voltages).symmetric_difference(set(line_types.keys())) if vnoms_diff: logger.warning( diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 5ce5a804a..39b20aec7 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -227,7 +227,7 @@ def set_lines_ids(lines, buses, distance_crs): ascii=False, unit=" lines", total=lines.shape[0], - desc="Set line bus ids ", + desc="Set line/link bus ids ", ) # initialization diff --git a/scripts/prepare_osm_network_release.py b/scripts/prepare_osm_network_release.py new file mode 100644 index 000000000..1e877e233 --- /dev/null +++ b/scripts/prepare_osm_network_release.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: : 2020-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: MIT + +import logging + +import pypsa +from _helpers import configure_logging, set_scenario_config + +logger = logging.getLogger(__name__) + + +def prepare_osm_network_release(network): + return None + + +if __name__ == "__main__": + if "snakemake" not in globals(): + from _helpers import mock_snakemake + + snakemake = mock_snakemake("prepare_osm_network_release") + + configure_logging(snakemake) + set_scenario_config(snakemake) + + network = pypsa.Network(snakemake.input.base_network) + + buses_columns = [ + "bus_id", + "voltage", + "dc", + "symbol", + "under_construction", + "x", + "y", + "country", + "geometry", + ] + + lines_columns = [ + "line_id", + "bus0", + "bus1", + "voltage", + "circuits", + "length", + "underground", + "under_construction", + "geometry", + ] + + links_columns = [ + "link_id", + "bus0", + "bus1", + "voltage", + "p_nom", + "length", + "underground", + "under_construction", + "geometry", + ] + + transformers_columns = [ + "transformer_id", + "bus0", + "bus1", + "voltage_bus0", + "voltage_bus1", + "geometry", + ] + 
+ converters_columns = [] From 2a3ad5c3f9ebc1e1c5ca10ddc6a87d4578d3b0ca Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Fri, 21 Jun 2024 14:59:22 +0200 Subject: [PATCH 055/100] Finalised prepare_osm_network_release script to build clean and stable OSM base_network input files. --- scripts/_helpers.py | 4 +- scripts/prepare_osm_network_release.py | 81 ++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/scripts/_helpers.py b/scripts/_helpers.py index ff304f5fa..a3b77c1c0 100644 --- a/scripts/_helpers.py +++ b/scripts/_helpers.py @@ -406,13 +406,13 @@ def mock_snakemake( from snakemake.api import Workflow from snakemake.common import SNAKEFILE_CHOICES from snakemake.script import Snakemake - from snakemake.settings import ( + from snakemake.settings.types import ( + ConfigSettings, DAGSettings, ResourceSettings, StorageSettings, WorkflowSettings, ) - from snakemake.settings.types import ConfigSettings script_dir = Path(__file__).parent.resolve() if root_dir is None: diff --git a/scripts/prepare_osm_network_release.py b/scripts/prepare_osm_network_release.py index 1e877e233..70c6f6982 100644 --- a/scripts/prepare_osm_network_release.py +++ b/scripts/prepare_osm_network_release.py @@ -4,14 +4,46 @@ # SPDX-License-Identifier: MIT import logging +import os +import pandas as pd import pypsa from _helpers import configure_logging, set_scenario_config logger = logging.getLogger(__name__) -def prepare_osm_network_release(network): +def export_clean_csv(df, columns, output_file): + """ + Export a cleaned DataFrame to a CSV file. + + Args: + df (pandas.DataFrame): The DataFrame to be exported. + columns (list): A list of column names to include in the exported CSV file. + output_file (str): The path to the output CSV file. 
+ + Returns: + None + """ + rename_dict = { + "Bus": "bus_id", + "Line": "line_id", + "Link": "link_id", + "Transformer": "transformer_id", + "v_nom": "voltage", + "num_parallel": "circuits", + } + + if "converter_id" in columns: + rename_dict["Link"] = "converter_id" + + # Create the directory if it doesn't exist + os.makedirs(os.path.dirname(output_file), exist_ok=True) + + df.reset_index().rename(columns=rename_dict).loc[:, columns].replace( + {True: "t", False: "f"} + ).to_csv(output_file, index=False, quotechar="'") + return None @@ -24,8 +56,6 @@ def prepare_osm_network_release(network): configure_logging(snakemake) set_scenario_config(snakemake) - network = pypsa.Network(snakemake.input.base_network) - buses_columns = [ "bus_id", "voltage", @@ -71,4 +101,47 @@ def prepare_osm_network_release(network): "geometry", ] - converters_columns = [] + converters_columns = [ + "converter_id", + "bus0", + "bus1", + "geometry", + ] + + network = pypsa.Network(snakemake.input.base_network) + + # Export to clean csv for release + logger.info(f"Exporting {len(network.buses)} buses to %s", snakemake.output.buses) + export_clean_csv(network.buses, buses_columns, snakemake.output.buses) + + logger.info( + f"Exporting {len(network.transformers)} transformers to %s", + snakemake.output.transformers, + ) + export_clean_csv( + network.transformers, transformers_columns, snakemake.output.transformers + ) + + logger.info(f"Exporting {len(network.lines)} lines to %s", snakemake.output.lines) + export_clean_csv(network.lines, lines_columns, snakemake.output.lines) + + # Boolean that specifies if link element is a converter + is_converter = network.links.index.str.startswith("conv") == True + + logger.info( + f"Exporting {len(network.links[~is_converter])} links to %s", + snakemake.output.links, + ) + export_clean_csv( + network.links[~is_converter], links_columns, snakemake.output.links + ) + + logger.info( + f"Exporting {len(network.links[is_converter])} converters to %s", + 
snakemake.output.converters, + ) + export_clean_csv( + network.links[is_converter], converters_columns, snakemake.output.converters + ) + + logger.info("Export of OSM network for release complete.") From 38ee3da62add1121a69eb7f7c2a03504e2906f4b Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Fri, 21 Jun 2024 14:59:57 +0200 Subject: [PATCH 056/100] Added new rules/development.smk --- rules/development.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rules/development.smk b/rules/development.smk index 2316428a6..24c46a159 100644 --- a/rules/development.smk +++ b/rules/development.smk @@ -7,11 +7,11 @@ rule prepare_osm_network_release: input: base_network=resources("networks/base.nc"), output: + buses=resources("osm/release/buses.csv"), + converters=resources("osm/release/converters.csv"), lines=resources("osm/release/lines.csv"), links=resources("osm/release/links.csv"), - converters=resources("osm/release/converters.csv"), transformers=resources("osm/release/transformers.csv"), - buses=resources("osm/release/buses.csv"), log: logs("prepare_osm_network_release.log"), benchmark: From 0d9ba2ecec55c4001c15c68d5f8dd623dcbaaf7f Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 25 Jun 2024 20:36:34 +0200 Subject: [PATCH 057/100] Updated clean_osm_data to add substation_centroid to linestrings --- scripts/clean_osm_data.py | 239 ++++++++++++++++++++++---------------- 1 file changed, 136 insertions(+), 103 deletions(-) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 917071954..dc1e79915 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -34,8 +34,8 @@ import numpy as np import pandas as pd from _helpers import configure_logging, set_scenario_config -from shapely.geometry import LineString, MultiLineString, Polygon -from shapely.ops import linemerge +from shapely.geometry import LineString, MultiLineString, Point, Polygon +from shapely.ops import linemerge, unary_union logger = logging.getLogger(__name__) 
@@ -78,7 +78,17 @@ def _create_polygon(row): return polygon -def find_closest_polygon(gdf, point): +def _find_closest_polygon(gdf, point): + """ + Find the closest polygon in a GeoDataFrame to a given point. + + Parameters: + gdf (GeoDataFrame): A GeoDataFrame containing polygons. + point (Point): A Point object representing the target point. + + Returns: + int: The index of the closest polygon in the GeoDataFrame. + """ # Compute the distance to each polygon gdf["distance"] = gdf["geometry"].apply(lambda geom: point.distance(geom)) @@ -91,83 +101,6 @@ def find_closest_polygon(gdf, point): return closest_idx -def _extended_linemerge(lines): - """ - Merges a list of LineStrings into a single LineString by finding the - closest pair of points between all pairs of LineStrings. - - Parameters: - lines (list): A list of LineStrings to be merged. - - Returns: - merged_line (LineString): The merged LineString. - - Raises: - TypeError: If the input is not a list of LineStrings. - """ - # Ensure we have a list of LineStrings - if not isinstance(lines, list): - raise TypeError("Input should be a list of LineStrings") - if any(not isinstance(line, LineString) for line in lines): - raise TypeError("All elements in the list should be LineStrings") - - if len(lines) == 1: - return lines[0] - - merged_linestring = linemerge(lines) - - if isinstance(merged_linestring, LineString): - return merged_linestring - else: - - def find_closest_points(line1, line2): - min_dist = np.inf - closest_points = (None, None) - for point1 in line1.coords: - for point2 in line2.coords: - dist = np.linalg.norm(np.array(point1) - np.array(point2)) - if dist < min_dist: - min_dist = dist - closest_points = (point1, point2) - return closest_points - - def merge_lines(lines): - while len(lines) > 1: - min_distance = np.inf - closest_pair = (None, None) - pair_indices = (None, None) - for i in range(len(lines)): - for j in range(i + 1, len(lines)): - point1, point2 = find_closest_points(lines[i], 
lines[j]) - distance = np.linalg.norm(np.array(point1) - np.array(point2)) - if distance < min_distance: - min_distance = distance - closest_pair = (point1, point2) - pair_indices = (i, j) - - connecting_line = LineString([closest_pair[0], closest_pair[1]]) - combined_line = linemerge( - MultiLineString( - [ - lines[pair_indices[0]], - lines[pair_indices[1]], - connecting_line, - ] - ) - ) - - new_lines = [ - line for k, line in enumerate(lines) if k not in pair_indices - ] - new_lines.append(combined_line) - lines = new_lines - - return lines[0] - - lines = list(merged_linestring.geoms) - return merge_lines(lines) - - def _clean_voltage(column): """ Function to clean the raw voltage column: manual fixing and drop nan values @@ -571,7 +504,7 @@ def _add_line_endings_to_substations( # find the closest polygon index joined.loc[joined["index_right"].isna(), "index_right"] = joined.loc[ joined["index_right"].isna(), "geometry" - ].apply(lambda x: find_closest_polygon(gdf_union, x)) + ].apply(lambda x: _find_closest_polygon(gdf_union, x)) joined.reset_index(inplace=True) joined = joined.drop_duplicates(subset="bus_id") @@ -1128,6 +1061,27 @@ def _clean_lines(df_lines, list_voltages): def _create_substations_geometry(df_substations): + """ + Creates geometries. + + Parameters: + df_substations (DataFrame): The input DataFrame containing the substations + data. + + Returns: + df_substations (DataFrame): A new DataFrame with the + polygons ["polygon"] of the substations geometries. + """ + logger.info("Creating substations geometry.") + df_substations = df_substations.copy() + + # Create centroids from geometries and keep the original polygons + df_substations.loc[:, "polygon"] = df_substations["geometry"] + + return df_substations + + +def _create_substations_centroid(df_substations): """ Creates centroids from geometries and keeps the original polygons. 
@@ -1142,11 +1096,10 @@ def _create_substations_geometry(df_substations): logger.info("Creating substations geometry.") df_substations = df_substations.copy() - # Create centroids from geometries and keep the original polygons - df_substations.loc[:, "polygon"] = df_substations["geometry"] - df_substations.loc[:, "geometry"] = df_substations["geometry"].apply( + df_substations.loc[:, "geometry"] = df_substations["polygon"].apply( lambda x: x.centroid ) + df_substations.loc[:, "lon"] = df_substations["geometry"].apply(lambda x: x.x) df_substations.loc[:, "lat"] = df_substations["geometry"].apply(lambda x: x.y) @@ -1180,6 +1133,34 @@ def _create_lines_geometry(df_lines): return df_lines +def _add_bus_centroid_to_line(linestring, point): + """ + Adds the centroid of a substation to a linestring by extending the + linestring with a new segment. + + Parameters: + linestring (LineString): The original linestring to extend. + point (Point): The centroid of the bus. + + Returns: + merged (LineString): The extended linestring with the new segment. + """ + start = linestring.coords[0] + end = linestring.coords[-1] + + dist_to_start = point.distance(Point(start)) + dist_to_end = point.distance(Point(end)) + + if dist_to_start < dist_to_end: + new_segment = LineString([point.coords[0], start]) + else: + new_segment = LineString([point.coords[0], end]) + + merged = linemerge([linestring, new_segment]) + + return merged + + def _finalise_substations(df_substations): """ Finalises the substations column types. @@ -1533,9 +1514,65 @@ def _remove_lines_within_substations(gdf_lines, gdf_substations_polygon): return gdf_lines -# Define a function to check if a polygon intersects any line in the lines GeoDataFrame -def intersects_any_line(polygon, lines): - return lines.intersects(polygon).any() +def _merge_touching_polygons(df): + """ + Merge touching polygons in a GeoDataFrame. 
+ + Parameters: + - df: pandas.DataFrame or geopandas.GeoDataFrame + The input DataFrame containing the polygons to be merged. + + Returns: + - gdf: geopandas.GeoDataFrame + The GeoDataFrame with merged polygons. + """ + + gdf = gpd.GeoDataFrame(df, geometry="polygon", crs=crs) + combined_polygons = unary_union(gdf.geometry) + if combined_polygons.geom_type == "MultiPolygon": + gdf_combined = gpd.GeoDataFrame( + geometry=[poly for poly in combined_polygons.geoms], crs=crs + ) + else: + gdf_combined = gpd.GeoDataFrame(geometry=[combined_polygons], crs=crs) + + gdf.reset_index(drop=True, inplace=True) + + for i, combined_geom in gdf_combined.iterrows(): + mask = gdf.intersects(combined_geom.geometry) + gdf.loc[mask, "polygon_merged"] = combined_geom.geometry + + gdf.drop(columns=["polygon"], inplace=True) + gdf.rename(columns={"polygon_merged": "polygon"}, inplace=True) + + return gdf + + +def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): + """ + Extends the lines in the given GeoDataFrame `gdf_lines` to the centroid of + the nearest substations represented by the polygons in the + `gdf_substations_polygon` GeoDataFrame. + + Parameters: + gdf_lines (GeoDataFrame): A GeoDataFrame containing the lines to be extended. + gdf_substations_polygon (GeoDataFrame): A GeoDataFrame containing the polygons representing substations. + + Returns: + GeoDataFrame: A new GeoDataFrame with the lines extended to the substations. + """ + gdf = gpd.sjoin( + gdf_lines, + gdf_substations_polygon.drop_duplicates(subset="polygon", inplace=False), + how="left", + lsuffix="line", + rsuffix="bus", + predicate="intersects", + ).drop(columns="index_bus") + + # Rest of the code... 
+ + return gdf_lines if __name__ == "__main__": @@ -1571,6 +1608,12 @@ def intersects_any_line(polygon, lines): df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) df_substations = _clean_substations(df_substations, list_voltages) df_substations = _create_substations_geometry(df_substations) + # Merge touching polygons + df_substations = _merge_touching_polygons(df_substations) + # df_substations["polygon"] = df_substations["polygon"].apply( + # lambda x: x.convex_hull + # ) + df_substations = _create_substations_centroid(df_substations) df_substations = _finalise_substations(df_substations) # Create polygon GeoDataFrame to remove lines within substations @@ -1580,6 +1623,8 @@ def intersects_any_line(polygon, lines): crs=crs, ) + gdf_substations_polygon["geometry"] = gdf_substations_polygon.polygon.copy() + logger.info("---") logger.info("LINES AND CABLES") path_lines = { @@ -1618,6 +1663,7 @@ def intersects_any_line(polygon, lines): # Create GeoDataFrame gdf_lines = gpd.GeoDataFrame(df_lines, geometry="geometry", crs=crs) gdf_lines = _remove_lines_within_substations(gdf_lines, gdf_substations_polygon) + gdf_lines = _extend_lines_to_substations(gdf_lines, gdf_substations_polygon) logger.info("---") logger.info("HVDC LINKS") @@ -1661,21 +1707,6 @@ def intersects_any_line(polygon, lines): prefix="link-end", ) - # # Drop df_substations.dc == True and tag_source != "link-end" - # df_substations = df_substations[ - # ~((df_substations.dc == True) & (df_substations.tag_source != "link-end")) - # ] - - # # Apply the function to each polygon in the substations GeoDataFrame - # gdf_substations_polygon["connected"] = False - # gdf_substations_polygon['connected'] = gdf_substations_polygon['polygon'].apply(intersects_any_line, lines=gdf_lines) - - # list_buses_disconnected = gdf_substations_polygon[gdf_substations_polygon['connected'] == False]['bus_id'].tolist() - - # # Drop islanded substations - # gdf_substations_polygon = 
gdf_substations_polygon[~gdf_substations_polygon['bus_id'].isin(list_buses_disconnected)] - # df_substations = df_substations[~df_substations['bus_id'].isin(list_buses_disconnected)] - # Drop polygons and create GDF gdf_substations = gpd.GeoDataFrame( df_substations.drop(columns=["polygon"]), geometry="geometry", crs=crs @@ -1693,7 +1724,9 @@ def intersects_any_line(polygon, lines): logger.info( f"Exporting clean substations with polygon shapes to {output_substations_polygon}" ) - gdf_substations_polygon.to_file(output_substations_polygon, driver="GeoJSON") + gdf_substations_polygon.drop(columns=["geometry"]).to_file( + output_substations_polygon, driver="GeoJSON" + ) logger.info(f"Exporting clean substations to {output_substations}") gdf_substations.to_file(output_substations, driver="GeoJSON") logger.info(f"Exporting clean lines to {output_lines}") From c818cd999920822583f38754b894f427871da2d2 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 25 Jun 2024 20:38:17 +0200 Subject: [PATCH 058/100] Updated clean_osm_data to add substation_centroid to linestrings --- scripts/clean_osm_data.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index dc1e79915..0a8617f59 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -1133,34 +1133,6 @@ def _create_lines_geometry(df_lines): return df_lines -def _add_bus_centroid_to_line(linestring, point): - """ - Adds the centroid of a substation to a linestring by extending the - linestring with a new segment. - - Parameters: - linestring (LineString): The original linestring to extend. - point (Point): The centroid of the bus. - - Returns: - merged (LineString): The extended linestring with the new segment. 
- """ - start = linestring.coords[0] - end = linestring.coords[-1] - - dist_to_start = point.distance(Point(start)) - dist_to_end = point.distance(Point(end)) - - if dist_to_start < dist_to_end: - new_segment = LineString([point.coords[0], start]) - else: - new_segment = LineString([point.coords[0], end]) - - merged = linemerge([linestring, new_segment]) - - return merged - - def _finalise_substations(df_substations): """ Finalises the substations column types. From af3b4155f45817a5b843a3d3e54a1c9c63283e29 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 25 Jun 2024 21:06:25 +0200 Subject: [PATCH 059/100] Updated clean_osm_data to add substation_centroid to linestrings --- scripts/clean_osm_data.py | 121 +++++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 0a8617f59..e40f510fb 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -1133,6 +1133,34 @@ def _create_lines_geometry(df_lines): return df_lines +def _add_bus_centroid_to_line(linestring, point): + """ + Adds the centroid of a substation to a linestring by extending the + linestring with a new segment. + + Parameters: + linestring (LineString): The original linestring to extend. + point (Point): The centroid of the bus. + + Returns: + merged (LineString): The extended linestring with the new segment. + """ + start = linestring.coords[0] + end = linestring.coords[-1] + + dist_to_start = point.distance(Point(start)) + dist_to_end = point.distance(Point(end)) + + if dist_to_start < dist_to_end: + new_segment = LineString([point.coords[0], start]) + else: + new_segment = LineString([point.coords[0], end]) + + merged = linemerge([linestring, new_segment]) + + return merged + + def _finalise_substations(df_substations): """ Finalises the substations column types. 
@@ -1520,6 +1548,63 @@ def _merge_touching_polygons(df): return gdf +def _add_endpoints_to_line(linestring, polygon_dict): + """ + Adds endpoints to a line by removing any overlapping areas with polygons. + + Parameters: + linestring (LineString): The original line to add endpoints to. + polygon_dict (dict): A dictionary of polygons, where the keys are bus IDs and the values are the corresponding polygons. + + Returns: + LineString: The modified line with added endpoints. + """ + if not polygon_dict: + return linestring + + polygon_centroids = { + bus_id: polygon.centroid for bus_id, polygon in polygon_dict.items() + } + polygon_unary = polygons = unary_union(list(polygon_dict.values())) + + # difference with polygon + linestring_new = linestring.difference(polygon_unary) + + if type(linestring_new) == MultiLineString: + # keep the longest line in the multilinestring + linestring_new = max(linestring_new.geoms, key=lambda x: x.length) + + for p in polygon_centroids: + linestring_new = _add_bus_centroid_to_line(linestring_new, polygon_centroids[p]) + + return linestring_new + + +def _get_polygons_at_endpoints(linestring, polygon_dict): + """ + Get the polygons that contain the endpoints of a given linestring. + + Parameters: + linestring (LineString): The linestring for which to find the polygons at the endpoints. + polygon_dict (dict): A dictionary containing polygons as values, with bus_ids as keys. + + Returns: + dict: A dictionary containing bus_ids as keys and polygons as values, where the polygons contain the endpoints of the linestring. 
+ """ + # Get the endpoints of the linestring + start_point = Point(linestring.coords[0]) + end_point = Point(linestring.coords[-1]) + + # Initialize dictionary to store bus_ids as keys and polygons as values + bus_id_polygon_dict = {} + + for bus_id, polygon in polygon_dict.items(): + if polygon.contains(start_point) or polygon.contains(end_point): + bus_id_polygon_dict[bus_id] = polygon + + return bus_id_polygon_dict + + def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): """ Extends the lines in the given GeoDataFrame `gdf_lines` to the centroid of @@ -1542,7 +1627,41 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): predicate="intersects", ).drop(columns="index_bus") - # Rest of the code... + # Group by 'line_id' and create a dictionary mapping 'bus_id' to 'geometry_bus', excluding the grouping columns + gdf = ( + gdf.groupby("line_id") + .apply( + lambda x: x[["bus_id", "geometry_bus"]] + .dropna() + .set_index("bus_id")["geometry_bus"] + .to_dict(), + include_groups=False, + ) + .reset_index() + ) + gdf.columns = ["line_id", "bus_dict"] + + gdf["intersects_bus"] = gdf.apply(lambda row: len(row["bus_dict"]) > 0, axis=1) + + gdf.loc[:, "line_geometry"] = gdf.join( + gdf_lines.set_index("line_id")["geometry"], on="line_id" + )["geometry"] + + # Polygons at the endpoints of the linestring + gdf["bus_endpoints"] = gdf.apply( + lambda row: _get_polygons_at_endpoints(row["line_geometry"], row["bus_dict"]), + axis=1, + ) + + gdf.loc[:, "line_geometry_new"] = gdf.apply( + lambda row: _add_endpoints_to_line(row["line_geometry"], row["bus_endpoints"]), + axis=1, + ) + + gdf.set_index("line_id", inplace=True) + gdf_lines.set_index("line_id", inplace=True) + + gdf_lines.loc[:, "geometry"] = gdf["line_geometry_new"] return gdf_lines From 45bccfcd994cdea6a45c5cd95a9771bf9d24af48 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 25 Jun 2024 21:16:21 +0200 Subject: [PATCH 060/100] Updated clean_osm_data to add substation_centroid to 
linestrings --- scripts/clean_osm_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index e40f510fb..d17c5b172 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -1561,7 +1561,6 @@ def _add_endpoints_to_line(linestring, polygon_dict): """ if not polygon_dict: return linestring - polygon_centroids = { bus_id: polygon.centroid for bus_id, polygon in polygon_dict.items() } From 0db49f2772514ca35c038238d63b6483856e9fc5 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 25 Jun 2024 22:55:31 +0200 Subject: [PATCH 061/100] Added osm-prebuilt functionality and zenodo sandbox repository. --- rules/retrieve.smk | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 10ad9684a..33f36d0ec 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -319,3 +319,34 @@ if config["enable"]["retrieve"]: "../envs/retrieve.yaml" script: "../scripts/retrieve_monthly_fuel_prices.py" + + +if config["enable"]["retrieve"] and ( + config["electricity_network"]["base_network"] == "osm-prebuilt" +): + + rule retrieve_osm_prebuilt: + input: + buses=storage("https://sandbox.zenodo.org/records/74826/files/buses.csv"), + converters=storage( + "https://sandbox.zenodo.org/records/74826/files/converters.csv" + ), + lines=storage("https://sandbox.zenodo.org/records/74826/files/lines.csv"), + links=storage("https://sandbox.zenodo.org/records/74826/files/links.csv"), + transformers=storage( + "https://sandbox.zenodo.org/records/74826/files/transformers.csv" + ), + output: + buses="data/osm/prebuilt/buses.csv", + converters="data/osm/prebuilt/converters.csv", + lines="data/osm/prebuilt/lines.csv", + links="data/osm/prebuilt/links.csv", + transformers="data/osm/prebuilt/transformers.csv", + log: + "logs/retrieve_osm_prebuilt.log", + resources: + mem_mb=500, + retries: 2 + run: + for key in input.keys(): + move(input[key], output[key]) From 
4f4f7c6783989c7d24874a5090d6ecca049ef8a2 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Wed, 10 Jul 2024 16:44:06 +0200 Subject: [PATCH 062/100] Updated clean_osm_data to geopandas v.1.01 --- scripts/clean_osm_data.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index d17c5b172..babd06243 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -491,19 +491,21 @@ def _add_line_endings_to_substations( gdf_offshore, how="outer", left_index=True, right_index=True ) gdf_union["geometry"] = gdf_union.apply( - lambda row: gpd.GeoSeries([row["geometry_x"], row["geometry_y"]]).unary_union, + lambda row: gpd.GeoSeries([row["geometry_x"], row["geometry_y"]]).union_all(), axis=1, ) gdf_union = gpd.GeoDataFrame(geometry=gdf_union["geometry"], crs=crs) gdf_buses_tofix = gpd.GeoDataFrame( buses[bool_multiple_countries], geometry="geometry", crs=crs ) - joined = gpd.sjoin(gdf_buses_tofix, gdf_union, how="left", predicate="within") + joined = gpd.sjoin( + gdf_buses_tofix, gdf_union.reset_index(), how="left", predicate="within" + ) # For all remaining rows where the country/index_right column is NaN, find # find the closest polygon index - joined.loc[joined["index_right"].isna(), "index_right"] = joined.loc[ - joined["index_right"].isna(), "geometry" + joined.loc[joined["name"].isna(), "name"] = joined.loc[ + joined["name"].isna(), "geometry" ].apply(lambda x: _find_closest_polygon(gdf_union, x)) joined.reset_index(inplace=True) @@ -511,7 +513,7 @@ def _add_line_endings_to_substations( joined.set_index("bus_id", inplace=True) buses.loc[bool_multiple_countries, "country"] = joined.loc[ - bool_multiple_countries, "index_right" + bool_multiple_countries, "name" ] return buses.reset_index() From 0b5a1feca99cd4f640e01850e11d1c78f3654dfc Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Wed, 10 Jul 2024 23:18:00 +0200 Subject: [PATCH 063/100] Made base_network and build_osm_network 
function more robust for empty links. --- scripts/build_osm_network.py | 12 +++++++++--- scripts/clean_osm_data.py | 13 ++++++++----- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 39b20aec7..586b2807b 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -773,14 +773,16 @@ def build_network( # Drop unnecessary index column and set respective element ids as index lines.set_index("line_id", inplace=True) - links.set_index("link_id", inplace=True) + if not links.empty: + links.set_index("link_id", inplace=True) converters.set_index("converter_id", inplace=True) transformers.set_index("transformer_id", inplace=True) buses.set_index("bus_id", inplace=True) # Convert voltages from V to kV lines["voltage"] = lines["voltage"] / 1000 - links["voltage"] = links["voltage"] / 1000 + if not links.empty: + links["voltage"] = links["voltage"] / 1000 transformers["voltage_bus0"], transformers["voltage_bus1"] = ( transformers["voltage_bus0"] / 1000, transformers["voltage_bus1"] / 1000, @@ -817,7 +819,8 @@ def build_network( "geometry", ] - links = links[cols_links] + if not links.empty: + links = links[cols_links] cols_transformers = [ "bus0", @@ -830,6 +833,9 @@ def build_network( transformers = transformers[cols_transformers] + if links.empty: # create empty dataframe with cols_links as columns + links = pd.DataFrame(columns=["link_id"] + cols_links) + to_csv_nafix(lines, outputs["lines"], quotechar="'") # Generate CSV to_csv_nafix(links, outputs["links"], quotechar="'") # Generate CSV to_csv_nafix(converters, outputs["converters"], quotechar="'") # Generate CSV diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index babd06243..9992dba6d 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -792,6 +792,9 @@ def _filter_by_voltage(df, min_voltage=200000): - list_voltages (list): A list of unique voltage values above min_voltage. 
The type of the list elements is string. """ + if df.empty: + return df, [] + logger.info( f"Filtering dataframe by voltage. Only keeping rows above and including {min_voltage} V." ) @@ -1307,11 +1310,11 @@ def _finalise_links(df_links): ) # Initiate new columns for subsequent build_osm_network step - df_links.loc[:, "bus0"] = None - df_links.loc[:, "bus1"] = None - df_links.loc[:, "length"] = None - df_links.loc[:, "under_construction"] = False - df_links.loc[:, "dc"] = True + df_links["bus0"] = None + df_links["bus1"] = None + df_links["length"] = None + df_links["under_construction"] = False + df_links["dc"] = True # Only include needed columns df_links = df_links[ From e0ae315044b47684f0266c1ce9d7fe0c0760aa51 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Wed, 10 Jul 2024 23:31:21 +0200 Subject: [PATCH 064/100] Made base_network and build_osm_network function more robust for empty links. --- scripts/build_osm_network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 586b2807b..b96c43321 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -835,6 +835,7 @@ def build_network( if links.empty: # create empty dataframe with cols_links as columns links = pd.DataFrame(columns=["link_id"] + cols_links) + links.set_index("link_id", inplace=True) to_csv_nafix(lines, outputs["lines"], quotechar="'") # Generate CSV to_csv_nafix(links, outputs["links"], quotechar="'") # Generate CSV From 299029909b45b3aeb99505862332393fcf1263ba Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Fri, 12 Jul 2024 13:52:44 +0200 Subject: [PATCH 065/100] Bug fix in base_network. Voltage level null is now kept (relevant e.g. 
for Corsica) --- scripts/base_network.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/scripts/base_network.py b/scripts/base_network.py index 923c1a148..7e229591e 100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -151,31 +151,23 @@ def _load_buses_from_eg(eg_buses, europe_shape, config_elec): if "station_id" in buses.columns: buses.drop("station_id", axis=1, inplace=True) - # buses["carrier"] = buses.pop("dc").map({True: "DC", False: "AC"}) + buses["carrier"] = buses.pop("dc").map({True: "DC", False: "AC"}) buses["under_construction"] = buses.under_construction.where( lambda s: s.notnull(), False ).astype(bool) - - # remove all buses outside of all countries including exclusive economic zones (offshore) europe_shape = gpd.read_file(europe_shape).loc[0, "geometry"] - # TODO pypsa-eur: Temporary fix: Convex hull, this is important when nodes are between countries - # europe_shape = europe_shape.convex_hull - europe_shape_prepped = shapely.prepared.prep(europe_shape) buses_in_europe_b = buses[["x", "y"]].apply( lambda p: europe_shape_prepped.contains(Point(p)), axis=1 ) - # TODO pypsa-eur: Find a long-term solution - # buses_with_v_nom_to_keep_b = ( - # buses.v_nom.isin(config_elec["voltages"]) | buses.v_nom.isnull() - # ) - v_nom_min = min(config_elec["voltages"]) v_nom_max = max(config_elec["voltages"]) # Quick fix: - buses_with_v_nom_to_keep_b = (v_nom_min <= buses.v_nom) & (buses.v_nom <= v_nom_max) + buses_with_v_nom_to_keep_b = (v_nom_min <= buses.v_nom) & ( + buses.v_nom <= v_nom_max + ) | buses.v_nom.isnull() logger.info(f"Removing buses outside of range {v_nom_min} - {v_nom_max} V") return pd.DataFrame(buses.loc[buses_in_europe_b & buses_with_v_nom_to_keep_b]) From c0e6e43a24212c8cd1bcd649b3640f89e273a5f1 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Mon, 15 Jul 2024 10:08:55 +0200 Subject: [PATCH 066/100] Merge with changes in upstream PR 1146. Fixing UA and MD.
--- rules/build_electricity.smk | 3 +- rules/retrieve.smk | 17 +++ scripts/_helpers.py | 21 ++++ scripts/add_electricity.py | 114 ++++++++++++++++-- .../determine_availability_matrix_MD_UA.py | 2 + scripts/retrieve_gdp_uamd.py | 34 ++++++ 6 files changed, 183 insertions(+), 8 deletions(-) create mode 100644 scripts/retrieve_gdp_uamd.py diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index cdcbd6662..1896ce9d5 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -468,7 +468,8 @@ rule add_electricity: ), load=resources("electricity_demand.csv"), nuts3_shapes=resources("nuts3_shapes.geojson"), - ua_md_gdp="data/GDP_PPP_30arcsec_v3_mapped_default.csv", + gdp_file="data/GDP_per_capita_PPP_1990_2015_v2.nc", + ppp_file="data/ppp_2013_1km_Aggregated.tif", output: resources("networks/elec.nc"), log: diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 33f36d0ec..0590d766a 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -321,6 +321,23 @@ if config["enable"]["retrieve"]: "../scripts/retrieve_monthly_fuel_prices.py" +if config["enable"]["retrieve"] and any(c in ["UA", "MD"] for c in config["countries"]): + + rule retrieve_gdp_uamd: + output: + gdp="data/GDP_per_capita_PPP_1990_2015_v2.nc", + ppp="data/ppp_2013_1km_Aggregated.tif", + log: + "logs/retrieve_gdp_uamd.log", + resources: + mem_mb=5000, + retries: 2 + conda: + "../envs/retrieve.yaml" + script: + "../scripts/retrieve_gdp_uamd.py" + + if config["enable"]["retrieve"] and ( config["electricity_network"]["base_network"] == "osm-prebuilt" ): diff --git a/scripts/_helpers.py b/scripts/_helpers.py index 0bf92e396..c40945ad1 100644 --- a/scripts/_helpers.py +++ b/scripts/_helpers.py @@ -370,6 +370,27 @@ def update_to(b=1, bsize=1, tsize=None): urllib.request.urlretrieve(url, file, reporthook=update_to) +def retrieve_file(url, destination): + """ + Downloads a file from a specified URL to a local destination using custom + headers that mimic a Firefox browser 
request. + + This function is useful for overcoming 'HTTP Error 403: Forbidden' + issues, which often occur when the server requires more typical + browser-like headers for access. + """ + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36" + } + response = requests.get(url, headers=headers) + response.raise_for_status() + + with open(destination, "wb") as f: + f.write(response.content) + logger.info(f"File downloaded and saved as {destination}") + + def mock_snakemake( rulename, root_dir=None, diff --git a/scripts/add_electricity.py b/scripts/add_electricity.py index f90d6c851..ae7167562 100755 --- a/scripts/add_electricity.py +++ b/scripts/add_electricity.py @@ -91,6 +91,7 @@ import pandas as pd import powerplantmatching as pm import pypsa +import rasterio import scipy.sparse as sparse import xarray as xr from _helpers import ( @@ -100,6 +101,8 @@ update_p_nom_max, ) from powerplantmatching.export import map_country_bus +from rasterio.mask import mask +from shapely.geometry import box from shapely.prepared import prep idx = pd.IndexSlice @@ -294,13 +297,13 @@ def shapes_to_shapes(orig, dest): return transfer -def attach_load(n, regions, load, nuts3_shapes, ua_md_gdp, countries, scaling=1.0): +def attach_load( + n, regions, load, nuts3_shapes, gdp_file, ppp_file, countries, scaling=1.0 +): substation_lv_i = n.buses.index[n.buses["substation_lv"]] - regions = gpd.read_file(regions).set_index("name").reindex(substation_lv_i) + gdf_regions = gpd.read_file(regions).set_index("name").reindex(substation_lv_i) opsd_load = pd.read_csv(load, index_col=0, parse_dates=True).filter(items=countries) - ua_md_gdp = pd.read_csv(ua_md_gdp, dtype={"name": "str"}).set_index("name") - logger.info(f"Load data scaled by factor {scaling}.") opsd_load *= scaling @@ -325,7 +328,12 @@ def upsample(cntry, group): factors = normed(0.6 * normed(gdp_n) + 0.4 * normed(pop_n)) if cntry in ["UA", "MD"]: # 
overwrite factor because nuts3 provides no data for UA+MD - factors = normed(ua_md_gdp.loc[group.index, "GDP_PPP"].squeeze()) + gdp_ua_md, ppp_ua_md = calculate_ua_md_gdp_ppp( + gdf_regions[gdf_regions.country == cntry], gdp_file, ppp_file + ) + factors = normed( + 0.6 * normed(gdp_ua_md["gdp"]) + 0.4 * normed(ppp_ua_md["ppp"]) + ) return pd.DataFrame( factors.values * load.values[:, np.newaxis], index=load.index, @@ -335,7 +343,7 @@ def upsample(cntry, group): load = pd.concat( [ upsample(cntry, group) - for cntry, group in regions.geometry.groupby(regions.country) + for cntry, group in gdf_regions.geometry.groupby(gdf_regions.country) ], axis=1, ) @@ -791,6 +799,97 @@ def attach_line_rating( n.lines_t.s_max_pu *= s_max_pu +def calculate_ua_md_gdp_ppp(gdf_regions, gdp_file, ppp_file): + """ + Calculate the GDP and PPP values for the regions within the bounding box of + UA and MD. + + Parameters: + gdf_regions (GeoDataFrame): A GeoDataFrame containing the regions. + gdp_file (str): The file path to the dataset containing the GDP values for UA and MD. + ppp_file (str): The file path to the dataset containing the PPP values for UA and MD. + + Returns: + tuple: A tuple containing two GeoDataFrames: + - gdp_ua_md: A GeoDataFrame with the aggregated GDP values mapped to each bus. + - ppp_ua_md: A GeoDataFrame with the aggregated PPP values mapped to each bus. 
+ """ + # Create a bounding box for UA, MD from region shape, including a buffer of 10000 metres + box_ua_md = ( + gpd.GeoDataFrame(geometry=[box(*gdf_regions.total_bounds)], crs=gdf_regions.crs) + .to_crs(epsg=3857) + .buffer(10000) + .to_crs(gdf_regions.crs) + ) + + # GDP + with xr.open_dataset(gdp_file) as src_gdp_ua_md: + src_gdp_ua_md = src_gdp_ua_md.where( + (src_gdp_ua_md.longitude >= box_ua_md.bounds.minx.min()) + & (src_gdp_ua_md.longitude <= box_ua_md.bounds.maxx.max()) + & (src_gdp_ua_md.latitude >= box_ua_md.bounds.miny.min()) + & (src_gdp_ua_md.latitude <= box_ua_md.bounds.maxy.max()), + drop=True, + ) + gdp_ua_md = src_gdp_ua_md.to_dataframe().reset_index() + + gdp_ua_md = gdp_ua_md.rename(columns={"GDP_per_capita_PPP": "gdp"}) + gdp_ua_md = gdp_ua_md[gdp_ua_md.time == gdp_ua_md.time.max()] + gdp_ua_md = gpd.GeoDataFrame( + gdp_ua_md, + geometry=gpd.points_from_xy(gdp_ua_md.longitude, gdp_ua_md.latitude), + crs="EPSG:4326", + ) + + gdp_ua_md = gpd.sjoin( + gdp_ua_md, gdf_regions.reset_index(), predicate="within" + ).drop(columns=["index_right"]) + gdp_ua_md = ( + gdp_ua_md.groupby(["Bus", "country", "time"]) + .agg({"gdp": "sum"}) + .reset_index(level=["country", "time"]) + ) + + # PPP + with rasterio.open(ppp_file) as src_ppp_ua_md: + # Mask the raster with the bounding box + out_image, out_transform = mask(src_ppp_ua_md, box_ua_md, crop=True) + out_image, + out_meta = src_ppp_ua_md.meta.copy() + out_meta.update( + { + "driver": "GTiff", + "height": out_image.shape[1], + "width": out_image.shape[2], + "transform": out_transform, + } + ) + + masked_data = out_image[0] # Use the first band (rest is empty) + row_indices, col_indices = np.where(masked_data != src_ppp_ua_md.nodata) + values = masked_data[row_indices, col_indices] + + # Affine transformation from pixel coordinates to geo coordinates + x_coords, y_coords = rasterio.transform.xy(out_transform, row_indices, col_indices) + ppp_ua_md = pd.DataFrame({"x": x_coords, "y": y_coords, "ppp": values}) 
+ + ppp_ua_md = gpd.GeoDataFrame( + ppp_ua_md, + geometry=gpd.points_from_xy(ppp_ua_md.x, ppp_ua_md.y), + crs=src_ppp_ua_md.crs, + ) + + ppp_ua_md = gpd.sjoin(ppp_ua_md, gdf_regions.reset_index(), predicate="within") + ppp_ua_md = ( + ppp_ua_md.groupby(["Bus", "country"]) + .agg({"ppp": "sum"}) + .reset_index() + .set_index("Bus") + ) + + return gdp_ua_md, ppp_ua_md + + if __name__ == "__main__": if "snakemake" not in globals(): from _helpers import mock_snakemake @@ -821,7 +920,8 @@ def attach_line_rating( snakemake.input.regions, snakemake.input.load, snakemake.input.nuts3_shapes, - snakemake.input.ua_md_gdp, + snakemake.input.gdp_file, + snakemake.input.ppp_file, params.countries, params.scaling_factor, ) diff --git a/scripts/determine_availability_matrix_MD_UA.py b/scripts/determine_availability_matrix_MD_UA.py index 678ef025d..f19919e39 100644 --- a/scripts/determine_availability_matrix_MD_UA.py +++ b/scripts/determine_availability_matrix_MD_UA.py @@ -49,6 +49,8 @@ def get_wdpa_layer_name(wdpa_fn, layer_substring): gpd.read_file(snakemake.input.regions).set_index("name").rename_axis("bus") ) buses = regions.index + buses = regions.loc[regions["country"].isin(["UA", "MD"])].index.values + regions = regions.loc[buses] excluder = atlite.ExclusionContainer(crs=3035, res=100) diff --git a/scripts/retrieve_gdp_uamd.py b/scripts/retrieve_gdp_uamd.py new file mode 100644 index 000000000..3da3be4f0 --- /dev/null +++ b/scripts/retrieve_gdp_uamd.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: : 2023-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: MIT +""" +Retrieve GDP and PPP data for Ukraine and Moldova.
+""" + +import logging +from pathlib import Path + +from _helpers import configure_logging, retrieve_file, set_scenario_config + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + if "snakemake" not in globals(): + from _helpers import mock_snakemake + + snakemake = mock_snakemake("retrieve_gdp_uamd") + configure_logging(snakemake) + set_scenario_config(snakemake) + +dict_urls = dict( + { + "gdp": "https://datadryad.org/stash/downloads/file_stream/241947", + "ppp": "https://github.com/ecohealthalliance/sars_cov_risk/releases/download/v2.0.1/ppp_2020_1km_Aggregated.tif", + } +) + +# Download and validate each dataset +for key, path in snakemake.output.items(): + retrieve_file(dict_urls[key], path) From 84bf9ec66b9065b8cee876097f0f60d5e033ba6a Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Mon, 15 Jul 2024 10:46:10 +0200 Subject: [PATCH 067/100] Updated Zenodo and fixed prepare_osm_network_release --- rules/retrieve.smk | 10 +++++----- scripts/prepare_osm_network_release.py | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 0590d766a..6cf1efc7c 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -344,14 +344,14 @@ if config["enable"]["retrieve"] and ( rule retrieve_osm_prebuilt: input: - buses=storage("https://sandbox.zenodo.org/records/74826/files/buses.csv"), + buses=storage("https://sandbox.zenodo.org/records/87601/files/buses.csv"), converters=storage( - "https://sandbox.zenodo.org/records/74826/files/converters.csv" + "https://sandbox.zenodo.org/records/87601/files/converters.csv" ), - lines=storage("https://sandbox.zenodo.org/records/74826/files/lines.csv"), - links=storage("https://sandbox.zenodo.org/records/74826/files/links.csv"), + lines=storage("https://sandbox.zenodo.org/records/87601/files/lines.csv"), + links=storage("https://sandbox.zenodo.org/records/87601/files/links.csv"), transformers=storage( - 
"https://sandbox.zenodo.org/records/74826/files/transformers.csv" + "https://sandbox.zenodo.org/records/87601/files/transformers.csv" ), output: buses="data/osm/prebuilt/buses.csv", diff --git a/scripts/prepare_osm_network_release.py b/scripts/prepare_osm_network_release.py index 70c6f6982..70f1f3be1 100644 --- a/scripts/prepare_osm_network_release.py +++ b/scripts/prepare_osm_network_release.py @@ -59,7 +59,6 @@ def export_clean_csv(df, columns, output_file): buses_columns = [ "bus_id", "voltage", - "dc", "symbol", "under_construction", "x", From 7d96dd2d70924089d09fe5802f7855217d69ef0c Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Mon, 15 Jul 2024 11:02:40 +0200 Subject: [PATCH 068/100] Updated osm network release. --- rules/retrieve.smk | 10 +++++----- scripts/prepare_osm_network_release.py | 3 +++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 6cf1efc7c..91d72d8ac 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -344,14 +344,14 @@ if config["enable"]["retrieve"] and ( rule retrieve_osm_prebuilt: input: - buses=storage("https://sandbox.zenodo.org/records/87601/files/buses.csv"), + buses=storage("https://sandbox.zenodo.org/records/87612/files/buses.csv"), converters=storage( - "https://sandbox.zenodo.org/records/87601/files/converters.csv" + "https://sandbox.zenodo.org/records/87612/files/converters.csv" ), - lines=storage("https://sandbox.zenodo.org/records/87601/files/lines.csv"), - links=storage("https://sandbox.zenodo.org/records/87601/files/links.csv"), + lines=storage("https://sandbox.zenodo.org/records/87612/files/lines.csv"), + links=storage("https://sandbox.zenodo.org/records/87612/files/links.csv"), transformers=storage( - "https://sandbox.zenodo.org/records/87601/files/transformers.csv" + "https://sandbox.zenodo.org/records/87612/files/transformers.csv" ), output: buses="data/osm/prebuilt/buses.csv", diff --git a/scripts/prepare_osm_network_release.py 
b/scripts/prepare_osm_network_release.py index 70f1f3be1..41ebd5c83 100644 --- a/scripts/prepare_osm_network_release.py +++ b/scripts/prepare_osm_network_release.py @@ -59,6 +59,7 @@ def export_clean_csv(df, columns, output_file): buses_columns = [ "bus_id", "voltage", + "dc", "symbol", "under_construction", "x", @@ -109,6 +110,8 @@ def export_clean_csv(df, columns, output_file): network = pypsa.Network(snakemake.input.base_network) + network.buses["dc"] = network.buses.pop("carrier").map({"DC": "t", "AC": "f"}) + # Export to clean csv for release logger.info(f"Exporting {len(network.buses)} buses to %s", snakemake.output.buses) export_clean_csv(network.buses, buses_columns, snakemake.output.buses) From 867de1edaf39d46f6e84de703a4e142fc40a5d19 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Mon, 15 Jul 2024 20:32:19 +0200 Subject: [PATCH 069/100] Updated prepare osm network release. --- rules/retrieve.smk | 10 +++++----- scripts/prepare_osm_network_release.py | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 91d72d8ac..33f1ccc6d 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -344,14 +344,14 @@ if config["enable"]["retrieve"] and ( rule retrieve_osm_prebuilt: input: - buses=storage("https://sandbox.zenodo.org/records/87612/files/buses.csv"), + buses=storage("https://sandbox.zenodo.org/records/87679/files/buses.csv"), converters=storage( - "https://sandbox.zenodo.org/records/87612/files/converters.csv" + "https://sandbox.zenodo.org/records/87679/files/converters.csv" ), - lines=storage("https://sandbox.zenodo.org/records/87612/files/lines.csv"), - links=storage("https://sandbox.zenodo.org/records/87612/files/links.csv"), + lines=storage("https://sandbox.zenodo.org/records/87679/files/lines.csv"), + links=storage("https://sandbox.zenodo.org/records/87679/files/links.csv"), transformers=storage( - "https://sandbox.zenodo.org/records/87612/files/transformers.csv" + 
"https://sandbox.zenodo.org/records/87679/files/transformers.csv" ), output: buses="data/osm/prebuilt/buses.csv", diff --git a/scripts/prepare_osm_network_release.py b/scripts/prepare_osm_network_release.py index 41ebd5c83..b33009e0b 100644 --- a/scripts/prepare_osm_network_release.py +++ b/scripts/prepare_osm_network_release.py @@ -111,6 +111,8 @@ def export_clean_csv(df, columns, output_file): network = pypsa.Network(snakemake.input.base_network) network.buses["dc"] = network.buses.pop("carrier").map({"DC": "t", "AC": "f"}) + network.lines.length = network.lines.length * 1e3 + network.links.length = network.links.length * 1e3 # Export to clean csv for release logger.info(f"Exporting {len(network.buses)} buses to %s", snakemake.output.buses) From 41dff328bb3880c99e723dc3e1fbac4d192961d5 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 16 Jul 2024 10:04:21 +0200 Subject: [PATCH 070/100] Updated MD, UA scripts. --- rules/build_electricity.smk | 211 ++++-------------- rules/retrieve.smk | 33 +-- scripts/build_gdp_ppp_non_nuts3.py | 151 +++++++++++++ .../determine_availability_matrix_MD_UA.py | 3 +- 4 files changed, 194 insertions(+), 204 deletions(-) create mode 100644 scripts/build_gdp_ppp_non_nuts3.py diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 1896ce9d5..4e71d9f19 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -74,71 +74,14 @@ rule base_network: links=config_provider("links"), transformers=config_provider("transformers"), input: - eg_buses=lambda w: ( - "data/entsoegridkit/buses.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - "data/osm/prebuilt/buses.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/buses.csv") - ) - ), - eg_lines=lambda w: ( - "data/entsoegridkit/lines.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - 
"data/osm/prebuilt/lines.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/lines.csv") - ) - ), - eg_links=lambda w: ( - "data/entsoegridkit/links.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - "data/osm/prebuilt/links.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/links.csv") - ) - ), - eg_converters=lambda w: ( - "data/entsoegridkit/converters.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - "data/osm/prebuilt/converters.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/converters.csv") - ) - ), - eg_transformers=lambda w: ( - "data/entsoegridkit/transformers.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - "data/osm/prebuilt/transformers.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/transformers.csv") - ) - ), - parameter_corrections=lambda w: ( - "data/parameter_corrections.yaml" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else [] - ), - links_p_nom=lambda w: ( - "data/links_p_nom.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else [] - ), - links_tyndp=lambda w: ( - "data/links_tyndp.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else [] - ), + eg_buses="data/entsoegridkit/buses.csv", + eg_lines="data/entsoegridkit/lines.csv", + eg_links="data/entsoegridkit/links.csv", + eg_converters="data/entsoegridkit/converters.csv", + eg_transformers="data/entsoegridkit/transformers.csv", + parameter_corrections="data/parameter_corrections.yaml", + links_p_nom="data/links_p_nom.csv", + links_tyndp="data/links_tyndp.csv", 
country_shapes=resources("country_shapes.geojson"), offshore_shapes=resources("offshore_shapes.geojson"), europe_shape=resources("europe_shape.geojson"), @@ -336,7 +279,7 @@ rule build_renewable_profiles: benchmarks("build_renewable_profiles_{technology}") threads: config["atlite"].get("nprocesses", 4) resources: - mem_mb=config["atlite"].get("nprocesses", 4) * 10000, + mem_mb=config["atlite"].get("nprocesses", 4) * 5000, wildcard_constraints: technology="(?!hydro).*", # Any technology other than hydro conda: @@ -432,6 +375,35 @@ def input_conventional(w): } +# Optional input when having Ukraine (UA) or Moldova (MD) in the countries list +def input_gdp_ppp_non_nuts3(w): + countries = set(config_provider("countries")(w)) + if {"UA", "MD"}.intersection(countries): + return {"gdp_ppp_non_nuts3": resources("gdp_ppp_non_nuts3.geojson")} + return {} + + +rule build_gdp_ppp_non_nuts3: + params: + countries=config_provider("countries"), + input: + base_network=resources("networks/base.nc"), + regions=resources("regions_onshore.geojson"), + gdp_non_nuts3="data/GDP_per_capita_PPP_1990_2015_v2.nc", + ppp_non_nuts3="data/ppp_2013_1km_Aggregated.tif", + output: + resources("gdp_ppp_non_nuts3.geojson"), + log: + logs("build_gdp_ppp_non_nuts3.log"), + threads: 1 + resources: + mem_mb=1500, + conda: + "../envs/environment.yaml" + script: + "../scripts/build_gdp_ppp_non_nuts3.py" + + rule add_electricity: params: length_factor=config_provider("lines", "length_factor"), @@ -447,6 +419,7 @@ rule add_electricity: input: unpack(input_profile_tech), unpack(input_conventional), + unpack(input_gdp_ppp_non_nuts3), base_network=resources("networks/base.nc"), line_rating=lambda w: ( resources("networks/line_rating.nc") @@ -468,8 +441,6 @@ rule add_electricity: ), load=resources("electricity_demand.csv"), nuts3_shapes=resources("nuts3_shapes.geojson"), - gdp_file="data/GDP_per_capita_PPP_1990_2015_v2.nc", - ppp_file="data/ppp_2013_1km_Aggregated.tif", output: resources("networks/elec.nc"), 
log: @@ -515,7 +486,7 @@ rule simplify_network: benchmarks("simplify_network/elec_s{simpl}") threads: 1 resources: - mem_mb=40000, + mem_mb=12000, conda: "../envs/environment.yaml" script: @@ -562,7 +533,7 @@ rule cluster_network: benchmarks("cluster_network/elec_s{simpl}_{clusters}") threads: 1 resources: - mem_mb=40000, + mem_mb=10000, conda: "../envs/environment.yaml" script: @@ -628,103 +599,3 @@ rule prepare_network: "../envs/environment.yaml" script: "../scripts/prepare_network.py" - - -rule retrieve_osm_data: - output: - cables_way="data/osm/raw/{country}/cables_way.json", - lines_way="data/osm/raw/{country}/lines_way.json", - links_relation="data/osm/raw/{country}/links_relation.json", - substations_way="data/osm/raw/{country}/substations_way.json", - substations_relation="data/osm/raw/{country}/substations_relation.json", - log: - "logs/retrieve_osm_data_{country}.log", - resources: - cores=2, - threads=1, - script: - "../scripts/retrieve_osm_data.py" - - -rule retrieve_osm_data_all: - input: - expand( - "data/osm/raw/{country}/cables_way.json", - country=config_provider("countries"), - ), - expand( - "data/osm/raw/{country}/lines_way.json", - country=config_provider("countries"), - ), - expand( - "data/osm/raw/{country}/links_relation.json", - country=config_provider("countries"), - ), - expand( - "data/osm/raw/{country}/substations_way.json", - country=config_provider("countries"), - ), - expand( - "data/osm/raw/{country}/substations_relation.json", - country=config_provider("countries"), - ), - - -rule clean_osm_data: - input: - cables_way=expand( - "data/osm/raw/{country}/cables_way.json", - country=config_provider("countries"), - ), - lines_way=expand( - "data/osm/raw/{country}/lines_way.json", - country=config_provider("countries"), - ), - links_relation=expand( - "data/osm/raw/{country}/links_relation.json", - country=config_provider("countries"), - ), - substations_way=expand( - "data/osm/raw/{country}/substations_way.json", - 
country=config_provider("countries"), - ), - substations_relation=expand( - "data/osm/raw/{country}/substations_relation.json", - country=config_provider("countries"), - ), - offshore_shapes=resources("offshore_shapes.geojson"), - country_shapes=resources("country_shapes.geojson"), - output: - substations=resources("osm/clean/substations.geojson"), - substations_polygon=resources("osm/clean/substations_polygon.geojson"), - lines=resources("osm/clean/lines.geojson"), - links=resources("osm/clean/links.geojson"), - log: - logs("clean_osm_data.log"), - script: - "../scripts/clean_osm_data.py" - - -rule build_osm_network: - input: - substations=resources("osm/clean/substations.geojson"), - lines=resources("osm/clean/lines.geojson"), - links=resources("osm/clean/links.geojson"), - country_shapes=resources("country_shapes.geojson"), - output: - lines=resources("osm/pre-base/lines.csv"), - links=resources("osm/pre-base/links.csv"), - converters=resources("osm/pre-base/converters.csv"), - transformers=resources("osm/pre-base/transformers.csv"), - substations=resources("osm/pre-base/buses.csv"), - lines_geojson=resources("osm/pre-base/lines.geojson"), - links_geojson=resources("osm/pre-base/links.geojson"), - converters_geojson=resources("osm/pre-base/converters.geojson"), - transformers_geojson=resources("osm/pre-base/transformers.geojson"), - substations_geojson=resources("osm/pre-base/buses.geojson"), - log: - logs("build_osm_network.log"), - benchmark: - benchmarks("build_osm_network") - script: - "../scripts/build_osm_network.py" diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 33f1ccc6d..2f9fe21df 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -321,7 +321,7 @@ if config["enable"]["retrieve"]: "../scripts/retrieve_monthly_fuel_prices.py" -if config["enable"]["retrieve"] and any(c in ["UA", "MD"] for c in config["countries"]): +if config["enable"]["retrieve"] and {"UA", "MD"}.intersection(config["countries"]): rule retrieve_gdp_uamd: output: @@ 
-336,34 +336,3 @@ if config["enable"]["retrieve"] and any(c in ["UA", "MD"] for c in config["count "../envs/retrieve.yaml" script: "../scripts/retrieve_gdp_uamd.py" - - -if config["enable"]["retrieve"] and ( - config["electricity_network"]["base_network"] == "osm-prebuilt" -): - - rule retrieve_osm_prebuilt: - input: - buses=storage("https://sandbox.zenodo.org/records/87679/files/buses.csv"), - converters=storage( - "https://sandbox.zenodo.org/records/87679/files/converters.csv" - ), - lines=storage("https://sandbox.zenodo.org/records/87679/files/lines.csv"), - links=storage("https://sandbox.zenodo.org/records/87679/files/links.csv"), - transformers=storage( - "https://sandbox.zenodo.org/records/87679/files/transformers.csv" - ), - output: - buses="data/osm/prebuilt/buses.csv", - converters="data/osm/prebuilt/converters.csv", - lines="data/osm/prebuilt/lines.csv", - links="data/osm/prebuilt/links.csv", - transformers="data/osm/prebuilt/transformers.csv", - log: - "logs/retrieve_osm_prebuilt.log", - resources: - mem_mb=500, - retries: 2 - run: - for key in input.keys(): - move(input[key], output[key]) diff --git a/scripts/build_gdp_ppp_non_nuts3.py b/scripts/build_gdp_ppp_non_nuts3.py new file mode 100644 index 000000000..80dd1bb32 --- /dev/null +++ b/scripts/build_gdp_ppp_non_nuts3.py @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: : 2017-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: MIT +""" + +""" + +import logging + +import geopandas as gpd +import numpy as np +import pandas as pd +import pypsa +import rasterio +import xarray as xr +from _helpers import configure_logging, set_scenario_config +from rasterio.mask import mask +from shapely.geometry import box + +logger = logging.getLogger(__name__) + + +def calc_gdp_ppp(country, regions, gdp_non_nuts3, ppp_non_nuts3): + """ + Calculate the GDP and PPP values for non NUTS3 regions. + + Parameters: + country (str): The two-letter country code of the non-NUTS3 region. 
+ regions (GeoDataFrame): A GeoDataFrame containing the regions. + gdp_non_nuts3 (str): The file path to the dataset containing the GDP values + for non NUTS3 countries (e.g. MD, UA) + ppp_non_nuts3 (str): The file path to the dataset containing the PPP values + for non NUTS3 countries (e.g. MD, UA) + + Returns: + tuple: A tuple containing two GeoDataFrames: + - gdp: A GeoDataFrame with the aggregated GDP values mapped to each bus. + - ppp: A GeoDataFrame with the aggregated PPP values mapped to each bus. + """ + regions = regions.drop(columns=["x", "y"]) + regions = regions[regions.country == country] + # Create a bounding box for UA, MD from region shape, including a buffer of 10000 metres + bounding_box = ( + gpd.GeoDataFrame(geometry=[box(*regions.total_bounds)], crs=regions.crs) + .to_crs(epsg=3857) + .buffer(10000) + .to_crs(regions.crs) + ) + + # GDP + logger.info(f"Mapping GDP values to non-NUTS3 region: {regions.country.unique()}") + with xr.open_dataset(gdp_non_nuts3) as src_gdp: + src_gdp = src_gdp.where( + (src_gdp.longitude >= bounding_box.bounds.minx.min()) + & (src_gdp.longitude <= bounding_box.bounds.maxx.max()) + & (src_gdp.latitude >= bounding_box.bounds.miny.min()) + & (src_gdp.latitude <= bounding_box.bounds.maxy.max()), + drop=True, + ) + gdp = src_gdp.to_dataframe().reset_index() + gdp = gdp.rename(columns={"GDP_per_capita_PPP": "gdp"}) + gdp = gdp[gdp.time == gdp.time.max()] + gdp = gpd.GeoDataFrame( + gdp, + geometry=gpd.points_from_xy(gdp.longitude, gdp.latitude), + crs="EPSG:4326", + ) + gdp = gpd.sjoin(gdp, regions, predicate="within") + gdp = ( + gdp.groupby(["Bus", "country"]) + .agg({"gdp": "sum"}) + .reset_index(level=["country"]) + ) + + # PPP + logger.info(f"Mapping PPP values to non-NUTS3 region: {regions.country.unique()}") + with rasterio.open(ppp_non_nuts3) as src_ppp: + # Mask the raster with the bounding box + out_image, out_transform = mask(src_ppp, bounding_box, crop=True) + out_image, + out_meta = src_ppp.meta.copy() + 
out_meta.update( + { + "driver": "GTiff", + "height": out_image.shape[1], + "width": out_image.shape[2], + "transform": out_transform, + } + ) + masked_data = out_image[0] # Use the first band (rest is empty) + row_indices, col_indices = np.where(masked_data != src_ppp.nodata) + values = masked_data[row_indices, col_indices] + + # Affine transformation from pixel coordinates to geo coordinates + x_coords, y_coords = rasterio.transform.xy(out_transform, row_indices, col_indices) + ppp = pd.DataFrame({"x": x_coords, "y": y_coords, "ppp": values}) + ppp = gpd.GeoDataFrame( + ppp, + geometry=gpd.points_from_xy(ppp.x, ppp.y), + crs=src_ppp.crs, + ) + ppp = gpd.sjoin(ppp, regions, predicate="within") + ppp = ( + ppp.groupby(["Bus", "country"]) + .agg({"ppp": "sum"}) + .reset_index() + .set_index("Bus") + ) + gdp_ppp = regions.join(gdp.drop(columns="country"), on="Bus").join( + ppp.drop(columns="country"), on="Bus" + ) + gdp_ppp.fillna(0, inplace=True) + + return gdp_ppp + + +if __name__ == "__main__": + if "snakemake" not in globals(): + from _helpers import mock_snakemake + + snakemake = mock_snakemake("build_gdp_ppp_non_nuts3") + configure_logging(snakemake) + set_scenario_config(snakemake) + + n = pypsa.Network(snakemake.input.base_network) + substation_lv_i = n.buses.index[n.buses["substation_lv"]] + regions = ( + gpd.read_file(snakemake.input.regions) + .set_index("name") + .reindex(substation_lv_i) + ) + + gdp_non_nuts3 = snakemake.input.gdp_non_nuts3 + ppp_non_nuts3 = snakemake.input.ppp_non_nuts3 + + countries_non_nuts3 = pd.Index(("MD", "UA")) + subset = set(countries_non_nuts3) & set(snakemake.params.countries) + + gdp_ppp = pd.concat( + [ + calc_gdp_ppp(country, regions, gdp_non_nuts3, ppp_non_nuts3) + for country in subset + ], + axis=0, + ) + + logger.info( + f"Exporting GDP and PPP values for non-NUTS3 regions {snakemake.output}" + ) + gdp_ppp.reset_index().to_file(snakemake.output, driver="GeoJSON") diff --git 
a/scripts/determine_availability_matrix_MD_UA.py b/scripts/determine_availability_matrix_MD_UA.py index f19919e39..f6f416dfb 100644 --- a/scripts/determine_availability_matrix_MD_UA.py +++ b/scripts/determine_availability_matrix_MD_UA.py @@ -49,6 +49,7 @@ def get_wdpa_layer_name(wdpa_fn, layer_substring): gpd.read_file(snakemake.input.regions).set_index("name").rename_axis("bus") ) buses = regions.index + # Limit to "UA" and "MD" regions buses = regions.loc[regions["country"].isin(["UA", "MD"])].index.values regions = regions.loc[buses] @@ -154,8 +155,6 @@ def get_wdpa_layer_name(wdpa_fn, layer_substring): plt.axis("off") plt.savefig(snakemake.output.availability_map, bbox_inches="tight", dpi=500) - # Limit results only to buses for UA and MD - buses = regions.loc[regions["country"].isin(["UA", "MD"])].index.values availability = availability.sel(bus=buses) # Save and plot for verification From 75f146cec17c695c16fbb0dfc64e001bce231145 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 16 Jul 2024 10:06:38 +0200 Subject: [PATCH 071/100] Cleaned determine_availability_matrix_MD_UA.py, removed redundant code --- scripts/determine_availability_matrix_MD_UA.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/determine_availability_matrix_MD_UA.py b/scripts/determine_availability_matrix_MD_UA.py index f6f416dfb..2ed11d3c0 100644 --- a/scripts/determine_availability_matrix_MD_UA.py +++ b/scripts/determine_availability_matrix_MD_UA.py @@ -48,7 +48,6 @@ def get_wdpa_layer_name(wdpa_fn, layer_substring): regions = ( gpd.read_file(snakemake.input.regions).set_index("name").rename_axis("bus") ) - buses = regions.index # Limit to "UA" and "MD" regions buses = regions.loc[regions["country"].isin(["UA", "MD"])].index.values regions = regions.loc[buses] From 44f46362bd49a4f4defaf6238955906a5cd0c558 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 16 Jul 2024 15:04:31 +0200 Subject: [PATCH 072/100] Bug fixes. 
--- rules/build_electricity.smk | 137 +++++++++++++++++++++++++++-- rules/retrieve.smk | 80 +++++++++++++++++ scripts/add_electricity.py | 120 ++++--------------------- scripts/build_gdp_ppp_non_nuts3.py | 4 + 4 files changed, 229 insertions(+), 112 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 4e71d9f19..27b3ac58e 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -74,14 +74,71 @@ rule base_network: links=config_provider("links"), transformers=config_provider("transformers"), input: - eg_buses="data/entsoegridkit/buses.csv", - eg_lines="data/entsoegridkit/lines.csv", - eg_links="data/entsoegridkit/links.csv", - eg_converters="data/entsoegridkit/converters.csv", - eg_transformers="data/entsoegridkit/transformers.csv", - parameter_corrections="data/parameter_corrections.yaml", - links_p_nom="data/links_p_nom.csv", - links_tyndp="data/links_tyndp.csv", + eg_buses=lambda w: ( + "data/entsoegridkit/buses.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" + else ( + "data/osm/prebuilt/buses.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" + else resources("osm/pre-base/buses.csv") + ) + ), + eg_lines=lambda w: ( + "data/entsoegridkit/lines.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" + else ( + "data/osm/prebuilt/lines.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" + else resources("osm/pre-base/lines.csv") + ) + ), + eg_links=lambda w: ( + "data/entsoegridkit/links.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" + else ( + "data/osm/prebuilt/links.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" + else resources("osm/pre-base/links.csv") + ) + ), + eg_converters=lambda w: ( + "data/entsoegridkit/converters.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" 
+ else ( + "data/osm/prebuilt/converters.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" + else resources("osm/pre-base/converters.csv") + ) + ), + eg_transformers=lambda w: ( + "data/entsoegridkit/transformers.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" + else ( + "data/osm/prebuilt/transformers.csv" + if config_provider("electricity_network", "base_network")(w) + == "osm-prebuilt" + else resources("osm/pre-base/transformers.csv") + ) + ), + parameter_corrections=lambda w: ( + "data/parameter_corrections.yaml" + if config_provider("electricity_network", "base_network")(w) == "gridkit" + else [] + ), + links_p_nom=lambda w: ( + "data/links_p_nom.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" + else [] + ), + links_tyndp=lambda w: ( + "data/links_tyndp.csv" + if config_provider("electricity_network", "base_network")(w) == "gridkit" + else [] + ), country_shapes=resources("country_shapes.geojson"), offshore_shapes=resources("offshore_shapes.geojson"), europe_shape=resources("europe_shape.geojson"), @@ -599,3 +656,67 @@ rule prepare_network: "../envs/environment.yaml" script: "../scripts/prepare_network.py" + + +if config["electricity_network"]["base_network"] == "osm-raw": + + rule clean_osm_data: + input: + cables_way=expand( + "data/osm/raw/{country}/cables_way.json", + country=config_provider("countries"), + ), + lines_way=expand( + "data/osm/raw/{country}/lines_way.json", + country=config_provider("countries"), + ), + links_relation=expand( + "data/osm/raw/{country}/links_relation.json", + country=config_provider("countries"), + ), + substations_way=expand( + "data/osm/raw/{country}/substations_way.json", + country=config_provider("countries"), + ), + substations_relation=expand( + "data/osm/raw/{country}/substations_relation.json", + country=config_provider("countries"), + ), + offshore_shapes=resources("offshore_shapes.geojson"), + 
country_shapes=resources("country_shapes.geojson"), + output: + substations=resources("osm/clean/substations.geojson"), + substations_polygon=resources("osm/clean/substations_polygon.geojson"), + lines=resources("osm/clean/lines.geojson"), + links=resources("osm/clean/links.geojson"), + log: + logs("clean_osm_data.log"), + script: + "../scripts/clean_osm_data.py" + + +if config["electricity_network"]["base_network"] == "osm-raw": + + rule build_osm_network: + input: + substations=resources("osm/clean/substations.geojson"), + lines=resources("osm/clean/lines.geojson"), + links=resources("osm/clean/links.geojson"), + country_shapes=resources("country_shapes.geojson"), + output: + lines=resources("osm/pre-base/lines.csv"), + links=resources("osm/pre-base/links.csv"), + converters=resources("osm/pre-base/converters.csv"), + transformers=resources("osm/pre-base/transformers.csv"), + substations=resources("osm/pre-base/buses.csv"), + lines_geojson=resources("osm/pre-base/lines.geojson"), + links_geojson=resources("osm/pre-base/links.geojson"), + converters_geojson=resources("osm/pre-base/converters.geojson"), + transformers_geojson=resources("osm/pre-base/transformers.geojson"), + substations_geojson=resources("osm/pre-base/buses.geojson"), + log: + logs("build_osm_network.log"), + benchmark: + benchmarks("build_osm_network") + script: + "../scripts/build_osm_network.py" diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 2f9fe21df..11312b600 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -336,3 +336,83 @@ if config["enable"]["retrieve"] and {"UA", "MD"}.intersection(config["countries" "../envs/retrieve.yaml" script: "../scripts/retrieve_gdp_uamd.py" + + +if config["enable"]["retrieve"] and ( + config["electricity_network"]["base_network"] == "osm-prebuilt" +): + + rule retrieve_osm_prebuilt: + input: + buses=storage("https://sandbox.zenodo.org/records/87679/files/buses.csv"), + converters=storage( + 
"https://sandbox.zenodo.org/records/87679/files/converters.csv" + ), + lines=storage("https://sandbox.zenodo.org/records/87679/files/lines.csv"), + links=storage("https://sandbox.zenodo.org/records/87679/files/links.csv"), + transformers=storage( + "https://sandbox.zenodo.org/records/87679/files/transformers.csv" + ), + output: + buses="data/osm/prebuilt/buses.csv", + converters="data/osm/prebuilt/converters.csv", + lines="data/osm/prebuilt/lines.csv", + links="data/osm/prebuilt/links.csv", + transformers="data/osm/prebuilt/transformers.csv", + log: + "logs/retrieve_osm_prebuilt.log", + resources: + mem_mb=500, + retries: 2 + run: + for key in input.keys(): + move(input[key], output[key]) + + + +if config["enable"]["retrieve"] and ( + config["electricity_network"]["base_network"] == "osm-raw" +): + + rule retrieve_osm_data: + output: + cables_way="data/osm/raw/{country}/cables_way.json", + lines_way="data/osm/raw/{country}/lines_way.json", + links_relation="data/osm/raw/{country}/links_relation.json", + substations_way="data/osm/raw/{country}/substations_way.json", + substations_relation="data/osm/raw/{country}/substations_relation.json", + log: + "logs/retrieve_osm_data_{country}.log", + resources: + cores=2, + threads=1, + script: + "../scripts/retrieve_osm_data.py" + + +if config["enable"]["retrieve"] and ( + config["electricity_network"]["base_network"] == "osm-raw" +): + + rule retrieve_osm_data_all: + input: + expand( + "data/osm/raw/{country}/cables_way.json", + country=config_provider("countries"), + ), + expand( + "data/osm/raw/{country}/lines_way.json", + country=config_provider("countries"), + ), + expand( + "data/osm/raw/{country}/links_relation.json", + country=config_provider("countries"), + ), + expand( + "data/osm/raw/{country}/substations_way.json", + country=config_provider("countries"), + ), + expand( + "data/osm/raw/{country}/substations_relation.json", + country=config_provider("countries"), + ), diff --git a/scripts/add_electricity.py 
b/scripts/add_electricity.py index ae7167562..df40a8007 100755 --- a/scripts/add_electricity.py +++ b/scripts/add_electricity.py @@ -91,7 +91,6 @@ import pandas as pd import powerplantmatching as pm import pypsa -import rasterio import scipy.sparse as sparse import xarray as xr from _helpers import ( @@ -101,8 +100,6 @@ update_p_nom_max, ) from powerplantmatching.export import map_country_bus -from rasterio.mask import mask -from shapely.geometry import box from shapely.prepared import prep idx = pd.IndexSlice @@ -298,7 +295,7 @@ def shapes_to_shapes(orig, dest): def attach_load( - n, regions, load, nuts3_shapes, gdp_file, ppp_file, countries, scaling=1.0 + n, regions, load, nuts3_shapes, gdp_ppp_non_nuts3, countries, scaling=1.0 ): substation_lv_i = n.buses.index[n.buses["substation_lv"]] gdf_regions = gpd.read_file(regions).set_index("name").reindex(substation_lv_i) @@ -309,7 +306,7 @@ def attach_load( nuts3 = gpd.read_file(nuts3_shapes).set_index("index") - def upsample(cntry, group): + def upsample(cntry, group, gdp_ppp_non_nuts3): load = opsd_load[cntry] if len(group) == 1: @@ -326,13 +323,15 @@ def upsample(cntry, group): # relative factors 0.6 and 0.4 have been determined from a linear # regression on the country to continent load data factors = normed(0.6 * normed(gdp_n) + 0.4 * normed(pop_n)) - if cntry in ["UA", "MD"]: + if cntry in ["UA", "MD"] and gdp_ppp_non_nuts3 is not None: # overwrite factor because nuts3 provides no data for UA+MD - gdp_ua_md, ppp_ua_md = calculate_ua_md_gdp_ppp( - gdf_regions[gdf_regions.country == cntry], gdp_file, ppp_file - ) + gdp_ppp_non_nuts3 = gpd.read_file(gdp_ppp_non_nuts3).set_index("Bus") + gdp_ppp_non_nuts3 = gdp_ppp_non_nuts3.loc[ + gdp_ppp_non_nuts3.country == cntry + ] factors = normed( - 0.6 * normed(gdp_ua_md["gdp"]) + 0.4 * normed(ppp_ua_md["ppp"]) + 0.6 * normed(gdp_ppp_non_nuts3["gdp"]) + + 0.4 * normed(gdp_ppp_non_nuts3["ppp"]) ) return pd.DataFrame( factors.values * load.values[:, np.newaxis], @@ -342,7 
+341,7 @@ def upsample(cntry, group): load = pd.concat( [ - upsample(cntry, group) + upsample(cntry, group, gdp_ppp_non_nuts3) for cntry, group in gdf_regions.geometry.groupby(gdf_regions.country) ], axis=1, @@ -799,97 +798,6 @@ def attach_line_rating( n.lines_t.s_max_pu *= s_max_pu -def calculate_ua_md_gdp_ppp(gdf_regions, gdp_file, ppp_file): - """ - Calculate the GDP and PPP values for the regions within the bounding box of - UA and MD. - - Parameters: - gdf_regions (GeoDataFrame): A GeoDataFrame containing the regions. - gdp_file (str): The file path to the dataset containing the GDP values for UA and MD. - ppp_file (str): The file path to the dataset containing the PPP values for UA and MD. - - Returns: - tuple: A tuple containing two GeoDataFrames: - - gdp_ua_md: A GeoDataFrame with the aggregated GDP values mapped to each bus. - - ppp_ua_md: A GeoDataFrame with the aggregated PPP values mapped to each bus. - """ - # Create a bounding box for UA, MD from region shape, including a buffer of 10000 metres - box_ua_md = ( - gpd.GeoDataFrame(geometry=[box(*gdf_regions.total_bounds)], crs=gdf_regions.crs) - .to_crs(epsg=3857) - .buffer(10000) - .to_crs(gdf_regions.crs) - ) - - # GDP - with xr.open_dataset(gdp_file) as src_gdp_ua_md: - src_gdp_ua_md = src_gdp_ua_md.where( - (src_gdp_ua_md.longitude >= box_ua_md.bounds.minx.min()) - & (src_gdp_ua_md.longitude <= box_ua_md.bounds.maxx.max()) - & (src_gdp_ua_md.latitude >= box_ua_md.bounds.miny.min()) - & (src_gdp_ua_md.latitude <= box_ua_md.bounds.maxy.max()), - drop=True, - ) - gdp_ua_md = src_gdp_ua_md.to_dataframe().reset_index() - - gdp_ua_md = gdp_ua_md.rename(columns={"GDP_per_capita_PPP": "gdp"}) - gdp_ua_md = gdp_ua_md[gdp_ua_md.time == gdp_ua_md.time.max()] - gdp_ua_md = gpd.GeoDataFrame( - gdp_ua_md, - geometry=gpd.points_from_xy(gdp_ua_md.longitude, gdp_ua_md.latitude), - crs="EPSG:4326", - ) - - gdp_ua_md = gpd.sjoin( - gdp_ua_md, gdf_regions.reset_index(), predicate="within" - 
).drop(columns=["index_right"]) - gdp_ua_md = ( - gdp_ua_md.groupby(["Bus", "country", "time"]) - .agg({"gdp": "sum"}) - .reset_index(level=["country", "time"]) - ) - - # PPP - with rasterio.open(ppp_file) as src_ppp_ua_md: - # Mask the raster with the bounding box - out_image, out_transform = mask(src_ppp_ua_md, box_ua_md, crop=True) - out_image, - out_meta = src_ppp_ua_md.meta.copy() - out_meta.update( - { - "driver": "GTiff", - "height": out_image.shape[1], - "width": out_image.shape[2], - "transform": out_transform, - } - ) - - masked_data = out_image[0] # Use the first band (rest is empty) - row_indices, col_indices = np.where(masked_data != src_ppp_ua_md.nodata) - values = masked_data[row_indices, col_indices] - - # Affine transformation from pixel coordinates to geo coordinates - x_coords, y_coords = rasterio.transform.xy(out_transform, row_indices, col_indices) - ppp_ua_md = pd.DataFrame({"x": x_coords, "y": y_coords, "ppp": values}) - - ppp_ua_md = gpd.GeoDataFrame( - ppp_ua_md, - geometry=gpd.points_from_xy(ppp_ua_md.x, ppp_ua_md.y), - crs=src_ppp_ua_md.crs, - ) - - ppp_ua_md = gpd.sjoin(ppp_ua_md, gdf_regions.reset_index(), predicate="within") - ppp_ua_md = ( - ppp_ua_md.groupby(["Bus", "country"]) - .agg({"ppp": "sum"}) - .reset_index() - .set_index("Bus") - ) - - return gdp_ua_md, ppp_ua_md - - if __name__ == "__main__": if "snakemake" not in globals(): from _helpers import mock_snakemake @@ -915,13 +823,17 @@ def calculate_ua_md_gdp_ppp(gdf_regions, gdp_file, ppp_file): ) ppl = load_powerplants(snakemake.input.powerplants) + if "gdp_ppp_non_nuts3" in snakemake.input.keys(): + gdp_ppp_non_nuts3 = snakemake.input.gdp_ppp_non_nuts3 + else: + gdp_ppp_non_nuts3 = None + attach_load( n, snakemake.input.regions, snakemake.input.load, snakemake.input.nuts3_shapes, - snakemake.input.gdp_file, - snakemake.input.ppp_file, + gdp_ppp_non_nuts3, params.countries, params.scaling_factor, ) diff --git a/scripts/build_gdp_ppp_non_nuts3.py 
b/scripts/build_gdp_ppp_non_nuts3.py index 80dd1bb32..4c9cda265 100644 --- a/scripts/build_gdp_ppp_non_nuts3.py +++ b/scripts/build_gdp_ppp_non_nuts3.py @@ -3,7 +3,11 @@ # # SPDX-License-Identifier: MIT """ +Maps the GDP and PPP values to non-NUTS3 regions. +The script takes as input the country code, a GeoDataFrame containing +the regions, and the file paths to the datasets containing the GDP and +PPP values for non-NUTS3 countries. """ import logging From 712c476260fa33b3df071f024a97702b6481c4fa Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 16 Jul 2024 21:54:29 +0200 Subject: [PATCH 073/100] Bug fixes for UA MD scripts. --- rules/build_electricity.smk | 16 ++-- rules/retrieve.smk | 4 +- scripts/add_electricity.py | 26 +++--- scripts/build_gdp_ppp_non_nuts3.py | 136 +++++++++++++++++++---------- scripts/retrieve_gdp_uamd.py | 4 +- 5 files changed, 114 insertions(+), 72 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 27b3ac58e..ce73d1d5c 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -433,32 +433,32 @@ def input_conventional(w): # Optional input when having Ukraine (UA) or Moldova (MD) in the countries list -def input_gdp_ppp_non_nuts3(w): +def input_gdp_pop_non_nuts3(w): countries = set(config_provider("countries")(w)) if {"UA", "MD"}.intersection(countries): - return {"gdp_ppp_non_nuts3": resources("gdp_ppp_non_nuts3.geojson")} + return {"gdp_pop_non_nuts3": resources("gdp_pop_non_nuts3.geojson")} return {} -rule build_gdp_ppp_non_nuts3: +rule build_gdp_pop_non_nuts3: params: countries=config_provider("countries"), input: base_network=resources("networks/base.nc"), regions=resources("regions_onshore.geojson"), gdp_non_nuts3="data/GDP_per_capita_PPP_1990_2015_v2.nc", - ppp_non_nuts3="data/ppp_2013_1km_Aggregated.tif", + pop_non_nuts3="data/ppp_2013_1km_Aggregated.tif", output: - resources("gdp_ppp_non_nuts3.geojson"), + resources("gdp_pop_non_nuts3.geojson"), log: - 
logs("build_gdp_ppp_non_nuts3.log"), + logs("build_gdp_pop_non_nuts3.log"), threads: 1 resources: mem_mb=1500, conda: "../envs/environment.yaml" script: - "../scripts/build_gdp_ppp_non_nuts3.py" + "../scripts/build_gdp_pop_non_nuts3.py" rule add_electricity: @@ -476,7 +476,7 @@ rule add_electricity: input: unpack(input_profile_tech), unpack(input_conventional), - unpack(input_gdp_ppp_non_nuts3), + unpack(input_gdp_pop_non_nuts3), base_network=resources("networks/base.nc"), line_rating=lambda w: ( resources("networks/line_rating.nc") diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 11312b600..a939ecd9a 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -325,8 +325,8 @@ if config["enable"]["retrieve"] and {"UA", "MD"}.intersection(config["countries" rule retrieve_gdp_uamd: output: - gdp="data/GDP_per_capita_PPP_1990_2015_v2.nc", - ppp="data/ppp_2013_1km_Aggregated.tif", + gdp_non_nuts3="data/GDP_per_capita_PPP_1990_2015_v2.nc", + pop_non_nuts3="data/ppp_2013_1km_Aggregated.tif", log: "logs/retrieve_gdp_uamd.log", resources: diff --git a/scripts/add_electricity.py b/scripts/add_electricity.py index df40a8007..1bd139189 100755 --- a/scripts/add_electricity.py +++ b/scripts/add_electricity.py @@ -295,7 +295,7 @@ def shapes_to_shapes(orig, dest): def attach_load( - n, regions, load, nuts3_shapes, gdp_ppp_non_nuts3, countries, scaling=1.0 + n, regions, load, nuts3_shapes, gdp_pop_non_nuts3, countries, scaling=1.0 ): substation_lv_i = n.buses.index[n.buses["substation_lv"]] gdf_regions = gpd.read_file(regions).set_index("name").reindex(substation_lv_i) @@ -306,7 +306,7 @@ def attach_load( nuts3 = gpd.read_file(nuts3_shapes).set_index("index") - def upsample(cntry, group, gdp_ppp_non_nuts3): + def upsample(cntry, group, gdp_pop_non_nuts3): load = opsd_load[cntry] if len(group) == 1: @@ -323,15 +323,15 @@ def upsample(cntry, group, gdp_ppp_non_nuts3): # relative factors 0.6 and 0.4 have been determined from a linear # regression on the country to continent 
load data factors = normed(0.6 * normed(gdp_n) + 0.4 * normed(pop_n)) - if cntry in ["UA", "MD"] and gdp_ppp_non_nuts3 is not None: + if cntry in ["UA", "MD"] and gdp_pop_non_nuts3 is not None: # overwrite factor because nuts3 provides no data for UA+MD - gdp_ppp_non_nuts3 = gpd.read_file(gdp_ppp_non_nuts3).set_index("Bus") - gdp_ppp_non_nuts3 = gdp_ppp_non_nuts3.loc[ - gdp_ppp_non_nuts3.country == cntry + gdp_pop_non_nuts3 = gpd.read_file(gdp_pop_non_nuts3).set_index("Bus") + gdp_pop_non_nuts3 = gdp_pop_non_nuts3.loc[ + gdp_pop_non_nuts3.country == cntry ] factors = normed( - 0.6 * normed(gdp_ppp_non_nuts3["gdp"]) - + 0.4 * normed(gdp_ppp_non_nuts3["ppp"]) + 0.6 * normed(gdp_pop_non_nuts3["gdp"]) + + 0.4 * normed(gdp_pop_non_nuts3["pop"]) ) return pd.DataFrame( factors.values * load.values[:, np.newaxis], @@ -341,7 +341,7 @@ def upsample(cntry, group, gdp_ppp_non_nuts3): load = pd.concat( [ - upsample(cntry, group, gdp_ppp_non_nuts3) + upsample(cntry, group, gdp_pop_non_nuts3) for cntry, group in gdf_regions.geometry.groupby(gdf_regions.country) ], axis=1, @@ -823,17 +823,17 @@ def attach_line_rating( ) ppl = load_powerplants(snakemake.input.powerplants) - if "gdp_ppp_non_nuts3" in snakemake.input.keys(): - gdp_ppp_non_nuts3 = snakemake.input.gdp_ppp_non_nuts3 + if "gdp_pop_non_nuts3" in snakemake.input.keys(): + gdp_pop_non_nuts3 = snakemake.input.gdp_pop_non_nuts3 else: - gdp_ppp_non_nuts3 = None + gdp_pop_non_nuts3 = None attach_load( n, snakemake.input.regions, snakemake.input.load, snakemake.input.nuts3_shapes, - gdp_ppp_non_nuts3, + gdp_pop_non_nuts3, params.countries, params.scaling_factor, ) diff --git a/scripts/build_gdp_ppp_non_nuts3.py b/scripts/build_gdp_ppp_non_nuts3.py index 4c9cda265..7d45da748 100644 --- a/scripts/build_gdp_ppp_non_nuts3.py +++ b/scripts/build_gdp_ppp_non_nuts3.py @@ -3,16 +3,20 @@ # # SPDX-License-Identifier: MIT """ -Maps the GDP and PPP values to non-NUTS3 regions. +Maps the GDP p.c. 
-The script takes as input the country code, a GeoDataFrame containing -the regions, and the file paths to the datasets containing the GDP and -PPP values for non-NUTS3 countries. +and population values to non-NUTS3 regions. The script takes as input +the country code, a GeoDataFrame containing the regions, and the file +paths to the datasets containing the GDP and POP values for non-NUTS3 +countries. """ import logging +import cartopy.crs as ccrs import geopandas as gpd +import matplotlib.colors as colors +import matplotlib.pyplot as plt import numpy as np import pandas as pd import pypsa @@ -25,24 +29,28 @@ logger = logging.getLogger(__name__) -def calc_gdp_ppp(country, regions, gdp_non_nuts3, ppp_non_nuts3): +def calc_gdp_pop(country, regions, gdp_non_nuts3, pop_non_nuts3): """ - Calculate the GDP and PPP values for non NUTS3 regions. + Calculate the GDP p.c. and population values for non NUTS3 regions. Parameters: country (str): The two-letter country code of the non-NUTS3 region. regions (GeoDataFrame): A GeoDataFrame containing the regions. - gdp_non_nuts3 (str): The file path to the dataset containing the GDP values + gdp_non_nuts3 (str): The file path to the dataset containing the GDP p.c values for non NUTS3 countries (e.g. MD, UA) - ppp_non_nuts3 (str): The file path to the dataset containing the PPP values + pop_non_nuts3 (str): The file path to the dataset containing the POP values for non NUTS3 countries (e.g. MD, UA) Returns: tuple: A tuple containing two GeoDataFrames: - - gdp: A GeoDataFrame with the aggregated GDP values mapped to each bus. - - ppp: A GeoDataFrame with the aggregated PPP values mapped to each bus. + - gdp: A GeoDataFrame with the mean GDP p.c. values mapped to each bus. + - pop: A GeoDataFrame with the summed POP values mapped to each bus. 
""" - regions = regions.drop(columns=["x", "y"]) + regions = ( + regions.rename(columns={"name": "Bus"}) + .drop(columns=["x", "y"]) + .set_index("Bus") + ) regions = regions[regions.country == country] # Create a bounding box for UA, MD from region shape, including a buffer of 10000 metres bounding_box = ( @@ -53,7 +61,9 @@ def calc_gdp_ppp(country, regions, gdp_non_nuts3, ppp_non_nuts3): ) # GDP - logger.info(f"Mapping GDP values to non-NUTS3 region: {regions.country.unique()}") + logger.info( + f"Mapping mean GDP p.c. to non-NUTS3 region: {regions.country.unique()}" + ) with xr.open_dataset(gdp_non_nuts3) as src_gdp: src_gdp = src_gdp.where( (src_gdp.longitude >= bounding_box.bounds.minx.min()) @@ -65,25 +75,28 @@ def calc_gdp_ppp(country, regions, gdp_non_nuts3, ppp_non_nuts3): gdp = src_gdp.to_dataframe().reset_index() gdp = gdp.rename(columns={"GDP_per_capita_PPP": "gdp"}) gdp = gdp[gdp.time == gdp.time.max()] - gdp = gpd.GeoDataFrame( + gdp_raster = gpd.GeoDataFrame( gdp, geometry=gpd.points_from_xy(gdp.longitude, gdp.latitude), crs="EPSG:4326", ) - gdp = gpd.sjoin(gdp, regions, predicate="within") + gdp_mapped = gpd.sjoin(gdp_raster, regions, predicate="within") gdp = ( - gdp.groupby(["Bus", "country"]) - .agg({"gdp": "sum"}) + gdp_mapped.copy() + .groupby(["Bus", "country"]) + .agg({"gdp": "mean"}) .reset_index(level=["country"]) ) - # PPP - logger.info(f"Mapping PPP values to non-NUTS3 region: {regions.country.unique()}") - with rasterio.open(ppp_non_nuts3) as src_ppp: + # POP + logger.info( + f"Mapping summed population to non-NUTS3 region: {regions.country.unique()}" + ) + with rasterio.open(pop_non_nuts3) as src_pop: # Mask the raster with the bounding box - out_image, out_transform = mask(src_ppp, bounding_box, crop=True) + out_image, out_transform = mask(src_pop, bounding_box, crop=True) out_image, - out_meta = src_ppp.meta.copy() + out_meta = src_pop.meta.copy() out_meta.update( { "driver": "GTiff", @@ -93,63 +106,92 @@ def calc_gdp_ppp(country, 
regions, gdp_non_nuts3, ppp_non_nuts3): } ) masked_data = out_image[0] # Use the first band (rest is empty) - row_indices, col_indices = np.where(masked_data != src_ppp.nodata) + row_indices, col_indices = np.where(masked_data != src_pop.nodata) values = masked_data[row_indices, col_indices] # Affine transformation from pixel coordinates to geo coordinates x_coords, y_coords = rasterio.transform.xy(out_transform, row_indices, col_indices) - ppp = pd.DataFrame({"x": x_coords, "y": y_coords, "ppp": values}) - ppp = gpd.GeoDataFrame( - ppp, - geometry=gpd.points_from_xy(ppp.x, ppp.y), - crs=src_ppp.crs, + pop_raster = pd.DataFrame({"x": x_coords, "y": y_coords, "pop": values}) + pop_raster = gpd.GeoDataFrame( + pop_raster, + geometry=gpd.points_from_xy(pop_raster.x, pop_raster.y), + crs=src_pop.crs, ) - ppp = gpd.sjoin(ppp, regions, predicate="within") - ppp = ( - ppp.groupby(["Bus", "country"]) - .agg({"ppp": "sum"}) + pop_mapped = gpd.sjoin(pop_raster, regions, predicate="within") + pop = ( + pop_mapped.groupby(["Bus", "country"]) + .agg({"pop": "sum"}) .reset_index() .set_index("Bus") ) - gdp_ppp = regions.join(gdp.drop(columns="country"), on="Bus").join( - ppp.drop(columns="country"), on="Bus" + gdp_pop = regions.join(gdp.drop(columns="country"), on="Bus").join( + pop.drop(columns="country"), on="Bus" + ) + gdp_pop.fillna(0, inplace=True) + + # Plot for validation purposes + cmap = plt.get_cmap("viridis") + norm = colors.Normalize(vmin=0, vmax=gdp_mapped.gdp.max()) + crs = ccrs.AlbersEqualArea() + # two column plot + fig, axes = plt.subplots(1, 2, figsize=(10, 5), subplot_kw={"projection": crs}) + gpd.GeoDataFrame( + regions.join(gdp.drop(columns="country"), on="Bus"), + crs=src_gdp.attrs["projection"], + ).to_crs(crs.proj4_init).plot( + ax=axes[0], + column="gdp", + cmap=cmap, + norm=norm, + legend=True, + legend_kwds={ + "label": "Mean GDP (mapped to bus regions)", + "orientation": "horizontal", + }, + ) + gpd.GeoDataFrame( + 
regions.join(pop.drop(columns="country"), on="Bus"), crs=src_pop.crs + ).to_crs(crs.proj4_init).plot( + ax=axes[1], + column="pop", + cmap=cmap, + legend=True, + legend_kwds={ + "label": "Abs. population (mapped to bus regions)", + "orientation": "horizontal", + }, ) - gdp_ppp.fillna(0, inplace=True) + plt.show() - return gdp_ppp + return gdp_pop if __name__ == "__main__": if "snakemake" not in globals(): from _helpers import mock_snakemake - snakemake = mock_snakemake("build_gdp_ppp_non_nuts3") + snakemake = mock_snakemake("build_gdp_pop_non_nuts3") configure_logging(snakemake) set_scenario_config(snakemake) n = pypsa.Network(snakemake.input.base_network) - substation_lv_i = n.buses.index[n.buses["substation_lv"]] - regions = ( - gpd.read_file(snakemake.input.regions) - .set_index("name") - .reindex(substation_lv_i) - ) + regions = gpd.read_file(snakemake.input.regions) gdp_non_nuts3 = snakemake.input.gdp_non_nuts3 - ppp_non_nuts3 = snakemake.input.ppp_non_nuts3 + pop_non_nuts3 = snakemake.input.pop_non_nuts3 countries_non_nuts3 = pd.Index(("MD", "UA")) subset = set(countries_non_nuts3) & set(snakemake.params.countries) - gdp_ppp = pd.concat( + gdp_pop = pd.concat( [ - calc_gdp_ppp(country, regions, gdp_non_nuts3, ppp_non_nuts3) + calc_gdp_pop(country, regions, gdp_non_nuts3, pop_non_nuts3) for country in subset ], axis=0, ) logger.info( - f"Exporting GDP and PPP values for non-NUTS3 regions {snakemake.output}" + f"Exporting GDP and POP values for non-NUTS3 regions {snakemake.output}" ) - gdp_ppp.reset_index().to_file(snakemake.output, driver="GeoJSON") + gdp_pop.reset_index().to_file(snakemake.output, driver="GeoJSON") diff --git a/scripts/retrieve_gdp_uamd.py b/scripts/retrieve_gdp_uamd.py index 3da3be4f0..780f2ea65 100644 --- a/scripts/retrieve_gdp_uamd.py +++ b/scripts/retrieve_gdp_uamd.py @@ -24,8 +24,8 @@ dict_urls = dict( { - "gdp": "https://datadryad.org/stash/downloads/file_stream/241947", - "ppp": 
"https://github.com/ecohealthalliance/sars_cov_risk/releases/download/v2.0.1/ppp_2020_1km_Aggregated.tif", + "gdp_non_nuts3": "https://datadryad.org/stash/downloads/file_stream/241947", + "pop_non_nuts3": "https://github.com/ecohealthalliance/sars_cov_risk/releases/download/v2.0.1/ppp_2020_1km_Aggregated.tif", } ) From 5071c298785f79f91964740b0d4b68a26d538877 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 16 Jul 2024 22:45:05 +0200 Subject: [PATCH 074/100] Rename of build script. --- .../{build_gdp_ppp_non_nuts3.py => build_gdp_pop_non_nuts3.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{build_gdp_ppp_non_nuts3.py => build_gdp_pop_non_nuts3.py} (100%) diff --git a/scripts/build_gdp_ppp_non_nuts3.py b/scripts/build_gdp_pop_non_nuts3.py similarity index 100% rename from scripts/build_gdp_ppp_non_nuts3.py rename to scripts/build_gdp_pop_non_nuts3.py From d8941f22ab8930134e3a7faaf5bffad04933ccd8 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 16 Jul 2024 23:28:04 +0200 Subject: [PATCH 075/100] Bug fix: only distribute load to buses with substation. --- scripts/add_electricity.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/add_electricity.py b/scripts/add_electricity.py index 1bd139189..0f6ca7f6e 100755 --- a/scripts/add_electricity.py +++ b/scripts/add_electricity.py @@ -327,7 +327,8 @@ def upsample(cntry, group, gdp_pop_non_nuts3): # overwrite factor because nuts3 provides no data for UA+MD gdp_pop_non_nuts3 = gpd.read_file(gdp_pop_non_nuts3).set_index("Bus") gdp_pop_non_nuts3 = gdp_pop_non_nuts3.loc[ - gdp_pop_non_nuts3.country == cntry + (gdp_pop_non_nuts3.country == cntry) + & (gdp_pop_non_nuts3.index.isin(substation_lv_i)) ] factors = normed( 0.6 * normed(gdp_pop_non_nuts3["gdp"]) From f234c9c2dd17d3344b7302908ea96f41835b2e3d Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Wed, 17 Jul 2024 23:33:58 +0200 Subject: [PATCH 076/100] Updated zenodo sandbox repository. 
--- rules/retrieve.smk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rules/retrieve.smk b/rules/retrieve.smk index a939ecd9a..22c2b65a0 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -344,14 +344,14 @@ if config["enable"]["retrieve"] and ( rule retrieve_osm_prebuilt: input: - buses=storage("https://sandbox.zenodo.org/records/87679/files/buses.csv"), + buses=storage("https://sandbox.zenodo.org/records/89508/files/buses.csv"), converters=storage( - "https://sandbox.zenodo.org/records/87679/files/converters.csv" + "https://sandbox.zenodo.org/records/89508/files/converters.csv" ), - lines=storage("https://sandbox.zenodo.org/records/87679/files/lines.csv"), - links=storage("https://sandbox.zenodo.org/records/87679/files/links.csv"), + lines=storage("https://sandbox.zenodo.org/records/89508/files/lines.csv"), + links=storage("https://sandbox.zenodo.org/records/89508/files/links.csv"), transformers=storage( - "https://sandbox.zenodo.org/records/87679/files/transformers.csv" + "https://sandbox.zenodo.org/records/89508/files/transformers.csv" ), output: buses="data/osm/prebuilt/buses.csv", From f7d8992e816d43f92ebcd015b1f59fd20d1d4e88 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Thu, 18 Jul 2024 11:29:19 +0200 Subject: [PATCH 077/100] Updated config.default --- config/config.default.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config/config.default.yaml b/config/config.default.yaml index e2b5f3ee5..a0932761a 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -907,11 +907,11 @@ plotting: eu_node_location: x: -5.5 y: 46. - # costs_max: 1000 - # costs_threshold: 1 - # energy_max: 20000 - # energy_min: -20000 - # energy_threshold: 50. + costs_max: 1000 + costs_threshold: 1 + energy_max: 20000 + energy_min: -20000 + energy_threshold: 50. 
nice_names: OCGT: "Open-Cycle Gas" From 4c9a055e574bc90762b0a59feeffe1d24bb55f97 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 23 Jul 2024 10:07:10 +0200 Subject: [PATCH 078/100] Cleaned config.default.yaml: Related settings grouped together and redundant voltage settings aggregated. --- scripts/base_network.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/scripts/base_network.py b/scripts/base_network.py index 7e229591e..291777996 100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -135,7 +135,7 @@ def _find_closest_links(links, new_links, distance_upper_bound=1.5): ) -def _load_buses_from_eg(eg_buses, europe_shape, config_elec): +def _load_buses_from_eg(eg_buses, europe_shape, config): buses = ( pd.read_csv( eg_buses, @@ -161,8 +161,8 @@ def _load_buses_from_eg(eg_buses, europe_shape, config_elec): lambda p: europe_shape_prepped.contains(Point(p)), axis=1 ) - v_nom_min = min(config_elec["voltages"]) - v_nom_max = max(config_elec["voltages"]) + v_nom_min = min(config["lines"]["types"].keys()) + v_nom_max = max(config["lines"]["types"].keys()) # Quick fix: buses_with_v_nom_to_keep_b = (v_nom_min <= buses.v_nom) & ( @@ -445,7 +445,7 @@ def _reconnect_crimea(lines): def _set_electrical_parameters_lines_eg(lines, config): - v_noms = config["electricity"]["voltages"] + v_noms = list(config["lines"]["types"].keys()) linetypes = config["lines"]["types"] for v_nom in v_noms: @@ -456,12 +456,13 @@ def _set_electrical_parameters_lines_eg(lines, config): return lines -def _set_electrical_parameters_lines_osm(lines_config, voltages, lines): +def _set_electrical_parameters_lines_osm(lines, config): if lines.empty: lines["type"] = [] return lines - linetypes = _get_linetypes_config(lines_config["types"], voltages) + v_noms = list(config["lines"]["types"].keys()) + linetypes = _get_linetypes_config(config["lines"]["types"], v_noms) lines["carrier"] = "AC" lines["dc"] = False @@ -470,7 +471,7 @@ def 
_set_electrical_parameters_lines_osm(lines_config, voltages, lines): lambda x: _get_linetype_by_voltage(x, linetypes) ) - lines["s_max_pu"] = lines_config["s_max_pu"] + lines["s_max_pu"] = config["lines"]["s_max_pu"] return lines @@ -817,7 +818,7 @@ def base_network( config, ): - buses = _load_buses_from_eg(eg_buses, europe_shape, config["electricity"]) + buses = _load_buses_from_eg(eg_buses, europe_shape, config) if config["electricity_network"].get("base_network") == "gridkit": links = _load_links_from_eg(buses, eg_links) @@ -851,9 +852,7 @@ def base_network( lines = _set_electrical_parameters_lines_eg(lines, config) links = _set_electrical_parameters_links_eg(links, config, links_p_nom) elif "osm" in config["electricity_network"].get("base_network"): - lines = _set_electrical_parameters_lines_osm( - config["lines"], config["electricity"]["voltages"], lines - ) + lines = _set_electrical_parameters_lines_osm(lines, config) links = _set_electrical_parameters_links_osm(links, config) else: raise ValueError("base_network must be either 'gridkit' or 'osm'") From 306883f1dd2559a778c3d2553baed58d83c9d7ae Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 23 Jul 2024 10:31:04 +0200 Subject: [PATCH 079/100] Cleaned config.default.yaml: Related settings grouped together and redundant voltage settings aggregated. Added release notes. 
--- config/config.default.yaml | 72 ++++++++++++++++++-------------------- doc/release_notes.rst | 4 +++ 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/config/config.default.yaml b/config/config.default.yaml index d13541bad..87d8e4f09 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -75,7 +75,6 @@ enable: custom_busmap: false drop_leap_day: true - # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget co2_budget: 2020: 0.701 @@ -90,9 +89,43 @@ electricity_network: base_network: gridkit # Options: gridkit, osm-prebuilt, osm-raw (built from scratch using OSM data, takes longer) osm_group_tolerance_buses: 5000 # unit: meters, default 5000 - Buses within this distance are grouped together +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines +lines: + types: # Specify voltages (keys) and line types (values) for lines + 220.: "Al/St 240/40 2-bundle 220.0" + 300.: "Al/St 240/40 3-bundle 300.0" + 380.: "Al/St 240/40 4-bundle 380.0" + 500.: "Al/St 240/40 4-bundle 380.0" + 750.: "Al/St 560/50 4-bundle 750.0" + s_max_pu: 0.7 + s_nom_max: .inf + max_extension: 20000 #MW + length_factor: 1.25 # Note that 'osm-raw' and 'osm-prebuilt' already contain higher spatial resolution line routes and lengths + reconnect_crimea: true # Only needed for 'gridkit' base_network, in OSM, the lines are already connected + under_construction: 'keep' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity + dynamic_line_rating: + activate: false + cutout: europe-2013-era5 + correction_factor: 0.95 + max_voltage_difference: false + max_line_rating: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#links +links: + p_max_pu: 1.0 + p_nom_max: .inf + max_extension: 30000 #MW + include_tyndp: true + under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity + +# docs in 
https://pypsa-eur.readthedocs.io/en/latest/configuration.html#transformers +transformers: + x: 0.1 + s_nom: 2000. + type: '' + # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity electricity: - voltages: [220., 300., 380., 500., 750.] gaslimit_enable: false gaslimit: false co2limit_enable: false @@ -283,41 +316,6 @@ conventional: nuclear: p_max_pu: "data/nuclear_p_max_pu.csv" # float of file name -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines -lines: - types: - 220.: "Al/St 240/40 2-bundle 220.0" - 300.: "Al/St 240/40 3-bundle 300.0" - 380.: "Al/St 240/40 4-bundle 380.0" - 500.: "Al/St 240/40 4-bundle 380.0" - 750.: "Al/St 560/50 4-bundle 750.0" - s_max_pu: 0.7 - s_nom_max: .inf - max_extension: 20000 #MW - length_factor: 1.25 - reconnect_crimea: true - under_construction: 'keep' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity - dynamic_line_rating: - activate: false - cutout: europe-2013-era5 - correction_factor: 0.95 - max_voltage_difference: false - max_line_rating: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#links -links: - p_max_pu: 1.0 - p_nom_max: .inf - max_extension: 30000 #MW - include_tyndp: true - under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#transformers -transformers: - x: 0.1 - s_nom: 2000. - type: '' - # docs-load in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#load load: interpolate_limit: 3 diff --git a/doc/release_notes.rst b/doc/release_notes.rst index eb29ce4b1..640ec5d6d 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -41,6 +41,10 @@ Upcoming Release * Enable parallelism in :mod:`determine_availability_matrix_MD_UA.py` and remove plots. This requires the use of temporary files. 
+* Added new feature to base the electricity network on OpenStreetMap (OSM) data (PR https://github.com/PyPSA/pypsa-eur/pull/1079). Note that a heuristics-based cleaning process is used for lines and links where electrical parameters are incomplete, missing, or ambiguous.
--- rules/retrieve.smk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 999100c1c..3b563d783 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -346,14 +346,14 @@ if config["enable"]["retrieve"] and ( rule retrieve_osm_prebuilt: input: - buses=storage("https://sandbox.zenodo.org/records/89508/files/buses.csv"), + buses=storage("https://zenodo.org/records/12799202/files/buses.csv"), converters=storage( - "https://sandbox.zenodo.org/records/89508/files/converters.csv" + "https://zenodo.org/records/12799202/files/converters.csv" ), - lines=storage("https://sandbox.zenodo.org/records/89508/files/lines.csv"), - links=storage("https://sandbox.zenodo.org/records/89508/files/links.csv"), + lines=storage("https://zenodo.org/records/12799202/files/lines.csv"), + links=storage("https://zenodo.org/records/12799202/files/links.csv"), transformers=storage( - "https://sandbox.zenodo.org/records/89508/files/transformers.csv" + "https://zenodo.org/records/12799202/files/transformers.csv" ), output: buses="data/osm/prebuilt/buses.csv", From f6717925412fed1679846ab68eae30cca627cd0e Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 30 Jul 2024 11:14:07 +0200 Subject: [PATCH 081/100] Updated configtables --- config/config_backup.yaml | 262 +++++++++++++---------- doc/configtables/electricity_network.csv | 3 + 2 files changed, 150 insertions(+), 115 deletions(-) create mode 100644 doc/configtables/electricity_network.csv diff --git a/config/config_backup.yaml b/config/config_backup.yaml index 2bcaf173c..9ebeea351 100644 --- a/config/config_backup.yaml +++ b/config/config_backup.yaml @@ -3,7 +3,7 @@ # SPDX-License-Identifier: CC0-1.0 # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#top-level-configuration -version: 0.10.0 +version: 0.11.0 tutorial: false logging: @@ -15,14 +15,13 @@ private: entsoe_api: remote: - ssh: "z1" - path: "~/scratch/projects/pypsa-eur" + ssh: z1 + path: 
~/scratch/projects/pypsa-eur # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#run run: prefix: "" - # name: "test-europe1-gridkit" - name: "test-begb-gridkit" + name: "europe-ua-md-gridkit-custom" scenarios: enable: false file: config/scenarios.yaml @@ -41,60 +40,40 @@ scenario: simpl: - '' ll: - - v1.0 # TODO mit und ohne Netzausbau v1.0 + - v1.0 clusters: - - 40 - # - 128 - # - 256 - # - 512 - # # - 1024 + - 320 opts: - - 'Co2L0-169H' + - '' sector_opts: - '' planning_horizons: # - 2020 - # - 2030 + - 2030 # - 2040 - - 2050 + # - 2050 # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#countries -countries: ["BE", "GB"] -# countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK'] -# countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MD', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK', 'UA'] - -# Settings related to the high-voltage electricity grid -electricity_network: - base_network: "gridkit" # "osm" or "gridkit" - build_osm_network: true # If 'true', the network will be built from scratch (retrieving OSM data, cleaning, and building) and stored under resources, 'false' will use snapshots in data/osm - -build_osm_network: # Options of the build_osm_network script; osm = OpenStreetMap - group_tolerance_buses: 5000 # [m] (default 5000) Tolerance in meters of the close buses to merge - split_overpassing_lines: false # When True, lines overpassing buses are splitted and connected to the bueses - overpassing_lines_tolerance: 1 # [m] (default 1) Tolerance to identify lines overpassing buses - force_ac: false # When true, it forces all components (lines and substation) to be AC-only. To be used if DC assets create problem. 
+countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK', 'UA', 'MD'] # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#snapshots snapshots: - start: "2013-01-01" - end: "2014-01-01" + start: '2013-01-01' + end: '2014-01-01' inclusive: 'left' # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#enable enable: - retrieve: true + retrieve: auto prepare_links_p_nom: false retrieve_databundle: true - retrieve_sector_databundle: true retrieve_cost_data: true build_cutout: false - retrieve_irena: false retrieve_cutout: true - build_natura_raster: false - retrieve_natura_raster: true - custom_busmap: false + custom_busmap: true drop_leap_day: true + # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget co2_budget: 2020: 0.701 @@ -105,6 +84,10 @@ co2_budget: 2045: 0.032 2050: 0.000 +electricity_network: + base_network: gridkit # Options: gridkit, osm-prebuilt, osm-raw (built from scratch using OSM data, takes longer) + osm_group_tolerance_buses: 5000 # unit: meters, default 5000 - Buses within this distance are grouped together + # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity electricity: voltages: [200., 220., 300., 380., 400., 500., 750.] 
@@ -113,7 +96,6 @@ electricity: co2limit_enable: false co2limit: 7.75e+7 co2base: 1.487e+9 - agg_p_nom_limits: data/agg_p_nom_minmax.csv operational_reserve: activate: false @@ -126,17 +108,18 @@ electricity: H2: 168 extendable_carriers: - Generator: [solar, onwind, offwind-ac, offwind-dc, OCGT] + Generator: [solar, solar-hsat, onwind, offwind-ac, offwind-dc, offwind-float, OCGT, CCGT] StorageUnit: [] # battery, H2 Store: [battery, H2] Link: [] # H2 pipeline powerplants_filter: (DateOut >= 2023 or DateOut != DateOut) and not (Country == 'Germany' and Fueltype == 'Nuclear') custom_powerplants: false - everywhere_powerplants: [nuclear, oil, OCGT, CCGT, coal, lignite, geothermal, biomass] + everywhere_powerplants: [] conventional_carriers: [nuclear, oil, OCGT, CCGT, coal, lignite, geothermal, biomass] - renewable_carriers: [solar, onwind, offwind-ac, offwind-dc, hydro] # hydro removed + renewable_carriers: [solar, onwind, offwind-ac, offwind-dc, hydro] + # renewable_carriers: [solar, solar-hsat, onwind, offwind-ac, offwind-dc, offwind-float, hydro] estimate_renewable_capacities: enable: true @@ -144,7 +127,7 @@ electricity: year: 2020 expansion_limit: false technology_mapping: - Offshore: [offwind-ac, offwind-dc] + Offshore: [offwind-ac, offwind-dc, offwind-float] Onshore: [onwind] PV: [solar] @@ -212,7 +195,7 @@ renewable: luisa: false # [0, 5230] natura: true ship_threshold: 400 - max_depth: 50 + max_depth: 60 max_shore_distance: 30000 excluder_resolution: 200 clip_p_max_pu: 1.e-2 @@ -228,10 +211,28 @@ renewable: luisa: false # [0, 5230] natura: true ship_threshold: 400 - max_depth: 50 + max_depth: 60 min_shore_distance: 30000 excluder_resolution: 200 clip_p_max_pu: 1.e-2 + offwind-float: + cutout: europe-2013-era5 + resource: + method: wind + turbine: NREL_ReferenceTurbine_5MW_offshore + # ScholzPhd Tab 4.3.1: 10MW/km^2 + capacity_per_sqkm: 2 + correction_factor: 0.8855 + # proxy for wake losses + # from 10.1016/j.energy.2018.08.153 + # until done more rigorously 
in #153 + corine: [44, 255] + natura: true + ship_threshold: 400 + excluder_resolution: 200 + min_depth: 60 + max_depth: 1000 + clip_p_max_pu: 1.e-2 solar: cutout: europe-2013-sarah resource: @@ -247,6 +248,21 @@ renewable: natura: true excluder_resolution: 100 clip_p_max_pu: 1.e-2 + solar-hsat: + cutout: europe-2013-sarah + resource: + method: pv + panel: CSi + orientation: + slope: 35. + azimuth: 180. + tracking: horizontal + capacity_per_sqkm: 4.43 # 15% higher land usage acc. to NREL + corine: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 26, 31, 32] + luisa: false # [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242, 1310, 1320, 1330, 1410, 1421, 1422, 2110, 2120, 2130, 2210, 2220, 2230, 2310, 2410, 2420, 3210, 3320, 3330] + natura: true + excluder_resolution: 100 + clip_p_max_pu: 1.e-2 hydro: cutout: europe-2013-era5 carriers: [ror, PHS, hydro] @@ -269,27 +285,19 @@ conventional: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines lines: types: - 200.: "Al/St 240/40 2-bundle 220.0" - 220.: "Al/St 240/40 2-bundle 220.0" - 300.: "Al/St 240/40 3-bundle 300.0" - 380.: "Al/St 240/40 4-bundle 380.0" - 400.: "Al/St 240/40 4-bundle 380.0" - 500.: "Al/St 240/40 4-bundle 380.0" - 750.: "Al/St 560/50 4-bundle 750.0" - dc_types: # setting only for osm - 200.: "HVDC XLPE 1000" - 220.: "HVDC XLPE 1000" - 300.: "HVDC XLPE 1000" - 750.: "HVDC XLPE 1000" - 380.: "HVDC XLPE 1000" - 400.: "HVDC XLPE 1000" - 500.: "HVDC XLPE 1000" + 200.0: Al/St 240/40 2-bundle 220.0 + 220.0: Al/St 240/40 2-bundle 220.0 + 300.0: Al/St 240/40 3-bundle 300.0 + 380.0: Al/St 240/40 4-bundle 380.0 + 400.0: Al/St 240/40 4-bundle 380.0 + 500.0: Al/St 240/40 4-bundle 380.0 + 750.0: Al/St 560/50 4-bundle 750.0 s_max_pu: 0.7 s_nom_max: .inf max_extension: 20000 #MW length_factor: 1.25 reconnect_crimea: true - under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity + under_construction: 'keep' 
# 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity dynamic_line_rating: activate: false cutout: europe-2013-era5 @@ -302,7 +310,7 @@ links: p_max_pu: 1.0 p_nom_max: .inf max_extension: 30000 #MW - include_tyndp: false + include_tyndp: true under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#transformers @@ -335,6 +343,8 @@ pypsa_eur: - onwind - offwind-ac - offwind-dc + - offwind-float + - solar-hsat - solar - ror - nuclear @@ -385,8 +395,8 @@ solar_thermal: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#existing-capacities existing_capacities: - grouping_years_power: [1895, 1920, 1950, 1955, 1960, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2025, 2030] - grouping_years_heat: [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020] # heat grouping years >= baseyear will be ignored + grouping_years_power: [1920, 1950, 1955, 1960, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2025] + grouping_years_heat: [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2019] # heat grouping years >= baseyear will be ignored threshold_capacity: 10 default_heating_lifetime: 20 conventional_carriers: @@ -427,7 +437,6 @@ sector: bev_availability: 0.5 bev_energy: 0.05 bev_charge_efficiency: 0.9 - bev_plug_to_wheel_efficiency: 0.2 bev_charge_rate: 0.011 bev_avail_max: 0.95 bev_avail_mean: 0.8 @@ -456,8 +465,9 @@ sector: 2040: 0.3 2045: 0.15 2050: 0 - transport_fuel_cell_efficiency: 0.5 - transport_internal_combustion_efficiency: 0.3 + transport_electric_efficiency: 53.19 # 1 MWh_el = 53.19*100 km + transport_fuel_cell_efficiency: 30.003 # 1 MWh_H2 = 30.003*100 km + transport_ice_efficiency: 16.0712 # 1 MWh_oil = 16.0712 * 100 km agriculture_machinery_electric_share: 0 agriculture_machinery_oil_share: 1 agriculture_machinery_fuel_efficiency: 0.7 @@ -563,15 +573,15 @@ 
sector: - nearshore # within 50 km of sea # - offshore ammonia: false - min_part_load_fischer_tropsch: 0.7 + min_part_load_fischer_tropsch: 0.5 min_part_load_methanolisation: 0.3 min_part_load_methanation: 0.3 - use_fischer_tropsch_waste_heat: true - use_haber_bosch_waste_heat: true - use_methanolisation_waste_heat: true - use_methanation_waste_heat: true - use_fuel_cell_waste_heat: true - use_electrolysis_waste_heat: true + use_fischer_tropsch_waste_heat: 0.25 + use_haber_bosch_waste_heat: 0.25 + use_methanolisation_waste_heat: 0.25 + use_methanation_waste_heat: 0.25 + use_fuel_cell_waste_heat: 0.25 + use_electrolysis_waste_heat: 0.25 electricity_transmission_grid: true electricity_distribution_grid: true electricity_distribution_grid_cost_factor: 1.0 @@ -586,6 +596,8 @@ sector: gas pipeline: efficiency_per_1000km: 1 #0.977 compression_per_1000km: 0.01 + electricity distribution grid: + efficiency_static: 0.97 H2_network: true gas_network: false H2_retrofit: false @@ -614,6 +626,13 @@ sector: solar: 3 offwind-ac: 3 offwind-dc: 3 + enhanced_geothermal: + enable: false + flexible: true + max_hours: 240 + max_boost: 0.25 + var_cf: true + sustainability_factor: 0.0025 # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#industry industry: @@ -679,6 +698,9 @@ industry: 2040: 0.12 2045: 0.16 2050: 0.20 + HVC_environment_sequestration_fraction: 0. 
+ waste_to_energy: false + waste_to_energy_cc: false sector_ratios_fraction_future: 2020: 0.0 2025: 0.1 @@ -697,6 +719,7 @@ industry: methanol_production_today: 1.5 MWh_elec_per_tMeOH: 0.167 MWh_CH4_per_tMeOH: 10.25 + MWh_MeOH_per_tMeOH: 5.528 hotmaps_locate_missing: false reference_year: 2015 @@ -704,8 +727,7 @@ industry: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#costs costs: year: 2030 - version: v0.8.1 - rooftop_share: 0.14 # based on the potentials, assuming (0.1 kW/m2 and 10 m2/person) + version: v0.9.0 social_discountrate: 0.02 fill_values: FOM: 0 @@ -730,8 +752,8 @@ costs: battery: 0. battery inverter: 0. emission_prices: - enable: false - co2: 0. + enable: true + co2: 100. co2_monthly_prices: false # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#clustering @@ -755,8 +777,8 @@ clustering: ramp_limit_up: max ramp_limit_down: max temporal: - resolution_elec: 169H - resolution_sector: 169H + resolution_elec: 25H + resolution_sector: 25H # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#adjustments adjustments: @@ -777,11 +799,28 @@ solving: # io_api: "direct" # Increases performance but only supported for the highs and gurobi solvers # options that go into the optimize function track_iterations: false - min_iterations: 4 - max_iterations: 6 + min_iterations: 2 + max_iterations: 3 transmission_losses: 2 linearized_unit_commitment: true horizon: 365 + post_discretization: + enable: false + line_unit_size: 1700 + line_threshold: 0.3 + link_unit_size: + DC: 2000 + H2 pipeline: 1200 + gas pipeline: 1500 + link_threshold: + DC: 0.3 + H2 pipeline: 0.3 + gas pipeline: 0.3 + + agg_p_nom_limits: + agg_offwind: false + include_existing: false + file: data/agg_p_nom_minmax.csv constraints: CCL: false @@ -795,7 +834,7 @@ solving: solver_options: highs-default: - # refer to https://ergo-code.github.io/HiGHS/options/definitions.html#solver + # refer to 
https://ergo-code.github.io/HiGHS/dev/options/definitions/ threads: 4 solver: "ipm" run_crossover: "off" @@ -848,23 +887,17 @@ solving: cbc-default: {} # Used in CI glpk-default: {} # Used in CI - mem_mb: 100000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 - runtime: 12h #runtime in humanfriendly style https://humanfriendly.readthedocs.io/en/latest/ + mem_mb: 30000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 + runtime: 6h #runtime in humanfriendly style https://humanfriendly.readthedocs.io/en/latest/ # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#plotting - plotting: map: - boundaries: - eu_node_location: - x: -5.5 - y: 46. - # costs_max: 1000 - # costs_threshold: 0.0000001 - # energy_max: - # energy_min: - # energy_threshold: 0.000001 + boundaries: [-11, 30, 34, 71] + color_geomap: + ocean: white + land: white projection: name: "EqualEarth" # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: @@ -872,34 +905,21 @@ plotting: # central_longitude: 10. # central_latitude: 50. # standard_parallels: [35, 65] - -# plotting: -# map: -# boundaries: [-11, 30, 34, 71] -# color_geomap: -# ocean: white -# land: white -# projection: -# name: "EqualEarth" -# # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: -# # name: "LambertConformal" -# # central_longitude: 10. -# # central_latitude: 50. -# # standard_parallels: [35, 65] -# eu_node_location: -# x: -5.5 -# y: 46. -# costs_max: 1000 -# costs_threshold: 1 -# energy_max: 20000 -# energy_min: -20000 -# energy_threshold: 50. + eu_node_location: + x: -5.5 + y: 46. + costs_max: 1000 + costs_threshold: 1 + energy_max: 20000 + energy_min: -20000 + energy_threshold: 50. 
nice_names: OCGT: "Open-Cycle Gas" CCGT: "Combined-Cycle Gas" offwind-ac: "Offshore Wind (AC)" offwind-dc: "Offshore Wind (DC)" + offwind-float: "Offshore Wind (Floating)" onwind: "Onshore Wind" solar: "Solar" PHS: "Pumped Hydro Storage" @@ -924,6 +944,9 @@ plotting: offwind-dc: "#74c6f2" offshore wind (DC): "#74c6f2" offshore wind dc: "#74c6f2" + offwind-float: "#b5e2fa" + offshore wind (Float): "#b5e2fa" + offshore wind float: "#b5e2fa" # water hydro: '#298c81' hydro reservoir: '#298c81' @@ -935,6 +958,7 @@ plotting: # solar solar: "#f9d002" solar PV: "#f9d002" + solar-hsat: "#fdb915" solar thermal: '#ffbf2b' residential rural solar thermal: '#f1c069' services rural solar thermal: '#eabf61' @@ -1036,6 +1060,7 @@ plotting: BEV charger: '#baf238' V2G: '#e5ffa8' land transport EV: '#baf238' + land transport demand: '#38baf2' Li ion: '#baf238' # hot water storage water tanks: '#e69487' @@ -1140,6 +1165,7 @@ plotting: methanolisation: '#83d6d5' methanol: '#468c8b' shipping methanol: '#468c8b' + industry methanol: '#468c8b' # co2 CC: '#f29dae' CCS: '#f29dae' @@ -1170,6 +1196,9 @@ plotting: waste: '#e3d37d' other: '#000000' geothermal: '#ba91b1' + geothermal heat: '#ba91b1' + geothermal district heat: '#d19D00' + geothermal organic rankine cycle: '#ffbf00' AC: "#70af1d" AC-AC: "#70af1d" AC line: "#70af1d" @@ -1179,3 +1208,6 @@ plotting: DC-DC: "#8a1caf" DC link: "#8a1caf" load: "#dd2e23" + waste CHP: '#e3d37d' + waste CHP CC: '#e3d3ff' + HVC to air: 'k' diff --git a/doc/configtables/electricity_network.csv b/doc/configtables/electricity_network.csv new file mode 100644 index 000000000..f7a51ef1f --- /dev/null +++ b/doc/configtables/electricity_network.csv @@ -0,0 +1,3 @@ +,Unit,Values,Description +base_network, --, "Any value in {'gridkit', 'osm-prebuilt', 'osm-raw}", "Specify the underlying base network, i.e. GridKit (based on ENTSO-E web map extract, OpenStreetMap (OSM) prebuilt or raw (built from raw OSM data), takes longer." 
+osm_group_tolerance_buses, meters, float, "Specifies the radius in which substations shall be clustered to a single bus. Default recommendation: 5000 (meters)" From 008df4bcc8406db3a0f3ed0224b77ee6acb68ef7 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Fri, 9 Aug 2024 16:42:04 +0200 Subject: [PATCH 082/100] Updated links.csv: Under_construction lines to in commission. --- data/entsoegridkit/links.csv | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/data/entsoegridkit/links.csv b/data/entsoegridkit/links.csv index 00a488ddf..4a94c32bb 100644 --- a/data/entsoegridkit/links.csv +++ b/data/entsoegridkit/links.csv @@ -6,7 +6,7 @@ link_id,bus0,bus1,length,underground,under_construction,tags,geometry 5587,1377,2382,76847.0139826037,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32533", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"200", "symbol"=>"DC-Line", "country"=>"IT", "t9_code"=>"0", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"None", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(8.67675371049727 40.6777653795244,9.03900099999999 40.979898,9.22164899999999 41.133159,9.19977299501706 41.2082924934473)' 5640,1422,1638,234733.218840324,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32590", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"RĂłmulo", "symbol"=>"DC-Line", "country"=>"ES", "t9_code"=>"0", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"None", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(2.48932993486183 39.561252379133,1.13159199999999 39.610978,0 39.710356,-0.234388957535875 39.7314420592468)' 13589,2262,7428,316517.539537871,f,f,,'LINESTRING(9.17009350125146 41.2967653544603,9.38095099999999 41.331451,9.858856 41.352072,10.70755 41.479776,11.25 41.448903,12.100067 
41.432431,12.380219 41.426253,12.418671 41.401536,12.704315 41.347948,12.805939 41.368564,12.9016442293009 41.3921592955445)' -14802,2258,7029,391819.608605717,f,t,,'LINESTRING(14.0986517070226 42.4133438660838,14.412689 42.431566,15.115814 42.363618,16.269379 42.067646,16.875 42.126747,16.962891 42.135913,18.531189 42.271212,18.7271798293119 42.3522936900005)' +14802,2258,7029,391819.608605717,f,f,,'LINESTRING(14.0986517070226 42.4133438660838,14.412689 42.431566,15.115814 42.363618,16.269379 42.067646,16.875 42.126747,16.962891 42.135913,18.531189 42.271212,18.7271798293119 42.3522936900005)' 14668,2333,3671,146536.932669904,f,t,,'LINESTRING(6.04271995139229 45.4637174756646,6.16607700000001 45.327048,6.351471 45.183973,6.54922499999999 45.148148,6.62338299999999 45.101638,6.642609 45.089036,6.70440700000001 45.05121,6.980438 45.089036,7.00653099999999 45.092914,7.21939099999999 45.094853,7.223511 45.089036,7.378693 44.871443,7.32136143270145 44.8385424366672)' 14808,2379,2383,103628.671904731,f,f,,'LINESTRING(9.37725891362686 42.7057449479108,9.79980499999999 42.799431,10.5931379465185 42.9693952059839)' 5575,2379,2380,24868.4258834249,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32521", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>" ", "symbol"=>"DC-Line", "country"=>"FR", "t9_code"=>"0", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"None", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(9.37679000208623 42.7053229039427,9.357605 42.552069,9.45054814341409 42.5389781005166)' @@ -15,7 +15,7 @@ link_id,bus0,bus1,length,underground,under_construction,tags,geometry 5583,2382,7428,11623.019620339,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32529", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>" ", "symbol"=>"DC-Line", "country"=>"IT", "t9_code"=>"FR-IT-01", "visible"=>"1", "EIC_code"=>"None", 
"tie_line"=>"1", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"1.555323123e+12", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(9.17008474107272 41.2967639130447,9.168091 41.303603,9.18319700000001 41.250968,9.1995514318356 41.2089447559651)' 14825,2476,2585,45367.7245799963,f,f,,'LINESTRING(2.98259070757654 42.2776059846425,2.90313700000001 42.397094,2.867432 42.467032,2.77404800000001 42.655172)' 8745,3611,8302,9361.61122972312,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"120591", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"None", "symbol"=>"DC-Line", "country"=>"CH", "t9_code"=>"None", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"1", "CreatedDate"=>"1.556535027e+12", "DeletedDate"=>"None", "ModifiedDate"=>"None", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(7.95410166666667 47.5542867377085,7.928009 47.555214,7.937622 47.526475,7.96895162362761 47.4961125343931)' -14801,4709,4781,50206.4589537583,f,t,,'LINESTRING(6.43068069229957 50.8136946409214,6.020508 50.766865,5.925751 50.755572,5.73118285928413 50.7304278585398)' +14801,4709,4781,50206.4589537583,f,f,,'LINESTRING(6.43068069229957 50.8136946409214,6.020508 50.766865,5.925751 50.755572,5.73118285928413 50.7304278585398)' 14814,4972,5062,232745.802729813,f,f,,'LINESTRING(4.04528166772434 51.9611233898246,2.41561900000001 51.702353,0.794192405058928 51.4189824547604)' 5558,4975,7427,45665.1050240866,f,t,'"MW"=>"None", "TSO"=>"None", "oid"=>"32502", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>" ", "symbol"=>"DC-Line", "country"=>"UK", "t9_code"=>" BE-UK-01", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"1", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"1.555407949e+12", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(1.92947399999999 51.251601,1.27623412238205 51.2327009391635)' 
14826,4977,4983,52725.5506558225,f,f,,'LINESTRING(1.75051314494826 50.9186901861196,1.43508900000001 50.970535,1.02353536683349 51.0370060560335)' @@ -33,16 +33,16 @@ link_id,bus0,bus1,length,underground,under_construction,tags,geometry 5571,5743,7074,89346.6337548304,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32517", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"HelWin1", "symbol"=>"DC-Line", "country"=>"DE", "t9_code"=>"0", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"1.545224101e+12", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(8.12610708224912 54.310749538123,8.238373 54.256401,9.32699442549698 53.9319562532009)' 5567,5744,5787,139209.866527364,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32512", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"DolWin1", "symbol"=>"DC-Line", "country"=>"DE", "t9_code"=>"0", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"1.545224147e+12", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(6.84493115764205 53.880869,6.909027 53.880869,7.116394 53.835512,7.36358600000001 53.396432,7.32101399999999 53.112163,7.33612100000001 52.893992,7.16075117704058 52.8485079587114)' 5570,5745,8272,99066.5793764307,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32515", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"DolWin3", "symbol"=>"DC-Line", "country"=>"DE", "t9_code"=>"0", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"1.545224133e+12", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(6.84423599483409 53.8134043878533,6.71127300000001 53.693454,6.65634200000001 53.59821,6.73461900000001 53.55581,7.112274 53.45126,7.05596900000001 
53.340713,7.237244 53.26932,7.223511 53.18135,7.223511 53.1805270078955)' -14803,5751,5803,280301.445474794,f,t,,'LINESTRING(6.75668661933496 53.437616158174,6.838989 53.664171,6.96258499999999 53.785238,7.34298700000001 53.882488,7.80029300000001 54.517096,8.20678699999999 55.297102,8.86005375885099 55.4336013425692)' +14803,5751,5803,280301.445474794,f,f,,'LINESTRING(6.75668661933496 53.437616158174,6.838989 53.664171,6.96258499999999 53.785238,7.34298700000001 53.882488,7.80029300000001 54.517096,8.20678699999999 55.297102,8.86005375885099 55.4336013425692)' 14821,5749,6363,575352.425009444,f,f,,'LINESTRING(6.83036734046461 53.4374933986115,6.253967 53.645452,6.33636499999999 55.776573,6.34597800000001 56.029855,6.34597800000001 56.030622,6.43661500000001 58.130121,6.90176957000565 58.2653404287817)' 5568,5768,5787,131420.09609615,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32513", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"DolWin2", "symbol"=>"DC-Line", "country"=>"DE", "t9_code"=>"0", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"1.545224159e+12", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(7.11083415172816 53.9630966319811,7.07107499999999 53.80795,7.301788 53.39807,7.267456 53.110514,7.29354899999999 52.907246,7.16070024970726 52.8485606886388)' 12932,5770,5773,6905.52230262641,f,t,,'LINESTRING(7.15460523215685 53.4027398808691,7.24823000000001 53.375956)' -14848,5858,6358,574884.998052791,f,t,,'LINESTRING(6.81690675921544 58.6338502746805,6.63024900000001 58.249559,6.78268399999999 57.579197,7.17544599999999 56.532986,7.17407200000001 56.5345,7.46521000000001 55.776573,7.46521000000001 55.776573,7.64099100000001 55.312736,8.458099 54.316523,9.394684 53.934262)' +14848,5858,6358,574884.998052791,f,f,,'LINESTRING(6.81690675921544 58.6338502746805,6.63024900000001 58.249559,6.78268399999999 57.579197,7.17544599999999 
56.532986,7.17407200000001 56.5345,7.46521000000001 55.776573,7.46521000000001 55.776573,7.64099100000001 55.312736,8.458099 54.316523,9.394684 53.934262)' 5581,5893,6072,59184.4227659405,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32527", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>" ", "symbol"=>"DC-Line", "country"=>"UK", "t9_code"=>"222.1.2", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"1", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"None", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(-4.94702447012386 55.0727948492206,-5.137482 55.042188,-5.62500000000001 54.890036,-5.631866 54.887667,-5.7332134509551 54.813550429852)' 5580,5893,6072,58741.4601812995,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32526", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>" ", "symbol"=>"DC-Line", "country"=>"UK", "t9_code"=>"222.1.1", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"1", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"None", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(-4.94689333475508 55.0726735779237,-5.045471 55.009914,-5.59616099999999 54.840245,-5.62500000000001 54.834709,-5.73306677066227 54.8134313531551)' 8009,5897,5936,363085.503577327,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"70191", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"Western HVDC link", "symbol"=>"DC-Line", "country"=>"UK", "t9_code"=>"None", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"1.514994622e+12", "DeletedDate"=>"None", "ModifiedDate"=>"1.51499467e+12", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(-3.18595885129092 53.213699479605,-3.158569 53.308724,-3.40988200000001 53.511735,-4.081421 53.803084,-5.158081 54.013418,-5.28442399999999 54.866334,-5.177307 55.345546,-4.88616899999999 55.586883,-4.8806877889882 
55.7044245716822)' 14815,5937,6086,242400.41935291,f,f,,'LINESTRING(-3.12293971810515 53.2087645354697,-3.13934300000001 53.266034,-3.368683 53.377594,-5.18280000000001 53.495399,-5.62500000000001 53.519084,-5.62500000000001 53.519084,-6.101532 53.503568,-6.61057668606004 53.483977180569)' -14804,5949,6684,695432.776022422,f,t,,'LINESTRING(6.64773945778347 59.5995729910866,6.483307 59.539192,6.374817 59.538495,6.24847399999999 59.510636,6.196289 59.448566,5.898285 59.321981,5.64697299999999 59.234284,5.62500000000001 59.223042,4.81338500000001 58.813742,2.03384400000001 57.374679,0 56.170023,-0.650940000000012 55.776573,-1.55838055228731 55.2221613174321)' +14804,5949,6684,695432.776022422,f,f,,'LINESTRING(6.64773945778347 59.5995729910866,6.483307 59.539192,6.374817 59.538495,6.24847399999999 59.510636,6.196289 59.448566,5.898285 59.321981,5.64697299999999 59.234284,5.62500000000001 59.223042,4.81338500000001 58.813742,2.03384400000001 57.374679,0 56.170023,-0.650940000000012 55.776573,-1.55838055228731 55.2221613174321)' 5635,6300,6348,93313.2906756649,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"32585", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"150", "symbol"=>"DC-Line", "country"=>"SE", "t9_code"=>"0", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"None", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(18.2272491895352 57.5711315582343,17.274628 57.645401,16.875 57.674052,16.6818074486274 57.692364166947)' 14819,6311,6416,122337.134741418,f,f,,'LINESTRING(10.2163282994747 57.1311139024238,10.567474 57.20771,10.737762 57.192832,10.972595 57.230016,11.25 57.33171,11.532898 57.436081,11.867981 57.556366,12.0227165657676 57.561507168045)' 14809,6311,6416,122935.90852816,f,f,,'LINESTRING(10.2163571716117 57.1310010356663,10.366974 57.123569,10.578461 57.16678,10.740509 57.15263,11.001434 57.197296,11.174469 57.255281,11.25 
57.282754,11.56723 57.399104,12.0227887239052 57.5613889668514)' From 210ef806794331b9b8193d10897854ed19618cfa Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Fri, 9 Aug 2024 17:40:02 +0200 Subject: [PATCH 083/100] Updated link 8394 and parameter_corrections: Continuation of North-Sea-Link. --- data/entsoegridkit/links.csv | 2 +- data/parameter_corrections.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/data/entsoegridkit/links.csv b/data/entsoegridkit/links.csv index 4a94c32bb..abcaf0cc1 100644 --- a/data/entsoegridkit/links.csv +++ b/data/entsoegridkit/links.csv @@ -58,6 +58,6 @@ link_id,bus0,bus1,length,underground,under_construction,tags,geometry 14818,6586,6618,257364.279393886,f,f,,'LINESTRING(21.3559064590049 61.0800030227353,21.303864 61.005076,20.946808 60.801394,18.153534 60.501202,18.007965 60.483615,17.171631 60.503906,17.0593630437863 60.5503864910584)' 14817,6589,6618,197128.229552834,f,f,,'LINESTRING(21.3557421230034 61.0800501553429,20.902863 60.846249,18.224945 60.556604,18.0193872312079 60.533018071939)' 14812,6620,6623,140169.735736189,f,f,,'LINESTRING(22.3045576957813 60.4368452717433,21.404114 60.329667,19.8472351583549 60.129935739173)' -8394,6684,6696,21158.5735245602,f,t,'"MW"=>"None", "TSO"=>"None", "oid"=>"89791", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"None", "symbol"=>"DC-Line", "country"=>"NO", "t9_code"=>"None", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"1.518010133e+12", "DeletedDate"=>"None", "ModifiedDate"=>"None", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(6.64851407057135 59.5996162767494,6.99238592942864 59.5246589234811)' +8394,6684,6696,21158.5735245602,f,f,'"MW"=>"None", "TSO"=>"None", "oid"=>"89791", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"None", "symbol"=>"DC-Line", "country"=>"NO", "t9_code"=>"None", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", 
"oneCircuit"=>"0", "CreatedDate"=>"1.518010133e+12", "DeletedDate"=>"None", "ModifiedDate"=>"None", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(6.64851407057135 59.5996162767494,6.99238592942864 59.5246589234811)' 5569,5787,8272,38561.1931761179,f,t,'"MW"=>"None", "TSO"=>"None", "oid"=>"32514", "ext1"=>"None", "EIC_2"=>"None", "EIC_3"=>"None", "EIC_4"=>"None", "text_"=>"DolWin 3", "symbol"=>"DC-Line", "country"=>"DE", "t9_code"=>"0", "visible"=>"1", "EIC_code"=>"None", "tie_line"=>"0", "oneCircuit"=>"0", "CreatedDate"=>"None", "DeletedDate"=>"None", "ModifiedDate"=>"1.489072219e+12", "Internalcomments"=>"None", "visible_on_printed"=>"1"','LINESTRING(7.223511 53.1805270078955,7.223511 53.179704,7.21527100000001 53.121229,7.24273699999999 52.932086,7.16056753068224 52.8486333236236)' 14813,7053,7430,192856.020480538,f,f,,'LINESTRING(10.8823542109264 53.948125809387,11.25 54.061,11.657867 54.186548,12.208557 54.386955,12.236023 54.402946,12.43515 54.541003,12.602692 54.684153,12.745514 54.844199,12.744141 54.842618,12.87735 54.979978,12.947388 55.077581,12.9299984288384 55.0630403498842)' diff --git a/data/parameter_corrections.yaml b/data/parameter_corrections.yaml index df15738af..3d19bed8d 100644 --- a/data/parameter_corrections.yaml +++ b/data/parameter_corrections.yaml @@ -15,6 +15,7 @@ Link: "115000": 1200 # Caithness Moray HVDC index: "14804": 1400 # North-Sea link (NSN Link) + "8394": 1400 # North-Sea Link (NSN Link) continuation "14822": 700 # NO-DK Skagerrak 4 "14827": 440 # NO-DK Skagerrak 3 "14810": 500 # NO-DK Skagerrak 1-2 From e33edfe3796fcb12c0c66bb944fd1054cb65e0eb Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Wed, 14 Aug 2024 14:13:57 +0200 Subject: [PATCH 084/100] Major update: fix simplify_network, fix Corsica, updated build_osm_network to include lines overpassing nodes. 
--- scripts/base_network.py | 14 +- scripts/build_osm_network.py | 315 +++++++++++++++++++++++++++++++---- scripts/clean_osm_data.py | 35 +++- scripts/simplify_network.py | 46 +++++ 4 files changed, 366 insertions(+), 44 deletions(-) diff --git a/scripts/base_network.py b/scripts/base_network.py index 254cb9053..f8ce9faf2 100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -166,11 +166,13 @@ def _load_buses_from_eg(eg_buses, europe_shape, config): v_nom_max = max(config["lines"]["types"].keys()) # Quick fix: - buses_with_v_nom_to_keep_b = (v_nom_min <= buses.v_nom) & ( - buses.v_nom <= v_nom_max - ) | buses.v_nom.isnull() + buses_with_v_nom_to_keep_b = ( + (v_nom_min <= buses.v_nom) & (buses.v_nom <= v_nom_max) + | (buses.v_nom.isnull()) + | (buses.carrier == "DC") + ) - logger.info(f"Removing buses outside of range {v_nom_min} - {v_nom_max} V") + logger.info(f"Removing buses outside of range AC {v_nom_min} - {v_nom_max} V") return pd.DataFrame(buses.loc[buses_in_europe_b & buses_with_v_nom_to_keep_b]) @@ -536,7 +538,9 @@ def _set_electrical_parameters_converters(converters, config): converters["p_max_pu"] = p_max_pu converters["p_min_pu"] = -p_max_pu - converters["p_nom"] = 2000 + # if column "p_nom" does not exist, set to 2000 + if "p_nom" not in converters: + converters["p_nom"] = 2000 # Converters are combined with links converters["under_construction"] = False diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index b96c43321..795712067 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -5,6 +5,7 @@ import logging import os +import string import geopandas as gpd import numpy as np @@ -12,7 +13,7 @@ from _benchmark import memory_logger from _helpers import configure_logging, set_scenario_config from shapely.geometry import LineString, Point -from shapely.ops import linemerge, split +from shapely.ops import linemerge, nearest_points, split from tqdm import tqdm logger = 
logging.getLogger(__name__) @@ -451,62 +452,170 @@ def get_transformers(buses, lines): return df_transformers -def get_converters(buses): +# def get_converters(buses): +# """ +# Function to create fake converter lines that connect buses of the same +# station_id of different polarities. +# """ + +# df_converters = [] + +# for g_name, g_value in buses.sort_values("voltage", ascending=True).groupby( +# by="station_id" +# ): +# # note: by construction there cannot be more that two buses with the same station_id and same voltage +# n_voltages = len(g_value) + +# # A converter stations should have both AC and DC parts +# if g_value["dc"].any() & ~g_value["dc"].all(): +# dc_voltage = g_value[g_value.dc]["voltage"].values + +# for u in dc_voltage: +# id_0 = g_value[g_value["dc"] & g_value["voltage"].isin([u])].index[0] + +# ac_voltages = g_value[~g_value.dc]["voltage"] +# # A converter is added between a DC nodes and AC one with the closest voltage +# id_1 = ac_voltages.sub(u).abs().idxmin() + +# geom_conv = LineString( +# [g_value.geometry.loc[id_0], g_value.geometry.loc[id_1]] +# ) + +# # check if bus is a dclink boundary point, only then add converter +# df_converters.append( +# [ +# f"convert_{g_name}_{id_0}", # "line_id" +# g_value["bus_id"].loc[id_0], # "bus0" +# g_value["bus_id"].loc[id_1], # "bus1" +# False, # "underground" +# False, # "under_construction" +# g_value.country.loc[id_0], # "country" +# geom_conv, # "geometry" +# ] +# ) + +# # name of the columns +# conv_columns = [ +# "converter_id", +# "bus0", +# "bus1", +# "underground", +# "under_construction", +# "country", +# "geometry", +# ] + +# df_converters = gpd.GeoDataFrame(df_converters, columns=conv_columns).reset_index() + +# return df_converters + + +def _find_closest_bus(row, buses, distance_crs, tol=5000): """ - Function to create fake converter lines that connect buses of the same - station_id of different polarities. 
+ Find the closest bus to a given bus based on geographical distance and + country. + + Parameters: + - row: The bus_id of the bus to find the closest bus for. + - buses: A GeoDataFrame containing information about all the buses. + - distance_crs: The coordinate reference system to use for distance calculations. + - tol: The tolerance distance within which a bus is considered closest (default: 5000). + Returns: + - closest_bus_id: The bus_id of the closest bus, or None if no bus is found within the distance and same country. """ + gdf_buses = buses.copy() + gdf_buses = gdf_buses.to_crs(distance_crs) + # Get the geometry of the bus with bus_id = link_bus_id + bus = gdf_buses[gdf_buses["bus_id"] == row] + bus_geom = bus.geometry.values[0] - df_converters = [] + gdf_buses_filtered = gdf_buses[gdf_buses["dc"] == False] - for g_name, g_value in buses.sort_values("voltage", ascending=True).groupby( - by="station_id" - ): - # note: by construction there cannot be more that two buses with the same station_id and same voltage - n_voltages = len(g_value) + # Find the closest point in the filtered buses + nearest_geom = nearest_points(bus_geom, gdf_buses_filtered.union_all())[1] - # A converter stations should have both AC and DC parts - if g_value["dc"].any() & ~g_value["dc"].all(): - dc_voltage = g_value[g_value.dc]["voltage"].values + # Get the bus_id of the closest bus + closest_bus = gdf_buses_filtered.loc[gdf_buses["geometry"] == nearest_geom] - for u in dc_voltage: - id_0 = g_value[g_value["dc"] & g_value["voltage"].isin([u])].index[0] + # check if closest_bus_id is within the distance + within_distance = ( + closest_bus.to_crs(distance_crs).distance(bus.to_crs(distance_crs), align=False) + ).values[0] <= tol - ac_voltages = g_value[~g_value.dc]["voltage"] - # A converter is added between a DC nodes and AC one with the closest voltage - id_1 = ac_voltages.sub(u).abs().idxmin() + in_same_country = closest_bus.country.values[0] == bus.country.values[0] - geom_conv = 
LineString( - [g_value.geometry.loc[id_0], g_value.geometry.loc[id_1]] - ) + if within_distance and in_same_country: + closest_bus_id = closest_bus.bus_id.values[0] + else: + closest_bus_id = None - # check if bus is a dclink boundary point, only then add converter - df_converters.append( - [ - f"convert_{g_name}_{id_0}", # "line_id" - g_value["bus_id"].loc[id_0], # "bus0" - g_value["bus_id"].loc[id_1], # "bus1" - False, # "underground" - False, # "under_construction" - g_value.country.loc[id_0], # "country" - geom_conv, # "geometry" - ] - ) + return closest_bus_id + + +def _get_converters(buses, links, distance_crs, tol): + """ + Get the converters for the given buses and links. Connecting link endings + to closest AC bus. + + Parameters: + - buses (pandas.DataFrame): DataFrame containing information about buses. + - links (pandas.DataFrame): DataFrame containing information about links. + Returns: + - gdf_converters (geopandas.GeoDataFrame): GeoDataFrame containing information about converters. + """ + converters = [] + for idx, row in links.iterrows(): + for conv in range(2): + link_end = row[f"bus{conv}"] + # HVDC Gotland is connected to 130 kV grid, closest HVAC bus is further away + + closest_bus = _find_closest_bus(link_end, buses, distance_crs, tol=40000) + + if closest_bus is None: + continue + + converter_id = f"converter/{row['link_id']}_{conv}" + logger.info( + f"Added converter #{conv+1}/2 for link {row['link_id']}:{converter_id}." 
+ ) + + # Create the converter + converters.append( + [ + converter_id, # "line_id" + link_end, # "bus0" + closest_bus, # "bus1" + row["p_nom"], # "p_nom" + False, # "underground" + False, # "under_construction" + buses[buses["bus_id"] == closest_bus].country.values[ + 0 + ], # "country" + LineString( + [ + buses[buses["bus_id"] == link_end].geometry.values[0], + buses[buses["bus_id"] == closest_bus].geometry.values[0], + ] + ), # "geometry" + ] + ) - # name of the columns conv_columns = [ "converter_id", "bus0", "bus1", + "p_nom", "underground", "under_construction", "country", "geometry", ] - df_converters = gpd.GeoDataFrame(df_converters, columns=conv_columns).reset_index() + gdf_converters = gpd.GeoDataFrame( + converters, columns=conv_columns, crs=geo_crs + ).reset_index() - return df_converters + return gdf_converters def connect_stations_same_station_id(lines, buses): @@ -669,6 +778,133 @@ def merge_stations_lines_by_station_id_and_voltage( return lines, links, buses +def _split_linestring_by_point(linestring, points): + """ + Function to split a linestring geometry by multiple inner points. + + Parameters + ---------- + lstring : LineString + Linestring of the line to be split + points : list + List of points to split the linestring + + Return + ------ + list_lines : list + List of linestring to split the line + """ + + list_linestrings = [linestring] + + for p in points: + # execute split to all lines and store results + temp_list = [split(l, p) for l in list_linestrings] + # nest all geometries + list_linestrings = [lstring for tval in temp_list for lstring in tval.geoms] + + return list_linestrings + + +def fix_overpassing_lines(lines, buses, distance_crs, tol=1): + """ + Fix overpassing lines by splitting them at nodes within a given tolerance, + to include the buses being overpassed. + + Parameters: + - lines (GeoDataFrame): The lines to be fixed. + - buses (GeoDataFrame): The buses representing nodes. 
+ - distance_crs (str): The coordinate reference system (CRS) for distance calculations. + - tol (float): The tolerance distance in meters for determining if a bus is within a line. + Returns: + - lines (GeoDataFrame): The fixed lines. + - buses (GeoDataFrame): The buses representing nodes. + """ + + lines_to_add = [] # list of lines to be added + lines_to_split = [] # list of lines that have been split + + lines_epsgmod = lines.to_crs(distance_crs) + buses_epsgmod = buses.to_crs(distance_crs) + + # set tqdm options for substation ids + tqdm_kwargs_substation_ids = dict( + ascii=False, + unit=" lines", + total=lines.shape[0], + desc="Verify lines overpassing nodes ", + ) + + for l in tqdm(lines.index, **tqdm_kwargs_substation_ids): + # bus indices being within tolerance from the line + bus_in_tol_epsg = buses_epsgmod[ + buses_epsgmod.geometry.distance(lines_epsgmod.geometry.loc[l]) <= tol + ] + + # exclude endings of the lines + bus_in_tol_epsg = bus_in_tol_epsg[ + ( + ( + bus_in_tol_epsg.geometry.distance( + lines_epsgmod.geometry.loc[l].boundary.geoms[0] + ) + > tol + ) + | ( + bus_in_tol_epsg.geometry.distance( + lines_epsgmod.geometry.loc[l].boundary.geoms[1] + ) + > tol + ) + ) + ] + + if not bus_in_tol_epsg.empty: + # add index of line to split + lines_to_split.append(l) + + buses_locs = buses.geometry.loc[bus_in_tol_epsg.index] + + # get new line geometries + new_geometries = _split_linestring_by_point(lines.geometry[l], buses_locs) + n_geoms = len(new_geometries) + + # create temporary copies of the line + df_append = gpd.GeoDataFrame([lines.loc[l]] * n_geoms) + # update geometries + df_append["geometry"] = new_geometries + # update name of the line if there are multiple line segments + df_append["line_id"] = [ + str(df_append["line_id"].iloc[0]) + + (f"-{letter}" if n_geoms > 1 else "") + for letter in string.ascii_lowercase[:n_geoms] + ] + + lines_to_add.append(df_append) + + if not lines_to_add: + return lines, buses + + df_to_add = 
gpd.GeoDataFrame(pd.concat(lines_to_add, ignore_index=True)) + df_to_add.set_crs(lines.crs, inplace=True) + df_to_add.set_index(lines.index[-1] + df_to_add.index, inplace=True) + + # update length + df_to_add["length"] = df_to_add.to_crs(distance_crs).geometry.length + + # update line endings + df_to_add = line_endings_to_bus_conversion(df_to_add) + + # remove original lines + lines.drop(lines_to_split, inplace=True) + + lines = df_to_add if lines.empty else pd.concat([lines, df_to_add]) + + lines = gpd.GeoDataFrame(lines.reset_index(drop=True), crs=lines.crs) + + return lines, buses + + def build_network( inputs, outputs, @@ -741,6 +977,11 @@ def build_network( lines = line_endings_to_bus_conversion(lines) links = line_endings_to_bus_conversion(links) + logger.info( + "Fixing lines overpassing nodes: Connecting nodes and splittling lines." + ) + lines, buses = fix_overpassing_lines(lines, buses, distance_crs, tol=1) + # METHOD to merge buses with same voltage and within tolerance tol = snakemake.config["electricity_network"]["osm_group_tolerance_buses"] logger.info(f"Aggregating close substations: Enabled with tolerance {tol} m") @@ -759,7 +1000,7 @@ def build_network( transformers = get_transformers(buses, lines) # get converters: currently modelled as links connecting buses with different polarity - converters = get_converters(buses) + converters = _get_converters(buses, links, distance_crs, tol) logger.info("Saving outputs") diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 9992dba6d..bf9d1c4ab 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -802,7 +802,7 @@ def _filter_by_voltage(df, min_voltage=200000): # Keep numeric strings list_voltages = list_voltages[np.vectorize(str.isnumeric)(list_voltages)] list_voltages = list_voltages.astype(int) - list_voltages = list_voltages[list_voltages >= int(min_voltage_ac)] + list_voltages = list_voltages[list_voltages >= int(min_voltage)] list_voltages = 
list_voltages.astype(str) bool_voltages = df["voltage"].apply(_check_voltage, list_voltages=list_voltages) @@ -1670,6 +1670,35 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): return gdf_lines +# Function to bridge gaps between all lines +def _bridge_lines(lines): + bridged_lines = [] + for i in range(len(lines) - 1): + bridged_lines.append(lines[i]) + + # Get the endpoints of the current line and the startpoints of the next line + end_points = [lines[i].coords[-1], lines[i].coords[0]] + start_points = [lines[i + 1].coords[0], lines[i + 1].coords[-1]] + + # Find the closest pair of points between the two LineStrings + min_distance = float("inf") + closest_pair = None + + for end_point in end_points: + for start_point in start_points: + distance = LineString([end_point, start_point]).length + if distance < min_distance: + min_distance = distance + closest_pair = (end_point, start_point) + + # Create a bridge between the closest points + bridge = LineString(closest_pair) + bridged_lines.append(bridge) + + bridged_lines.append(lines[-1]) + return bridged_lines + + if __name__ == "__main__": if "snakemake" not in globals(): from _helpers import mock_snakemake @@ -1780,7 +1809,9 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): df_links.loc[:, "geometry"] = df_links.apply(_create_single_link, axis=1) df_links = _finalise_links(df_links) - gdf_links = gpd.GeoDataFrame(df_links, geometry="geometry", crs=crs) + gdf_links = gpd.GeoDataFrame(df_links, geometry="geometry", crs=crs).set_index( + "link_id" + ) # Add line endings to substations path_country_shapes = snakemake.input.country_shapes diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py index 036ca0815..651e8ea29 100644 --- a/scripts/simplify_network.py +++ b/scripts/simplify_network.py @@ -306,14 +306,24 @@ def split_links(nodes): seen = set() + # Corsica substation + node_corsica = find_closest_bus( + n, + x=9.44802, + y=42.52842, + tol=2000, # 
Tolerance needed to only return the bus if the region is actually modelled + ) + # Supernodes are endpoints of links, identified by having lass then two neighbours or being an AC Bus # An example for the latter is if two different links are connected to the same AC bus. + # Manually keep Corsica substation as a supernode supernodes = { m for m in nodes if ( (len(G.adj[m]) < 2 or (set(G.adj[m]) - nodes)) or (n.buses.loc[m, "carrier"] == "AC") + or (m == node_corsica) ) } @@ -530,6 +540,42 @@ def cluster( return clustering.network, clustering.busmap +def find_closest_bus(n, x, y, tol=2000): + """ + Find the index of the closest bus to the given coordinates within a specified tolerance. + Parameters: + n (pypsa.Network): The network object. + x (float): The x-coordinate (longitude) of the target location. + y (float): The y-coordinate (latitude) of the target location. + tol (float): The distance tolerance in meters. Default is 2000 meters. + + Returns: + int: The index of the closest bus to the target location within the tolerance. + Returns None if no bus is within the tolerance. 
+ """ + # Conversion factors + meters_per_degree_lat = 111139 # Meters per degree of latitude + meters_per_degree_lon = 111139 * np.cos( + np.radians(y) + ) # Meters per degree of longitude at the given latitude + + x0 = np.array(n.buses.x) + y0 = np.array(n.buses.y) + + # Calculate distances in meters + dist = np.sqrt( + ((x - x0) * meters_per_degree_lon) ** 2 + + ((y - y0) * meters_per_degree_lat) ** 2 + ) + + # Find the closest bus within the tolerance + min_dist = dist.min() + if min_dist <= tol: + return n.buses.index[dist.argmin()] + else: + return None + + if __name__ == "__main__": if "snakemake" not in globals(): from _helpers import mock_snakemake From b143f49d651fe15e31c72872073fa96f87f84826 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Wed, 14 Aug 2024 18:17:30 +0200 Subject: [PATCH 085/100] remove config backup --- config/config_backup.yaml | 1213 ------------------------------------- 1 file changed, 1213 deletions(-) delete mode 100644 config/config_backup.yaml diff --git a/config/config_backup.yaml b/config/config_backup.yaml deleted file mode 100644 index 9ebeea351..000000000 --- a/config/config_backup.yaml +++ /dev/null @@ -1,1213 +0,0 @@ -# SPDX-FileCopyrightText: : 2017-2024 The PyPSA-Eur Authors -# -# SPDX-License-Identifier: CC0-1.0 - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#top-level-configuration -version: 0.11.0 -tutorial: false - -logging: - level: INFO - format: '%(levelname)s:%(name)s:%(message)s' - -private: - keys: - entsoe_api: - -remote: - ssh: z1 - path: ~/scratch/projects/pypsa-eur - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#run -run: - prefix: "" - name: "europe-ua-md-gridkit-custom" - scenarios: - enable: false - file: config/scenarios.yaml - disable_progressbar: false - shared_resources: - policy: false - exclude: [] - shared_cutouts: true - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#foresight -foresight: overnight - -# docs in 
https://pypsa-eur.readthedocs.io/en/latest/configuration.html#scenario -# Wildcard docs in https://pypsa-eur.readthedocs.io/en/latest/wildcards.html -scenario: - simpl: - - '' - ll: - - v1.0 - clusters: - - 320 - opts: - - '' - sector_opts: - - '' - planning_horizons: - # - 2020 - - 2030 - # - 2040 - # - 2050 - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#countries -countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK', 'UA', 'MD'] - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#snapshots -snapshots: - start: '2013-01-01' - end: '2014-01-01' - inclusive: 'left' - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#enable -enable: - retrieve: auto - prepare_links_p_nom: false - retrieve_databundle: true - retrieve_cost_data: true - build_cutout: false - retrieve_cutout: true - custom_busmap: true - drop_leap_day: true - - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget -co2_budget: - 2020: 0.701 - 2025: 0.524 - 2030: 0.297 - 2035: 0.150 - 2040: 0.071 - 2045: 0.032 - 2050: 0.000 - -electricity_network: - base_network: gridkit # Options: gridkit, osm-prebuilt, osm-raw (built from scratch using OSM data, takes longer) - osm_group_tolerance_buses: 5000 # unit: meters, default 5000 - Buses within this distance are grouped together - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity -electricity: - voltages: [200., 220., 300., 380., 400., 500., 750.] 
- gaslimit_enable: false - gaslimit: false - co2limit_enable: false - co2limit: 7.75e+7 - co2base: 1.487e+9 - - operational_reserve: - activate: false - epsilon_load: 0.02 - epsilon_vres: 0.02 - contingency: 4000 - - max_hours: - battery: 6 - H2: 168 - - extendable_carriers: - Generator: [solar, solar-hsat, onwind, offwind-ac, offwind-dc, offwind-float, OCGT, CCGT] - StorageUnit: [] # battery, H2 - Store: [battery, H2] - Link: [] # H2 pipeline - - powerplants_filter: (DateOut >= 2023 or DateOut != DateOut) and not (Country == 'Germany' and Fueltype == 'Nuclear') - custom_powerplants: false - everywhere_powerplants: [] - - conventional_carriers: [nuclear, oil, OCGT, CCGT, coal, lignite, geothermal, biomass] - renewable_carriers: [solar, onwind, offwind-ac, offwind-dc, hydro] - # renewable_carriers: [solar, solar-hsat, onwind, offwind-ac, offwind-dc, offwind-float, hydro] - - estimate_renewable_capacities: - enable: true - from_opsd: true - year: 2020 - expansion_limit: false - technology_mapping: - Offshore: [offwind-ac, offwind-dc, offwind-float] - Onshore: [onwind] - PV: [solar] - - autarky: - enable: false - by_country: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#atlite -atlite: - default_cutout: europe-2013-era5 - nprocesses: 4 - show_progress: false - cutouts: - # use 'base' to determine geographical bounds and time span from config - # base: - # module: era5 - europe-2013-era5: - module: era5 # in priority order - x: [-12., 42.] - y: [33., 72] - dx: 0.3 - dy: 0.3 - time: ['2013', '2013'] - europe-2013-sarah: - module: [sarah, era5] # in priority order - x: [-12., 42.] 
- y: [33., 65] - dx: 0.2 - dy: 0.2 - time: ['2013', '2013'] - sarah_interpolate: false - sarah_dir: - features: [influx, temperature] - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#renewable -renewable: - onwind: - cutout: europe-2013-era5 - resource: - method: wind - turbine: Vestas_V112_3MW - add_cutout_windspeed: true - capacity_per_sqkm: 3 - # correction_factor: 0.93 - corine: - grid_codes: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32] - distance: 1000 - distance_grid_codes: [1, 2, 3, 4, 5, 6] - luisa: false - # grid_codes: [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242] - # distance: 1000 - # distance_grid_codes: [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242] - natura: true - excluder_resolution: 100 - clip_p_max_pu: 1.e-2 - offwind-ac: - cutout: europe-2013-era5 - resource: - method: wind - turbine: NREL_ReferenceTurbine_2020ATB_5.5MW - add_cutout_windspeed: true - capacity_per_sqkm: 2 - correction_factor: 0.8855 - corine: [44, 255] - luisa: false # [0, 5230] - natura: true - ship_threshold: 400 - max_depth: 60 - max_shore_distance: 30000 - excluder_resolution: 200 - clip_p_max_pu: 1.e-2 - offwind-dc: - cutout: europe-2013-era5 - resource: - method: wind - turbine: NREL_ReferenceTurbine_2020ATB_5.5MW - add_cutout_windspeed: true - capacity_per_sqkm: 2 - correction_factor: 0.8855 - corine: [44, 255] - luisa: false # [0, 5230] - natura: true - ship_threshold: 400 - max_depth: 60 - min_shore_distance: 30000 - excluder_resolution: 200 - clip_p_max_pu: 1.e-2 - offwind-float: - cutout: europe-2013-era5 - resource: - method: wind - turbine: NREL_ReferenceTurbine_5MW_offshore - # ScholzPhd Tab 4.3.1: 10MW/km^2 - capacity_per_sqkm: 2 - correction_factor: 0.8855 - # proxy for wake losses - # from 10.1016/j.energy.2018.08.153 - # until done more rigorously in #153 - corine: [44, 255] - natura: true - ship_threshold: 400 - excluder_resolution: 200 - min_depth: 60 - 
max_depth: 1000 - clip_p_max_pu: 1.e-2 - solar: - cutout: europe-2013-sarah - resource: - method: pv - panel: CSi - orientation: - slope: 35. - azimuth: 180. - capacity_per_sqkm: 5.1 - # correction_factor: 0.854337 - corine: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 26, 31, 32] - luisa: false # [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242, 1310, 1320, 1330, 1410, 1421, 1422, 2110, 2120, 2130, 2210, 2220, 2230, 2310, 2410, 2420, 3210, 3320, 3330] - natura: true - excluder_resolution: 100 - clip_p_max_pu: 1.e-2 - solar-hsat: - cutout: europe-2013-sarah - resource: - method: pv - panel: CSi - orientation: - slope: 35. - azimuth: 180. - tracking: horizontal - capacity_per_sqkm: 4.43 # 15% higher land usage acc. to NREL - corine: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 26, 31, 32] - luisa: false # [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242, 1310, 1320, 1330, 1410, 1421, 1422, 2110, 2120, 2130, 2210, 2220, 2230, 2310, 2410, 2420, 3210, 3320, 3330] - natura: true - excluder_resolution: 100 - clip_p_max_pu: 1.e-2 - hydro: - cutout: europe-2013-era5 - carriers: [ror, PHS, hydro] - PHS_max_hours: 6 - hydro_max_hours: "energy_capacity_totals_by_country" # one of energy_capacity_totals_by_country, estimate_by_large_installations or a float - flatten_dispatch: false - flatten_dispatch_buffer: 0.2 - clip_min_inflow: 1.0 - eia_norm_year: false - eia_correct_by_capacity: false - eia_approximate_missing: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#conventional -conventional: - unit_commitment: false - dynamic_fuel_price: false - nuclear: - p_max_pu: "data/nuclear_p_max_pu.csv" # float of file name - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines -lines: - types: - 200.0: Al/St 240/40 2-bundle 220.0 - 220.0: Al/St 240/40 2-bundle 220.0 - 300.0: Al/St 240/40 3-bundle 300.0 - 380.0: Al/St 240/40 4-bundle 380.0 - 
400.0: Al/St 240/40 4-bundle 380.0 - 500.0: Al/St 240/40 4-bundle 380.0 - 750.0: Al/St 560/50 4-bundle 750.0 - s_max_pu: 0.7 - s_nom_max: .inf - max_extension: 20000 #MW - length_factor: 1.25 - reconnect_crimea: true - under_construction: 'keep' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity - dynamic_line_rating: - activate: false - cutout: europe-2013-era5 - correction_factor: 0.95 - max_voltage_difference: false - max_line_rating: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#links -links: - p_max_pu: 1.0 - p_nom_max: .inf - max_extension: 30000 #MW - include_tyndp: true - under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#transformers -transformers: - x: 0.1 - s_nom: 2000. - type: '' - -# docs-load in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#load -load: - interpolate_limit: 3 - time_shift_for_large_gaps: 1w - manual_adjustments: true # false - scaling_factor: 1.0 - fixed_year: false # false or year (e.g. 2013) - supplement_synthetic: true - -# docs -# TODO: PyPSA-Eur merge issue in prepare_sector_network.py -# regulate what components with which carriers are kept from PyPSA-Eur; -# some technologies are removed because they are implemented differently -# (e.g. 
battery or H2 storage) or have different year-dependent costs -# in PyPSA-Eur-Sec -pypsa_eur: - Bus: - - AC - Link: - - DC - Generator: - - onwind - - offwind-ac - - offwind-dc - - offwind-float - - solar-hsat - - solar - - ror - - nuclear - StorageUnit: - - PHS - - hydro - Store: [] - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#energy -energy: - energy_totals_year: 2019 - base_emissions_year: 1990 - emissions: CO2 - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#biomass -biomass: - year: 2030 - scenario: ENS_Med - classes: - solid biomass: - - Agricultural waste - - Fuelwood residues - - Secondary Forestry residues - woodchips - - Sawdust - - Residues from landscape care - - Municipal waste - not included: - - Sugar from sugar beet - - Rape seed - - "Sunflower, soya seed " - - Bioethanol barley, wheat, grain maize, oats, other cereals and rye - - Miscanthus, switchgrass, RCG - - Willow - - Poplar - - FuelwoodRW - - C&P_RW - biogas: - - Manure solid, liquid - - Sludge - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#solar-thermal -solar_thermal: - clearsky_model: simple # should be "simple" or "enhanced"? - orientation: - slope: 45. - azimuth: 180. 
- cutout: default - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#existing-capacities -existing_capacities: - grouping_years_power: [1920, 1950, 1955, 1960, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2025] - grouping_years_heat: [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2019] # heat grouping years >= baseyear will be ignored - threshold_capacity: 10 - default_heating_lifetime: 20 - conventional_carriers: - - lignite - - coal - - oil - - uranium - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#sector -sector: - transport: true - heating: true - biomass: true - industry: true - agriculture: true - district_heating: - potential: 0.6 - progress: - 2020: 0.0 - 2025: 0.15 - 2030: 0.3 - 2035: 0.45 - 2040: 0.6 - 2045: 0.8 - 2050: 1.0 - district_heating_loss: 0.15 - cluster_heat_buses: true - heat_demand_cutout: default - bev_dsm_restriction_value: 0.75 - bev_dsm_restriction_time: 7 - transport_heating_deadband_upper: 20. - transport_heating_deadband_lower: 15. 
- ICE_lower_degree_factor: 0.375 - ICE_upper_degree_factor: 1.6 - EV_lower_degree_factor: 0.98 - EV_upper_degree_factor: 0.63 - bev_dsm: true - bev_availability: 0.5 - bev_energy: 0.05 - bev_charge_efficiency: 0.9 - bev_charge_rate: 0.011 - bev_avail_max: 0.95 - bev_avail_mean: 0.8 - v2g: true - land_transport_fuel_cell_share: - 2020: 0 - 2025: 0 - 2030: 0 - 2035: 0 - 2040: 0 - 2045: 0 - 2050: 0 - land_transport_electric_share: - 2020: 0 - 2025: 0.15 - 2030: 0.3 - 2035: 0.45 - 2040: 0.7 - 2045: 0.85 - 2050: 1 - land_transport_ice_share: - 2020: 1 - 2025: 0.85 - 2030: 0.7 - 2035: 0.55 - 2040: 0.3 - 2045: 0.15 - 2050: 0 - transport_electric_efficiency: 53.19 # 1 MWh_el = 53.19*100 km - transport_fuel_cell_efficiency: 30.003 # 1 MWh_H2 = 30.003*100 km - transport_ice_efficiency: 16.0712 # 1 MWh_oil = 16.0712 * 100 km - agriculture_machinery_electric_share: 0 - agriculture_machinery_oil_share: 1 - agriculture_machinery_fuel_efficiency: 0.7 - agriculture_machinery_electric_efficiency: 0.3 - MWh_MeOH_per_MWh_H2: 0.8787 - MWh_MeOH_per_tCO2: 4.0321 - MWh_MeOH_per_MWh_e: 3.6907 - shipping_hydrogen_liquefaction: false - shipping_hydrogen_share: - 2020: 0 - 2025: 0 - 2030: 0 - 2035: 0 - 2040: 0 - 2045: 0 - 2050: 0 - shipping_methanol_share: - 2020: 0 - 2025: 0.15 - 2030: 0.3 - 2035: 0.5 - 2040: 0.7 - 2045: 0.85 - 2050: 1 - shipping_oil_share: - 2020: 1 - 2025: 0.85 - 2030: 0.7 - 2035: 0.5 - 2040: 0.3 - 2045: 0.15 - 2050: 0 - shipping_methanol_efficiency: 0.46 - shipping_oil_efficiency: 0.40 - aviation_demand_factor: 1. - HVC_demand_factor: 1. - time_dep_hp_cop: true - heat_pump_sink_T: 55. 
- reduce_space_heat_exogenously: true - reduce_space_heat_exogenously_factor: - 2020: 0.10 # this results in a space heat demand reduction of 10% - 2025: 0.09 # first heat demand increases compared to 2020 because of larger floor area per capita - 2030: 0.09 - 2035: 0.11 - 2040: 0.16 - 2045: 0.21 - 2050: 0.29 - retrofitting: - retro_endogen: false - cost_factor: 1.0 - interest_rate: 0.04 - annualise_cost: true - tax_weighting: false - construction_index: true - tes: true - tes_tau: - decentral: 3 - central: 180 - boilers: true - resistive_heaters: true - oil_boilers: false - biomass_boiler: true - overdimension_individual_heating: 1.1 #to cover demand peaks bigger than data - chp: true - micro_chp: false - solar_thermal: true - solar_cf_correction: 0.788457 # = >>> 1/1.2683 - marginal_cost_storage: 0. #1e-4 - methanation: true - coal_cc: false - dac: true - co2_vent: false - central_heat_vent: false - allam_cycle: false - hydrogen_fuel_cell: true - hydrogen_turbine: false - SMR: true - SMR_cc: true - regional_methanol_demand: false - regional_oil_demand: false - regional_coal_demand: false - regional_co2_sequestration_potential: - enable: false - attribute: - - conservative estimate Mt - - conservative estimate GAS Mt - - conservative estimate OIL Mt - - conservative estimate aquifer Mt - include_onshore: false - min_size: 3 - max_size: 25 - years_of_storage: 25 - co2_sequestration_potential: 200 - co2_sequestration_cost: 10 - co2_sequestration_lifetime: 50 - co2_spatial: false - co2network: false - co2_network_cost_factor: 1 - cc_fraction: 0.9 - hydrogen_underground_storage: true - hydrogen_underground_storage_locations: - # - onshore # more than 50 km from sea - - nearshore # within 50 km of sea - # - offshore - ammonia: false - min_part_load_fischer_tropsch: 0.5 - min_part_load_methanolisation: 0.3 - min_part_load_methanation: 0.3 - use_fischer_tropsch_waste_heat: 0.25 - use_haber_bosch_waste_heat: 0.25 - use_methanolisation_waste_heat: 0.25 - 
use_methanation_waste_heat: 0.25 - use_fuel_cell_waste_heat: 0.25 - use_electrolysis_waste_heat: 0.25 - electricity_transmission_grid: true - electricity_distribution_grid: true - electricity_distribution_grid_cost_factor: 1.0 - electricity_grid_connection: true - transmission_efficiency: - DC: - efficiency_static: 0.98 - efficiency_per_1000km: 0.977 - H2 pipeline: - efficiency_per_1000km: 1 # 0.982 - compression_per_1000km: 0.018 - gas pipeline: - efficiency_per_1000km: 1 #0.977 - compression_per_1000km: 0.01 - electricity distribution grid: - efficiency_static: 0.97 - H2_network: true - gas_network: false - H2_retrofit: false - H2_retrofit_capacity_per_CH4: 0.6 - gas_network_connectivity_upgrade: 1 - gas_distribution_grid: true - gas_distribution_grid_cost_factor: 1.0 - biomass_spatial: false - biomass_transport: false - biogas_upgrading_cc: false - conventional_generation: - OCGT: gas - biomass_to_liquid: false - biosng: false - limit_max_growth: - enable: false - # allowing 30% larger than max historic growth - factor: 1.3 - max_growth: # unit GW - onwind: 16 # onshore max grow so far 16 GW in Europe https://www.iea.org/reports/renewables-2020/wind - solar: 28 # solar max grow so far 28 GW in Europe https://www.iea.org/reports/renewables-2020/solar-pv - offwind-ac: 35 # offshore max grow so far 3.5 GW in Europe https://windeurope.org/about-wind/statistics/offshore/european-offshore-wind-industry-key-trends-statistics-2019/ - offwind-dc: 35 - max_relative_growth: - onwind: 3 - solar: 3 - offwind-ac: 3 - offwind-dc: 3 - enhanced_geothermal: - enable: false - flexible: true - max_hours: 240 - max_boost: 0.25 - var_cf: true - sustainability_factor: 0.0025 - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#industry -industry: - St_primary_fraction: - 2020: 0.6 - 2025: 0.55 - 2030: 0.5 - 2035: 0.45 - 2040: 0.4 - 2045: 0.35 - 2050: 0.3 - DRI_fraction: - 2020: 0 - 2025: 0 - 2030: 0.05 - 2035: 0.2 - 2040: 0.4 - 2045: 0.7 - 2050: 1 - H2_DRI: 1.7 - 
elec_DRI: 0.322 - Al_primary_fraction: - 2020: 0.4 - 2025: 0.375 - 2030: 0.35 - 2035: 0.325 - 2040: 0.3 - 2045: 0.25 - 2050: 0.2 - MWh_NH3_per_tNH3: 5.166 - MWh_CH4_per_tNH3_SMR: 10.8 - MWh_elec_per_tNH3_SMR: 0.7 - MWh_H2_per_tNH3_electrolysis: 5.93 - MWh_elec_per_tNH3_electrolysis: 0.2473 - MWh_NH3_per_MWh_H2_cracker: 1.46 # https://github.com/euronion/trace/blob/44a5ff8401762edbef80eff9cfe5a47c8d3c8be4/data/efficiencies.csv - NH3_process_emissions: 24.5 - petrochemical_process_emissions: 25.5 - #HVC primary/recycling based on values used in Neumann et al https://doi.org/10.1016/j.joule.2023.06.016, linearly interpolated between 2020 and 2050 - #2020 recycling rates based on Agora https://static.agora-energiewende.de/fileadmin/Projekte/2021/2021_02_EU_CEAP/A-EW_254_Mobilising-circular-economy_study_WEB.pdf - #fractions refer to the total primary HVC production in 2020 - #assumes 6.7 Mtplastics produced from recycling in 2020 - HVC_primary_fraction: - 2020: 1.0 - 2025: 0.9 - 2030: 0.8 - 2035: 0.7 - 2040: 0.6 - 2045: 0.5 - 2050: 0.4 - HVC_mechanical_recycling_fraction: - 2020: 0.12 - 2025: 0.15 - 2030: 0.18 - 2035: 0.21 - 2040: 0.24 - 2045: 0.27 - 2050: 0.30 - HVC_chemical_recycling_fraction: - 2020: 0.0 - 2025: 0.0 - 2030: 0.04 - 2035: 0.08 - 2040: 0.12 - 2045: 0.16 - 2050: 0.20 - HVC_environment_sequestration_fraction: 0. - waste_to_energy: false - waste_to_energy_cc: false - sector_ratios_fraction_future: - 2020: 0.0 - 2025: 0.1 - 2030: 0.3 - 2035: 0.5 - 2040: 0.7 - 2045: 0.9 - 2050: 1.0 - basic_chemicals_without_NH3_production_today: 69. #Mt/a, = 86 Mtethylene-equiv - 17 MtNH3 - HVC_production_today: 52. 
- MWh_elec_per_tHVC_mechanical_recycling: 0.547 - MWh_elec_per_tHVC_chemical_recycling: 6.9 - chlorine_production_today: 9.58 - MWh_elec_per_tCl: 3.6 - MWh_H2_per_tCl: -0.9372 - methanol_production_today: 1.5 - MWh_elec_per_tMeOH: 0.167 - MWh_CH4_per_tMeOH: 10.25 - MWh_MeOH_per_tMeOH: 5.528 - hotmaps_locate_missing: false - reference_year: 2015 - - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#costs -costs: - year: 2030 - version: v0.9.0 - social_discountrate: 0.02 - fill_values: - FOM: 0 - VOM: 0 - efficiency: 1 - fuel: 0 - investment: 0 - lifetime: 25 - "CO2 intensity": 0 - "discount rate": 0.07 - # Marginal and capital costs can be overwritten - # capital_cost: - # onwind: 500 - marginal_cost: - solar: 0.01 - onwind: 0.015 - offwind: 0.015 - hydro: 0. - H2: 0. - electrolysis: 0. - fuel cell: 0. - battery: 0. - battery inverter: 0. - emission_prices: - enable: true - co2: 100. - co2_monthly_prices: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#clustering -clustering: - focus_weights: false - simplify_network: - to_substations: false - algorithm: kmeans # choose from: [hac, kmeans] - feature: solar+onwind-time - exclude_carriers: [] - remove_stubs: true - remove_stubs_across_borders: true - cluster_network: - algorithm: kmeans - feature: solar+onwind-time - exclude_carriers: [] - consider_efficiency_classes: false - aggregation_strategies: - generators: - committable: any - ramp_limit_up: max - ramp_limit_down: max - temporal: - resolution_elec: 25H - resolution_sector: 25H - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#adjustments -adjustments: - electricity: false - sector: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#solving -solving: - #tmpdir: "path/to/tmp" - options: - clip_p_max_pu: 1.e-2 - load_shedding: false - noisy_costs: true - skip_iterations: true - rolling_horizon: false - seed: 123 - custom_extra_functionality: 
"../data/custom_extra_functionality.py" - # io_api: "direct" # Increases performance but only supported for the highs and gurobi solvers - # options that go into the optimize function - track_iterations: false - min_iterations: 2 - max_iterations: 3 - transmission_losses: 2 - linearized_unit_commitment: true - horizon: 365 - post_discretization: - enable: false - line_unit_size: 1700 - line_threshold: 0.3 - link_unit_size: - DC: 2000 - H2 pipeline: 1200 - gas pipeline: 1500 - link_threshold: - DC: 0.3 - H2 pipeline: 0.3 - gas pipeline: 0.3 - - agg_p_nom_limits: - agg_offwind: false - include_existing: false - file: data/agg_p_nom_minmax.csv - - constraints: - CCL: false - EQ: false - BAU: false - SAFE: false - - solver: - name: gurobi - options: gurobi-default - - solver_options: - highs-default: - # refer to https://ergo-code.github.io/HiGHS/dev/options/definitions/ - threads: 4 - solver: "ipm" - run_crossover: "off" - small_matrix_value: 1e-6 - large_matrix_value: 1e9 - primal_feasibility_tolerance: 1e-5 - dual_feasibility_tolerance: 1e-5 - ipm_optimality_tolerance: 1e-4 - parallel: "on" - random_seed: 123 - gurobi-default: - threads: 4 - method: 2 # barrier - crossover: 0 - BarConvTol: 1.e-6 - Seed: 123 - AggFill: 0 - PreDual: 0 - GURO_PAR_BARDENSETHRESH: 200 - gurobi-numeric-focus: - NumericFocus: 3 # Favour numeric stability over speed - method: 2 # barrier - crossover: 0 # do not use crossover - BarHomogeneous: 1 # Use homogeneous barrier if standard does not converge - BarConvTol: 1.e-5 - FeasibilityTol: 1.e-4 - OptimalityTol: 1.e-4 - ObjScale: -0.5 - threads: 8 - Seed: 123 - gurobi-fallback: # Use gurobi defaults - crossover: 0 - method: 2 # barrier - BarHomogeneous: 1 # Use homogeneous barrier if standard does not converge - BarConvTol: 1.e-5 - FeasibilityTol: 1.e-5 - OptimalityTol: 1.e-5 - Seed: 123 - threads: 8 - cplex-default: - threads: 4 - lpmethod: 4 # barrier - solutiontype: 2 # non basic solution, ie no crossover - barrier.convergetol: 1.e-5 - 
feasopt.tolerance: 1.e-6 - copt-default: - Threads: 8 - LpMethod: 2 - Crossover: 0 - cbc-default: {} # Used in CI - glpk-default: {} # Used in CI - - mem_mb: 30000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 - runtime: 6h #runtime in humanfriendly style https://humanfriendly.readthedocs.io/en/latest/ - - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#plotting -plotting: - map: - boundaries: [-11, 30, 34, 71] - color_geomap: - ocean: white - land: white - projection: - name: "EqualEarth" - # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: - # name: "LambertConformal" - # central_longitude: 10. - # central_latitude: 50. - # standard_parallels: [35, 65] - eu_node_location: - x: -5.5 - y: 46. - costs_max: 1000 - costs_threshold: 1 - energy_max: 20000 - energy_min: -20000 - energy_threshold: 50. - - nice_names: - OCGT: "Open-Cycle Gas" - CCGT: "Combined-Cycle Gas" - offwind-ac: "Offshore Wind (AC)" - offwind-dc: "Offshore Wind (DC)" - offwind-float: "Offshore Wind (Floating)" - onwind: "Onshore Wind" - solar: "Solar" - PHS: "Pumped Hydro Storage" - hydro: "Reservoir & Dam" - battery: "Battery Storage" - H2: "Hydrogen Storage" - lines: "Transmission Lines" - ror: "Run of River" - load: "Load Shedding" - ac: "AC" - dc: "DC" - - tech_colors: - # wind - onwind: "#235ebc" - onshore wind: "#235ebc" - offwind: "#6895dd" - offshore wind: "#6895dd" - offwind-ac: "#6895dd" - offshore wind (AC): "#6895dd" - offshore wind ac: "#6895dd" - offwind-dc: "#74c6f2" - offshore wind (DC): "#74c6f2" - offshore wind dc: "#74c6f2" - offwind-float: "#b5e2fa" - offshore wind (Float): "#b5e2fa" - offshore wind float: "#b5e2fa" - # water - hydro: '#298c81' - hydro reservoir: '#298c81' - ror: '#3dbfb0' - run of river: '#3dbfb0' - hydroelectricity: '#298c81' - PHS: '#51dbcc' - hydro+PHS: "#08ad97" - # solar - solar: "#f9d002" - solar PV: "#f9d002" - solar-hsat: "#fdb915" - solar thermal: 
'#ffbf2b' - residential rural solar thermal: '#f1c069' - services rural solar thermal: '#eabf61' - residential urban decentral solar thermal: '#e5bc5a' - services urban decentral solar thermal: '#dfb953' - urban central solar thermal: '#d7b24c' - solar rooftop: '#ffea80' - # gas - OCGT: '#e0986c' - OCGT marginal: '#e0986c' - OCGT-heat: '#e0986c' - gas boiler: '#db6a25' - gas boilers: '#db6a25' - gas boiler marginal: '#db6a25' - residential rural gas boiler: '#d4722e' - residential urban decentral gas boiler: '#cb7a36' - services rural gas boiler: '#c4813f' - services urban decentral gas boiler: '#ba8947' - urban central gas boiler: '#b0904f' - gas: '#e05b09' - fossil gas: '#e05b09' - natural gas: '#e05b09' - biogas to gas: '#e36311' - biogas to gas CC: '#e51245' - CCGT: '#a85522' - CCGT marginal: '#a85522' - allam: '#B98F76' - gas for industry co2 to atmosphere: '#692e0a' - gas for industry co2 to stored: '#8a3400' - gas for industry: '#853403' - gas for industry CC: '#692e0a' - gas pipeline: '#ebbca0' - gas pipeline new: '#a87c62' - # oil - oil: '#c9c9c9' - imported oil: '#a3a3a3' - oil boiler: '#adadad' - residential rural oil boiler: '#a9a9a9' - services rural oil boiler: '#a5a5a5' - residential urban decentral oil boiler: '#a1a1a1' - urban central oil boiler: '#9d9d9d' - services urban decentral oil boiler: '#999999' - agriculture machinery oil: '#949494' - shipping oil: "#808080" - land transport oil: '#afafaf' - # nuclear - Nuclear: '#ff8c00' - Nuclear marginal: '#ff8c00' - nuclear: '#ff8c00' - uranium: '#ff8c00' - # coal - Coal: '#545454' - coal: '#545454' - Coal marginal: '#545454' - coal for industry: '#343434' - solid: '#545454' - Lignite: '#826837' - lignite: '#826837' - Lignite marginal: '#826837' - # biomass - biogas: '#e3d37d' - biomass: '#baa741' - solid biomass: '#baa741' - solid biomass transport: '#baa741' - solid biomass for industry: '#7a6d26' - solid biomass for industry CC: '#47411c' - solid biomass for industry co2 from atmosphere: '#736412' 
- solid biomass for industry co2 to stored: '#47411c' - urban central solid biomass CHP: '#9d9042' - urban central solid biomass CHP CC: '#6c5d28' - biomass boiler: '#8A9A5B' - residential rural biomass boiler: '#a1a066' - residential urban decentral biomass boiler: '#b0b87b' - services rural biomass boiler: '#c6cf98' - services urban decentral biomass boiler: '#dde5b5' - biomass to liquid: '#32CD32' - BioSNG: '#123456' - # power transmission - lines: '#6c9459' - transmission lines: '#6c9459' - electricity distribution grid: '#97ad8c' - low voltage: '#97ad8c' - # electricity demand - Electric load: '#110d63' - electric demand: '#110d63' - electricity: '#110d63' - industry electricity: '#2d2a66' - industry new electricity: '#2d2a66' - agriculture electricity: '#494778' - # battery + EVs - battery: '#ace37f' - battery storage: '#ace37f' - battery charger: '#88a75b' - battery discharger: '#5d4e29' - home battery: '#80c944' - home battery storage: '#80c944' - home battery charger: '#5e8032' - home battery discharger: '#3c5221' - BEV charger: '#baf238' - V2G: '#e5ffa8' - land transport EV: '#baf238' - land transport demand: '#38baf2' - Li ion: '#baf238' - # hot water storage - water tanks: '#e69487' - residential rural water tanks: '#f7b7a3' - services rural water tanks: '#f3afa3' - residential urban decentral water tanks: '#f2b2a3' - services urban decentral water tanks: '#f1b4a4' - urban central water tanks: '#e9977d' - hot water storage: '#e69487' - hot water charging: '#e8998b' - urban central water tanks charger: '#b57a67' - residential rural water tanks charger: '#b4887c' - residential urban decentral water tanks charger: '#b39995' - services rural water tanks charger: '#b3abb0' - services urban decentral water tanks charger: '#b3becc' - hot water discharging: '#e99c8e' - urban central water tanks discharger: '#b9816e' - residential rural water tanks discharger: '#ba9685' - residential urban decentral water tanks discharger: '#baac9e' - services rural water tanks 
discharger: '#bbc2b8' - services urban decentral water tanks discharger: '#bdd8d3' - # heat demand - Heat load: '#cc1f1f' - heat: '#cc1f1f' - heat vent: '#aa3344' - heat demand: '#cc1f1f' - rural heat: '#ff5c5c' - residential rural heat: '#ff7c7c' - services rural heat: '#ff9c9c' - central heat: '#cc1f1f' - urban central heat: '#d15959' - urban central heat vent: '#a74747' - decentral heat: '#750606' - residential urban decentral heat: '#a33c3c' - services urban decentral heat: '#cc1f1f' - low-temperature heat for industry: '#8f2727' - process heat: '#ff0000' - agriculture heat: '#d9a5a5' - # heat supply - heat pumps: '#2fb537' - heat pump: '#2fb537' - air heat pump: '#36eb41' - residential urban decentral air heat pump: '#48f74f' - services urban decentral air heat pump: '#5af95d' - services rural air heat pump: '#5af95d' - urban central air heat pump: '#6cfb6b' - ground heat pump: '#2fb537' - residential rural ground heat pump: '#48f74f' - residential rural air heat pump: '#48f74f' - services rural ground heat pump: '#5af95d' - Ambient: '#98eb9d' - CHP: '#8a5751' - urban central gas CHP: '#8d5e56' - CHP CC: '#634643' - urban central gas CHP CC: '#6e4e4c' - CHP heat: '#8a5751' - CHP electric: '#8a5751' - district heating: '#e8beac' - resistive heater: '#d8f9b8' - residential rural resistive heater: '#bef5b5' - residential urban decentral resistive heater: '#b2f1a9' - services rural resistive heater: '#a5ed9d' - services urban decentral resistive heater: '#98e991' - urban central resistive heater: '#8cdf85' - retrofitting: '#8487e8' - building retrofitting: '#8487e8' - # hydrogen - H2 for industry: "#f073da" - H2 for shipping: "#ebaee0" - H2: '#bf13a0' - hydrogen: '#bf13a0' - retrofitted H2 boiler: '#e5a0d9' - SMR: '#870c71' - SMR CC: '#4f1745' - H2 liquefaction: '#d647bd' - hydrogen storage: '#bf13a0' - H2 Store: '#bf13a0' - H2 storage: '#bf13a0' - land transport fuel cell: '#6b3161' - H2 pipeline: '#f081dc' - H2 pipeline retrofitted: '#ba99b5' - H2 Fuel Cell: 
'#c251ae' - H2 fuel cell: '#c251ae' - H2 turbine: '#991f83' - H2 Electrolysis: '#ff29d9' - H2 electrolysis: '#ff29d9' - # ammonia - NH3: '#46caf0' - ammonia: '#46caf0' - ammonia store: '#00ace0' - ammonia cracker: '#87d0e6' - Haber-Bosch: '#076987' - # syngas - Sabatier: '#9850ad' - methanation: '#c44ce6' - methane: '#c44ce6' - # synfuels - Fischer-Tropsch: '#25c49a' - liquid: '#25c49a' - kerosene for aviation: '#a1ffe6' - naphtha for industry: '#57ebc4' - methanolisation: '#83d6d5' - methanol: '#468c8b' - shipping methanol: '#468c8b' - industry methanol: '#468c8b' - # co2 - CC: '#f29dae' - CCS: '#f29dae' - CO2 sequestration: '#f29dae' - DAC: '#ff5270' - co2 stored: '#f2385a' - co2 sequestered: '#f2682f' - co2: '#f29dae' - co2 vent: '#ffd4dc' - CO2 pipeline: '#f5627f' - # emissions - process emissions CC: '#000000' - process emissions: '#222222' - process emissions to stored: '#444444' - process emissions to atmosphere: '#888888' - oil emissions: '#aaaaaa' - shipping oil emissions: "#555555" - shipping methanol emissions: '#666666' - land transport oil emissions: '#777777' - agriculture machinery oil emissions: '#333333' - # other - shipping: '#03a2ff' - power-to-heat: '#2fb537' - power-to-gas: '#c44ce6' - power-to-H2: '#ff29d9' - power-to-liquid: '#25c49a' - gas-to-power/heat: '#ee8340' - waste: '#e3d37d' - other: '#000000' - geothermal: '#ba91b1' - geothermal heat: '#ba91b1' - geothermal district heat: '#d19D00' - geothermal organic rankine cycle: '#ffbf00' - AC: "#70af1d" - AC-AC: "#70af1d" - AC line: "#70af1d" - links: "#8a1caf" - HVDC links: "#8a1caf" - DC: "#8a1caf" - DC-DC: "#8a1caf" - DC link: "#8a1caf" - load: "#dd2e23" - waste CHP: '#e3d37d' - waste CHP CC: '#e3d3ff' - HVC to air: 'k' From f87eec5dece86146f5837a8cc66f00d81a828a8e Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Thu, 15 Aug 2024 17:52:55 +0200 Subject: [PATCH 086/100] Bug fix: Carrier type of all supernodes corrected to 'AC' --- .sync-send | 3 + Snakefile | 2 +- config/config_backuo.yaml | 
1259 +++++++++++++++++++++++++++++++++++ scripts/simplify_network.py | 30 +- 4 files changed, 1281 insertions(+), 13 deletions(-) create mode 100644 config/config_backuo.yaml diff --git a/.sync-send b/.sync-send index 483c7a999..6fc8cb4c0 100644 --- a/.sync-send +++ b/.sync-send @@ -9,3 +9,6 @@ config/test envs matplotlibrc Snakefile +data/eez/ +data/naturalearth/ +resources/europe-nuts2-gridkit/ diff --git a/Snakefile b/Snakefile index 56a704dec..c45c7e58d 100644 --- a/Snakefile +++ b/Snakefile @@ -135,6 +135,6 @@ rule sync: shell: """ rsync -uvarh --ignore-missing-args --files-from=.sync-send . {params.cluster} - rsync -uvarh --no-g {params.cluster}/resources . || echo "No resources directory, skipping rsync" + # rsync -uvarh --no-g {params.cluster}/resources . || echo "No resources directory, skipping rsync" rsync -uvarh --no-g {params.cluster}/results . || echo "No results directory, skipping rsync" """ diff --git a/config/config_backuo.yaml b/config/config_backuo.yaml new file mode 100644 index 000000000..f45ea5be0 --- /dev/null +++ b/config/config_backuo.yaml @@ -0,0 +1,1259 @@ +# SPDX-FileCopyrightText: : 2017-2024 The PyPSA-Eur Authors +# +# SPDX-License-Identifier: CC0-1.0 + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#top-level-configuration +version: 0.11.0 +tutorial: false + +logging: + level: INFO + format: '%(levelname)s:%(name)s:%(message)s' + +private: + keys: + entsoe_api: + +remote: + ssh: zecm + path: ~/scratch/projects/pypsa-eur + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#run +run: + prefix: "" + name: "europe-nuts2-gridkit" + scenarios: + enable: false + file: config/scenarios.yaml + disable_progressbar: false + shared_resources: + policy: false + exclude: [] + shared_cutouts: true + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#foresight +foresight: overnight + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#scenario +# Wildcard docs 
in https://pypsa-eur.readthedocs.io/en/latest/wildcards.html +scenario: + simpl: + - '' + ll: + - v1.0 + clusters: + - 318 + opts: + - '' + sector_opts: + - '' + planning_horizons: + # - 2020 + - 2030 + # - 2040 + # - 2050 + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#countries +countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK', 'UA', 'MD'] + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#snapshots +snapshots: + start: "2013-01-01" + end: "2014-01-01" + inclusive: 'left' + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#enable +enable: + retrieve: auto + prepare_links_p_nom: false + retrieve_databundle: true + retrieve_cost_data: true + build_cutout: false + retrieve_cutout: true + custom_busmap: true + drop_leap_day: true + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget +co2_budget: + 2020: 0.701 + 2025: 0.524 + 2030: 0.297 + 2035: 0.150 + 2040: 0.071 + 2045: 0.032 + 2050: 0.000 + +electricity_network: + base_network: gridkit # Options: gridkit, osm-prebuilt, osm-raw (built from scratch using OSM data, takes longer) + osm_group_tolerance_buses: 5000 # unit: meters, default 5000 - Buses within this distance are grouped together + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines +lines: + types: # Specify voltages (keys) and line types (values) for lines + 200.: "Al/St 240/40 2-bundle 200.0" + 220.: "Al/St 240/40 2-bundle 220.0" + 300.: "Al/St 240/40 3-bundle 300.0" + 380.: "Al/St 240/40 4-bundle 380.0" + 500.: "Al/St 240/40 4-bundle 380.0" + 750.: "Al/St 560/50 4-bundle 750.0" + s_max_pu: 0.7 + s_nom_max: .inf + max_extension: 20000 #MW + length_factor: 1.25 + reconnect_crimea: true # Only needed for 'gridkit' base_network, in OSM, the lines are already connected + 
under_construction: 'keep' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity + dynamic_line_rating: + activate: false + cutout: europe-2013-sarah3-era5 + correction_factor: 0.95 + max_voltage_difference: false + max_line_rating: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#links +links: + p_max_pu: 1.0 + p_nom_max: .inf + max_extension: 30000 #MW + include_tyndp: false + under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#transformers +transformers: + x: 0.1 + s_nom: 2000. + type: '' + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity +electricity: + gaslimit_enable: false + gaslimit: false + co2limit_enable: false + co2limit: 7.75e+7 + co2base: 1.487e+9 + + operational_reserve: + activate: false + epsilon_load: 0.02 + epsilon_vres: 0.02 + contingency: 4000 + + max_hours: + battery: 6 + H2: 168 + + extendable_carriers: + Generator: [solar, solar-hsat, onwind, offwind-ac, offwind-dc, offwind-float, OCGT, CCGT] + StorageUnit: [] # battery, H2 + Store: [battery, H2] + Link: [] # H2 pipeline + + powerplants_filter: (DateOut >= 2023 or DateOut != DateOut) and not (Country == 'Germany' and Fueltype == 'Nuclear') + custom_powerplants: false + everywhere_powerplants: [] + + conventional_carriers: [nuclear, oil, OCGT, CCGT, coal, lignite, geothermal, biomass] + renewable_carriers: [solar, onwind, offwind-ac, offwind-dc, hydro] + + estimate_renewable_capacities: + enable: true + from_opsd: true + year: 2020 + expansion_limit: false + technology_mapping: + Offshore: [offwind-ac, offwind-dc, offwind-float] + Onshore: [onwind] + PV: [solar] + + autarky: + enable: false + by_country: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#atlite +atlite: + default_cutout: europe-2013-sarah3-era5 + nprocesses: 4 + show_progress: false 
+ cutouts: + # use 'base' to determine geographical bounds and time span from config + # base: + # module: era5 + europe-2013-sarah3-era5: + module: [sarah, era5] # in priority order + x: [-12., 42.] + y: [33., 72.] + dx: 0.3 + dy: 0.3 + time: ['2013', '2013'] + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#renewable +renewable: + onwind: + cutout: europe-2013-sarah3-era5 + resource: + method: wind + turbine: Vestas_V112_3MW + smooth: true + add_cutout_windspeed: true + capacity_per_sqkm: 3 + # correction_factor: 0.93 + corine: + grid_codes: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32] + distance: 1000 + distance_grid_codes: [1, 2, 3, 4, 5, 6] + luisa: false + # grid_codes: [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242] + # distance: 1000 + # distance_grid_codes: [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242] + natura: true + excluder_resolution: 100 + clip_p_max_pu: 1.e-2 + offwind-ac: + cutout: europe-2013-sarah3-era5 + resource: + method: wind + turbine: NREL_ReferenceTurbine_2020ATB_5.5MW + smooth: true + add_cutout_windspeed: true + capacity_per_sqkm: 2 + correction_factor: 0.8855 + corine: [44, 255] + luisa: false # [0, 5230] + natura: true + ship_threshold: 400 + max_depth: 60 + max_shore_distance: 30000 + excluder_resolution: 200 + clip_p_max_pu: 1.e-2 + offwind-dc: + cutout: europe-2013-sarah3-era5 + resource: + method: wind + turbine: NREL_ReferenceTurbine_2020ATB_5.5MW + smooth: true + add_cutout_windspeed: true + capacity_per_sqkm: 2 + correction_factor: 0.8855 + corine: [44, 255] + luisa: false # [0, 5230] + natura: true + ship_threshold: 400 + max_depth: 60 + min_shore_distance: 30000 + excluder_resolution: 200 + clip_p_max_pu: 1.e-2 + offwind-float: + cutout: europe-2013-sarah3-era5 + resource: + method: wind + turbine: NREL_ReferenceTurbine_5MW_offshore + smooth: true + add_cutout_windspeed: true + # ScholzPhd Tab 4.3.1: 10MW/km^2 + 
capacity_per_sqkm: 2 + correction_factor: 0.8855 + # proxy for wake losses + # from 10.1016/j.energy.2018.08.153 + # until done more rigorously in #153 + corine: [44, 255] + natura: true + ship_threshold: 400 + excluder_resolution: 200 + min_depth: 60 + max_depth: 1000 + clip_p_max_pu: 1.e-2 + solar: + cutout: europe-2013-sarah3-era5 + resource: + method: pv + panel: CSi + orientation: + slope: 35. + azimuth: 180. + capacity_per_sqkm: 5.1 + # correction_factor: 0.854337 + corine: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 26, 31, 32] + luisa: false # [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242, 1310, 1320, 1330, 1410, 1421, 1422, 2110, 2120, 2130, 2210, 2220, 2230, 2310, 2410, 2420, 3210, 3320, 3330] + natura: true + excluder_resolution: 100 + clip_p_max_pu: 1.e-2 + solar-hsat: + cutout: europe-2013-sarah3-era5 + resource: + method: pv + panel: CSi + orientation: + slope: 35. + azimuth: 180. + tracking: horizontal + capacity_per_sqkm: 4.43 # 15% higher land usage acc. 
to NREL + corine: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 26, 31, 32] + luisa: false # [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242, 1310, 1320, 1330, 1410, 1421, 1422, 2110, 2120, 2130, 2210, 2220, 2230, 2310, 2410, 2420, 3210, 3320, 3330] + natura: true + excluder_resolution: 100 + clip_p_max_pu: 1.e-2 + hydro: + cutout: europe-2013-sarah3-era5 + carriers: [ror, PHS, hydro] + PHS_max_hours: 6 + hydro_max_hours: "energy_capacity_totals_by_country" # one of energy_capacity_totals_by_country, estimate_by_large_installations or a float + flatten_dispatch: false + flatten_dispatch_buffer: 0.2 + clip_min_inflow: 1.0 + eia_norm_year: false + eia_correct_by_capacity: false + eia_approximate_missing: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#conventional +conventional: + unit_commitment: false + dynamic_fuel_price: false + nuclear: + p_max_pu: "data/nuclear_p_max_pu.csv" # float of file name + +# docs-load in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#load +load: + interpolate_limit: 3 + time_shift_for_large_gaps: 1w + manual_adjustments: true # false + scaling_factor: 1.0 + fixed_year: false # false or year (e.g. 2013) + supplement_synthetic: true + +# docs +# TODO: PyPSA-Eur merge issue in prepare_sector_network.py +# regulate what components with which carriers are kept from PyPSA-Eur; +# some technologies are removed because they are implemented differently +# (e.g. 
battery or H2 storage) or have different year-dependent costs +# in PyPSA-Eur-Sec +pypsa_eur: + Bus: + - AC + Link: + - DC + Generator: + - onwind + - offwind-ac + - offwind-dc + - offwind-float + - solar-hsat + - solar + - ror + - nuclear + StorageUnit: + - PHS + - hydro + Store: [] + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#energy +energy: + energy_totals_year: 2019 + base_emissions_year: 1990 + emissions: CO2 + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#biomass +biomass: + year: 2030 + scenario: ENS_Med + classes: + solid biomass: + - Agricultural waste + - Fuelwood residues + - Secondary Forestry residues - woodchips + - Sawdust + - Residues from landscape care + not included: + - Sugar from sugar beet + - Rape seed + - "Sunflower, soya seed " + - Bioethanol barley, wheat, grain maize, oats, other cereals and rye + - Miscanthus, switchgrass, RCG + - Willow + - Poplar + - FuelwoodRW + - C&P_RW + biogas: + - Manure solid, liquid + - Sludge + municipal solid waste: + - Municipal waste + share_unsustainable_use_retained: + 2020: 1 + 2025: 0.66 + 2030: 0.33 + 2035: 0 + 2040: 0 + 2045: 0 + 2050: 0 + share_sustainable_potential_available: + 2020: 0 + 2025: 0.33 + 2030: 0.66 + 2035: 1 + 2040: 1 + 2045: 1 + 2050: 1 + + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#solar-thermal +solar_thermal: + clearsky_model: simple # should be "simple" or "enhanced"? + orientation: + slope: 45. + azimuth: 180. 
+ cutout: default + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#existing-capacities +existing_capacities: + grouping_years_power: [1920, 1950, 1955, 1960, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2025] + grouping_years_heat: [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2019] # heat grouping years >= baseyear will be ignored + threshold_capacity: 10 + default_heating_lifetime: 20 + conventional_carriers: + - lignite + - coal + - oil + - uranium + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#sector +sector: + transport: true + heating: true + biomass: true + industry: true + agriculture: true + fossil_fuels: true + district_heating: + potential: 0.6 + progress: + 2020: 0.0 + 2025: 0.15 + 2030: 0.3 + 2035: 0.45 + 2040: 0.6 + 2045: 0.8 + 2050: 1.0 + district_heating_loss: 0.15 + forward_temperature: 90 #C + return_temperature: 50 #C + heat_source_cooling: 6 #K + heat_pump_cop_approximation: + refrigerant: ammonia + heat_exchanger_pinch_point_temperature_difference: 5 #K + isentropic_compressor_efficiency: 0.8 + heat_loss: 0.0 + heat_pump_sources: + urban central: + - air + urban decentral: + - air + rural: + - air + - ground + cluster_heat_buses: true + heat_demand_cutout: default + bev_dsm_restriction_value: 0.75 + bev_dsm_restriction_time: 7 + transport_heating_deadband_upper: 20. + transport_heating_deadband_lower: 15. 
+ ICE_lower_degree_factor: 0.375 + ICE_upper_degree_factor: 1.6 + EV_lower_degree_factor: 0.98 + EV_upper_degree_factor: 0.63 + bev_dsm: true + bev_availability: 0.5 + bev_energy: 0.05 + bev_charge_efficiency: 0.9 + bev_charge_rate: 0.011 + bev_avail_max: 0.95 + bev_avail_mean: 0.8 + v2g: true + land_transport_fuel_cell_share: + 2020: 0 + 2025: 0 + 2030: 0 + 2035: 0 + 2040: 0 + 2045: 0 + 2050: 0 + land_transport_electric_share: + 2020: 0 + 2025: 0.15 + 2030: 0.3 + 2035: 0.45 + 2040: 0.7 + 2045: 0.85 + 2050: 1 + land_transport_ice_share: + 2020: 1 + 2025: 0.85 + 2030: 0.7 + 2035: 0.55 + 2040: 0.3 + 2045: 0.15 + 2050: 0 + transport_electric_efficiency: 53.19 # 1 MWh_el = 53.19*100 km + transport_fuel_cell_efficiency: 30.003 # 1 MWh_H2 = 30.003*100 km + transport_ice_efficiency: 16.0712 # 1 MWh_oil = 16.0712 * 100 km + agriculture_machinery_electric_share: 0 + agriculture_machinery_oil_share: 1 + agriculture_machinery_fuel_efficiency: 0.7 + agriculture_machinery_electric_efficiency: 0.3 + MWh_MeOH_per_MWh_H2: 0.8787 + MWh_MeOH_per_tCO2: 4.0321 + MWh_MeOH_per_MWh_e: 3.6907 + shipping_hydrogen_liquefaction: false + shipping_hydrogen_share: + 2020: 0 + 2025: 0 + 2030: 0 + 2035: 0 + 2040: 0 + 2045: 0 + 2050: 0 + shipping_methanol_share: + 2020: 0 + 2025: 0.15 + 2030: 0.3 + 2035: 0.5 + 2040: 0.7 + 2045: 0.85 + 2050: 1 + shipping_oil_share: + 2020: 1 + 2025: 0.85 + 2030: 0.7 + 2035: 0.5 + 2040: 0.3 + 2045: 0.15 + 2050: 0 + shipping_methanol_efficiency: 0.46 + shipping_oil_efficiency: 0.40 + aviation_demand_factor: 1. + HVC_demand_factor: 1. + time_dep_hp_cop: true + heat_pump_sink_T_individual_heating: 55. 
+ reduce_space_heat_exogenously: true + reduce_space_heat_exogenously_factor: + 2020: 0.10 # this results in a space heat demand reduction of 10% + 2025: 0.09 # first heat demand increases compared to 2020 because of larger floor area per capita + 2030: 0.09 + 2035: 0.11 + 2040: 0.16 + 2045: 0.21 + 2050: 0.29 + retrofitting: + retro_endogen: false + cost_factor: 1.0 + interest_rate: 0.04 + annualise_cost: true + tax_weighting: false + construction_index: true + tes: true + tes_tau: + decentral: 3 + central: 180 + boilers: true + resistive_heaters: true + oil_boilers: false + biomass_boiler: true + overdimension_individual_heating: 1.1 #to cover demand peaks bigger than data + chp: true + micro_chp: false + solar_thermal: true + solar_cf_correction: 0.788457 # = >>> 1/1.2683 + marginal_cost_storage: 0. #1e-4 + methanation: true + coal_cc: false + dac: true + co2_vent: false + central_heat_vent: false + allam_cycle: false + hydrogen_fuel_cell: true + hydrogen_turbine: false + SMR: true + SMR_cc: true + regional_methanol_demand: false + regional_oil_demand: false + regional_coal_demand: false + regional_co2_sequestration_potential: + enable: false + attribute: + - conservative estimate Mt + - conservative estimate GAS Mt + - conservative estimate OIL Mt + - conservative estimate aquifer Mt + include_onshore: false + min_size: 3 + max_size: 25 + years_of_storage: 25 + co2_sequestration_potential: 200 + co2_sequestration_cost: 10 + co2_sequestration_lifetime: 50 + co2_spatial: false + co2network: false + co2_network_cost_factor: 1 + cc_fraction: 0.9 + hydrogen_underground_storage: true + hydrogen_underground_storage_locations: + # - onshore # more than 50 km from sea + - nearshore # within 50 km of sea + # - offshore + ammonia: false + min_part_load_fischer_tropsch: 0.5 + min_part_load_methanolisation: 0.3 + min_part_load_methanation: 0.3 + use_fischer_tropsch_waste_heat: 0.25 + use_haber_bosch_waste_heat: 0.25 + use_methanolisation_waste_heat: 0.25 + 
use_methanation_waste_heat: 0.25 + use_fuel_cell_waste_heat: 0.25 + use_electrolysis_waste_heat: 0.25 + electricity_transmission_grid: true + electricity_distribution_grid: true + electricity_distribution_grid_cost_factor: 1.0 + electricity_grid_connection: true + transmission_efficiency: + DC: + efficiency_static: 0.98 + efficiency_per_1000km: 0.977 + H2 pipeline: + efficiency_per_1000km: 1 # 0.982 + compression_per_1000km: 0.018 + gas pipeline: + efficiency_per_1000km: 1 #0.977 + compression_per_1000km: 0.01 + electricity distribution grid: + efficiency_static: 0.97 + H2_network: true + gas_network: false + H2_retrofit: false + H2_retrofit_capacity_per_CH4: 0.6 + gas_network_connectivity_upgrade: 1 + gas_distribution_grid: true + gas_distribution_grid_cost_factor: 1.0 + biomass_spatial: false + biomass_transport: false + biogas_upgrading_cc: false + conventional_generation: + OCGT: gas + biomass_to_liquid: false + electrobiofuels: false + biosng: false + municipal_solid_waste: false + limit_max_growth: + enable: false + # allowing 30% larger than max historic growth + factor: 1.3 + max_growth: # unit GW + onwind: 16 # onshore max grow so far 16 GW in Europe https://www.iea.org/reports/renewables-2020/wind + solar: 28 # solar max grow so far 28 GW in Europe https://www.iea.org/reports/renewables-2020/solar-pv + offwind-ac: 35 # offshore max grow so far 3.5 GW in Europe https://windeurope.org/about-wind/statistics/offshore/european-offshore-wind-industry-key-trends-statistics-2019/ + offwind-dc: 35 + max_relative_growth: + onwind: 3 + solar: 3 + offwind-ac: 3 + offwind-dc: 3 + enhanced_geothermal: + enable: false + flexible: true + max_hours: 240 + max_boost: 0.25 + var_cf: true + sustainability_factor: 0.0025 + solid_biomass_import: + enable: false + price: 54 #EUR/MWh + max_amount: 1390 # TWh + upstream_emissions_factor: .1 #share of solid biomass CO2 emissions at full combustion + + +# docs in 
https://pypsa-eur.readthedocs.io/en/latest/configuration.html#industry +industry: + St_primary_fraction: + 2020: 0.6 + 2025: 0.55 + 2030: 0.5 + 2035: 0.45 + 2040: 0.4 + 2045: 0.35 + 2050: 0.3 + DRI_fraction: + 2020: 0 + 2025: 0 + 2030: 0.05 + 2035: 0.2 + 2040: 0.4 + 2045: 0.7 + 2050: 1 + H2_DRI: 1.7 + elec_DRI: 0.322 + Al_primary_fraction: + 2020: 0.4 + 2025: 0.375 + 2030: 0.35 + 2035: 0.325 + 2040: 0.3 + 2045: 0.25 + 2050: 0.2 + MWh_NH3_per_tNH3: 5.166 + MWh_CH4_per_tNH3_SMR: 10.8 + MWh_elec_per_tNH3_SMR: 0.7 + MWh_H2_per_tNH3_electrolysis: 5.93 + MWh_elec_per_tNH3_electrolysis: 0.2473 + MWh_NH3_per_MWh_H2_cracker: 1.46 # https://github.com/euronion/trace/blob/44a5ff8401762edbef80eff9cfe5a47c8d3c8be4/data/efficiencies.csv + NH3_process_emissions: 24.5 + petrochemical_process_emissions: 25.5 + #HVC primary/recycling based on values used in Neumann et al https://doi.org/10.1016/j.joule.2023.06.016, linearly interpolated between 2020 and 2050 + #2020 recycling rates based on Agora https://static.agora-energiewende.de/fileadmin/Projekte/2021/2021_02_EU_CEAP/A-EW_254_Mobilising-circular-economy_study_WEB.pdf + #fractions refer to the total primary HVC production in 2020 + #assumes 6.7 Mtplastics produced from recycling in 2020 + HVC_primary_fraction: + 2020: 1.0 + 2025: 0.9 + 2030: 0.8 + 2035: 0.7 + 2040: 0.6 + 2045: 0.5 + 2050: 0.4 + HVC_mechanical_recycling_fraction: + 2020: 0.12 + 2025: 0.15 + 2030: 0.18 + 2035: 0.21 + 2040: 0.24 + 2045: 0.27 + 2050: 0.30 + HVC_chemical_recycling_fraction: + 2020: 0.0 + 2025: 0.0 + 2030: 0.04 + 2035: 0.08 + 2040: 0.12 + 2045: 0.16 + 2050: 0.20 + HVC_environment_sequestration_fraction: 0. + waste_to_energy: false + waste_to_energy_cc: false + sector_ratios_fraction_future: + 2020: 0.0 + 2025: 0.1 + 2030: 0.3 + 2035: 0.5 + 2040: 0.7 + 2045: 0.9 + 2050: 1.0 + basic_chemicals_without_NH3_production_today: 69. #Mt/a, = 86 Mtethylene-equiv - 17 MtNH3 + HVC_production_today: 52. 
+ MWh_elec_per_tHVC_mechanical_recycling: 0.547 + MWh_elec_per_tHVC_chemical_recycling: 6.9 + chlorine_production_today: 9.58 + MWh_elec_per_tCl: 3.6 + MWh_H2_per_tCl: -0.9372 + methanol_production_today: 1.5 + MWh_elec_per_tMeOH: 0.167 + MWh_CH4_per_tMeOH: 10.25 + MWh_MeOH_per_tMeOH: 5.528 + hotmaps_locate_missing: false + reference_year: 2019 + + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#costs +costs: + year: 2030 + version: v0.9.1 + social_discountrate: 0.02 + fill_values: + FOM: 0 + VOM: 0 + efficiency: 1 + fuel: 0 + investment: 0 + lifetime: 25 + "CO2 intensity": 0 + "discount rate": 0.07 + # Marginal and capital costs can be overwritten + # capital_cost: + # onwind: 500 + marginal_cost: + solar: 0.01 + onwind: 0.015 + offwind: 0.015 + hydro: 0. + H2: 0. + electrolysis: 0. + fuel cell: 0. + battery: 0. + battery inverter: 0. + emission_prices: + enable: true + co2: 100. + co2_monthly_prices: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#clustering +clustering: + focus_weights: false + simplify_network: + to_substations: false + algorithm: kmeans # choose from: [hac, kmeans] + feature: solar+onwind-time + exclude_carriers: [] + remove_stubs: true + remove_stubs_across_borders: true + cluster_network: + algorithm: kmeans + feature: solar+onwind-time + exclude_carriers: [] + consider_efficiency_classes: false + aggregation_strategies: + generators: + committable: any + ramp_limit_up: max + ramp_limit_down: max + temporal: + resolution_elec: 1H + resolution_sector: 1H + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#adjustments +adjustments: + electricity: false + sector: false + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#solving +solving: + #tmpdir: "path/to/tmp" + options: + clip_p_max_pu: 1.e-2 + load_shedding: false + curtailment_mode: false + noisy_costs: true + skip_iterations: true + rolling_horizon: false + seed: 123 + 
custom_extra_functionality: "../data/custom_extra_functionality.py" + # io_api: "direct" # Increases performance but only supported for the highs and gurobi solvers + # options that go into the optimize function + track_iterations: false + min_iterations: 2 + max_iterations: 3 + transmission_losses: 2 + linearized_unit_commitment: true + horizon: 365 + post_discretization: + enable: false + line_unit_size: 1700 + line_threshold: 0.3 + link_unit_size: + DC: 2000 + H2 pipeline: 1200 + gas pipeline: 1500 + link_threshold: + DC: 0.3 + H2 pipeline: 0.3 + gas pipeline: 0.3 + + agg_p_nom_limits: + agg_offwind: false + include_existing: false + file: data/agg_p_nom_minmax.csv + + constraints: + CCL: false + EQ: false + BAU: false + SAFE: false + + solver: + name: gurobi + options: gurobi-default + + solver_options: + highs-default: + # refer to https://ergo-code.github.io/HiGHS/dev/options/definitions/ + threads: 1 + solver: "ipm" + run_crossover: "off" + small_matrix_value: 1e-6 + large_matrix_value: 1e9 + primal_feasibility_tolerance: 1e-5 + dual_feasibility_tolerance: 1e-5 + ipm_optimality_tolerance: 1e-4 + parallel: "on" + random_seed: 123 + gurobi-default: + threads: 8 + method: 2 # barrier + crossover: 0 + BarConvTol: 1.e-6 + Seed: 123 + AggFill: 0 + PreDual: 0 + GURO_PAR_BARDENSETHRESH: 200 + gurobi-numeric-focus: + NumericFocus: 3 # Favour numeric stability over speed + method: 2 # barrier + crossover: 0 # do not use crossover + BarHomogeneous: 1 # Use homogeneous barrier if standard does not converge + BarConvTol: 1.e-5 + FeasibilityTol: 1.e-4 + OptimalityTol: 1.e-4 + ObjScale: -0.5 + threads: 8 + Seed: 123 + gurobi-fallback: # Use gurobi defaults + crossover: 0 + method: 2 # barrier + BarHomogeneous: 1 # Use homogeneous barrier if standard does not converge + BarConvTol: 1.e-5 + FeasibilityTol: 1.e-5 + OptimalityTol: 1.e-5 + Seed: 123 + threads: 8 + cplex-default: + threads: 4 + lpmethod: 4 # barrier + solutiontype: 2 # non basic solution, ie no crossover + 
barrier.convergetol: 1.e-5 + feasopt.tolerance: 1.e-6 + copt-default: + Threads: 8 + LpMethod: 2 + Crossover: 0 + RelGap: 1.e-6 + Dualize: 0 + copt-gpu: + LpMethod: 6 + GPUMode: 1 + PDLPTol: 1.e-5 + Crossover: 0 + cbc-default: {} # Used in CI + glpk-default: {} # Used in CI + + mem_mb: 140000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 + runtime: 60h #runtime in humanfriendly style https://humanfriendly.readthedocs.io/en/latest/ + + +# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#plotting +plotting: + map: + boundaries: [-11, 30, 34, 71] + color_geomap: + ocean: white + land: white + projection: + name: "EqualEarth" + # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: + # name: "LambertConformal" + # central_longitude: 10. + # central_latitude: 50. + # standard_parallels: [35, 65] + eu_node_location: + x: -5.5 + y: 46. + costs_max: 1000 + costs_threshold: 1 + energy_max: 20000 + energy_min: -20000 + energy_threshold: 50. 
+ + nice_names: + OCGT: "Open-Cycle Gas" + CCGT: "Combined-Cycle Gas" + offwind-ac: "Offshore Wind (AC)" + offwind-dc: "Offshore Wind (DC)" + offwind-float: "Offshore Wind (Floating)" + onwind: "Onshore Wind" + solar: "Solar" + PHS: "Pumped Hydro Storage" + hydro: "Reservoir & Dam" + battery: "Battery Storage" + H2: "Hydrogen Storage" + lines: "Transmission Lines" + ror: "Run of River" + load: "Load Shedding" + ac: "AC" + dc: "DC" + + tech_colors: + # wind + onwind: "#235ebc" + onshore wind: "#235ebc" + offwind: "#6895dd" + offshore wind: "#6895dd" + offwind-ac: "#6895dd" + offshore wind (AC): "#6895dd" + offshore wind ac: "#6895dd" + offwind-dc: "#74c6f2" + offshore wind (DC): "#74c6f2" + offshore wind dc: "#74c6f2" + offwind-float: "#b5e2fa" + offshore wind (Float): "#b5e2fa" + offshore wind float: "#b5e2fa" + # water + hydro: '#298c81' + hydro reservoir: '#298c81' + ror: '#3dbfb0' + run of river: '#3dbfb0' + hydroelectricity: '#298c81' + PHS: '#51dbcc' + hydro+PHS: "#08ad97" + # solar + solar: "#f9d002" + solar PV: "#f9d002" + solar-hsat: "#fdb915" + solar thermal: '#ffbf2b' + residential rural solar thermal: '#f1c069' + services rural solar thermal: '#eabf61' + residential urban decentral solar thermal: '#e5bc5a' + services urban decentral solar thermal: '#dfb953' + urban central solar thermal: '#d7b24c' + solar rooftop: '#ffea80' + # gas + OCGT: '#e0986c' + OCGT marginal: '#e0986c' + OCGT-heat: '#e0986c' + gas boiler: '#db6a25' + gas boilers: '#db6a25' + gas boiler marginal: '#db6a25' + residential rural gas boiler: '#d4722e' + residential urban decentral gas boiler: '#cb7a36' + services rural gas boiler: '#c4813f' + services urban decentral gas boiler: '#ba8947' + urban central gas boiler: '#b0904f' + gas: '#e05b09' + fossil gas: '#e05b09' + natural gas: '#e05b09' + biogas to gas: '#e36311' + biogas to gas CC: '#e51245' + CCGT: '#a85522' + CCGT marginal: '#a85522' + allam: '#B98F76' + gas for industry co2 to atmosphere: '#692e0a' + gas for industry co2 to 
stored: '#8a3400' + gas for industry: '#853403' + gas for industry CC: '#692e0a' + gas pipeline: '#ebbca0' + gas pipeline new: '#a87c62' + # oil + oil: '#c9c9c9' + imported oil: '#a3a3a3' + oil boiler: '#adadad' + residential rural oil boiler: '#a9a9a9' + services rural oil boiler: '#a5a5a5' + residential urban decentral oil boiler: '#a1a1a1' + urban central oil boiler: '#9d9d9d' + services urban decentral oil boiler: '#999999' + agriculture machinery oil: '#949494' + shipping oil: "#808080" + land transport oil: '#afafaf' + # nuclear + Nuclear: '#ff8c00' + Nuclear marginal: '#ff8c00' + nuclear: '#ff8c00' + uranium: '#ff8c00' + # coal + Coal: '#545454' + coal: '#545454' + Coal marginal: '#545454' + coal for industry: '#343434' + solid: '#545454' + Lignite: '#826837' + lignite: '#826837' + Lignite marginal: '#826837' + # biomass + biogas: '#e3d37d' + biomass: '#baa741' + solid biomass: '#baa741' + municipal solid waste: '#91ba41' + solid biomass import: '#d5ca8d' + solid biomass transport: '#baa741' + solid biomass for industry: '#7a6d26' + solid biomass for industry CC: '#47411c' + solid biomass for industry co2 from atmosphere: '#736412' + solid biomass for industry co2 to stored: '#47411c' + urban central solid biomass CHP: '#9d9042' + urban central solid biomass CHP CC: '#6c5d28' + biomass boiler: '#8A9A5B' + residential rural biomass boiler: '#a1a066' + residential urban decentral biomass boiler: '#b0b87b' + services rural biomass boiler: '#c6cf98' + services urban decentral biomass boiler: '#dde5b5' + biomass to liquid: '#32CD32' + unsustainable bioliquids: '#32CD32' + electrobiofuels: 'red' + BioSNG: '#123456' + # power transmission + lines: '#6c9459' + transmission lines: '#6c9459' + electricity distribution grid: '#97ad8c' + low voltage: '#97ad8c' + # electricity demand + Electric load: '#110d63' + electric demand: '#110d63' + electricity: '#110d63' + industry electricity: '#2d2a66' + industry new electricity: '#2d2a66' + agriculture electricity: '#494778' 
+ # battery + EVs + battery: '#ace37f' + battery storage: '#ace37f' + battery charger: '#88a75b' + battery discharger: '#5d4e29' + home battery: '#80c944' + home battery storage: '#80c944' + home battery charger: '#5e8032' + home battery discharger: '#3c5221' + BEV charger: '#baf238' + V2G: '#e5ffa8' + land transport EV: '#baf238' + land transport demand: '#38baf2' + EV battery: '#baf238' + # hot water storage + water tanks: '#e69487' + residential rural water tanks: '#f7b7a3' + services rural water tanks: '#f3afa3' + residential urban decentral water tanks: '#f2b2a3' + services urban decentral water tanks: '#f1b4a4' + urban central water tanks: '#e9977d' + hot water storage: '#e69487' + hot water charging: '#e8998b' + urban central water tanks charger: '#b57a67' + residential rural water tanks charger: '#b4887c' + residential urban decentral water tanks charger: '#b39995' + services rural water tanks charger: '#b3abb0' + services urban decentral water tanks charger: '#b3becc' + hot water discharging: '#e99c8e' + urban central water tanks discharger: '#b9816e' + residential rural water tanks discharger: '#ba9685' + residential urban decentral water tanks discharger: '#baac9e' + services rural water tanks discharger: '#bbc2b8' + services urban decentral water tanks discharger: '#bdd8d3' + # heat demand + Heat load: '#cc1f1f' + heat: '#cc1f1f' + heat vent: '#aa3344' + heat demand: '#cc1f1f' + rural heat: '#ff5c5c' + residential rural heat: '#ff7c7c' + services rural heat: '#ff9c9c' + central heat: '#cc1f1f' + urban central heat: '#d15959' + urban central heat vent: '#a74747' + decentral heat: '#750606' + residential urban decentral heat: '#a33c3c' + services urban decentral heat: '#cc1f1f' + low-temperature heat for industry: '#8f2727' + process heat: '#ff0000' + agriculture heat: '#d9a5a5' + # heat supply + heat pumps: '#2fb537' + heat pump: '#2fb537' + air heat pump: '#36eb41' + residential urban decentral air heat pump: '#48f74f' + services urban decentral air 
heat pump: '#5af95d' + services rural air heat pump: '#5af95d' + urban central air heat pump: '#6cfb6b' + ground heat pump: '#2fb537' + residential rural ground heat pump: '#48f74f' + residential rural air heat pump: '#48f74f' + services rural ground heat pump: '#5af95d' + Ambient: '#98eb9d' + CHP: '#8a5751' + urban central gas CHP: '#8d5e56' + CHP CC: '#634643' + urban central gas CHP CC: '#6e4e4c' + CHP heat: '#8a5751' + CHP electric: '#8a5751' + district heating: '#e8beac' + resistive heater: '#d8f9b8' + residential rural resistive heater: '#bef5b5' + residential urban decentral resistive heater: '#b2f1a9' + services rural resistive heater: '#a5ed9d' + services urban decentral resistive heater: '#98e991' + urban central resistive heater: '#8cdf85' + retrofitting: '#8487e8' + building retrofitting: '#8487e8' + # hydrogen + H2 for industry: "#f073da" + H2 for shipping: "#ebaee0" + H2: '#bf13a0' + hydrogen: '#bf13a0' + retrofitted H2 boiler: '#e5a0d9' + SMR: '#870c71' + SMR CC: '#4f1745' + H2 liquefaction: '#d647bd' + hydrogen storage: '#bf13a0' + H2 Store: '#bf13a0' + H2 storage: '#bf13a0' + land transport fuel cell: '#6b3161' + H2 pipeline: '#f081dc' + H2 pipeline retrofitted: '#ba99b5' + H2 Fuel Cell: '#c251ae' + H2 fuel cell: '#c251ae' + H2 turbine: '#991f83' + H2 Electrolysis: '#ff29d9' + H2 electrolysis: '#ff29d9' + # ammonia + NH3: '#46caf0' + ammonia: '#46caf0' + ammonia store: '#00ace0' + ammonia cracker: '#87d0e6' + Haber-Bosch: '#076987' + # syngas + Sabatier: '#9850ad' + methanation: '#c44ce6' + methane: '#c44ce6' + # synfuels + Fischer-Tropsch: '#25c49a' + liquid: '#25c49a' + kerosene for aviation: '#a1ffe6' + naphtha for industry: '#57ebc4' + methanolisation: '#83d6d5' + methanol: '#468c8b' + shipping methanol: '#468c8b' + industry methanol: '#468c8b' + # co2 + CC: '#f29dae' + CCS: '#f29dae' + CO2 sequestration: '#f29dae' + DAC: '#ff5270' + co2 stored: '#f2385a' + co2 sequestered: '#f2682f' + co2: '#f29dae' + co2 vent: '#ffd4dc' + CO2 pipeline: 
'#f5627f' + # emissions + process emissions CC: '#000000' + process emissions: '#222222' + process emissions to stored: '#444444' + process emissions to atmosphere: '#888888' + oil emissions: '#aaaaaa' + shipping oil emissions: "#555555" + shipping methanol emissions: '#666666' + land transport oil emissions: '#777777' + agriculture machinery oil emissions: '#333333' + # other + shipping: '#03a2ff' + power-to-heat: '#2fb537' + power-to-gas: '#c44ce6' + power-to-H2: '#ff29d9' + power-to-liquid: '#25c49a' + gas-to-power/heat: '#ee8340' + waste: '#e3d37d' + other: '#000000' + geothermal: '#ba91b1' + geothermal heat: '#ba91b1' + geothermal district heat: '#d19D00' + geothermal organic rankine cycle: '#ffbf00' + AC: "#70af1d" + AC-AC: "#70af1d" + AC line: "#70af1d" + links: "#8a1caf" + HVDC links: "#8a1caf" + DC: "#8a1caf" + DC-DC: "#8a1caf" + DC link: "#8a1caf" + load: "#dd2e23" + waste CHP: '#e3d37d' + waste CHP CC: '#e3d3ff' + HVC to air: 'k' diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py index 651e8ea29..a2c32b61d 100644 --- a/scripts/simplify_network.py +++ b/scripts/simplify_network.py @@ -301,29 +301,20 @@ def simplify_links( # Only span graph over the DC link components G = n.graph(branch_components=["Link"]) - def split_links(nodes): + def split_links(nodes, added_supernodes=None): nodes = frozenset(nodes) seen = set() - # Corsica substation - node_corsica = find_closest_bus( - n, - x=9.44802, - y=42.52842, - tol=2000, # Tolerance needed to only return the bus if the region is actually modelled - ) - # Supernodes are endpoints of links, identified by having lass then two neighbours or being an AC Bus # An example for the latter is if two different links are connected to the same AC bus. 
- # Manually keep Corsica substation as a supernode supernodes = { m for m in nodes if ( (len(G.adj[m]) < 2 or (set(G.adj[m]) - nodes)) or (n.buses.loc[m, "carrier"] == "AC") - or (m == node_corsica) + or (m in added_supernodes) ) } @@ -360,8 +351,20 @@ def split_links(nodes): 0.0, index=n.buses.index, columns=list(connection_costs_per_link) ) + node_corsica = find_closest_bus( + n, + x=9.44802, + y=42.52842, + tol=2000, # Tolerance needed to only return the bus if the region is actually modelled + ) + + added_supernodes = [] + added_supernodes.append(node_corsica) + for lbl in labels.value_counts().loc[lambda s: s > 2].index: - for b, buses, links in split_links(labels.index[labels == lbl]): + for b, buses, links in split_links( + labels.index[labels == lbl], added_supernodes + ): if len(buses) <= 2: continue @@ -422,6 +425,9 @@ def split_links(nodes): logger.debug("Collecting all components using the busmap") + # Change carrier type of all added super_nodes to "AC" + n.buses.loc[added_supernodes, "carrier"] = "AC" + _aggregate_and_move_components( n, busmap, From e097fc4ce12efb39bc35644776e99148415c7b10 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Thu, 15 Aug 2024 18:10:51 +0200 Subject: [PATCH 087/100] Bug fix: Carrier type of all supernodes corrected to 'AC' --- scripts/simplify_network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py index a2c32b61d..119445c42 100644 --- a/scripts/simplify_network.py +++ b/scripts/simplify_network.py @@ -359,7 +359,8 @@ def split_links(nodes, added_supernodes=None): ) added_supernodes = [] - added_supernodes.append(node_corsica) + if node_corsica is not None: + added_supernodes.append(node_corsica) for lbl in labels.value_counts().loc[lambda s: s > 2].index: for b, buses, links in split_links( From 864321e3c88bb7e12f4a9915a0cf4d95a32681e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> 
Date: Mon, 19 Aug 2024 08:50:40 +0000 Subject: [PATCH 088/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/build_industry_sector_ratios.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/build_industry_sector_ratios.py b/scripts/build_industry_sector_ratios.py index 530ac910f..c2438f915 100644 --- a/scripts/build_industry_sector_ratios.py +++ b/scripts/build_industry_sector_ratios.py @@ -445,7 +445,9 @@ def chemicals_industry(): # subtract ammonia energy demand (in ktNH3/a) ammonia = pd.read_csv(snakemake.input.ammonia_production, index_col=0) - ammonia_total = ammonia.loc[ammonia.index.intersection(eu27), str(max(2018, year))].sum() + ammonia_total = ammonia.loc[ + ammonia.index.intersection(eu27), str(max(2018, year)) + ].sum() df.loc["methane", sector] -= ammonia_total * params["MWh_CH4_per_tNH3_SMR"] df.loc["elec", sector] -= ammonia_total * params["MWh_elec_per_tNH3_SMR"] From 9b663452914e888f10db29af6e5dd21543092514 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Mon, 19 Aug 2024 12:18:25 +0200 Subject: [PATCH 089/100] Updated rules and base_network for compatibility with TYNDP projects. 
--- rules/build_electricity.smk | 5 ----- scripts/base_network.py | 5 ----- 2 files changed, 10 deletions(-) diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 184523013..b0de316eb 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -118,11 +118,6 @@ rule base_network: if config_provider("electricity_network", "base_network")(w) == "gridkit" else [] ), - links_tyndp=lambda w: ( - "data/links_tyndp.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else [] - ), country_shapes=resources("country_shapes.geojson"), offshore_shapes=resources("offshore_shapes.geojson"), europe_shape=resources("europe_shape.geojson"), diff --git a/scripts/base_network.py b/scripts/base_network.py index 49ef72812..38c949e6e 100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -829,11 +829,6 @@ def base_network( else: raise ValueError("base_network must be either 'gridkit' or 'osm'") - if config["links"].get("include_tyndp") & ( - config["electricity_network"].get("base_network") == "gridkit" - ): - buses, links = _add_links_from_tyndp(buses, links, links_tyndp, europe_shape) - if config["electricity_network"].get("base_network") == "gridkit": converters = _load_converters_from_eg(buses, eg_converters) elif "osm" in config["electricity_network"].get("base_network"): From 412acd88f2b9cc4144d9e6c56790b8d4254369b2 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Mon, 19 Aug 2024 15:54:48 +0200 Subject: [PATCH 090/100] Updated Zenodo repository and prebuilt network to include 150 kV HVDC connections. 
--- rules/retrieve.smk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 1cf53f785..371100d6e 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -415,14 +415,14 @@ if config["enable"]["retrieve"] and ( rule retrieve_osm_prebuilt: input: - buses=storage("https://zenodo.org/records/12799202/files/buses.csv"), + buses=storage("https://zenodo.org/records/13342577/files/buses.csv"), converters=storage( - "https://zenodo.org/records/12799202/files/converters.csv" + "https://zenodo.org/records/13342577/files/converters.csv" ), - lines=storage("https://zenodo.org/records/12799202/files/lines.csv"), - links=storage("https://zenodo.org/records/12799202/files/links.csv"), + lines=storage("https://zenodo.org/records/13342577/files/lines.csv"), + links=storage("https://zenodo.org/records/13342577/files/links.csv"), transformers=storage( - "https://zenodo.org/records/12799202/files/transformers.csv" + "https://zenodo.org/records/13342577/files/transformers.csv" ), output: buses="data/osm/prebuilt/buses.csv", From f1526fb00ab4e7fbf54f09a8d0f224e4ea515cc0 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Tue, 20 Aug 2024 11:20:43 +0200 Subject: [PATCH 091/100] Removed outdated config backup. 
--- config/config_backuo.yaml | 1259 ------------------------------------- 1 file changed, 1259 deletions(-) delete mode 100644 config/config_backuo.yaml diff --git a/config/config_backuo.yaml b/config/config_backuo.yaml deleted file mode 100644 index f45ea5be0..000000000 --- a/config/config_backuo.yaml +++ /dev/null @@ -1,1259 +0,0 @@ -# SPDX-FileCopyrightText: : 2017-2024 The PyPSA-Eur Authors -# -# SPDX-License-Identifier: CC0-1.0 - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#top-level-configuration -version: 0.11.0 -tutorial: false - -logging: - level: INFO - format: '%(levelname)s:%(name)s:%(message)s' - -private: - keys: - entsoe_api: - -remote: - ssh: zecm - path: ~/scratch/projects/pypsa-eur - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#run -run: - prefix: "" - name: "europe-nuts2-gridkit" - scenarios: - enable: false - file: config/scenarios.yaml - disable_progressbar: false - shared_resources: - policy: false - exclude: [] - shared_cutouts: true - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#foresight -foresight: overnight - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#scenario -# Wildcard docs in https://pypsa-eur.readthedocs.io/en/latest/wildcards.html -scenario: - simpl: - - '' - ll: - - v1.0 - clusters: - - 318 - opts: - - '' - sector_opts: - - '' - planning_horizons: - # - 2020 - - 2030 - # - 2040 - # - 2050 - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#countries -countries: ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK', 'UA', 'MD'] - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#snapshots -snapshots: - start: "2013-01-01" - end: "2014-01-01" - inclusive: 'left' - -# docs in 
https://pypsa-eur.readthedocs.io/en/latest/configuration.html#enable -enable: - retrieve: auto - prepare_links_p_nom: false - retrieve_databundle: true - retrieve_cost_data: true - build_cutout: false - retrieve_cutout: true - custom_busmap: true - drop_leap_day: true - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#co2-budget -co2_budget: - 2020: 0.701 - 2025: 0.524 - 2030: 0.297 - 2035: 0.150 - 2040: 0.071 - 2045: 0.032 - 2050: 0.000 - -electricity_network: - base_network: gridkit # Options: gridkit, osm-prebuilt, osm-raw (built from scratch using OSM data, takes longer) - osm_group_tolerance_buses: 5000 # unit: meters, default 5000 - Buses within this distance are grouped together - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines -lines: - types: # Specify voltages (keys) and line types (values) for lines - 200.: "Al/St 240/40 2-bundle 200.0" - 220.: "Al/St 240/40 2-bundle 220.0" - 300.: "Al/St 240/40 3-bundle 300.0" - 380.: "Al/St 240/40 4-bundle 380.0" - 500.: "Al/St 240/40 4-bundle 380.0" - 750.: "Al/St 560/50 4-bundle 750.0" - s_max_pu: 0.7 - s_nom_max: .inf - max_extension: 20000 #MW - length_factor: 1.25 - reconnect_crimea: true # Only needed for 'gridkit' base_network, in OSM, the lines are already connected - under_construction: 'keep' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity - dynamic_line_rating: - activate: false - cutout: europe-2013-sarah3-era5 - correction_factor: 0.95 - max_voltage_difference: false - max_line_rating: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#links -links: - p_max_pu: 1.0 - p_nom_max: .inf - max_extension: 30000 #MW - include_tyndp: false - under_construction: 'zero' # 'zero': set capacity to zero, 'remove': remove, 'keep': with full capacity - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#transformers -transformers: - x: 0.1 - s_nom: 2000. 
- type: '' - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity -electricity: - gaslimit_enable: false - gaslimit: false - co2limit_enable: false - co2limit: 7.75e+7 - co2base: 1.487e+9 - - operational_reserve: - activate: false - epsilon_load: 0.02 - epsilon_vres: 0.02 - contingency: 4000 - - max_hours: - battery: 6 - H2: 168 - - extendable_carriers: - Generator: [solar, solar-hsat, onwind, offwind-ac, offwind-dc, offwind-float, OCGT, CCGT] - StorageUnit: [] # battery, H2 - Store: [battery, H2] - Link: [] # H2 pipeline - - powerplants_filter: (DateOut >= 2023 or DateOut != DateOut) and not (Country == 'Germany' and Fueltype == 'Nuclear') - custom_powerplants: false - everywhere_powerplants: [] - - conventional_carriers: [nuclear, oil, OCGT, CCGT, coal, lignite, geothermal, biomass] - renewable_carriers: [solar, onwind, offwind-ac, offwind-dc, hydro] - - estimate_renewable_capacities: - enable: true - from_opsd: true - year: 2020 - expansion_limit: false - technology_mapping: - Offshore: [offwind-ac, offwind-dc, offwind-float] - Onshore: [onwind] - PV: [solar] - - autarky: - enable: false - by_country: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#atlite -atlite: - default_cutout: europe-2013-sarah3-era5 - nprocesses: 4 - show_progress: false - cutouts: - # use 'base' to determine geographical bounds and time span from config - # base: - # module: era5 - europe-2013-sarah3-era5: - module: [sarah, era5] # in priority order - x: [-12., 42.] - y: [33., 72.] 
- dx: 0.3 - dy: 0.3 - time: ['2013', '2013'] - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#renewable -renewable: - onwind: - cutout: europe-2013-sarah3-era5 - resource: - method: wind - turbine: Vestas_V112_3MW - smooth: true - add_cutout_windspeed: true - capacity_per_sqkm: 3 - # correction_factor: 0.93 - corine: - grid_codes: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32] - distance: 1000 - distance_grid_codes: [1, 2, 3, 4, 5, 6] - luisa: false - # grid_codes: [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242] - # distance: 1000 - # distance_grid_codes: [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242] - natura: true - excluder_resolution: 100 - clip_p_max_pu: 1.e-2 - offwind-ac: - cutout: europe-2013-sarah3-era5 - resource: - method: wind - turbine: NREL_ReferenceTurbine_2020ATB_5.5MW - smooth: true - add_cutout_windspeed: true - capacity_per_sqkm: 2 - correction_factor: 0.8855 - corine: [44, 255] - luisa: false # [0, 5230] - natura: true - ship_threshold: 400 - max_depth: 60 - max_shore_distance: 30000 - excluder_resolution: 200 - clip_p_max_pu: 1.e-2 - offwind-dc: - cutout: europe-2013-sarah3-era5 - resource: - method: wind - turbine: NREL_ReferenceTurbine_2020ATB_5.5MW - smooth: true - add_cutout_windspeed: true - capacity_per_sqkm: 2 - correction_factor: 0.8855 - corine: [44, 255] - luisa: false # [0, 5230] - natura: true - ship_threshold: 400 - max_depth: 60 - min_shore_distance: 30000 - excluder_resolution: 200 - clip_p_max_pu: 1.e-2 - offwind-float: - cutout: europe-2013-sarah3-era5 - resource: - method: wind - turbine: NREL_ReferenceTurbine_5MW_offshore - smooth: true - add_cutout_windspeed: true - # ScholzPhd Tab 4.3.1: 10MW/km^2 - capacity_per_sqkm: 2 - correction_factor: 0.8855 - # proxy for wake losses - # from 10.1016/j.energy.2018.08.153 - # until done more rigorously in #153 - corine: [44, 255] - natura: true - ship_threshold: 400 - excluder_resolution: 
200 - min_depth: 60 - max_depth: 1000 - clip_p_max_pu: 1.e-2 - solar: - cutout: europe-2013-sarah3-era5 - resource: - method: pv - panel: CSi - orientation: - slope: 35. - azimuth: 180. - capacity_per_sqkm: 5.1 - # correction_factor: 0.854337 - corine: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 26, 31, 32] - luisa: false # [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242, 1310, 1320, 1330, 1410, 1421, 1422, 2110, 2120, 2130, 2210, 2220, 2230, 2310, 2410, 2420, 3210, 3320, 3330] - natura: true - excluder_resolution: 100 - clip_p_max_pu: 1.e-2 - solar-hsat: - cutout: europe-2013-sarah3-era5 - resource: - method: pv - panel: CSi - orientation: - slope: 35. - azimuth: 180. - tracking: horizontal - capacity_per_sqkm: 4.43 # 15% higher land usage acc. to NREL - corine: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 26, 31, 32] - luisa: false # [1111, 1121, 1122, 1123, 1130, 1210, 1221, 1222, 1230, 1241, 1242, 1310, 1320, 1330, 1410, 1421, 1422, 2110, 2120, 2130, 2210, 2220, 2230, 2310, 2410, 2420, 3210, 3320, 3330] - natura: true - excluder_resolution: 100 - clip_p_max_pu: 1.e-2 - hydro: - cutout: europe-2013-sarah3-era5 - carriers: [ror, PHS, hydro] - PHS_max_hours: 6 - hydro_max_hours: "energy_capacity_totals_by_country" # one of energy_capacity_totals_by_country, estimate_by_large_installations or a float - flatten_dispatch: false - flatten_dispatch_buffer: 0.2 - clip_min_inflow: 1.0 - eia_norm_year: false - eia_correct_by_capacity: false - eia_approximate_missing: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#conventional -conventional: - unit_commitment: false - dynamic_fuel_price: false - nuclear: - p_max_pu: "data/nuclear_p_max_pu.csv" # float of file name - -# docs-load in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#load -load: - interpolate_limit: 3 - time_shift_for_large_gaps: 1w - manual_adjustments: true # false - scaling_factor: 1.0 - 
fixed_year: false # false or year (e.g. 2013) - supplement_synthetic: true - -# docs -# TODO: PyPSA-Eur merge issue in prepare_sector_network.py -# regulate what components with which carriers are kept from PyPSA-Eur; -# some technologies are removed because they are implemented differently -# (e.g. battery or H2 storage) or have different year-dependent costs -# in PyPSA-Eur-Sec -pypsa_eur: - Bus: - - AC - Link: - - DC - Generator: - - onwind - - offwind-ac - - offwind-dc - - offwind-float - - solar-hsat - - solar - - ror - - nuclear - StorageUnit: - - PHS - - hydro - Store: [] - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#energy -energy: - energy_totals_year: 2019 - base_emissions_year: 1990 - emissions: CO2 - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#biomass -biomass: - year: 2030 - scenario: ENS_Med - classes: - solid biomass: - - Agricultural waste - - Fuelwood residues - - Secondary Forestry residues - woodchips - - Sawdust - - Residues from landscape care - not included: - - Sugar from sugar beet - - Rape seed - - "Sunflower, soya seed " - - Bioethanol barley, wheat, grain maize, oats, other cereals and rye - - Miscanthus, switchgrass, RCG - - Willow - - Poplar - - FuelwoodRW - - C&P_RW - biogas: - - Manure solid, liquid - - Sludge - municipal solid waste: - - Municipal waste - share_unsustainable_use_retained: - 2020: 1 - 2025: 0.66 - 2030: 0.33 - 2035: 0 - 2040: 0 - 2045: 0 - 2050: 0 - share_sustainable_potential_available: - 2020: 0 - 2025: 0.33 - 2030: 0.66 - 2035: 1 - 2040: 1 - 2045: 1 - 2050: 1 - - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#solar-thermal -solar_thermal: - clearsky_model: simple # should be "simple" or "enhanced"? - orientation: - slope: 45. - azimuth: 180. 
- cutout: default - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#existing-capacities -existing_capacities: - grouping_years_power: [1920, 1950, 1955, 1960, 1965, 1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 2025] - grouping_years_heat: [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015, 2019] # heat grouping years >= baseyear will be ignored - threshold_capacity: 10 - default_heating_lifetime: 20 - conventional_carriers: - - lignite - - coal - - oil - - uranium - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#sector -sector: - transport: true - heating: true - biomass: true - industry: true - agriculture: true - fossil_fuels: true - district_heating: - potential: 0.6 - progress: - 2020: 0.0 - 2025: 0.15 - 2030: 0.3 - 2035: 0.45 - 2040: 0.6 - 2045: 0.8 - 2050: 1.0 - district_heating_loss: 0.15 - forward_temperature: 90 #C - return_temperature: 50 #C - heat_source_cooling: 6 #K - heat_pump_cop_approximation: - refrigerant: ammonia - heat_exchanger_pinch_point_temperature_difference: 5 #K - isentropic_compressor_efficiency: 0.8 - heat_loss: 0.0 - heat_pump_sources: - urban central: - - air - urban decentral: - - air - rural: - - air - - ground - cluster_heat_buses: true - heat_demand_cutout: default - bev_dsm_restriction_value: 0.75 - bev_dsm_restriction_time: 7 - transport_heating_deadband_upper: 20. - transport_heating_deadband_lower: 15. 
- ICE_lower_degree_factor: 0.375 - ICE_upper_degree_factor: 1.6 - EV_lower_degree_factor: 0.98 - EV_upper_degree_factor: 0.63 - bev_dsm: true - bev_availability: 0.5 - bev_energy: 0.05 - bev_charge_efficiency: 0.9 - bev_charge_rate: 0.011 - bev_avail_max: 0.95 - bev_avail_mean: 0.8 - v2g: true - land_transport_fuel_cell_share: - 2020: 0 - 2025: 0 - 2030: 0 - 2035: 0 - 2040: 0 - 2045: 0 - 2050: 0 - land_transport_electric_share: - 2020: 0 - 2025: 0.15 - 2030: 0.3 - 2035: 0.45 - 2040: 0.7 - 2045: 0.85 - 2050: 1 - land_transport_ice_share: - 2020: 1 - 2025: 0.85 - 2030: 0.7 - 2035: 0.55 - 2040: 0.3 - 2045: 0.15 - 2050: 0 - transport_electric_efficiency: 53.19 # 1 MWh_el = 53.19*100 km - transport_fuel_cell_efficiency: 30.003 # 1 MWh_H2 = 30.003*100 km - transport_ice_efficiency: 16.0712 # 1 MWh_oil = 16.0712 * 100 km - agriculture_machinery_electric_share: 0 - agriculture_machinery_oil_share: 1 - agriculture_machinery_fuel_efficiency: 0.7 - agriculture_machinery_electric_efficiency: 0.3 - MWh_MeOH_per_MWh_H2: 0.8787 - MWh_MeOH_per_tCO2: 4.0321 - MWh_MeOH_per_MWh_e: 3.6907 - shipping_hydrogen_liquefaction: false - shipping_hydrogen_share: - 2020: 0 - 2025: 0 - 2030: 0 - 2035: 0 - 2040: 0 - 2045: 0 - 2050: 0 - shipping_methanol_share: - 2020: 0 - 2025: 0.15 - 2030: 0.3 - 2035: 0.5 - 2040: 0.7 - 2045: 0.85 - 2050: 1 - shipping_oil_share: - 2020: 1 - 2025: 0.85 - 2030: 0.7 - 2035: 0.5 - 2040: 0.3 - 2045: 0.15 - 2050: 0 - shipping_methanol_efficiency: 0.46 - shipping_oil_efficiency: 0.40 - aviation_demand_factor: 1. - HVC_demand_factor: 1. - time_dep_hp_cop: true - heat_pump_sink_T_individual_heating: 55. 
- reduce_space_heat_exogenously: true - reduce_space_heat_exogenously_factor: - 2020: 0.10 # this results in a space heat demand reduction of 10% - 2025: 0.09 # first heat demand increases compared to 2020 because of larger floor area per capita - 2030: 0.09 - 2035: 0.11 - 2040: 0.16 - 2045: 0.21 - 2050: 0.29 - retrofitting: - retro_endogen: false - cost_factor: 1.0 - interest_rate: 0.04 - annualise_cost: true - tax_weighting: false - construction_index: true - tes: true - tes_tau: - decentral: 3 - central: 180 - boilers: true - resistive_heaters: true - oil_boilers: false - biomass_boiler: true - overdimension_individual_heating: 1.1 #to cover demand peaks bigger than data - chp: true - micro_chp: false - solar_thermal: true - solar_cf_correction: 0.788457 # = >>> 1/1.2683 - marginal_cost_storage: 0. #1e-4 - methanation: true - coal_cc: false - dac: true - co2_vent: false - central_heat_vent: false - allam_cycle: false - hydrogen_fuel_cell: true - hydrogen_turbine: false - SMR: true - SMR_cc: true - regional_methanol_demand: false - regional_oil_demand: false - regional_coal_demand: false - regional_co2_sequestration_potential: - enable: false - attribute: - - conservative estimate Mt - - conservative estimate GAS Mt - - conservative estimate OIL Mt - - conservative estimate aquifer Mt - include_onshore: false - min_size: 3 - max_size: 25 - years_of_storage: 25 - co2_sequestration_potential: 200 - co2_sequestration_cost: 10 - co2_sequestration_lifetime: 50 - co2_spatial: false - co2network: false - co2_network_cost_factor: 1 - cc_fraction: 0.9 - hydrogen_underground_storage: true - hydrogen_underground_storage_locations: - # - onshore # more than 50 km from sea - - nearshore # within 50 km of sea - # - offshore - ammonia: false - min_part_load_fischer_tropsch: 0.5 - min_part_load_methanolisation: 0.3 - min_part_load_methanation: 0.3 - use_fischer_tropsch_waste_heat: 0.25 - use_haber_bosch_waste_heat: 0.25 - use_methanolisation_waste_heat: 0.25 - 
use_methanation_waste_heat: 0.25 - use_fuel_cell_waste_heat: 0.25 - use_electrolysis_waste_heat: 0.25 - electricity_transmission_grid: true - electricity_distribution_grid: true - electricity_distribution_grid_cost_factor: 1.0 - electricity_grid_connection: true - transmission_efficiency: - DC: - efficiency_static: 0.98 - efficiency_per_1000km: 0.977 - H2 pipeline: - efficiency_per_1000km: 1 # 0.982 - compression_per_1000km: 0.018 - gas pipeline: - efficiency_per_1000km: 1 #0.977 - compression_per_1000km: 0.01 - electricity distribution grid: - efficiency_static: 0.97 - H2_network: true - gas_network: false - H2_retrofit: false - H2_retrofit_capacity_per_CH4: 0.6 - gas_network_connectivity_upgrade: 1 - gas_distribution_grid: true - gas_distribution_grid_cost_factor: 1.0 - biomass_spatial: false - biomass_transport: false - biogas_upgrading_cc: false - conventional_generation: - OCGT: gas - biomass_to_liquid: false - electrobiofuels: false - biosng: false - municipal_solid_waste: false - limit_max_growth: - enable: false - # allowing 30% larger than max historic growth - factor: 1.3 - max_growth: # unit GW - onwind: 16 # onshore max grow so far 16 GW in Europe https://www.iea.org/reports/renewables-2020/wind - solar: 28 # solar max grow so far 28 GW in Europe https://www.iea.org/reports/renewables-2020/solar-pv - offwind-ac: 35 # offshore max grow so far 3.5 GW in Europe https://windeurope.org/about-wind/statistics/offshore/european-offshore-wind-industry-key-trends-statistics-2019/ - offwind-dc: 35 - max_relative_growth: - onwind: 3 - solar: 3 - offwind-ac: 3 - offwind-dc: 3 - enhanced_geothermal: - enable: false - flexible: true - max_hours: 240 - max_boost: 0.25 - var_cf: true - sustainability_factor: 0.0025 - solid_biomass_import: - enable: false - price: 54 #EUR/MWh - max_amount: 1390 # TWh - upstream_emissions_factor: .1 #share of solid biomass CO2 emissions at full combustion - - -# docs in 
https://pypsa-eur.readthedocs.io/en/latest/configuration.html#industry -industry: - St_primary_fraction: - 2020: 0.6 - 2025: 0.55 - 2030: 0.5 - 2035: 0.45 - 2040: 0.4 - 2045: 0.35 - 2050: 0.3 - DRI_fraction: - 2020: 0 - 2025: 0 - 2030: 0.05 - 2035: 0.2 - 2040: 0.4 - 2045: 0.7 - 2050: 1 - H2_DRI: 1.7 - elec_DRI: 0.322 - Al_primary_fraction: - 2020: 0.4 - 2025: 0.375 - 2030: 0.35 - 2035: 0.325 - 2040: 0.3 - 2045: 0.25 - 2050: 0.2 - MWh_NH3_per_tNH3: 5.166 - MWh_CH4_per_tNH3_SMR: 10.8 - MWh_elec_per_tNH3_SMR: 0.7 - MWh_H2_per_tNH3_electrolysis: 5.93 - MWh_elec_per_tNH3_electrolysis: 0.2473 - MWh_NH3_per_MWh_H2_cracker: 1.46 # https://github.com/euronion/trace/blob/44a5ff8401762edbef80eff9cfe5a47c8d3c8be4/data/efficiencies.csv - NH3_process_emissions: 24.5 - petrochemical_process_emissions: 25.5 - #HVC primary/recycling based on values used in Neumann et al https://doi.org/10.1016/j.joule.2023.06.016, linearly interpolated between 2020 and 2050 - #2020 recycling rates based on Agora https://static.agora-energiewende.de/fileadmin/Projekte/2021/2021_02_EU_CEAP/A-EW_254_Mobilising-circular-economy_study_WEB.pdf - #fractions refer to the total primary HVC production in 2020 - #assumes 6.7 Mtplastics produced from recycling in 2020 - HVC_primary_fraction: - 2020: 1.0 - 2025: 0.9 - 2030: 0.8 - 2035: 0.7 - 2040: 0.6 - 2045: 0.5 - 2050: 0.4 - HVC_mechanical_recycling_fraction: - 2020: 0.12 - 2025: 0.15 - 2030: 0.18 - 2035: 0.21 - 2040: 0.24 - 2045: 0.27 - 2050: 0.30 - HVC_chemical_recycling_fraction: - 2020: 0.0 - 2025: 0.0 - 2030: 0.04 - 2035: 0.08 - 2040: 0.12 - 2045: 0.16 - 2050: 0.20 - HVC_environment_sequestration_fraction: 0. - waste_to_energy: false - waste_to_energy_cc: false - sector_ratios_fraction_future: - 2020: 0.0 - 2025: 0.1 - 2030: 0.3 - 2035: 0.5 - 2040: 0.7 - 2045: 0.9 - 2050: 1.0 - basic_chemicals_without_NH3_production_today: 69. #Mt/a, = 86 Mtethylene-equiv - 17 MtNH3 - HVC_production_today: 52. 
- MWh_elec_per_tHVC_mechanical_recycling: 0.547 - MWh_elec_per_tHVC_chemical_recycling: 6.9 - chlorine_production_today: 9.58 - MWh_elec_per_tCl: 3.6 - MWh_H2_per_tCl: -0.9372 - methanol_production_today: 1.5 - MWh_elec_per_tMeOH: 0.167 - MWh_CH4_per_tMeOH: 10.25 - MWh_MeOH_per_tMeOH: 5.528 - hotmaps_locate_missing: false - reference_year: 2019 - - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#costs -costs: - year: 2030 - version: v0.9.1 - social_discountrate: 0.02 - fill_values: - FOM: 0 - VOM: 0 - efficiency: 1 - fuel: 0 - investment: 0 - lifetime: 25 - "CO2 intensity": 0 - "discount rate": 0.07 - # Marginal and capital costs can be overwritten - # capital_cost: - # onwind: 500 - marginal_cost: - solar: 0.01 - onwind: 0.015 - offwind: 0.015 - hydro: 0. - H2: 0. - electrolysis: 0. - fuel cell: 0. - battery: 0. - battery inverter: 0. - emission_prices: - enable: true - co2: 100. - co2_monthly_prices: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#clustering -clustering: - focus_weights: false - simplify_network: - to_substations: false - algorithm: kmeans # choose from: [hac, kmeans] - feature: solar+onwind-time - exclude_carriers: [] - remove_stubs: true - remove_stubs_across_borders: true - cluster_network: - algorithm: kmeans - feature: solar+onwind-time - exclude_carriers: [] - consider_efficiency_classes: false - aggregation_strategies: - generators: - committable: any - ramp_limit_up: max - ramp_limit_down: max - temporal: - resolution_elec: 1H - resolution_sector: 1H - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#adjustments -adjustments: - electricity: false - sector: false - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#solving -solving: - #tmpdir: "path/to/tmp" - options: - clip_p_max_pu: 1.e-2 - load_shedding: false - curtailment_mode: false - noisy_costs: true - skip_iterations: true - rolling_horizon: false - seed: 123 - 
custom_extra_functionality: "../data/custom_extra_functionality.py" - # io_api: "direct" # Increases performance but only supported for the highs and gurobi solvers - # options that go into the optimize function - track_iterations: false - min_iterations: 2 - max_iterations: 3 - transmission_losses: 2 - linearized_unit_commitment: true - horizon: 365 - post_discretization: - enable: false - line_unit_size: 1700 - line_threshold: 0.3 - link_unit_size: - DC: 2000 - H2 pipeline: 1200 - gas pipeline: 1500 - link_threshold: - DC: 0.3 - H2 pipeline: 0.3 - gas pipeline: 0.3 - - agg_p_nom_limits: - agg_offwind: false - include_existing: false - file: data/agg_p_nom_minmax.csv - - constraints: - CCL: false - EQ: false - BAU: false - SAFE: false - - solver: - name: gurobi - options: gurobi-default - - solver_options: - highs-default: - # refer to https://ergo-code.github.io/HiGHS/dev/options/definitions/ - threads: 1 - solver: "ipm" - run_crossover: "off" - small_matrix_value: 1e-6 - large_matrix_value: 1e9 - primal_feasibility_tolerance: 1e-5 - dual_feasibility_tolerance: 1e-5 - ipm_optimality_tolerance: 1e-4 - parallel: "on" - random_seed: 123 - gurobi-default: - threads: 8 - method: 2 # barrier - crossover: 0 - BarConvTol: 1.e-6 - Seed: 123 - AggFill: 0 - PreDual: 0 - GURO_PAR_BARDENSETHRESH: 200 - gurobi-numeric-focus: - NumericFocus: 3 # Favour numeric stability over speed - method: 2 # barrier - crossover: 0 # do not use crossover - BarHomogeneous: 1 # Use homogeneous barrier if standard does not converge - BarConvTol: 1.e-5 - FeasibilityTol: 1.e-4 - OptimalityTol: 1.e-4 - ObjScale: -0.5 - threads: 8 - Seed: 123 - gurobi-fallback: # Use gurobi defaults - crossover: 0 - method: 2 # barrier - BarHomogeneous: 1 # Use homogeneous barrier if standard does not converge - BarConvTol: 1.e-5 - FeasibilityTol: 1.e-5 - OptimalityTol: 1.e-5 - Seed: 123 - threads: 8 - cplex-default: - threads: 4 - lpmethod: 4 # barrier - solutiontype: 2 # non basic solution, ie no crossover - 
barrier.convergetol: 1.e-5 - feasopt.tolerance: 1.e-6 - copt-default: - Threads: 8 - LpMethod: 2 - Crossover: 0 - RelGap: 1.e-6 - Dualize: 0 - copt-gpu: - LpMethod: 6 - GPUMode: 1 - PDLPTol: 1.e-5 - Crossover: 0 - cbc-default: {} # Used in CI - glpk-default: {} # Used in CI - - mem_mb: 140000 #memory in MB; 20 GB enough for 50+B+I+H2; 100 GB for 181+B+I+H2 - runtime: 60h #runtime in humanfriendly style https://humanfriendly.readthedocs.io/en/latest/ - - -# docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#plotting -plotting: - map: - boundaries: [-11, 30, 34, 71] - color_geomap: - ocean: white - land: white - projection: - name: "EqualEarth" - # See https://scitools.org.uk/cartopy/docs/latest/reference/projections.html for alternatives, for example: - # name: "LambertConformal" - # central_longitude: 10. - # central_latitude: 50. - # standard_parallels: [35, 65] - eu_node_location: - x: -5.5 - y: 46. - costs_max: 1000 - costs_threshold: 1 - energy_max: 20000 - energy_min: -20000 - energy_threshold: 50. 
- - nice_names: - OCGT: "Open-Cycle Gas" - CCGT: "Combined-Cycle Gas" - offwind-ac: "Offshore Wind (AC)" - offwind-dc: "Offshore Wind (DC)" - offwind-float: "Offshore Wind (Floating)" - onwind: "Onshore Wind" - solar: "Solar" - PHS: "Pumped Hydro Storage" - hydro: "Reservoir & Dam" - battery: "Battery Storage" - H2: "Hydrogen Storage" - lines: "Transmission Lines" - ror: "Run of River" - load: "Load Shedding" - ac: "AC" - dc: "DC" - - tech_colors: - # wind - onwind: "#235ebc" - onshore wind: "#235ebc" - offwind: "#6895dd" - offshore wind: "#6895dd" - offwind-ac: "#6895dd" - offshore wind (AC): "#6895dd" - offshore wind ac: "#6895dd" - offwind-dc: "#74c6f2" - offshore wind (DC): "#74c6f2" - offshore wind dc: "#74c6f2" - offwind-float: "#b5e2fa" - offshore wind (Float): "#b5e2fa" - offshore wind float: "#b5e2fa" - # water - hydro: '#298c81' - hydro reservoir: '#298c81' - ror: '#3dbfb0' - run of river: '#3dbfb0' - hydroelectricity: '#298c81' - PHS: '#51dbcc' - hydro+PHS: "#08ad97" - # solar - solar: "#f9d002" - solar PV: "#f9d002" - solar-hsat: "#fdb915" - solar thermal: '#ffbf2b' - residential rural solar thermal: '#f1c069' - services rural solar thermal: '#eabf61' - residential urban decentral solar thermal: '#e5bc5a' - services urban decentral solar thermal: '#dfb953' - urban central solar thermal: '#d7b24c' - solar rooftop: '#ffea80' - # gas - OCGT: '#e0986c' - OCGT marginal: '#e0986c' - OCGT-heat: '#e0986c' - gas boiler: '#db6a25' - gas boilers: '#db6a25' - gas boiler marginal: '#db6a25' - residential rural gas boiler: '#d4722e' - residential urban decentral gas boiler: '#cb7a36' - services rural gas boiler: '#c4813f' - services urban decentral gas boiler: '#ba8947' - urban central gas boiler: '#b0904f' - gas: '#e05b09' - fossil gas: '#e05b09' - natural gas: '#e05b09' - biogas to gas: '#e36311' - biogas to gas CC: '#e51245' - CCGT: '#a85522' - CCGT marginal: '#a85522' - allam: '#B98F76' - gas for industry co2 to atmosphere: '#692e0a' - gas for industry co2 to 
stored: '#8a3400' - gas for industry: '#853403' - gas for industry CC: '#692e0a' - gas pipeline: '#ebbca0' - gas pipeline new: '#a87c62' - # oil - oil: '#c9c9c9' - imported oil: '#a3a3a3' - oil boiler: '#adadad' - residential rural oil boiler: '#a9a9a9' - services rural oil boiler: '#a5a5a5' - residential urban decentral oil boiler: '#a1a1a1' - urban central oil boiler: '#9d9d9d' - services urban decentral oil boiler: '#999999' - agriculture machinery oil: '#949494' - shipping oil: "#808080" - land transport oil: '#afafaf' - # nuclear - Nuclear: '#ff8c00' - Nuclear marginal: '#ff8c00' - nuclear: '#ff8c00' - uranium: '#ff8c00' - # coal - Coal: '#545454' - coal: '#545454' - Coal marginal: '#545454' - coal for industry: '#343434' - solid: '#545454' - Lignite: '#826837' - lignite: '#826837' - Lignite marginal: '#826837' - # biomass - biogas: '#e3d37d' - biomass: '#baa741' - solid biomass: '#baa741' - municipal solid waste: '#91ba41' - solid biomass import: '#d5ca8d' - solid biomass transport: '#baa741' - solid biomass for industry: '#7a6d26' - solid biomass for industry CC: '#47411c' - solid biomass for industry co2 from atmosphere: '#736412' - solid biomass for industry co2 to stored: '#47411c' - urban central solid biomass CHP: '#9d9042' - urban central solid biomass CHP CC: '#6c5d28' - biomass boiler: '#8A9A5B' - residential rural biomass boiler: '#a1a066' - residential urban decentral biomass boiler: '#b0b87b' - services rural biomass boiler: '#c6cf98' - services urban decentral biomass boiler: '#dde5b5' - biomass to liquid: '#32CD32' - unsustainable bioliquids: '#32CD32' - electrobiofuels: 'red' - BioSNG: '#123456' - # power transmission - lines: '#6c9459' - transmission lines: '#6c9459' - electricity distribution grid: '#97ad8c' - low voltage: '#97ad8c' - # electricity demand - Electric load: '#110d63' - electric demand: '#110d63' - electricity: '#110d63' - industry electricity: '#2d2a66' - industry new electricity: '#2d2a66' - agriculture electricity: '#494778' 
- # battery + EVs - battery: '#ace37f' - battery storage: '#ace37f' - battery charger: '#88a75b' - battery discharger: '#5d4e29' - home battery: '#80c944' - home battery storage: '#80c944' - home battery charger: '#5e8032' - home battery discharger: '#3c5221' - BEV charger: '#baf238' - V2G: '#e5ffa8' - land transport EV: '#baf238' - land transport demand: '#38baf2' - EV battery: '#baf238' - # hot water storage - water tanks: '#e69487' - residential rural water tanks: '#f7b7a3' - services rural water tanks: '#f3afa3' - residential urban decentral water tanks: '#f2b2a3' - services urban decentral water tanks: '#f1b4a4' - urban central water tanks: '#e9977d' - hot water storage: '#e69487' - hot water charging: '#e8998b' - urban central water tanks charger: '#b57a67' - residential rural water tanks charger: '#b4887c' - residential urban decentral water tanks charger: '#b39995' - services rural water tanks charger: '#b3abb0' - services urban decentral water tanks charger: '#b3becc' - hot water discharging: '#e99c8e' - urban central water tanks discharger: '#b9816e' - residential rural water tanks discharger: '#ba9685' - residential urban decentral water tanks discharger: '#baac9e' - services rural water tanks discharger: '#bbc2b8' - services urban decentral water tanks discharger: '#bdd8d3' - # heat demand - Heat load: '#cc1f1f' - heat: '#cc1f1f' - heat vent: '#aa3344' - heat demand: '#cc1f1f' - rural heat: '#ff5c5c' - residential rural heat: '#ff7c7c' - services rural heat: '#ff9c9c' - central heat: '#cc1f1f' - urban central heat: '#d15959' - urban central heat vent: '#a74747' - decentral heat: '#750606' - residential urban decentral heat: '#a33c3c' - services urban decentral heat: '#cc1f1f' - low-temperature heat for industry: '#8f2727' - process heat: '#ff0000' - agriculture heat: '#d9a5a5' - # heat supply - heat pumps: '#2fb537' - heat pump: '#2fb537' - air heat pump: '#36eb41' - residential urban decentral air heat pump: '#48f74f' - services urban decentral air 
heat pump: '#5af95d' - services rural air heat pump: '#5af95d' - urban central air heat pump: '#6cfb6b' - ground heat pump: '#2fb537' - residential rural ground heat pump: '#48f74f' - residential rural air heat pump: '#48f74f' - services rural ground heat pump: '#5af95d' - Ambient: '#98eb9d' - CHP: '#8a5751' - urban central gas CHP: '#8d5e56' - CHP CC: '#634643' - urban central gas CHP CC: '#6e4e4c' - CHP heat: '#8a5751' - CHP electric: '#8a5751' - district heating: '#e8beac' - resistive heater: '#d8f9b8' - residential rural resistive heater: '#bef5b5' - residential urban decentral resistive heater: '#b2f1a9' - services rural resistive heater: '#a5ed9d' - services urban decentral resistive heater: '#98e991' - urban central resistive heater: '#8cdf85' - retrofitting: '#8487e8' - building retrofitting: '#8487e8' - # hydrogen - H2 for industry: "#f073da" - H2 for shipping: "#ebaee0" - H2: '#bf13a0' - hydrogen: '#bf13a0' - retrofitted H2 boiler: '#e5a0d9' - SMR: '#870c71' - SMR CC: '#4f1745' - H2 liquefaction: '#d647bd' - hydrogen storage: '#bf13a0' - H2 Store: '#bf13a0' - H2 storage: '#bf13a0' - land transport fuel cell: '#6b3161' - H2 pipeline: '#f081dc' - H2 pipeline retrofitted: '#ba99b5' - H2 Fuel Cell: '#c251ae' - H2 fuel cell: '#c251ae' - H2 turbine: '#991f83' - H2 Electrolysis: '#ff29d9' - H2 electrolysis: '#ff29d9' - # ammonia - NH3: '#46caf0' - ammonia: '#46caf0' - ammonia store: '#00ace0' - ammonia cracker: '#87d0e6' - Haber-Bosch: '#076987' - # syngas - Sabatier: '#9850ad' - methanation: '#c44ce6' - methane: '#c44ce6' - # synfuels - Fischer-Tropsch: '#25c49a' - liquid: '#25c49a' - kerosene for aviation: '#a1ffe6' - naphtha for industry: '#57ebc4' - methanolisation: '#83d6d5' - methanol: '#468c8b' - shipping methanol: '#468c8b' - industry methanol: '#468c8b' - # co2 - CC: '#f29dae' - CCS: '#f29dae' - CO2 sequestration: '#f29dae' - DAC: '#ff5270' - co2 stored: '#f2385a' - co2 sequestered: '#f2682f' - co2: '#f29dae' - co2 vent: '#ffd4dc' - CO2 pipeline: 
'#f5627f' - # emissions - process emissions CC: '#000000' - process emissions: '#222222' - process emissions to stored: '#444444' - process emissions to atmosphere: '#888888' - oil emissions: '#aaaaaa' - shipping oil emissions: "#555555" - shipping methanol emissions: '#666666' - land transport oil emissions: '#777777' - agriculture machinery oil emissions: '#333333' - # other - shipping: '#03a2ff' - power-to-heat: '#2fb537' - power-to-gas: '#c44ce6' - power-to-H2: '#ff29d9' - power-to-liquid: '#25c49a' - gas-to-power/heat: '#ee8340' - waste: '#e3d37d' - other: '#000000' - geothermal: '#ba91b1' - geothermal heat: '#ba91b1' - geothermal district heat: '#d19D00' - geothermal organic rankine cycle: '#ffbf00' - AC: "#70af1d" - AC-AC: "#70af1d" - AC line: "#70af1d" - links: "#8a1caf" - HVDC links: "#8a1caf" - DC: "#8a1caf" - DC-DC: "#8a1caf" - DC link: "#8a1caf" - load: "#dd2e23" - waste CHP: '#e3d37d' - waste CHP CC: '#e3d3ff' - HVC to air: 'k' From 7be521704e094ddc061513136cd7c2ed87c3c379 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Wed, 21 Aug 2024 16:35:07 +0200 Subject: [PATCH 092/100] Implemented all comments from PR #1079. Cleaned up OSM implementation. 
--- .sync-send | 3 - Snakefile | 3 +- config/config.default.yaml | 7 +- doc/configtables/electricity.csv | 3 +- doc/configtables/electricity_network.csv | 3 - doc/release_notes.rst | 4 +- rules/build_electricity.smk | 100 +-- rules/development.smk | 36 +- rules/retrieve.smk | 61 +- scripts/_helpers.py | 21 - scripts/add_electricity.py | 7 +- scripts/base_network.py | 296 ++++----- scripts/build_osm_network.py | 737 +++++------------------ scripts/clean_osm_data.py | 62 +- scripts/prepare_osm_network_release.py | 114 ++-- scripts/retrieve_gdp_uamd.py | 34 -- 16 files changed, 429 insertions(+), 1062 deletions(-) delete mode 100644 doc/configtables/electricity_network.csv delete mode 100644 scripts/retrieve_gdp_uamd.py diff --git a/.sync-send b/.sync-send index 6fc8cb4c0..483c7a999 100644 --- a/.sync-send +++ b/.sync-send @@ -9,6 +9,3 @@ config/test envs matplotlibrc Snakefile -data/eez/ -data/naturalearth/ -resources/europe-nuts2-gridkit/ diff --git a/Snakefile b/Snakefile index c45c7e58d..eb99437bf 100644 --- a/Snakefile +++ b/Snakefile @@ -135,6 +135,7 @@ rule sync: shell: """ rsync -uvarh --ignore-missing-args --files-from=.sync-send . {params.cluster} - # rsync -uvarh --no-g {params.cluster}/resources . || echo "No resources directory, skipping rsync" + rsync -uvarh --no-g {params.cluster}/resources . || echo "No resources directory, skipping rsync" rsync -uvarh --no-g {params.cluster}/results . || echo "No results directory, skipping rsync" + rsync -uvarh --no-g {params.cluster}/logs . 
|| echo "No logs directory, skipping rsync" """ diff --git a/config/config.default.yaml b/config/config.default.yaml index e229e1969..4067246ee 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -84,12 +84,10 @@ co2_budget: 2045: 0.032 2050: 0.000 -electricity_network: - base_network: gridkit # Options: gridkit, osm-prebuilt, osm-raw (built from scratch using OSM data, takes longer) - osm_group_tolerance_buses: 5000 # unit: meters, default 5000 - Buses within this distance are grouped together - # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity electricity: + voltages: [200., 300., 380., 500., 750.] + base_network: entsoegridkit gaslimit_enable: false gaslimit: false co2limit_enable: false @@ -278,6 +276,7 @@ conventional: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#lines lines: types: + 200.: "Al/St 240/40 2-bundle 200.0" 220.: "Al/St 240/40 2-bundle 220.0" 300.: "Al/St 240/40 3-bundle 300.0" 380.: "Al/St 240/40 4-bundle 380.0" diff --git a/doc/configtables/electricity.csv b/doc/configtables/electricity.csv index ee733660c..9bad7bfc6 100644 --- a/doc/configtables/electricity.csv +++ b/doc/configtables/electricity.csv @@ -1,5 +1,6 @@ ,Unit,Values,Description -voltages,kV,"Any subset of {220., 300., 380.}",Voltage levels to consider +voltages,kV,"Any subset of {200., 220., 300., 380., 500., 750.}",Voltage levels to consider +base_network, --, "Any value in {'entsoegridkit', 'osm-prebuilt', 'osm-raw}", "Specify the underlying base network, i.e. GridKit (based on ENTSO-E web map extract, OpenStreetMap (OSM) prebuilt or raw (built from raw OSM data), takes longer." gaslimit_enable,bool,true or false,Add an overall absolute gas limit configured in ``electricity: gaslimit``. gaslimit,MWhth,float or false,Global gas usage limit co2limit_enable,bool,true or false,Add an overall absolute carbon-dioxide emissions limit configured in ``electricity: co2limit`` in :mod:`prepare_network`. 
**Warning:** This option should currently only be used with electricity-only networks, not for sector-coupled networks.. diff --git a/doc/configtables/electricity_network.csv b/doc/configtables/electricity_network.csv deleted file mode 100644 index f7a51ef1f..000000000 --- a/doc/configtables/electricity_network.csv +++ /dev/null @@ -1,3 +0,0 @@ -,Unit,Values,Description -base_network, --, "Any value in {'gridkit', 'osm-prebuilt', 'osm-raw}", "Specify the underlying base network, i.e. GridKit (based on ENTSO-E web map extract, OpenStreetMap (OSM) prebuilt or raw (built from raw OSM data), takes longer." -osm_group_tolerance_buses, meters, float, "Specifies the radius in which substations shall be clustered to a single bus. Default recommendation: 5000 (meters)" diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 759af7765..4add46f82 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -73,9 +73,7 @@ Upcoming Release * Enable parallelism in :mod:`determine_availability_matrix_MD_UA.py` and remove plots. This requires the use of temporary files. -* Added new feature that to base the electricity network on OpenStreetMap (OSM data) (PR https://github.com/PyPSA/pypsa-eur/pull/1079). Note that a heuristics based cleaning process is used for lines and links where electrical parameters are incomplete, missing, or ambiguous. Through ``electricity_network["base_network"]``, the base network can be set to "gridkit" (original default setting), "osm-prebuilt" (which downloads the latest prebuilt snapshot based on OSM data from Zenodo), or "osm-raw" which retrieves (once) and cleans the raw OSM data and subsequently builds the network. Note that this process may take a few minutes. - -* Voltage settings have been aggregated and are now directly read from the line type dictionary. Instead of ``electricity["voltages"]``, scripts have been updated to refer to ``lines["types"].keys()``. 
+* Added new major feature to create the base_network from OpenStreetMap (OSM) data (PR https://github.com/PyPSA/pypsa-eur/pull/1079). Note that a heuristics based cleaning process is used for lines and links where electrical parameters are incomplete, missing, or ambiguous. Through ``electricity["base_network"]``, the base network can be set to "entsoegridkit" (original default setting, deprecated soon), "osm-prebuilt" (which downloads the latest prebuilt snapshot based on OSM data from Zenodo), or "osm-raw" which retrieves (once) and cleans the raw OSM data and subsequently builds the network. Note that this process may take a few minutes. * Updated pre-built `weather data cutouts `__. These are now merged cutouts with diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index b0de316eb..06730bcf6 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -50,6 +50,19 @@ rule build_powerplants: "../scripts/build_powerplants.py" +def input_base_network(w): + base_network = config_provider("electricity", "base_network")(w) + components = {"buses", "lines", "links", "converters", "transformers"} + if base_network == "osm-raw": + inputs = {c: resources(f"osm/pre-base/{c}.csv") for c in components} + else: + inputs = {c: f"data/{base_network}/{c}.csv" for c in components} + if base_network == "entsoegridkit": + inputs["parameter_corrections"] = "data/parameter_corrections.yaml" + inputs["links_p_nom"] = "data/links_p_nom.csv" + return inputs + + rule base_network: params: countries=config_provider("countries"), @@ -58,66 +71,7 @@ rule base_network: lines=config_provider("lines"), transformers=config_provider("transformers"), input: - eg_buses=lambda w: ( - "data/entsoegridkit/buses.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - "data/osm/prebuilt/buses.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/buses.csv") - ) - 
), - eg_lines=lambda w: ( - "data/entsoegridkit/lines.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - "data/osm/prebuilt/lines.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/lines.csv") - ) - ), - eg_links=lambda w: ( - "data/entsoegridkit/links.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - "data/osm/prebuilt/links.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/links.csv") - ) - ), - eg_converters=lambda w: ( - "data/entsoegridkit/converters.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - "data/osm/prebuilt/converters.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/converters.csv") - ) - ), - eg_transformers=lambda w: ( - "data/entsoegridkit/transformers.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else ( - "data/osm/prebuilt/transformers.csv" - if config_provider("electricity_network", "base_network")(w) - == "osm-prebuilt" - else resources("osm/pre-base/transformers.csv") - ) - ), - parameter_corrections=lambda w: ( - "data/parameter_corrections.yaml" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else [] - ), - links_p_nom=lambda w: ( - "data/links_p_nom.csv" - if config_provider("electricity_network", "base_network")(w) == "gridkit" - else [] - ), + unpack(input_base_network), country_shapes=resources("country_shapes.geojson"), offshore_shapes=resources("offshore_shapes.geojson"), europe_shape=resources("europe_shape.geojson"), @@ -684,28 +638,28 @@ rule prepare_network: "../scripts/prepare_network.py" -if config["electricity_network"]["base_network"] == "osm-raw": +if config["electricity"]["base_network"] == "osm-raw": rule clean_osm_data: input: 
cables_way=expand( - "data/osm/raw/{country}/cables_way.json", + "data/osm-raw/{country}/cables_way.json", country=config_provider("countries"), ), lines_way=expand( - "data/osm/raw/{country}/lines_way.json", + "data/osm-raw/{country}/lines_way.json", country=config_provider("countries"), ), links_relation=expand( - "data/osm/raw/{country}/links_relation.json", + "data/osm-raw/{country}/links_relation.json", country=config_provider("countries"), ), substations_way=expand( - "data/osm/raw/{country}/substations_way.json", + "data/osm-raw/{country}/substations_way.json", country=config_provider("countries"), ), substations_relation=expand( - "data/osm/raw/{country}/substations_relation.json", + "data/osm-raw/{country}/substations_relation.json", country=config_provider("countries"), ), offshore_shapes=resources("offshore_shapes.geojson"), @@ -717,11 +671,18 @@ if config["electricity_network"]["base_network"] == "osm-raw": links=resources("osm/clean/links.geojson"), log: logs("clean_osm_data.log"), + benchmark: + benchmarks("clean_osm_data") + threads: 1 + resources: + mem_mb=4000, + conda: + "../envs/environment.yaml" script: "../scripts/clean_osm_data.py" -if config["electricity_network"]["base_network"] == "osm-raw": +if config["electricity"]["base_network"] == "osm-raw": rule build_osm_network: input: @@ -744,5 +705,10 @@ if config["electricity_network"]["base_network"] == "osm-raw": logs("build_osm_network.log"), benchmark: benchmarks("build_osm_network") + threads: 1 + resources: + mem_mb=4000, + conda: + "../envs/environment.yaml" script: "../scripts/build_osm_network.py" diff --git a/rules/development.smk b/rules/development.smk index 24c46a159..0386e38e8 100644 --- a/rules/development.smk +++ b/rules/development.smk @@ -2,19 +2,25 @@ # # SPDX-License-Identifier: MIT +if config["electricity"]["base_network"] == "osm-raw": -rule prepare_osm_network_release: - input: - base_network=resources("networks/base.nc"), - output: - 
buses=resources("osm/release/buses.csv"), - converters=resources("osm/release/converters.csv"), - lines=resources("osm/release/lines.csv"), - links=resources("osm/release/links.csv"), - transformers=resources("osm/release/transformers.csv"), - log: - logs("prepare_osm_network_release.log"), - benchmark: - benchmarks("prepare_osm_network_release") - script: - "../scripts/prepare_osm_network_release.py" + rule prepare_osm_network_release: + input: + base_network=resources("networks/base.nc"), + output: + buses=resources("osm/release/buses.csv"), + converters=resources("osm/release/converters.csv"), + lines=resources("osm/release/lines.csv"), + links=resources("osm/release/links.csv"), + transformers=resources("osm/release/transformers.csv"), + log: + logs("prepare_osm_network_release.log"), + benchmark: + benchmarks("prepare_osm_network_release") + threads: 1 + resources: + mem_mb=1000, + conda: + "../envs/environment.yaml" + script: + "../scripts/prepare_osm_network_release.py" diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 371100d6e..75ab5a375 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -392,25 +392,8 @@ if config["enable"]["retrieve"]: "../scripts/retrieve_monthly_fuel_prices.py" -if config["enable"]["retrieve"] and {"UA", "MD"}.intersection(config["countries"]): - - rule retrieve_gdp_uamd: - output: - gdp_non_nuts3="data/GDP_per_capita_PPP_1990_2015_v2.nc", - pop_non_nuts3="data/ppp_2013_1km_Aggregated.tif", - log: - "logs/retrieve_gdp_uamd.log", - resources: - mem_mb=5000, - retries: 2 - conda: - "../envs/retrieve.yaml" - script: - "../scripts/retrieve_gdp_uamd.py" - - if config["enable"]["retrieve"] and ( - config["electricity_network"]["base_network"] == "osm-prebuilt" + config["electricity"]["base_network"] == "osm-prebuilt" ): rule retrieve_osm_prebuilt: @@ -425,65 +408,67 @@ if config["enable"]["retrieve"] and ( "https://zenodo.org/records/13342577/files/transformers.csv" ), output: - buses="data/osm/prebuilt/buses.csv", - 
converters="data/osm/prebuilt/converters.csv", - lines="data/osm/prebuilt/lines.csv", - links="data/osm/prebuilt/links.csv", - transformers="data/osm/prebuilt/transformers.csv", + buses="data/osm-prebuilt/buses.csv", + converters="data/osm-prebuilt/converters.csv", + lines="data/osm-prebuilt/lines.csv", + links="data/osm-prebuilt/links.csv", + transformers="data/osm-prebuilt/transformers.csv", log: "logs/retrieve_osm_prebuilt.log", + threads: 1 resources: mem_mb=500, retries: 2 run: for key in input.keys(): move(input[key], output[key]) + validate_checksum(output[key], input[key]) if config["enable"]["retrieve"] and ( - config["electricity_network"]["base_network"] == "osm-raw" + config["electricity"]["base_network"] == "osm-raw" ): rule retrieve_osm_data: output: - cables_way="data/osm/raw/{country}/cables_way.json", - lines_way="data/osm/raw/{country}/lines_way.json", - links_relation="data/osm/raw/{country}/links_relation.json", - substations_way="data/osm/raw/{country}/substations_way.json", - substations_relation="data/osm/raw/{country}/substations_relation.json", + cables_way="data/osm-raw/{country}/cables_way.json", + lines_way="data/osm-raw/{country}/lines_way.json", + links_relation="data/osm-raw/{country}/links_relation.json", + substations_way="data/osm-raw/{country}/substations_way.json", + substations_relation="data/osm-raw/{country}/substations_relation.json", log: "logs/retrieve_osm_data_{country}.log", - resources: - cores=2, - threads=1, + threads: 1 + conda: + "../envs/retrieve.yaml" script: "../scripts/retrieve_osm_data.py" if config["enable"]["retrieve"] and ( - config["electricity_network"]["base_network"] == "osm-raw" + config["electricity"]["base_network"] == "osm-raw" ): rule retrieve_osm_data_all: input: expand( - "data/osm/raw/{country}/cables_way.json", + "data/osm-raw/{country}/cables_way.json", country=config_provider("countries"), ), expand( - "data/osm/raw/{country}/lines_way.json", + "data/osm-raw/{country}/lines_way.json", 
country=config_provider("countries"), ), expand( - "data/osm/raw/{country}/links_relation.json", + "data/osm-raw/{country}/links_relation.json", country=config_provider("countries"), ), expand( - "data/osm/raw/{country}/substations_way.json", + "data/osm-raw/{country}/substations_way.json", country=config_provider("countries"), ), expand( - "data/osm/raw/{country}/substations_relation.json", + "data/osm-raw/{country}/substations_relation.json", country=config_provider("countries"), ), diff --git a/scripts/_helpers.py b/scripts/_helpers.py index 537b8c4f7..a3b77c1c0 100644 --- a/scripts/_helpers.py +++ b/scripts/_helpers.py @@ -370,27 +370,6 @@ def update_to(b=1, bsize=1, tsize=None): urllib.request.urlretrieve(url, file, reporthook=update_to) -def retrieve_file(url, destination): - """ - Downloads a file from a specified URL to a local destination using custom - headers that mimic a Firefox browser request. - - This function is useful for overcoming 'HTTP Error 403: Forbidden' - issues, which often occur when the server requires more typical - browser-like headers for access. 
- """ - - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36" - } - response = requests.get(url, headers=headers) - response.raise_for_status() - - with open(destination, "wb") as f: - f.write(response.content) - logger.info(f"File downloaded and saved as {destination}") - - def mock_snakemake( rulename, root_dir=None, diff --git a/scripts/add_electricity.py b/scripts/add_electricity.py index 2aef27a99..076eb84ed 100755 --- a/scripts/add_electricity.py +++ b/scripts/add_electricity.py @@ -324,7 +324,7 @@ def upsample(cntry, group, gdp_pop_non_nuts3): # relative factors 0.6 and 0.4 have been determined from a linear # regression on the country to continent load data factors = normed(0.6 * normed(gdp_n) + 0.4 * normed(pop_n)) - if cntry in ["UA", "MD"] and gdp_pop_non_nuts3 is not None: + if cntry in ["UA", "MD"]: # overwrite factor because nuts3 provides no data for UA+MD gdp_pop_non_nuts3 = gpd.read_file(gdp_pop_non_nuts3).set_index("Bus") gdp_pop_non_nuts3 = gdp_pop_non_nuts3.loc[ @@ -847,11 +847,6 @@ def add_transmission_projects(n, transmission_projects): ) ppl = load_powerplants(snakemake.input.powerplants) - if "gdp_pop_non_nuts3" in snakemake.input.keys(): - gdp_pop_non_nuts3 = snakemake.input.gdp_pop_non_nuts3 - else: - gdp_pop_non_nuts3 = None - attach_load( n, snakemake.input.regions, diff --git a/scripts/base_network.py b/scripts/base_network.py index 2c45f1f0d..5c98129b1 100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -6,8 +6,9 @@ # coding: utf-8 """ Creates the network topology from a `ENTSO-E map extract. - -`_ (March 2022) as a PyPSA +`_ (March 2022) +or `OpenStreetMap data `_ (Aug 2024) +as a PyPSA network. 
Relevant Settings @@ -134,10 +135,10 @@ def _find_closest_links(links, new_links, distance_upper_bound=1.5): ) -def _load_buses_from_eg(eg_buses, europe_shape, config): +def _load_buses(buses, europe_shape, config): buses = ( pd.read_csv( - eg_buses, + buses, quotechar="'", true_values=["t"], false_values=["f"], @@ -160,23 +161,24 @@ def _load_buses_from_eg(eg_buses, europe_shape, config): lambda p: europe_shape_prepped.contains(Point(p)), axis=1 ) - v_nom_min = min(config["lines"]["types"].keys()) - v_nom_max = max(config["lines"]["types"].keys()) + v_nom_min = min(config["electricity"]["voltages"]) + v_nom_max = max(config["electricity"]["voltages"]) - # Quick fix: buses_with_v_nom_to_keep_b = ( (v_nom_min <= buses.v_nom) & (buses.v_nom <= v_nom_max) | (buses.v_nom.isnull()) - | (buses.carrier == "DC") + | ( + buses.carrier == "DC" + ) # Keeping all DC buses from the input dataset independent of voltage (e.g. 150 kV connections) ) logger.info(f"Removing buses outside of range AC {v_nom_min} - {v_nom_max} V") return pd.DataFrame(buses.loc[buses_in_europe_b & buses_with_v_nom_to_keep_b]) -def _load_transformers_from_eg(buses, eg_transformers): +def _load_transformers(buses, transformers): transformers = pd.read_csv( - eg_transformers, + transformers, quotechar="'", true_values=["t"], false_values=["f"], @@ -188,9 +190,9 @@ def _load_transformers_from_eg(buses, eg_transformers): return transformers -def _load_converters_from_eg(buses, eg_converters): +def _load_converters_from_eg(buses, converters): converters = pd.read_csv( - eg_converters, + converters, quotechar="'", true_values=["t"], false_values=["f"], @@ -204,9 +206,9 @@ def _load_converters_from_eg(buses, eg_converters): return converters -def _load_converters_from_osm(buses, eg_converters): +def _load_converters_from_osm(buses, converters): converters = pd.read_csv( - eg_converters, + converters, quotechar="'", true_values=["t"], false_values=["f"], @@ -220,9 +222,9 @@ def _load_converters_from_osm(buses, 
eg_converters): return converters -def _load_links_from_eg(buses, eg_links): +def _load_links_from_eg(buses, links): links = pd.read_csv( - eg_links, + links, quotechar="'", true_values=["t"], false_values=["f"], @@ -231,7 +233,7 @@ def _load_links_from_eg(buses, eg_links): links["length"] /= 1e3 - # Skagerrak Link is connected to 132kV bus which is removed in _load_buses_from_eg. + # Skagerrak Link is connected to 132kV bus which is removed in _load_buses. # Connect to neighboring 380kV bus links.loc[links.bus1 == "6396", "bus1"] = "6398" @@ -243,9 +245,9 @@ def _load_links_from_eg(buses, eg_links): return links -def _load_links_from_osm(buses, eg_links): +def _load_links_from_osm(buses, links): links = pd.read_csv( - eg_links, + links, quotechar="'", true_values=["t"], false_values=["f"], @@ -268,116 +270,10 @@ def _load_links_from_osm(buses, eg_links): return links -def _add_links_from_tyndp(buses, links, links_tyndp, europe_shape): - links_tyndp = pd.read_csv(links_tyndp) - - # remove all links from list which lie outside all of the desired countries - europe_shape = gpd.read_file(europe_shape).loc[0, "geometry"] - europe_shape_prepped = shapely.prepared.prep(europe_shape) - x1y1_in_europe_b = links_tyndp[["x1", "y1"]].apply( - lambda p: europe_shape_prepped.contains(Point(p)), axis=1 - ) - x2y2_in_europe_b = links_tyndp[["x2", "y2"]].apply( - lambda p: europe_shape_prepped.contains(Point(p)), axis=1 - ) - is_within_covered_countries_b = x1y1_in_europe_b & x2y2_in_europe_b - - if not is_within_covered_countries_b.all(): - logger.info( - "TYNDP links outside of the covered area (skipping): " - + ", ".join(links_tyndp.loc[~is_within_covered_countries_b, "Name"]) - ) - - links_tyndp = links_tyndp.loc[is_within_covered_countries_b] - if links_tyndp.empty: - return buses, links - - has_replaces_b = links_tyndp.replaces.notnull() - oids = dict(Bus=_get_oid(buses), Link=_get_oid(links)) - keep_b = dict( - Bus=pd.Series(True, index=buses.index), Link=pd.Series(True, 
index=links.index) - ) - for reps in links_tyndp.loc[has_replaces_b, "replaces"]: - for comps in reps.split(":"): - oids_to_remove = comps.split(".") - c = oids_to_remove.pop(0) - keep_b[c] &= ~oids[c].isin(oids_to_remove) - buses = buses.loc[keep_b["Bus"]] - links = links.loc[keep_b["Link"]] - - links_tyndp["j"] = _find_closest_links( - links, links_tyndp, distance_upper_bound=0.20 - ) - # Corresponds approximately to 20km tolerances - - if links_tyndp["j"].notnull().any(): - logger.info( - "TYNDP links already in the dataset (skipping): " - + ", ".join(links_tyndp.loc[links_tyndp["j"].notnull(), "Name"]) - ) - links_tyndp = links_tyndp.loc[links_tyndp["j"].isnull()] - if links_tyndp.empty: - return buses, links - - tree_buses = buses.query("carrier=='AC'") - tree = KDTree(tree_buses[["x", "y"]]) - _, ind0 = tree.query(links_tyndp[["x1", "y1"]]) - ind0_b = ind0 < len(tree_buses) - links_tyndp.loc[ind0_b, "bus0"] = tree_buses.index[ind0[ind0_b]] - - _, ind1 = tree.query(links_tyndp[["x2", "y2"]]) - ind1_b = ind1 < len(tree_buses) - links_tyndp.loc[ind1_b, "bus1"] = tree_buses.index[ind1[ind1_b]] - - links_tyndp_located_b = ( - links_tyndp["bus0"].notnull() & links_tyndp["bus1"].notnull() - ) - if not links_tyndp_located_b.all(): - logger.warning( - "Did not find connected buses for TYNDP links (skipping): " - + ", ".join(links_tyndp.loc[~links_tyndp_located_b, "Name"]) - ) - links_tyndp = links_tyndp.loc[links_tyndp_located_b] - - logger.info("Adding the following TYNDP links: " + ", ".join(links_tyndp["Name"])) - - links_tyndp = links_tyndp[["bus0", "bus1"]].assign( - carrier="DC", - p_nom=links_tyndp["Power (MW)"], - length=links_tyndp["Length (given) (km)"].fillna( - links_tyndp["Length (distance*1.2) (km)"] - ), - under_construction=True, - underground=False, - geometry=( - links_tyndp[["x1", "y1", "x2", "y2"]].apply( - lambda s: str(LineString([[s.x1, s.y1], [s.x2, s.y2]])), axis=1 - ) - ), - tags=( - '"name"=>"' - + links_tyndp["Name"] - + '", ' - + 
'"ref"=>"' - + links_tyndp["Ref"] - + '", ' - + '"status"=>"' - + links_tyndp["status"] - + '"' - ), - ) - - links_tyndp.index = "T" + links_tyndp.index.astype(str) - - links = pd.concat([links, links_tyndp], sort=True) - - return buses, links - - -def _load_lines_from_eg(buses, eg_lines): +def _load_lines(buses, lines): lines = ( pd.read_csv( - eg_lines, + lines, quotechar="'", true_values=["t"], false_values=["f"], @@ -395,7 +291,7 @@ def _load_lines_from_eg(buses, eg_lines): lines["length"] /= 1e3 - lines["carrier"] = "AC" # TODO pypsa-eur check + lines["carrier"] = "AC" lines = _remove_dangling_branches(lines, buses) return lines @@ -446,7 +342,7 @@ def _reconnect_crimea(lines): def _set_electrical_parameters_lines_eg(lines, config): - v_noms = list(config["lines"]["types"].keys()) + v_noms = config["electricity"]["voltages"] linetypes = config["lines"]["types"] for v_nom in v_noms: @@ -462,7 +358,7 @@ def _set_electrical_parameters_lines_osm(lines, config): lines["type"] = [] return lines - v_noms = list(config["lines"]["types"].keys()) + v_noms = config["electricity"]["voltages"] linetypes = _get_linetypes_config(config["lines"]["types"], v_noms) lines["carrier"] = "AC" @@ -807,11 +703,11 @@ def _set_shapes(n, country_shapes, offshore_shapes): def base_network( - eg_buses, - eg_converters, - eg_transformers, - eg_lines, - eg_links, + buses, + converters, + transformers, + lines, + links, links_p_nom, europe_shape, country_shapes, @@ -820,57 +716,58 @@ def base_network( config, ): - buses = _load_buses_from_eg(eg_buses, europe_shape, config) - - if config["electricity_network"].get("base_network") == "gridkit": - links = _load_links_from_eg(buses, eg_links) - elif "osm" in config["electricity_network"].get("base_network"): - links = _load_links_from_osm(buses, eg_links) - else: - raise ValueError("base_network must be either 'gridkit' or 'osm'") + base_network = config["electricity"].get("base_network") + assert base_network in { + "entsoegridkit", + 
"osm-raw", + "osm-prebuilt", + }, f"base_network must be either 'entsoegridkit', 'osm-raw' or 'osm-prebuilt', but got '{base_network}'" + if base_network == "entsoegridkit": + warnings.warn( + "The 'entsoegridkit' base network is deprecated and will be removed in future versions. Please use 'osm-raw' or 'osm-prebuilt' instead.", + DeprecationWarning, + ) - if config["electricity_network"].get("base_network") == "gridkit": - converters = _load_converters_from_eg(buses, eg_converters) - elif "osm" in config["electricity_network"].get("base_network"): - converters = _load_converters_from_osm(buses, eg_converters) + logger.info(f"Creating base network using {base_network}.") - transformers = _load_transformers_from_eg(buses, eg_transformers) + buses = _load_buses(buses, europe_shape, config) + transformers = _load_transformers(buses, transformers) + lines = _load_lines(buses, lines) - lines = _load_lines_from_eg(buses, eg_lines) + if base_network == "entsoegridkit": + links = _load_links_from_eg(buses, links) + converters = _load_converters_from_eg(buses, converters) - if ( - (config["electricity_network"].get("base_network") == "gridkit") - & (config["lines"].get("reconnect_crimea", True)) - & ("UA" in config["countries"]) - ): - lines = _reconnect_crimea(lines) + # Optionally reconnect Crimea + if (config["lines"].get("reconnect_crimea", True)) & ( + "UA" in config["countries"] + ): + lines = _reconnect_crimea(lines) - if config["electricity_network"].get("base_network") == "gridkit": + # Set electrical parameters of lines and links lines = _set_electrical_parameters_lines_eg(lines, config) links = _set_electrical_parameters_links_eg(links, config, links_p_nom) - elif "osm" in config["electricity_network"].get("base_network"): + elif base_network in {"osm-prebuilt", "osm-raw"}: + links = _load_links_from_osm(buses, links) + converters = _load_converters_from_osm(buses, converters) + + # Set electrical parameters of lines and links lines = 
_set_electrical_parameters_lines_osm(lines, config) links = _set_electrical_parameters_links_osm(links, config) else: - raise ValueError("base_network must be either 'gridkit' or 'osm'") + raise ValueError( + "base_network must be either 'entsoegridkit', 'osm-raw', or 'osm-prebuilt'" + ) + # Set electrical parameters of transformers and converters transformers = _set_electrical_parameters_transformers(transformers, config) converters = _set_electrical_parameters_converters(converters, config) n = pypsa.Network() - - if config["electricity_network"].get("base_network") == "gridkit": - n.name = "PyPSA-Eur (GridKit)" - elif "osm" in config["electricity_network"].get("base_network"): - n.name = "PyPSA-Eur (OSM)" - else: - raise ValueError("base_network must be either 'gridkit' or 'osm'") + n.name = f"PyPSA-Eur ({base_network})" time = get_snapshots(snakemake.params.snapshots, snakemake.params.drop_leap_day) n.set_snapshots(time) - n.madd( - "Carrier", ["AC", "DC"] - ) # TODO: fix hard code and check if AC/DC truly exist n.import_components_from_dataframe(buses, "Bus") n.import_components_from_dataframe(lines, "Line") @@ -879,15 +776,13 @@ def base_network( n.import_components_from_dataframe(converters, "Link") _set_lines_s_nom_from_linetypes(n) - if config["electricity_network"].get("base_network") == "gridkit": + if config["electricity"].get("base_network") == "gridkit": _apply_parameter_corrections(n, parameter_corrections) - # TODO: what about this? n = _remove_unconnected_components(n) _set_countries_and_substations(n, config, country_shapes, offshore_shapes) - # TODO pypsa-eur add this _set_links_underwater_fraction(n, offshore_shapes) _replace_b2b_converter_at_country_border_by_link(n) @@ -896,9 +791,12 @@ def base_network( _set_shapes(n, country_shapes, offshore_shapes) - logger.info( - f"Base network created using {config['electricity_network'].get('base_network')}." 
- ) + # Add carriers if they are present in buses.carriers + carriers_in_buses = set(n.buses.carrier.dropna().unique()) + carriers = carriers_in_buses.intersection({"AC", "DC"}) + + if carriers: + n.madd("Carrier", carriers) return n @@ -1066,25 +964,47 @@ def append_bus_shapes(n, shapes, type): configure_logging(snakemake) set_scenario_config(snakemake) + countries = snakemake.params.countries + + buses = snakemake.input.buses + converters = snakemake.input.converters + transformers = snakemake.input.transformers + lines = snakemake.input.lines + links = snakemake.input.links + europe_shape = snakemake.input.europe_shape + country_shapes = snakemake.input.country_shapes + offshore_shapes = snakemake.input.offshore_shapes + config = snakemake.config + + if "links_p_nom" in snakemake.input.keys(): + links_p_nom = snakemake.input.links_p_nom + else: + links_p_nom = None + + if "parameter_corrections" in snakemake.input.keys(): + parameter_corrections = snakemake.input.parameter_corrections + else: + parameter_corrections = None + n = base_network( - snakemake.input.eg_buses, - snakemake.input.eg_converters, - snakemake.input.eg_transformers, - snakemake.input.eg_lines, - snakemake.input.eg_links, - snakemake.input.links_p_nom, - snakemake.input.europe_shape, - snakemake.input.country_shapes, - snakemake.input.offshore_shapes, - snakemake.input.parameter_corrections, - snakemake.config, + buses, + converters, + transformers, + lines, + links, + links_p_nom, + europe_shape, + country_shapes, + offshore_shapes, + parameter_corrections, + config, ) onshore_regions, offshore_regions, shapes, offshore_shapes = build_bus_shapes( n, - snakemake.input.country_shapes, - snakemake.input.offshore_shapes, - snakemake.params.countries, + country_shapes, + offshore_shapes, + countries, ) shapes.to_file(snakemake.output.regions_onshore) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 795712067..889105754 100644 --- a/scripts/build_osm_network.py +++ 
b/scripts/build_osm_network.py @@ -10,7 +10,6 @@ import geopandas as gpd import numpy as np import pandas as pd -from _benchmark import memory_logger from _helpers import configure_logging, set_scenario_config from shapely.geometry import LineString, Point from shapely.ops import linemerge, nearest_points, split @@ -18,45 +17,39 @@ logger = logging.getLogger(__name__) -# list of recognised nan values (NA and na excluded as may be confused with Namibia 2-letter country code) -NA_VALUES = ["NULL", "", "N/A", "NAN", "NaN", "nan", "Nan", "n/a", "null"] - -def read_csv_nafix(file, **kwargs): - "Function to open a csv as pandas file and standardize the na value" - if "keep_default_na" not in kwargs: - kwargs["keep_default_na"] = False - if "na_values" not in kwargs: - kwargs["na_values"] = NA_VALUES - - if os.stat(file).st_size > 0: - return pd.read_csv(file, **kwargs) - else: - return pd.DataFrame() - - -def save_to_geojson(df, fn): - """ - Save a (Geo)DataFrame to a GeoJSON file. - - Parameters: - - df: The (Geo)DataFrame to be saved. - - fn: The filename (including the path) of the output GeoJSON file. 
- - Returns: - None - """ - if os.path.exists(fn): - os.unlink(fn) # remove file if it exists - - # save file if the (Geo)DataFrame is non-empty - if df.empty: - # create empty file to avoid issues with snakemake - with open(fn, "w") as fp: - pass - else: - # save file - df.to_file(fn, driver="GeoJSON") +GEO_CRS = "EPSG:4326" +DISTANCE_CRS = "EPSG:3035" +BUS_TOL = ( + 5000 # unit: meters, default 5000 - Buses within this distance are grouped together +) +LINES_COLUMNS = [ + "bus0", + "bus1", + "voltage", + "circuits", + "length", + "underground", + "under_construction", + "geometry", +] +LINKS_COLUMNS = [ + "bus0", + "bus1", + "voltage", + "p_nom", + "length", + "under_construction", + "geometry", +] +TRANSFORMERS_COLUMNS = [ + "bus0", + "bus1", + "voltage_bus0", + "voltage_bus1", + "country", + "geometry", +] def read_geojson(fn, cols=[], dtype=None, crs="EPSG:4326"): @@ -88,35 +81,6 @@ def read_geojson(fn, cols=[], dtype=None, crs="EPSG:4326"): return df -def to_csv_nafix(df, path, **kwargs): - """ - Write a pandas DataFrame to a CSV file with NA values replaced. - - Parameters: - - df: pandas DataFrame - The DataFrame to be written to the CSV file. - - path: str - The file path where the CSV file will be saved. - - **kwargs: keyword arguments - Additional arguments to be passed to the `to_csv` function of pandas. - - Returns: - - None - - If the DataFrame is not empty or does not have empty columns, it will be - written to the CSV file with NA values replaced by the first value in the - `NA_VALUES` list. If the DataFrame is empty or has empty columns, an empty - file will be created at the specified path. - """ - if "na_rep" in kwargs: - del kwargs["na_rep"] - if not df.empty or not df.columns.empty: - return df.to_csv(path, **kwargs, na_rep=NA_VALUES[0]) - else: - with open(path, "w") as fp: - pass - - def line_endings_to_bus_conversion(lines): """ Converts line endings to bus connections. 
@@ -252,19 +216,13 @@ def set_lines_ids(lines, buses, distance_crs): distance_bus0 = busesepsg.geometry.loc[bus0_id].distance( row.geometry.boundary.geoms[0] ) - if distance_bus0 > 0.0: + + if distance_bus0 > 0: # the line does not start in the node, thus modify the linestring - lines.loc[i, "geometry"] = linemerge( - [ - LineString( - [ - buses.geometry.loc[bus0_id], - lines.geometry.loc[i].boundary.geoms[0], - ] - ), - lines.geometry.loc[i], - ] - ) + line_start_point = lines.geometry.loc[i].boundary.geoms[0] + new_segment = LineString([buses.geometry.loc[bus0_id], line_start_point]) + modified_line = linemerge([new_segment, lines.geometry.loc[i]]) + lines.loc[i, "geometry"] = modified_line # find the closest node of the bus1 of the line bus1_id = buses_sel.geometry.distance(row.geometry.boundary.geoms[1]).idxmin() @@ -274,19 +232,13 @@ def set_lines_ids(lines, buses, distance_crs): distance_bus1 = busesepsg.geometry.loc[bus1_id].distance( row.geometry.boundary.geoms[1] ) - if distance_bus1 > 0.0: - # the line does not end in the node, thus modify the linestring - lines.loc[i, "geometry"] = linemerge( - [ - lines.geometry.loc[i], - LineString( - [ - lines.geometry.loc[i].boundary.geoms[1], - buses.geometry.loc[bus1_id], - ] - ), - ] - ) + + if distance_bus1 > 0: + # the line does not start in the node, thus modify the linestring + line_end_point = lines.geometry.loc[i].boundary.geoms[1] + new_segment = LineString([line_end_point, buses.geometry.loc[bus1_id]]) + modified_line = linemerge([lines.geometry.loc[i], new_segment]) + lines.loc[i, "geometry"] = modified_line return lines, buses @@ -322,27 +274,23 @@ def merge_stations_same_station_id( lon_bus = np.round(station_point_x + v_it * delta_lon, precision) lat_bus = np.round(station_point_y + v_it * delta_lat, precision) + bus_data = [ + n_buses, # "bus_id" + g_name, # "station_id" + v_name[0], # "voltage" + bus_row["dc"].all(), # "dc" + "|".join(bus_row["symbol"].unique()), # "symbol" + 
bus_row["under_construction"].any(), # "under_construction" + "|".join(bus_row["tag_substation"].unique()), # "tag_substation" + bus_row["tag_area"].sum(), # "tag_area" + lon_bus, # "lon" + lat_bus, # "lat" + bus_row["country"].iloc[0], # "country" + Point(lon_bus, lat_bus), # "geometry" + ] + # add the bus - buses_clean.append( - [ - n_buses, # "bus_id" - g_name, # "station_id" - v_name[0], # "voltage" - bus_row["dc"].all(), # "dc" - "|".join(bus_row["symbol"].unique()), # "symbol" - bus_row["under_construction"].any(), # "under_construction" - "|".join(bus_row["tag_substation"].unique()), # "tag_substation" - bus_row["tag_area"].sum(), # "tag_area" - lon_bus, # "lon" - lat_bus, # "lat" - bus_row["country"].iloc[0], # "country", - # is_dclink_boundary_point, # check if new bus was formed of at least one DC link boundary point - Point( - lon_bus, - lat_bus, - ), # "geometry" - ] - ) + buses_clean.append(bus_data) # increase counters v_it += 1 @@ -404,8 +352,8 @@ def get_transformers(buses, lines): df_transformers = [] # Transformers should be added between AC buses only - # TODO pypsa-eur: Fix this! 
instead of tilde use != buses_ac = buses[buses["dc"] != True] + for g_name, g_value in buses_ac.sort_values("voltage", ascending=True).groupby( by="station_id" ): @@ -413,26 +361,26 @@ def get_transformers(buses, lines): n_voltages = len(g_value) if n_voltages > 1: - for id in range(0, n_voltages - 1): + for id in range(n_voltages - 1): # when g_value has more than one node, it means that there are multiple voltages for the same bus - geom_trans = LineString( + transformer_geometry = LineString( [g_value.geometry.iloc[id], g_value.geometry.iloc[id + 1]] ) - df_transformers.append( - [ - f"transf_{g_name}_{id}", # "line_id" - g_value["bus_id"].iloc[id], # "bus0" - g_value["bus_id"].iloc[id + 1], # "bus1" - g_value.voltage.iloc[id], # "voltage_bus0" - g_value.voltage.iloc[id + 1], # "voltage_bus0" - g_value.country.iloc[id], # "country" - geom_trans, # "geometry" - ] - ) - # TODO pypsa-eur: fix bug in pypsa-earth, where the id column is wrongly named "line_id" instead of "transformer_id + transformer_data = [ + f"transf_{g_name}_{id}", # "line_id" + g_value["bus_id"].iloc[id], # "bus0" + g_value["bus_id"].iloc[id + 1], # "bus1" + g_value.voltage.iloc[id], # "voltage_bus0" + g_value.voltage.iloc[id + 1], # "voltage_bus0" + g_value.country.iloc[id], # "country" + transformer_geometry, # "geometry" + ] + + df_transformers.append(transformer_data) + # name of the columns - trasf_columns = [ + transformers_columns = [ "transformer_id", "bus0", "bus1", @@ -442,72 +390,18 @@ def get_transformers(buses, lines): "geometry", ] - df_transformers = gpd.GeoDataFrame(df_transformers, columns=trasf_columns) + df_transformers = gpd.GeoDataFrame(df_transformers, columns=transformers_columns) if not df_transformers.empty: init_index = 0 if lines.empty else lines.index[-1] + 1 df_transformers.set_index(init_index + df_transformers.index, inplace=True) # update line endings df_transformers = line_endings_to_bus_conversion(df_transformers) + df_transformers.drop(columns=["bounds", 
"bus_0_coors", "bus_1_coors"], inplace=True) - return df_transformers - - -# def get_converters(buses): -# """ -# Function to create fake converter lines that connect buses of the same -# station_id of different polarities. -# """ - -# df_converters = [] - -# for g_name, g_value in buses.sort_values("voltage", ascending=True).groupby( -# by="station_id" -# ): -# # note: by construction there cannot be more that two buses with the same station_id and same voltage -# n_voltages = len(g_value) - -# # A converter stations should have both AC and DC parts -# if g_value["dc"].any() & ~g_value["dc"].all(): -# dc_voltage = g_value[g_value.dc]["voltage"].values - -# for u in dc_voltage: -# id_0 = g_value[g_value["dc"] & g_value["voltage"].isin([u])].index[0] - -# ac_voltages = g_value[~g_value.dc]["voltage"] -# # A converter is added between a DC nodes and AC one with the closest voltage -# id_1 = ac_voltages.sub(u).abs().idxmin() - -# geom_conv = LineString( -# [g_value.geometry.loc[id_0], g_value.geometry.loc[id_1]] -# ) - -# # check if bus is a dclink boundary point, only then add converter -# df_converters.append( -# [ -# f"convert_{g_name}_{id_0}", # "line_id" -# g_value["bus_id"].loc[id_0], # "bus0" -# g_value["bus_id"].loc[id_1], # "bus1" -# False, # "underground" -# False, # "under_construction" -# g_value.country.loc[id_0], # "country" -# geom_conv, # "geometry" -# ] -# ) - -# # name of the columns -# conv_columns = [ -# "converter_id", -# "bus0", -# "bus1", -# "underground", -# "under_construction", -# "country", -# "geometry", -# ] + gdf_transformers = gpd.GeoDataFrame(df_transformers) + gdf_transformers.crs = GEO_CRS -# df_converters = gpd.GeoDataFrame(df_converters, columns=conv_columns).reset_index() - -# return df_converters + return gdf_transformers def _find_closest_bus(row, buses, distance_crs, tol=5000): @@ -552,7 +446,7 @@ def _find_closest_bus(row, buses, distance_crs, tol=5000): return closest_bus_id -def _get_converters(buses, links, distance_crs, 
tol): +def _get_converters(buses, links, distance_crs): """ Get the converters for the given buses and links. Connecting link endings to closest AC bus. @@ -575,30 +469,30 @@ def _get_converters(buses, links, distance_crs, tol): continue converter_id = f"converter/{row['link_id']}_{conv}" + converter_geometry = LineString( + [ + buses[buses["bus_id"] == link_end].geometry.values[0], + buses[buses["bus_id"] == closest_bus].geometry.values[0], + ] + ) + logger.info( f"Added converter #{conv+1}/2 for link {row['link_id']}:{converter_id}." ) + converter_data = [ + converter_id, # "line_id" + link_end, # "bus0" + closest_bus, # "bus1" + row["p_nom"], # "p_nom" + False, # "underground" + False, # "under_construction" + buses[buses["bus_id"] == closest_bus].country.values[0], # "country" + converter_geometry, # "geometry" + ] + # Create the converter - converters.append( - [ - converter_id, # "line_id" - link_end, # "bus0" - closest_bus, # "bus1" - row["p_nom"], # "p_nom" - False, # "underground" - False, # "under_construction" - buses[buses["bus_id"] == closest_bus].country.values[ - 0 - ], # "country" - LineString( - [ - buses[buses["bus_id"] == link_end].geometry.values[0], - buses[buses["bus_id"] == closest_bus].geometry.values[0], - ] - ), # "geometry" - ] - ) + converters.append(converter_data) conv_columns = [ "converter_id", @@ -612,7 +506,7 @@ def _get_converters(buses, links, distance_crs, tol): ] gdf_converters = gpd.GeoDataFrame( - converters, columns=conv_columns, crs=geo_crs + converters, columns=conv_columns, crs=GEO_CRS ).reset_index() return gdf_converters @@ -634,39 +528,37 @@ def connect_stations_same_station_id(lines, buses): if len(buses_station_id) > 1: for b_it in range(1, len(buses_station_id)): - add_lines.append( + line_geometry = LineString( [ - f"link{buses_station_id}_{b_it}", # "line_id" - buses_station_id.index[0], # "bus0" - buses_station_id.index[b_it], # "bus1" - 400000, # "voltage" - 1, # "circuits" - 0.0, # "length" - False, # 
"underground" - False, # "under_construction" - "transmission", # "tag_type" - ac_freq, # "tag_frequency" - buses_station_id.country.iloc[0], # "country" - LineString( - [ - buses_station_id.geometry.iloc[0], - buses_station_id.geometry.iloc[b_it], - ] - ), # "geometry" - LineString( - [ - buses_station_id.geometry.iloc[0], - buses_station_id.geometry.iloc[b_it], - ] - ).bounds, # "bounds" - buses_station_id.geometry.iloc[0], # "bus_0_coors" - buses_station_id.geometry.iloc[b_it], # "bus_1_coors" - buses_station_id.lon.iloc[0], # "bus0_lon" - buses_station_id.lat.iloc[0], # "bus0_lat" - buses_station_id.lon.iloc[b_it], # "bus1_lon" - buses_station_id.lat.iloc[b_it], # "bus1_lat" + buses_station_id.geometry.iloc[0], + buses_station_id.geometry.iloc[b_it], ] ) + line_bounds = line_geometry.bounds + + line_data = [ + f"link{buses_station_id}_{b_it}", # "line_id" + buses_station_id.index[0], # "bus0" + buses_station_id.index[b_it], # "bus1" + 400000, # "voltage" + 1, # "circuits" + 0.0, # "length" + False, # "underground" + False, # "under_construction" + "transmission", # "tag_type" + ac_freq, # "tag_frequency" + buses_station_id.country.iloc[0], # "country" + line_geometry, # "geometry" + line_bounds, # "bounds" + buses_station_id.geometry.iloc[0], # "bus_0_coors" + buses_station_id.geometry.iloc[b_it], # "bus_1_coors" + buses_station_id.lon.iloc[0], # "bus0_lon" + buses_station_id.lat.iloc[0], # "bus0_lat" + buses_station_id.lon.iloc[b_it], # "bus1_lon" + buses_station_id.lat.iloc[b_it], # "bus1_lat" + ] + + add_lines.append(line_data) # name of the columns add_lines_columns = [ @@ -733,12 +625,10 @@ def merge_stations_lines_by_station_id_and_voltage( logger.info(" - Setting substation ids with tolerance of %.2f m" % (tol)) - # TODO pypsa-eur: Add this fix to pypsa-earth: Buses should not be clustered geographically if they are different # bus types (AC != DC) buses_ac = buses[buses["dc"] == False].reset_index() buses_dc = buses[buses["dc"] == True].reset_index() - 
# set substation ids # set_substations_ids(buses, distance_crs, tol=tol) set_substations_ids(buses_ac, distance_crs, tol=tol) set_substations_ids(buses_dc, distance_crs, tol=tol) @@ -772,8 +662,6 @@ def merge_stations_lines_by_station_id_and_voltage( # reset index lines.reset_index(drop=True, inplace=True) links.reset_index(drop=True, inplace=True) - # if len(links) > 0: - # links.reset_index(drop=True, inplace=True) return lines, links, buses @@ -841,23 +729,16 @@ def fix_overpassing_lines(lines, buses, distance_crs, tol=1): buses_epsgmod.geometry.distance(lines_epsgmod.geometry.loc[l]) <= tol ] + # Get boundary points + endpoint0 = lines_epsgmod.geometry.loc[l].boundary.geoms[0] + endpoint1 = lines_epsgmod.geometry.loc[l].boundary.geoms[1] + + # Calculate distances + dist_to_ep0 = bus_in_tol_epsg.geometry.distance(endpoint0) + dist_to_ep1 = bus_in_tol_epsg.geometry.distance(endpoint1) + # exclude endings of the lines - bus_in_tol_epsg = bus_in_tol_epsg[ - ( - ( - bus_in_tol_epsg.geometry.distance( - lines_epsgmod.geometry.loc[l].boundary.geoms[0] - ) - > tol - ) - | ( - bus_in_tol_epsg.geometry.distance( - lines_epsgmod.geometry.loc[l].boundary.geoms[1] - ) - > tol - ) - ) - ] + bus_in_tol_epsg = bus_in_tol_epsg[(dist_to_ep0 > tol) | (dist_to_ep1 > tol)] if not bus_in_tol_epsg.empty: # add index of line to split @@ -905,74 +786,12 @@ def fix_overpassing_lines(lines, buses, distance_crs, tol=1): return lines, buses -def build_network( - inputs, - outputs, - geo_crs, - distance_crs, -): - osm_clean_columns = { - "substation": { - "bus_id": "object", - "station_id": "float", - "voltage": "float", - "dc": "bool", - "symbol": "object", - "under_construction": "bool", - "tag_substation": "str", - "tag_area": "str", - "lon": "float", - "lat": "float", - "country": "str", - "geometry": "object", - "tag_source": "str", - }, - "line": { - "line_id": "object", - "bus0": "object", - "bus1": "object", - "voltage": "float", - "circuits": "float", - "length": "float", - 
"underground": "bool", - "under_construction": "bool", - "tag_type": "str", - "tag_frequency": "float", - "dc": "bool", - "country": "object", - "geometry": "object", - }, - "link": { - "link_id": "object", - "bus0": "object", - "bus1": "object", - "voltage": "float", - "length": "float", - "under_construction": "bool", - "dc": "bool", - "country": "object", - "geometry": "object", - }, - } +def build_network(inputs, outputs): logger.info("Reading input data.") - buses = read_geojson( - inputs["substations"], - osm_clean_columns["substation"].keys(), - dtype=osm_clean_columns["substation"], - ) - - lines = read_geojson( - inputs["lines"], - osm_clean_columns["line"].keys(), - dtype=osm_clean_columns["line"], - ) - - links = read_geojson( - inputs["links"], - osm_clean_columns["link"].keys(), - dtype=osm_clean_columns["link"], - ) + buses = gpd.read_file(inputs["substations"]) + lines = gpd.read_file(inputs["lines"]) + links = gpd.read_file(inputs["links"]) lines = line_endings_to_bus_conversion(lines) links = line_endings_to_bus_conversion(links) @@ -980,14 +799,13 @@ def build_network( logger.info( "Fixing lines overpassing nodes: Connecting nodes and splittling lines." 
) - lines, buses = fix_overpassing_lines(lines, buses, distance_crs, tol=1) + lines, buses = fix_overpassing_lines(lines, buses, DISTANCE_CRS, tol=1) - # METHOD to merge buses with same voltage and within tolerance - tol = snakemake.config["electricity_network"]["osm_group_tolerance_buses"] - logger.info(f"Aggregating close substations: Enabled with tolerance {tol} m") + # Merge buses with same voltage and within tolerance + logger.info(f"Aggregating close substations: Enabled with tolerance {BUS_TOL} m") lines, links, buses = merge_stations_lines_by_station_id_and_voltage( - lines, links, buses, distance_crs, tol=tol + lines, links, buses, DISTANCE_CRS, BUS_TOL ) # Recalculate lengths of lines @@ -995,19 +813,11 @@ def build_network( lines["length"] = lines.to_crs(utm).length links["length"] = links.to_crs(utm).length - # TODO pypsa-eur: check if needed for updated links scripts - # get transformers: modelled as lines connecting buses with different voltage transformers = get_transformers(buses, lines) - - # get converters: currently modelled as links connecting buses with different polarity - converters = _get_converters(buses, links, distance_crs, tol) + converters = _get_converters(buses, links, DISTANCE_CRS) logger.info("Saving outputs") - # create clean directory if not already exist - if not os.path.exists(outputs["lines"]): - os.makedirs(os.path.dirname(outputs["lines"]), exist_ok=True) - ### Convert output to pypsa-eur friendly format # Rename "substation" in buses["symbol"] to "Substation" buses["symbol"] = buses["symbol"].replace({"substation": "Substation"}) @@ -1037,232 +847,31 @@ def build_network( buses = buses.replace({True: "t", False: "f"}) # Change column orders - cols_lines = [ - "bus0", - "bus1", - "voltage", - "circuits", - "length", - "underground", - "under_construction", - "geometry", - ] - - lines = lines[cols_lines] - - cols_links = [ - "bus0", - "bus1", - "voltage", - "p_nom", - "length", - "under_construction", - "geometry", - ] - + 
lines = lines[LINES_COLUMNS] if not links.empty: - links = links[cols_links] - - cols_transformers = [ - "bus0", - "bus1", - "voltage_bus0", - "voltage_bus1", - "country", - "geometry", - ] - - transformers = transformers[cols_transformers] - - if links.empty: # create empty dataframe with cols_links as columns - links = pd.DataFrame(columns=["link_id"] + cols_links) + links = links[LINKS_COLUMNS] + else: + links = pd.DataFrame(columns=["link_id"] + LINKS_COLUMNS) links.set_index("link_id", inplace=True) + transformers = transformers[TRANSFORMERS_COLUMNS] - to_csv_nafix(lines, outputs["lines"], quotechar="'") # Generate CSV - to_csv_nafix(links, outputs["links"], quotechar="'") # Generate CSV - to_csv_nafix(converters, outputs["converters"], quotechar="'") # Generate CSV - to_csv_nafix(transformers, outputs["transformers"], quotechar="'") # Generate CSV + # Export to csv for base_network + buses.to_csv(outputs["substations"], quotechar="'") + lines.to_csv(outputs["lines"], quotechar="'") + links.to_csv(outputs["links"], quotechar="'") + converters.to_csv(outputs["converters"], quotechar="'") + transformers.to_csv(outputs["transformers"], quotechar="'") # Export to GeoJSON for quick validations - save_to_geojson( - gpd.GeoDataFrame(lines), - outputs["lines_geojson"], - ) - save_to_geojson( - gpd.GeoDataFrame(links), - outputs["links_geojson"], - ) - save_to_geojson( - gpd.GeoDataFrame(converters, geometry="geometry", crs=geo_crs), - outputs["converters_geojson"], - ) - save_to_geojson( - gpd.GeoDataFrame(transformers, geometry="geometry", crs=geo_crs), - outputs["transformers_geojson"], - ) - - # create clean directory if not already exist - if not os.path.exists(outputs["substations"]): - os.makedirs(os.path.dirname(outputs["substations"]), exist_ok=True) - # Generate CSV - to_csv_nafix(buses, outputs["substations"], quotechar="'") - save_to_geojson( - gpd.GeoDataFrame(buses, geometry="geometry", crs=geo_crs), - outputs["substations_geojson"], - ) + 
buses.to_file(outputs["substations_geojson"]) + lines.to_file(outputs["lines_geojson"]) + links.to_file(outputs["links_geojson"]) + converters.to_file(outputs["converters_geojson"]) + transformers.to_file(outputs["transformers_geojson"]) return None -# Function to check if two lines are connected -def are_lines_connected(line1, line2): - """ - Check if two lines are connected. - - Parameters: - line1 (dict): A dictionary representing the first line. - line2 (dict): A dictionary representing the second line. - - Returns: - tuple: A tuple of boolean values indicating the connection status between - the lines. - - The tuple contains four elements: - - True if the first line's bus_0_coors is almost equal to the second line's - bus_0_coors, False otherwise. - - True if the first line's bus_0_coors is almost equal to the second line's - bus_1_coors, False otherwise. - - True if the first line's bus_1_coors is almost equal to the second line's - bus_0_coors, False otherwise. - - True if the first line's bus_1_coors is almost equal to the second line's - bus_1_coors, False otherwise. - """ - return ( - are_almost_equal(line1["bus_0_coors"], line2["bus_0_coors"]), - are_almost_equal(line1["bus_0_coors"], line2["bus_1_coors"]), - are_almost_equal(line1["bus_1_coors"], line2["bus_0_coors"]), - are_almost_equal(line1["bus_1_coors"], line2["bus_1_coors"]), - ) - - -def _dfs(adj_matrix, visited, current_vertex, path): - """ - Perform a depth-first search (DFS) on a graph represented by an adjacency - matrix. - - Parameters: - - adj_matrix (list of lists): The adjacency matrix representing the graph. - - visited (list of bool): A list to keep track of visited vertices. - - current_vertex (int): The current vertex being visited. - - path (list): The path of vertices visited so far. - - Returns: - - path (list): The path of vertices visited during the DFS. 
- """ - visited[current_vertex] = True - path.append(current_vertex) - for neighbor in range(len(adj_matrix)): - if adj_matrix[current_vertex][neighbor] == 1 and not visited[neighbor]: - _dfs(adj_matrix, visited, neighbor, path) - return path - - -# Returns all connected paths as a vector -def find_paths(adj_matrix): - """ - Find all paths in a graph represented by an adjacency matrix. - - Parameters: - - adj_matrix (list of lists): The adjacency matrix representing the graph. - - Returns: - - paths (list of lists): A list of lists, where each inner list represents - a path in the graph. - """ - visited = [False] * len(adj_matrix) - paths = [] - for vertex in range(len(adj_matrix)): - if not visited[vertex]: - path = _dfs(adj_matrix, visited, vertex, []) - if path: - paths.append(path) - return paths - - -def are_almost_equal(point1, point2, tolerance=1e-6): - """ - Check if two Shapely points are almost equal with a given tolerance. - - Args: - point1 (Point): First Shapely point. - point2 (Point): Second Shapely point. - tolerance (float): Tolerance for coordinate deviation. - - Returns: - bool: True if the points are almost equal, False otherwise. - """ - return abs(point1.x - point2.x) < tolerance and abs(point1.y - point2.y) < tolerance - - -def merge_linestrings(gdf): - """ - Merge LineStrings in a GeoDataFrame wherever the endpoints match. - - Parameters: - gdf (GeoDataFrame): A GeoDataFrame containing LineString geometries. - - Returns: - GeoDataFrame: A GeoDataFrame with merged LineString geometries. 
- """ - gdf = gdf.copy() - if len(gdf) == 1: - return gdf - - lines = list(gdf.geometry) - merged_lines = [] - while lines: - line = lines.pop(0) - merged_line = line - i = 0 - while i < len(lines): - if are_almost_equal( - Point(merged_line.coords[-1]), Point(lines[i].coords[0]) - ): - merged_line = LineString( - list(merged_line.coords) + list(lines.pop(i).coords[1:]) - ) - i = 0 # Restart the scan after merging - elif are_almost_equal( - Point(merged_line.coords[0]), Point(lines[i].coords[-1]) - ): - merged_line = LineString( - list(lines.pop(i).coords)[:-1] + list(merged_line.coords) - ) - i = 0 # Restart the scan after merging - elif are_almost_equal( - Point(merged_line.coords[-1]), Point(lines[i].coords[-1]) - ): - merged_line = LineString( - list(merged_line.coords) + list(lines.pop(i).coords[::-1])[1:] - ) - i = 0 # Restart the scan after merging - elif are_almost_equal( - Point(merged_line.coords[0]), Point(lines[i].coords[0]) - ): - merged_line = LineString( - list(lines.pop(i).coords[::-1])[:-1] + list(merged_line.coords) - ) - i = 0 # Restart the scan after merging - else: - i += 1 - merged_lines.append(merged_line) - no_coordinates = [len(merged_lines[i].coords) for i in range(len(merged_lines))] - max_index = np.argmax(no_coordinates) - merged_lines = [merged_lines[max_index]] - - return gpd.GeoDataFrame(geometry=merged_lines, crs=gdf.crs) - - if __name__ == "__main__": # Detect running outside of snakemake and mock snakemake for testing if "snakemake" not in globals(): @@ -1273,20 +882,6 @@ def merge_linestrings(gdf): configure_logging(snakemake) set_scenario_config(snakemake) - # load default crs - geo_crs = "EPSG:4326" - distance_crs = "EPSG:3035" - countries = snakemake.config["countries"] - with memory_logger( - filename=getattr(snakemake.log, "memory", None), interval=30.0 - ) as mem: - build_network( - snakemake.input, - snakemake.output, - geo_crs, - distance_crs, - ) - - logger.info(f"Maximum memory usage: {mem.mem_usage}") + 
build_network(snakemake.input, snakemake.output) diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index bf9d1c4ab..7f42ee2c3 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -3,26 +3,14 @@ # # SPDX-License-Identifier: MIT """ -This script is used to clean OpenStreetMap (OSM) data for the PyPSA-Eur -project. +This script is used to clean OpenStreetMap (OSM) data for creating a PyPSA-Eur +ready network. The script performs various cleaning operations on the OSM data, including: - Cleaning voltage, circuits, cables, wires, and frequency columns - Splitting semicolon-separated cells into new rows - Distributing values to circuits based on the number of splits - Adding line endings to substations based on line data - -The cleaned data is then written to an output file. - -Usage: - python clean_osm_data.py - -Arguments: - output_file (str): The path to the output file where the cleaned data will - be written. - -Example: - python clean_osm_data.py cleaned_data.csv """ import json @@ -1001,16 +989,13 @@ def _clean_lines(df_lines, list_voltages): # Clean those values where multiple circuit values are present, divided by # semicolon - bool_cables = ( - (df_lines["circuits"].apply(lambda x: len(x.split(";")) > 1)) - & ( - df_lines.apply( - lambda row: len(row["circuits"].split(";")) == row["split_elements"], - axis=1, - ) - ) - & (df_lines["cleaned"] == False) + has_multiple_circuits = df_lines["circuits"].apply(lambda x: len(x.split(";")) > 1) + circuits_match_split_elements = df_lines.apply( + lambda row: len(row["circuits"].split(";")) == row["split_elements"], + axis=1, ) + is_not_cleaned = df_lines["cleaned"] == False + bool_cables = has_multiple_circuits & circuits_match_split_elements & is_not_cleaned df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables].apply( lambda row: str(row["circuits"].split(";")[int(row["id"].split("-")[-1]) - 1]), @@ -1023,16 +1008,13 @@ def _clean_lines(df_lines, list_voltages): # Clean 
those values where multiple cables values are present, divided by # semicolon - bool_cables = ( - (df_lines["cables"].apply(lambda x: len(x.split(";")) > 1)) - & ( - df_lines.apply( - lambda row: len(row["cables"].split(";")) == row["split_elements"], - axis=1, - ) - ) - & (df_lines["cleaned"] == False) + has_multiple_cables = df_lines["cables"].apply(lambda x: len(x.split(";")) > 1) + cables_match_split_elements = df_lines.apply( + lambda row: len(row["cables"].split(";")) == row["split_elements"], + axis=1, ) + is_not_cleaned = df_lines["cleaned"] == False + bool_cables = has_multiple_cables & cables_match_split_elements & is_not_cleaned df_lines.loc[bool_cables, "circuits"] = df_lines.loc[bool_cables].apply( lambda row: str( @@ -1713,8 +1695,6 @@ def _bridge_lines(lines): min_voltage_ac = 200000 # [unit: V] Minimum voltage value to filter AC lines. min_voltage_dc = 150000 # [unit: V] Minimum voltage value to filter DC links. - lines_to_drop = [""] - logger.info("---") logger.info("SUBSTATIONS") # Input @@ -1732,11 +1712,9 @@ def _bridge_lines(lines): df_substations["frequency"] = _clean_frequency(df_substations["frequency"]) df_substations = _clean_substations(df_substations, list_voltages) df_substations = _create_substations_geometry(df_substations) + # Merge touching polygons df_substations = _merge_touching_polygons(df_substations) - # df_substations["polygon"] = df_substations["polygon"].apply( - # lambda x: x.convex_hull - # ) df_substations = _create_substations_centroid(df_substations) df_substations = _finalise_substations(df_substations) @@ -1778,12 +1756,6 @@ def _bridge_lines(lines): df_lines = _create_lines_geometry(df_lines) df_lines = _finalise_lines(df_lines) - # Dropping specific lines, manually - if lines_to_drop in df_lines["line_id"].values: - df_lines.drop( - df_lines[df_lines["line_id"].isin(lines_to_drop)].index, inplace=True - ) - # Create GeoDataFrame gdf_lines = gpd.GeoDataFrame(df_lines, geometry="geometry", crs=crs) gdf_lines = 
_remove_lines_within_substations(gdf_lines, gdf_substations_polygon) @@ -1838,10 +1810,6 @@ def _bridge_lines(lines): df_substations.drop(columns=["polygon"]), geometry="geometry", crs=crs ) - # Export GeoDataFrames to GeoJSON in specified output paths - parentfolder = os.path.dirname(snakemake.output.substations) - if not os.path.exists(parentfolder): - os.makedirs(parentfolder) output_substations_polygon = snakemake.output["substations_polygon"] output_substations = snakemake.output["substations"] output_lines = snakemake.output["lines"] diff --git a/scripts/prepare_osm_network_release.py b/scripts/prepare_osm_network_release.py index b33009e0b..13b287816 100644 --- a/scripts/prepare_osm_network_release.py +++ b/scripts/prepare_osm_network_release.py @@ -13,6 +13,55 @@ logger = logging.getLogger(__name__) +BUSES_COLUMNS = [ + "bus_id", + "voltage", + "dc", + "symbol", + "under_construction", + "x", + "y", + "country", + "geometry", +] +LINES_COLUMNS = [ + "line_id", + "bus0", + "bus1", + "voltage", + "circuits", + "length", + "underground", + "under_construction", + "geometry", +] +LINKS_COLUMNS = [ + "link_id", + "bus0", + "bus1", + "voltage", + "p_nom", + "length", + "underground", + "under_construction", + "geometry", +] +TRANSFORMERS_COLUMNS = [ + "transformer_id", + "bus0", + "bus1", + "voltage_bus0", + "voltage_bus1", + "geometry", +] +CONVERTERS_COLUMNS = [ + "converter_id", + "bus0", + "bus1", + "geometry", +] + + def export_clean_csv(df, columns, output_file): """ Export a cleaned DataFrame to a CSV file. 
@@ -37,9 +86,6 @@ def export_clean_csv(df, columns, output_file): if "converter_id" in columns: rename_dict["Link"] = "converter_id" - # Create the directory if it doesn't exist - os.makedirs(os.path.dirname(output_file), exist_ok=True) - df.reset_index().rename(columns=rename_dict).loc[:, columns].replace( {True: "t", False: "f"} ).to_csv(output_file, index=False, quotechar="'") @@ -56,58 +102,6 @@ def export_clean_csv(df, columns, output_file): configure_logging(snakemake) set_scenario_config(snakemake) - buses_columns = [ - "bus_id", - "voltage", - "dc", - "symbol", - "under_construction", - "x", - "y", - "country", - "geometry", - ] - - lines_columns = [ - "line_id", - "bus0", - "bus1", - "voltage", - "circuits", - "length", - "underground", - "under_construction", - "geometry", - ] - - links_columns = [ - "link_id", - "bus0", - "bus1", - "voltage", - "p_nom", - "length", - "underground", - "under_construction", - "geometry", - ] - - transformers_columns = [ - "transformer_id", - "bus0", - "bus1", - "voltage_bus0", - "voltage_bus1", - "geometry", - ] - - converters_columns = [ - "converter_id", - "bus0", - "bus1", - "geometry", - ] - network = pypsa.Network(snakemake.input.base_network) network.buses["dc"] = network.buses.pop("carrier").map({"DC": "t", "AC": "f"}) @@ -116,18 +110,18 @@ def export_clean_csv(df, columns, output_file): # Export to clean csv for release logger.info(f"Exporting {len(network.buses)} buses to %s", snakemake.output.buses) - export_clean_csv(network.buses, buses_columns, snakemake.output.buses) + export_clean_csv(network.buses, BUSES_COLUMNS, snakemake.output.buses) logger.info( f"Exporting {len(network.transformers)} transformers to %s", snakemake.output.transformers, ) export_clean_csv( - network.transformers, transformers_columns, snakemake.output.transformers + network.transformers, TRANSFORMERS_COLUMNS, snakemake.output.transformers ) logger.info(f"Exporting {len(network.lines)} lines to %s", snakemake.output.lines) - 
export_clean_csv(network.lines, lines_columns, snakemake.output.lines) + export_clean_csv(network.lines, LINES_COLUMNS, snakemake.output.lines) # Boolean that specifies if link element is a converter is_converter = network.links.index.str.startswith("conv") == True @@ -137,7 +131,7 @@ def export_clean_csv(df, columns, output_file): snakemake.output.links, ) export_clean_csv( - network.links[~is_converter], links_columns, snakemake.output.links + network.links[~is_converter], LINKS_COLUMNS, snakemake.output.links ) logger.info( @@ -145,7 +139,7 @@ def export_clean_csv(df, columns, output_file): snakemake.output.converters, ) export_clean_csv( - network.links[is_converter], converters_columns, snakemake.output.converters + network.links[is_converter], CONVERTERS_COLUMNS, snakemake.output.converters ) logger.info("Export of OSM network for release complete.") diff --git a/scripts/retrieve_gdp_uamd.py b/scripts/retrieve_gdp_uamd.py deleted file mode 100644 index 780f2ea65..000000000 --- a/scripts/retrieve_gdp_uamd.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- -# SPDX-FileCopyrightText: : 2023-2024 The PyPSA-Eur Authors -# -# SPDX-License-Identifier: MIT -""" -Retrieve monthly fuel prices from Destatis. 
-""" - -import logging -from pathlib import Path - -from _helpers import configure_logging, retrieve_file, set_scenario_config - -logger = logging.getLogger(__name__) - - -if __name__ == "__main__": - if "snakemake" not in globals(): - from _helpers import mock_snakemake - - snakemake = mock_snakemake("retrieve_gdp_uamd") - configure_logging(snakemake) - set_scenario_config(snakemake) - -dict_urls = dict( - { - "gdp_non_nuts3": "https://datadryad.org/stash/downloads/file_stream/241947", - "pop_non_nuts3": "https://github.com/ecohealthalliance/sars_cov_risk/releases/download/v2.0.1/ppp_2020_1km_Aggregated.tif", - } -) - -# Download and validate each dataset -for key, path in snakemake.output.items(): - retrieve_file(dict_urls[key], path) From 48c2cae37d378481a2d526d35ee135407893f602 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Wed, 21 Aug 2024 17:10:47 +0200 Subject: [PATCH 093/100] Bug fix: Added all voltages, 200 kV-750 kV, to default config. --- config/config.default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.default.yaml b/config/config.default.yaml index 39017dfa5..0fd6e9869 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -86,7 +86,7 @@ co2_budget: # docs in https://pypsa-eur.readthedocs.io/en/latest/configuration.html#electricity electricity: - voltages: [200., 300., 380., 500., 750.] + voltages: [200., 220., 300., 380., 500., 750.] base_network: entsoegridkit gaslimit_enable: false gaslimit: false From 89f09554e51397df70d3d2ba407915786edfe91a Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Wed, 21 Aug 2024 22:21:44 +0200 Subject: [PATCH 094/100] Cleaning and bugfixes. 
--- config/config.default.yaml | 2 +- rules/build_electricity.smk | 36 +++++++++++++------------- rules/development.smk | 10 +++---- scripts/build_osm_network.py | 7 ++++- scripts/clean_osm_data.py | 2 ++ scripts/prepare_osm_network_release.py | 1 + 6 files changed, 33 insertions(+), 25 deletions(-) diff --git a/config/config.default.yaml b/config/config.default.yaml index 0fd6e9869..86689ea72 100644 --- a/config/config.default.yaml +++ b/config/config.default.yaml @@ -42,7 +42,7 @@ scenario: ll: - vopt clusters: - - 38 + - 41 - 128 - 256 opts: diff --git a/rules/build_electricity.smk b/rules/build_electricity.smk index 06730bcf6..4446ef765 100644 --- a/rules/build_electricity.smk +++ b/rules/build_electricity.smk @@ -54,7 +54,7 @@ def input_base_network(w): base_network = config_provider("electricity", "base_network")(w) components = {"buses", "lines", "links", "converters", "transformers"} if base_network == "osm-raw": - inputs = {c: resources(f"osm/pre-base/{c}.csv") for c in components} + inputs = {c: resources(f"osm-raw/build/{c}.csv") for c in components} else: inputs = {c: f"data/{base_network}/{c}.csv" for c in components} if base_network == "entsoegridkit": @@ -665,10 +665,10 @@ if config["electricity"]["base_network"] == "osm-raw": offshore_shapes=resources("offshore_shapes.geojson"), country_shapes=resources("country_shapes.geojson"), output: - substations=resources("osm/clean/substations.geojson"), - substations_polygon=resources("osm/clean/substations_polygon.geojson"), - lines=resources("osm/clean/lines.geojson"), - links=resources("osm/clean/links.geojson"), + substations=resources("osm-raw/clean/substations.geojson"), + substations_polygon=resources("osm-raw/clean/substations_polygon.geojson"), + lines=resources("osm-raw/clean/lines.geojson"), + links=resources("osm-raw/clean/links.geojson"), log: logs("clean_osm_data.log"), benchmark: @@ -686,21 +686,21 @@ if config["electricity"]["base_network"] == "osm-raw": rule build_osm_network: input: - 
substations=resources("osm/clean/substations.geojson"), - lines=resources("osm/clean/lines.geojson"), - links=resources("osm/clean/links.geojson"), + substations=resources("osm-raw/clean/substations.geojson"), + lines=resources("osm-raw/clean/lines.geojson"), + links=resources("osm-raw/clean/links.geojson"), country_shapes=resources("country_shapes.geojson"), output: - lines=resources("osm/pre-base/lines.csv"), - links=resources("osm/pre-base/links.csv"), - converters=resources("osm/pre-base/converters.csv"), - transformers=resources("osm/pre-base/transformers.csv"), - substations=resources("osm/pre-base/buses.csv"), - lines_geojson=resources("osm/pre-base/lines.geojson"), - links_geojson=resources("osm/pre-base/links.geojson"), - converters_geojson=resources("osm/pre-base/converters.geojson"), - transformers_geojson=resources("osm/pre-base/transformers.geojson"), - substations_geojson=resources("osm/pre-base/buses.geojson"), + lines=resources("osm-raw/build/lines.csv"), + links=resources("osm-raw/build/links.csv"), + converters=resources("osm-raw/build/converters.csv"), + transformers=resources("osm-raw/build/transformers.csv"), + substations=resources("osm-raw/build/buses.csv"), + lines_geojson=resources("osm-raw/build/geojson/lines.geojson"), + links_geojson=resources("osm-raw/build/geojson/links.geojson"), + converters_geojson=resources("osm-raw/build/geojson/converters.geojson"), + transformers_geojson=resources("osm-raw/build/geojson/transformers.geojson"), + substations_geojson=resources("osm-raw/build/geojson/buses.geojson"), log: logs("build_osm_network.log"), benchmark: diff --git a/rules/development.smk b/rules/development.smk index 0386e38e8..465490258 100644 --- a/rules/development.smk +++ b/rules/development.smk @@ -8,11 +8,11 @@ if config["electricity"]["base_network"] == "osm-raw": input: base_network=resources("networks/base.nc"), output: - buses=resources("osm/release/buses.csv"), - converters=resources("osm/release/converters.csv"), - 
lines=resources("osm/release/lines.csv"), - links=resources("osm/release/links.csv"), - transformers=resources("osm/release/transformers.csv"), + buses=resources("osm-raw/release/buses.csv"), + converters=resources("osm-raw/release/converters.csv"), + lines=resources("osm-raw/release/lines.csv"), + links=resources("osm-raw/release/links.csv"), + transformers=resources("osm-raw/release/transformers.csv"), log: logs("prepare_osm_network_release.log"), benchmark: diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 889105754..6d55f8ee6 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -39,6 +39,7 @@ "voltage", "p_nom", "length", + "underground", "under_construction", "geometry", ] @@ -484,6 +485,7 @@ def _get_converters(buses, links, distance_crs): converter_id, # "line_id" link_end, # "bus0" closest_bus, # "bus1" + row["voltage"], # "voltage" row["p_nom"], # "p_nom" False, # "underground" False, # "under_construction" @@ -498,6 +500,7 @@ def _get_converters(buses, links, distance_crs): "converter_id", "bus0", "bus1", + "voltage", "p_nom", "underground", "under_construction", @@ -802,7 +805,7 @@ def build_network(inputs, outputs): lines, buses = fix_overpassing_lines(lines, buses, DISTANCE_CRS, tol=1) # Merge buses with same voltage and within tolerance - logger.info(f"Aggregating close substations: Enabled with tolerance {BUS_TOL} m") + logger.info(f"Aggregating close substations with a tolerance of {BUS_TOL} m") lines, links, buses = merge_stations_lines_by_station_id_and_voltage( lines, links, buses, DISTANCE_CRS, BUS_TOL @@ -834,6 +837,8 @@ def build_network(inputs, outputs): lines["voltage"] = lines["voltage"] / 1000 if not links.empty: links["voltage"] = links["voltage"] / 1000 + if not converters.empty: + converters["voltage"] = converters["voltage"] / 1000 transformers["voltage_bus0"], transformers["voltage_bus1"] = ( transformers["voltage_bus0"] / 1000, transformers["voltage_bus1"] / 1000, diff --git 
a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 7f42ee2c3..4377f84af 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -1295,6 +1295,7 @@ def _finalise_links(df_links): df_links["bus0"] = None df_links["bus1"] = None df_links["length"] = None + df_links["underground"] = True df_links["under_construction"] = False df_links["dc"] = True @@ -1307,6 +1308,7 @@ def _finalise_links(df_links): "bus0", "bus1", "length", + "underground", "under_construction", "dc", "country", diff --git a/scripts/prepare_osm_network_release.py b/scripts/prepare_osm_network_release.py index 13b287816..ac6b25354 100644 --- a/scripts/prepare_osm_network_release.py +++ b/scripts/prepare_osm_network_release.py @@ -58,6 +58,7 @@ "converter_id", "bus0", "bus1", + "voltage", "geometry", ] From 07a7a6476600d0dd23d3e5cc957b70132959a4f9 Mon Sep 17 00:00:00 2001 From: bobbyxng Date: Thu, 22 Aug 2024 11:25:07 +0200 Subject: [PATCH 095/100] Updated Zenodo repository to https://zenodo.org/records/13358976. Added converter voltages, 'underground' property for DC lines/cables, and included Konti-Skan HVDC (DK-SE). 
Added compatibility with https://github.com/PyPSA/pypsa-eur/pull/1079 and https://github.com/PyPSA/pypsa-eur/pull/1085 --- rules/retrieve.smk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rules/retrieve.smk b/rules/retrieve.smk index 75ab5a375..c30696ccf 100644 --- a/rules/retrieve.smk +++ b/rules/retrieve.smk @@ -398,14 +398,14 @@ if config["enable"]["retrieve"] and ( rule retrieve_osm_prebuilt: input: - buses=storage("https://zenodo.org/records/13342577/files/buses.csv"), + buses=storage("https://zenodo.org/records/13358976/files/buses.csv"), converters=storage( - "https://zenodo.org/records/13342577/files/converters.csv" + "https://zenodo.org/records/13358976/files/converters.csv" ), - lines=storage("https://zenodo.org/records/13342577/files/lines.csv"), - links=storage("https://zenodo.org/records/13342577/files/links.csv"), + lines=storage("https://zenodo.org/records/13358976/files/lines.csv"), + links=storage("https://zenodo.org/records/13358976/files/links.csv"), transformers=storage( - "https://zenodo.org/records/13342577/files/transformers.csv" + "https://zenodo.org/records/13358976/files/transformers.csv" ), output: buses="data/osm-prebuilt/buses.csv", From 6be527f66d4889ec1c4f234e68af8724d63b30e9 Mon Sep 17 00:00:00 2001 From: Fabian Neumann Date: Thu, 22 Aug 2024 11:53:17 +0200 Subject: [PATCH 096/100] Apply suggestions from code review --- scripts/base_network.py | 2 +- scripts/build_osm_network.py | 27 --------------------------- scripts/clean_osm_data.py | 26 -------------------------- 3 files changed, 1 insertion(+), 54 deletions(-) diff --git a/scripts/base_network.py b/scripts/base_network.py index 5c98129b1..afb66387e 100644 --- a/scripts/base_network.py +++ b/scripts/base_network.py @@ -776,7 +776,7 @@ def base_network( n.import_components_from_dataframe(converters, "Link") _set_lines_s_nom_from_linetypes(n) - if config["electricity"].get("base_network") == "gridkit": + if config["electricity"].get("base_network") 
== "entsoegridkit": _apply_parameter_corrections(n, parameter_corrections) n = _remove_unconnected_components(n) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 6d55f8ee6..fb2c0ea15 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -53,33 +53,6 @@ ] -def read_geojson(fn, cols=[], dtype=None, crs="EPSG:4326"): - """ - Function to read a geojson file fn. When the file is empty, then an empty - GeoDataFrame is returned having columns cols, the specified crs and the - columns specified by the dtype dictionary it not none. - - Parameters: - ------------ - fn : str - Path to the file to read - cols : list - List of columns of the GeoDataFrame - dtype : dict - Dictionary of the type of the object by column - crs : str - CRS of the GeoDataFrame - """ - # if the file is non-zero, read the geodataframe and return it - if os.path.getsize(fn) > 0: - return gpd.read_file(fn) - else: - # else return an empty GeoDataFrame - df = gpd.GeoDataFrame(columns=cols, geometry=[], crs=crs) - if isinstance(dtype, dict): - for k, v in dtype.items(): - df[k] = df[k].astype(v) - return df def line_endings_to_bus_conversion(lines): diff --git a/scripts/clean_osm_data.py b/scripts/clean_osm_data.py index 4377f84af..8669d0af5 100644 --- a/scripts/clean_osm_data.py +++ b/scripts/clean_osm_data.py @@ -1655,32 +1655,6 @@ def _extend_lines_to_substations(gdf_lines, gdf_substations_polygon): # Function to bridge gaps between all lines -def _bridge_lines(lines): - bridged_lines = [] - for i in range(len(lines) - 1): - bridged_lines.append(lines[i]) - - # Get the endpoints of the current line and the startpoints of the next line - end_points = [lines[i].coords[-1], lines[i].coords[0]] - start_points = [lines[i + 1].coords[0], lines[i + 1].coords[-1]] - - # Find the closest pair of points between the two LineStrings - min_distance = float("inf") - closest_pair = None - - for end_point in end_points: - for start_point in start_points: - 
distance = LineString([end_point, start_point]).length - if distance < min_distance: - min_distance = distance - closest_pair = (end_point, start_point) - - # Create a bridge between the closest points - bridge = LineString(closest_pair) - bridged_lines.append(bridge) - - bridged_lines.append(lines[-1]) - return bridged_lines if __name__ == "__main__": From c7cfd45c816cfe1a7f6bcd23a43c6d93a23eedfd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 22 Aug 2024 09:53:36 +0000 Subject: [PATCH 097/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/build_osm_network.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index fb2c0ea15..83461a98d 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -53,8 +53,6 @@ ] - - def line_endings_to_bus_conversion(lines): """ Converts line endings to bus connections. 
From 259b4ec7c9da2fd372b1f53f392987bc06af09a7 Mon Sep 17 00:00:00 2001 From: Fabian Neumann Date: Thu, 22 Aug 2024 14:47:57 +0200 Subject: [PATCH 098/100] simplify_network: handle complicated transformer topologies --- scripts/simplify_network.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py index 5407748fd..2b19a2b01 100644 --- a/scripts/simplify_network.py +++ b/scripts/simplify_network.py @@ -132,8 +132,8 @@ def simplify_network_to_380(n, linetype_380): trafo_map = pd.Series(n.transformers.bus1.values, n.transformers.bus0.values) trafo_map = trafo_map[~trafo_map.index.duplicated(keep="first")] - several_trafo_b = trafo_map.isin(trafo_map.index) - trafo_map[several_trafo_b] = trafo_map[several_trafo_b].map(trafo_map) + while (several_trafo_b := trafo_map.isin(trafo_map.index)).any(): + trafo_map[several_trafo_b] = trafo_map[several_trafo_b].map(trafo_map) missing_buses_i = n.buses.index.difference(trafo_map.index) missing = pd.Series(missing_buses_i, missing_buses_i) trafo_map = pd.concat([trafo_map, missing]) @@ -632,7 +632,7 @@ def find_closest_bus(n, x, y, tol=2000): aggregation_strategies=params.aggregation_strategies, ) busmaps.append(stub_map) - +' if params.simplify_network["to_substations"]: n, substation_map = aggregate_to_substations(n, params.aggregation_strategies) busmaps.append(substation_map) @@ -696,3 +696,4 @@ def find_closest_bus(n, x, y, tol=2000): n.meta = dict(snakemake.config, **dict(wildcards=dict(snakemake.wildcards))) n.export_to_netcdf(snakemake.output.network) +' \ No newline at end of file From e1d4b49b0a487a2e3d0d76c372499c489a9b8ea3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 22 Aug 2024 12:58:26 +0000 Subject: [PATCH 099/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/simplify_network.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py index 2b19a2b01..116f155bf 100644 --- a/scripts/simplify_network.py +++ b/scripts/simplify_network.py @@ -696,4 +696,4 @@ def find_closest_bus(n, x, y, tol=2000): n.meta = dict(snakemake.config, **dict(wildcards=dict(snakemake.wildcards))) n.export_to_netcdf(snakemake.output.network) -' \ No newline at end of file +' From cd9855fd1905ba79488414f7635a9a756934f7fa Mon Sep 17 00:00:00 2001 From: Fabian Neumann Date: Thu, 22 Aug 2024 14:59:13 +0200 Subject: [PATCH 100/100] syntax fix --- scripts/simplify_network.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/simplify_network.py b/scripts/simplify_network.py index 2b19a2b01..f8d2ed87e 100644 --- a/scripts/simplify_network.py +++ b/scripts/simplify_network.py @@ -632,7 +632,7 @@ def find_closest_bus(n, x, y, tol=2000): aggregation_strategies=params.aggregation_strategies, ) busmaps.append(stub_map) -' + if params.simplify_network["to_substations"]: n, substation_map = aggregate_to_substations(n, params.aggregation_strategies) busmaps.append(substation_map) @@ -695,5 +695,4 @@ def find_closest_bus(n, x, y, tol=2000): append_bus_shapes(n, clustered_regions, type=which.split("_")[1]) n.meta = dict(snakemake.config, **dict(wildcards=dict(snakemake.wildcards))) - n.export_to_netcdf(snakemake.output.network) -' \ No newline at end of file + n.export_to_netcdf(snakemake.output.network) \ No newline at end of file