added post-processing 0.940 -> 0.950

mnpinto · Feb 19, 2021 · f864048 · f864048
1 parent deaf4c4
commit f864048
Show file tree

Hide file tree

Showing 10 changed files with 577 additions and 264 deletions.
diff --git a/README.md b/README.md
@@ -16,6 +16,13 @@ pip install -e .
 ```
 
 ## Rainforest Connection Species Audio Detection
+* Final ranking: 29th place (top 3%)
+* Final score: 0.940
+* Best single model (5-fold): 0.931
+* Training time for 5 folds of the best single model (GTX 1080, i7-7700): ~150 min
+* Writeup: https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/220306
+
+* Update 1: With [this](https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/220389) post processing the final score improves to 0.950 and R
 
 ```bash
 #!/bin/bash
@@ -29,7 +36,7 @@ for fold in 0 1 2 3 4
 do
     echo "Training $model for fold $fold"
     kaggle_rainforest2021 --fold $fold --model_name $model_name \
-        --model $arch --sample_rate $sample_rate --n_mels $n_mels \
+        --model_arch $arch --sample_rate $sample_rate --n_mels $n_mels \
         --hop_length $hop_length --bs 32 --head_ps 0.8 \
         --tile_width 1024 --mixup true >> log.train
 done
@@ -38,7 +45,7 @@ for tw in 64 128 256
 do
     echo "Generate predictions for $model with tile_width of $tw"
     kaggle_rainforest2021 --run_test true --model_name $model_name \
-        --model $arch --sample_rate $sample_rate --n_mels $n_mels \
+        --model_arch $arch --sample_rate $sample_rate --n_mels $n_mels \
         --hop_length $hop_length --tile_width $tw \
         --save_preds true >> log.predict
 done

diff --git a/dl_pipeline/_nbdev.py b/dl_pipeline/_nbdev.py
@@ -102,6 +102,8 @@
          "MixUp": "01audio_util.ipynb",
          "LabelSED": "01audio_util.ipynb",
          "train": "kaggle_rfcx-species-audio-detection.ipynb",
+         "post_processing": "kaggle_rfcx-species-audio-detection.ipynb",
+         "ensemble": "kaggle_rfcx-species-audio-detection.ipynb",
          "test": "kaggle_rfcx-species-audio-detection.ipynb",
          "main": "kaggle_rfcx-species-audio-detection.ipynb"}
 

diff --git a/dl_pipeline/audio/core.py b/dl_pipeline/audio/core.py
@@ -49,6 +49,6 @@ def show_sample(file=sample_file(), tfms=lambda x : x, sample_rate=48_000, hop_l
 # Cell
 def audio2npy(file, path_save, sample_rate=32_000):
     path_save.mkdir(exist_ok=True, parents=True)
-    path_save.mkdir(exist_ok=True, parents=True)
-    wave, _ = librosa.load(file, sr=sample_rate)
-    np.save(path_save/f'{file.stem}.npy', wave)
+    if not (path_save/f'{file.stem}.npy').is_file():
+        wave, _ = librosa.load(file, sr=sample_rate)
+        np.save(path_save/f'{file.stem}.npy', wave)
diff --git a/dl_pipeline/kaggle/rfcx_species_audio_detection.py b/dl_pipeline/kaggle/rfcx_species_audio_detection.py
@@ -1,6 +1,6 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/kaggle_rfcx-species-audio-detection.ipynb (unless otherwise specified).
 
-__all__ = ['audio_augment', 'train', 'get_preds', 'test', 'main']
+__all__ = ['audio_augment', 'train', 'post_processing', 'ensemble', 'get_preds', 'test', 'main']
 
 # Cell
 import numpy as np
@@ -33,7 +33,7 @@ def audio_augment(sample_rate, p=0.25):
 # Cell
 def train(sample_rate, num_classes, fold, n_epochs, lr, wd, tile_width, bs, aug_ps,
           model_name, loss_func, plot, load_checkpoint=None, lr_find=False, head_ps=0.8,
-          mixup=False, n_mels=128, hop_length=512, model='resnest50'):
+          mixup=False, n_mels=128, hop_length=512, model_arch='resnest50'):
     seed_everything()
     cbs = []
     path = Path('/kaggle/kaggle_rainforest_audio/data')
@@ -69,7 +69,7 @@ def train(sample_rate, num_classes, fold, n_epochs, lr, wd, tile_width, bs, aug_
         xb, yb = dls.one_batch()
         show_augmentations(train_data, train_dl, sample_rate=sample_rate)
 
-    model = get_model(model, num_classes=num_classes, head_ps=head_ps, in_channels=1)
+    model = get_model(model_arch, num_classes=num_classes, head_ps=head_ps, in_channels=1)
 
     if mixup:
         cbs.append(MixUp(0.4))
@@ -99,6 +99,47 @@ def after_loss(loss, y):
     print(f'Model saved to', path.parent/f'models/{model_name}_fold{fold}')
 
 # Cell
+def post_processing(df, path_save, model_name, tile_width, MODE=2):
+    """
+    Post processing idea by Chris Deotte shared at
+    https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/220389
+    """
+    # USE MODE 1, 2, or 3
+    # FUDGE IS THE MODE 1 DIVISOR; THEN MIN-MAX SCALE EACH OF THE 24 SCORE COLUMNS TO [0, 1]
+    FUDGE = 2.0
+    for k in range(24):
+        df.iloc[:,1+k] -= df.iloc[:,1+k].min()
+        df.iloc[:,1+k] /= df.iloc[:,1+k].max()
+
+    # CONVERT PROBS TO ODDS, APPLY MULTIPLIER, CONVERT BACK TO PROBS
+    def scale(probs, factor):
+        probs = probs.copy()
+        idx = np.where(probs!=1)[0]
+        odds = factor * probs[idx] / (1-probs[idx])
+        probs[idx] =  odds/(1+odds)
+        return probs
+
+    # TRAIN AND TEST MEANS
+    d1 = df.iloc[:,1:].mean().values
+    d2 = np.array([113,204,44,923,53,41,3,213,44,23,26,149,255,14,123,222,46,6,474,4,17,18,23,72])/1000.
+
+    for k in range(24):
+        if MODE==1: d = FUDGE
+        if MODE==2: d = d1[k]/(1-d1[k])
+        if MODE==3: s = d2[k] / d1[k]
+        else: s = (d2[k]/(1-d2[k]))/d
+        df.iloc[:,k+1] = scale(df.iloc[:,k+1].values,s)
+
+    df.to_csv(path_save/f'submission_with_pp_{model_name}_{tile_width}.csv',index=False)
+
+def ensemble(files):
+    dfs = [pd.read_csv(f) for f in files]
+    df = pd.concat(dfs).groupby('recording_id').mean().reset_index()
+    tstr = datetime.now().strftime('%Y%m%d%H%M')
+    fsave = files[0].parent/f'submission_ens_{tstr}.csv'
+    df.to_csv(fsave, index=False)
+    print(f'Saved to {fsave}')
+
 def get_preds(dataloader, model, device=torch.device("cuda:0"), max_reduce=True):
     model.eval().to(device)
     with torch.no_grad():
@@ -114,74 +155,83 @@ def get_preds(dataloader, model, device=torch.device("cuda:0"), max_reduce=True)
         ys = torch.cat(ys, dim=0)
     return preds, ys
 
-def test(sample_rate, num_classes, tile_width, model_name, ens_folds, head_ps=0.8,
-         n_mels=128, hop_length=512, save_preds=False, model='resnest50'):
+def test(sample_rate, num_classes, tile_widths, model_name, ens_folds, head_ps=0.8,
+         n_mels=128, hop_length=512, save_preds=False, model_arch='resnest50'):
     bs = 1
     _path_save = Path('preds')
     _path_save.mkdir(exist_ok=True)
     max_reduce = not save_preds
 
-    preds_ens, preds_valid, ys_valid = [], [], []
-    preds_train, ys_train = [], []
-    for fold in ens_folds:
-        seed_everything()
-        path = Path('/kaggle/kaggle_rainforest_audio/data')
-        rename_cols_test = RenameColumns(id='recording_id')
-        rename_cols_valid = RenameColumns(id='recording_id', label='species_id', tmin='t_min',
-                                    tmax='t_max',fmin='f_min', fmax='f_max')
-
-        df = Pipeline([load_dataframe, rename_cols_valid, group_labels])(path/'train_tp.csv')
-
-        train_df, valid_df = kfold_dataframes(df, fold)
-
-        test_df = Pipeline([load_dataframe, rename_cols_test])(path/'sample_submission.csv')
-        fp_df = Pipeline([load_dataframe, rename_cols_valid, group_labels])(path/'train_fp.csv')
-
-        datasets = [Datasets(items=dataframe, tfms=partial(create_dataset_item, path=path,
-            sample_rate=sample_rate, tile_width=None, n_mels=n_mels, hop_length=hop_length))
-            for dataframe in [train_df, valid_df, test_df, fp_df]]
-
-        dls = DataLoaders(*[DataLoader(dataset, bs=bs, do_batch=reorganize_batch, num_workers=8,
-                        after_batch=Pipeline([MelSpectrogram(sample_rate, n_mels=n_mels,
-                                    hop_length=hop_length), TilesTransform(tile_width)]))
-             for dataset in datasets])
-
-        model = get_model(model, num_classes=num_classes, head_ps=head_ps, in_channels=1,
-                          pretrained=False)
-
-        dls.device = torch.device("cuda:0")
-        learn = Learner(dls, model, loss_func=cross_entropy, metrics=[accuracy, lrap])
-        learn.to_fp16(clip=0.5);
-        learn.load(path.parent/f'models/{model_name}_fold{fold}')
-        print('Load model: ', path.parent/f'models/{model_name}_fold{fold}')
-
-        preds, ys = get_preds(dls[1], model, max_reduce=max_reduce)
-        np.save(_path_save/f'{model_name}_{tile_width}_fold{fold}_valid.npy',
-                {'preds':preds, 'ys':ys})
-        if not max_reduce: preds = preds.max(1).values
-        preds_valid.append(preds)
-        ys_valid.append(ys)
-
-        preds, ys = get_preds(dls[2], model, max_reduce=max_reduce)
-        np.save(_path_save/f'{model_name}_{tile_width}_fold{fold}_test.npy',
-                {'preds':preds, 'ys':ys})
-        if not max_reduce: preds = preds.max(1).values
-        preds_ens.append(preds[None])
-
-    preds_valid, ys_valid = torch.cat(preds_valid), torch.cat(ys_valid)
-    valid_score = lrap(preds_valid, ys_valid.long().squeeze(), before=lambda *o:o)
-    print(f'Validation score: {valid_score:.3f}')
-
-    preds_ens = torch.cat(preds_ens).mean(0).softmax(-1)
-    test_df = Pipeline([load_dataframe])(path/'sample_submission.csv')
-
-    for i in range(preds_ens.shape[1]):
-        test_df.loc[:, f's{i}'] = preds_ens[:,i]
-
-    tstr = datetime.now().strftime('%Y%m%d%H%M')
-    test_df.to_csv(path.parent/f'subs/submission_{tstr}.csv',
-                   index=False)
-    print('Submission file saved: ', path.parent/f'subs/submission_{tstr}.csv')
+    ens_files = []
+    for tile_width in tile_widths:
+        print(f'Running inference for tile_width {tile_width}')
+        preds_ens, preds_valid, ys_valid = [], [], []
+        preds_train, ys_train = [], []
+        for fold in ens_folds:
+            seed_everything()
+            print(f'Running inference for fold {fold}')
+            path = Path('/kaggle/kaggle_rainforest_audio/data')
+            rename_cols_test = RenameColumns(id='recording_id')
+            rename_cols_valid = RenameColumns(id='recording_id', label='species_id', tmin='t_min',
+                                        tmax='t_max',fmin='f_min', fmax='f_max')
+
+            df = Pipeline([load_dataframe, rename_cols_valid, group_labels])(path/'train_tp.csv')
+
+            train_df, valid_df = kfold_dataframes(df, fold)
+
+            test_df = Pipeline([load_dataframe, rename_cols_test])(path/'sample_submission.csv')
+            fp_df = Pipeline([load_dataframe, rename_cols_valid, group_labels])(path/'train_fp.csv')
+
+            datasets = [Datasets(items=dataframe, tfms=partial(create_dataset_item, path=path,
+                sample_rate=sample_rate, tile_width=None, n_mels=n_mels, hop_length=hop_length))
+                for dataframe in [train_df, valid_df, test_df, fp_df]]
+
+            dls = DataLoaders(*[DataLoader(dataset, bs=bs, do_batch=reorganize_batch, num_workers=8,
+                            after_batch=Pipeline([MelSpectrogram(sample_rate, n_mels=n_mels,
+                                        hop_length=hop_length), TilesTransform(tile_width)]))
+                 for dataset in datasets])
+
+            model = get_model(model_arch, num_classes=num_classes, head_ps=head_ps, in_channels=1,
+                              pretrained=False)
+
+            dls.device = torch.device("cuda:0")
+            learn = Learner(dls, model, loss_func=cross_entropy, metrics=[accuracy, lrap])
+            learn.to_fp16(clip=0.5);
+            learn.load(path.parent/f'models/{model_name}_fold{fold}')
+            print('Load model: ', path.parent/f'models/{model_name}_fold{fold}')
+
+            preds, ys = get_preds(dls[1], model, max_reduce=max_reduce)
+            np.save(_path_save/f'{model_name}_{tile_width}_fold{fold}_valid.npy',
+                    {'preds':preds, 'ys':ys})
+            if not max_reduce: preds = preds.max(1).values
+            preds_valid.append(preds)
+            ys_valid.append(ys)
+
+            preds, ys = get_preds(dls[2], model, max_reduce=max_reduce)
+            np.save(_path_save/f'{model_name}_{tile_width}_fold{fold}_test.npy',
+                    {'preds':preds, 'ys':ys})
+            if not max_reduce: preds = preds.max(1).values
+            preds_ens.append(preds[None])
+
+        preds_valid, ys_valid = torch.cat(preds_valid), torch.cat(ys_valid)
+        valid_score = lrap(preds_valid, ys_valid.long().squeeze(), before=lambda *o:o)
+        print(f'Validation score: {valid_score:.3f}')
+
+        preds_ens = torch.cat(preds_ens).mean(0).softmax(-1)
+        test_df = Pipeline([load_dataframe])(path/'sample_submission.csv')
+
+        for i in range(preds_ens.shape[1]):
+            test_df.loc[:, f's{i}'] = preds_ens[:,i]
+
+        tstr = datetime.now().strftime('%Y%m%d%H%M')
+        test_df.to_csv(path.parent/f'subs/submission_{tstr}.csv',index=False)
+        print('Submission file saved: ', path.parent/f'subs/submission_{model_name}_{tile_width}.csv')
+
+        #Post-processing
+        post_processing(test_df, path.parent/'subs', model_name, tile_width)
+
+        ens_files.append(path.parent/f'subs/submission_with_pp_{model_name}_{tile_width}.csv')
+    ensemble(ens_files)
 
 # Cell
 @call_parse
@@ -191,13 +241,14 @@ def main(fold:Param('Fold number', int)=0,
          wd:Param('Weight decay', float)=3e-2,
          loss_func:Param('Loss function', str)='cross_entropy',
          tile_width:Param('Tile width', int)=1024,
+         tile_widths_inference:Param('List of tile widths for inference', list)=[128,256],
          sample_rate:Param('Sample rate', int)=32000,
          n_mels:Param('Spectrogram n_mels', int)=128,
          hop_length:Param('Spectrogram hop_length', int)=640,
          bs:Param('Batch size', int)=32,
          accumulate_gradients:Param('Batch size for gradient accumulation', int)=None,
          aug_ps:Param('Augmentation probability', float)=0.25,
-         model:Param('Name of model architecture', str)='densenet121',
+         model_arch:Param('Name of model architecture', str)='densenet121',
          model_name:Param('Name of parameters file', str)='model_n0',
          ens_folds:Param('Folds to use for ensemble', list)=[0,1,2,3,4],
          run_test:Param('Run test prediction (default is train)', str)=False,
@@ -211,9 +262,9 @@ def main(fold:Param('Fold number', int)=0,
     if mixup in [True, 'true', 'True']: mixup = True
     if save_preds in [True, 'true', 'True']: save_preds=True
     if run_test:
-        test(sample_rate, num_classes, tile_width, model_name, ens_folds, head_ps=head_ps,
-             n_mels=n_mels, hop_length=hop_length, save_preds=save_preds, model=model)
+        test(sample_rate, num_classes, tile_widths_inference, model_name, ens_folds, head_ps=head_ps,
+             n_mels=n_mels, hop_length=hop_length, save_preds=save_preds, model_arch=model_arch)
     else:
         train(sample_rate, num_classes, fold, n_epochs, lr, wd, tile_width, bs, aug_ps,
           model_name, loss_func, plot=False, load_checkpoint=load_checkpoint, lr_find=False,
-          head_ps=head_ps, mixup=mixup, n_mels=n_mels, hop_length=hop_length, model=model)
+          head_ps=head_ps, mixup=mixup, n_mels=n_mels, hop_length=hop_length, model_arch=model_arch)
diff --git a/docs/core.html b/docs/core.html
@@ -167,8 +167,8 @@ <h4 id="sample_file" class="doc_header"><code>sample_file</code><a href="https:/
 <div class="output_area">
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>CPU times: user 19.5 ms, sys: 0 ns, total: 19.5 ms
-Wall time: 19.8 ms
+<pre>CPU times: user 18.5 ms, sys: 525 µs, total: 19 ms
+Wall time: 39.2 ms
 </pre>
 </div>
 </div>
@@ -178,7 +178,7 @@ <h4 id="sample_file" class="doc_header"><code>sample_file</code><a href="https:/
 
 
 <div class="output_text output_subarea output_execute_result">
-<pre>[&lt;matplotlib.lines.Line2D at 0x7f329ae146a0&gt;]</pre>
+<pre>[&lt;matplotlib.lines.Line2D at 0x7f4e12a386a0&gt;]</pre>
 </div>
 
 </div>
@@ -289,8 +289,8 @@ <h4 id="show_sample" class="doc_header"><code>show_sample</code><a href="https:/
 <div class="output_area">
 
 <div class="output_subarea output_stream output_stdout output_text">
-<pre>CPU times: user 642 ms, sys: 40.4 ms, total: 683 ms
-Wall time: 410 ms
+<pre>CPU times: user 615 ms, sys: 18.3 ms, total: 634 ms
+Wall time: 397 ms
 </pre>
 </div>
 </div>
@@ -338,30 +338,6 @@ <h4 id="audio2npy" class="doc_header"><code>audio2npy</code><a href="https://git
 </div>
     {% endraw %}
 
-<div class="cell border-box-sizing text_cell rendered"><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<div class="highlight"><pre><span></span><span class="n">sample_rate</span> <span class="o">=</span> <span class="mi">32_000</span>
-<span class="n">path</span> <span class="o">=</span> <span class="n">Path</span><span class="p">(</span><span class="s1">&#39;/kaggle/kaggle_rainforest_audio/data&#39;</span><span class="p">)</span>
-<span class="n">train_path</span> <span class="o">=</span> <span class="n">path</span><span class="o">/</span><span class="s1">&#39;train&#39;</span>
-<span class="n">test_path</span> <span class="o">=</span> <span class="n">path</span><span class="o">/</span><span class="s1">&#39;test&#39;</span>
-<span class="n">train_path_npy</span> <span class="o">=</span> <span class="n">Path</span><span class="p">(</span><span class="n">path</span><span class="o">/</span><span class="sa">f</span><span class="s1">&#39;npy</span><span class="si">{</span><span class="n">sample_rate</span><span class="si">}</span><span class="s1">/train&#39;</span><span class="p">)</span>
-<span class="n">test_path_npy</span> <span class="o">=</span> <span class="n">Path</span><span class="p">(</span><span class="n">path</span><span class="o">/</span><span class="sa">f</span><span class="s1">&#39;npy</span><span class="si">{</span><span class="n">sample_rate</span><span class="si">}</span><span class="s1">/test&#39;</span><span class="p">)</span>
-
-<span class="n">files</span> <span class="o">=</span> <span class="n">train_path</span><span class="o">.</span><span class="n">ls</span><span class="p">()</span>
-<span class="n">files</span> <span class="o">=</span> <span class="p">[</span><span class="n">f</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">files</span> <span class="k">if</span> <span class="s1">&#39;.flac&#39;</span> <span class="ow">in</span> <span class="n">f</span><span class="o">.</span><span class="n">suffix</span><span class="p">]</span>
-
-<span class="n">f</span> <span class="o">=</span> <span class="n">partial</span><span class="p">(</span><span class="n">audio2npy</span><span class="p">,</span> <span class="n">path_save</span><span class="o">=</span><span class="n">train_path_npy</span><span class="p">,</span> <span class="n">sample_rate</span><span class="o">=</span><span class="n">sample_rate</span><span class="p">)</span>
-<span class="n">parallel</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="n">files</span><span class="p">)</span>
-
-<span class="n">files</span> <span class="o">=</span> <span class="n">test_path</span><span class="o">.</span><span class="n">ls</span><span class="p">()</span>
-<span class="n">files</span> <span class="o">=</span> <span class="p">[</span><span class="n">f</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">files</span> <span class="k">if</span> <span class="s1">&#39;.flac&#39;</span> <span class="ow">in</span> <span class="n">f</span><span class="o">.</span><span class="n">suffix</span><span class="p">]</span>
-<span class="n">f</span> <span class="o">=</span> <span class="n">partial</span><span class="p">(</span><span class="n">audio2npy</span><span class="p">,</span> <span class="n">path_save</span><span class="o">=</span><span class="n">test_path_npy</span><span class="p">,</span> <span class="n">sample_rate</span><span class="o">=</span><span class="n">sample_rate</span><span class="p">)</span>
-<span class="n">parallel</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="n">files</span><span class="p">)</span>
-</pre></div>
-
-</div>
-</div>
-</div>
 </div>