diff --git a/examples/nas/fbnet/__init__.py b/examples/nas/fbnet/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/examples/nas/fbnet/datasets.py b/examples/nas/fbnet/datasets.py
deleted file mode 100644
index 8e55c06a194..00000000000
--- a/examples/nas/fbnet/datasets.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import cv2
-import os
-import random
-import sys
-
-import numpy as np
-
-from torch.utils import data
-from torch.utils.data import DataLoader
-
-
-def flip(img, annotation):
-    # parse
-    img = np.fliplr(img).copy()
-    h, w = img.shape[:2]
-    x_min, y_min, x_max, y_max = annotation[0:4]
-    landmark_x = annotation[4::2]
-    landmark_y = annotation[4 + 1 :: 2]
-
-    bbox = np.array([w - x_max, y_min, w - x_min, y_max])
-    for i in range(len(landmark_x)):
-        landmark_x[i] = w - landmark_x[i]
-
-    new_annotation = list()
-    new_annotation.append(x_min)
-    new_annotation.append(y_min)
-    new_annotation.append(x_max)
-    new_annotation.append(y_max)
-
-    for i in range(len(landmark_x)):
-        new_annotation.append(landmark_x[i])
-        new_annotation.append(landmark_y[i])
-
-    return img, new_annotation
-
-
-def channel_shuffle(img, annotation):
-    if img.shape[2] == 3:
-        ch_arr = [0, 1, 2]
-        np.random.shuffle(ch_arr)
-        img = img[..., ch_arr]
-    return img, annotation
-
-
-def random_noise(img, annotation, limit=[0, 0.2], p=0.5):
-    if random.random() < p:
-        H, W = img.shape[:2]
-        noise = np.random.uniform(limit[0], limit[1], size=(H, W)) * 255
-
-        img = img + noise[:, :, np.newaxis] * np.array([1, 1, 1])
-        img = np.clip(img, 0, 255).astype(np.uint8)
-
-    return img, annotation
-
-
-def random_brightness(img, annotation, brightness=0.3):
-    alpha = 1 + np.random.uniform(-brightness, brightness)
-    img = alpha * img
-    img = np.clip(img, 0, 255).astype(np.uint8)
-    return img, annotation
-
-
-def random_contrast(img, annotation, contrast=0.3):
-    # rgb to gray (YCbCr)
-    coef = np.array([[[0.114, 0.587, 0.299]]])
-    alpha = 1.0 + np.random.uniform(-contrast, contrast)
-    gray = img * coef
-    gray = (3.0 * (1.0 - alpha) / gray.size) * np.sum(gray)
-    img = alpha * img + gray
-    img = np.clip(img, 0, 255).astype(np.uint8)
-    return img, annotation
-
-
-def random_saturation(img, annotation, saturation=0.5):
-    coef = np.array([[[0.299, 0.587, 0.114]]])
-    alpha = np.random.uniform(-saturation, saturation)
-    gray = img * coef
-    gray = np.sum(gray, axis=2, keepdims=True)
-    img = alpha * img + (1.0 - alpha) * gray
-    img = np.clip(img, 0, 255).astype(np.uint8)
-    return img, annotation
-
-
-def random_hue(image, annotation, hue=0.5):
-    h = int(np.random.uniform(-hue, hue) * 180)
-
-    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
-    hsv[:, :, 0] = (hsv[:, :, 0].astype(int) + h) % 180
-    image = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
-    return image, annotation
-
-
-def scale(img, annotation):
-    f_xy = np.random.uniform(-0.4, 0.8)
-    origin_h, origin_w = img.shape[:2]
-
-    bbox = annotation[0:4]
-    landmark_x = annotation[4::2]
-    landmark_y = annotation[4 + 1 :: 2]
-
-    h, w = int(origin_h * f_xy), int(origin_w * f_xy)
-    image = cv2.resize(img, (h, w)).astype(np.uint8)
-
-    new_annotation = list()
-    for i in range(len(bbox)):
-        bbox[i] = bbox[i] * f_xy
-        new_annotation.append(bbox[i])
-
-    for i in range(len(landmark_x)):
-        landmark_x[i] = landmark_x[i] * f_xy
-        landmark_y[i] = landmark_y[i] * f_xy
-        new_annotation.append(landmark_x[i])
-        new_annotation.append(landmark_y[i])
-
-    return image, new_annotation
-
-
-def rotate(img, annotation, alpha=30):
-
-    bbox = annotation[0:4]
-    landmark_x = annotation[4::2]
-    landmark_y = annotation[4 + 1 :: 2]
-    center = ((bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2)
-    rot_mat = cv2.getRotationMatrix2D(center, alpha, 1)
-    img_rotated_by_alpha = cv2.warpAffine(img, rot_mat, (img.shape[1], img.shape[0]))
-
-    point_x = [bbox[0], bbox[2], bbox[0], bbox[2]]
-    point_y = [bbox[1], bbox[3], bbox[3], bbox[1]]
-
-    new_point_x = list()
-    new_point_y = list()
-    for (x, y) in zip(landmark_x, landmark_y):
-        new_point_x.append(rot_mat[0][0] * x + rot_mat[0][1] * y + rot_mat[0][2])
-        new_point_y.append(rot_mat[1][0] * x + rot_mat[1][1] * y + rot_mat[1][2])
-
-    new_annotation = list()
-    new_annotation.append(min(new_point_x))
-    new_annotation.append(min(new_point_y))
-    new_annotation.append(max(new_point_x))
-    new_annotation.append(max(new_point_y))
-
-    for (x, y) in zip(landmark_x, landmark_y):
-        new_annotation.append(rot_mat[0][0] * x + rot_mat[0][1] * y + rot_mat[0][2])
-        new_annotation.append(rot_mat[1][0] * x + rot_mat[1][1] * y + rot_mat[1][2])
-
-    return img_rotated_by_alpha, new_annotation
-
-
-class PFLDDatasets(data.Dataset):
-    def __init__(
-        self, file_list, transforms=None, data_root="", img_size=112
-    ):
-        self.line = None
-        self.path = None
-        self.img_size = img_size
-        self.landmarks = None
-        self.filenames = None
-        self.euler_angle = None
-        self.data_root = data_root
-        self.transforms = transforms
-        with open(file_list, "r") as f:
-            self.lines = f.readlines()
-
-    def __getitem__(self, index):
-        self.line = self.lines[index].strip().split()
-        # load image
-        if self.data_root:
-            self.img = cv2.imread(os.path.join(self.data_root, self.line[0]))
-        else:
-            self.img = cv2.imread(self.line[0])
-        # resize
-        self.img = cv2.resize(self.img, (self.img_size, self.img_size))
-        # obtain gt labels
-        self.landmark = np.asarray(self.line[1 : 106 * 2 + 1], dtype=np.float32)
-        self.euler_angle = np.asarray(self.line[106 * 2 + 1:], dtype=np.float32)
-
-        # augmentation
-        if self.transforms:
-            self.img = self.transforms(self.img)
-        return self.img, self.landmark, self.euler_angle
-
-    def __len__(self):
-        return len(self.lines)
diff --git a/examples/nas/fbnet/lib/__init__.py b/examples/nas/fbnet/lib/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/examples/nas/fbnet/lib/builder.py b/examples/nas/fbnet/lib/builder.py
deleted file mode 100644
index 431044a20a8..00000000000
--- a/examples/nas/fbnet/lib/builder.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-from __future__ import absolute_import, division, print_function
-
-import gc
-import os
-import time
-import timeit
-import torch
-
-import numpy as np
-
-from lib.ops import PRIMITIVES
-from lib.utils import count_model_flops, model_init
-
-LUT_FILE = "lut.npy"
-
-
-def supernet_sample(model, state_dict, sampled_arch=[], lookup_table=None):
-    """Initialize the searched sub-model from supernet."""
-    replace = list()
-    stage_names = [stage_name for stage_name in lookup_table.layer_num]
-    stage_lnum = [
-        lookup_table.layer_num[stage_name] for stage_name in stage_names
-    ]
-
-    if sampled_arch:
-        layer_id = 0
-        for i, stage_name in enumerate(stage_names):
-            ops_names = [
-                op_name for op_name in lookup_table.lut_ops[stage_name]
-            ]
-            for j in range(stage_lnum[i]):
-                searched_op = sampled_arch[layer_id]
-                layer_id += 1
-                op_i = ops_names.index(searched_op)
-                replace.append(
-                    ["nas_stages_{}.{}.".format(i, j), ".op.", ".ops.{}.".format(op_i)]
-                )
-
-    model_init(model, state_dict, replace=replace)
-
-
-def sub_arch_sample(model, lookup_table, logger):
-    """ Sample the ops names for the sub-network."""
-    stage_names = [stage_name for stage_name in lookup_table.layer_num]
-    stage_lnum = [
-        lookup_table.layer_num[stage_name] for stage_name in stage_names
-    ]
-
-    # get the op idx in each layer
-    arch_idxs = list()
-    layer_id = 0
-    for theta_param in get_parameters(model, [BLOCK_THETA], mode='include'):
-        theta_np = theta_param.detach().cpu().numpy()
-        op_idx = np.argmax(theta_np)
-        arch_idxs.append(op_idx)
-        logger.info("layer {}: {}, index: {}".format(layer_id, theta_np, op_idx))
-        layer_id += 1
-
-    # get the arch_sample
-    arch_operations = list()
-    layer_id = 0
-    for i, stage_name in enumerate(stage_names):
-        ops_names = [
-            op_name for op_name in lookup_table.lut_ops[stage_name]
-        ]
-        for j in range(stage_lnum[i]):
-            searched_op = ops_names[arch_idxs[layer_id]]
-            arch_operations.append(searched_op)
-            layer_id += 1
-
-    logger.info(arch_operations)
-    return arch_operations
-
-
-class LookUpTable:
-    """Build look-up table for NAS."""
-
-    def __init__(self, config):
-        # definition of search blocks and space
-        self.search_space = config.search_space
-        # layers for NAS
-        self.cnt_layers = len(self.search_space["input_shape"])
-        # constructors for each operation
-        self.lut_ops = {
-            stage_name: {
-                op_name: PRIMITIVES[op_name]
-                for op_name in self.search_space["stages"][stage_name]["ops"]
-            } for stage_name in self.search_space["stages"]
-        }
-        self.layer_num = {
-            stage_name: self.search_space["stages"][stage_name]["layer_num"]
-            for stage_name in self.search_space["stages"]
-        }
-
-        # arguments for the ops constructors, input_shapes just for convinience
-        (
-            self.layers_params,
-            self.layers_input_shapes,
-        ) = self._generate_layers_params()
-
-        # lookup_table
-        self.perf_metric = config.perf_metric
-
-        if config.lut_en:
-            self.lut_perf = None
-            self.lut_file = os.path.join(config.lut_path, LUT_FILE)
-            if config.lut_load:
-                self._create_from_file()
-            else:
-                self._create_from_operations()
-
-    def _generate_layers_params(self):
-        """Generate basic params for different layers."""
-        # layers_params are : c_in, c_out, stride, fm_size
-        layers_params = [
-            [
-                self.search_space["input_shape"][layer_id][0],
-                self.search_space["channel_size"][layer_id],
-                self.search_space["strides"][layer_id],
-                self.search_space["fm_size"][layer_id],
-            ]
-            for layer_id in range(self.cnt_layers)
-        ]
-
-        # layers_input_shapes are (C_in, input_w, input_h)
-        layers_input_shapes = self.search_space["input_shape"]
-
-        return layers_params, layers_input_shapes
-
-    def _create_from_operations(self, cnt_of_runs=200):
-        """Create performance cost for each op."""
-        if self.perf_metric == "latency":
-            self.lut_perf = self._calculate_latency(cnt_of_runs)
-        elif self.perf_metric == "flops":
-            self.lut_perf = self._calculate_flops()
-
-        self._write_lookup_table_to_file()
-
-    def _calculate_flops(self, eps=0.001):
-        """FLOPs cost."""
-        flops_table_layer_by_ops = [{} for i in range(self.cnt_layers)]
-        layer_id = 0
-
-        for stage_name in self.lut_ops:
-            stage_ops = self.lut_ops[stage_name]
-            ops_num = self.layer_num[stage_name]
-
-            for _ in range(ops_num):
-                for op_name in stage_ops:
-                    layer_param = self.layers_params[layer_id]
-                    key_params = {"fm_size": layer_param[3]}
-                    op = stage_ops[op_name](*layer_param[0:3], **key_params)
-
-                    # measured in micro-second
-                    flops = count_model_flops(op, self.layers_input_shapes[layer_id])
-                    flops = eps if flops == 0.0 else flops
-                    flops_table_layer_by_ops[layer_id][op_name] = float(flops)
-                layer_id += 1
-
-        return flops_table_layer_by_ops
-
-    def _calculate_latency(self, cnt_of_runs):
-        """Latency cost."""
-        LATENCY_BATCH_SIZE = 1
-        latency_table_layer_by_ops = [{} for i in range(self.cnt_layers)]
-        layer_id = 0
-
-        for stage_name in self.lut_ops:
-            stage_ops = self.lut_ops[stage_name]
-            ops_num = self.layer_num[stage_name]
-
-            for _ in range(ops_num):
-                for op_name in stage_ops:
-                    layer_param = self.layers_params[layer_id]
-                    key_params = {"fm_size": layer_param[3]}
-                    op = stage_ops[op_name](*layer_param[0:3], **key_params)
-                    input_sample = torch.randn(
-                        (LATENCY_BATCH_SIZE, *self.layers_input_shapes[layer_id])
-                    )
-                    globals()["op"], globals()["input_sample"] = op, input_sample
-                    total_time = timeit.timeit(
-                        "output = op(input_sample)",
-                        setup="gc.enable()",
-                        globals=globals(),
-                        number=cnt_of_runs,
-                    )
-                    # measured in micro-second
-                    latency_table_layer_by_ops[layer_id][op_name] = (
-                        total_time / cnt_of_runs / LATENCY_BATCH_SIZE * 1e6
-                    )
-                layer_id += 1
-
-        return latency_table_layer_by_ops
-
-    def _write_lookup_table_to_file(self):
-        """Save lut as numpy file."""
-        np.save(self.lut_file, self.lut_perf)
-
-    def _create_from_file(self):
-        """Load numpy file."""
-        self.lut_perf = np.load(self.lut_file, allow_pickle=True)
diff --git a/examples/nas/fbnet/lib/config.py b/examples/nas/fbnet/lib/config.py
deleted file mode 100644
index cf332bb5a18..00000000000
--- a/examples/nas/fbnet/lib/config.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-from __future__ import absolute_import, division, print_function
-
-import os
-
-import numpy as np
-
-LUT_PATH = "lut"
-
-
-search_space = {
-    # multi-stage definition for candidate layers
-    "stages": {
-        "stage_0": {
-            "ops": [
-                "mb_k3_res",
-                "mb_k3_e2_res",
-                "mb_k3_res_d3",
-                "mb_k5_res",
-                "mb_k5_e2_res",
-                "sep_k3",
-                "sep_k5",
-                "gh_k3",
-                "gh_k5",
-            ],
-            "layer_num": 2,
-        },
-
-        "stage_1": {
-            "ops": [
-                "mb_k3_e2_res",
-                "mb_k3_e4_res",
-                "mb_k3_e2_res_se",
-                "mb_k3_res_d3",
-                "mb_k5_res",
-                "mb_k5_e2_res",
-                "mb_k5_res_se",
-                "mb_k5_e2_res_se",
-                "gh_k5",
-            ],
-            "layer_num": 3,
-        },
-    },
-
-    # necessary information of layers for NAS
-    "input_shape": [
-        (32, 14, 14),
-        (32, 14, 14),
-        (32, 14, 14),
-        (64, 7, 7),
-        (64, 7, 7),
-    ],
-    "channel_size": [32, 32, 64, 64, 64],
-    "strides": [1, 1, 2, 1, 1],
-    "fm_size": [14, 14, 7, 7, 7],
-}
-
-
-class NASConfig:
-
-    def __init__(
-        self,
-        perf_metric='flops',
-        lut_load=False,
-        arch_search=True,
-        model_dir=None,
-        nas_lr=0.01,
-        nas_weight_decay=5e-4,
-        mode='mul',
-        alpha=0.18,
-        beta=0.6,
-        start_epoch=50,
-        init_temperature=5.0,
-        exp_anneal_rate=np.exp(-0.045),
-        search_space=None,
-    ):
-        # LUT of performance metric
-        self.perf_metric = perf_metric
-        assert perf_metric in ['flops', 'latency'], "perf_metric should be ['flops', 'latency']"
-        # wether load or create lut file
-        self.lut_load = lut_load
-        self.arch_search = arch_search
-        # necessary dirs
-        self.lut_en = model_dir is not None
-        if self.lut_en:
-            self.model_dir = model_dir
-            os.makedirs(model_dir, exist_ok=True)
-            self.lut_path = os.path.join(model_dir, LUT_PATH)
-            os.makedirs(self.lut_path, exist_ok=True)
-        # NAS learning setting
-        self.nas_lr = nas_lr
-        self.nas_weight_decay = nas_weight_decay
-        # hardware-aware loss setting
-        self.mode = mode
-        self.alpha = alpha
-        self.beta = beta
-        # NAS training setting
-        self.start_epoch = start_epoch
-        self.init_temperature = init_temperature
-        self.exp_anneal_rate = exp_anneal_rate
-        # definition of search blocks and space
-        self.search_space = search_space
diff --git a/examples/nas/fbnet/lib/ops.py b/examples/nas/fbnet/lib/ops.py
deleted file mode 100644
index 78add840ae8..00000000000
--- a/examples/nas/fbnet/lib/ops.py
+++ /dev/null
@@ -1,377 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-from __future__ import absolute_import, division, print_function
-
-import torch
-
-import numpy as np
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-# Basic primitives as the network path
-PRIMITIVES = {
-    "skip": lambda c_in, c_out, stride, **kwargs: Identity(
-        c_in, c_out, stride, **kwargs
-    ),
-    "conv1x1": lambda c_in, c_out, stride, **kwargs: Conv1x1(
-        c_in, c_out, stride, **kwargs
-    ),
-    "depth_conv": lambda c_in, c_out, stride, **kwargs: DepthConv(
-        c_in, c_out, stride, **kwargs
-    ),
-    "sep_k3": lambda c_in, c_out, stride, **kwargs: SeparableConv(
-        c_in, c_out, stride, **kwargs
-    ),
-    "sep_k5": lambda c_in, c_out, stride, **kwargs: SeparableConv(
-        c_in, c_out, stride, kernel=5, **kwargs
-    ),
-    "gh_k3": lambda c_in, c_out, stride, **kwargs: GhostModule(
-        c_in, c_out, stride, **kwargs
-    ),
-    "gh_k5": lambda c_in, c_out, stride, **kwargs: GhostModule(
-        c_in, c_out, stride, kernel=5, **kwargs
-    ),
-    "mb_k3": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=1, **kwargs
-    ),
-    "mb_k3_e2": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=2, **kwargs
-    ),
-    "mb_k3_e4": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=4, **kwargs
-    ),
-    "mb_k3_res": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=1, res=True, **kwargs
-    ),
-    "mb_k3_e2_res": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=2, res=True, **kwargs
-    ),
-    "mb_k3_e4_res": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=4, res=True, **kwargs
-    ),
-    "mb_k3_d2": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=2, res=False, dilation=2, **kwargs
-    ),
-    "mb_k3_d3": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=2, res=False, dilation=3, **kwargs
-    ),
-    "mb_k3_res_d2": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=2, res=True, dilation=2, **kwargs
-    ),
-    "mb_k3_res_d3": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=2, res=True, dilation=3, **kwargs
-    ),
-    "mb_k3_res_se": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=1, res=True, dilation=1, se=True, **kwargs
-    ),
-    "mb_k3_e2_res_se": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=2, res=True, dilation=1, se=True, **kwargs
-    ),
-    "mb_k3_e4_res_se": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=3, expand=4, res=True, dilation=1, se=True, **kwargs
-    ),
-    "mb_k5": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=5, expand=1, **kwargs
-    ),
-    "mb_k5_e2": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=5, expand=2, **kwargs
-    ),
-    "mb_k5_res": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=5, expand=1, res=True, **kwargs
-    ),
-    "mb_k5_e2_res": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=5, expand=2, res=True, **kwargs
-    ),
-    "mb_k5_res_se": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=5, expand=1, res=True, dilation=1, se=True, **kwargs
-    ),
-    "mb_k5_e2_res_se": lambda c_in, c_out, stride, **kwargs: MBBlock(
-        c_in, c_out, stride, kernel=5, expand=2, res=True, dilation=1, se=True, **kwargs
-    ),
-}
-
-
-def conv_bn(inp, oup, kernel, stride, padding=1, groups=1):
-    return nn.Sequential(
-        nn.Conv2d(inp, oup, kernel, stride, padding, groups=groups, bias=False),
-        nn.BatchNorm2d(oup),
-        nn.ReLU(inplace=True),
-    )
-
-
-class SeparableConv(nn.Module):
-    """Separable convolution."""
-
-    def __init__(self, in_ch, out_ch, stride=1, kernel=3, fm_size=7):
-        super(SeparableConv, self).__init__()
-        assert stride in [1, 2], "stride should be in [1, 2]"
-        padding = kernel // 2
-
-        self.conv = nn.Sequential(
-            conv_bn(in_ch, in_ch, kernel, stride, padding=padding, groups=in_ch),
-            conv_bn(in_ch, out_ch, 1, 1, padding=0),
-        )
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-
-class Conv1x1(nn.Module):
-    """1x1 convolution."""
-
-    def __init__(self, in_ch, out_ch, stride=1, kernel=1, fm_size=7):
-        super(Conv1x1, self).__init__()
-        assert stride in [1, 2], "stride should be in [1, 2]"
-        padding = kernel // 2
-
-        self.conv = nn.Sequential(
-            nn.Conv2d(in_ch, out_ch, kernel, stride, padding),
-            nn.ReLU(inplace=True),
-        )
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-class DepthConv(nn.Module):
-    """depth convolution."""
-
-    def __init__(self, in_ch, out_ch, stride=1, kernel=3, fm_size=7):
-        super(DepthConv, self).__init__()
-        assert stride in [1, 2], "stride should be in [1, 2]"
-        padding = kernel // 2
-
-        self.conv = nn.Sequential(
-            nn.Conv2d(in_ch, in_ch, kernel, stride, padding, groups=in_ch),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(in_ch, out_ch, 1, 1, 0),
-            nn.ReLU(inplace=True),
-        )
-
-    def forward(self, x):
-        return self.conv(x)
-
-
-class GhostModule(nn.Module):
-    """Gost module."""
-    def __init__(self, in_ch, out_ch, stride=1, kernel=3, fm_size=7):
-        super(GhostModule, self).__init__()
-        mid_ch = out_ch // 2
-        self.primary_conv = conv_bn(in_ch, mid_ch, 1, stride, padding=0)
-        self.cheap_operation = conv_bn(
-            mid_ch, mid_ch, kernel, 1, kernel // 2, mid_ch
-        )
-
-    def forward(self, x):
-        x1 = self.primary_conv(x)
-        x2 = self.cheap_operation(x1)
-        return torch.cat([x1, x2], dim=1)
-
-
-class StemBlock(nn.Module):
-    def __init__(self, in_ch=3, init_ch=32, bottleneck=True):
-        super(StemBlock, self).__init__()
-        self.stem_1 = conv_bn(in_ch, init_ch, 3, 2, 1)
-        mid_ch = int(init_ch // 2) if bottleneck else init_ch
-        self.stem_2a = conv_bn(init_ch, mid_ch, 1, 1, 0)
-        self.stem_2b = SeparableConv(mid_ch, init_ch, 2, 1)
-        self.stem_2p = nn.MaxPool2d(kernel_size=2, stride=2)
-        self.stem_3 = conv_bn(init_ch * 2, init_ch, 1, 1, 0)
-
-    def forward(self, x):
-        stem_1_out = self.stem_1(x)
-
-        stem_2a_out = self.stem_2a(stem_1_out)
-        stem_2b_out = self.stem_2b(stem_2a_out)
-
-        stem_2p_out = self.stem_2p(stem_1_out)
-
-        out = self.stem_3(torch.cat((stem_2b_out, stem_2p_out), 1))
-        return out, stem_1_out
-
-
-class Identity(nn.Module):
-    """ Identity module."""
-
-    def __init__(self, in_ch, out_ch, stride=1, fm_size=7):
-        super(Identity, self).__init__()
-        self.conv = (
-            conv_bn(in_ch, out_ch, kernel=1, stride=stride, padding=0)
-            if in_ch != out_ch or stride != 1
-            else None
-        )
-
-    def forward(self, x):
-        if self.conv:
-            out = self.conv(x)
-        else:
-            out = x
-            # Add dropout to avoid overfit on Identity (PDARTS)
-            out = nn.functional.dropout(out, p=0.5)
-        return out
-
-
-class Hsigmoid(nn.Module):
-    """Hsigmoid activation function."""
-
-    def __init__(self, inplace=True):
-        super(Hsigmoid, self).__init__()
-        self.inplace = inplace
-
-    def forward(self, x):
-        return F.relu6(x + 3.0, inplace=self.inplace) / 6.0
-
-
-class eSEModule(nn.Module):
-    """ The improved SE Module."""
-
-    def __init__(self, channel, fm_size=7, se=True):
-        super(eSEModule, self).__init__()
-        self.se = se
-
-        if self.se:
-            self.avg_pool = nn.Conv2d(
-                channel, channel, fm_size, 1, 0, groups=channel, bias=False
-            )
-            self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
-            self.hsigmoid = Hsigmoid()
-
-    def forward(self, x):
-        if self.se:
-            input = x
-            x = self.avg_pool(x)
-            x = self.fc(x)
-            x = self.hsigmoid(x)
-            return input * x
-        else:
-            return x
-
-
-class ChannelShuffle(nn.Module):
-    """Procedure: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]."""
-
-    def __init__(self, groups):
-        super(ChannelShuffle, self).__init__()
-        self.groups = groups
-
-    def forward(self, x):
-        if self.groups == 1:
-            return x
-
-        N, C, H, W = x.size()
-        g = self.groups
-        assert C % g == 0, "Incompatible group size {} for input channel {}".format(
-            g, C
-        )
-        return (
-            x.view(N, g, int(C // g), H, W)
-            .permute(0, 2, 1, 3, 4)
-            .contiguous()
-            .view(N, C, H, W)
-        )
-
-
-class MBBlock(nn.Module):
-    """The Inverted Residual Block, with channel shuffle or eSEModule."""
-
-    def __init__(
-        self,
-        in_ch,
-        out_ch,
-        stride=1,
-        kernel=3,
-        expand=1,
-        res=False,
-        dilation=1,
-        se=False,
-        fm_size=7,
-        group=1,
-        mid_ch=-1,
-    ):
-        super(MBBlock, self).__init__()
-        assert stride in [1, 2], "stride should be in [1, 2]"
-        assert kernel in [3, 5], "kernel size should be in [3, 5]"
-        assert dilation in [1, 2, 3, 4], "dilation should be in [1, 2, 3, 4]"
-        assert group in [1, 2], "group should be in [1, 2]"
-
-        self.use_res_connect = res and (stride == 1)
-        padding = kernel // 2 + (dilation - 1)
-        mid_ch = mid_ch if mid_ch > 0 else (in_ch * expand)
-
-        # Basic Modules
-        conv_layer = nn.Conv2d
-        norm_layer = nn.BatchNorm2d
-        activation_layer = nn.ReLU
-        channel_suffle = ChannelShuffle
-        se_layer = eSEModule
-
-        self.ir_block = nn.Sequential(
-            # pointwise convolution
-            conv_layer(in_ch, mid_ch, 1, 1, 0, bias=False, groups=group),
-            norm_layer(mid_ch),
-            activation_layer(inplace=True),
-            # channel shuffle if necessary
-            channel_suffle(group),
-            # depthwise convolution
-            conv_layer(
-                mid_ch,
-                mid_ch,
-                kernel,
-                stride,
-                padding=padding,
-                dilation=dilation,
-                groups=mid_ch,
-                bias=False,
-            ),
-            norm_layer(mid_ch),
-            # eSEModule if necessary
-            se_layer(mid_ch, fm_size, se),
-            activation_layer(inplace=True),
-            # pointwise convolution
-            conv_layer(mid_ch, out_ch, 1, 1, 0, bias=False, groups=group),
-            norm_layer(out_ch),
-        )
-
-    def forward(self, x):
-        if self.use_res_connect:
-            return x + self.ir_block(x)
-        else:
-            return self.ir_block(x)
-
-
-class SingleOperation(nn.Module):
-    """Single operation for sampled path.
-    """
-
-    def __init__(
-        self, layers_params, stage_ops, sampled_op='', io_ch=[]
-    ):
-        super(SingleOperation, self).__init__()
-
-        if io_ch:
-            assert len(io_ch) == 2, "io_ch should have two elements"
-            layers_params[0:2] = io_ch
-        key_params = {"fm_size": layers_params[3]}
-        ops_names = [op_name for op_name in stage_ops]
-        sampled_op = sampled_op if sampled_op else ops_names[0]
-
-        # define the single op
-        self.op = stage_ops[sampled_op](*layers_params[0:3], **key_params)
-
-    def forward(self, x):
-        return self.op(x)
-
-
-def choice_blocks(layers_params, stage_ops):
-    """ Create list of layer candidates for NNI one-shot NAS"""
-    ops_names = [op_name for op_name in stage_ops]
-    key_params = {"fm_size": layers_params[3]}
-
-    op_list = [
-        stage_ops[op_name](*layers_params[0:3], **key_params)
-        for op_name in ops_names
-    ]
-    return op_list
diff --git a/examples/nas/fbnet/lib/subnet.py b/examples/nas/fbnet/lib/subnet.py
deleted file mode 100644
index dc9ad641a66..00000000000
--- a/examples/nas/fbnet/lib/subnet.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-from __future__ import absolute_import, division, print_function
-
-import math
-import torch
-import torch.nn as nn
-
-from lib.ops import (
-    MBBlock,
-    SeparableConv,
-    SingleOperation,
-    StemBlock,
-    conv_bn,
-)
-
-INIT_CH = 16
-
-
-class PFLDInference(nn.Module):
-    def __init__(self, lookup_table, sampled_ops, num_points=98):
-        super(PFLDInference, self).__init__()
-
-        stage_names = [stage_name for stage_name in lookup_table.layer_num]
-        stage_lnum = [
-            lookup_table.layer_num[stage_name] for stage_name in stage_names
-        ]
-        self.stem = StemBlock(init_ch=INIT_CH, bottleneck=False)
-
-        self.block4_1 = MBBlock(INIT_CH, 32, stride=2, mid_ch=32)
-        self.nas_stages_0 = nn.ModuleList(
-            [
-                SingleOperation(
-                    lookup_table.layers_params[layer_id],
-                    lookup_table.lut_ops[stage_names[0]],
-                    sampled_ops[layer_id],
-                )
-                for layer_id in range(stage_lnum[0])
-            ]
-        )
-
-        self.nas_stages_1 = nn.ModuleList(
-            [
-                SingleOperation(
-                    lookup_table.layers_params[layer_id],
-                    lookup_table.lut_ops[stage_names[1]],
-                    sampled_ops[layer_id],
-                )
-                for layer_id in range(
-                    stage_lnum[0], stage_lnum[0] + stage_lnum[1]
-                )
-            ]
-        )
-
-        self.avg_pool1 = nn.Conv2d(
-            INIT_CH, INIT_CH, 9, 8, 1, groups=INIT_CH, bias=False
-        )
-        self.avg_pool2 = nn.Conv2d(32, 32, 3, 2, 1, groups=32, bias=False)
-
-        self.block6_1 = nn.Conv2d(96 + INIT_CH, 64, 1, 1, 0, bias=False)
-        self.block6_2 = MBBlock(64, 64, res=True, se=True, mid_ch=128)
-        self.block6_3 = SeparableConv(64, 128, 1)
-
-        self.conv7 = nn.Conv2d(128, 128, 7, 1, 0, groups=128, bias=False)
-        self.fc = nn.Conv2d(128, num_points * 2, 1, 1, 0, bias=True)
-
-    def forward(self, x):
-        # x: 3, 112, 112
-        x, y1 = self.stem(x)
-        out1 = x
-
-        x = self.block4_1(x)
-        for i, single_op in enumerate(self.nas_stages_0):
-            x = single_op(x)
-        y2 = x
-
-        for i, single_op in enumerate(self.nas_stages_1):
-            x = single_op(x)
-        y3 = x
-
-        y1 = self.avg_pool1(y1)
-        y2 = self.avg_pool2(y2)
-        multi_scale = torch.cat([y3, y2, y1], 1)
-
-        y = self.block6_1(multi_scale)
-        y = self.block6_2(y)
-        y = self.block6_3(y)
-        y = self.conv7(y)
-        landmarks = self.fc(y)
-
-        return landmarks, out1
-
-
-class AuxiliaryNet(nn.Module):
-    def __init__(self):
-        super(AuxiliaryNet, self).__init__()
-        self.conv1 = conv_bn(INIT_CH, 64, 3, 2)
-        self.conv2 = conv_bn(64, 64, 3, 1)
-        self.conv3 = conv_bn(64, 32, 3, 2)
-        self.conv4 = conv_bn(32, 64, 7, 1)
-        self.max_pool1 = nn.MaxPool2d(3)
-        self.fc1 = nn.Linear(64, 32)
-        self.fc2 = nn.Linear(32, 3)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = self.conv2(x)
-        x = self.conv3(x)
-        x = self.conv4(x)
-        x = self.max_pool1(x)
-        x = x.view(x.size(0), -1)
-        x = self.fc1(x)
-        x = self.fc2(x)
-
-        return x
diff --git a/examples/nas/fbnet/lib/supernet.py b/examples/nas/fbnet/lib/supernet.py
deleted file mode 100644
index c8e900e34b0..00000000000
--- a/examples/nas/fbnet/lib/supernet.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-from __future__ import absolute_import, division, print_function
-
-import math
-import torch
-import torch.nn as nn
-
-from lib.ops import (
-    MBBlock,
-    SeparableConv,
-    StemBlock,
-    choice_blocks,
-    conv_bn,
-)
-from nni.nas.pytorch import mutables
-from torch.nn import init
-
-INIT_CH = 16
-
-
-class PFLDInference(nn.Module):
-    def __init__(self, lookup_table, num_points=98, slice=4):
-        super(PFLDInference, self).__init__()
-
-        stage_names = [stage_name for stage_name in lookup_table.layer_num]
-        stage_lnum = [
-            lookup_table.layer_num[stage_name] for stage_name in stage_names
-        ]
-        self.stem = StemBlock(init_ch=INIT_CH, bottleneck=False)
-
-        self.block4_1 = MBBlock(INIT_CH, 32, stride=2, mid_ch=32)
-
-        stages_0 = [
-            mutables.LayerChoice(
-                choice_blocks(
-                    lookup_table.layers_params[layer_id],
-                    lookup_table.lut_ops[stage_names[0]],
-                )
-            )
-            for layer_id in range(stage_lnum[0])
-        ]
-        stages_1 = [
-            mutables.LayerChoice(
-                choice_blocks(
-                    lookup_table.layers_params[layer_id],
-                    lookup_table.lut_ops[stage_names[1]],
-                )
-            )
-            for layer_id in range(
-                stage_lnum[0], stage_lnum[0] + stage_lnum[1]
-            )
-        ]
-        blocks = stages_0 + stages_1
-        self.blocks = nn.Sequential(*blocks)
-
-        self.avg_pool1 = nn.Conv2d(
-            INIT_CH, INIT_CH, 9, 8, 1, groups=INIT_CH, bias=False
-        )
-        self.avg_pool2 = nn.Conv2d(32, 32, 3, 2, 1, groups=32, bias=False)
-
-        self.block6_1 = nn.Conv2d(96 + INIT_CH, 64, 1, 1, 0, bias=False)
-        self.block6_2 = MBBlock(64, 64, res=True, se=True, mid_ch=128)
-        self.block6_3 = SeparableConv(64, 128, 1)
-
-        self.conv7 = nn.Conv2d(128, 128, 7, 1, 0, groups=128, bias=False)
-        self.fc = nn.Conv2d(128, num_points * 2, 1, 1, 0, bias=True)
-
-        self.meta_layer = nn.Linear(num_points * 2 * slice, 1)
-        # init params
-        self.init_params()
-
-    def init_params(self):
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                init.kaiming_normal_(m.weight, mode='fan_out')
-                if m.bias is not None:
-                    init.constant_(m.bias, 0)
-            elif isinstance(m, nn.BatchNorm2d):
-                init.constant_(m.weight, 1)
-                init.constant_(m.bias, 0)
-            elif isinstance(m, nn.Linear):
-                init.normal_(m.weight, std=0.001)
-                if m.bias is not None:
-                    init.constant_(m.bias, 0)
-
-    def forward(self, x, temperature, perf_cost):
-        # x: 3, 112, 112
-        x, y1 = self.stem(x)
-        out1 = x
-
-        x = self.block4_1(x)
-        for i, block in enumerate(self.blocks):
-            x, perf_cost = block(x, temperature, perf_cost)
-            if i == 1:
-                y2 = x
-            elif i == 4:
-                y3 = x
-
-        y1 = self.avg_pool1(y1)
-        y2 = self.avg_pool2(y2)
-        multi_scale = torch.cat([y3, y2, y1], 1)
-
-        y = self.block6_1(multi_scale)
-        y = self.block6_2(y)
-        y = self.block6_3(y)
-        y = self.conv7(y)
-        landmarks = self.fc(y)
-
-        return landmarks, out1, perf_cost
-
-
-class AuxiliaryNet(nn.Module):
-    def __init__(self):
-        super(AuxiliaryNet, self).__init__()
-        self.conv1 = conv_bn(INIT_CH, 64, 3, 2)
-        self.conv2 = conv_bn(64, 64, 3, 1)
-        self.conv3 = conv_bn(64, 32, 3, 2)
-        self.conv4 = conv_bn(32, 64, 7, 1)
-        self.max_pool1 = nn.MaxPool2d(3)
-        self.fc1 = nn.Linear(64, 32)
-        self.fc2 = nn.Linear(32, 3)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = self.conv2(x)
-        x = self.conv3(x)
-        x = self.conv4(x)
-        x = self.max_pool1(x)
-        x = x.view(x.size(0), -1)
-        x = self.fc1(x)
-        x = self.fc2(x)
-
-        return x
diff --git a/examples/nas/fbnet/lib/utils.py b/examples/nas/fbnet/lib/utils.py
deleted file mode 100644
index 4923a897fb5..00000000000
--- a/examples/nas/fbnet/lib/utils.py
+++ /dev/null
@@ -1,226 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-from __future__ import absolute_import, division, print_function
-
-import os
-import torch
-
-from torch import nn
-from torch.autograd import Variable
-
-
-def count_model_flops(model=None, in_shape=(3, 112, 112), multiply_adds=False):
-    """Compute the flops of model."""
-    prods = {}
-
-    def save_hook(name):
-        def hook_per(self, input, output):
-            prods[name] = np.prod(input[0].shape)
-
-        return hook_per
-
-    list_1 = []
-
-    def simple_hook(self, input, output):
-        list_1.append(np.prod(input[0].shape))
-
-    list_2 = {}
-
-    def simple_hook2(self, input, output):
-        list_2["names"] = np.prod(input[0].shape)
-
-    list_conv = []
-
-    def conv_hook(self, input, output):
-        batch_size, input_channels, input_height, input_width = input[0].size()
-        output_channels, output_height, output_width = output[0].size()
-
-        kernel_ops = (
-            self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups)
-        )
-        bias_ops = 1 if self.bias is not None else 0
-
-        params = output_channels * (kernel_ops + bias_ops)
-        # flops = (kernel_ops * (2 if multiply_adds else 1) + bias_ops) * output_channels * output_height * output_width * batch_size
-
-        num_weight_params_non_zero = (self.weight.data != 0).float().sum()
-        num_weight_params_zero = (self.weight.data == 0).float().sum()
-        num_weight_params = num_weight_params_non_zero + num_weight_params_zero
-        if self.groups == 1:
-            ops = num_weight_params * (2 if multiply_adds else 1)
-        else:
-            multiplys = num_weight_params / self.groups
-            adds = multiplys - output_channels
-            ops = (multiplys + adds) if multiply_adds else adds
-        flops = (
-            (ops + bias_ops * output_channels)
-            * output_height
-            * output_width
-            * batch_size
-        )
-        list_conv.append(flops)
-
-    list_deconv = []
-
-    def deconv_hook(self, input, output):
-        batch_size, input_channels, input_height, input_width = input[0].size()
-        output_channels, output_height, output_width = output[0].size()
-
-        kernel_ops = self.kernel_size[0] * self.kernel_size[1] * self.in_channels
-        bias_ops = 1 if self.bias is not None else 0
-
-        params = output_channels * (kernel_ops + bias_ops)
-        # flops = (kernel_ops * (2 if multiply_adds else 1) + bias_ops) * output_channels * output_height * output_width * batch_size
-
-        num_weight_params_non_zero = (self.weight.data != 0).float().sum()
-        num_weight_params_zero = (self.weight.data == 0).float().sum()
-        num_weight_params = num_weight_params_non_zero + num_weight_params_zero
-        ops = num_weight_params * (2 if multiply_adds else 1)
-        flops = (
-            (ops + bias_ops * output_channels)
-            * output_height
-            * output_width
-            * batch_size
-        )
-        list_deconv.append(flops)
-
-    list_linear = []
-
-    def linear_hook(self, input, output):
-        batch_size = input[0].size(0) if input[0].dim() == 2 else 1
-
-        weight_ops = self.weight.nelement() * (2 if multiply_adds else 1)
-        bias_ops = self.bias.nelement() if self.bias is not None else 0
-
-        flops = batch_size * (weight_ops + bias_ops)
-        list_linear.append(flops)
-
-    list_bn = []
-
-    def bn_hook(self, input, output):
-        list_bn.append(input[0].nelement() * 2)
-
-    list_relu = []
-
-    def relu_hook(self, input, output):
-        list_relu.append(input[0].nelement())
-
-    list_pooling = []
-
-    def pooling_hook(self, input, output):
-        batch_size, input_channels, input_height, input_width = input[0].size()
-        output_channels, output_height, output_width = output[0].size()
-
-        kernel_ops = self.kernel_size * self.kernel_size
-        bias_ops = 0
-        params = 0
-        flops = (
-            (kernel_ops + bias_ops)
-            * output_channels
-            * output_height
-            * output_width
-            * batch_size
-        )
-
-        list_pooling.append(flops)
-
-    list_upsample = []
-    # For bilinear upsample
-    def upsample_hook(self, input, output):
-        batch_size, input_channels, input_height, input_width = input[0].size()
-        output_channels, output_height, output_width = output[0].size()
-
-        flops = output_height * output_width * output_channels * batch_size * 12
-        list_upsample.append(flops)
-
-    def foo(net):
-        childrens = list(net.children())
-        if not childrens:
-            if isinstance(net, torch.nn.Conv2d):
-                net.register_forward_hook(conv_hook)
-            if isinstance(net, torch.nn.ConvTranspose2d):
-                net.register_forward_hook(deconv_hook)
-            if isinstance(net, torch.nn.Linear):
-                net.register_forward_hook(linear_hook)
-            if isinstance(net, torch.nn.BatchNorm2d):
-                net.register_forward_hook(bn_hook)
-            if isinstance(net, torch.nn.ReLU):
-                net.register_forward_hook(relu_hook)
-            if isinstance(net, torch.nn.MaxPool2d) or isinstance(
-                net, torch.nn.AvgPool2d
-            ):
-                net.register_forward_hook(pooling_hook)
-            if isinstance(net, torch.nn.Upsample):
-                net.register_forward_hook(upsample_hook)
-            return
-        for c in childrens:
-            foo(c)
-
-    foo(model)
-    model.eval()
-    with torch.no_grad():
-        input = Variable(
-            torch.rand(3, in_shape[0], in_shape[1], in_shape[2]), requires_grad=False
-        )
-        out = model(input)
-    total_flops = (
-        sum(list_conv)
-        + sum(list_deconv)
-        + sum(list_linear)
-        + sum(list_bn)
-        + sum(list_relu)
-        + sum(list_pooling)
-        + sum(list_upsample)
-    )
-    # batchsize=3
-    del input, out
-    return total_flops / 3
-
-
-def model_init(model, state_dict, replace=[]):
-    """Initialize the model from state_dict."""
-    prefix = 'module.'
-    param_dict = dict()
-    for k, v in state_dict.items():
-        if k.startswith(prefix):
-            k = k[7:]
-        param_dict[k] = v
-
-    for k, (name, m) in enumerate(model.named_modules()):
-        if replace:
-            for layer_replace in replace:
-                assert len(layer_replace) == 3, "The elements should be three."
-                pre_scope, key, replace_key = layer_replace
-                if pre_scope in name:
-                    name = name.replace(key, replace_key)
-
-        # Copy the state_dict to current model
-        if (name+'.weight' in param_dict) or (name+'.running_mean' in param_dict):
-            if isinstance(m, nn.BatchNorm2d):
-                shape = m.running_mean.shape
-                if shape == param_dict[name+'.running_mean'].shape:
-                    print('Init OK with pretrained model: {}'.format(name))
-                    if m.weight is not None:
-                        m.weight.data = param_dict[name+'.weight']
-                        m.bias.data = param_dict[name+'.bias']
-                    m.running_mean = param_dict[name+'.running_mean']
-                    m.running_var = param_dict[name+'.running_var']
-                else:
-                    print('Init random: {}'.format(name))
-
-            elif isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
-                shape = m.weight.data.shape
-                if shape == param_dict[name+'.weight'].shape:
-                    print('Init OK with pretrained model: {}'.format(name))
-                    m.weight.data = param_dict[name+'.weight']
-                    if m.bias is not None:
-                        m.bias.data = param_dict[name+'.bias']
-                else:
-                    print('Init random: {}'.format(name))
-
-            elif isinstance(m, nn.ConvTranspose2d):
-                print('Init OK with pretrained model: {}'.format(name))
-                m.weight.data = param_dict[name+'.weight']
-                if m.bias is not None:
-                    m.bias.data = param_dict[name+'.bias']
diff --git a/examples/nas/fbnet/loss.py b/examples/nas/fbnet/loss.py
deleted file mode 100644
index 90605551d09..00000000000
--- a/examples/nas/fbnet/loss.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import math
-import torch
-from torch import nn
-
-import torch.nn.functional as F
-
-
-class PFLDLoss(nn.Module):
-    def __init__(self):
-        super(PFLDLoss, self).__init__()
-
-        self.w = 0.12
-        self.epsilon = 0.008
-        self.c = self.w * (1.0 - math.log(1.0 + self.w / self.epsilon))
-
-    def forward(self, targets, euler_angle_gts, angles, inputs):
-        x = targets - inputs
-        absolute_x = torch.abs(x)
-
-        weight_angle = torch.sum(1.5 - torch.cos(euler_angle_gts), axis=1)
-        pose_loss = F.smooth_l1_loss(angles, euler_angle_gts, reduction='mean')
-
-        losses = torch.where(
-            self.w > absolute_x,
-            self.w * torch.log(1.0 + absolute_x / self.epsilon),
-            absolute_x - self.c
-        )
-        sum_losses = torch.sum(losses, axis=1)
-        loss = torch.mean(weight_angle * sum_losses, axis=0)
-
-        return loss + 0.1 * pose_loss, loss
diff --git a/examples/nas/fbnet/train.py b/examples/nas/fbnet/train.py
deleted file mode 100644
index a4cd48b7f25..00000000000
--- a/examples/nas/fbnet/train.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import argparse
-import logging
-import os
-import torch
-import torchvision
-
-import numpy as np
-
-from datasets import PFLDDatasets
-from loss import PFLDLoss
-from lib.config import NASConfig, search_space
-from lib.builder import LookUpTable, supernet_sample
-from nni.algorithms.nas.pytorch.fbnet.trainer import FBNetTrainer
-from torch.utils.data import DataLoader
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
-def main(args):
-
-    logging.basicConfig(
-        format="[%(asctime)s] [p%(process)s] [%(pathname)s:%(lineno)d] [%(levelname)s] %(message)s",
-        level=logging.INFO,
-        handlers=[
-            logging.FileHandler(args.log_file, mode="w"),
-            logging.StreamHandler(),
-        ],
-    )
-
-    # print the information of arguments
-    for arg in vars(args):
-        s = arg + ": " + str(getattr(args, arg))
-        logging.info(s)
-
-    # for 106 landmarks
-    num_points = 106
-    # list of device ids, and the number of workers for data loading
-    device_ids = [int(id) for id in args.dev_id.split(",")]
-    dev_num = len(device_ids)
-    num_workers = 4 * dev_num
-
-    # random seed
-    manual_seed = 1
-    np.random.seed(manual_seed)
-    torch.manual_seed(manual_seed)
-    torch.cuda.manual_seed_all(manual_seed)
-
-    if args.backbone == "supernet":
-        # import supernet for block-wise DNAS pre-training
-        from lib.supernet import PFLDInference, AuxiliaryNet
-    elif args.backbone == "subnet":
-        # import subnet for fine-tuning
-        from lib.subnet import PFLDInference, AuxiliaryNet
-    else:
-        raise ValueError("backbone is not implemented")
-
-    # the configuration for training control
-    nas_config = NASConfig(
-        arch_search=args.arch_search,
-        model_dir=args.snapshot,
-        nas_lr=args.theta_lr,
-        mode=args.mode,
-        alpha=args.alpha,
-        search_space=search_space,
-    )
-    # look-up table with basic information of search space, flops per block, etc.
-    lookup_table = LookUpTable(config=nas_config)
-    # the auxiliary-net of PFLD to predict the pose angle
-    auxiliarynet = AuxiliaryNet()
-
-    if "sub" in args.backbone:
-        check = torch.load(args.supernet, map_location=torch.device("cpu"))
-        sampled_arch = check["arch_sample"]
-        logging.info(sampled_arch)
-        # create subnet
-        pfld_backbone = PFLDInference(lookup_table, sampled_arch, num_points)
-
-        # pre-load the weights from pre-trained supernet
-        state_dict = check["pfld_backbone"]
-        supernet_sample(pfld_backbone, state_dict, sampled_arch, lookup_table)
-
-    else:
-        # create supernet
-        pfld_backbone = PFLDInference(lookup_table, num_points)
-
-    # main task loss
-    criterion = PFLDLoss()
-
-    # optimizer for weight train
-    if args.opt == 'adam':
-        optimizer = torch.optim.AdamW(
-            [
-                {"params": pfld_backbone.parameters()},
-                {"params": auxiliarynet.parameters()},
-            ],
-            lr=args.base_lr,
-            weight_decay=args.weight_decay,
-        )
-    elif args.opt == 'rms':
-        optimizer = torch.optim.RMSprop(
-            [
-                {"params": pfld_backbone.parameters()},
-                {"params": auxiliarynet.parameters()},
-            ],
-            lr=args.base_lr,
-            momentum=0.0,
-            weight_decay=args.weight_decay
-        )
-
-    # data argmentation and dataloader
-    transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
-    # the landmark dataset with 106 points is default used
-    train_dataset = PFLDDatasets(
-        os.path.join(args.data_root, 'train_data/list.txt'),
-        transform,
-        data_root=args.data_root,
-        img_size=args.img_size,
-    )
-    dataloader = DataLoader(
-        train_dataset,
-        batch_size=args.train_batchsize,
-        shuffle=True,
-        num_workers=num_workers,
-        pin_memory=True,
-        drop_last=False,
-    )
-
-    val_dataset = PFLDDatasets(
-        os.path.join(args.data_root, 'test_data/list.txt'),
-        transform,
-        data_root=args.data_root,
-        img_size=args.img_size,
-    )
-    val_dataloader = DataLoader(
-        val_dataset,
-        batch_size=args.val_batchsize,
-        shuffle=False,
-        num_workers=num_workers,
-        pin_memory=True,
-    )
-
-    # create the trainer, then search/finetune
-    trainer = FBNetTrainer(
-        pfld_backbone,
-        auxiliarynet,
-        optimizer,
-        criterion,
-        device,
-        device_ids,
-        nas_config,
-        lookup_table,
-        dataloader,
-        val_dataloader,
-        n_epochs=args.end_epoch,
-        logger=logging,
-    )
-    trainer.train()
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="FBNet for PFLD")
-    parser.add_argument("--backbone", default="supernet", type=str, choices=['supernet', 'subnet'])
-    parser.add_argument("--dev_id", dest="dev_id", default="0", type=str)
-    parser.add_argument("--opt", default="rms", type=str)
-    parser.add_argument("--base_lr", default=0.0001, type=int)
-    parser.add_argument("--weight-decay", "--wd", default=1e-6, type=float)
-    parser.add_argument("--img_size", default=112, type=int)
-    parser.add_argument("--theta-lr", "--tlr", default=0.01, type=float)
-    parser.add_argument("--mode", default="mul", type=str, choices=['mul', 'add'])
-    parser.add_argument("--alpha", default=0.18, type=float)
-    parser.add_argument("--supernet", default="", type=str, metavar="PATH")
-    parser.add_argument("--end_epoch", default=300, type=int)
-    parser.add_argument("--snapshot", default="models", type=str, metavar="PATH")
-    parser.add_argument("--log_file", default="train.log", type=str)
-    parser.add_argument("--data_root", default="/dataset", type=str, metavar="PATH")
-    parser.add_argument("--train_batchsize", default=256, type=int)
-    parser.add_argument("--val_batchsize", default=128, type=int)
-    parser.add_argument("--arch-search", "-as", action="store_true")
-    args = parser.parse_args()
-    args.snapshot = os.path.join(args.snapshot, args.backbone)
-    args.log_file = os.path.join(args.snapshot, "train_{}.log".format(args.backbone))
-    os.makedirs(args.snapshot, exist_ok=True)
-    return args
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
diff --git a/nni/algorithms/nas/pytorch/fbnet/__init__.py b/nni/algorithms/nas/pytorch/fbnet/__init__.py
deleted file mode 100644
index 9220ce40f65..00000000000
--- a/nni/algorithms/nas/pytorch/fbnet/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .mutator import FBNetMutator
-from .trainer import FBNetTrainer
diff --git a/nni/algorithms/nas/pytorch/fbnet/mutator.py b/nni/algorithms/nas/pytorch/fbnet/mutator.py
deleted file mode 100644
index 700be8c388b..00000000000
--- a/nni/algorithms/nas/pytorch/fbnet/mutator.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import torch
-from torch import nn as nn
-from torch.nn import functional as F
-import numpy as np
-
-from nni.nas.pytorch.base_mutator import BaseMutator
-from nni.nas.pytorch.mutables import LayerChoice
-
-
-class MixedOp(nn.Module):
-    """
-    This class is to instantiate and manage info of one LayerChoice.
-    It includes architecture weights and member functions operating the weights.
-    """
-
-    def __init__(self, mutable, latency):
-        """
-        Parameters
-        ----------
-        mutable : LayerChoice
-            A LayerChoice in user model
-        latency : List
-            performance cost for each op in mutable
-        """
-        super(MixedOp, self).__init__()
-        self.latency = latency
-        self.n_choices = len(mutable)
-        self.path_alpha = nn.Parameter(
-            torch.FloatTensor([1.0 / self.n_choices for i in range(self.n_choices)])
-        )
-
-    def get_path_alpha(self):
-        return self.path_alpha
-
-    def to_requires_grad(self):
-        self.path_alpha.requires_grad = True
-
-    def to_disable_grad(self):
-        self.path_alpha.requires_grad = False
-
-    def forward(self, mutable, x, temperature, perf_cost):
-        """
-        Define forward of LayerChoice.
-
-        Parameters
-        ----------
-        mutable : LayerChoice
-            this layer's mutable
-        x : tensor
-            inputs of this layer, only support one input
-        temperature : float32
-            the temperature for gumbel softmax
-        perf_cost : tensor
-            accumulated performance cost
-
-        Returns
-        -------
-        output: tensor
-            output of this layer
-        perf_cost : tensor
-            accumulated performance cost
-        """
-        candidate_ops = list(mutable)
-        soft_mask_thetas = self.probs_over_ops(temperature)
-        output = sum(m * op(x) for m, op in zip(soft_mask_thetas, candidate_ops))
-        layer_perf = sum(
-            m * lat for m, lat in zip(soft_mask_thetas, self.latency)
-        )
-        perf_cost = perf_cost + layer_perf
-
-        return output, perf_cost
-
-    def probs_over_ops(self, temperature):
-        """
-        Apply softmax on alpha to generate probability distribution
-
-        Returns
-        -------
-        pytorch tensor
-            probability distribution
-        """
-        probs = F.gumbel_softmax(self.path_alpha, temperature)
-        return probs
-
-    @property
-    def chosen_index(self):
-        """
-        choose the op with max prob
-
-        Returns
-        -------
-        int
-            index of the chosen one
-        numpy.float32
-            prob of the chosen one
-        """
-        alphas = self.path_alpha.data.detach().cpu().numpy()
-        index = int(np.argmax(alphas))
-        return index
-
-
-class FBNetMutator(BaseMutator):
-    """
-    This mutator initializes and operates all the LayerChoices of the input model.
-    It is for the corresponding trainer to control the training process of LayerChoices,
-    coordinating with whole training process.
-    """
-    def __init__(self, model, lookup_table):
-        """
-        Init a MixedOp instance for each mutable i.e., LayerChoice.
-        And register the instantiated MixedOp in corresponding LayerChoice.
-        If does not register it in LayerChoice, DataParallel does not work then,
-        because architecture weights are not included in the DataParallel model.
-        When MixedOPs are registered, we use ```requires_grad``` to control
-        whether calculate gradients of architecture weights.
-
-        Parameters
-        ----------
-        model : pytorch model
-            The model that users want to tune, it includes search space defined with nni nas apis
-        """
-        super(FBNetMutator, self).__init__(model)
-        self.mutable_list = []
-
-        # Collect the op names of the candidate ops within each mutable
-        ops_names_mutable = dict()
-        left = 0
-        right = 1
-        for stage_name in lookup_table.layer_num:
-            right = lookup_table.layer_num[stage_name]
-            stage_ops = lookup_table.lut_ops[stage_name]
-            ops_names = [op_name for op_name in stage_ops]
-
-            for i in range(left, left + right):
-                ops_names_mutable[i] = ops_names
-            left = right
-
-        # Create the mixed op
-        for i, mutable in enumerate(self.undedup_mutables):
-            ops_names = ops_names_mutable[i]
-            latency_mutable = lookup_table.lut_perf[i]
-            latency = [latency_mutable[op_name] for op_name in ops_names]
-            self.mutable_list.append(mutable)
-            mutable.registered_module = MixedOp(mutable, latency)
-
-    def on_forward_layer_choice(self, mutable, *args, **kwargs):
-        """
-        Callback of layer choice forward. This function defines the forward
-        logic of the input mutable. So mutable is only interface, its real
-        implementation is defined in mutator.
-
-        Parameters
-        ----------
-        mutable: LayerChoice
-            forward logic of this input mutable
-        args: list of torch.Tensor
-            inputs of this mutable
-        kwargs: dict
-            inputs of this mutable
-
-        Returns
-        -------
-        torch.Tensor
-            output of this mutable, i.e., LayerChoice
-        int
-            index of the chosen op
-        """
-        # FIXME: return mask, to be consistent with other algorithms
-        idx = mutable.registered_module.chosen_index
-        return mutable.registered_module(mutable, *args, **kwargs), idx
-
-    def num_arch_params(self):
-        """
-        The number of mutables, i.e., LayerChoice
-
-        Returns
-        -------
-        int
-            the number of LayerChoice in user model
-        """
-        return len(self.mutable_list)
-
-    def get_architecture_parameters(self):
-        """
-        Get all the architecture parameters.
-
-        yield
-        -----
-        PyTorch Parameter
-            Return ap_path_alpha of the traversed mutable
-        """
-        for mutable in self.undedup_mutables:
-            yield mutable.registered_module.get_path_alpha()
-
-    def arch_requires_grad(self):
-        """
-        Make architecture weights require gradient
-        """
-        for mutable in self.undedup_mutables:
-            mutable.registered_module.to_requires_grad()
-
-    def arch_disable_grad(self):
-        """
-        Disable gradient of architecture weights, i.e., does not
-        calcuate gradient for them.
-        """
-        for mutable in self.undedup_mutables:
-            mutable.registered_module.to_disable_grad()
-
-    def sample_final(self):
-        """
-        Generate the final chosen architecture.
-
-        Returns
-        -------
-        dict
-            the choice of each mutable, i.e., LayerChoice
-        """
-        result = dict()
-        for mutable in self.undedup_mutables:
-            assert isinstance(mutable, LayerChoice)
-            index = mutable.registered_module.chosen_index
-            # pylint: disable=not-callable
-            result[mutable.key] = F.one_hot(torch.tensor(index), num_classes=len(mutable)).view(-1).bool()
-        return result
diff --git a/nni/algorithms/nas/pytorch/fbnet/trainer.py b/nni/algorithms/nas/pytorch/fbnet/trainer.py
deleted file mode 100644
index 20880948f69..00000000000
--- a/nni/algorithms/nas/pytorch/fbnet/trainer.py
+++ /dev/null
@@ -1,402 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import time
-import json
-import os
-import torch
-
-import numpy as np
-
-from torch import nn as nn
-from torch.autograd import Variable
-from nni.nas.pytorch.base_trainer import BaseTrainer
-from nni.nas.pytorch.trainer import TorchTensorEncoder
-from nni.nas.pytorch.utils import AverageMeter
-from nni.algorithms.nas.pytorch.fbnet import FBNetMutator
-from .utils import accuracy
-
-
-class RegularizerLoss(nn.Module):
-    """Auxilliary loss for hardware-aware NAS."""
-
-    def __init__(self, config):
-        super(RegularizerLoss, self).__init__()
-        self.mode = config.mode
-        self.alpha = config.alpha
-        self.beta = config.beta
-
-    def forward(self, perf_cost, batch_size=1):
-        if self.mode == 'mul':
-            return self.alpha * torch.log((perf_cost / batch_size) ** self.beta)
-
-        elif self.mode == 'add':
-            return self.alpha * ((perf_cost / batch_size) ** self.beta)
-
-
-class FBNetTrainer(BaseTrainer):
-    def __init__(
-        self,
-        model,
-        auxiliarynet,
-        model_optim,
-        criterion,
-        device,
-        device_ids,
-        config,
-        lookup_table,
-        train_loader,
-        valid_loader,
-        n_epochs=300,
-        load_ckpt=False,
-        arch_path=None,
-        logger=None,
-    ):
-        """
-        Parameters
-        ----------
-        model : pytorch model
-            the user model, which has mutables
-        auxiliarynet : pytorch model
-            the auxiliarynet to regress angle
-        model_optim : pytorch optimizer
-            the user defined optimizer
-        criterion : pytorch loss
-            the main task loss
-        device : pytorch device
-            the devices to train/search the model
-        device_ids : list of int
-            the indexes of devices used for training
-        config : class
-            configuration object for fbnet training
-        lookup_table : class
-            lookup table object for fbnet training
-        train_loader : pytorch data loader
-            data loader for the training set
-        valid_loader : pytorch data loader
-            data loader for the validation set
-        n_epochs : int
-            number of epochs to train/search
-        load_ckpt : bool
-            whether load checkpoint
-        arch_path : str
-            the path to store chosen architecture
-        logger : logger
-            the logger
-        """
-        self.model = model
-        self.auxiliarynet = auxiliarynet
-        self.model_optim = model_optim
-        self.train_loader = train_loader
-        self.valid_loader = valid_loader
-        self.device = device
-        self.dev_num = len(device_ids)
-        self.n_epochs = n_epochs
-        self.config = config
-        self.lookup_table = lookup_table
-        self.arch_search = config.arch_search
-        self.start_epoch = config.start_epoch
-        self.temperature = config.init_temperature
-        self.exp_anneal_rate = config.exp_anneal_rate
-        self.mode = config.mode
-
-        self.load_ckpt = load_ckpt
-        self.arch_path = arch_path
-        self.logger = logger
-
-        # scheduler of learning rate
-        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
-            model_optim, T_max=n_epochs, last_epoch=-1
-        )
-
-        if self.arch_search:
-            # init mutator
-            self.mutator = FBNetMutator(model, lookup_table)
-
-        # DataParallel should be put behind the init of mutator
-        self.model = torch.nn.DataParallel(
-            self.model, device_ids=device_ids
-        ).to(device)
-        self.auxiliarynet = torch.nn.DataParallel(
-            self.auxiliarynet, device_ids=device_ids
-        ).to(device)
-
-        if self.arch_search:
-            # build architecture optimizer
-            self.arch_optimizer = torch.optim.AdamW(
-                self.mutator.get_architecture_parameters(),
-                config.nas_lr,
-                weight_decay=config.nas_weight_decay,
-            )
-            self.reg_loss = RegularizerLoss(config=config)
-
-        self.criterion = criterion
-        self.epoch = 0
-
-    def _layer_choice_sample(self):
-        """
-        sample the index of network within layer choice
-        """
-        stage_names = [stage_name for stage_name in self.lookup_table.layer_num]
-        stage_lnum = [
-            self.lookup_table.layer_num[stage_name] for stage_name in stage_names
-        ]
-
-        # get the choice idx in each layer
-        choice_ids = list()
-        layer_id = 0
-        for param in self.mutator.get_architecture_parameters():
-            param_np = param.detach().cpu().numpy()
-            op_idx = np.argmax(param_np)
-            choice_ids.append(op_idx)
-            self.logger.info("layer {}: {}, index: {}".format(layer_id, param_np, op_idx))
-            layer_id += 1
-
-        # get the arch_sample
-        choice_names = list()
-        layer_id = 0
-        for i, stage_name in enumerate(stage_names):
-            ops_names = [
-                op_name for op_name in self.lookup_table.lut_ops[stage_name]
-            ]
-            for j in range(stage_lnum[i]):
-                searched_op = ops_names[choice_ids[layer_id]]
-                choice_names.append(searched_op)
-                layer_id += 1
-
-        self.logger.info(choice_names)
-        return choice_names
-
-    def _validate(self):
-        """
-        Do validation. During validation, LayerChoices use the chosen active op.
-
-        Returns
-        -------
-        float, float, float
-            average loss, average top1 accuracy, average top5 accuracy
-        """
-
-        # test on validation set under eval mode
-        self.model.eval()
-        self.auxiliarynet.eval()
-
-        losses, nme = list(), list()
-        batch_time = AverageMeter('batch_time')
-        end = time.time()
-        with torch.no_grad():
-            for i, (img, landmark_gt, euler_angle_gt) in enumerate(self.valid_loader):
-                img = img.to(self.device, non_blocking=True)
-                landmark_gt = landmark_gt.to(self.device, non_blocking=True)
-                euler_angle_gt = euler_angle_gt.to(self.device, non_blocking=True)
-
-                if self.arch_search:
-                    perf_cost = Variable(
-                        torch.zeros(self.dev_num, 1)
-                    ).to(self.device, non_blocking=True)
-                    landmark, _, _ = self.model(img, self.temperature, perf_cost)
-
-                else:
-                    landmark, _ = self.model(img)
-
-                landmark = landmark.squeeze()
-                loss = torch.mean(torch.sum((landmark_gt - landmark) ** 2, axis=1))
-
-                landmark = landmark.cpu().numpy().reshape(landmark.shape[0], -1, 2)
-                landmark_gt = landmark_gt.cpu().numpy().reshape(landmark_gt.shape[0], -1, 2)
-                _, nme_i = accuracy(landmark, landmark_gt)
-                losses.append(loss.cpu().numpy())
-                for item in nme_i:
-                    nme.append(item)
-
-                # measure elapsed time
-                batch_time.update(time.time() - end)
-                end = time.time()
-
-        self.logger.info("===> Evaluate:")
-        self.logger.info("Eval set: Average loss: {:.4f} nme: {:.4f}".format(
-                np.mean(losses), np.mean(nme)
-            )
-        )
-        return np.mean(losses), np.mean(nme)
-
-    def _train_epoch(self, epoch, optimizer, data_loader, arch_train=False):
-        """
-        Train one epoch.
-        """
-        # switch to train mode
-        self.model.train()
-        self.auxiliarynet.train()
-
-        batch_time = AverageMeter('batch_time')
-        data_time = AverageMeter('data_time')
-        losses = AverageMeter('losses')
-
-        end = time.time()
-        for i, (img, landmark_gt, euler_angle_gt) in enumerate(data_loader):
-            data_time.update(time.time() - end)
-            img = img.to(self.device, non_blocking=True)
-            landmark_gt = landmark_gt.to(self.device, non_blocking=True)
-            euler_angle_gt = euler_angle_gt.to(self.device, non_blocking=True)
-
-            if self.arch_search:
-                perf_cost = Variable(
-                    torch.zeros(self.dev_num, 1), requires_grad=True
-                ).to(self.device, non_blocking=True)
-                landmarks, features, perf_cost = self.model(img, self.temperature, perf_cost)
-            else:
-                landmarks, features = self.model(img)
-            landmarks = landmarks.squeeze()
-            angle = self.auxiliarynet(features)
-
-            # task loss
-            weighted_loss, l2_loss = self.criterion(
-                landmark_gt, euler_angle_gt, angle, landmarks
-            )
-            loss = l2_loss if arch_train else weighted_loss
-
-            if self.arch_search:
-                # hardware-aware loss
-                regu_loss = self.reg_loss(perf_cost.mean(dim=0))
-                if self.mode == 'mul':
-                    loss = loss * regu_loss
-                elif self.mode == 'add':
-                    loss = loss + regu_loss
-
-            # compute gradient and do SGD step
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-            # measure elapsed time
-            batch_time.update(time.time() - end)
-            end = time.time()
-            # measure accuracy and record loss
-            losses.update(loss.item(), img.size(0))
-
-            if i % 10 == 0:
-                batch_log = 'Train [{0}][{1}]\t' \
-                            'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
-                            'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' \
-                            'Loss {losses.val:.4f} ({losses.avg:.4f})'. \
-                    format(epoch + 1, i, batch_time=batch_time, data_time=data_time,
-                           losses=losses)
-                self.logger.info(batch_log)
-
-    def _warm_up(self):
-        """
-        Warm up the model, during warm up, architecture weights are not trained.
-        """
-        for epoch in range(self.epoch, self.start_epoch):
-            self.logger.info('\n--------Warmup epoch: %d--------\n', epoch + 1)
-            self._train_epoch(epoch, self.model_optim, self.train_loader)
-            # adjust learning rate
-            self.scheduler.step()
-
-            # validation
-            _, _ = self._validate()
-            if epoch % 10 == 0:
-                filename = os.path.join(self.config.model_dir, "checkpoint_%s.pth" % epoch)
-                self.save_checkpoint(epoch, filename)
-
-    def _train(self):
-        """
-        Train the model, it trains model weights and architecute weights.
-        Architecture weights are trained according to the schedule.
-        Before updating architecture weights, ```requires_grad``` is enabled.
-        Then, it is disabled after the updating, in order not to update
-        architecture weights when training model weights.
-        """
-        if self.arch_search:
-            arch_param_num = self.mutator.num_arch_params()
-            self.logger.info('#arch_params: {}'.format(arch_param_num))
-            self.epoch = max(self.start_epoch, self.epoch)
-        val_nme = 1e6
-
-        for epoch in range(self.epoch, self.n_epochs):
-            self.logger.info('\n--------Train epoch: %d--------\n', epoch + 1)
-            # update the weight parameters
-            self._train_epoch(epoch, self.model_optim, self.train_loader)
-            # adjust learning rate
-            self.scheduler.step()
-
-            if self.arch_search:
-                self.logger.info("Update architecute parameters")
-                # update the architecture parameters
-                self._train_epoch(epoch, self.arch_optimizer, self.valid_loader, True)
-
-            # validate
-            _, nme = self._validate()
-
-            # temperature annealing
-            self.temperature = self.temperature * self.exp_anneal_rate
-            # sub-network sampling
-            choice_names = self._layer_choice_sample() if self.arch_search else None
-
-            if epoch % 10 == 0:
-                filename = os.path.join(self.config.model_dir, "checkpoint_%s.pth" % epoch)
-                self.save_checkpoint(epoch, filename, choice_names=choice_names)
-            if nme < val_nme:
-                filename = os.path.join(self.config.model_dir, "checkpoint_min_nme.pth")
-                self.save_checkpoint(epoch, filename, choice_names=choice_names)
-                val_nme = nme
-            self.logger.info("Best nme: {:.4f}".format(val_nme))
-
-    def save_checkpoint(self, epoch, filename, choice_names=None):
-        """
-        Save checkpoint of the whole model. Saving model weights and architecture weights in
-        ```ckpt_path```, and saving currently chosen architecture in ```arch_path```.
-        """
-        state = {
-            "pfld_backbone": self.model.state_dict(),
-            "auxiliarynet": self.auxiliarynet.state_dict(),
-            'optim': self.model_optim.state_dict(),
-            'epoch': epoch,
-            'arch_sample': choice_names,
-        }
-        torch.save(state, filename)
-        self.logger.info("Save checkpoint to {0:}".format(filename))
-
-        if self.arch_path:
-            self.export(self.arch_path)
-
-    def load_checkpoint(self, filename):
-        """
-        Load the checkpoint from ```ckpt_path```.
-        """
-        ckpt = torch.load(filename)
-        self.epoch = ckpt['epoch']
-        self.model.load_state_dict(ckpt['pfld_backbone'])
-        self.auxiliarynet.load_state_dict(ckpt['auxiliarynet'])
-        self.model_optim.load_state_dict(ckpt['optim'])
-
-    def train(self):
-        """
-        Train the whole model.
-        """
-        if self.load_ckpt:
-            filename = os.path.join(self.config.model_dir, "checkpoint_min_nme.pth")
-            if os.path.exists(filename):
-                self.load_checkpoint(filename)
-
-        if (self.epoch < self.start_epoch) and self.arch_search:
-            self._warm_up()
-        self._train()
-
-    def export(self, file_name):
-        """
-        Export the chosen architecture into a file
-
-        Parameters
-        ----------
-        file_name : str
-            the file that stores exported chosen architecture
-        """
-        exported_arch = self.mutator.sample_final()
-        with open(file_name, 'w') as f:
-            json.dump(exported_arch, f, indent=2, sort_keys=True, cls=TorchTensorEncoder)
-
-    def validate(self):
-        raise NotImplementedError
-
-    def checkpoint(self):
-        raise NotImplementedError
diff --git a/nni/algorithms/nas/pytorch/fbnet/utils.py b/nni/algorithms/nas/pytorch/fbnet/utils.py
deleted file mode 100644
index 80c0d0d5359..00000000000
--- a/nni/algorithms/nas/pytorch/fbnet/utils.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import numpy as np
-
-def accuracy(preds, target):
-    """preds/target:: numpy array, shape is (N, L, 2)
-    N: batchsize L: num of landmark
-    """
-    N = preds.shape[0]
-    L = preds.shape[1]
-    rmse = np.zeros(N).astype(np.float32)
-
-    for i in range(N):
-        pts_pred, pts_gt = (
-            preds[i],
-            target[i],
-        )
-        if L == 19:  # aflw
-            interocular = 34  # meta['box_size'][i]
-        elif L == 29:  # cofw
-            interocular = np.linalg.norm(pts_gt[8] - pts_gt[9])
-        elif L == 68:  # 300w
-            # interocular
-            interocular = np.linalg.norm(pts_gt[36] - pts_gt[45])
-        elif L == 98:
-            interocular = np.linalg.norm(pts_gt[60] - pts_gt[72])
-        elif L == 106:
-            # euclidean dis from left eye to right eye
-            interocular = np.linalg.norm(pts_gt[35] - pts_gt[93])
-        else:
-            raise ValueError("Number of landmarks is wrong")
-        rmse[i] = np.sum(np.linalg.norm(pts_pred - pts_gt, axis=1)) / (interocular * L)
-
-    return np.mean(rmse), rmse