[
  {
    "path": ".all-contributorsrc",
    "content": "{\n  \"files\": [\n    \"README.md\"\n  ],\n  \"imageSize\": 100,\n  \"commit\": false,\n  \"contributors\": [\n    {\n      \"login\": \"sjtuytc\",\n      \"name\": \"Zelin Zhao\",\n      \"avatar_url\": \"https://avatars.githubusercontent.com/u/31123348?v=4\",\n      \"profile\": \"https://sjtuytc.github.io/\",\n      \"contributions\": [\n        \"code\",\n        \"maintenance\"\n      ]\n    },\n    {\n      \"login\": \"SEUleaderYang\",\n      \"name\": \"EZ-Yang\",\n      \"avatar_url\": \"https://avatars.githubusercontent.com/u/55042050?v=4\",\n      \"profile\": \"https://github.com/SEUleaderYang\",\n      \"contributions\": [\n        \"code\"\n      ]\n    },\n    {\n      \"login\": \"Alex-Alison-Zhang\",\n      \"name\": \"Alex-Zhang\",\n      \"avatar_url\": \"https://avatars.githubusercontent.com/u/71915735?v=4\",\n      \"profile\": \"https://github.com/Alex-Alison-Zhang\",\n      \"contributions\": [\n        \"bug\"\n      ]\n    },\n    {\n      \"login\": \"FanLu97\",\n      \"name\": \"Fan Lu\",\n      \"avatar_url\": \"https://avatars.githubusercontent.com/u/45007531?v=4\",\n      \"profile\": \"https://fanlu97.github.io/\",\n      \"contributions\": [\n        \"bug\"\n      ]\n    },\n    {\n      \"login\": \"MaybeShewill-CV\",\n      \"name\": \"MaybeShewill-CV\",\n      \"avatar_url\": \"https://avatars.githubusercontent.com/u/15725187?v=4\",\n      \"profile\": \"https://maybeshewill-cv.github.io\",\n      \"contributions\": [\n        \"bug\"\n      ]\n    },\n    {\n      \"login\": \"buer1121\",\n      \"name\": \"buer1121\",\n      \"avatar_url\": \"https://avatars.githubusercontent.com/u/48516434?v=4\",\n      \"profile\": \"https://github.com/buer1121\",\n      \"contributions\": [\n        \"bug\"\n      ]\n    }\n  ],\n  \"contributorsPerLine\": 7,\n  \"projectName\": \"LargeScaleNeRFPytorch\",\n  \"projectOwner\": \"sjtuytc\",\n  \"repoType\": \"github\",\n  \"repoHost\": \"https://github.com\",\n  \"skipCi\": 
true,\n  \"commitConvention\": \"angular\"\n}\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\ndata/\ndata/*\nlogs/\nlogs\ndata\nlogs/*\nckpts/\nckpts/*\ndata\n*.png\n.vscode\n.vscode/*\n"
  },
  {
    "path": "FourierGrid/FourierGrid_ckpt_manager.py",
    "content": "from FourierGrid.FourierGrid_model import FourierGridModel\nfrom FourierGrid import utils, dvgo, dcvgo, dmpigo\nimport torch\nimport pdb\nimport os\nfrom tqdm import tqdm\nfrom FourierGrid.run_train import create_new_model\nimport torch.nn.functional as F\n\n\nclass FourierGridCheckpointManager:\n    def __init__(self, args, cfg) -> None:\n        super(FourierGridCheckpointManager, self).__init__()\n        self.args = args\n        self.cfg = cfg\n\n    def load_all_info(self, model, optimizer, ckpt_path, no_reload_optimizer):\n        ckpt = torch.load(ckpt_path)\n        start = ckpt['global_step']\n        model.load_state_dict(ckpt['model_state_dict'])\n        if not no_reload_optimizer:\n            optimizer.load_state_dict(ckpt['optimizer_state_dict'])\n        return model, optimizer, start\n\n    def load_existing_model(self, args, cfg, cfg_train, reload_ckpt_path, device):\n        # not used in training\n        FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\"]\n        if cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid':\n            model_class = FourierGridModel\n        elif cfg.data.ndc:\n            model_class = dmpigo.DirectMPIGO\n        elif cfg.data.unbounded_inward:\n            model_class = dcvgo.DirectContractedVoxGO\n        else:\n            model_class = dvgo.DirectVoxGO\n        model, _ = self.load_model(model_class, reload_ckpt_path)\n        model = model.to(device)\n        optimizer = utils.create_optimizer_or_freeze_model(model, cfg_train, global_step=0,\n                                                           verbose=False)\n        model, optimizer, start = self.load_all_info(\n                model, optimizer, reload_ckpt_path, args.no_reload_optimizer)\n        return model, optimizer, start\n\n    def save_model(self, global_step, model, optimizer, save_path):\n        torch.save({\n                'global_step': global_step,\n                'model_kwargs': 
model.get_kwargs(),\n                'model_state_dict': model.state_dict(),\n                'optimizer_state_dict': optimizer.state_dict(),\n            }, save_path)\n        print(f'Saved checkpoints at', save_path)\n\n    def load_model(self, model_class, ckpt_path):\n        ckpt = torch.load(ckpt_path)\n        model_args = ckpt['model_kwargs']\n        model = model_class(**model_args)\n        model.load_state_dict(ckpt['model_state_dict'])\n        return model, model_args\n    \n    @torch.no_grad()\n    def merge_blocks(self, args, cfg, device):\n        stage = 'fine'\n        exp_folder = os.path.join(cfg.basedir, cfg.expname)\n        paths = [os.path.join(exp_folder, f'{stage}_last_{block_id}.tar') for block_id in range(args.block_num)]\n        model_class = FourierGridModel\n        cp1 = paths[0]\n        m1, m1_args = self.load_model(model_class, cp1)\n        m1 = m1.to(device)\n        \n        merged_model, _ = create_new_model(args, cfg, cfg.fine_model_and_render, cfg.fine_train, \n                                           m1_args['xyz_min'], m1_args['xyz_max'], stage, None, device)\n        merged_model = merged_model.to(device)\n        merged_state_dict = m1.state_dict()\n        # merge the grids consequently\n        for idx, cur_cp in enumerate(paths[1:]):\n            print(f\"Meging grid {idx} / {len(paths[1:])}: {cur_cp} ...\")\n            cur_m, _ = self.load_model(model_class, cur_cp)\n            cur_m = cur_m.to(device)\n            for key in merged_state_dict:\n                print(f\"Merging model key: {key} ...\")\n                if key in ['density.grid', 'k0.grid'] or 'rgb' in key:\n                    g1, g2 = merged_state_dict[key], cur_m.state_dict()[key]\n                    merged_g = torch.min(g1, g2)\n                    # merged_g = torch.max(g1, g2)\n                    # del g1\n                    # del g2\n                    merged_state_dict[key] = merged_g\n                # else:\n                #     
merged_state_dict[key] = merged_model.state_dict()[key]\n            # del cur_m\n            torch.cuda.empty_cache()\n        if \"mask_cache.mask\" in merged_state_dict:\n            merged_state_dict.pop(\"mask_cache.mask\")\n        merged_model.load_state_dict(merged_state_dict, strict=False)\n        merged_model.update_occupancy_cache()\n        # merged_model.export_geometry_for_visualize(os.path.join(exp_folder, \"debug.npz\"))\n        return merged_model\n"
  },
  {
    "path": "FourierGrid/FourierGrid_grid.py",
    "content": "import os\nimport time\nimport functools\nimport numpy as np\nfrom einops import rearrange\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport pdb\nimport render_utils_cuda\nimport total_variation_cuda\n\n\ndef create_grid(type, **kwargs):\n    if type == 'DenseGrid':\n        return FourierGrid(**kwargs)\n    else:\n        raise NotImplementedError\n\n\nclass NeRFPosEmbedding(nn.Module):\n    def __init__(self, num_freqs: int, logscale=True):\n        \"\"\"\n        Defines a function that embeds x to (x, sin(2^k x), cos(2^k x), ...)\n        \"\"\"\n        super(NeRFPosEmbedding, self).__init__()\n        if logscale:\n            self.freq_bands = 2 ** torch.linspace(0, num_freqs - 1, num_freqs)\n        else:\n            self.freq_bands = torch.linspace(1, 2 ** (num_freqs - 1), num_freqs)\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        out = [x]\n        for freq in self.freq_bands:\n            out += [torch.sin(freq * x), torch.cos(freq * x)]\n        return torch.cat(out, -1)\n\n\n''' \nDense 3D grid\n'''\nclass FourierGrid(nn.Module):\n    def __init__(self, channels, world_size, xyz_min, xyz_max, use_nerf_pos, fourier_freq_num, config):\n        super(FourierGrid, self).__init__()\n        self.channels = channels\n        self.world_size = world_size\n        self.register_buffer('xyz_min', torch.Tensor(xyz_min))\n        self.register_buffer('xyz_max', torch.Tensor(xyz_max))\n        if use_nerf_pos:\n            self.nerf_pos_num_freq = fourier_freq_num\n            self.nerf_pos = NeRFPosEmbedding(num_freqs=self.nerf_pos_num_freq)\n            self.pos_embed_output_dim = 1 + self.nerf_pos_num_freq * 2\n            self.grid = nn.Parameter(torch.zeros([self.pos_embed_output_dim, channels, *world_size]))\n        else:\n            self.nerf_pos_num_freq = -1\n            self.pos_embed_output_dim = -1\n            self.nerf_pos = None\n            self.grid = nn.Parameter(torch.zeros([1, 
channels, *world_size]))\n    \n    def forward(self, xyz):\n        '''\n        xyz: global coordinates to query\n        '''\n        shape = xyz.shape[:-1]\n        xyz = xyz.reshape(1,1,1,-1,3)\n        ind_norm = ((xyz - self.xyz_min) / (self.xyz_max - self.xyz_min)).flip((-1,)) * 2 - 1\n        if self.nerf_pos is not None:\n            pos_embed = self.nerf_pos(ind_norm)\n            out = 0\n            batch_pos_emb = rearrange(pos_embed, '1 1 1 b (n d) -> n 1 1 b d', d=3)\n            batch_out = F.grid_sample(self.grid, batch_pos_emb, mode='bilinear', align_corners=True)\n            out = batch_out.mean(0).reshape(self.channels, -1).T.reshape(*shape, self.channels)\n        else:\n            out = F.grid_sample(self.grid, ind_norm, mode='bilinear', align_corners=True)\n            out = out.reshape(self.channels,-1).T.reshape(*shape, self.channels)\n        if self.channels == 1:\n            out = out.squeeze(-1)\n        return out\n\n    def scale_volume_grid(self, new_world_size):\n        if self.channels == 0:\n            self.grid = nn.Parameter(torch.zeros([1, self.channels, *new_world_size]))\n        else:\n            self.grid = nn.Parameter(\n                F.interpolate(self.grid.data, size=tuple(new_world_size), mode='trilinear', align_corners=True))\n\n    def total_variation_add_grad(self, wx, wy, wz, dense_mode):\n        '''Add gradients by total variation loss in-place'''\n        total_variation_cuda.total_variation_add_grad(\n            self.grid, self.grid.grad, wx, wy, wz, dense_mode)\n\n    def get_dense_grid(self):\n        return self.grid\n\n    @torch.no_grad()\n    def __isub__(self, val):\n        self.grid.data -= val\n        return self\n\n    def extra_repr(self):\n        return f'channels={self.channels}, world_size={self.world_size.tolist()}'\n\n\ndef compute_tensorf_feat(xy_plane, xz_plane, yz_plane, x_vec, y_vec, z_vec, f_vec, ind_norm):\n    # Interp feature (feat shape: [n_pts, n_comp])\n    xy_feat = 
F.grid_sample(xy_plane, ind_norm[:,:,:,[1,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    xz_feat = F.grid_sample(xz_plane, ind_norm[:,:,:,[2,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    yz_feat = F.grid_sample(yz_plane, ind_norm[:,:,:,[2,1]], mode='bilinear', align_corners=True).flatten(0,2).T\n    x_feat = F.grid_sample(x_vec, ind_norm[:,:,:,[3,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    y_feat = F.grid_sample(y_vec, ind_norm[:,:,:,[3,1]], mode='bilinear', align_corners=True).flatten(0,2).T\n    z_feat = F.grid_sample(z_vec, ind_norm[:,:,:,[3,2]], mode='bilinear', align_corners=True).flatten(0,2).T\n    # Aggregate components\n    feat = torch.cat([\n        xy_feat * z_feat,\n        xz_feat * y_feat,\n        yz_feat * x_feat,\n    ], dim=-1)\n    feat = torch.mm(feat, f_vec)\n    return feat\n\n\ndef compute_tensorf_val(xy_plane, xz_plane, yz_plane, x_vec, y_vec, z_vec, ind_norm):\n    # Interp feature (feat shape: [n_pts, n_comp])\n    xy_feat = F.grid_sample(xy_plane, ind_norm[:,:,:,[1,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    xz_feat = F.grid_sample(xz_plane, ind_norm[:,:,:,[2,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    yz_feat = F.grid_sample(yz_plane, ind_norm[:,:,:,[2,1]], mode='bilinear', align_corners=True).flatten(0,2).T\n    x_feat = F.grid_sample(x_vec, ind_norm[:,:,:,[3,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    y_feat = F.grid_sample(y_vec, ind_norm[:,:,:,[3,1]], mode='bilinear', align_corners=True).flatten(0,2).T\n    z_feat = F.grid_sample(z_vec, ind_norm[:,:,:,[3,2]], mode='bilinear', align_corners=True).flatten(0,2).T\n    # Aggregate components\n    feat = (xy_feat * z_feat).sum(-1) + (xz_feat * y_feat).sum(-1) + (yz_feat * x_feat).sum(-1)\n    return feat\n\n\n''' Mask grid\nIt supports query for the known free space and unknown space.\n'''\nclass MaskGrid(nn.Module):\n    def __init__(self, path=None, mask_cache_thres=None, mask=None, 
xyz_min=None, xyz_max=None):\n        super(MaskGrid, self).__init__()\n        if path is not None:\n            st = torch.load(path)\n            self.mask_cache_thres = mask_cache_thres\n            density = F.max_pool3d(st['model_state_dict']['density.grid'], kernel_size=3, padding=1, stride=1)\n            alpha = 1 - torch.exp(-F.softplus(density + st['model_state_dict']['act_shift']) * st['model_kwargs']['voxel_size_ratio'])\n            mask = (alpha >= self.mask_cache_thres).squeeze(0).squeeze(0)\n            xyz_min = torch.Tensor(st['model_kwargs']['xyz_min'])\n            xyz_max = torch.Tensor(st['model_kwargs']['xyz_max'])\n        else:\n            mask = mask.bool()\n            xyz_min = torch.Tensor(xyz_min)\n            xyz_max = torch.Tensor(xyz_max)\n\n        self.register_buffer('mask', mask)\n        xyz_len = xyz_max - xyz_min\n        self.register_buffer('xyz2ijk_scale', (torch.Tensor(list(mask.shape)) - 1) / xyz_len)\n        self.register_buffer('xyz2ijk_shift', -xyz_min * self.xyz2ijk_scale)\n\n    @torch.no_grad()\n    def forward(self, xyz):\n        '''Skip know freespace\n        @xyz:   [..., 3] the xyz in global coordinate.\n        '''\n        shape = xyz.shape[:-1]\n        xyz = xyz.reshape(-1, 3)\n        mask = render_utils_cuda.maskcache_lookup(self.mask, xyz, self.xyz2ijk_scale, self.xyz2ijk_shift)\n        mask = mask.reshape(shape)\n        return mask\n\n    def extra_repr(self):\n        return f'mask.shape=list(self.mask.shape)'\n\n"
  },
  {
    "path": "FourierGrid/FourierGrid_model.py",
    "content": "import os\nimport time\nimport functools\nimport numpy as np\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom torch_scatter import segment_coo\nfrom FourierGrid.grid import DenseGrid\n\nfrom . import FourierGrid_grid\nfrom .dvgo import Raw2Alpha, Alphas2Weights\nfrom .dmpigo import create_full_step_id\nfrom FourierGrid import utils, dvgo, dcvgo, dmpigo\nimport render_utils_cuda\nimport ub360_utils_cuda\n\n\ndef get_rays(H, W, K, c2w, inverse_y, flip_x, flip_y, mode='center'):\n    i, j = torch.meshgrid(\n        torch.linspace(0, W-1, W, device=c2w.device),\n        torch.linspace(0, H-1, H, device=c2w.device))  # pytorch's meshgrid has indexing='ij'\n    i = i.t().float()\n    j = j.t().float()\n    if mode == 'lefttop':\n        pass\n    elif mode == 'center':\n        i, j = i+0.5, j+0.5\n    elif mode == 'random':\n        i = i+torch.rand_like(i)\n        j = j+torch.rand_like(j)\n    else:\n        raise NotImplementedError\n\n    if flip_x:\n        i = i.flip((1,))\n    if flip_y:\n        j = j.flip((0,))\n    if inverse_y:\n        dirs = torch.stack([(i-K[0][2])/K[0][0], (j-K[1][2])/K[1][1], torch.ones_like(i)], -1)\n    else:\n        dirs = torch.stack([(i-K[0][2])/K[0][0], -(j-K[1][2])/K[1][1], -torch.ones_like(i)], -1)\n    # Rotate ray directions from camera frame to the world frame\n    rays_d = torch.sum(dirs[..., np.newaxis, :] * c2w[:3,:3], -1)  # dot product, equals to: [c2w.dot(dir) for dir in dirs]\n    # Translate camera frame's origin to the world frame. 
It is the origin of all rays.\n    rays_o = c2w[:3,3].expand(rays_d.shape)\n    return rays_o, rays_d\n\n\ndef ndc_rays(H, W, focal, near, rays_o, rays_d):\n    # Shift ray origins to near plane\n    t = -(near + rays_o[...,2]) / rays_d[...,2]\n    rays_o = rays_o + t[...,None] * rays_d\n\n    # Projection\n    o0 = -1./(W/(2.*focal)) * rays_o[...,0] / rays_o[...,2]\n    o1 = -1./(H/(2.*focal)) * rays_o[...,1] / rays_o[...,2]\n    o2 = 1. + 2. * near / rays_o[...,2]\n\n    d0 = -1./(W/(2.*focal)) * (rays_d[...,0]/rays_d[...,2] - rays_o[...,0]/rays_o[...,2])\n    d1 = -1./(H/(2.*focal)) * (rays_d[...,1]/rays_d[...,2] - rays_o[...,1]/rays_o[...,2])\n    d2 = -2. * near / rays_o[...,2]\n\n    rays_o = torch.stack([o0,o1,o2], -1)\n    rays_d = torch.stack([d0,d1,d2], -1)\n\n    return rays_o, rays_d\n\n\ndef get_rays_of_a_view(H, W, K, c2w, ndc, inverse_y, flip_x, flip_y, mode='center'):\n    rays_o, rays_d = get_rays(H, W, K, c2w, inverse_y=inverse_y, flip_x=flip_x, flip_y=flip_y, mode=mode)\n    viewdirs = rays_d / rays_d.norm(dim=-1, keepdim=True)\n    if ndc:\n        rays_o, rays_d = ndc_rays(H, W, K[0][0], 1., rays_o, rays_d)\n    return rays_o, rays_d, viewdirs\n\n\nclass NeRFPosEmbedding(nn.Module):\n    def __init__(self, num_freqs: int, logscale=True):\n        \"\"\"\n        Defines a function that embeds x to (x, sin(2^k x), cos(2^k x), ...)\n        \"\"\"\n        super(NeRFPosEmbedding, self).__init__()\n\n        if logscale:\n            self.freq_bands = 2 ** torch.linspace(0, num_freqs - 1, num_freqs)\n        else:\n            self.freq_bands = torch.linspace(1, 2 ** (num_freqs - 1), num_freqs)\n\n    def forward(self, x: torch.Tensor) -> torch.Tensor:\n        out = [x]\n        for freq in self.freq_bands:\n            out += [torch.sin(freq * x), torch.cos(freq * x)]\n\n        return torch.cat(out, -1)\n\n\nclass FourierMSELoss(nn.Module):\n    def __init__(self,):\n        super(FourierMSELoss, self).__init__()\n\n    def forward(self, pred, 
gt):\n        fft_dim = -1\n        pred_fft = torch.fft.fft(pred, dim=fft_dim)\n        gt_fft = torch.fft.fft(gt, dim=fft_dim)\n        pred_real, pred_imag = pred_fft.real, pred_fft.imag\n        gt_real, gt_imag = gt_fft.real, gt_fft.imag\n        real_loss = F.mse_loss(pred_real, gt_real)\n        return real_loss\n\n\nclass FourierMSELoss(nn.Module):\n    def __init__(self, num_freqs=7, logscale=True):\n        super(FourierMSELoss, self).__init__()\n        # self.nerf_pos = NeRFPosEmbedding(num_freqs=num_freqs, logscale=logscale)\n\n    def forward(self, pred, gt):\n        # pred_embed = self.nerf_pos(pred)\n        # gt_embed = self.nerf_pos(gt)\n        # return F.mse_loss(pred_embed, gt_embed)\n        fft_dim = -1\n        pred_fft = torch.fft.fft(pred, dim=fft_dim)\n        gt_fft = torch.fft.fft(gt, dim=fft_dim)\n        pred_real, pred_imag = pred_fft.real, pred_fft.imag\n        gt_real, gt_imag = gt_fft.real, gt_fft.imag\n        real_loss = F.mse_loss(pred_real, gt_real)\n        # imag_loss = F.mse_loss(pred_imag, gt_imag)\n        return real_loss\n\n\n'''Model'''\nclass FourierGridModel(nn.Module):\n    def __init__(self, xyz_min, xyz_max, num_voxels_density=0, num_voxels_base_density=0, num_voxels_rgb=0,\n                 num_voxels_base_rgb=0, num_voxels_viewdir=0, alpha_init=None, mask_cache_world_size=None, fast_color_thres=0, \n                 bg_len=0.2, contracted_norm='inf', density_type='DenseGrid', k0_type='DenseGrid', density_config={}, k0_config={},\n                 rgbnet_dim=0, rgbnet_depth=3, rgbnet_width=128, fourier_freq_num=5, viewbase_pe=4, img_emb_dim=-1, verbose=False,\n                 **kwargs):\n        super(FourierGridModel, self).__init__()\n        xyz_min = torch.Tensor(xyz_min)\n        xyz_max = torch.Tensor(xyz_max)\n        assert len(((xyz_max - xyz_min) * 100000).long().unique()), 'scene bbox must be a cube in DirectContractedVoxGO'\n        self.register_buffer('scene_center', (xyz_min + xyz_max) * 0.5)\n  
      self.register_buffer('scene_radius', (xyz_max - xyz_min) * 0.5)\n        # xyz_min/max are the boundary that separates fg and bg scene in NDC.\n        self.register_buffer('xyz_min', torch.Tensor([-1,-1,-1]) - bg_len)\n        self.register_buffer('xyz_max', torch.Tensor([1,1,1]) + bg_len)\n        if isinstance(fast_color_thres, dict):\n            self._fast_color_thres = fast_color_thres\n            self.fast_color_thres = fast_color_thres[0]\n        else:\n            self._fast_color_thres = None\n            self.fast_color_thres = fast_color_thres\n        self.bg_len = bg_len\n        self.contracted_norm = contracted_norm\n        self.verbose = verbose\n\n        # determine based grid resolution\n        self.fourier_freq_num = fourier_freq_num\n        self.num_voxels_base_density = num_voxels_base_density\n        self.voxel_size_base_density = ((self.xyz_max - self.xyz_min).prod() / self.num_voxels_base_density).pow(1/3)\n        self.num_voxels_base_rgb = num_voxels_base_rgb\n        self.voxel_size_base_rgb = ((self.xyz_max - self.xyz_min).prod() / self.num_voxels_base_rgb).pow(1/3)\n        self.num_voxels_viewdir = num_voxels_viewdir\n        self.voxel_size_viewdir = ((torch.Tensor([1,1,1]) - torch.Tensor([-1,-1,-1])).prod() / self.num_voxels_viewdir).pow(1/3)\n\n        # determine init grid resolution\n        self._set_grid_resolution(num_voxels_density, num_voxels_rgb)\n\n        # determine the density bias shift\n        self.alpha_init = alpha_init\n        self.register_buffer('act_shift', torch.FloatTensor([np.log(1/(1-alpha_init) - 1)]))\n        if self.verbose:\n            print('FourierGrid: set density bias shift to', self.act_shift)\n\n        # init density voxel grid\n        self.density_type = density_type\n        self.density_config = density_config\n        self.world_size = self.world_size_density\n        self.density = FourierGrid_grid.create_grid(\n            density_type, channels=1, 
world_size=self.world_size_density,\n            xyz_min=self.xyz_min, xyz_max=self.xyz_max, use_nerf_pos=True,\n            fourier_freq_num=self.fourier_freq_num, config=self.density_config)\n        \n        # init color representation\n        self.rgbnet_kwargs = {\n            'rgbnet_dim': rgbnet_dim,\n            'rgbnet_depth': rgbnet_depth, \n            'rgbnet_width': rgbnet_width,\n            'viewbase_pe': viewbase_pe,\n        }\n        self.k0_type = k0_type\n        self.k0_config = k0_config\n        \n        self.img_embed_dim = img_emb_dim\n        if 'sample_num' not in kwargs:\n            self.sample_num = -1\n        else:\n            self.sample_num = kwargs['sample_num']\n\n        if img_emb_dim > 0 and self.sample_num > 0:    # use apperance embeddings\n            self.img_embeddings = nn.Embedding(num_embeddings=self.sample_num, \n                                        embedding_dim=self.img_embed_dim)\n        else:\n            self.img_embeddings = None\n            self.img_embed_dim = 0\n\n        pos_emb = False\n        if pos_emb and self.sample_num > 0:    # use apperance embeddings\n            self.pos_emb = torch.zeros((self.sample_num, 3), requires_grad=True)\n        else:\n            self.pos_emb = None\n\n        # rgbnet configurations\n        if rgbnet_dim <= 0:\n            # color voxel grid  (coarse stage)\n            self.k0_dim = 3\n            self.k0 = FourierGrid_grid.create_grid(\n                k0_type, channels=self.k0_dim, world_size=self.world_size_rgb,\n                xyz_min=self.xyz_min, xyz_max=self.xyz_max,  use_nerf_pos=False,\n                fourier_freq_num=self.fourier_freq_num, config=self.k0_config)\n            self.rgbnet = None\n        else:\n            # feature voxel grid + shallow MLP  (fine stage)\n            self.k0_dim = rgbnet_dim\n            self.k0 = FourierGrid_grid.create_grid(\n                k0_type, channels=self.k0_dim, world_size=self.world_size_rgb,\n        
        xyz_min=self.xyz_min, xyz_max=self.xyz_max, use_nerf_pos=True, \n                fourier_freq_num=self.fourier_freq_num, config=self.k0_config)\n            self.register_buffer('viewfreq', torch.FloatTensor([(2**i) for i in range(viewbase_pe)]))\n            dim0 = (3+3*viewbase_pe*2)  # view freq dim\n            dim0 += self.k0_dim\n            self.rgbnet = nn.Sequential(\n                nn.Linear(dim0, rgbnet_width), nn.ReLU(inplace=True),\n                *[\n                    nn.Sequential(nn.Linear(rgbnet_width, rgbnet_width), nn.ReLU(inplace=True))\n                    for _ in range(rgbnet_depth-2)\n                ],\n                nn.Linear(rgbnet_width, 3),\n            )\n            nn.init.constant_(self.rgbnet[-1].bias, 0)\n            if self.verbose:\n                print('FourierGrid: feature voxel grid', self.k0)\n                print('FourierGrid: mlp', self.rgbnet)\n        \n        use_view_grid = num_voxels_viewdir > 0\n        if use_view_grid:\n            self.vd = FourierGrid_grid.create_grid(k0_type, channels=3, world_size=self.world_size_viewdir,\n                                            xyz_min=torch.Tensor([-1, -1, -1]), xyz_max=torch.Tensor([1, 1, 1]),\n                                            fourier_freq_num=self.fourier_freq_num, use_nerf_pos=False,)\n        else:\n            self.vd = None\n        # Using the coarse geometry if provided (used to determine known free space and unknown space)\n        # Re-implement as occupancy grid\n        if mask_cache_world_size is None:\n            mask_cache_world_size = self.world_size_density\n        mask = torch.ones(list(mask_cache_world_size), dtype=torch.bool)\n        self.mask_cache = FourierGrid_grid.MaskGrid(\n            path=None, mask=mask,\n            xyz_min=self.xyz_min, xyz_max=self.xyz_max)\n\n    @torch.no_grad()\n    def FourierGrid_get_training_rays(self, rgb_tr_ori, train_poses, HW, Ks, ndc, inverse_y, flip_x, flip_y):\n        if 
self.pos_emb is not None:\n            train_poses[:, :3, 3] = train_poses[:, :3, 3] + self.pos_emb\n        assert len(rgb_tr_ori) == len(train_poses) and len(rgb_tr_ori) == len(Ks) and len(rgb_tr_ori) == len(HW)\n        eps_time = time.time()\n        DEVICE = rgb_tr_ori[0].device\n        N = sum(im.shape[0] * im.shape[1] for im in rgb_tr_ori)\n        rgb_tr = torch.zeros([N,3], device=DEVICE)\n        rays_o_tr = torch.zeros_like(rgb_tr)\n        rays_d_tr = torch.zeros_like(rgb_tr)\n        viewdirs_tr = torch.zeros_like(rgb_tr)\n        indexs_tr = torch.zeros_like(rgb_tr)  # image indexs\n        imsz = []\n        top = 0\n        cur_idx = 0\n        for c2w, img, (H, W), K in zip(train_poses, rgb_tr_ori, HW, Ks):\n            assert img.shape[:2] == (H, W)\n            rays_o, rays_d, viewdirs = get_rays_of_a_view(\n                    H=H, W=W, K=K, c2w=c2w, ndc=ndc,\n                    inverse_y=inverse_y, flip_x=flip_x, flip_y=flip_y)\n            n = H * W\n            rgb_tr[top:top+n].copy_(img.flatten(0,1))\n            rays_o_tr[top:top+n].copy_(rays_o.flatten(0,1).to(DEVICE))\n            rays_d_tr[top:top+n].copy_(rays_d.flatten(0,1).to(DEVICE))\n            viewdirs_tr[top:top+n].copy_(viewdirs.flatten(0,1).to(DEVICE))\n            indexs_tr[top:top+n].copy_(torch.tensor(cur_idx).long().to(DEVICE))\n            cur_idx += 1\n            imsz.append(n)\n            top += n\n        assert top == N\n        eps_time = time.time() - eps_time\n        return rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, indexs_tr, imsz\n\n    def gather_training_rays(self, data_dict, images, cfg, i_train, cfg_train, poses, HW, Ks, render_kwargs):\n        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n        if data_dict['irregular_shape']:\n            rgb_tr_ori = [images[i].to('cpu' if cfg.data.load2gpu_on_the_fly else device) for i in i_train]\n        else:\n            rgb_tr_ori = images[i_train].to('cpu' if 
cfg.data.load2gpu_on_the_fly else device)\n\n        indexs_train = None\n        FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\"]\n        if cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid':\n            rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, indexs_train, imsz = self.FourierGrid_get_training_rays(\n            rgb_tr_ori=rgb_tr_ori, train_poses=poses[i_train], HW=HW[i_train], Ks=Ks[i_train], \n            ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n            flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y, )\n        elif cfg_train.ray_sampler == 'in_maskcache':\n            rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, imsz = dvgo.get_training_rays_in_maskcache_sampling(\n                    rgb_tr_ori=rgb_tr_ori,\n                    train_poses=poses[i_train],\n                    HW=HW[i_train], Ks=Ks[i_train],\n                    ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                    flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y,\n                    model=self, render_kwargs=render_kwargs)\n        elif cfg_train.ray_sampler == 'flatten':\n            rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, imsz = dvgo.get_training_rays_flatten(\n                rgb_tr_ori=rgb_tr_ori,\n                train_poses=poses[i_train],\n                HW=HW[i_train], Ks=Ks[i_train], ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y)\n        else:\n            rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, imsz = dvgo.get_training_rays(\n                rgb_tr=rgb_tr_ori,\n                train_poses=poses[i_train],\n                HW=HW[i_train], Ks=Ks[i_train], ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y)\n        index_generator = dvgo.batch_indices_generator(len(rgb_tr), cfg_train.N_rand)\n        batch_index_sampler = lambda: next(index_generator)\n        return rgb_tr, rays_o_tr, 
rays_d_tr, viewdirs_tr, indexs_train, imsz, batch_index_sampler\n\n    def _set_grid_resolution(self, num_voxels_density, num_voxels_rgb):\n        # Determine grid resolution\n        self.num_voxels_density = num_voxels_density\n        self.num_voxels_rgb = num_voxels_rgb\n        self.voxel_size_density = ((self.xyz_max - self.xyz_min).prod() / self.num_voxels_density).pow(1/3)\n        self.voxel_size_rgb = ((self.xyz_max - self.xyz_min).prod() / self.num_voxels_rgb).pow(1/3)\n        self.voxel_size_viewdir = ((torch.Tensor([1,1,1]) - torch.Tensor([-1,-1,-1])).prod() / self.num_voxels_viewdir).pow(1/3)\n        self.world_size_density = ((self.xyz_max - self.xyz_min) / self.voxel_size_density).long()\n        self.world_size_rgb = ((self.xyz_max - self.xyz_min) / self.voxel_size_rgb).long()\n        self.world_size_viewdir = (torch.Tensor([1,1,1]) - torch.Tensor([-1,-1,-1]) / self.voxel_size_viewdir).long()\n        self.world_len_density = self.world_size_density[0].item()\n        self.world_len_rgb = self.world_size_rgb[0].item()\n        self.world_len_viewdir = self.world_size_viewdir[0].item()\n        self.voxel_size_ratio_density = self.voxel_size_density / self.voxel_size_base_density\n        self.voxel_size_ratio_rgb = self.voxel_size_rgb / self.voxel_size_base_rgb\n\n    def get_kwargs(self):\n        return {\n            'xyz_min': self.xyz_min.cpu().numpy(),\n            'xyz_max': self.xyz_max.cpu().numpy(),\n            'num_voxels_density': self.num_voxels_density,\n            'num_voxels_rgb': self.num_voxels_rgb,\n            'num_voxels_viewdir': self.num_voxels_viewdir,\n            'fourier_freq_num': self.fourier_freq_num,\n            'num_voxels_base_density': self.num_voxels_base_density,\n            'num_voxels_base_rgb': self.num_voxels_base_rgb,\n            'alpha_init': self.alpha_init,\n            'voxel_size_ratio_density': self.voxel_size_ratio_density,\n            'voxel_size_ratio_rgb': self.voxel_size_ratio_rgb,\n     
       'mask_cache_world_size': list(self.mask_cache.mask.shape),\n            'fast_color_thres': self.fast_color_thres,\n            'contracted_norm': self.contracted_norm,\n            'density_type': self.density_type,\n            'k0_type': self.k0_type,\n            'density_config': self.density_config,\n            'k0_config': self.k0_config,\n            'sample_num': self.sample_num, \n            **self.rgbnet_kwargs,\n        }\n\n    @torch.no_grad()\n    def maskout_near_cam_vox(self, cam_o, near_clip):\n        ind_norm = ((cam_o - self.xyz_min) / (self.xyz_max - self.xyz_min)).flip((-1,)) * 2 - 1\n        pos_embed = self.density.nerf_pos(ind_norm).squeeze()\n        # maskout grid points that between cameras and their near planes\n        self_grid_xyz = torch.stack(torch.meshgrid(\n            torch.linspace(-1, 1, self.world_size_density[0]),\n            torch.linspace(-1, 1, self.world_size_density[1]),\n            torch.linspace(-1, 1, self.world_size_density[2]),\n        ), -1)\n        for i in range(self.density.pos_embed_output_dim):\n            cur_pos_embed = pos_embed[:, 3*i:3*(i+1)].unsqueeze(0).unsqueeze(0).unsqueeze(0)\n            nearest_dist = torch.stack([(self_grid_xyz.unsqueeze(-2) - co).pow(2).sum(-1).sqrt().amin(-1) for co in cur_pos_embed.split(10)]).amin(0)\n            self.density.grid[0][i][nearest_dist <= near_clip] = -100\n        \n    def voxel_count_views(self, rays_o_tr, rays_d_tr, imsz, near, far, stepsize, downrate=1, irregular_shape=False):\n        print('FourierGrid: voxel_count_views start')\n        far = 1e9  # the given far can be too small while rays stop when hitting scene bbox\n        eps_time = time.time()\n        N_samples = int(np.linalg.norm(np.array(self.world_size_density.cpu())+1) / stepsize) + 1\n        rng = torch.arange(N_samples)[None].float()\n        count = torch.zeros_like(self.density.get_dense_grid())\n        device = rng.device\n        for rays_o_, rays_d_ in 
zip(rays_o_tr.split(imsz), rays_d_tr.split(imsz)):\n            ones = DenseGrid(1, self.world_size_density, self.xyz_min, self.xyz_max)\n            if irregular_shape:\n                rays_o_ = rays_o_.split(10000)\n                rays_d_ = rays_d_.split(10000)\n            else:\n                rays_o_ = rays_o_[::downrate, ::downrate].to(device).flatten(0,-2).split(10000)\n                rays_d_ = rays_d_[::downrate, ::downrate].to(device).flatten(0,-2).split(10000)\n            for rays_o, rays_d in zip(rays_o_, rays_d_):\n                vec = torch.where(rays_d==0, torch.full_like(rays_d, 1e-6), rays_d)\n                rate_a = (self.xyz_max - rays_o) / vec\n                rate_b = (self.xyz_min - rays_o) / vec\n                t_min = torch.minimum(rate_a, rate_b).amax(-1).clamp(min=near, max=far)\n                t_max = torch.maximum(rate_a, rate_b).amin(-1).clamp(min=near, max=far)\n                step = stepsize * self.voxel_size_density * rng\n                interpx = (t_min[...,None] + step/rays_d.norm(dim=-1,keepdim=True))\n                rays_pts = rays_o[...,None,:] + rays_d[...,None,:] * interpx[...,None]\n                ones(rays_pts).sum().backward()\n            with torch.no_grad():\n                count += (ones.grid.grad > 1)\n        eps_time = time.time() - eps_time\n        print('FourierGrid: voxel_count_views finish (eps time:', eps_time, 'sec)')\n        return count\n    \n    @torch.no_grad()\n    def scale_volume_grid(self, num_voxels_density, num_voxels_rgb):\n        if self.verbose:\n            print('FourierGrid: scale_volume_grid start')\n        self._set_grid_resolution(num_voxels_density, num_voxels_rgb)\n        self.density.scale_volume_grid(self.world_size_density)\n        self.k0.scale_volume_grid(self.world_size_rgb)\n\n        if np.prod(self.world_size_density.tolist()) <= 256**3:\n            self_grid_xyz = torch.stack(torch.meshgrid(\n                torch.linspace(self.xyz_min[0], self.xyz_max[0], 
self.world_size_density[0]),\n                torch.linspace(self.xyz_min[1], self.xyz_max[1], self.world_size_density[1]),\n                torch.linspace(self.xyz_min[2], self.xyz_max[2], self.world_size_density[2]),\n            ), -1)\n            self_alpha = F.max_pool3d(self.activate_density(self.density.get_dense_grid()), kernel_size=3, padding=1, stride=1)[0,0]\n            self.mask_cache = FourierGrid_grid.MaskGrid(\n                path=None, mask=self.mask_cache(self_grid_xyz) & (self_alpha>self.fast_color_thres),\n                xyz_min=self.xyz_min, xyz_max=self.xyz_max)\n        print('FourierGrid: scale_volume_grid finish')\n\n    @torch.no_grad()\n    def update_occupancy_cache(self):\n        ori_p = self.mask_cache.mask.float().mean().item()\n        cache_grid_xyz = torch.stack(torch.meshgrid(\n            torch.linspace(self.xyz_min[0], self.xyz_max[0], self.mask_cache.mask.shape[0]),\n            torch.linspace(self.xyz_min[1], self.xyz_max[1], self.mask_cache.mask.shape[1]),\n            torch.linspace(self.xyz_min[2], self.xyz_max[2], self.mask_cache.mask.shape[2]),\n        ), -1)\n        cache_grid_density = self.density(cache_grid_xyz)[None,None]\n        cache_grid_alpha = self.activate_density(cache_grid_density)\n        cache_grid_alpha = F.max_pool3d(cache_grid_alpha, kernel_size=3, padding=1, stride=1)[0,0]\n        self.mask_cache.mask &= (cache_grid_alpha > self.fast_color_thres)\n        new_p = self.mask_cache.mask.float().mean().item()\n        if self.verbose:\n            print(f'FourierGrid: update mask_cache {ori_p:.4f} => {new_p:.4f}')\n\n    def update_occupancy_cache_lt_nviews(self, rays_o_tr, rays_d_tr, imsz, render_kwargs, maskout_lt_nviews):\n        # TODO: Check or remove this function. 
This is untested and unused for now.\n        if self.verbose:\n            print('FourierGrid: update mask_cache lt_nviews start')\n        eps_time = time.time()\n        count = torch.zeros_like(self.density.get_dense_grid()).long()\n        device = count.device\n        for rays_o_, rays_d_ in zip(rays_o_tr.split(imsz), rays_d_tr.split(imsz)):\n            ones = FourierGrid_grid.FourierGrid(1, self.world_size_density, self.xyz_min, self.xyz_max)\n            for rays_o, rays_d in zip(rays_o_.split(8192), rays_d_.split(8192)):\n                ray_pts, indexs, inner_mask, t, rays_d_e = self.sample_ray(\n                        ori_rays_o=rays_o.to(device), ori_rays_d=rays_d.to(device),\n                        **render_kwargs)\n                ones(ray_pts).sum().backward()\n            count.data += (ones.grid.grad > 1)\n        ori_p = self.mask_cache.mask.float().mean().item()\n        self.mask_cache.mask &= (count >= maskout_lt_nviews)[0,0]\n        new_p = self.mask_cache.mask.float().mean().item()\n        if self.verbose:\n            print(f'FourierGrid: update mask_cache {ori_p:.4f} => {new_p:.4f}')\n        eps_time = time.time() - eps_time\n        if self.verbose:\n            print(f'FourierGrid: update mask_cache lt_nviews finish (eps time:', eps_time, 'sec)')\n\n    def density_total_variation_add_grad(self, weight, dense_mode):\n        w = weight * self.world_size_density.max() / 128\n        self.density.total_variation_add_grad(w, w, w, dense_mode)\n\n    def k0_total_variation_add_grad(self, weight, dense_mode):\n        w = weight * self.world_size_rgb.max() / 128\n        self.k0.total_variation_add_grad(w, w, w, dense_mode)\n\n    def activate_density(self, density, interval=None):\n        interval = interval if interval is not None else self.voxel_size_ratio_density\n        shape = density.shape\n        return Raw2Alpha.apply(density.flatten(), self.act_shift, interval).reshape(shape)\n\n    def hit_coarse_geo(self, rays_o, rays_d, 
near, far, stepsize, **render_kwargs):\n        '''Check whether the rays hit the solved coarse geometry or not'''\n        far = 1e9  # the given far can be too small while rays stop when hitting scene bbox\n        shape = rays_o.shape[:-1]\n        rays_o = rays_o.reshape(-1, 3).contiguous()\n        rays_d = rays_d.reshape(-1, 3).contiguous()\n        stepdist = stepsize * self.voxel_size_density\n        ray_pts, mask_outbbox, ray_id = render_utils_cuda.sample_pts_on_rays(\n                rays_o, rays_d, self.xyz_min, self.xyz_max, near, far, stepdist)[:3]\n        mask_inbbox = ~mask_outbbox\n        hit = torch.zeros([len(rays_o)], dtype=torch.bool)\n        hit[ray_id[mask_inbbox][self.mask_cache(ray_pts[mask_inbbox])]] = 1\n        return hit.reshape(shape)\n    \n    def sample_ray(self, ori_rays_o, ori_rays_d, stepsize, is_train=False, **render_kwargs):\n        '''Sample query points on rays: central sampling.\n        Ori_rays_o needs to be properly scaled!\n        All the output points are sorted from near to far.\n        Input:\n            rays_o, rayd_d:   both in [N, 3] indicating ray configurations.\n            stepsize:         the number of voxels of each sample step.\n        Output:\n            ray_pts:          [M, 3] storing all the sampled points.\n            ray_id:           [M]    the index of the ray of each point.\n            step_id:          [M]    the i'th step on a ray of each point.\n        '''\n        # NDC coordinates\n        rays_o = (ori_rays_o - self.scene_center) / self.scene_radius  \n        rays_d = ori_rays_d / ori_rays_d.norm(dim=-1, keepdim=True)\n        N_inner = int(2 / (2+2*self.bg_len) * self.world_len_density / stepsize) + 1\n        N_outer = N_inner\n        t_boundary = 1.5 # default t_boundary=2, waymo=1.5\n        b_inner = torch.linspace(0, t_boundary, N_inner+1)\n        b_outer = t_boundary / torch.linspace(1, 1/128, N_outer+1)\n        t = torch.cat([\n            (b_inner[1:] + b_inner[:-1]) 
* 0.5,\n            (b_outer[1:] + b_outer[:-1]) * 0.5,\n        ])\n        ray_pts = rays_o[:,None,:] + rays_d[:,None,:] * t[None,:,None]\n        if self.contracted_norm == 'inf':\n            norm = ray_pts.abs().amax(dim=-1, keepdim=True)\n        elif self.contracted_norm == 'l2':\n            norm = ray_pts.norm(dim=-1, keepdim=True)\n        else:\n            raise NotImplementedError\n        seperate_boundary = 1.0\n        B = 1 + self.bg_len\n        order = 1  # default order = 1\n        A = B * (seperate_boundary**order) - seperate_boundary ** (order + 1)\n        ray_pts = torch.where(\n            norm<=seperate_boundary,\n            ray_pts,\n            ray_pts / norm * (B - A/ (norm ** order))\n        )\n        indexs = None\n        rays_d_extend = None\n        inner_mask = norm<=seperate_boundary  # this variable is not important\n        return ray_pts, indexs, inner_mask.squeeze(-1), t, rays_d_extend\n\n    def forward(self, rays_o, rays_d, viewdirs, global_step=None, is_train=False, **render_kwargs):\n        '''Volume rendering\n        @rays_o:   [N, 3] the starting point of the N shooting rays.\n        @rays_d:   [N, 3] the shooting direction of the N rays.\n        @viewdirs: [N, 3] viewing direction to compute positional embedding for MLP.\n        '''\n        assert len(rays_o.shape)==2 and rays_o.shape[-1]==3, 'Only support point queries in [N, 3] format'\n        if isinstance(self._fast_color_thres, dict) and global_step in self._fast_color_thres:\n            if self.verbose:\n                print(f'FourierGrid: update fast_color_thres {self.fast_color_thres} => {self._fast_color_thres[global_step]}')\n            self.fast_color_thres = self._fast_color_thres[global_step]\n\n        ret_dict = {}\n        num_rays = len(rays_o)\n        # sample points on rays\n        ray_pts, ray_indexs, inner_mask, t, rays_d_e = self.sample_ray(\n                ori_rays_o=rays_o, ori_rays_d=rays_d, is_train=global_step is not None, 
**render_kwargs)\n        n_max = len(t)\n        interval = render_kwargs['stepsize'] * self.voxel_size_ratio_density\n        ray_id, step_id = create_full_step_id(ray_pts.shape[:2])\n\n        # skip oversampled points outside scene bbox\n        mask = inner_mask.clone() # default\n\n        # changing shapes, only needed when the above procedures are commented out\n        t = t[None].repeat(num_rays, 1)\n        \n        # query for alpha w/ post-activation\n        density = self.density(ray_pts)\n        alpha = self.activate_density(density, interval)\n        \n        # apply fast color thresh\n        if self.fast_color_thres > 0:\n            # masked inner points, change this for other scenes!!!\n            mask = (alpha > self.fast_color_thres)\n            ray_pts = ray_pts[mask]\n            if rays_d_e is not None:\n                rays_d_e = rays_d_e[mask]\n            inner_mask = inner_mask[mask]\n            t = t[mask]\n            # changed because the above masking functions are removed\n            ray_id = ray_id[mask.flatten()]\n            step_id = step_id[mask.flatten()]\n            density = density[mask]\n            alpha = alpha[mask]\n\n        # compute accumulated transmittance\n        weights, alphainv_last = Alphas2Weights.apply(alpha, ray_id, num_rays)\n        if self.fast_color_thres > 0:\n            mask = (weights > self.fast_color_thres)\n            # print(f\"Masked ratio: {1 - mask.sum() / mask.numel()}.\")\n            ray_pts = ray_pts[mask]        \n            if rays_d_e is not None:\n                rays_d_e = rays_d_e[mask]    \n            inner_mask = inner_mask[mask]\n            t = t[mask]\n            ray_id = ray_id[mask]\n            step_id = step_id[mask]\n            density = density[mask]\n            alpha = alpha[mask]\n            weights = weights[mask]\n        else:\n            ray_pts = ray_pts.reshape(-1, ray_pts.shape[-1])\n            weights = weights.reshape(-1)\n            
inner_mask = inner_mask.reshape(-1)\n\n        # query for color\n        k0 = self.k0(ray_pts)\n        \n        if self.rgbnet is None:\n            # no view-depend effect\n            rgb = torch.sigmoid(k0)\n        elif self.vd is not None:\n            viewdirs_color = self.vd(viewdirs)[ray_id]\n            rgb_logit = k0 + viewdirs_color\n            rgb = torch.sigmoid(rgb_logit)\n        else:\n            # view-dependent color emission\n            viewdirs_emb = (viewdirs.unsqueeze(-1) * self.viewfreq).flatten(-2)\n            viewdirs_emb = torch.cat([viewdirs, viewdirs_emb.sin(), viewdirs_emb.cos()], -1)\n            viewdirs_emb = viewdirs_emb.flatten(0,-2)[ray_id]\n            rgb_feat = torch.cat([k0, viewdirs_emb], -1)\n            rgb_logit = self.rgbnet(rgb_feat)\n            rgb = torch.sigmoid(rgb_logit)\n\n        # Ray marching, rendering equations here.\n        rgb_marched = segment_coo(\n                src=weights.unsqueeze(-1) * rgb,\n                index=ray_id,\n                out=torch.zeros([num_rays, 3]),\n                reduce='sum')\n        \n        if render_kwargs.get('rand_bkgd', False):\n            rgb_marched += (alphainv_last.unsqueeze(-1) * torch.rand_like(rgb_marched))\n        \n        s = 1 - 1/(1+t)  # [0, inf] => [0, 1]\n        ret_dict.update({\n            'alphainv_last': alphainv_last,\n            'weights': weights,\n            'rgb_marched': rgb_marched,\n            'raw_density': density,\n            'raw_alpha': alpha,\n            'raw_rgb': rgb,\n            'ray_id': ray_id,\n            'step_id': step_id,\n            'n_max': n_max,\n            't': t,\n            's': s,\n        })\n\n        if render_kwargs.get('render_depth', False):\n            with torch.no_grad():\n                depth = segment_coo(\n                        src=(weights * s),\n                        index=ray_id,\n                        out=torch.zeros([num_rays]),\n                        reduce='sum')\n     
       ret_dict.update({'depth': depth})\n        return ret_dict\n    \n    def export_geometry_for_visualize(self, save_path):\n        with torch.no_grad():\n            dense_grid = self.density.get_dense_grid()\n            alpha = self.activate_density(dense_grid).squeeze().cpu().numpy()\n            color_grid = self.k0.get_dense_grid()\n            rgb = torch.sigmoid(color_grid).squeeze().permute(1,2,3,0).cpu().numpy()\n            np.savez_compressed(save_path, alpha=alpha, rgb=rgb)\n            print(f\"Geometry is saved at {save_path}.\")\n\n\nclass DistortionLoss(torch.autograd.Function):\n    @staticmethod\n    def forward(ctx, w, s, n_max, ray_id):\n        n_rays = ray_id.max()+1\n        interval = 1/n_max\n        w_prefix, w_total, ws_prefix, ws_total = ub360_utils_cuda.segment_cumsum(w, s, ray_id)\n        loss_uni = (1/3) * interval * w.pow(2)\n        loss_bi = 2 * w * (s * w_prefix - ws_prefix)\n        ctx.save_for_backward(w, s, w_prefix, w_total, ws_prefix, ws_total, ray_id)\n        ctx.interval = interval\n        return (loss_bi.sum() + loss_uni.sum()) / n_rays\n\n    @staticmethod\n    @torch.autograd.function.once_differentiable\n    def backward(ctx, grad_back):\n        w, s, w_prefix, w_total, ws_prefix, ws_total, ray_id = ctx.saved_tensors\n        interval = ctx.interval\n        grad_uni = (1/3) * interval * 2 * w\n        w_suffix = w_total[ray_id] - (w_prefix + w)\n        ws_suffix = ws_total[ray_id] - (ws_prefix + w*s)\n        grad_bi = 2 * (s * (w_prefix - w_suffix) + (ws_suffix - ws_prefix))\n        grad = grad_back * (grad_bi + grad_uni)\n        return grad, None, None, None\n\ndistortion_loss = DistortionLoss.apply\n"
  },
  {
    "path": "FourierGrid/__init__.py",
    "content": ""
  },
  {
    "path": "FourierGrid/arf.py",
    "content": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport imageio\nimport cv2\nimport pdb\nimport os\n\nclass ARF:\n    def __init__(self, cfg, data_dict, device):\n        super(ARF, self).__init__()\n        assert 'arf' in cfg, \"ARF should not be initialized according to cfg!\"\n        self.style_root = cfg.arf.style_root\n        self.style_id = cfg.arf.style_id\n        style_img_p = os.path.join(self.style_root, str(self.style_id) + \".jpg\")\n        self.device = device\n        # assume the images are of the same height / width.\n        self.w, self.h = data_dict['HW'][0][1], data_dict['HW'][0][0]\n\n        # initialize style image.\n        self.np_style_img = None\n        self.style_img = self.load_style_img(style_img_p)\n        \n        \n    def load_style_img(self, style_img_p):\n        # resize style image such that its long side matches the long side of content images\n        style_img = imageio.imread(style_img_p).astype(np.float32) / 255.0\n        style_h, style_w = style_img.shape[:2]\n        content_long_side = max([self.w, self.h])\n        if style_h > style_w:\n            style_img = cv2.resize(\n                style_img,\n                (int(content_long_side / style_h * style_w), content_long_side),\n                interpolation=cv2.INTER_AREA,\n            )\n        else:\n            style_img = cv2.resize(\n                style_img,\n                (content_long_side, int(content_long_side / style_w * style_h)),\n                interpolation=cv2.INTER_AREA,\n            )\n        style_img = cv2.resize(\n            style_img,\n            (style_img.shape[1] // 2, style_img.shape[0] // 2),\n            interpolation=cv2.INTER_AREA,\n        )\n        self.np_style_img = style_img\n        style_img = torch.from_numpy(style_img).to(device=self.device)\n        return style_img\n        \n    def match_colors_for_image_set(self, image_set, train_save_dir):  # code from ARF\n        \"\"\"\n       
 image_set: [N, H, W, 3]\n        style_img: [H, W, 3]\n        \"\"\"\n        imageio.imwrite(\n            os.path.join(train_save_dir, \"style_image.png\"),\n            np.clip(self.np_style_img * 255.0, 0.0, 255.0).astype(np.uint8),\n        )\n        sh = image_set.shape\n        image_set = image_set.reshape(-1, 3)\n        image_set = torch.tensor(image_set).to(self.device)\n        style_img = self.style_img.reshape(-1, 3).to(image_set.device)\n\n        mu_c = image_set.mean(0, keepdim=True)\n        mu_s = style_img.mean(0, keepdim=True)\n\n        cov_c = torch.matmul((image_set - mu_c).transpose(1, 0), image_set - mu_c) / float(image_set.size(0))\n        cov_s = torch.matmul((style_img - mu_s).transpose(1, 0), style_img - mu_s) / float(style_img.size(0))\n\n        u_c, sig_c, _ = torch.svd(cov_c)\n        u_s, sig_s, _ = torch.svd(cov_s)\n\n        u_c_i = u_c.transpose(1, 0)\n        u_s_i = u_s.transpose(1, 0)\n\n        scl_c = torch.diag(1.0 / torch.sqrt(torch.clamp(sig_c, 1e-8, 1e8)))\n        scl_s = torch.diag(torch.sqrt(torch.clamp(sig_s, 1e-8, 1e8)))\n\n        tmp_mat = u_s @ scl_s @ u_s_i @ u_c @ scl_c @ u_c_i\n        tmp_vec = mu_s.view(1, 3) - mu_c.view(1, 3) @ tmp_mat.T\n\n        image_set = image_set @ tmp_mat.T + tmp_vec.view(1, 3)\n        image_set = image_set.contiguous().clamp_(0.0, 1.0).view(sh)\n\n        color_tf = torch.eye(4).float().to(tmp_mat.device)\n        color_tf[:3, :3] = tmp_mat\n        color_tf[:3, 3:4] = tmp_vec.T\n        return image_set, color_tf\n\n"
  },
  {
    "path": "FourierGrid/bbox_compute.py",
    "content": "import torch\nimport numpy as np\nfrom FourierGrid import utils, dvgo\nimport time\nfrom FourierGrid.load_everything import load_existing_model\nfrom tqdm import tqdm\nimport pdb\n\n\ndef _compute_bbox_by_cam_frustrm_unbounded(cfg, HW, Ks, poses, i_train, near_clip):\n    # Find a tightest cube that cover all camera centers\n    xyz_min = torch.Tensor([np.inf, np.inf, np.inf])\n    xyz_max = -xyz_min\n    for (H, W), K, c2w in tqdm(zip(HW[i_train], Ks[i_train], poses[i_train]), total=len(HW[i_train])):\n        rays_o, rays_d, viewdirs = dvgo.get_rays_of_a_view(\n                H=H, W=W, K=K, c2w=c2w,\n                ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y)\n        pts = rays_o + rays_d * near_clip\n        xyz_min = torch.minimum(xyz_min, pts.amin((0,1)))\n        xyz_max = torch.maximum(xyz_max, pts.amax((0,1)))\n    center = (xyz_min + xyz_max) * 0.5\n    radius = (center - xyz_min).max() * cfg.data.unbounded_inner_r\n    xyz_min = center - radius\n    xyz_max = center + radius\n    return xyz_min, xyz_max\n\n\ndef FourierGrid_compute_bbox_by_cam_frustrm_nerfpp(cfg, HW, Ks, poses, i_train, near_clip):\n    # Find a tightest cube that cover all camera centers\n    xyz_min = torch.Tensor([np.inf, np.inf, np.inf])\n    xyz_max = -xyz_min\n    for (H, W), K, c2w in tqdm(zip(HW[i_train], Ks[i_train], poses[i_train]), total=len(HW[i_train])):\n        rays_o, rays_d, viewdirs = dvgo.get_rays_of_a_view(\n                H=H, W=W, K=K, c2w=c2w,\n                ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y)\n        pts = rays_o + rays_d * near_clip\n        xyz_min = torch.minimum(xyz_min, pts.amin((0,1)))\n        xyz_max = torch.maximum(xyz_max, pts.amax((0,1)))\n    center = (xyz_min + xyz_max) * 0.5\n    radius = (center - xyz_min).max() * cfg.data.unbounded_inner_r\n    xyz_min = center - radius\n    xyz_max = 
center + radius\n    return xyz_min, xyz_max\n\n\ndef FourierGrid_compute_bbox_by_cam_frustrm_waymo(cfg, HW, Ks, poses, i_train, near_clip):\n    xs, ys, zs = [], [], []\n    for (H, W), K, c2w in tqdm(zip(HW[i_train], Ks[i_train], poses[i_train]), total=len(HW[i_train])):\n        xs.append(c2w[:, 3][0].item())\n        ys.append(c2w[:, 3][1].item())\n        zs.append(c2w[:, 3][2].item())\n    \n    zmin, zmax = min(zs), max(zs)\n    xmin, xmax = min(xs), max(xs)\n    ymin, ymax = min(ys), max(ys)\n    x_extend = 0.05 # 0.05\n    y_extend = 0.01\n    z_extend = 0.01  # 0.01\n    xyz_min = [xmin - x_extend, ymin - y_extend, zmin - z_extend]\n    xyz_max = [xmax + x_extend, ymax + y_extend, zmax + z_extend]\n    xyz_min, xyz_max = torch.tensor(xyz_min), torch.tensor(xyz_max)\n    center = (xyz_min + xyz_max) * 0.5\n    radius = (center - xyz_min).max() * cfg.data.unbounded_inner_r\n    xyz_min = center - radius\n    xyz_max = center + radius\n    \n    # # manually set xyz_min and xyz_max\n    # xyz_min = torch.tensor([-1.0, -1.0, -1.0])\n    # xyz_max = torch.tensor([1.0, 1.0, 1.0])\n    return xyz_min, xyz_max\n\n\ndef FourierGrid_compute_bbox_by_cam_frustrm_mega(cfg, HW, Ks, poses, i_train, near_clip):\n    xs, ys, zs = [], [], []\n    for (H, W), K, c2w in tqdm(zip(HW[i_train], Ks[i_train], poses[i_train]), total=len(HW[i_train])):\n        xs.append(c2w[:, 3][0].item())\n        ys.append(c2w[:, 3][1].item())\n        zs.append(c2w[:, 3][2].item())\n    zmin, zmax = min(zs), max(zs)\n    xmin, xmax = min(xs), max(xs)\n    ymin, ymax = min(ys), max(ys)\n    x_distance, y_distance, z_distance = abs(xmax - xmin), abs(ymax - ymin), abs(zmax - zmin)\n    boundary_ratio = cfg.data.boundary_ratio\n    xyz_min = [xmin - boundary_ratio*x_distance, ymin - boundary_ratio * y_distance, zmin - boundary_ratio * z_distance]\n    xyz_max = [xmax + boundary_ratio*x_distance, ymax + boundary_ratio * y_distance, zmax + boundary_ratio * z_distance]\n    xyz_min, xyz_max = 
torch.tensor(xyz_min), torch.tensor(xyz_max)\n    center = (xyz_min + xyz_max) * 0.5\n    radius = (center - xyz_min).max() * cfg.data.unbounded_inner_r\n    xyz_min = center - radius\n    xyz_max = center + radius\n    return xyz_min, xyz_max\n\n\ndef _compute_bbox_by_cam_frustrm_bounded(cfg, HW, Ks, poses, i_train, near, far):\n    xyz_min = torch.Tensor([np.inf, np.inf, np.inf])\n    xyz_max = -xyz_min\n    for (H, W), K, c2w in zip(HW[i_train], Ks[i_train], poses[i_train]):\n        rays_o, rays_d, viewdirs = dvgo.get_rays_of_a_view(\n                H=H, W=W, K=K, c2w=c2w,\n                ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y)\n        if cfg.data.ndc:\n            pts_nf = torch.stack([rays_o+rays_d*near, rays_o+rays_d*far])\n        else:\n            pts_nf = torch.stack([rays_o+viewdirs*near, rays_o+viewdirs*far])\n        xyz_min = torch.minimum(xyz_min, pts_nf.amin((0,1,2)))\n        xyz_max = torch.maximum(xyz_max, pts_nf.amax((0,1,2)))\n    return xyz_min, xyz_max\n\n\ndef compute_bbox_by_cam_frustrm(args, cfg, HW, Ks, poses, i_train, near, far, **kwargs):\n    verbose = args.block_num <= 1\n    if verbose:\n        print('compute_bbox_by_cam_frustrm: start')\n    if cfg.data.dataset_type == \"waymo\":\n        xyz_min, xyz_max = FourierGrid_compute_bbox_by_cam_frustrm_waymo(\n                cfg, HW, Ks, poses, i_train, kwargs.get('near_clip', None))\n    elif cfg.data.dataset_type == \"nerfpp\" or cfg.model == 'FourierGrid':\n        xyz_min, xyz_max = FourierGrid_compute_bbox_by_cam_frustrm_nerfpp(\n                cfg, HW, Ks, poses, i_train, kwargs.get('near_clip', None))\n    elif cfg.data.unbounded_inward:\n        xyz_min, xyz_max = _compute_bbox_by_cam_frustrm_unbounded(\n                cfg, HW, Ks, poses, i_train, kwargs.get('near_clip', None))\n    else:\n        xyz_min, xyz_max = _compute_bbox_by_cam_frustrm_bounded(\n                cfg, HW, Ks, poses, i_train, 
near, far)\n    if verbose:\n        print('compute_bbox_by_cam_frustrm: xyz_min', xyz_min)\n        print('compute_bbox_by_cam_frustrm: xyz_max', xyz_max)\n        print('compute_bbox_by_cam_frustrm: finish')\n    return xyz_min, xyz_max\n\n\n@torch.no_grad()\ndef compute_bbox_by_coarse_geo(model_class, model_path, thres, device, args, cfg):\n    print('compute_bbox_by_coarse_geo: start')\n    eps_time = time.time()\n    # model = utils.load_model(model_class, model_path)\n    # TODO: validate this, this should be fine\n    model, _, _ = load_existing_model(args, cfg, cfg.fine_train, model_path, device=device)\n    model.to(device)\n    interp = torch.stack(torch.meshgrid(\n        torch.linspace(0, 1, model.world_size[0]),\n        torch.linspace(0, 1, model.world_size[1]),\n        torch.linspace(0, 1, model.world_size[2]),\n    ), -1)\n    dense_xyz = model.xyz_min * (1-interp) + model.xyz_max * interp\n    density = model.density(dense_xyz)\n    alpha = model.activate_density(density)\n    mask = (alpha > thres)\n    if mask.max() > 0:\n        active_xyz = dense_xyz[mask]\n    else:\n        print(\"Warning! No activated voxels found.\")\n        mask = (alpha > -1)\n        active_xyz = dense_xyz[mask]\n    xyz_min = active_xyz.amin(0)\n    xyz_max = active_xyz.amax(0)\n    print('compute_bbox_by_coarse_geo: xyz_min', xyz_min)\n    print('compute_bbox_by_coarse_geo: xyz_max', xyz_max)\n    eps_time = time.time() - eps_time\n    print('compute_bbox_by_coarse_geo: finish (eps time:', eps_time, 'secs)')\n    return xyz_min, xyz_max\n"
  },
  {
    "path": "FourierGrid/camera_utils.py",
    "content": "import numpy as np\nimport enum\nfrom dataclasses import dataclass\nfrom typing import List, Mapping, Optional, Text, Tuple, Union\nfrom torch import Tensor\nimport torch\n\n\n@dataclass\nclass Rays:\n    origins: Tensor\n    directions: Tensor\n    viewdirs: Tensor\n    radii: Tensor\n    near: Tensor\n    far: Tensor\n\n\ndef split_rays(rays, batch_size):\n    ret = []\n    origins_all = rays.origins.split(batch_size)\n    directions_all = rays.directions.split(batch_size)\n    viewdirs_all = rays.viewdirs.split(batch_size)\n    radii_all = rays.radii.split(batch_size)\n    near_all = rays.near.split(batch_size)\n    far_all  = rays.far.split(batch_size)\n\n    for o, d, v, r, n, f in zip(origins_all, directions_all, viewdirs_all, radii_all, near_all, far_all):\n        ret.append(Rays(o, d, v, r, n, f))\n    return ret\n\ndef intrinsic_matrix(fx: float,\n                     fy: float,\n                     cx: float,\n                     cy: float,\n                     ):\n    \"\"\"Intrinsic matrix for a pinhole camera in OpenCV coordinate system.\"\"\"\n    return np.array([\n        [fx, 0, cx],\n        [0, fy, cy],\n        [0, 0, 1.]], dtype=np.float32)\n\n\nclass ProjectionType(enum.Enum):\n    \"\"\"Camera projection type (standard perspective pinhole or fisheye model).\"\"\"\n    PERSPECTIVE = 'perspective'\n    FISHEYE = 'fisheye'\n\n\ndef convert_to_ndc(origins: Tensor,\n                   directions: Tensor,\n                   pixtocam: Tensor,\n                   near: float = 1.) 
-> Tuple[Tensor, Tensor]:\n    \"\"\"Converts a set of rays to normalized device coordinates (NDC).\n\n    Args:\n      origins: ndarray(float32), [..., 3], world space ray origins.\n      directions: ndarray(float32), [..., 3], world space ray directions.\n      pixtocam: ndarray(float32), [3, 3], inverse intrinsic matrix.\n      near: float, near plane along the negative z axis.\n      xnp: either numpy or jax.numpy.\n\n    Returns:\n      origins_ndc: ndarray(float32), [..., 3].\n      directions_ndc: ndarray(float32), [..., 3].\n\n    This function assumes input rays should be mapped into the NDC space for a\n    perspective projection pinhole camera, with identity extrinsic matrix (pose)\n    and intrinsic parameters defined by inputs focal, width, and height.\n\n    The near value specifies the near plane of the frustum, and the far plane is\n    assumed to be infinity.\n\n    The ray bundle for the identity pose camera will be remapped to parallel rays\n    within the (-1, -1, -1) to (1, 1, 1) cube. Any other ray in the original\n    world space can be remapped as long as it has dz < 0 (ray direction has a\n    negative z-coord); this allows us to share a common NDC space for \"forward\n    facing\" scenes.\n\n    Note that\n        projection(origins + t * directions)\n    will NOT be equal to\n        origins_ndc + t * directions_ndc\n    and that the directions_ndc are not unit length. Rather, directions_ndc is\n    defined such that the valid near and far planes in NDC will be 0 and 1.\n\n    See Appendix C in https://arxiv.org/abs/2003.08934 for additional details.\n    \"\"\"\n\n    # Shift ray origins to near plane, such that oz = -near.\n    # This makes the new near bound equal to 0.\n    t = -(near + origins[..., 2]) / directions[..., 2]\n    origins = origins + t[..., None] * directions\n\n    dx, dy, dz = torch.moveaxis(directions, -1, 0)\n    ox, oy, oz = torch.moveaxis(origins, -1, 0)\n\n    xmult = 1. / pixtocam[0, 2]  # Equal to -2. 
* focal / cx\n    ymult = 1. / pixtocam[1, 2]  # Equal to -2. * focal / cy\n\n    # Perspective projection into NDC for the t = 0 near points\n    #     origins + 0 * directions\n    origins_ndc = torch.stack([xmult * ox / oz, ymult * oy / oz,\n                               -torch.ones_like(oz)], dim=-1)\n\n    # Perspective projection into NDC for the t = infinity far points\n    #     origins + infinity * directions\n    infinity_ndc = torch.stack([xmult * dx / dz, ymult * dy / dz,\n                                torch.ones_like(oz)],\n                               dim=-1)\n\n    # directions_ndc points from origins_ndc to infinity_ndc\n    directions_ndc = infinity_ndc - origins_ndc\n\n    return origins_ndc, directions_ndc\n\n\ndef pixels_to_rays(\n        pix_x_int: Tensor,\n        pix_y_int: Tensor,\n        pixtocams: Tensor,\n        camtoworlds: Tensor,\n        distortion_params: Optional[Mapping[str, float]] = None,\n        pixtocam_ndc: Optional[Tensor] = None,\n        camtype: ProjectionType = ProjectionType.PERSPECTIVE,\n) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:\n    \"\"\"Calculates rays given pixel coordinates, intrinisics, and extrinsics.\n\n    Given 2D pixel coordinates pix_x_int, pix_y_int for cameras with\n    inverse intrinsics pixtocams and extrinsics camtoworlds (and optional\n    distortion coefficients distortion_params and NDC space projection matrix\n    pixtocam_ndc), computes the corresponding 3D camera rays.\n\n    Vectorized over the leading dimensions of the first four arguments.\n\n    Args:\n      pix_x_int: int array, shape SH, x coordinates of image pixels.\n      pix_y_int: int array, shape SH, y coordinates of image pixels.\n      pixtocams: float array, broadcastable to SH + [3, 3], inverse intrinsics.\n      camtoworlds: float array, broadcastable to SH + [3, 4], camera extrinsics.\n      distortion_params: dict of floats, optional camera distortion parameters.\n      pixtocam_ndc: float array, [3, 3], 
optional inverse intrinsics for NDC.\n      camtype: camera_utils.ProjectionType, fisheye or perspective camera.\n      xnp: either numpy or jax.numpy.\n\n    Returns:\n      origins: float array, shape SH + [3], ray origin points.\n      directions: float array, shape SH + [3], ray direction vectors.\n      viewdirs: float array, shape SH + [3], normalized ray direction vectors.\n      radii: float array, shape SH + [1], ray differential radii.\n      imageplane: float array, shape SH + [2], xy coordinates on the image plane.\n        If the image plane is at world space distance 1 from the pinhole, then\n        imageplane will be the xy coordinates of a pixel in that space (so the\n        camera ray direction at the origin would be (x, y, -1) in OpenGL coords).\n    \"\"\"\n\n    # Must add half pixel offset to shoot rays through pixel centers.\n    def pix_to_dir(x, y):\n        return torch.stack([x + .5, y + .5, torch.ones_like(x)], dim=-1)\n\n    # We need the dx and dy rays to calculate ray radii for mip-NeRF cones.\n    pixel_dirs_stacked = torch.stack([\n        pix_to_dir(pix_x_int, pix_y_int),\n        pix_to_dir(pix_x_int + 1, pix_y_int),\n        pix_to_dir(pix_x_int, pix_y_int + 1)\n    ], dim=0)\n\n    # For jax, need to specify high-precision matmul.\n    mat_vec_mul = lambda A, b: torch.matmul(A, b[..., None])[..., 0]\n\n    # Apply inverse intrinsic matrices.\n    camera_dirs_stacked = mat_vec_mul(pixtocams, pixel_dirs_stacked)\n\n    if distortion_params is not None:\n        # Correct for distortion.\n        x, y = _radial_and_tangential_undistort(\n            camera_dirs_stacked[..., 0],\n            camera_dirs_stacked[..., 1],\n            **distortion_params)\n        camera_dirs_stacked = torch.stack([x, y, torch.ones_like(x)], -1)\n\n    if camtype == ProjectionType.FISHEYE:\n        theta = torch.sqrt(torch.sum(torch.square(camera_dirs_stacked[..., :2]), dim=-1))\n        theta = theta.clip(None, np.pi)\n\n        sin_theta_over_theta 
= torch.sin(theta) / theta\n        camera_dirs_stacked = torch.stack([\n            camera_dirs_stacked[..., 0] * sin_theta_over_theta,\n            camera_dirs_stacked[..., 1] * sin_theta_over_theta,\n            torch.cos(theta),\n            ], dim=-1)\n\n    # Flip from OpenCV to OpenGL coordinate system.\n    camera_dirs_stacked = torch.matmul(camera_dirs_stacked,\n                                       torch.diag(torch.tensor([1., -1., -1.],\n                                                               dtype=torch.float32,\n                                                               device=camera_dirs_stacked.device)))\n\n    # Extract 2D image plane (x, y) coordinates.\n    imageplane = camera_dirs_stacked[0, ..., :2]\n\n    # Apply camera rotation matrices.\n    directions_stacked = mat_vec_mul(camtoworlds[..., :3, :3],\n                                     camera_dirs_stacked)\n    # Extract the offset rays.\n    directions, dx, dy = directions_stacked\n\n    origins = torch.broadcast_to(camtoworlds[..., :3, -1], directions.shape)\n    viewdirs = directions / torch.linalg.norm(directions, ord=2, dim=-1, keepdim=True)\n\n    if pixtocam_ndc is None:\n        # Distance from each unit-norm direction vector to its neighbors.\n        dx_norm = torch.linalg.norm(dx - directions, ord=2, dim=-1)\n        dy_norm = torch.linalg.norm(dy - directions, ord=2, dim=-1)\n    else:\n        # Convert ray origins and directions into projective NDC space.\n        origins_dx, _ = convert_to_ndc(origins, dx, pixtocam_ndc)\n        origins_dy, _ = convert_to_ndc(origins, dy, pixtocam_ndc)\n        origins, directions = convert_to_ndc(origins, directions, pixtocam_ndc)\n\n        # In NDC space, we use the offset between origins instead of directions.\n        dx_norm = torch.linalg.norm(origins_dx - origins, ord=2, dim=-1)\n        dy_norm = torch.linalg.norm(origins_dy - origins, ord=2, dim=-1)\n\n    # Cut the distance in half, multiply it to match the variance of a 
uniform\n    # distribution the size of a pixel (1/12, see the original mipnerf paper).\n    radii = (0.5 * (dx_norm + dy_norm))[..., None] * 2 / np.sqrt(12.)\n\n    return origins, directions, viewdirs, radii, imageplane\n\n\ndef _compute_residual_and_jacobian(\n        x: Tensor,\n        y: Tensor,\n        xd: Tensor,\n        yd: Tensor,\n        k1: float = 0.0,\n        k2: float = 0.0,\n        k3: float = 0.0,\n        k4: float = 0.0,\n        p1: float = 0.0,\n        p2: float = 0.0,\n) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:\n    \"\"\"Auxiliary function of radial_and_tangential_undistort().\"\"\"\n    # Adapted from https://github.com/google/nerfies/blob/main/nerfies/camera.py\n    # let r(x, y) = x^2 + y^2;\n    #     d(x, y) = 1 + k1 * r(x, y) + k2 * r(x, y) ^2 + k3 * r(x, y)^3 +\n    #                   k4 * r(x, y)^4;\n    r = x * x + y * y\n    d = 1.0 + r * (k1 + r * (k2 + r * (k3 + r * k4)))\n\n    # The perfect projection is:\n    # xd = x * d(x, y) + 2 * p1 * x * y + p2 * (r(x, y) + 2 * x^2);\n    # yd = y * d(x, y) + 2 * p2 * x * y + p1 * (r(x, y) + 2 * y^2);\n    #\n    # Let's define\n    #\n    # fx(x, y) = x * d(x, y) + 2 * p1 * x * y + p2 * (r(x, y) + 2 * x^2) - xd;\n    # fy(x, y) = y * d(x, y) + 2 * p2 * x * y + p1 * (r(x, y) + 2 * y^2) - yd;\n    #\n    # We are looking for a solution that satisfies\n    # fx(x, y) = fy(x, y) = 0;\n    fx = d * x + 2 * p1 * x * y + p2 * (r + 2 * x * x) - xd\n    fy = d * y + 2 * p2 * x * y + p1 * (r + 2 * y * y) - yd\n\n    # Compute derivative of d over [x, y]\n    d_r = (k1 + r * (2.0 * k2 + r * (3.0 * k3 + r * 4.0 * k4)))\n    d_x = 2.0 * x * d_r\n    d_y = 2.0 * y * d_r\n\n    # Compute derivative of fx over x and y.\n    fx_x = d + d_x * x + 2.0 * p1 * y + 6.0 * p2 * x\n    fx_y = d_y * x + 2.0 * p1 * x + 2.0 * p2 * y\n\n    # Compute derivative of fy over x and y.\n    fy_x = d_x * y + 2.0 * p2 * y + 2.0 * p1 * x\n    fy_y = d + d_y * y + 2.0 * p2 * x + 6.0 * p1 * y\n\n    
return fx, fy, fx_x, fx_y, fy_x, fy_y\n\n\ndef _radial_and_tangential_undistort(\n        xd: Tensor,\n        yd: Tensor,\n        k1: float = 0,\n        k2: float = 0,\n        k3: float = 0,\n        k4: float = 0,\n        p1: float = 0,\n        p2: float = 0,\n        eps: float = 1e-9,\n        max_iterations=10) -> Tuple[Tensor, Tensor]:\n    \"\"\"Computes undistorted (x, y) from (xd, yd).\"\"\"\n    # From https://github.com/google/nerfies/blob/main/nerfies/camera.py\n    # Initialize from the distorted point.\n    x = xd.clone()\n    y = yd.clone()\n\n    for _ in range(max_iterations):\n        fx, fy, fx_x, fx_y, fy_x, fy_y = _compute_residual_and_jacobian(\n            x=x, y=y, xd=xd, yd=yd, k1=k1, k2=k2, k3=k3, k4=k4, p1=p1, p2=p2)\n        denominator = fy_x * fx_y - fx_x * fy_y\n        x_numerator = fx * fy_y - fy * fx_y\n        y_numerator = fy * fx_x - fx * fy_x\n        step_x = torch.where(\n            torch.abs(denominator) > eps, x_numerator / denominator,\n            torch.zeros_like(denominator))\n        step_y = torch.where(\n            torch.abs(denominator) > eps, y_numerator / denominator,\n            torch.zeros_like(denominator))\n\n        x = x + step_x\n        y = y + step_y\n\n    return x, y\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/__init__.py",
    "content": ""
  },
  {
    "path": "FourierGrid/common_data_loaders/load_blendedmvs.py",
    "content": "import os\nimport glob\nimport torch\nimport numpy as np\nimport imageio\nimport json\nimport torch.nn.functional as F\nimport cv2\n\n\ndef load_blendedmvs_data(basedir):\n    pose_paths = sorted(glob.glob(os.path.join(basedir, 'pose', '*txt')))\n    rgb_paths = sorted(glob.glob(os.path.join(basedir, 'rgb', '*png')))\n\n    all_poses = []\n    all_imgs = []\n    i_split = [[], []]\n    for i, (pose_path, rgb_path) in enumerate(zip(pose_paths, rgb_paths)):\n        i_set = int(os.path.split(rgb_path)[-1][0])\n        all_imgs.append((imageio.imread(rgb_path) / 255.).astype(np.float32))\n        all_poses.append(np.loadtxt(pose_path).astype(np.float32))\n        i_split[i_set].append(i)\n\n    imgs = np.stack(all_imgs, 0)\n    poses = np.stack(all_poses, 0)\n    i_split.append(i_split[-1])\n\n    path_intrinsics = os.path.join(basedir, 'intrinsics.txt')\n    H, W = imgs[0].shape[:2]\n    K = np.loadtxt(path_intrinsics)\n    focal = float(K[0,0])\n\n    render_poses = torch.Tensor(np.loadtxt(os.path.join(basedir, 'test_traj.txt')).reshape(-1,4,4).astype(np.float32))\n\n    return imgs, poses, render_poses, [H, W, focal], K, i_split\n\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/load_blender.py",
    "content": "import os\nimport torch\nimport numpy as np\nimport imageio\nimport pdb\nimport json\nimport torch.nn.functional as F\nimport cv2\n\n\ntrans_t = lambda t : torch.Tensor([\n    [1,0,0,0],\n    [0,1,0,0],\n    [0,0,1,t],\n    [0,0,0,1]]).float()\n\nrot_phi = lambda phi : torch.Tensor([\n    [1,0,0,0],\n    [0,np.cos(phi),-np.sin(phi),0],\n    [0,np.sin(phi), np.cos(phi),0],\n    [0,0,0,1]]).float()\n\nrot_theta = lambda th : torch.Tensor([\n    [np.cos(th),0,-np.sin(th),0],\n    [0,1,0,0],\n    [np.sin(th),0, np.cos(th),0],\n    [0,0,0,1]]).float()\n\n\ndef pose_spherical(theta, phi, radius):\n    c2w = trans_t(radius)\n    c2w = rot_phi(phi/180.*np.pi) @ c2w\n    c2w = rot_theta(theta/180.*np.pi) @ c2w\n    c2w = torch.Tensor(np.array([[-1,0,0,0],[0,0,1,0],[0,1,0,0],[0,0,0,1]])) @ c2w\n    return c2w\n\n\ndef load_blender_data(basedir, half_res=False, testskip=1):\n    splits = ['train', 'val', 'test']\n    metas = {}\n    for s in splits:\n        with open(os.path.join(basedir, 'transforms_{}.json'.format(s)), 'r') as fp:\n            metas[s] = json.load(fp)\n\n    all_imgs = []\n    all_poses = []\n    counts = [0]\n    for s in splits:\n        meta = metas[s]\n        imgs = []\n        poses = []\n        if s=='train' or testskip==0:\n            skip = 1\n        else:\n            skip = testskip\n\n        for frame in meta['frames'][::skip]:\n            fname = os.path.join(basedir, frame['file_path'] + '.png')\n            imgs.append(imageio.imread(fname))\n            poses.append(np.array(frame['transform_matrix']))\n        imgs = (np.array(imgs) / 255.).astype(np.float32) # keep all 4 channels (RGBA)\n        poses = np.array(poses).astype(np.float32)\n        counts.append(counts[-1] + imgs.shape[0])\n        all_imgs.append(imgs)\n        all_poses.append(poses)\n\n    i_split = [np.arange(counts[i], counts[i+1]) for i in range(3)]\n\n    imgs = np.concatenate(all_imgs, 0)\n    poses = np.concatenate(all_poses, 0)\n    
H, W = imgs[0].shape[:2]\n    camera_angle_x = float(meta['camera_angle_x'])\n    focal = .5 * W / np.tan(.5 * camera_angle_x)\n\n    render_poses = torch.stack([pose_spherical(angle, -30.0, 4.0) for angle in np.linspace(-180,180,160+1)[:-1]], 0)\n\n    if half_res:\n        H = H//2\n        W = W//2\n        focal = focal/2.\n\n        imgs_half_res = np.zeros((imgs.shape[0], H, W, 4))\n        for i, img in enumerate(imgs):\n            imgs_half_res[i] = cv2.resize(img, (W, H), interpolation=cv2.INTER_AREA)\n        imgs = imgs_half_res\n        # imgs = tf.image.resize_area(imgs, [400, 400]).numpy()\n    return imgs, poses, render_poses, [H, W, focal], i_split\n\n\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/load_co3d.py",
    "content": "import os\nimport json\nimport gzip\nimport glob\nimport torch\nimport numpy as np\nimport imageio\nimport torch.nn.functional as F\nimport cv2\n\n\ndef load_co3d_data(cfg):\n\n    # load meta\n    with gzip.open(cfg.annot_path, 'rt', encoding='utf8') as zipfile:\n        annot = [v for v in json.load(zipfile) if v['sequence_name'] == cfg.sequence_name]\n    with open(cfg.split_path) as f:\n        split = json.load(f)\n        train_im_path = set()\n        test_im_path = set()\n        for k, lst in split.items():\n            for v in lst:\n                if v[0] == cfg.sequence_name:\n                    if 'known' in k:\n                        train_im_path.add(v[-1])\n                    else:\n                        test_im_path.add(v[-1])\n    assert len(annot) == len(train_im_path) + len(test_im_path), 'Mismatch: '\\\n            f'{len(annot)} == {len(train_im_path) + len(test_im_path)}'\n\n    # load datas\n    imgs = []\n    masks = []\n    poses = []\n    Ks = []\n    i_split = [[], []]\n    remove_empty_masks_cnt = [0, 0]\n    for i, meta in enumerate(annot):\n        im_fname = meta['image']['path']\n        assert im_fname in train_im_path or im_fname in test_im_path\n        sid = 0 if im_fname in train_im_path else 1\n        if meta['mask']['mass'] == 0:\n            remove_empty_masks_cnt[sid] += 1\n            continue\n        im_path = os.path.join(cfg.datadir, im_fname)\n        mask_path = os.path.join(cfg.datadir, meta['mask']['path'])\n        mask = imageio.imread(mask_path) / 255.\n        if mask.max() < 0.5:\n            remove_empty_masks_cnt[sid] += 1\n            continue\n        Rt = np.concatenate([meta['viewpoint']['R'], np.array(meta['viewpoint']['T'])[:,None]], 1)\n        pose = np.linalg.inv(np.concatenate([Rt, [[0,0,0,1]]]))\n        imgs.append(imageio.imread(im_path) / 255.)\n        masks.append(mask)\n        poses.append(pose)\n        assert imgs[-1].shape[:2] == tuple(meta['image']['size'])\n      
  half_image_size_wh = np.float32(meta['image']['size'][::-1]) * 0.5\n        principal_point = np.float32(meta['viewpoint']['principal_point'])\n        focal_length = np.float32(meta['viewpoint']['focal_length'])\n        principal_point_px = -1.0 * (principal_point - 1.0) * half_image_size_wh\n        focal_length_px = focal_length * half_image_size_wh\n        Ks.append(np.array([\n            [focal_length_px[0], 0, principal_point_px[0]],\n            [0, focal_length_px[1], principal_point_px[1]],\n            [0, 0, 1],\n        ]))\n        i_split[sid].append(len(imgs)-1)\n\n    if sum(remove_empty_masks_cnt) > 0:\n        print('load_co3d_data: removed %d train / %d test due to empty mask' % tuple(remove_empty_masks_cnt))\n    print(f'load_co3d_data: num images {len(i_split[0])} train / {len(i_split[1])} test')\n\n    imgs = np.array(imgs)\n    masks = np.array(masks)\n    poses = np.stack(poses, 0)\n    Ks = np.stack(Ks, 0)\n    render_poses = poses[i_split[-1]]\n    i_split.append(i_split[-1])\n\n    # visyalization hwf\n    H, W = np.array([im.shape[:2] for im in imgs]).mean(0).astype(int)\n    focal = Ks[:,[0,1],[0,1]].mean()\n\n    return imgs, masks, poses, render_poses, [H, W, focal], Ks, i_split\n\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/load_common_data.py",
    "content": "import numpy as np\nimport pdb\nfrom .load_llff import load_llff_data\nfrom .load_free import load_free_data\nfrom .load_blender import load_blender_data\nfrom .load_nsvf import load_nsvf_data\nfrom .load_blendedmvs import load_blendedmvs_data\nfrom .load_tankstemple import load_tankstemple_data\nfrom .load_deepvoxels import load_dv_data\nfrom .load_co3d import load_co3d_data\nfrom .load_nerfpp import load_nerfpp_data\n\n\ndef load_common_data(args):\n\n    K, depths = None, None\n    near_clip = None\n    if not 'training_ids' in args:\n        training_ids = None\n    else:\n        training_ids = args['training_ids']\n        \n    if args.dataset_type == 'llff':\n        images, depths, poses, bds, render_poses, i_test = load_llff_data(\n                args.datadir, args.factor, args.width, args.height,\n                recenter=True, bd_factor=args.bd_factor,\n                spherify=args.spherify,\n                load_depths=args.load_depths,\n                movie_render_kwargs=args.movie_render_kwargs)\n        hwf = poses[0,:3,-1]\n        poses = poses[:,:3,:4]\n        print('Loaded llff', images.shape, render_poses.shape, hwf, args.datadir)\n        if not isinstance(i_test, list):\n            i_test = [i_test]\n\n        if args.llffhold > 0:\n            print('Auto LLFF holdout,', args.llffhold)\n            i_test = np.arange(images.shape[0])[::args.llffhold]\n\n        i_val = i_test\n        i_train = np.array([i for i in np.arange(int(images.shape[0])) if\n                        (i not in i_test and i not in i_val)])\n\n        print('DEFINING BOUNDS')\n        if args.ndc:\n            near = 0.\n            far = 1.\n        else:\n            near_clip = max(np.ndarray.min(bds) * .9, 0)\n            _far = max(np.ndarray.max(bds) * 1., 0)\n            near = 0\n            far = inward_nearfar_heuristic(poses[i_train, :3, 3])[1]\n            print('near_clip', near_clip)\n            print('original far', _far)\n        
print('NEAR FAR', near, far)\n    elif args.dataset_type == 'free':\n        images, depths, intri, poses, bds, render_poses, i_test = load_free_data(\n            args, args.datadir, args.factor, args.width, args.height,\n            recenter=True, bd_factor=args.bd_factor,\n            spherify=args.spherify,\n            load_depths=args.load_depths,\n            movie_render_kwargs=args.movie_render_kwargs, training_ids=training_ids, sc=args.pose_scale)\n        i_val = i_test\n        i_train = np.array([i for i in np.arange(int(images.shape[0])) if (i not in i_test and i not in i_val)])\n        near_clip = max(np.ndarray.min(bds) * .9, 0)\n        _far = max(np.ndarray.max(bds) * 1., 0)\n        if args.ndc:\n            near = 0.\n            far = 1.\n        else:\n            near = 0\n            far = inward_nearfar_heuristic(poses[i_train, :3, 3])[1]\n        \n        # print('DEFINING BOUNDS')\n        # if args.ndc:\n        #     near = 0.\n        #     far = 1.\n        # else:\n        #     near_clip = max(np.ndarray.min(bds) * .9, 0)\n        #     _far = max(np.ndarray.max(bds) * 1., 0)\n        #     near = 0\n        #     far = inward_nearfar_heuristic(poses[i_train, :3, 3])[1]\n        #     print('near_clip', near_clip)\n        #     print('original far', _far)\n        # print('NEAR FAR', near, far)\n        \n        # Cast intrinsics to right types\n        # H, W, focal = hwf\n        # H, W = int(H), int(W)\n        # hwf = [H, W, focal]\n        HW = np.array([im.shape[:2] for im in images])\n        irregular_shape = (images.dtype is np.dtype('object'))\n\n        # all data should be in [N \\times D]\n        # near_clip = None\n        near_clip = max(np.ndarray.min(bds) * .9, 0)\n        data_dict = dict(\n            hwf=None, HW=HW, Ks=intri,\n            near=near, far=far, near_clip=near_clip,\n            i_train=i_train, i_val=i_val, i_test=i_test,\n            poses=poses, render_poses=render_poses,\n            
images=images, depths=depths,\n            irregular_shape=irregular_shape,\n        )\n        return data_dict\n    elif args.dataset_type == 'nerfstudio':\n        images, depths, poses, bds, render_poses, i_test = load_nerfstudio_data(\n                args.datadir, args.factor, args.width, args.height, recenter=args.recenter, bd_factor=args.bd_factor, dvgohold=args.dvgohold,\n                spherify=args.spherify, load_depths=args.load_depths, movie_render_kwargs=args.movie_render_kwargs)\n        hwf = poses[0,:3,-1]\n        poses = poses[:,:3,:4]\n        print('Loaded nerfstudio', images.shape, render_poses.shape, hwf, args.datadir)\n        if not isinstance(i_test, list):\n            i_test = [i_test]\n\n        if args.llffhold > 0:\n            print('Auto LLFF holdout,', args.llffhold)\n            i_test = np.arange(images.shape[0])[::args.llffhold]\n\n        i_val = i_test\n        i_train = np.array([i for i in np.arange(int(images.shape[0])) if\n                        (i not in i_test and i not in i_val)])\n\n        print('DEFINING BOUNDS')\n        if args.ndc:\n            near = 0.\n            far = 1.\n        else:\n            near_clip = max(np.ndarray.min(bds) * .9, 0)\n            _far = max(np.ndarray.max(bds) * 1., 0)\n            near = 0\n            far = inward_nearfar_heuristic(poses[i_train, :3, 3])[1]\n    elif args.dataset_type == 'blender':\n        images, poses, render_poses, hwf, i_split = load_blender_data(args.datadir, args.half_res, args.testskip)\n        print('Loaded blender', images.shape, render_poses.shape, hwf, args.datadir)\n        i_train, i_val, i_test = i_split\n        near, far = 2., 6.\n\n        if images.shape[-1] == 4:\n            if args.white_bkgd:\n                images = images[...,:3]*images[...,-1:] + (1.-images[...,-1:])\n            else:\n                images = images[...,:3]*images[...,-1:]\n\n    elif args.dataset_type == 'blendedmvs':\n        images, poses, render_poses, hwf, K, 
i_split = load_blendedmvs_data(args.datadir)\n        print('Loaded blendedmvs', images.shape, render_poses.shape, hwf, args.datadir)\n        i_train, i_val, i_test = i_split\n\n        near, far = inward_nearfar_heuristic(poses[i_train, :3, 3])\n\n        assert images.shape[-1] == 3\n\n    elif args.dataset_type == 'tankstemple':\n        images, poses, render_poses, hwf, K, i_split = load_tankstemple_data(\n                args.datadir, movie_render_kwargs=args.movie_render_kwargs)\n        print('Loaded tankstemple', images.shape, render_poses.shape, hwf, args.datadir)\n        i_train, i_val, i_test = i_split\n\n        # near, far = inward_nearfar_heuristic(poses[i_train, :3, 3], ratio=0)\n        near_clip, far = inward_nearfar_heuristic(poses[i_train, :3, 3], ratio=0.02)\n        near = 0\n        if images.shape[-1] == 4:\n            if args.white_bkgd:\n                images = images[...,:3]*images[...,-1:] + (1.-images[...,-1:])\n            else:\n                images = images[...,:3]*images[...,-1:]\n\n    elif args.dataset_type == 'nsvf':\n        images, poses, render_poses, hwf, i_split = load_nsvf_data(args.datadir)\n        print('Loaded nsvf', images.shape, render_poses.shape, hwf, args.datadir)\n        i_train, i_val, i_test = i_split\n\n        near, far = inward_nearfar_heuristic(poses[i_train, :3, 3])\n\n        if images.shape[-1] == 4:\n            if args.white_bkgd:\n                images = images[...,:3]*images[...,-1:] + (1.-images[...,-1:])\n            else:\n                images = images[...,:3]*images[...,-1:]\n\n    elif args.dataset_type == 'deepvoxels':\n        images, poses, render_poses, hwf, i_split = load_dv_data(scene=args.scene, basedir=args.datadir, testskip=args.testskip)\n        print('Loaded deepvoxels', images.shape, render_poses.shape, hwf, args.datadir)\n        i_train, i_val, i_test = i_split\n\n        hemi_R = np.mean(np.linalg.norm(poses[:,:3,-1], axis=-1))\n        near = hemi_R - 1\n        far = 
hemi_R + 1\n        assert args.white_bkgd\n        assert images.shape[-1] == 3\n\n    elif args.dataset_type == 'co3d':\n        # each image can be in different shapes and intrinsics\n        images, masks, poses, render_poses, hwf, K, i_split = load_co3d_data(args)\n        print('Loaded co3d', args.datadir, args.annot_path, args.sequence_name)\n        i_train, i_val, i_test = i_split\n\n        near, far = inward_nearfar_heuristic(poses[i_train, :3, 3], ratio=0)\n\n        for i in range(len(images)):\n            if args.white_bkgd:\n                images[i] = images[i] * masks[i][...,None] + (1.-masks[i][...,None])\n            else:\n                images[i] = images[i] * masks[i][...,None]\n\n    elif args.dataset_type == 'nerfpp':\n        images, poses, render_poses, hwf, K, i_split = load_nerfpp_data(args.datadir, rerotate=False,\n                                                                        training_ids=training_ids)\n        print('Loaded nerf_pp', images.shape, hwf, args.datadir)\n        i_train, i_val, i_test = i_split\n        # TODO: remove the near clip and far arguments\n        near_clip, far = inward_nearfar_heuristic(poses[i_train, :3, 3], ratio=0.02)\n        near = 0\n    else:\n        raise NotImplementedError(f'Unknown dataset type {args.dataset_type} exiting')\n\n    # Cast intrinsics to right types\n    H, W, focal = hwf\n    H, W = int(H), int(W)\n    hwf = [H, W, focal]\n    HW = np.array([im.shape[:2] for im in images])\n    irregular_shape = (images.dtype is np.dtype('object'))\n\n    if K is None:\n        K = np.array([\n            [focal, 0, 0.5*W],\n            [0, focal, 0.5*H],\n            [0, 0, 1]\n        ])\n\n    if len(K.shape) == 2:\n        Ks = K[None].repeat(len(poses), axis=0)\n    else:\n        Ks = K\n    render_poses = render_poses[...,:4]\n    data_dict = dict(\n        hwf=hwf, HW=HW, Ks=Ks,\n        near=near, far=far, near_clip=near_clip,\n        i_train=i_train, i_val=i_val, 
i_test=i_test,\n        poses=poses, render_poses=render_poses,\n        images=images, depths=depths,\n        irregular_shape=irregular_shape,\n    )\n    return data_dict\n\n\ndef inward_nearfar_heuristic(cam_o, ratio=0.05):\n    dist = np.linalg.norm(cam_o[:,None] - cam_o, axis=-1)\n    far = dist.max()  # could be too small to exist the scene bbox\n                      # it is only used to determined scene bbox\n                      # lib/dvgo use 1e9 as far\n    near = far * ratio\n    return near, far\n\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/load_deepvoxels.py",
    "content": "import os\nimport numpy as np\nimport imageio\n\n\ndef load_dv_data(scene='cube', basedir='/data/deepvoxels', testskip=1):\n\n    def parse_intrinsics(filepath, trgt_sidelength, invert_y=False):\n        # Get camera intrinsics\n        with open(filepath, 'r') as file:\n            f, cx, cy = list(map(float, file.readline().split()))[:3]\n            grid_barycenter = np.array(list(map(float, file.readline().split())))\n            near_plane = float(file.readline())\n            scale = float(file.readline())\n            height, width = map(float, file.readline().split())\n\n            try:\n                world2cam_poses = int(file.readline())\n            except ValueError:\n                world2cam_poses = None\n\n        if world2cam_poses is None:\n            world2cam_poses = False\n\n        world2cam_poses = bool(world2cam_poses)\n\n        print(cx,cy,f,height,width)\n\n        cx = cx / width * trgt_sidelength\n        cy = cy / height * trgt_sidelength\n        f = trgt_sidelength / height * f\n\n        fx = f\n        if invert_y:\n            fy = -f\n        else:\n            fy = f\n\n        # Build the intrinsic matrices\n        full_intrinsic = np.array([[fx, 0., cx, 0.],\n                                   [0., fy, cy, 0],\n                                   [0., 0, 1, 0],\n                                   [0, 0, 0, 1]])\n\n        return full_intrinsic, grid_barycenter, scale, near_plane, world2cam_poses\n\n\n    def load_pose(filename):\n        assert os.path.isfile(filename)\n        nums = open(filename).read().split()\n        return np.array([float(x) for x in nums]).reshape([4,4]).astype(np.float32)\n\n\n    H = 512\n    W = 512\n    deepvoxels_base = '{}/train/{}/'.format(basedir, scene)\n\n    full_intrinsic, grid_barycenter, scale, near_plane, world2cam_poses = parse_intrinsics(os.path.join(deepvoxels_base, 'intrinsics.txt'), H)\n    print(full_intrinsic, grid_barycenter, scale, near_plane, 
world2cam_poses)\n    focal = full_intrinsic[0,0]\n    print(H, W, focal)\n\n    def dir2poses(posedir):\n        poses = np.stack([load_pose(os.path.join(posedir, f)) for f in sorted(os.listdir(posedir)) if f.endswith('txt')], 0)\n        transf = np.array([\n            [1,0,0,0],\n            [0,-1,0,0],\n            [0,0,-1,0],\n            [0,0,0,1.],\n        ])\n        poses = poses @ transf\n        poses = poses[:,:3,:4].astype(np.float32)\n        return poses\n\n    posedir = os.path.join(deepvoxels_base, 'pose')\n    poses = dir2poses(posedir)\n    testposes = dir2poses('{}/test/{}/pose'.format(basedir, scene))\n    testposes = testposes[::testskip]\n    valposes = dir2poses('{}/validation/{}/pose'.format(basedir, scene))\n    valposes = valposes[::testskip]\n\n    imgfiles = [f for f in sorted(os.listdir(os.path.join(deepvoxels_base, 'rgb'))) if f.endswith('png')]\n    imgs = np.stack([imageio.imread(os.path.join(deepvoxels_base, 'rgb', f))/255. for f in imgfiles], 0).astype(np.float32)\n\n    testimgd = '{}/test/{}/rgb'.format(basedir, scene)\n    imgfiles = [f for f in sorted(os.listdir(testimgd)) if f.endswith('png')]\n    testimgs = np.stack([imageio.imread(os.path.join(testimgd, f))/255. for f in imgfiles[::testskip]], 0).astype(np.float32)\n\n    valimgd = '{}/validation/{}/rgb'.format(basedir, scene)\n    imgfiles = [f for f in sorted(os.listdir(valimgd)) if f.endswith('png')]\n    valimgs = np.stack([imageio.imread(os.path.join(valimgd, f))/255. for f in imgfiles[::testskip]], 0).astype(np.float32)\n\n    all_imgs = [imgs, valimgs, testimgs]\n    counts = [0] + [x.shape[0] for x in all_imgs]\n    counts = np.cumsum(counts)\n    i_split = [np.arange(counts[i], counts[i+1]) for i in range(3)]\n\n    imgs = np.concatenate(all_imgs, 0)\n    poses = np.concatenate([poses, valposes, testposes], 0)\n\n    render_poses = testposes\n\n    print(poses.shape, imgs.shape)\n\n    return imgs, poses, render_poses, [H, W, focal], i_split\n\n\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/load_free.py",
    "content": "import numpy as np\nimport os, imageio\nimport torch\nimport scipy\nimport cv2\nimport pdb\nfrom shutil import copy\nfrom subprocess import check_output\nfrom FourierGrid.trajectory_generators.interp_traj import *\n\n    \n########## Slightly modified version of LLFF data loading code\n##########  see https://github.com/Fyusion/LLFF for original\ndef imread(f):\n    if f.endswith('png'):\n        return imageio.imread(f, ignoregamma=True)\n    else:\n        return imageio.imread(f)\n\ndef depthread(path):\n    with open(path, \"rb\") as fid:\n        width, height, channels = np.genfromtxt(fid, delimiter=\"&\", max_rows=1,\n                                                usecols=(0, 1, 2), dtype=int)\n        fid.seek(0)\n        num_delimiter = 0\n        byte = fid.read(1)\n        while True:\n            if byte == b\"&\":\n                num_delimiter += 1\n                if num_delimiter >= 3:\n                    break\n            byte = fid.read(1)\n        array = np.fromfile(fid, np.float32)\n    array = array.reshape((width, height, channels), order=\"F\")\n    return np.transpose(array, (1, 0, 2)).squeeze()\n\n\ndef _minify(basedir, factors=[], resolutions=[]):\n    needtoload = False\n    for r in factors:\n        imgdir = os.path.join(basedir, 'images_{}'.format(r))\n        if not os.path.exists(imgdir):\n            needtoload = True\n    for r in resolutions:\n        imgdir = os.path.join(basedir, 'images_{}x{}'.format(r[1], r[0]))\n        if not os.path.exists(imgdir):\n            needtoload = True\n    if not needtoload:\n        return\n\n    imgdir = os.path.join(basedir, 'images')\n    imgs = [os.path.join(imgdir, f) for f in sorted(os.listdir(imgdir))]\n    imgs = [f for f in imgs if any([f.endswith(ex) for ex in ['JPG', 'jpg', 'jpeg', 'png', 'jpeg', 'PNG']])]\n    imgdir_orig = imgdir\n\n    wd = os.getcwd()\n\n    for r in factors + resolutions:\n        if isinstance(r, int):\n            name = 
'images_{}'.format(r)\n            resizearg = '{}%'.format(100./r)\n        else:\n            name = 'images_{}x{}'.format(r[1], r[0])\n            resizearg = '{}x{}'.format(r[1], r[0])\n        imgdir = os.path.join(basedir, name)\n        if os.path.exists(imgdir):\n            print(\"Image folder exists, do not call the resize function.\")\n            continue\n        os.makedirs(imgdir, exist_ok=True)\n        print('Minifying', r, basedir)\n        # check_output('cp {}/* {}'.format(imgdir_orig, imgdir), shell=True)\n        ext = imgs[0].split('.')[-1]\n        for idx, one_img_p in enumerate(imgs):\n            one_img = cv2.imread(one_img_p)\n            ori_h, ori_w = one_img.shape[0], one_img.shape[1]\n            if isinstance(r, int):\n                target_h, target_w = int(ori_h / r), int(ori_w / r)\n            else:\n                target_h, target_w = r[0], r[1]\n            resized = cv2.resize(one_img, (target_w, target_h), interpolation = cv2.INTER_AREA)\n            target_img_p = one_img_p.replace(imgdir_orig, imgdir)\n            cv2.imwrite(target_img_p, resized)\n        # args = ' '.join(['convert mogrify', '-resize', resizearg, '-format', 'png', '*.{}'.format(ext)])\n        # print(args)\n        # os.chdir(imgdir)\n        # check_output(args, shell=True)\n        # os.chdir(wd)\n\n        # if ext != 'png':\n        #     check_output('rm {}/*.{}'.format(imgdir, ext), shell=True)\n        #     print('Removed duplicates')\n        print('Done')\n\n\ndef normalize_scene(poses, n_images, bounds):\n    # TODO: vdalidate the effectiveness of this function\n    cam_pos = poses[:, :, 3].clone()\n    center_ = cam_pos.mean(dim=0, keepdim=False)\n    bias = cam_pos - center_.unsqueeze(0)\n    radius_ = torch.linalg.norm(bias, ord=2, dim=-1, keepdim=False).max().item()\n    cam_pos = (cam_pos - center_.unsqueeze(0)) / radius_\n    poses[:, :, 3] = cam_pos\n    bounds = (bounds / radius_)\n    return poses, bounds, center_, 
radius_\n\n\ndef load_images_from_disk(basedir, factor, height, width):\n    img0 = [os.path.join(basedir, 'images', f) for f in sorted(os.listdir(os.path.join(basedir, 'images'))) \\\n            if f.endswith('JPG') or f.endswith('jpg') or f.endswith('jpeg') or f.endswith('png')][0]\n    sh = imageio.imread(img0).shape\n    sfx = ''\n    if height is not None and width is not None:\n        _minify(basedir, resolutions=[[height, width]])\n        sfx = '_{}x{}'.format(width, height)\n    elif factor is not None and factor != 1:\n        sfx = '_{}'.format(factor)\n        _minify(basedir, factors=[factor])\n        factor = factor\n    elif height is not None:\n        factor = sh[0] / float(height)\n        width = int(sh[1] / factor)\n        _minify(basedir, resolutions=[[height, width]])\n        sfx = '_{}x{}'.format(width, height)\n    elif width is not None:\n        factor = sh[1] / float(width)\n        height = int(sh[0] / factor)\n        _minify(basedir, resolutions=[[height, width]])\n        sfx = '_{}x{}'.format(width, height)\n    else:\n        factor = 1\n    imgdir = os.path.join(basedir, 'images' + sfx)\n    print(f'Loading images from {imgdir}')\n    if not os.path.exists(imgdir):\n        print(imgdir, 'does not exist, returning' )\n        import sys; sys.exit()\n        return\n    imgfiles = [os.path.join(imgdir, f) for f in sorted(os.listdir(imgdir)) if f.endswith('JPG') or f.endswith('jpg') or f.endswith('png')]\n    if len(imgfiles) < 3:\n        print('Too few images...')\n        import sys; sys.exit()\n\n    imgs = [imread(f)[...,:3]/255. 
for f in imgfiles]\n    imgs = np.stack(imgs, 0)\n    return imgs, factor\n\ndef normalize(x):\n    return x / np.linalg.norm(x)\n\ndef ptstocam(pts, c2w):\n    tt = np.matmul(c2w[:3,:3].T, (pts-c2w[:3,3])[...,np.newaxis])[...,0]\n    return tt\n\ndef viewmatrix(z, up, pos):\n    vec2 = normalize(z)\n    vec1_avg = up\n    vec0 = normalize(np.cross(vec1_avg, vec2))\n    vec1 = normalize(np.cross(vec2, vec0))\n    m = np.stack([vec0, vec1, vec2, pos], 1)\n    return m\n\n\ndef render_path_spiral(c2w, up, rads, focal, zdelta, zrate, rots, N):\n    render_poses = []\n    rads = np.array(list(rads) + [1.])\n    hwf = c2w[:,4:5]\n\n    for theta in np.linspace(0., 2. * np.pi * rots, N+1)[:-1]:\n        c = np.dot(c2w[:3,:4], np.array([np.cos(theta), -np.sin(theta), -np.sin(theta*zrate)*zdelta, 1.]) * rads) \n        z = normalize(c - np.dot(c2w[:3,:4], np.array([0,0,-focal, 1.])))\n        render_poses.append(np.concatenate([viewmatrix(z, up, c), hwf], 1))\n    return render_poses\n\n\ndef poses_avg(poses):\n    hwf = poses[0, :3, -1:]\n    center = poses[:, :3, 3].mean(0)\n    vec2 = normalize(poses[:, :3, 2].sum(0))\n    up = poses[:, :3, 1].sum(0)\n    c2w = np.concatenate([viewmatrix(vec2, up, center), hwf], 1)\n    return c2w\n\ndef recenter_poses(poses, render_poses):\n    poses_ = poses + 0\n    bottom = np.reshape([0,0,0,1.], [1,4])\n    c2w = poses_avg(poses)\n    c2w = np.concatenate([c2w[:3,:4], bottom], -2)\n    bottom = np.tile(np.reshape(bottom, [1,1,4]), [poses.shape[0],1,1])\n    poses = np.concatenate([poses[:,:3,:4], bottom], -2)\n\n    poses = np.linalg.inv(c2w) @ poses\n    poses_[:,:3,:4] = poses[:,:3,:4]\n    poses = poses_\n    \n    # apply c2w to render poses\n    render_poses_ = render_poses + 0\n    bottom = np.reshape([0,0,0,1.], [1,4])\n    bottom = np.tile(np.reshape(bottom, [1,1,4]), [render_poses.shape[0],1,1])\n    render_poses = np.concatenate([render_poses[:,:3,:4], bottom], -2)\n    render_poses = np.linalg.inv(c2w) @ render_poses\n   
 render_poses_[:, :3, :4] = render_poses[:, :3, :4]\n    render_poses = render_poses_\n    return poses, render_poses\n\n\ndef rerotate_poses(poses):\n    poses = np.copy(poses)\n    centroid = poses[:,:3,3].mean(0)\n\n    poses[:,:3,3] = poses[:,:3,3] - centroid\n\n    # Find the minimum pca vector with minimum eigen value\n    x = poses[:,:,3]\n    mu = x.mean(0)\n    cov = np.cov((x-mu).T)\n    ev , eig = np.linalg.eig(cov)\n    cams_up = eig[:,np.argmin(ev)]\n    if cams_up[1] < 0:\n        cams_up = -cams_up\n\n    # Find rotation matrix that align cams_up with [0,1,0]\n    R = scipy.spatial.transform.Rotation.align_vectors(\n            [[0,1,0]], cams_up[None])[0].as_matrix()\n\n    # Apply rotation and add back the centroid position\n    poses[:,:3,:3] = R @ poses[:,:3,:3]\n    poses[:,:3,[3]] = R @ poses[:,:3,[3]]\n    poses[:,:3,3] = poses[:,:3,3] + centroid\n    return poses\n\n#####################\n\n\ndef spherify_poses(poses, bds, depths):\n\n    p34_to_44 = lambda p : np.concatenate([p, np.tile(np.reshape(np.eye(4)[-1,:], [1,1,4]), [p.shape[0], 1,1])], 1)\n\n    rays_d = poses[:,:3,2:3]\n    rays_o = poses[:,:3,3:4]\n\n    def min_line_dist(rays_o, rays_d):\n        A_i = np.eye(3) - rays_d * np.transpose(rays_d, [0,2,1])\n        b_i = -A_i @ rays_o\n        pt_mindist = np.squeeze(-np.linalg.inv((np.transpose(A_i, [0,2,1]) @ A_i).mean(0)) @ (b_i).mean(0))\n        return pt_mindist\n\n    pt_mindist = min_line_dist(rays_o, rays_d)\n\n    center = pt_mindist\n    up = (poses[:,:3,3] - center).mean(0)\n\n    vec0 = normalize(up)\n    vec1 = normalize(np.cross([.1,.2,.3], vec0))\n    vec2 = normalize(np.cross(vec0, vec1))\n    pos = center\n    c2w = np.stack([vec1, vec2, vec0, pos], 1)\n\n    poses_reset = np.linalg.inv(p34_to_44(c2w[None])) @ p34_to_44(poses[:,:3,:4])\n\n    radius = np.sqrt(np.mean(np.sum(np.square(poses_reset[:,:3,3]), -1)))\n\n    sc = 1./radius\n    poses_reset[:,:3,3] *= sc\n    bds *= sc\n    radius *= sc\n    depths *= 
sc\n\n    poses_reset = np.concatenate([poses_reset[:,:3,:4], np.broadcast_to(poses[0,:3,-1:], poses_reset[:,:3,-1:].shape)], -1)\n\n    return poses_reset, radius, bds, depths\n\n\ndef load_free_data(args, basedir, factor=8, width=None, height=None,\n                   recenter=True, rerotate=True,\n                   bd_factor=.75, spherify=False, path_zflat=False, load_depths=False,\n                   movie_render_kwargs={}, training_ids=None, generate_render_poses=True, n_out_poses=200, sc=1.0):\n    # 1. load and parse poses, images, and bounds\n    meta_pose = torch.tensor(np.load(os.path.join(basedir, 'cams_meta.npy')))\n    n_images = meta_pose.shape[0]\n    cam_data = meta_pose.reshape(n_images, 27)\n    poses = cam_data[:, 0:12].reshape(-1, 3, 4)\n    intri = cam_data[:, 12:21].reshape(-1, 3, 3)\n    poses = poses.cpu().numpy()\n    intri = intri.cpu().numpy()\n    \n    # 2. Rotation matrix correct, this has been done in colmap2standard\n    # poses = np.concatenate([poses[:, 1:2, :], -poses[:, 0:1, :], poses[:, 2:, :]], 1)\n    # poses = np.moveaxis(poses, -1, 0).astype(np.float32)\n    imgs, factor = load_images_from_disk(basedir, factor, height, width)\n    intri[..., :2, :3] /= factor\n    dist_params = cam_data[:, 21:25].reshape(-1, 4)\n    bounds = cam_data[:, 25:27].reshape(-1, 2)\n\n    # 2. normalize scenes\n    # poses, bounds, center, radius = normalize_scene(poses, n_images, bounds)\n    assert not load_depths, \"do not support loading depths\"\n    assert len(imgs.shape) == 4, \"image shape is not correct!\"\n    assert intri[0][0][0] == intri[1][0][0] and intri[1][0][0] == intri[2][0][0], \"focal length are varying!\"\n\n    # filter by training_ids\n    if training_ids is not None:\n        poses = np.array([poses[id] for id in training_ids])\n        intri = np.array([intri[id] for id in training_ids])\n        imgs = np.array([imgs[id] for id in training_ids])\n        bounds = bounds[training_ids]\n\n    # 3. 
load render camera poses or generate render poses on the fly\n    if generate_render_poses:\n        key_poses_indexs = np.arange(0, poses.shape[0], 5)\n        key_poses = poses[key_poses_indexs]\n        render_poses_ = inter_poses(key_poses, n_out_poses)\n    else:\n        poses_render_path = os.path.join(basedir, \"poses_render.npy\")\n        arr = np.load(poses_render_path)\n        cam_data = torch.from_numpy(arr.astype(np.float64)).to(torch.float32).cuda()\n        n_render_poses = arr.shape[0]\n        cam_data = cam_data.reshape((-1, 3, 4))\n        cam_data = cam_data[:n_render_poses, :, :]\n        render_poses_ = cam_data.clone()  # [n, 3, 4]\n        # render_poses_[:, :3, 3] = (render_poses_[:, :3, 3] - center.unsqueeze(0)) / radius  #commented out for debugging\n        render_poses_ = render_poses_.cpu().numpy()\n    hwf = np.array([[imgs.shape[1], imgs.shape[2], intri[0][0][0]]for _ in range(render_poses_.shape[0])])\n    render_poses_ = np.concatenate((render_poses_, hwf.reshape((render_poses_.shape[0], 3, 1))), axis=2)\n    \n    # 4. relax bounds\n    # bounds_factor = [0.5, 4.0]\n    bounds = torch.stack([bounds[:, 0], bounds[:, 1]], dim=-1)\n    bounds.clamp_(1e-2, 1e9)\n    bounds = bounds.cpu().numpy()\n    near = bounds.min().item()\n    # sc = 1 / (near * bd_factor)  # 0.12 by default\n    poses[:,:3,3] *= sc\n    render_poses_[:, :3, 3] *= sc\n    hwf = np.array([[imgs.shape[1], imgs.shape[2], intri[0][0][0]]for _ in range(imgs.shape[0])])\n    poses = np.concatenate((poses, hwf.reshape((imgs.shape[0], 3, 1))), axis=2)\n    poses, render_poses_ = recenter_poses(poses, render_poses_)\n    \n    # 5. get test ID, this part is written by DVGO.\n    if args.llffhold > 0:\n        print('Auto LLFF holdout,', args.llffhold)\n        i_test = np.arange(imgs.shape[0])[::args.llffhold]\n    else:\n        print(\"LLFF hold is not used!\")\n        i_test = [0, 1, 2]\n    return imgs, 0, intri, poses, bounds, render_poses_, i_test\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/load_llff.py",
    "content": "import numpy as np\nimport os, imageio\nimport torch\nimport scipy\nimport cv2\nimport pdb\nfrom shutil import copy\nfrom subprocess import check_output\n    \n########## Slightly modified version of LLFF data loading code\n##########  see https://github.com/Fyusion/LLFF for original\ndef imread(f):\n    if f.endswith('png'):\n        return imageio.imread(f, ignoregamma=True)\n    else:\n        return imageio.imread(f)\n\ndef depthread(path):\n    with open(path, \"rb\") as fid:\n        width, height, channels = np.genfromtxt(fid, delimiter=\"&\", max_rows=1,\n                                                usecols=(0, 1, 2), dtype=int)\n        fid.seek(0)\n        num_delimiter = 0\n        byte = fid.read(1)\n        while True:\n            if byte == b\"&\":\n                num_delimiter += 1\n                if num_delimiter >= 3:\n                    break\n            byte = fid.read(1)\n        array = np.fromfile(fid, np.float32)\n    array = array.reshape((width, height, channels), order=\"F\")\n    return np.transpose(array, (1, 0, 2)).squeeze()\n\n\ndef _minify(basedir, factors=[], resolutions=[]):\n    needtoload = False\n    for r in factors:\n        imgdir = os.path.join(basedir, 'images_{}'.format(r))\n        if not os.path.exists(imgdir):\n            needtoload = True\n    for r in resolutions:\n        imgdir = os.path.join(basedir, 'images_{}x{}'.format(r[1], r[0]))\n        if not os.path.exists(imgdir):\n            needtoload = True\n    if not needtoload:\n        return\n\n    imgdir = os.path.join(basedir, 'images')\n    imgs = [os.path.join(imgdir, f) for f in sorted(os.listdir(imgdir))]\n    imgs = [f for f in imgs if any([f.endswith(ex) for ex in ['JPG', 'jpg', 'jpeg', 'png', 'jpeg', 'PNG']])]\n    imgdir_orig = imgdir\n\n    wd = os.getcwd()\n\n    for r in factors + resolutions:\n        if isinstance(r, int):\n            name = 'images_{}'.format(r)\n            resizearg = '{}%'.format(100./r)\n        
else:\n            name = 'images_{}x{}'.format(r[1], r[0])\n            resizearg = '{}x{}'.format(r[1], r[0])\n        imgdir = os.path.join(basedir, name)\n        if os.path.exists(imgdir):\n            print(\"Image folder exists, do not call the resize function.\")\n            continue\n        os.makedirs(imgdir, exist_ok=True)\n        print('Minifying', r, basedir)\n        # check_output('cp {}/* {}'.format(imgdir_orig, imgdir), shell=True)\n        ext = imgs[0].split('.')[-1]\n        for idx, one_img_p in enumerate(imgs):\n            one_img = cv2.imread(one_img_p)\n            ori_h, ori_w = one_img.shape[0], one_img.shape[1]\n            if isinstance(r, int):\n                target_h, target_w = int(ori_h / r), int(ori_w / r)\n            else:\n                target_h, target_w = r[0], r[1]\n            resized = cv2.resize(one_img, (target_w, target_h), interpolation = cv2.INTER_AREA)\n            target_img_p = one_img_p.replace(imgdir_orig, imgdir)\n            cv2.imwrite(target_img_p, resized)\n        # args = ' '.join(['convert mogrify', '-resize', resizearg, '-format', 'png', '*.{}'.format(ext)])\n        # print(args)\n        # os.chdir(imgdir)\n        # check_output(args, shell=True)\n        # os.chdir(wd)\n\n        # if ext != 'png':\n        #     check_output('rm {}/*.{}'.format(imgdir, ext), shell=True)\n        #     print('Removed duplicates')\n        print('Done')\n\n\ndef _load_data(basedir, factor=None, width=None, height=None, load_imgs=True, load_depths=False):\n\n    poses_arr = np.load(os.path.join(basedir, 'poses_bounds.npy'))\n    if poses_arr.shape[1] == 17:\n        poses = poses_arr[:, :-2].reshape([-1, 3, 5]).transpose([1,2,0])\n    elif poses_arr.shape[1] == 14:\n        poses = poses_arr[:, :-2].reshape([-1, 3, 4]).transpose([1,2,0])\n    else:\n        raise NotImplementedError\n    bds = poses_arr[:, -2:].transpose([1,0])\n    img0 = [os.path.join(basedir, 'images', f) for f in 
sorted(os.listdir(os.path.join(basedir, 'images'))) \\\n            if f.endswith('JPG') or f.endswith('jpg') or f.endswith('jpeg') or f.endswith('png')][0]\n    sh = imageio.imread(img0).shape\n\n    sfx = ''\n\n    if height is not None and width is not None:\n        _minify(basedir, resolutions=[[height, width]])\n        sfx = '_{}x{}'.format(width, height)\n    elif factor is not None and factor != 1:\n        sfx = '_{}'.format(factor)\n        _minify(basedir, factors=[factor])\n        factor = factor\n    elif height is not None:\n        factor = sh[0] / float(height)\n        width = int(sh[1] / factor)\n        _minify(basedir, resolutions=[[height, width]])\n        sfx = '_{}x{}'.format(width, height)\n    elif width is not None:\n        factor = sh[1] / float(width)\n        height = int(sh[0] / factor)\n        _minify(basedir, resolutions=[[height, width]])\n        sfx = '_{}x{}'.format(width, height)\n    else:\n        factor = 1\n    imgdir = os.path.join(basedir, 'images' + sfx)\n    print(f'Loading images from {imgdir}')\n    if not os.path.exists(imgdir):\n        print(imgdir, 'does not exist, returning' )\n        return\n\n    imgfiles = [os.path.join(imgdir, f) for f in sorted(os.listdir(imgdir)) if f.endswith('JPG') or f.endswith('jpg') or f.endswith('png')]\n    if poses.shape[-1] != len(imgfiles):\n        print()\n        print( 'Mismatch between imgs {} and poses {} !!!!'.format(len(imgfiles), poses.shape[-1]) )\n        names = set(name[:-4] for name in np.load(os.path.join(basedir, 'poses_names.npy')))\n        assert len(names) == poses.shape[-1]\n        print('Below failed files are skip due to SfM failure:')\n        new_imgfiles = []\n        for i in imgfiles:\n            fname = os.path.split(i)[1][:-4]\n            if fname in names:\n                new_imgfiles.append(i)\n            else:\n                print('==>', i)\n        imgfiles = new_imgfiles\n\n    if len(imgfiles) < 3:\n        print('Too few 
images...')\n        import sys; sys.exit()\n\n    sh = imageio.imread(imgfiles[0]).shape\n    if poses.shape[1] == 4:\n        poses = np.concatenate([poses, np.zeros_like(poses[:,[0]])], 1)\n        poses[2, 4, :] = np.load(os.path.join(basedir, 'hwf_cxcy.npy'))[2]\n    poses[:2, 4, :] = np.array(sh[:2]).reshape([2, 1])\n    poses[2, 4, :] = poses[2, 4, :] * 1./factor\n\n    if not load_imgs:\n        return poses, bds\n    imgs = imgs = [imread(f)[...,:3]/255. for f in imgfiles]\n    imgs = np.stack(imgs, -1)\n\n    print('Loaded image data', imgs.shape, poses[:,-1,0])\n    if not load_depths:\n        return poses, bds, imgs\n\n    depthdir = os.path.join(basedir, 'stereo', 'depth_maps')\n    assert os.path.exists(depthdir), f'Dir not found: {depthdir}'\n\n    depthfiles = [os.path.join(depthdir, f) for f in sorted(os.listdir(depthdir)) if f.endswith('.geometric.bin')]\n    assert poses.shape[-1] == len(depthfiles), 'Mismatch between imgs {} and poses {} !!!!'.format(len(depthfiles), poses.shape[-1])\n\n    depths = [depthread(f) for f in depthfiles]\n    depths = np.stack(depths, -1)\n    print('Loaded depth data', depths.shape)\n    return poses, bds, imgs, depths\n\n\ndef normalize(x):\n    return x / np.linalg.norm(x)\n\ndef viewmatrix(z, up, pos):\n    vec2 = normalize(z)\n    vec1_avg = up\n    vec0 = normalize(np.cross(vec1_avg, vec2))\n    vec1 = normalize(np.cross(vec2, vec0))\n    m = np.stack([vec0, vec1, vec2, pos], 1)\n    return m\n\ndef ptstocam(pts, c2w):\n    tt = np.matmul(c2w[:3,:3].T, (pts-c2w[:3,3])[...,np.newaxis])[...,0]\n    return tt\n\ndef poses_avg(poses):\n    assert poses.shape[-1] == 5, \"HWF must be given in the pose matrix!\"\n    hwf = poses[0, :3, -1:]\n    center = poses[:, :3, 3].mean(0)\n    vec2 = normalize(poses[:, :3, 2].sum(0))\n    up = poses[:, :3, 1].sum(0)\n    c2w = np.concatenate([viewmatrix(vec2, up, center), hwf], 1)\n    return c2w\n\n\ndef render_path_spiral(c2w, up, rads, focal, zdelta, zrate, rots, N):\n    
render_poses = []\n    rads = np.array(list(rads) + [1.])\n    hwf = c2w[:,4:5]\n\n    for theta in np.linspace(0., 2. * np.pi * rots, N+1)[:-1]:\n        c = np.dot(c2w[:3,:4], np.array([np.cos(theta), -np.sin(theta), -np.sin(theta*zrate)*zdelta, 1.]) * rads) \n        z = normalize(c - np.dot(c2w[:3,:4], np.array([0,0,-focal, 1.])))\n        render_poses.append(np.concatenate([viewmatrix(z, up, c), hwf], 1))\n    return render_poses\n\n\ndef recenter_poses(poses):\n    poses_ = poses+0\n    bottom = np.reshape([0,0,0,1.], [1,4])\n    c2w = poses_avg(poses)\n    c2w = np.concatenate([c2w[:3,:4], bottom], -2)\n    bottom = np.tile(np.reshape(bottom, [1,1,4]), [poses.shape[0],1,1])\n    poses = np.concatenate([poses[:,:3,:4], bottom], -2)\n\n    poses = np.linalg.inv(c2w) @ poses\n    poses_[:,:3,:4] = poses[:,:3,:4]\n    poses = poses_\n    return poses\n\n\ndef rerotate_poses(poses):\n    poses = np.copy(poses)\n    centroid = poses[:,:3,3].mean(0)\n\n    poses[:,:3,3] = poses[:,:3,3] - centroid\n\n    # Find the minimum pca vector with minimum eigen value\n    x = poses[:,:,3]\n    mu = x.mean(0)\n    cov = np.cov((x-mu).T)\n    ev , eig = np.linalg.eig(cov)\n    cams_up = eig[:,np.argmin(ev)]\n    if cams_up[1] < 0:\n        cams_up = -cams_up\n\n    # Find rotation matrix that align cams_up with [0,1,0]\n    R = scipy.spatial.transform.Rotation.align_vectors(\n            [[0,1,0]], cams_up[None])[0].as_matrix()\n\n    # Apply rotation and add back the centroid position\n    poses[:,:3,:3] = R @ poses[:,:3,:3]\n    poses[:,:3,[3]] = R @ poses[:,:3,[3]]\n    poses[:,:3,3] = poses[:,:3,3] + centroid\n    return poses\n\n#####################\n\n\ndef spherify_poses(poses, bds, depths):\n\n    p34_to_44 = lambda p : np.concatenate([p, np.tile(np.reshape(np.eye(4)[-1,:], [1,1,4]), [p.shape[0], 1,1])], 1)\n\n    rays_d = poses[:,:3,2:3]\n    rays_o = poses[:,:3,3:4]\n\n    def min_line_dist(rays_o, rays_d):\n        A_i = np.eye(3) - rays_d * np.transpose(rays_d, 
[0,2,1])\n        b_i = -A_i @ rays_o\n        pt_mindist = np.squeeze(-np.linalg.inv((np.transpose(A_i, [0,2,1]) @ A_i).mean(0)) @ (b_i).mean(0))\n        return pt_mindist\n\n    pt_mindist = min_line_dist(rays_o, rays_d)\n\n    center = pt_mindist\n    up = (poses[:,:3,3] - center).mean(0)\n\n    vec0 = normalize(up)\n    vec1 = normalize(np.cross([.1,.2,.3], vec0))\n    vec2 = normalize(np.cross(vec0, vec1))\n    pos = center\n    c2w = np.stack([vec1, vec2, vec0, pos], 1)\n\n    poses_reset = np.linalg.inv(p34_to_44(c2w[None])) @ p34_to_44(poses[:,:3,:4])\n\n    radius = np.sqrt(np.mean(np.sum(np.square(poses_reset[:,:3,3]), -1)))\n\n    sc = 1./radius\n    poses_reset[:,:3,3] *= sc\n    bds *= sc\n    radius *= sc\n    depths *= sc\n\n    poses_reset = np.concatenate([poses_reset[:,:3,:4], np.broadcast_to(poses[0,:3,-1:], poses_reset[:,:3,-1:].shape)], -1)\n\n    return poses_reset, radius, bds, depths\n\n\ndef load_llff_data(basedir, factor=8, width=None, height=None,\n                   recenter=True, rerotate=True,\n                   bd_factor=.75, spherify=False, path_zflat=False, load_depths=False,\n                   movie_render_kwargs={}):\n    poses, bds, imgs, *depths = _load_data(basedir, factor=factor, width=width, height=height,\n                                           load_depths=load_depths) # factor=8 downsamples original imgs by 8x\n    # poses: [3, 5, N], bds: [2, N], imgs: [H, W, 3, N], depths: []\n    print('Loaded', basedir, bds.min(), bds.max())\n    if load_depths:\n        depths = depths[0]\n    else:\n        depths = 0\n    # Correct rotation matrix ordering and move variable dim to axis 0\n    poses = np.concatenate([poses[:, 1:2, :], -poses[:, 0:1, :], poses[:, 2:, :]], 1)\n    poses = np.moveaxis(poses, -1, 0).astype(np.float32)\n    imgs = np.moveaxis(imgs, -1, 0).astype(np.float32)\n    images = imgs\n    bds = np.moveaxis(bds, -1, 0).astype(np.float32)\n    # Rescale if bd_factor is provided\n    if bds.min() < 0 and 
bd_factor is not None:\n        print('Found negative z values from SfM sparse points!?')\n        print('Please try bd_factor=None. This program is terminating now!')\n        import sys; sys.exit()\n    sc = 1. if bd_factor is None else 1./(bds.min() * bd_factor)\n    poses[:,:3,3] *= sc\n    bds *= sc\n    depths *= sc\n    if recenter:\n        poses = recenter_poses(poses)\n    if spherify:\n        poses, radius, bds, depths = spherify_poses(poses, bds, depths)\n        if rerotate:\n            poses = rerotate_poses(poses)\n\n        ### generate spiral poses for rendering fly-through movie\n        centroid = poses[:,:3,3].mean(0)\n        radcircle = movie_render_kwargs.get('scale_r', 1) * np.linalg.norm(poses[:,:3,3] - centroid, axis=-1).mean()\n        centroid[0] += movie_render_kwargs.get('shift_x', 0)\n        centroid[1] += movie_render_kwargs.get('shift_y', 0)\n        centroid[2] += movie_render_kwargs.get('shift_z', 0)\n        new_up_rad = movie_render_kwargs.get('pitch_deg', 0) * np.pi / 180\n        target_y = radcircle * np.tan(new_up_rad)\n\n        render_poses = []\n\n        for th in np.linspace(0., 2.*np.pi, 200):\n            camorigin = np.array([radcircle * np.cos(th), 0, radcircle * np.sin(th)])\n            if movie_render_kwargs.get('flip_up', False):\n                up = np.array([0,1.,0])\n            else:\n                up = np.array([0,-1.,0])\n            vec2 = normalize(camorigin)\n            vec0 = normalize(np.cross(vec2, up))\n            vec1 = normalize(np.cross(vec2, vec0))\n            pos = camorigin + centroid\n            # rotate to align with new pitch rotation\n            lookat = -vec2\n            lookat[1] = target_y\n            lookat = normalize(lookat)\n            vec2 = -lookat\n            vec1 = normalize(np.cross(vec2, vec0))\n\n            p = np.stack([vec0, vec1, vec2, pos], 1)\n\n            render_poses.append(p)\n\n        render_poses = np.stack(render_poses, 0)\n        render_poses = 
np.concatenate([render_poses, np.broadcast_to(poses[0,:3,-1:], render_poses[:,:3,-1:].shape)], -1)\n\n    else:\n\n        c2w = poses_avg(poses)\n        print('recentered', c2w.shape)\n        print(c2w[:3,:4])\n\n        ## Get spiral\n        # Get average pose\n        up = normalize(poses[:, :3, 1].sum(0))\n\n        # Find a reasonable \"focus depth\" for this dataset\n        close_depth, inf_depth = bds.min()*.9, bds.max()*5.\n        dt = .75\n        mean_dz = 1./(((1.-dt)/close_depth + dt/inf_depth))\n        focal = mean_dz * movie_render_kwargs.get('scale_f', 1)\n\n        # Get radii for spiral path\n        zdelta = movie_render_kwargs.get('zdelta', 0.5)\n        zrate = movie_render_kwargs.get('zrate', 1.0)\n        tt = poses[:,:3,3] # ptstocam(poses[:3,3,:].T, c2w).T\n        rads = np.percentile(np.abs(tt), 90, 0) * movie_render_kwargs.get('scale_r', 1)\n        c2w_path = c2w\n        N_views = 120\n        N_rots = movie_render_kwargs.get('N_rots', 1)\n        if path_zflat:\n#             zloc = np.percentile(tt, 10, 0)[2]\n            zloc = -close_depth * .1\n            c2w_path[:3,3] = c2w_path[:3,3] + zloc * c2w_path[:3,2]\n            rads[2] = 0.\n            N_rots = 1\n            N_views/=2\n\n        # Generate poses for spiral path\n        render_poses = render_path_spiral(c2w_path, up, rads, focal, zdelta, zrate=zrate, rots=N_rots, N=N_views)\n\n    render_poses = torch.Tensor(render_poses)\n\n    c2w = poses_avg(poses)\n    print('Data:')\n    print(poses.shape, images.shape, bds.shape)\n    # this part is written by DVGO.\n    dists = np.sum(np.square(c2w[:3,3] - poses[:,:3,3]), -1)\n    i_test = np.argmin(dists)\n    print('HOLDOUT view is', i_test) \n\n    images = images.astype(np.float32)\n    poses = poses.astype(np.float32)\n    return images, depths, poses, bds, render_poses, i_test\n\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/load_nerfpp.py",
    "content": "'''\nModify from\nhttps://github.com/Kai-46/nerfplusplus/blob/master/data_loader_split.py\n'''\nimport os\nimport pdb\nimport glob\nimport scipy\nimport imageio\nimport numpy as np\nimport torch\n\n########################################################################################################################\n# camera coordinate system: x-->right, y-->down, z-->scene (opencv/colmap convention)\n# poses is camera-to-world\n########################################################################################################################\ndef find_files(dir, exts):\n    if os.path.isdir(dir):\n        files_grabbed = []\n        for ext in exts:\n            files_grabbed.extend(glob.glob(os.path.join(dir, ext)))\n        if len(files_grabbed) > 0:\n            files_grabbed = sorted(files_grabbed)\n        return files_grabbed\n    else:\n        return []\n\n\ndef load_data_split(split_dir, skip=1, try_load_min_depth=True, only_img_files=False,\n                    training_ids=None):\n\n    def parse_txt(filename):\n        assert os.path.isfile(filename)\n        nums = open(filename).read().split()\n        return np.array([float(x) for x in nums]).reshape([4, 4]).astype(np.float32)\n\n    if only_img_files:\n        img_files = find_files('{}/rgb'.format(split_dir), exts=['*.png', '*.jpg'])\n        return img_files\n\n    # camera parameters files\n    intrinsics_files = find_files('{}/intrinsics'.format(split_dir), exts=['*.txt'])\n    pose_files = find_files('{}/pose'.format(split_dir), exts=['*.txt'])\n\n    intrinsics_files = intrinsics_files[::skip]\n    pose_files = pose_files[::skip]\n    cam_cnt = len(pose_files)\n\n    # img files\n    img_files = find_files('{}/rgb'.format(split_dir), exts=['*.png', '*.jpg'])\n    if len(img_files) > 0:\n        img_files = img_files[::skip]\n        assert(len(img_files) == cam_cnt)\n    else:\n        img_files = [None, ] * cam_cnt\n\n    # mask files\n    mask_files = 
find_files('{}/mask'.format(split_dir), exts=['*.png', '*.jpg'])\n    if len(mask_files) > 0:\n        mask_files = mask_files[::skip]\n        assert(len(mask_files) == cam_cnt)\n    else:\n        mask_files = [None, ] * cam_cnt\n\n    # min depth files\n    mindepth_files = find_files('{}/min_depth'.format(split_dir), exts=['*.png', '*.jpg'])\n    if try_load_min_depth and len(mindepth_files) > 0:\n        mindepth_files = mindepth_files[::skip]\n        assert(len(mindepth_files) == cam_cnt)\n    else:\n        mindepth_files = [None, ] * cam_cnt\n    \n    # sample by training ids\n    if training_ids is not None:\n        final_training_ids = []\n        for idx, ele in enumerate(intrinsics_files):\n            if int(ele.split(\"/\")[-1].replace(\".txt\", \"\")) in training_ids:\n                final_training_ids.append(idx)\n        training_ids = final_training_ids\n        training_ids = [id - 1 for id in training_ids]  # image id start with 1\n        intrinsics_files = [intrinsics_files[id] for id in training_ids]\n        pose_files = [pose_files[id] for id in training_ids]\n        img_files = [img_files[id] for id in training_ids]\n        mask_files = [mask_files[id] for id in training_ids]\n        mindepth_files = [mindepth_files[id] for id in training_ids]\n    return intrinsics_files, pose_files, img_files, mask_files, mindepth_files\n\n\ndef rerotate_poses(poses, render_poses):\n    poses = np.copy(poses)\n    centroid = poses[:,:3,3].mean(0)\n\n    poses[:,:3,3] = poses[:,:3,3] - centroid\n\n    # Find the minimum pca vector with minimum eigen value\n    x = poses[:,:3,3]\n    mu = x.mean(0)\n    cov = np.cov((x-mu).T)\n    ev , eig = np.linalg.eig(cov)\n    cams_up = eig[:,np.argmin(ev)]\n    if cams_up[1] < 0:\n        cams_up = -cams_up\n\n    # Find rotation matrix that align cams_up with [0,1,0]\n    R = scipy.spatial.transform.Rotation.align_vectors(\n            [[0,-1,0]], cams_up[None])[0].as_matrix()\n\n    # Apply rotation and add 
back the centroid position\n    poses[:,:3,:3] = R @ poses[:,:3,:3]\n    poses[:,:3,[3]] = R @ poses[:,:3,[3]]\n    poses[:,:3,3] = poses[:,:3,3] + centroid\n    render_poses = np.copy(render_poses)\n    render_poses[:,:3,3] = render_poses[:,:3,3] - centroid\n    render_poses[:,:3,:3] = R @ render_poses[:,:3,:3]\n    render_poses[:,:3,[3]] = R @ render_poses[:,:3,[3]]\n    render_poses[:,:3,3] = render_poses[:,:3,3] + centroid\n    return poses, render_poses\n\n\ndef load_nerfpp_data(basedir, rerotate=True, training_ids=None):\n    tr_K, tr_c2w, tr_im_path = load_data_split(os.path.join(basedir, 'train'), training_ids=training_ids)[:3]\n    assert len(tr_im_path) > 0, f\"Images are not found in {basedir}\"\n    te_K, te_c2w, te_im_path = load_data_split(os.path.join(basedir, 'test'))[:3]\n    assert len(tr_K) == len(tr_c2w) and len(tr_K) == len(tr_im_path)\n    assert len(te_K) == len(te_c2w) and len(te_K) == len(te_im_path)\n\n    # Determine split id list\n    i_split = [[], []]\n    i = 0\n    for _ in tr_c2w:\n        i_split[0].append(i)\n        i += 1\n    for _ in te_c2w:\n        i_split[1].append(i)\n        i += 1\n\n    # Load camera intrinsics. 
Assume all images share a intrinsic.\n    K_flatten = np.loadtxt(tr_K[0])\n    for path in tr_K:\n        assert np.allclose(np.loadtxt(path), K_flatten)\n    for path in te_K:\n        assert np.allclose(np.loadtxt(path), K_flatten)\n    K = K_flatten.reshape(4,4)[:3,:3]\n\n    # Load camera poses\n    poses = []\n    for path in tr_c2w:\n        poses.append(np.loadtxt(path).reshape(4,4))\n    for path in te_c2w:\n        poses.append(np.loadtxt(path).reshape(4,4))\n\n    # Load images\n    imgs = []\n    for path in tr_im_path:\n        imgs.append(imageio.imread(path) / 255.)\n    for path in te_im_path:\n        imgs.append(imageio.imread(path) / 255.)\n\n    # Bundle all data\n    imgs = np.stack(imgs, 0)\n    poses = np.stack(poses, 0)\n    i_split.append(i_split[1])\n    H, W = imgs.shape[1:3]\n    focal = K[[0,1], [0,1]].mean()\n\n    # Generate movie trajectory\n    render_poses_path = sorted(glob.glob(os.path.join(basedir, 'camera_path', 'pose', '*txt')))\n    render_poses = []\n    for path in render_poses_path:\n        render_poses.append(np.loadtxt(path).reshape(4,4))\n    render_poses = np.array(render_poses)\n    render_K = np.loadtxt(glob.glob(os.path.join(basedir, 'camera_path', 'intrinsics', '*txt'))[0]).reshape(4,4)[:3,:3]\n    render_poses[:,:,0] *= K[0,0] / render_K[0,0]\n    render_poses[:,:,1] *= K[1,1] / render_K[1,1]\n    if rerotate:\n        poses, render_poses = rerotate_poses(poses, render_poses)\n    render_poses = torch.Tensor(render_poses)\n    return imgs, poses, render_poses, [H, W, focal], K, i_split\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/load_nsvf.py",
    "content": "import os\nimport glob\nimport torch\nimport numpy as np\nimport imageio\nimport json\nimport torch.nn.functional as F\nimport cv2\n\n\ntrans_t = lambda t : torch.Tensor([\n    [1,0,0,0],\n    [0,1,0,0],\n    [0,0,1,t],\n    [0,0,0,1]]).float()\n\nrot_phi = lambda phi : torch.Tensor([\n    [1,0,0,0],\n    [0,np.cos(phi),-np.sin(phi),0],\n    [0,np.sin(phi), np.cos(phi),0],\n    [0,0,0,1]]).float()\n\nrot_theta = lambda th : torch.Tensor([\n    [np.cos(th),0,-np.sin(th),0],\n    [0,1,0,0],\n    [np.sin(th),0, np.cos(th),0],\n    [0,0,0,1]]).float()\n\n\ndef pose_spherical(theta, phi, radius):\n    c2w = trans_t(radius)\n    c2w = rot_phi(phi/180.*np.pi) @ c2w\n    c2w = rot_theta(theta/180.*np.pi) @ c2w\n    c2w = torch.Tensor(np.array([[-1,0,0,0],[0,0,1,0],[0,1,0,0],[0,0,0,1]])) @ c2w\n    c2w[:,[1,2]] *= -1\n    return c2w\n\n\ndef load_nsvf_data(basedir):\n    pose_paths = sorted(glob.glob(os.path.join(basedir, 'pose', '*txt')))\n    rgb_paths = sorted(glob.glob(os.path.join(basedir, 'rgb', '*png')))\n\n    all_poses = []\n    all_imgs = []\n    i_split = [[], [], []]\n    for i, (pose_path, rgb_path) in enumerate(zip(pose_paths, rgb_paths)):\n        i_set = int(os.path.split(rgb_path)[-1][0])\n        all_imgs.append((imageio.imread(rgb_path) / 255.).astype(np.float32))\n        all_poses.append(np.loadtxt(pose_path).astype(np.float32))\n        i_split[i_set].append(i)\n\n    imgs = np.stack(all_imgs, 0)\n    poses = np.stack(all_poses, 0)\n\n    H, W = imgs[0].shape[:2]\n    with open(os.path.join(basedir, 'intrinsics.txt')) as f:\n        focal = float(f.readline().split()[0])\n\n    R = np.sqrt((poses[...,:3,3]**2).sum(-1)).mean()\n    render_poses = torch.stack([pose_spherical(angle, -30.0, R) for angle in np.linspace(-180,180,200+1)[:-1]], 0)\n\n    return imgs, poses, render_poses, [H, W, focal], i_split\n\n"
  },
  {
    "path": "FourierGrid/common_data_loaders/load_tankstemple.py",
    "content": "import os\nimport glob\nimport torch\nimport numpy as np\nimport imageio\nimport json\nimport torch.nn.functional as F\nimport cv2\n\n\ndef normalize(x):\n    return x / np.linalg.norm(x)\n\ndef load_tankstemple_data(basedir, movie_render_kwargs={}):\n    pose_paths = sorted(glob.glob(os.path.join(basedir, 'pose', '*txt')))\n    rgb_paths = sorted(glob.glob(os.path.join(basedir, 'rgb', '*png')))\n\n    all_poses = []\n    all_imgs = []\n    i_split = [[], []]\n    for i, (pose_path, rgb_path) in enumerate(zip(pose_paths, rgb_paths)):\n        i_set = int(os.path.split(rgb_path)[-1][0])\n        all_poses.append(np.loadtxt(pose_path).astype(np.float32))\n        all_imgs.append((imageio.imread(rgb_path) / 255.).astype(np.float32))\n        i_split[i_set].append(i)\n\n    imgs = np.stack(all_imgs, 0)\n    poses = np.stack(all_poses, 0)\n    i_split.append(i_split[-1])\n\n    path_intrinsics = os.path.join(basedir, 'intrinsics.txt')\n    H, W = imgs[0].shape[:2]\n    K = np.loadtxt(path_intrinsics)\n    focal = float(K[0,0])\n\n    ### generate spiral poses for rendering fly-through movie\n    centroid = poses[:,:3,3].mean(0)\n    radcircle = movie_render_kwargs.get('scale_r', 1.0) * np.linalg.norm(poses[:,:3,3] - centroid, axis=-1).mean()\n    centroid[0] += movie_render_kwargs.get('shift_x', 0)\n    centroid[1] += movie_render_kwargs.get('shift_y', 0)\n    centroid[2] += movie_render_kwargs.get('shift_z', 0)\n    new_up_rad = movie_render_kwargs.get('pitch_deg', 0) * np.pi / 180\n    target_y = radcircle * np.tan(new_up_rad)\n\n    render_poses = []\n\n    for th in np.linspace(0., 2.*np.pi, 200):\n        camorigin = np.array([radcircle * np.cos(th), 0, radcircle * np.sin(th)])\n        if movie_render_kwargs.get('flip_up_vec', False):\n            up = np.array([0,-1.,0])\n        else:\n            up = np.array([0,1.,0])\n        vec2 = normalize(camorigin)\n        vec0 = normalize(np.cross(vec2, up))\n        vec1 = normalize(np.cross(vec2, 
vec0))\n        pos = camorigin + centroid\n        # rotate to align with new pitch rotation\n        lookat = -vec2\n        lookat[1] = target_y\n        lookat = normalize(lookat)\n        lookat *= -1\n        vec2 = -lookat\n        vec1 = normalize(np.cross(vec2, vec0))\n\n        p = np.stack([vec0, vec1, vec2, pos], 1)\n\n        render_poses.append(p)\n\n    render_poses = np.stack(render_poses, 0)\n    render_poses = np.concatenate([render_poses, np.broadcast_to(poses[0,:3,-1:], render_poses[:,:3,-1:].shape)], -1)\n\n    return imgs, poses, render_poses, [H, W, focal], K, i_split\n\n"
  },
  {
    "path": "FourierGrid/configs/blendedmvs/Character.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Character'\nbasedir = './logs/blended_mvs'\n\ndata = dict(\n    datadir='./data/BlendedMVS/Character/',\n    dataset_type='blendedmvs',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/blendedmvs/Fountain.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Fountain'\nbasedir = './logs/blended_mvs'\n\ndata = dict(\n    datadir='./data/BlendedMVS/Fountain/',\n    dataset_type='blendedmvs',\n    inverse_y=True,\n    white_bkgd=False,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/blendedmvs/Jade.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Jade'\nbasedir = './logs/blended_mvs'\n\ndata = dict(\n    datadir='./data/BlendedMVS/Jade/',\n    dataset_type='blendedmvs',\n    inverse_y=True,\n    white_bkgd=False,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/blendedmvs/Statues.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Statues'\nbasedir = './logs/blended_mvs'\n\ndata = dict(\n    datadir='./data/BlendedMVS/Statues/',\n    dataset_type='blendedmvs',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/co3d/donut_369_40208_78816.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_donut_369_40208_78816'\nbasedir = './logs/co3d'\n\ndata = dict(\n    datadir='./data/co3d/',\n    dataset_type='co3d',\n    annot_path='./data/co3d/donut/frame_annotations.jgz',\n    split_path='./data/co3d/donut/set_lists.json',\n    sequence_name='369_40208_78816',\n    flip_x=True,\n    flip_y=True,\n    inverse_y=True,\n    white_bkgd=False,\n)\n\ncoarse_train = dict(\n    ray_sampler='flatten',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/custom/Madoka.py",
    "content": "_base_ = './default_forward_facing.py'\n\nexpname = 'Madoka'\n\ndata = dict(\n    datadir='data/Madoka/dense',\n    factor=2,\n    movie_render_kwargs={\n        'scale_r': 1.0,\n        'scale_f': 0.8,\n        'zrate': 2.0,\n        'zdelta': 0.5,\n    }\n)\n\nfine_train = dict(\n    N_iters=300000,\n)"
  },
  {
    "path": "FourierGrid/configs/custom/Madoka_long.py",
    "content": "_base_ = './default_forward_facing.py'\n\nexpname = 'Madoka'\n\ndata = dict(\n    datadir='data/Madoka/dense',\n    factor=2,\n    movie_render_kwargs={\n        'scale_r': 1.0,\n        'scale_f': 0.8,\n        'zrate': 2.0,\n        'zdelta': 0.5,\n    }\n)\n\nfine_train = dict(\n    N_iters=300000,\n    pg_scale=[50000, 100000, 150000, 200000],\n)"
  },
  {
    "path": "FourierGrid/configs/custom/Otobai.py",
    "content": "_base_ = './default_forward_facing.py'\n\nexpname = 'Otobai'\n\ndata = dict(\n    datadir='./data/Otobai/dense',\n    factor=2,\n    movie_render_kwargs={\n        'scale_r': 0.8,\n        'scale_f': 10.0,\n        'zrate': 6.0,\n        'zdelta': 0.5,\n    }\n)\n\nfine_train = dict(\n    N_iters=300000,\n    pg_scale=[50000, 100000, 150000, 200000],\n)"
  },
  {
    "path": "FourierGrid/configs/custom/default_forward_facing.py",
    "content": "_base_ = '../default.py'\n\nbasedir = './logs/custom'\n\ndata = dict(\n    dataset_type='llff',\n    load2gpu_on_the_fly=True,\n    ndc=True,\n    llffhold=0,\n    rand_bkgd=True,\n    movie_render_kwargs={\n        'scale_r': 1.0, # circling radius\n        'scale_f': 1.0, # the distance to the looking point of foucs\n        'zdelta': 0.5,  # amplitude of forward motion\n        'zrate': 1.0,   # frequency of forward motion\n        'N_rots': 1,    # number of rotation in 120 frames\n    }\n)\n\ncoarse_train = dict(\n    N_iters=0,\n)\n\nfine_train = dict(\n    N_iters=30000,\n    N_rand=4096,\n    weight_distortion=0.01,\n    pg_scale=[2000,4000,6000,8000],\n    decay_after_scale=0.1,\n    ray_sampler='flatten',\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-5,\n    weight_tv_k0=1e-6,\n)\n\n_mpi_depth = 256\n_stepsize = 1.0\n\nfine_model_and_render = dict(\n    num_voxels=384*384*_mpi_depth,\n    mpi_depth=_mpi_depth,\n    stepsize=_stepsize,\n    rgbnet_dim=9,\n    rgbnet_width=64,\n    world_bound_scale=1,\n    fast_color_thres=_stepsize/_mpi_depth/5,\n)"
  },
  {
    "path": "FourierGrid/configs/custom/default_ubd_inward_facing.py",
    "content": "_base_ = '../default.py'\n\nbasedir = './logs/custom'\n\ndata = dict(\n    dataset_type='llff',\n    spherify=True,\n    llffhold=0,\n    bd_factor=None,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=True,\n    load2gpu_on_the_fly=True,\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=40000,\n    N_rand=4096,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=0.0,\n    weight_distortion=0.01,\n    pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n)\n\nalpha_init = 1e-4\nstepsize = 0.5\n\nfine_model_and_render = dict(\n    num_voxels=320**3,\n    num_voxels_base=320**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/custom/sm01_desktop.py",
    "content": "_base_ = './default_forward_facing.py'\n\nexpname = 'sm01_desktop'\n\ndata = dict(\n    datadir='./data/sm01_desktop/dense',\n    factor=2,\n    movie_render_kwargs={\n        'scale_r': 0.5,\n        'scale_f': 1.0,\n        'zrate': 1.0,\n        'zdelta': 0.5,\n    }\n)\n"
  },
  {
    "path": "FourierGrid/configs/custom/sm02_multiple_desktop.py",
    "content": "_base_ = './default_forward_facing.py'\n\nexpname = 'sm02_multiple_desktop'\n\ndata = dict(\n    datadir='./data/sm02_multiple_desktop/dense',\n    factor=2,\n    movie_render_kwargs={\n        'scale_r': 0.5,\n        'scale_f': 1.0,\n        'zrate': 1.0,\n        'zdelta': 0.5,\n    }\n)\n"
  },
  {
    "path": "FourierGrid/configs/custom/sm03_meeting.py",
    "content": "_base_ = './default_forward_facing.py'\n\nexpname = 'sm03_meeting'\n\ndata = dict(\n    datadir='./data/sm03_meeting/dense',\n    factor=2,\n    movie_render_kwargs={\n        'scale_r': 0.5,\n        'scale_f': 1.0,\n        'zrate': 1.0,\n        'zdelta': 0.5,\n    }\n)\n"
  },
  {
    "path": "FourierGrid/configs/deepvoxels/armchair.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_armchair'\nbasedir = './logs/deepvoxels'\n\ndata = dict(\n    datadir='./data/deepvoxels/',\n    dataset_type='deepvoxels',\n    scene='armchair',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/deepvoxels/cube.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_cube'\nbasedir = './logs/deepvoxels'\n\ndata = dict(\n    datadir='./data/deepvoxels/',\n    dataset_type='deepvoxels',\n    scene='cube',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/deepvoxels/greek.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_greek'\nbasedir = './logs/deepvoxels'\n\ndata = dict(\n    datadir='./data/deepvoxels/',\n    dataset_type='deepvoxels',\n    scene='greek',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/deepvoxels/vase.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_vase'\nbasedir = './logs/deepvoxels'\n\ndata = dict(\n    datadir='./data/deepvoxels/',\n    dataset_type='deepvoxels',\n    scene='vase',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/default.py",
    "content": "from copy import deepcopy\n\nexpname = None                    # experiment name\nbasedir = './logs/'               # where to store ckpts and logs\nmodel = None\n\n''' Template of data options\n'''\ndata = dict(\n    datadir=None,                 # path to dataset root folder\n    dataset_type=None,            # blender | nsvf | blendedmvs | tankstemple | deepvoxels | co3d\n    inverse_y=False,              # intrinsict mode (to support blendedmvs, nsvf, tankstemple)\n    flip_x=False,                 # to support co3d\n    flip_y=False,                 # to support co3d\n    annot_path='',                # to support co3d\n    split_path='',                # to support co3d\n    sequence_name='',             # to support co3d\n    load2gpu_on_the_fly=False,    # do not load all images into gpu (to save gpu memory)\n    testskip=1,                   # subsample testset to preview results\n    white_bkgd=False,             # use white background (note that some dataset don't provide alpha and with blended bg color)\n    rand_bkgd=False,              # use random background during training\n    half_res=False,               # [TODO]\n    bd_factor=.75,                # background \n    movie_render_kwargs=dict(),\n\n    # Below are forward-facing llff specific settings.\n    ndc=False,                    # use ndc coordinate (only for forward-facing; not support yet)\n    spherify=False,               # inward-facing\n    factor=4,                     # [TODO]\n    width=None,                   # enforce image width\n    height=None,                  # enforce image height\n    llffhold=8,                   # test split via llff\n    dvgohold=8,                   # test split via dvgo\n    load_depths=False,            # load depth\n\n    # Below are unbounded inward-facing specific settings.\n    unbounded_inward=False,\n    unbounded_inner_r=1.0,\n    boundary_ratio = 0.01,        # used in FourierGrid model.\n)\n\n''' Template of training 
options\n'''\ncoarse_train = dict(\n    N_iters=5000,                 # number of optimization steps\n    N_rand=8192,                  # batch size (number of random rays per optimization step)\n    lrate_density=1e-1,           # lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # lr of the mlp to preduct view-dependent color\n    lrate_decay=20,               # lr decay by 0.1 after every lrate_decay*1000 steps\n    pervoxel_lr=True,             # view-count-based lr\n    pervoxel_lr_downrate=1,       # downsampled image for computing view-count-based lr\n    ray_sampler='random',         # ray sampling strategies\n    weight_main=1.0,              # weight of photometric loss\n    weight_entropy_last=0.01,     # weight of background entropy loss\n    weight_nearclip=0,\n    weight_distortion=0,\n    weight_rgbper=0.1,            # weight of per-point rgb loss\n    weight_freq=0,                # weight of fourier mse loss\n    tv_every=1,                   # count total variation loss every tv_every step\n    tv_after=0,                   # count total variation loss from tv_from step\n    tv_before=0,                  # count total variation before the given number of iterations\n    tv_dense_before=0,            # count total variation densely before the given number of iterations\n    weight_tv_density=0.0,        # weight of total variation loss of density voxel grid\n    weight_tv_k0=0.0,             # weight of total variation loss of color/feature voxel grid\n    pg_scale=[],                  # checkpoints for progressive scaling\n    decay_after_scale=1.0,        # decay act_shift after scaling\n    skip_zero_grad_fields=[],     # the variable name to skip optimizing parameters w/ zero grad in each iteration\n    maskout_lt_nviews=0,\n)\n\nfine_train = deepcopy(coarse_train)\nfine_train.update(dict(\n    N_iters=20000,\n    pervoxel_lr=False,\n    ray_sampler='in_maskcache',\n 
   weight_entropy_last=0.001,\n    weight_rgbper=0.01,\n    pg_scale=[1000, 2000, 3000, 4000],\n    skip_zero_grad_fields=['density', 'k0'],\n))\n\n''' Template of model and rendering options\n'''\ncoarse_model_and_render = dict(\n    num_voxels_rgb=1024000,           # expected number of voxel\n    num_voxels_density=1024000,           # expected number of voxel\n    num_voxels_viewdir=-1,           # expected number of voxel\n    num_voxels_base_density=1024000,      # to rescale delta distance\n    num_voxels_base_rgb=1024000,      # to rescale delta distance\n    density_type='DenseGrid',     # DenseGrid, TensoRFGrid\n    k0_type='DenseGrid',          # DenseGrid, TensoRFGrid\n    density_config=dict(),\n    k0_config=dict(),\n    mpi_depth=128,                # the number of planes in Multiplane Image (work when ndc=True)\n    nearest=False,                # nearest interpolation\n    pre_act_density=False,        # pre-activated trilinear interpolation\n    in_act_density=False,         # in-activated trilinear interpolation\n    bbox_thres=1e-3,              # threshold to determine known free-space in the fine stage\n    mask_cache_thres=1e-3,        # threshold to determine a tighten BBox in the fine stage\n    rgbnet_dim=0,                 # feature voxel grid dim\n    rgbnet_full_implicit=False,   # let the colors MLP ignore feature voxel grid\n    rgbnet_direct=True,           # set to False to treat the first 3 dim of feature voxel grid as diffuse rgb\n    rgbnet_depth=3,               # depth of the colors MLP (there are rgbnet_depth-1 intermediate features)\n    rgbnet_width=128,             # width of the colors MLP\n    alpha_init=1e-6,              # set the alpha values everywhere at the begin of training\n    fast_color_thres=1e-7,        # threshold of alpha value to skip the fine stage sampled point\n    maskout_near_cam_vox=True,    # maskout grid points that between cameras and their near planes\n    world_bound_scale=1,          # rescale 
the BBox enclosing the scene\n    stepsize=0.5,                 # sampling stepsize in volume rendering\n)\n\nfine_model_and_render = deepcopy(coarse_model_and_render)\nfine_model_and_render.update(dict(\n    num_voxels_density=160**3,\n    num_voxels_rgb=160**3,\n    num_voxels_base_density=160**3,\n    num_voxels_base_rgb=160**3,\n    rgbnet_dim=12,\n    alpha_init=1e-2,\n    fast_color_thres=1e-4,\n    maskout_near_cam_vox=False,\n    world_bound_scale=1.05,\n    fourier_freq_num=3,\n))\n\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\n\ndel deepcopy\n"
  },
  {
    "path": "FourierGrid/configs/free_dataset/grass.py",
    "content": "_base_ = '../default.py'\nexpname = 'grass_may31_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/free_dataset'\ndata = dict(\n    datadir='./data/free_dataset/grass',\n    dataset_type='free',\n    ndc=False,\n    training_ids=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, \\\n        22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, \\\n            46, 47, 48, 49, 50],\n    pose_scale=0.75,      # scale of pose\n    factor=1,\n    # inverse_y=True,\n    # load2gpu_on_the_fly=True,\n    # white_bkgd=True,\n    # rand_bkgd=True,\n    # movie_render_kwargs={'pitch_deg': 20},\n)\n\ncoarse_train = dict(\n    N_iters = 0,\n    pervoxel_lr_downrate=2,\n    pervoxel_lr=True,  # DVGO default is True\n)\n\nfine_train = dict(\n    N_iters=100000,\n    N_rand=4096,\n    weight_distortion=0.0,\n    pg_scale=[2000,4000,6000,8000],\n    ray_sampler='flatten',\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-5,\n    weight_tv_k0=1e-6,\n    )\n\nvoxel_size_density = 250  # default 400\nvoxel_size_rgb = 250  # default 320\n\nfine_model_and_render = dict(    \n    num_voxels=256**3,\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    mpi_depth=128,\n    rgbnet_dim=9,\n    rgbnet_width=64,\n    world_bound_scale=1,\n    fast_color_thres=1e-3,\n)\n"
  },
  {
    "path": "FourierGrid/configs/free_dataset/lab.py",
    "content": "_base_ = '../default.py'\nexpname = 'lab_may30_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='DVGO'\nbasedir = './logs/free_dataset'\ndata = dict(\n    datadir='./data/free_dataset/lab',\n    dataset_type='free',\n    # ndc=True,\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n    rand_bkgd=True,\n    # movie_render_kwargs={'pitch_deg': 20},\n)\n\ncoarse_train = dict(\n    N_iters = 0,\n    pervoxel_lr_downrate=2,\n    pervoxel_lr=True,  # DVGO default is True\n)\n\nfine_train = dict(\n    N_iters=30000,\n    N_rand=4096,\n    weight_distortion=0.0,\n    pg_scale=[2000,4000,6000,8000],\n    ray_sampler='flatten',\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-5,\n    weight_tv_k0=1e-6,\n    )\n\nvoxel_size_density = 250  # default 400\nvoxel_size_rgb = 250  # default 320\n\nfine_model_and_render = dict(    \n    # num_voxels=256**3,\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    mpi_depth=128,\n    rgbnet_dim=9,\n    rgbnet_width=64,\n    world_bound_scale=1,\n    fast_color_thres=1e-3,\n)\n\n\n\n# _base_ = '../default.py'\n# expname = 'train_may29_'\n# vis = dict(\n#     height_rate = 0.6 # camera direction frustrum height\n# )\n# model='FourierGrid'\n# # model='DVGO'\n# basedir = './logs/free_dataset'\n# visualize_poses = False\n# alpha_init = 1e-4\n# stepsize = 0.5\n# _mpi_depth = 256\n# if visualize_poses:  # for debugging\n#     unbounded_inward = True\n#     coarse_iter = 3000\n#     fast_color_thres=stepsize/_mpi_depth/5\n#     maskout_near_cam_vox = False\n#     pervoxel_lr = False\n#     weight_distortion = 0.0\n# else:\n#     unbounded_inward = True\n#     coarse_iter = 0\n#     fast_color_thres={   # default\n#             '_delete_': True,                           # to ignore the base config\n#             0   
: alpha_init*stepsize/10,               # 0.5e-5\n#             1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n#             2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n#             3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n#             4500: min(alpha_init, 1e-4)*stepsize,\n#             5500: min(alpha_init, 1e-4),\n#             6500: 1e-4,\n#         }\n#     maskout_near_cam_vox = False\n#     pervoxel_lr = False\n#     weight_distortion = 0.01\n\n# data = dict(\n#     dataset_type='free',\n#     inverse_y=True,\n#     white_bkgd=True,\n#     rand_bkgd=True,\n#     unbounded_inward=unbounded_inward,\n#     load2gpu_on_the_fly=True,\n#     datadir='./data/free_dataset/grass',\n#     unbounded_inner_r=1.0,\n#     ndc=False,\n# )\n\n# coarse_train = dict(\n#     N_iters = coarse_iter, \n#     pervoxel_lr = pervoxel_lr,\n# )\n\n# fine_train = dict(\n#     N_iters=10000,\n#     # N_rand=2048,  # reduce this to fit into memory\n#     N_rand=4096,  # default\n#     ray_sampler='flatten',\n#     # ray_sampler='random',\n#     weight_distortion=weight_distortion,\n#     # pg_scale=[10,],\n#     pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000],\n#     tv_before=1e9,  # always use tv\n#     tv_dense_before=10000,\n#     tv_after=0, # start from beginning\n#     tv_every=1,\n#     weight_tv_density=1e-6,\n#     weight_tv_k0=1e-7,\n#     pervoxel_lr=False,\n#     lrate_decay=20,               # default\n#     lrate_density=1e-1,           # default lr of density voxel grid\n#     lrate_k0=1e-1,                # lr of color/feature voxel grid\n#     lrate_rgbnet=1e-3,            # default lr of the mlp to preduct view-dependent color\n#     weight_entropy_last=1e-3,     # default\n#     weight_rgbper=1e-2,           # default\n#     weight_nearclip=0,\n#     weight_main=1.0,              # default = 1\n#     weight_freq=0.0,            \n# )\n\n# coarse_model_and_render = dict(\n#     maskout_near_cam_vox = maskout_near_cam_vox,\n# )\n\n# 
voxel_size_density = 250  # default 400\n# voxel_size_rgb = 250  # default 320\n# voxel_size_viewdir = -1\n# # voxel_size_viewdir = 64\n\n# fine_model_and_render = dict(\n#     num_voxels_density=voxel_size_density**3,\n#     num_voxels_base_density=voxel_size_density**3,\n#     num_voxels_rgb=voxel_size_rgb**3,\n#     num_voxels_base_rgb=voxel_size_rgb**3,\n#     num_voxels_viewdir=voxel_size_viewdir**3,\n#     alpha_init=alpha_init,\n#     stepsize=stepsize,\n#     fast_color_thres=fast_color_thres,\n#     world_bound_scale=1,\n#     # contracted_norm='l2', # default\n#     rgbnet_dim=12, # default\n#     # rgbnet_depth=3, # default\n#     viewbase_pe=4, # default=4\n#     bbox_thres=0.001,\n#     fourier_freq_num=3,\n#     maskout_near_cam_vox=False,\n#     bg_len=0.2,   # default=0.2\n# )\n"
  },
  {
    "path": "FourierGrid/configs/lf/africa.py",
    "content": "_base_ = './lf_default.py'\n\nexpname = 'dvgo_Africa_unbounded'\n\ndata = dict(\n    datadir='./data/lf_data/africa',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/lf/basket.py",
    "content": "_base_ = './lf_default.py'\n\nexpname = 'dvgo_Basket_unbounded'\n\ndata = dict(\n    datadir='./data/lf_data/basket',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/lf/lf_default.py",
    "content": "_base_ = '../default.py'\n\nbasedir = './logs/lf'\n\ndata = dict(\n    dataset_type='nerfpp',\n    inverse_y=True,\n    white_bkgd=False,\n    rand_bkgd=True,\n    unbounded_inward=True,\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=25000,\n    N_rand=4096,\n    ray_sampler='flatten',\n    weight_distortion=1e-2,\n    pg_scale=[1000,2000,3000,4000,5000,6000],\n    decay_after_scale=1.0,\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n)\n\nalpha_init = 1e-4\nstepsize = 0.5\n\nfine_model_and_render = dict(\n    num_voxels=256**3,\n    num_voxels_base=256**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/lf/ship.py",
    "content": "_base_ = './lf_default.py'\n\nexpname = 'dvgo_Ship_unbounded'\n\ndata = dict(\n    datadir='./data/lf_data/ship',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/lf/statue.py",
    "content": "_base_ = './lf_default.py'\n\nexpname = 'dvgo_Statue_unbounded'\n\ndata = dict(\n    datadir='./data/lf_data/statue',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/lf/torch.py",
    "content": "_base_ = './lf_default.py'\n\nexpname = 'dvgo_Torch_unbounded'\n\ndata = dict(\n    datadir='./data/lf_data/torch',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/linemod/ape.py",
    "content": "_base_ = '../default.py'\nseq_name = 'ape'\nseq_id = 1\nexpname = f'{seq_name}_nov11_'\npose_expname = 'bayes_nerf_v2_4'\nbasedir = 'logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    seq_id=1,\n    width_max=90,\n    height_max=90,\n    load2gpu_on_the_fly=True,\n)\n\nfine_train = dict(\n    N_iters=1*(10**4),\n)\n\n# voxel_num=32**3\n# coarse_model_and_render = dict(\n#     num_voxels_rgb=voxel_num,\n#     num_voxels_base_rgb=voxel_num,\n#     num_voxels_density=voxel_num,\n#     num_voxels_base_density=voxel_num,\n# )\n# fine_model_and_render = dict(\n#     num_voxels_rgb=voxel_num,\n#     num_voxels_base_rgb=voxel_num,\n#     num_voxels_density=voxel_num,\n#     num_voxels_base_density=voxel_num,\n#     # rgbnet_dim=0,\n# )"
  },
  {
    "path": "FourierGrid/configs/linemod/benchvise.py",
    "content": "_base_ = '../default.py'\nseq_name = 'benchvise'\nexpname = f'{seq_name}_nov9_'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=230,\n    height_max=230,\n    #198, 224\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/camera.py",
    "content": "_base_ = '../default.py'\nseq_name = 'camera'\nexpname = f'{seq_name}_nov9_'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=150,\n    height_max=150\n    # 142, 137\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/can.py",
    "content": "_base_ = '../default.py'\nseq_name = 'can'\nexpname = f'{seq_name}_nov8'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=190,\n    height_max=190\n    # 162, 180\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/cat.py",
    "content": "_base_ = '../default.py'\nseq_name = 'cat'\nexpname = f'{seq_name}_nov8_'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=130,\n    height_max=130,\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/driller.py",
    "content": "_base_ = '../default.py'\nseq_name = 'driller'\nexpname = f'{seq_name}_nov8'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=250,\n    height_max=250,\n    # 240, 237\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/duck.py",
    "content": "_base_ = '../default.py'\nseq_name = 'duck'\nexpname = f'{seq_name}_nov8'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=90,\n    height_max=90\n    # 85, 85\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/eggbox.py",
    "content": "_base_ = '../default.py'\nseq_name = 'eggbox'\nexpname = f'{seq_name}_nov8'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=140,\n    height_max=140\n    # 131, 132\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/glue.py",
    "content": "_base_ = '../default.py'\nseq_name = 'glue'\nexpname = f'{seq_name}_nov8'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=150,\n    height_max=150\n    # 111, 147\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/holepuncher.py",
    "content": "_base_ = '../default.py'\nseq_name = 'holepuncher'\nexpname = f'{seq_name}_nov8'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=120,\n    height_max=120\n    # 120, 120\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/iron.py",
    "content": "_base_ = '../default.py'\nseq_name = 'iron'\nexpname = f'{seq_name}_nov8'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=240,\n    height_max=240\n    # 233, 224\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/lamp.py",
    "content": "_base_ = '../default.py'\nseq_name = 'lamp'\nexpname = f'{seq_name}_nov8'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=260,\n    height_max=260\n    # 232, 250\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/linemod/phone.py",
    "content": "_base_ = '../default.py'\nseq_name = 'phone'\nexpname = f'{seq_name}_nov8'\nbasedir = './logs/linemod'\n\ndata = dict(\n    datadir='./data/linemod',\n    dataset_type='linemod',\n    white_bkgd=True,\n    seq_name=seq_name,\n    width_max=190,\n    height_max=190\n    # 159, 187\n)\n\n# fine_train = dict(\n#     ray_sampler='flatten',\n# )\n"
  },
  {
    "path": "FourierGrid/configs/llff/fern.py",
    "content": "_base_ = './llff_default.py'\n\nexpname = 'fern'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/fern',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/fern_lg.py",
    "content": "_base_ = './llff_default_lg.py'\n\nexpname = 'fern_lg'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/fern',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/flower.py",
    "content": "_base_ = './llff_default.py'\n\nexpname = 'flower'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/flower',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/flower_lg.py",
    "content": "_base_ = './llff_default_lg.py'\n\nexpname = 'flower_lg'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/flower',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/fortress.py",
    "content": "_base_ = './llff_default.py'\n\nexpname = 'fortress'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/fortress',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/fortress_lg.py",
    "content": "_base_ = './llff_default_lg.py'\n\nexpname = 'fortress_lg'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/fortress',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/horns.py",
    "content": "_base_ = './llff_default.py'\n\nexpname = 'horns'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/horns',\n    # dataset_type='free',\n)\n"
  },
  {
    "path": "FourierGrid/configs/llff/horns_lg.py",
    "content": "_base_ = './llff_default_lg.py'\n\nexpname = 'horns_lg'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/horns',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/leaves.py",
    "content": "_base_ = './llff_default.py'\nmodel = 'DVGO'\nexpname = 'leaves'\nbasedir = './logs/llff'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/leaves',\n)\n"
  },
  {
    "path": "FourierGrid/configs/llff/leaves_lg.py",
    "content": "_base_ = './llff_default_lg.py'\n\nexpname = 'leaves_lg'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/leaves',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/llff_default.py",
    "content": "_base_ = '../default.py'\n\nbasedir = './logs/llff'\n\ndata = dict(\n    dataset_type='llff',\n    ndc=True,\n    width=1008,\n    height=756,\n)\n\ncoarse_train = dict(\n    N_iters=0,\n)\n\nfine_train = dict(\n    N_iters=30000,\n    N_rand=4096,\n    weight_distortion=0.01,\n    pg_scale=[2000,4000,6000,8000],\n    ray_sampler='flatten',\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-5,\n    weight_tv_k0=1e-6,\n)\n\nfine_model_and_render = dict(\n    num_voxels=256**3,\n    mpi_depth=128,\n    rgbnet_dim=9,\n    rgbnet_width=64,\n    world_bound_scale=1,\n    fast_color_thres=1e-3,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/llff_default_lg.py",
    "content": "_base_ = '../default.py'\n\nbasedir = './logs/llff'\n\ndata = dict(\n    dataset_type='llff',\n    ndc=True,\n    width=1008,\n    height=756,\n    rand_bkgd=True,\n)\n\ncoarse_train = dict(\n    N_iters=0,\n)\n\nfine_train = dict(\n    N_iters=30000,\n    N_rand=4096,\n    weight_distortion=0.01,\n    pg_scale=[2000,4000,6000,8000],\n    decay_after_scale=0.1,\n    ray_sampler='flatten',\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-5,\n    weight_tv_k0=1e-6,\n)\n\n_mpi_depth = 256\n_stepsize = 1.0\n\nfine_model_and_render = dict(\n    num_voxels=384*384*_mpi_depth,\n    mpi_depth=_mpi_depth,\n    stepsize=_stepsize,\n    rgbnet_dim=9,\n    rgbnet_width=64,\n    world_bound_scale=1,\n    fast_color_thres=_stepsize/_mpi_depth/5,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/orchids.py",
    "content": "_base_ = './llff_default.py'\n\nexpname = 'orchids'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/orchids',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/orchids_lg.py",
    "content": "_base_ = './llff_default_lg.py'\n\nexpname = 'orchids_lg'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/orchids',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/room.py",
    "content": "_base_ = './llff_default.py'\n\nexpname = 'room'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/room',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/room_lg.py",
    "content": "_base_ = './llff_default_lg.py'\n\nexpname = 'room_lg'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/room',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/trex.py",
    "content": "_base_ = './llff_default.py'\n\nexpname = 'trex'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/trex',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/llff/trex_lg.py",
    "content": "_base_ = './llff_default_lg.py'\n\nexpname = 'trex_lg'\n\ndata = dict(\n    datadir='./data/nerf_llff_data/trex',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/mega/building.py",
    "content": "_base_ = '../default.py'\nmodel='FourierGrid'\ndata_name = 'building'\nbasedir = f'./logs/mega/{data_name}'\nvisualize_poses = False\nalpha_init = 1e-2\nstepsize = 0.5\n_mpi_depth = 256\nmaskout_near_cam_vox = False  # changed\npervoxel_lr = False\nunbounded_inward = True\nmega_dataset_root = \"data/oct9_mega\"\nexpname = f'oct9_mega_{data_name}'\nif visualize_poses:  # for debugging only\n    coarse_iter = 600\n    fast_color_thres=stepsize/_mpi_depth/5\n    weight_distortion = 0.0\nelse:\n    coarse_iter = 0\n    # fast_color_thres={\n    #         '_delete_': True,\n    #         0   : alpha_init*stepsize/10,\n    #         1500: min(alpha_init, 1e-4)*stepsize/5,\n    #         2500: min(alpha_init, 1e-4)*stepsize/2,\n    #         3500: min(alpha_init, 1e-4)*stepsize/1.5,\n    #         4500: min(alpha_init, 1e-4)*stepsize,\n    #         5500: min(alpha_init, 1e-4),\n    #         6500: 1e-4,\n    #     }\n    fast_color_thres=1e-3\n    weight_distortion = 0.01\n\ndata = dict(\n    dataset_type='mega',\n    inverse_y=True,\n    white_bkgd=True,     # almost no effect when rand_bkgd=True\n    rand_bkgd=False,      # random background\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=False,\n    datadir=f'data/oct9_mega/{data_name}',\n    factor=2,\n    near_clip = 0.1,\n    near = 0.1,\n    far = 0.01,\n    test_rotate_angle=50, # rotate angle in testing phase\n    sample_interval=1,\n    num_per_block=5,  # run this num in block\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter,\n    pervoxel_lr = pervoxel_lr,\n    ray_sampler='flatten',\n)\n\nfine_train = dict(\n    N_iters=40000, # 40k for whole training procedure\n    N_rand=4096,\n    ray_sampler='flatten',\n    weight_distortion=weight_distortion,\n    pg_scale=[],\n    # pg_scale=[500],\n    # pg_scale=[1000, 2000, 3000, 4000, 5000,],\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n)\n\ncoarse_model_and_render = 
dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n    bbox_thres=1e-10,  # display all the bboxes\n)\n\nvoxel_size = 400\n# voxel_size = 320 # default\nfine_model_and_render = dict(\n    num_voxels=voxel_size**3,\n    num_voxels_base=voxel_size**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    contracted_norm='l2',\n    rgbnet_dim=12, # default\n    rgbnet_direct=True,\n    density_type='DenseGrid',\n    k0_type='DenseGrid',\n    bg_len=0.2,  # default\n    viewbase_pe=8,\n    # maskout_near_cam_vox=True, \n    maskout_near_cam_vox=True, # NOTE(review): value is True but the old note claimed it was changed to False (to avoid OOM) -- confirm intended setting\n    # # TensorRF settings\n    # density_type='TensoRFGrid', \n    # k0_type='TensoRFGrid', \n    # density_config=dict(n_comp=8),\n    # k0_config=dict(n_comp=24),\n)\n\nvis = dict(\n    height_rate = 0.6 # camera direction frustum height\n)\n\n# artistic radiance fields\narf = dict(\n    style_root = mega_dataset_root + \"/styles\",\n    style_id = 34,\n)\n"
  },
  {
    "path": "FourierGrid/configs/mega/building_no_block.py",
    "content": "_base_ = '../default.py'\ndata_name = 'building'\nmodel='FourierGrid'\nbasedir = f'./logs/mega/{data_name}'\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nmaskout_near_cam_vox = False  # changed\npervoxel_lr = False\nunbounded_inward = True\nexpname = f'oct9_mega_{data_name}'\nif visualize_poses:  # for debugging only\n    coarse_iter = 600\n    fast_color_thres=stepsize/_mpi_depth/5\n    weight_distortion = 0.0\nelse:\n    coarse_iter = 0\n    fast_color_thres={\n            '_delete_': True,\n            0   : alpha_init*stepsize/10,\n            1500: min(alpha_init, 1e-4)*stepsize/5,\n            2500: min(alpha_init, 1e-4)*stepsize/2,\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,\n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    weight_distortion = -1\n\ndata = dict(\n    dataset_type='mega',\n    inverse_y=True,\n    white_bkgd=True,     # almost no effect when rand_bkgd=True\n    rand_bkgd=True,      # random background\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=True,\n    datadir=f'data/oct9_mega/{data_name}',\n    factor=2,\n    near_clip = 0.1,\n    near = 0.1,\n    far = 0.01,\n    test_rotate_angle=50, # rotate angle in testing phase\n    sample_interval=1,\n    num_per_block=-1,  # run this num in block\n    unbounded_inner_r=1.0,\n    boundary_ratio=0.0,\n    # training_ids=['000517', '000520', '000524', ],\n)\n\nnerf_em = dict(\n    sample_num = 3,\n    pos_x_range = 0.1,\n    pos_y_range = 0.1,\n    pos_z_range = 0.1,\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter,\n    pervoxel_lr = pervoxel_lr,\n    ray_sampler='flatten',\n)\n\nfine_train = dict(\n    N_iters_m_step=3000,\n    # N_iters=10*(10**4), \n    N_iters=3000,\n    N_rand=4096,\n    ray_sampler='flatten',\n    weight_distortion=weight_distortion,\n    # pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000],  # default\n    
pg_scale=[2000, 4000, 6000, 7000],  \n    # pg_scale=[],  # used for model size testing\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    # added\n    pervoxel_lr=False,\n    lrate_decay=20,               # default\n    lrate_density=1e-1,           # default lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # default lr of the mlp to preduct view-dependent color\n    weight_entropy_last=1e-3,     # default\n    weight_rgbper=1e-2,           # default\n    weight_nearclip=0,\n    weight_main=3.0,              # default = 1\n    weight_freq=1.0,       \n)\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n    bbox_thres=1e-10,  # display all the bboxes\n)\n\nvoxel_size_density = 300  # default 400\nvoxel_size_rgb = 300  # default 320\nvoxel_size_viewdir = -1\n\n# voxel_size = 320 # default\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    contracted_norm='l2',\n    rgbnet_dim=3, # default\n    rgbnet_direct=True,\n    density_type='DenseGrid',\n    k0_type='DenseGrid',\n    # bg_len=0.2,  # default\n    bg_len=0.25,  # default\n    viewbase_pe=8,\n    maskout_near_cam_vox=False,\n)\n\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\n"
  },
  {
    "path": "FourierGrid/configs/mega/quad.py",
    "content": "_base_ = '../default.py'\nmodel='FourierGrid'\ndata_name = 'quad'\nbasedir = f'./logs/mega/{data_name}'\nvisualize_poses = False\nalpha_init = 1e-2\nstepsize = 0.5\n_mpi_depth = 256\nmaskout_near_cam_vox = False  # changed\npervoxel_lr = False\nunbounded_inward = True\nexpname = f'oct12_mega_{data_name}'\nif visualize_poses:  # for debugging only\n    coarse_iter = 600\n    fast_color_thres=stepsize/_mpi_depth/5\n    weight_distortion = 0.0\nelse:\n    coarse_iter = 0\n    # fast_color_thres={\n    #         '_delete_': True,\n    #         0   : alpha_init*stepsize/10,\n    #         1500: min(alpha_init, 1e-4)*stepsize/5,\n    #         2500: min(alpha_init, 1e-4)*stepsize/2,\n    #         3500: min(alpha_init, 1e-4)*stepsize/1.5,\n    #         4500: min(alpha_init, 1e-4)*stepsize,\n    #         5500: min(alpha_init, 1e-4),\n    #         6500: 1e-4,\n    #     }\n    fast_color_thres=1e-3\n    weight_distortion = 0.01\n\ndata = dict(\n    dataset_type='mega',\n    inverse_y=True,\n    white_bkgd=True,     # almost no effect when rand_bkgd=True\n    rand_bkgd=False,      # random background\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=False,\n    datadir=f'data/oct9_mega/{data_name}',\n    boundary_ratio=1.0,\n    factor=2, # not used\n    near_clip = 0.1,\n    near = 0.1,\n    far = 0.01,\n    test_rotate_angle=50, # rotate angle in testing phase\n    sample_interval=1,\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter,\n    pervoxel_lr = pervoxel_lr,\n    ray_sampler='flatten',\n)\n\nfine_train = dict(\n    N_iters=20000, # 40k for whole training procedure\n    N_rand=4096,\n    ray_sampler='flatten',\n    weight_distortion=weight_distortion,\n    pg_scale=[500],\n    # pg_scale=[1000, 2000, 3000, 4000, 5000,],\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n)\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n    bbox_thres=1e-10,  
# display all the bboxes\n)\n\nvoxel_size = 400\n# voxel_size = 320 # default\nfine_model_and_render = dict(\n    num_voxels=voxel_size**3,\n    num_voxels_base=voxel_size**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    contracted_norm='l2',\n    rgbnet_dim=12, # default\n    rgbnet_direct=True,\n    density_type='DenseGrid',\n    k0_type='DenseGrid',\n    bg_len=0.2,  # default\n    viewbase_pe=8,\n    maskout_near_cam_vox=True,\n    # # TensorRF settings\n    # density_type='TensoRFGrid', \n    # k0_type='TensoRFGrid', \n    # density_config=dict(n_comp=8),\n    # k0_config=dict(n_comp=24),\n)\n\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\n"
  },
  {
    "path": "FourierGrid/configs/mega/rubble.py",
    "content": "_base_ = '../default.py'\nmodel='FourierGrid'\ndata_name = 'rubble'\nbasedir = f'./logs/mega/{data_name}'\nvisualize_poses = False\nalpha_init = 1e-2\nstepsize = 0.5\n_mpi_depth = 256\nmaskout_near_cam_vox = False  # changed\npervoxel_lr = False\nunbounded_inward = True\nexpname = f'oct11_mega_{data_name}'\nmega_dataset_root = \"data/oct9_mega\"\n\nif visualize_poses:  # for debugging only\n    coarse_iter = 600\n    fast_color_thres=stepsize/_mpi_depth/5\n    weight_distortion = 0.0\nelse:\n    coarse_iter = 0\n    # fast_color_thres={\n    #         '_delete_': True,\n    #         0   : alpha_init*stepsize/10,\n    #         1500: min(alpha_init, 1e-4)*stepsize/5,\n    #         2500: min(alpha_init, 1e-4)*stepsize/2,\n    #         3500: min(alpha_init, 1e-4)*stepsize/1.5,\n    #         4500: min(alpha_init, 1e-4)*stepsize,\n    #         5500: min(alpha_init, 1e-4),\n    #         6500: 1e-4,\n    #     }\n    fast_color_thres=1e-3\n    weight_distortion = 0.01\n\ndata = dict(\n    dataset_type='mega',\n    inverse_y=True,\n    white_bkgd=True,     # almost no effect when rand_bkgd=True\n    rand_bkgd=False,      # random background\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=False,\n    datadir=mega_dataset_root + f'/{data_name}',\n    boundary_ratio=1.0,\n    factor=2, # not used\n    near_clip = 0.1,\n    near = 0.1,\n    far = 0.01,\n    test_rotate_angle=50, # rotate angle in testing phase\n    sample_interval=1,\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter,\n    pervoxel_lr = pervoxel_lr,\n    ray_sampler='flatten',\n)\n\nfine_train = dict(\n    N_iters=40000, # 40k for whole training procedure\n    N_rand=4096,\n    ray_sampler='flatten',\n    weight_distortion=weight_distortion,\n    pg_scale=[500],\n    # pg_scale=[1000, 2000, 3000, 4000, 5000,],\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n)\n\ncoarse_model_and_render = dict(\n    
maskout_near_cam_vox = maskout_near_cam_vox,\n    bbox_thres=1e-10,  # display all the bboxes\n)\n\nvoxel_size = 400\n# voxel_size = 320 # default\nfine_model_and_render = dict(\n    num_voxels=voxel_size**3,\n    num_voxels_base=voxel_size**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    contracted_norm='l2',\n    rgbnet_dim=12, # default\n    rgbnet_direct=True,\n    density_type='DenseGrid',\n    k0_type='DenseGrid',\n    bg_len=0.2,  # default\n    viewbase_pe=8,\n    maskout_near_cam_vox=True,\n    # # TensorRF settings\n    # density_type='TensoRFGrid', \n    # k0_type='TensoRFGrid', \n    # density_config=dict(n_comp=8),\n    # k0_config=dict(n_comp=24),\n)\n\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\n\n\n# artistic radiance fields\narf = dict(\n    style_root = mega_dataset_root + \"/styles\",\n    style_id = 5,\n)\n"
  },
  {
    "path": "FourierGrid/configs/nerf/chair.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_chair'\nbasedir = './logs/nerf_synthetic'\n\ndata = dict(\n    datadir='./data/nerf_synthetic/chair',\n    dataset_type='blender',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf/drums.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_drums'\nbasedir = './logs/nerf_synthetic'\n\ndata = dict(\n    datadir='./data/nerf_synthetic/drums',\n    dataset_type='blender',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf/ficus.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_ficus'\nbasedir = './logs/nerf_synthetic'\n\ndata = dict(\n    datadir='./data/nerf_synthetic/ficus',\n    dataset_type='blender',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf/hotdog.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_hotdog'\nbasedir = './logs/nerf_synthetic'\n\ndata = dict(\n    datadir='./data/nerf_synthetic/hotdog',\n    dataset_type='blender',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf/lego.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_lego'\nbasedir = './logs/nerf_synthetic'\n\ndata = dict(\n    datadir='./data/nerf_synthetic/lego',\n    dataset_type='blender',\n    white_bkgd=True,\n)\n"
  },
  {
    "path": "FourierGrid/configs/nerf/materials.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_materials'\nbasedir = './logs/nerf_synthetic'\n\ndata = dict(\n    datadir='./data/nerf_synthetic/materials',\n    dataset_type='blender',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf/mic.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_mic'\nbasedir = './logs/nerf_synthetic'\n\ndata = dict(\n    datadir='./data/nerf_synthetic/mic',\n    dataset_type='blender',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf/ship.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_ship'\nbasedir = './logs/nerf_synthetic'\n\ndata = dict(\n    datadir='./data/nerf_synthetic/ship',\n    dataset_type='blender',\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf/ship.tensorf.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_ship_tensorf'\nbasedir = './logs/nerf_synthetic'\n\ndata = dict(\n    datadir='./data/nerf_synthetic/ship',\n    dataset_type='blender',\n    white_bkgd=True,\n)\n\nfine_train = dict(\n    lrate_density=0.02,\n    lrate_k0=0.02,\n    pg_scale=[1000,2000,3000,4000,5000,6000],\n)\n\nfine_model_and_render = dict(\n    num_voxels=384**3,\n    density_type='TensoRFGrid',\n    density_config=dict(n_comp=8),\n    k0_type='TensoRFGrid',\n    k0_config=dict(n_comp=24),\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf_studio/Giannini_Hall.py",
    "content": "_base_ = './nerf_studio_default.py'\nexpname = 'Giannini-Hall_mar16_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/Giannini-Hall'\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\ncoarse_iter = 0\nfast_color_thres={   # default\n        '_delete_': True,                           # to ignore the base config\n        0   : alpha_init*stepsize/10,               # 0.5e-5\n        1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n        2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    }\nmaskout_near_cam_vox = False\npervoxel_lr = False\n# weight_distortion = 0.01\nweight_distortion = 0.02\ndata = dict(\n    dataset_type='nerfstudio',\n    spherify=True,  # default: True\n    factor=8,\n    llffhold=-1,\n    dvgohold=8,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=True,\n    load2gpu_on_the_fly=True,\n    datadir='./data/nerfstudio_data/Giannini-Hall',\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.3, # negative down\n        shift_z=0,\n        scale_r=0.2,\n        pitch_deg=-40, # negative look downward\n    ),\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=100000,\n    N_rand=2048,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=weight_distortion,\n    # pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000],\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    weight_main=1.0,\n    # weight_freq=0.1,\n)\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    
num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)\n"
  },
  {
    "path": "FourierGrid/configs/nerf_studio/nerf_studio_default.py",
    "content": "_base_ = '../default.py'\n\nbasedir = './logs/nerf_unbounded'\n\ndata = dict(\n    dataset_type='nerfstudio',\n    spherify=True,\n    factor=4,\n    llffhold=-1,\n    dvgohold=-1,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=True,\n    load2gpu_on_the_fly=True,\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=40000,\n    N_rand=4096,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=0.01,\n    pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n)\n\nalpha_init = 1e-4\nstepsize = 0.5\n\nfine_model_and_render = dict(\n    num_voxels=320**3,\n    num_voxels_base=320**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf_studio/stump.py",
    "content": "_base_ = './nerf_studio_default.py'\nexpname = 'stump_mar16_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/stump'\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\ncoarse_iter = 0\nfast_color_thres={   # default\n        '_delete_': True,                           # to ignore the base config\n        0   : alpha_init*stepsize/10,               # 0.5e-5\n        1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n        2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    }\nmaskout_near_cam_vox = False\npervoxel_lr = False\nweight_distortion = 0.02\ndata = dict(\n    dataset_type='nerfstudio',\n    spherify=False,\n    recenter=False,\n    factor=8,\n    llffhold=-1,\n    dvgohold=8,\n    unbounded_inner_r=1.0,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=True,\n    load2gpu_on_the_fly=True,\n    bd_factor=None,\n    datadir='./data/nerfstudio_data/stump',\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.3, # negative down\n        shift_z=0,\n        scale_r=0.2,\n        pitch_deg=-40, # negative look downward\n    ),\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=10000, # zelin working\n    # N_iters=100000,  \n    N_rand=2048,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=weight_distortion,\n    # pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000], # zelin working\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    weight_main=1.0,\n    # weight_freq=0.1,\n)\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = 
-1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/bicycle.py",
    "content": "_base_ = './nerf_unbounded_default.py'\nexpname = 'dvgo_bicycle_unbounded'\n\ndata = dict(\n    datadir='./data/360_v2/bicycle',\n    factor=4, # 1237x822\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=0, # negative down\n        shift_z=0,\n        scale_r=1.0,\n        pitch_deg=-10, # negative look downward\n    ),\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/bicycle_single.py",
    "content": "_base_ = './nerf_unbounded_default.py'\nexpname = 'bicycle_nov29_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/360'\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\ncoarse_iter = 0\n# fast_color_thres={   # default\n#         '_delete_': True,                           # to ignore the base config\n#         0   : alpha_init*stepsize/10,               # 0.5e-5\n#         1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n#         2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n#         3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n#         4500: min(alpha_init, 1e-4)*stepsize,\n#         5500: min(alpha_init, 1e-4),\n#         6500: 1e-4,\n#     }\nfast_color_thres = 1e-4\nmaskout_near_cam_vox = False\npervoxel_lr = False\nweight_distortion = 0.05\ndata = dict(\n    dataset_type='llff',\n    datadir='./data/360_v2/bicycle',\n    factor=16, # 1237x822\n    # width=320,\n    # height=240,\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=0, # negative down\n        shift_z=0,\n        scale_r=1.0,\n        pitch_deg=-10, # negative look downward\n    ),\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=100000,\n    N_rand=2048,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=weight_distortion,\n    # pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000],\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    weight_main=1.0,\n    weight_freq=5.0,\n)\n\nvoxel_size_for_all = 200 # default 220\nvoxel_size_density = voxel_size_for_all\nvoxel_size_rgb = voxel_size_for_all\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    
num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    rgbnet_dim=12, # default\n    rgbnet_depth=3, # default\n    bbox_thres=-1,\n    maskout_near_cam_vox=False,\n    bg_len=0.2,\n)"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/bonsai.py",
    "content": "_base_ = './nerf_unbounded_default.py'\n\nexpname = 'dvgo_bonsai_unbounded'\n\ndata = dict(\n    datadir='./data/360_v2/bonsai',\n    factor=2, # 1559x1039\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=0, # negative down\n        shift_z=0,\n        scale_r=1.0,\n        pitch_deg=-30, # negative look downward\n    ),\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/bonsai_single.py",
    "content": "_base_ = './nerf_unbounded_default.py'\nexpname = 'bonsai_nov29_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/360'\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\ncoarse_iter = 0\nfast_color_thres={   # default\n        '_delete_': True,                           # to ignore the base config\n        0   : alpha_init*stepsize/10,               # 0.5e-5\n        1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n        2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    }\nmaskout_near_cam_vox = False\npervoxel_lr = False\nweight_distortion = 0.01\ndata = dict(\n    dataset_type='llff',\n    datadir='./data/360_v2/bonsai',\n    # factor=2, # 1559x1039\n    factor=8, # 1237x822\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=0, # negative down\n        shift_z=0,\n        scale_r=1.0,\n        pitch_deg=-30, # negative look downward\n    ),\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=100000,\n    N_rand=2048,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=weight_distortion,\n    pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    weight_main=1.0,\n    # weight_freq=0.1,\n)\n\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    
stepsize=stepsize,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/counter.py",
    "content": "_base_ = './nerf_unbounded_default.py'\n\nexpname = 'dvgo_counter_unbounded'\n\ndata = dict(\n    datadir='./data/360_v2/counter',\n    factor=2, # 1558x1038\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.2, # negative down\n        shift_z=0,\n        scale_r=0.9,\n        pitch_deg=-30, # negative look downward\n    ),\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/counter_single.py",
    "content": "_base_ = './nerf_unbounded_default.py'\nexpname = 'counter_may25_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/360'\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\ncoarse_iter = 0\nfast_color_thres={   # default\n        '_delete_': True,                           # to ignore the base config\n        0   : alpha_init*stepsize/10,               # 0.5e-5\n        1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n        2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    }\nmaskout_near_cam_vox = False\npervoxel_lr = False\nweight_distortion = 0.01\ndata = dict(\n    dataset_type='llff',\n    datadir='./data/360_v2/counter',\n    # factor=2, # 1558x1038\n    factor=16, # 1558x1038\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.2, # negative down\n        shift_z=0,\n        scale_r=0.9,\n        pitch_deg=-30, # negative look downward\n    ),\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=100000,\n    N_rand=2048,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=weight_distortion,\n    pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    # pg_scale=[20], # test memory\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    weight_main=1.0,\n    # weight_freq=0.1,\n)\n\nvoxel_size_density = 250  # default 250\nvoxel_size_rgb = 250 # default 250\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    
alpha_init=alpha_init,\n    stepsize=stepsize,\n    fourier_freq_num=4,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/garden.py",
    "content": "_base_ = './nerf_unbounded_default.py'\n\nexpname = 'dvgo_garden_unbounded'\n\ndata = dict(\n    datadir='./data/360_v2/garden',\n    factor=4, # 1297x840\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.0, # negative down\n        shift_z=0,\n        scale_r=0.9,\n        pitch_deg=-30,\n    ),\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/garden_single.py",
    "content": "_base_ = './nerf_unbounded_default.py'\nexpname = 'garden_may24_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/360'\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\ncoarse_iter = 0\nfast_color_thres={   # default\n        '_delete_': True,                           # to ignore the base config\n        0   : alpha_init*stepsize/10,               # 0.5e-5\n        1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n        2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    }\nmaskout_near_cam_vox = False\npervoxel_lr = False\nweight_distortion = 0.01\ndata = dict(\n    dataset_type='llff',\n    datadir='./data/360_v2/garden',\n    # factor=4, # 1297x840\n    factor=8, # 1297x840\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.0, # negative down\n        shift_z=0,\n        scale_r=0.9,\n        pitch_deg=-30,\n    ),\n)\n\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=100000,\n    N_rand=2048,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=weight_distortion,\n    pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    weight_main=1.0,\n    # weight_freq=0.1,\n)\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    
fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/kitchen.py",
    "content": "_base_ = './nerf_unbounded_default.py'\n\nexpname = 'dvgo_kitchen_unbounded'\n\ndata = dict(\n    datadir='./data/360_v2/kitchen',\n    factor=2, # 1558x1039\n    movie_render_kwargs=dict(\n        shift_y=-0.0,\n        scale_r=0.9,\n        pitch_deg=-40,\n    ),\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/kitchen_single.py",
    "content": "_base_ = './nerf_unbounded_default.py'\nexpname = 'kitchen_nov29_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/360'\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\ncoarse_iter = 0\nfast_color_thres={   # default\n        '_delete_': True,                           # to ignore the base config\n        0   : alpha_init*stepsize/10,               # 0.5e-5\n        1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n        2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    }\nmaskout_near_cam_vox = False\npervoxel_lr = False\nweight_distortion = 0.01\n\n\ndata = dict(\n    dataset_type='llff',\n    datadir='./data/360_v2/kitchen',\n    # factor=2, # 1558x1039\n    factor=8, # 1558x1039\n    movie_render_kwargs=dict(\n        shift_y=-0.0,\n        scale_r=0.9,\n        pitch_deg=-40,\n    ),\n)\n\ncoarse_train = dict(N_iters=0)\nfine_train = dict(\n    N_iters=100000,\n    N_rand=2048,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=weight_distortion,\n    pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    weight_main=1.0,\n    # weight_freq=0.1,\n)\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n     
   1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/nerf_unbounded_default.py",
    "content": "_base_ = '../default.py'\n\nbasedir = './logs/nerf_unbounded'\n\ndata = dict(\n    dataset_type='llff',\n    spherify=True,\n    factor=4,\n    llffhold=8,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=True,\n    load2gpu_on_the_fly=True,\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=40000,\n    N_rand=4096,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=0.01,\n    pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n)\n\nalpha_init = 1e-4\nstepsize = 0.5\n\nfine_model_and_render = dict(\n    num_voxels=320**3,\n    num_voxels_base=320**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/room.py",
    "content": "_base_ = './nerf_unbounded_default.py'\n\nexpname = 'dvgo_room_unbounded'\n\ndata = dict(\n    datadir='./data/360_v2/room',\n    factor=2, # 1557x1038\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.3, # negative down\n        shift_z=0,\n        scale_r=0.2,\n        pitch_deg=-40, # negative look downward\n    ),\n)\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/room_single.py",
    "content": "_base_ = './nerf_unbounded_default.py'\nexpname = 'room_nov25_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/360'\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\ncoarse_iter = 0\nfast_color_thres={   # default\n        '_delete_': True,                           # to ignore the base config\n        0   : alpha_init*stepsize/10,               # 0.5e-5\n        1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n        2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    }\nmaskout_near_cam_vox = False\npervoxel_lr = False\n# weight_distortion = 0.01\nweight_distortion = 0.02\ndata = dict(\n    dataset_type='llff',\n    spherify=True,\n    factor=8,\n    llffhold=8,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=True,\n    load2gpu_on_the_fly=True,\n    datadir='./data/360_v2/room',\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.3, # negative down\n        shift_z=0,\n        scale_r=0.2,\n        pitch_deg=-40, # negative look downward\n    ),\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=100000,\n    N_rand=2048,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=weight_distortion,\n    pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    weight_main=1.0,\n    # weight_freq=0.1,\n)\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    
num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres={\n        '_delete_': True,\n        0   : alpha_init*stepsize/10,\n        1500: min(alpha_init, 1e-4)*stepsize/5,\n        2500: min(alpha_init, 1e-4)*stepsize/2,\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,\n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    },\n    world_bound_scale=1,\n)\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/stump.py",
    "content": "_base_ = './nerf_unbounded_default.py'\n\nexpname = 'dvgo_stump_unbounded'\n\ndata = dict(\n    datadir='./data/360_v2/stump',\n    factor=4,\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.2, # negative down\n        shift_z=0,\n        scale_r=0.8,\n        pitch_deg=-20, # negative look downward\n    ),\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nerf_unbounded/stump_single.py",
    "content": "_base_ = './nerf_unbounded_default.py'\nexpname = 'stump_may23_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/360'\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\ncoarse_iter = 0\nfast_color_thres={   # default\n        '_delete_': True,                           # to ignore the base config\n        0   : alpha_init*stepsize/10,               # 0.5e-5\n        1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n        2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n        3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n        4500: min(alpha_init, 1e-4)*stepsize,\n        5500: min(alpha_init, 1e-4),\n        6500: 1e-4,\n    }\nmaskout_near_cam_vox = False\npervoxel_lr = False\nweight_distortion = 0.01\ndata = dict(\n    dataset_type='llff',\n    datadir='./data/360_v2/stump',\n    factor=16,\n    movie_render_kwargs=dict(\n        shift_x=0.0,  # positive right\n        shift_y=-0.2, # negative down\n        shift_z=0,\n        scale_r=0.8,\n        pitch_deg=-20, # negative look downward\n    ),\n)\n\ncoarse_train = dict(N_iters=0)\n\nfine_train = dict(\n    N_iters=100000,\n    N_rand=2048,\n    lrate_decay=80,\n    ray_sampler='flatten',\n    weight_nearclip=1.0,\n    weight_distortion=weight_distortion,\n    # pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000],\n    pg_scale=[2000,4000,6000,8000,10000,12000,14000,16000],\n    # pg_scale=[10],  # test memory\n    tv_before=20000,\n    tv_dense_before=20000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    weight_main=1.0,\n    weight_freq=5.0,\n)\n\nvoxel_size_density = 250  # default 250\nvoxel_size_rgb = 250  # default 322500\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    
num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fourier_freq_num=3,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n)\n"
  },
  {
    "path": "FourierGrid/configs/nsvf/Bike.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Bike'\nbasedir = './logs/nsvf_synthetic'\n\ndata = dict(\n    datadir='./data/Synthetic_NSVF/Bike',\n    dataset_type='nsvf',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nsvf/Lifestyle.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Lifestyle'\nbasedir = './logs/nsvf_synthetic'\n\ndata = dict(\n    datadir='./data/Synthetic_NSVF/Lifestyle',\n    dataset_type='nsvf',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nsvf/Palace.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Palace'\nbasedir = './logs/nsvf_synthetic'\n\ndata = dict(\n    datadir='./data/Synthetic_NSVF/Palace',\n    dataset_type='nsvf',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nsvf/Robot.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Robot'\nbasedir = './logs/nsvf_synthetic'\n\ndata = dict(\n    datadir='./data/Synthetic_NSVF/Robot',\n    dataset_type='nsvf',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nsvf/Spaceship.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Spaceship'\nbasedir = './logs/nsvf_synthetic'\n\ndata = dict(\n    datadir='./data/Synthetic_NSVF/Spaceship',\n    dataset_type='nsvf',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nsvf/Steamtrain.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Steamtrain'\nbasedir = './logs/nsvf_synthetic'\n\ndata = dict(\n    datadir='./data/Synthetic_NSVF/Steamtrain',\n    dataset_type='nsvf',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nsvf/Toad.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Toad'\nbasedir = './logs/nsvf_synthetic'\n\ndata = dict(\n    datadir='./data/Synthetic_NSVF/Toad',\n    dataset_type='nsvf',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/nsvf/Wineholder.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Wineholder'\nbasedir = './logs/nsvf_synthetic'\n\ndata = dict(\n    datadir='./data/Synthetic_NSVF/Wineholder',\n    dataset_type='nsvf',\n    inverse_y=True,\n    white_bkgd=True,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Barn.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Barn'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Barn',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Barn_lg.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Barn_lg'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Barn',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n    movie_render_kwargs={'flip_up_vec': True},\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n)\n\nfine_train = dict(pg_scale=[1000,2000,3000,4000,5000,6000])\nfine_model_and_render = dict(num_voxels=256**3)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Caterpillar.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Caterpillar'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Caterpillar',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Caterpillar_lg.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Caterpillar_lg'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Caterpillar',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n)\n\nfine_train = dict(pg_scale=[1000,2000,3000,4000,5000,6000])\nfine_model_and_render = dict(num_voxels=256**3)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Family.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Family'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Family',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Family_lg.py",
    "content": "_base_ = '../default.py'\n# model='FourierGrid'\nmodel='DVGO'\nexpname = 'dvgo_Family_lg'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Family',\n    dataset_type='tankstemple',  # note: this means bounded\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n    movie_render_kwargs={'pitch_deg': 20},\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n    pervoxel_lr=True,  # DVGO default is True\n)\n\nfine_train = dict(\n    pg_scale=[1000,2000,3000,4000,5000,6000],\n    pervoxel_lr=True,   # DVGO default is True\n    )\n\nfine_model_and_render = dict(num_voxels_density=256**3, num_voxels_rgb=256**3, fourier_freq_num=3,\n                             num_voxels_base_rgb=160**3, num_voxels_base_density=160**3)\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Ignatius.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Ignatius'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Ignatius',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Ignatius_lg.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Ignatius_lg'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Ignatius',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n)\n\nfine_train = dict(pg_scale=[1000,2000,3000,4000,5000,6000])\nfine_model_and_render = dict(num_voxels=256**3)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Truck.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Truck'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Truck',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/Truck_lg.py",
    "content": "_base_ = '../default.py'\n\nexpname = 'dvgo_Truck_lg'\nbasedir = './logs/tanks_and_temple'\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Truck',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n    movie_render_kwargs={'flip_up_vec': True, 'shift_y': -0.1},\n)\n\ncoarse_train = dict(\n    pervoxel_lr_downrate=2,\n)\n\nfine_train = dict(pg_scale=[1000,2000,3000,4000,5000,6000])\nfine_model_and_render = dict(num_voxels=256**3)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/barn_single.py",
    "content": "_base_ = '../default.py'\nmodel='FourierGrid'\nexpname = 'Barn_nov25_'\nbasedir = './logs/tanks_and_temple'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\n\nif visualize_poses:  # for debugging\n    unbounded_inward = True\n    coarse_iter = 3000\n    fast_color_thres=stepsize/_mpi_depth/5\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.0\nelse:\n    unbounded_inward = True\n    coarse_iter = 0\n    fast_color_thres={   # default\n            '_delete_': True,                           # to ignore the base config\n            0   : alpha_init*stepsize/10,               # 0.5e-5\n            1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n            2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.01\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Barn',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=unbounded_inward,\n    unbounded_inner_r=1.0,\n    ndc=False,\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter, \n    pervoxel_lr = pervoxel_lr,\n)\n\nfine_train = dict(\n    # N_iters=3000,\n    N_iters=100000,\n    # N_rand=2048,  # reduce this to fit into memory\n    N_rand=4096,  # default\n    ray_sampler='flatten',\n    # ray_sampler='random',\n    weight_distortion=weight_distortion,\n    pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000],\n    tv_before=1e9,  # always use tv\n    tv_dense_before=10000,\n    tv_after=0, # start from beginning\n    tv_every=1,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    
pervoxel_lr=False,\n    lrate_decay=20,               # default\n    lrate_density=1e-1,           # default lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # default lr of the mlp to preduct view-dependent color\n    weight_entropy_last=1e-3,     # default\n    weight_rgbper=1e-2,           # default\n    weight_nearclip=0,\n    weight_main=1.0,              # default = 1\n    weight_freq=0.3,            \n)\n\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n)\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = -1\n# voxel_size_viewdir = 64\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    # contracted_norm='l2', # default\n    # rgbnet_dim=3,\n    rgbnet_dim=12, # default\n    rgbnet_depth=3, # default\n    # viewbase_pe=4, # default=4\n    bbox_thres=0.001,\n    maskout_near_cam_vox=False,\n    # bg_len=0.2,   # default=0.2\n)\n\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/caterpillar_single.py",
    "content": "_base_ = '../default.py'\nmodel='FourierGrid'\nexpname = 'Caterpillar_nov24_'\nbasedir = './logs/tanks_and_temple'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nif visualize_poses:  # for debugging\n    unbounded_inward = True\n    coarse_iter = 3000\n    fast_color_thres=stepsize/_mpi_depth/5\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.0\nelse:\n    unbounded_inward = True\n    coarse_iter = 0\n    fast_color_thres={   # default\n            '_delete_': True,                           # to ignore the base config\n            0   : alpha_init*stepsize/10,               # 0.5e-5\n            1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n            2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.01\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Caterpillar',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=unbounded_inward,\n    unbounded_inner_r=1.0,\n    ndc=False,\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter, \n    pervoxel_lr = pervoxel_lr,\n)\n\nfine_train = dict(\n    # N_iters=3000,\n    N_iters=100000,\n    # N_rand=2048,  # reduce this to fit into memory\n    N_rand=4096,  # default\n    ray_sampler='flatten',\n    # ray_sampler='random',\n    weight_distortion=weight_distortion,\n    pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000],\n    tv_before=1e9,  # always use tv\n    tv_dense_before=10000,\n    tv_after=0, # start from beginning\n    tv_every=1,\n    weight_tv_density=1e-6,\n    
weight_tv_k0=1e-7,\n    pervoxel_lr=False,\n    lrate_decay=20,               # default\n    lrate_density=1e-1,           # default lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # default lr of the mlp to preduct view-dependent color\n    weight_entropy_last=1e-3,     # default\n    weight_rgbper=1e-2,           # default\n    weight_nearclip=0,\n    weight_main=1.0,              # default = 1\n    weight_freq=0.3,            \n)\n\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n)\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = -1\n# voxel_size_viewdir = 64\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    # contracted_norm='l2', # default\n    # rgbnet_dim=3,\n    rgbnet_dim=12, # default\n    rgbnet_depth=3, # default\n    # viewbase_pe=4, # default=4\n    bbox_thres=0.001,\n    maskout_near_cam_vox=False,\n    # bg_len=0.2,   # default=0.2\n)\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple/family_single.py",
    "content": "_base_ = '../default.py'\nexpname = 'family_may23_'\nmodel='FourierGrid'\nbasedir = './logs/tanks_and_temple'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nbasedir = './logs/tanks_and_temple_unbounded'\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nif visualize_poses:  # for debugging\n    unbounded_inward = True\n    coarse_iter = 3000\n    fast_color_thres=stepsize/_mpi_depth/5\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.0\nelse:\n    unbounded_inward = True\n    coarse_iter = 0\n    # fast_color_thres={\n    #         '_delete_': True,                           # to ignore the base config\n    #         0   : 1e-4,                                 # 0.5e-5\n    #     }\n    fast_color_thres={   # default\n            '_delete_': True,                           # to ignore the base config\n            0   : alpha_init*stepsize/10,               # 0.5e-5\n            1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n            2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.01\n\ndata = dict(\n    datadir='./data/TanksAndTemple/Family',\n    dataset_type='tankstemple',\n    inverse_y=True,\n    load2gpu_on_the_fly=True,\n    white_bkgd=True,\n    movie_render_kwargs={'pitch_deg': 20},\n    unbounded_inward=unbounded_inward,\n    unbounded_inner_r=1.0,\n    ndc=False,\n)\n\ncoarse_train = dict(\n    # pervoxel_lr_downrate=2,\n    N_iters=coarse_iter, \n    pervoxel_lr = pervoxel_lr,\n)\n\n\nfine_train = dict(\n    # N_iters=3000,\n    N_iters=100000,\n    # N_rand=2048,  # reduce this to fit into memory\n    N_rand=4096,  # default\n    ray_sampler='flatten',\n    # 
ray_sampler='random',\n    weight_distortion=weight_distortion,\n    pg_scale=[1000,2000,3000,4000,5000,6000],\n    tv_before=1e9,  # always use tv\n    tv_dense_before=10000,\n    tv_after=0, # start from beginning\n    tv_every=1,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    pervoxel_lr=False,\n    lrate_decay=20,               # default\n    lrate_density=1e-1,           # default lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # default lr of the mlp to preduct view-dependent color\n    weight_entropy_last=1e-3,     # default\n    weight_rgbper=1e-2,           # default\n    weight_nearclip=0,\n    weight_main=1.0,              # default = 1\n    weight_freq=0.0,            \n)\n\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n)\n\nvoxel_size_density = 160  # default 400\nvoxel_size_rgb = 160  # default 320\nvoxel_size_viewdir = -1\n# voxel_size_viewdir = 64\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    # contracted_norm='l2', # default\n    # rgbnet_dim=3,\n    rgbnet_dim=12, # default\n    rgbnet_depth=3, # default\n    # viewbase_pe=4, # default=4\n    bbox_thres=0.001,\n    maskout_near_cam_vox=False,\n    # bg_len=0.2,   # default=0.2\n)\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple_unbounded/M60.py",
    "content": "_base_ = './tt_default.py'\n\nexpname = 'dvgo_M60_unbounded'\n\ndata = dict(\n    datadir='./data/tanks_and_temples/tat_intermediate_M60',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple_unbounded/Playground.py",
    "content": "_base_ = './tt_default.py'\n\nexpname = 'oct22_dvgo_Playground_unbounded_baseline'\n\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\ndata = dict(\n    datadir='./data/tanks_and_temples/tat_intermediate_Playground',\n)\n\nfine_train = dict(\n    N_iters=40000,  # a quick validation\n)"
  },
  {
    "path": "FourierGrid/configs/tankstemple_unbounded/Train.py",
    "content": "_base_ = './tt_default.py'\n\nexpname = 'dvgo_Train_unbounded'\n\ndata = dict(\n    datadir='./data/tanks_and_temples/tat_intermediate_Train',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple_unbounded/Truck.py",
    "content": "_base_ = './tt_default.py'\n\nexpname = 'dvgo_Truck_unbounded'\n\ndata = dict(\n    datadir='./data/tanks_and_temples/tat_training_Truck',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple_unbounded/m60_single.py",
    "content": "_base_ = '../default.py'\nexpname = 'm60_may26_'\nmodel='FourierGrid'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nbasedir = './logs/tanks_and_temple_unbounded'\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nif visualize_poses:  # for debugging\n    unbounded_inward = True\n    coarse_iter = 3000\n    fast_color_thres=stepsize/_mpi_depth/5\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.0\nelse:\n    unbounded_inward = True\n    coarse_iter = 0\n    fast_color_thres={   # default\n            '_delete_': True,                           # to ignore the base config\n            0   : alpha_init*stepsize/10,               # 0.5e-5\n            1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n            2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = -1\n\ndata = dict(\n    dataset_type='nerfpp',\n    inverse_y=True,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=True,\n    datadir='./data/tanks_and_temples/tat_intermediate_M60',\n    unbounded_inner_r=1.0,\n    ndc=False,\n    # # remove noisy training images\n    # training_ids=[106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,\n    #               121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,\n    #               136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,\n    #               151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 221, 222, 223, 224,\n    #               225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,\n    #               240, 241, 242, 243, 
244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,\n    #               255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,\n    #               270, 271, 272, 273, 274, 275]\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter, \n    pervoxel_lr = pervoxel_lr,\n)\n\nfine_train = dict(\n    # N_iters=3000,\n    N_iters=30000,\n    # N_rand=2048,  # reduce this to fit into memory\n    N_rand=4096,  # default\n    ray_sampler='flatten',\n    # ray_sampler='random',\n    weight_distortion=weight_distortion,\n    pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000],\n    # pg_scale=[10],  # test memory\n    tv_before=1e9,  # always use tv\n    tv_dense_before=10000,\n    tv_after=0, # start from beginning\n    tv_every=1,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    pervoxel_lr=False,\n    lrate_decay=20,               # default\n    lrate_density=1e-1,           # default lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # default lr of the mlp to preduct view-dependent color\n    weight_entropy_last=1e-3,     # default\n    weight_rgbper=1e-2,           # default\n    weight_nearclip=0,\n    weight_main=1.0,              # default = 1\n    weight_freq=0.0,            \n)\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n)\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    # contracted_norm='l2', # default\n    # rgbnet_dim=3,\n    fourier_freq_num=3,\n    rgbnet_dim=12, # default\n    
rgbnet_depth=3, # default\n    # viewbase_pe=4, # default=4\n    bbox_thres=0.001,\n    maskout_near_cam_vox=False,\n    # bg_len=0.2,   # default=0.2\n)\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple_unbounded/playground_single.py",
    "content": "_base_ = '../default.py'\nexpname = 'oct29_dvgo_Playground_unbounded_baseline'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/tanks_and_temple_unbounded'\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nif visualize_poses:  # for debugging\n    unbounded_inward = True\n    coarse_iter = 3000\n    fast_color_thres=stepsize/_mpi_depth/5\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.0\nelse:\n    unbounded_inward = True\n    coarse_iter = 0\n    # fast_color_thres={\n    #         '_delete_': True,                           # to ignore the base config\n    #         0   : 1e-4,                                 # 0.5e-5\n    #     }\n    fast_color_thres={   # default\n            '_delete_': True,                           # to ignore the base config\n            0   : alpha_init*stepsize/10,               # 0.5e-5\n            1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n            2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.01\n\ndata = dict(\n    dataset_type='nerfpp',\n    inverse_y=True,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=True,\n    datadir='./data/tanks_and_temples/tat_intermediate_Playground',\n    unbounded_inner_r=1.0,\n    ndc=False,\n    # remove noisy training images\n    training_ids=[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,\\\n        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 61, \\\n                62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, \\\n                    80, 81, 82, 83, 84, 85, 86, 87, 
88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,\\\n                       115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 133, \\\n                                134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 203, 204, 205, 206, 207, 208, 209, \\\n                                210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, \\\n                                    279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289]\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter,\n    pervoxel_lr = pervoxel_lr,\n)\n\nfine_train = dict(\n    N_iters=100000,\n    N_rand=2048,  # reduce this to fit into memory\n    # N_rand=4096,  # default\n    ray_sampler='flatten',\n    weight_distortion=weight_distortion,\n    pg_scale=[1000,2000,3000,4000,5000,6000,7000],\n    tv_before=1e9,  # always use tv\n    tv_dense_before=10000,\n    tv_after=0, # start from beginning\n    tv_every=1,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    pervoxel_lr=False,\n    lrate_decay=20,               # default\n    lrate_density=1e-1,           # default lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # default lr of the mlp to preduct view-dependent color\n    weight_entropy_last=1e-3,     # default\n    weight_rgbper=1e-2,           # default\n    weight_nearclip=0,\n    weight_main=3.0,              # default = 1\n    # weight_freq=0.0,            \n    weight_freq=1.0,            \n)\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n)\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200 
 # default 320\nvoxel_size_viewdir = -1\n# voxel_size_viewdir = 64\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    # contracted_norm='l2', # default\n    # rgbnet_dim=3,\n    rgbnet_dim=12, # default\n    rgbnet_depth=3, # default\n    # viewbase_pe=2, # default=4\n    bbox_thres=-1,\n    maskout_near_cam_vox=False,\n    # bg_len=0.2,   # default=0.2\n)\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple_unbounded/train_single.py",
    "content": "_base_ = '../default.py'\nexpname = 'train_nov21_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/tanks_and_temple_unbounded'\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nif visualize_poses:  # for debugging\n    unbounded_inward = True\n    coarse_iter = 3000\n    fast_color_thres=stepsize/_mpi_depth/5\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.0\nelse:\n    unbounded_inward = True\n    coarse_iter = 0\n    # fast_color_thres={\n    #         '_delete_': True,                           # to ignore the base config\n    #         0   : 1e-4,                                 # 0.5e-5\n    #     }\n    fast_color_thres={   # default\n            '_delete_': True,                           # to ignore the base config\n            0   : alpha_init*stepsize/10,               # 0.5e-5\n            1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n            2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.01\n\ndata = dict(\n    dataset_type='nerfpp',\n    inverse_y=True,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=True,\n    datadir='./data/tanks_and_temples/tat_intermediate_Train',\n    unbounded_inner_r=1.0,\n    ndc=False,\n    # # remove noisy training images\n    # training_ids=[106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,\n    #               121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,\n    #               136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,\n    #               151, 152, 
153, 154, 155, 156, 157, 158, 159, 160, 161, 221, 222, 223, 224,\n    #               225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,\n    #               240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,\n    #               255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,\n    #               270, 271, 272, 273, 274, 275]\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter, \n    pervoxel_lr = pervoxel_lr,\n)\n\nfine_train = dict(\n    # N_iters=3000,\n    N_iters=30000,\n    # N_rand=2048,  # reduce this to fit into memory\n    N_rand=4096,  # default\n    ray_sampler='flatten',\n    # ray_sampler='random',\n    weight_distortion=weight_distortion,\n    pg_scale=[1000, 2000, 3000, 4000, 5000, 6000, 7000],\n    tv_before=1e9,  # always use tv\n    tv_dense_before=10000,\n    tv_after=0, # start from beginning\n    tv_every=1,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    pervoxel_lr=False,\n    lrate_decay=20,               # default\n    lrate_density=1e-1,           # default lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # default lr of the mlp to preduct view-dependent color\n    weight_entropy_last=1e-3,     # default\n    weight_rgbper=1e-2,           # default\n    weight_nearclip=0,\n    weight_main=1.0,              # default = 1\n    weight_freq=0.0,            \n)\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n)\n\nvoxel_size_density = 250  # default 400\nvoxel_size_rgb = 250  # default 320\nvoxel_size_viewdir = -1\n# voxel_size_viewdir = 64\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    
stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    # contracted_norm='l2', # default\n    # rgbnet_dim=3,\n    rgbnet_dim=15, # default\n    # rgbnet_depth=3, # default\n    viewbase_pe=4, # default=4\n    bbox_thres=0.001,\n    maskout_near_cam_vox=False,\n    bg_len=0.2,   # default=0.2\n)\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple_unbounded/truck_single.py",
    "content": "_base_ = '../default.py'\nexpname = 'truck_may28_'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\nmodel='FourierGrid'\nbasedir = './logs/tanks_and_temple_unbounded'\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nif visualize_poses:  # for debugging\n    unbounded_inward = True\n    coarse_iter = 3000\n    fast_color_thres=stepsize/_mpi_depth/5\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.0\nelse:\n    unbounded_inward = True\n    coarse_iter = 0\n    # fast_color_thres={\n    #         '_delete_': True,                           # to ignore the base config\n    #         0   : 1e-4,                                 # 0.5e-5\n    #     }\n    fast_color_thres={   # default\n            '_delete_': True,                           # to ignore the base config\n            0   : alpha_init*stepsize/10,               # 0.5e-5\n            1500: min(alpha_init, 1e-4)*stepsize/5,     # 1e-5\n            2500: min(alpha_init, 1e-4)*stepsize/2,     # 2.5e-5\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,   \n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.01\n\ndata = dict(\n    dataset_type='nerfpp',\n    inverse_y=True,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=True,\n    datadir='./data/tanks_and_temples/tat_training_Truck',\n    unbounded_inner_r=1.0,\n    ndc=False,\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter, \n    pervoxel_lr = pervoxel_lr,\n)\n\nfine_train = dict(\n    N_iters=30000,\n    # N_rand=2048,  # reduce this to fit into memory\n    N_rand=4096,  # default\n    ray_sampler='flatten',\n    # ray_sampler='random',\n    weight_distortion=weight_distortion,\n    pg_scale=[1000, 2000, 3000, 4000, 5000, 
6000, 7000],\n    # pg_scale=[],\n    tv_before=1e9,  # always use tv\n    tv_dense_before=10000,\n    tv_after=0, # start from beginning\n    tv_every=1,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    pervoxel_lr=False,\n    lrate_decay=20,               # default\n    lrate_density=1e-1,           # default lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # default lr of the mlp to preduct view-dependent color\n    weight_entropy_last=1e-3,     # default\n    weight_rgbper=1e-2,           # default\n    weight_nearclip=0,\n    weight_main=1.0,              # default = 1\n    weight_freq=0.0,            \n)\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n)\n\nvoxel_size_density = 200  # default 400\nvoxel_size_rgb = 200  # default 320\nvoxel_size_viewdir = -1\n# voxel_size_viewdir = 64\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    # contracted_norm='l2', # default\n    rgbnet_dim=12, # default\n    fourier_freq_num=4,\n    rgbnet_depth=3, # default\n    # viewbase_pe=4, # default=4\n    bbox_thres=0.001, # should not matter\n    maskout_near_cam_vox=False,\n    # bg_len=0.2,   # default=0.2\n)\n"
  },
  {
    "path": "FourierGrid/configs/tankstemple_unbounded/tt_default.py",
    "content": "_base_ = '../default.py'\nmodel='FourierGrid'\nbasedir = './logs/tanks_and_temple_unbounded'\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nif visualize_poses:  # for debugging\n    unbounded_inward = True\n    coarse_iter = 3000\n    fast_color_thres=stepsize/_mpi_depth/5\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.0\nelse:\n    unbounded_inward = True\n    coarse_iter = 0\n    fast_color_thres={\n            '_delete_': True,\n            0   : alpha_init*stepsize/10,\n            1500: min(alpha_init, 1e-4)*stepsize/5,\n            2500: min(alpha_init, 1e-4)*stepsize/2,\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,\n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    maskout_near_cam_vox = False\n    pervoxel_lr = False\n    weight_distortion = 0.01\n\ndata = dict(\n    dataset_type='nerfpp',\n    inverse_y=True,\n    white_bkgd=True,\n    rand_bkgd=True,\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=True,\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter,\n    pervoxel_lr = pervoxel_lr,\n)\n\nfine_train = dict(\n    N_iters=30000,\n    N_rand=4096,\n    ray_sampler='flatten',\n    weight_distortion=weight_distortion,\n    pg_scale=[1000,2000,3000,4000,5000,6000,7000],\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n)\n\nalpha_init = 1e-4\nstepsize = 0.5\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n)\n\nfine_model_and_render = dict(\n    num_voxels=320**3,\n    num_voxels_base=320**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    contracted_norm='l2',\n)\n\n"
  },
  {
    "path": "FourierGrid/configs/waymo/block_0_llff.py",
    "content": "_base_ = './default_waymo.py'\nmodel='FourierGrid'\nexpname = 'sep13_waymo'\n\ndata = dict(\n    datadir='data/sep13_block0/dense',\n    factor=2,\n    movie_render_kwargs={\n        'scale_r': 1.0,\n        'scale_f': 0.8,\n        'zrate': 2.0,\n        'zdelta': 0.5,\n    }\n)\n\nfine_train = dict(\n    N_iters=300000,\n)"
  },
  {
    "path": "FourierGrid/configs/waymo/block_0_tt.py",
    "content": "_base_ = './tankstemple_base.py'\nmodel='FourierGrid'\nexpname = 'sep15_waymo_5_images_vis_poses'\n\ndata = dict(\n    datadir='data/sep13_block0/dense',\n    factor=2,\n    movie_render_kwargs={ # not tuned well\n        'scale_r': 1.0,\n        'scale_f': 0.8,\n        'zrate': 2.0,\n        'zdelta': 0.5,\n    }\n)\n\nfine_train = dict(\n    N_iters=30000, # 30k is for quick validation\n)"
  },
  {
    "path": "FourierGrid/configs/waymo/waymo_base.py",
    "content": "_base_ = '../default.py'\nbasedir = './logs/waymo'\nmodel='FourierGrid'\nvisualize_poses = False\nalpha_init = 1e-2  # default: 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nmaskout_near_cam_vox = False  # changed\npervoxel_lr = False\nunbounded_inward = True\nif visualize_poses:  # for debugging only\n    coarse_iter = 600\n    fast_color_thres=stepsize/_mpi_depth/5\n    weight_distortion = 0.0\nelse:\n    coarse_iter = 0\n    fast_color_thres={\n            '_delete_': True,\n            0   : alpha_init*stepsize/10,\n            1500: min(alpha_init, 1e-4)*stepsize/5,\n            2500: min(alpha_init, 1e-4)*stepsize/2,\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,\n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    weight_distortion = 0.01\n\ndata = dict(\n    dataset_type='waymo',\n    inverse_y=True,\n    white_bkgd=True,     # almost no effect when rand_bkgd=True\n    rand_bkgd=False,      # random background\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=True,\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter,\n    pervoxel_lr = pervoxel_lr,\n    ray_sampler='flatten',\n)\n\nfine_train = dict(\n    N_iters=30000,\n    N_rand=4096,\n    ray_sampler='flatten',\n    # N_rand=4096,\n    # ray_sampler='random',\n    weight_distortion=weight_distortion,\n    pg_scale=[1000,2000,3000,4000,5000,6000,7000],\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n)\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n    bbox_thres=1e-10,  # display all the bboxes\n)\n\nfine_model_and_render = dict(\n    num_voxels=320**3,\n    num_voxels_base=320**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    contracted_norm='l2',\n    # rgbnet_dim=-1,  # would affect performance but as an intial attempt\n   
 rgbnet_dim=12, # default\n    rgbnet_direct=True,\n    density_type='DenseGrid',\n    k0_type='DenseGrid',\n    bg_len=0.2,  # very important\n    viewbase_pe=8,\n    maskout_near_cam_vox=True,\n    # # TensorRF settings\n    # density_type='TensoRFGrid', \n    # k0_type='TensoRFGrid', \n    # density_config=dict(n_comp=8),\n    # k0_config=dict(n_comp=24),\n)\n"
  },
  {
    "path": "FourierGrid/configs/waymo/waymo_block.py",
    "content": "_base_ = './waymo_base.py'\nmodel='FourierGrid'\ncam_id = 73\nexpname = f'oct99_waymo_{cam_id}_tt'\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\n\ndata = dict(\n    datadir='data/sep19_ordered_dataset',\n    factor=2,\n    # near_clip = 0.0356,\n    near_clip = 0.1,\n    near = 0.1,\n    # far = 1,\n    far = 0.01,\n    # movie_render_kwargs={\n    #     'scale_r': 1.0,\n    #     'scale_f': 0.8,\n    #     'zrate': 2.0,\n    #     'zdelta': 0.5,\n    # },\n    sample_cam=cam_id,\n    test_rotate_angle=8, # rotate angle in testing phase\n    # sample_idxs=[1127, 11009, 9805, 9426, 5859, 6315]\n    sample_interval=1,\n    num_per_block=5,  # run this num in block\n)\n\nfine_train = dict(\n    # N_iters=600, # for quick validation\n    N_iters=40000, # 40k for whole training procedure\n    # pg_scale=[1000,2000,3000,4000,5000,6000,7000], # default\n    pg_scale=[1000,2000,3000,4000,5000,],\n)\n"
  },
  {
    "path": "FourierGrid/configs/waymo/waymo_no_block.py",
    "content": "_base_ = '../default.py'\nmodel='FourierGrid'\nbasedir = 'logs/waymo'\nvisualize_poses = False\nalpha_init = 1e-4\nstepsize = 0.5\n_mpi_depth = 256\nmaskout_near_cam_vox = False  # changed\npervoxel_lr = False\nunbounded_inward = True\nexpname = f'oct29_waymo'\nif visualize_poses:  # for debugging only\n    coarse_iter = 600\n    fast_color_thres=stepsize/_mpi_depth/5\n    weight_distortion = 0.0\nelse:\n    coarse_iter = 0\n    fast_color_thres={\n            '_delete_': True,\n            0   : alpha_init*stepsize/10,\n            1500: min(alpha_init, 1e-4)*stepsize/5,\n            2500: min(alpha_init, 1e-4)*stepsize/2,\n            3500: min(alpha_init, 1e-4)*stepsize/1.5,\n            4500: min(alpha_init, 1e-4)*stepsize,\n            5500: min(alpha_init, 1e-4),\n            6500: 1e-4,\n        }\n    weight_distortion = -1\n\ndata = dict(\n    dataset_type='waymo',\n    inverse_y=True,\n    white_bkgd=True,     # almost no effect when rand_bkgd=True\n    rand_bkgd=True,      # random background\n    unbounded_inward=unbounded_inward,\n    load2gpu_on_the_fly=True,\n    datadir='data/sep19_ordered_dataset',\n    factor=2,\n    near_clip = 0.1,\n    near = 0.1,\n    far = 0.01,\n    # sample_cam=cam_id,\n    test_rotate_angle=360, # rotate angle in testing phase\n    sample_interval=1,\n    num_per_block=-1,  # run this num in block\n    unbounded_inner_r=0.8,\n    # three views\n    # training_ids=['69_0', '71_0', '73_0'], \n    training_ids=['73_0', '73_1', '73_2', '73_3', '73_4', '73_5', '73_6', '73_7', '73_8', '73_9', \\\n        '73_10', '73_11', '73_12', '73_13', '73_14', '73_15', '73_16', '73_17', '73_18', '73_19', \\\n            '73_20', '73_21', '73_22', '73_23', '73_24', '73_25', '73_26', '73_27', '73_28', '73_29', \\\n                '73_30', '73_31', '73_32', '73_33', '73_34', '73_35', '73_36', '73_37', '73_38', '73_39', \\\n                    '73_40', '73_41', '73_42', '73_43', '73_44', '73_45', '73_46', '73_47', '73_48', 
'73_49'],\n    tunning_id = '71_0',\n    search_rot_lower = [129, -2, -2],\n    search_rot_upper = [133, 2, 2],\n    search_pos_lower = [0.0, -0.01, -0.01],\n    search_pos_upper = [0.04, 0.01, 0.01],\n    search_num = 10**4,\n    # assign_pos = {\n    #     '69_0': [0.0, 0.0, 0.0], \n    #               '71_0': [0.03251668821656803, 0.001401165785078217, 0.00560227169424881], \n    #               '73_0': [0.0, 0.0, 0.0]\n    #               },\n    # assign_rot = {\n        # '69_0': [175, 0.0, 0.0],\n                # '71_0': [132.27749560775322, -1.1274407139317342, -0.42476203263358325],\n                # '73_0': [85.2267753, 0.0, 0.0]\n                # },\n    # assign_pos = {\n    #     '69_0': [0.0, 0.0, 0.0], \n    #     '69_10': [0.0, -3.55 * 0.01, 0.0], \n    #     '69_20': [0.0, -4.45 * 0.01, 0.0], \n    #     '69_30': [0.0, -5.84 * 0.01, 0.0], \n    #     '69_40': [0.0, -6.99 * 0.01, 0.0], \n    #     '69_50': [0.0, -8.66 * 0.01, 0.0],\n    # },\n    # assign_rot = {\n    #     '69_0': [175, 0.0, 0.0],\n    #     '69_10': [175, 0.0, 0.0],\n    #     '69_20': [175, 0.0, 0.0], \n    #     '69_30': [175, 0.0, 0.0], \n    #     '69_40': [175, 0.0, 0.0], \n    #     '69_50': [175, 0.0, 0.0],\n    # }\n)\n\ncoarse_train = dict(\n    N_iters=coarse_iter,\n    pervoxel_lr = pervoxel_lr,\n    ray_sampler='flatten',\n)\n\nfine_train = dict(\n    N_iters_m_step=1500,            # search via sfm\n    N_iters=3000,\n    # N_iters=10*(10**4),\n    N_rand=2048,\n    ray_sampler='flatten',\n    weight_distortion=weight_distortion,\n    pg_scale=[3000, 4000, 5000, 6000, 7000],\n    tv_before=1e9,\n    tv_dense_before=10000,\n    weight_tv_density=1e-6,\n    weight_tv_k0=1e-7,\n    # added\n    pervoxel_lr=False,\n    lrate_decay=20,               # default\n    lrate_density=1e-1,           # default lr of density voxel grid\n    lrate_k0=1e-1,                # lr of color/feature voxel grid\n    lrate_rgbnet=1e-3,            # default lr of the mlp to preduct 
view-dependent color\n    weight_entropy_last=1e-3,     # default\n    weight_rgbper=1e-2,           # default\n    weight_nearclip=0,\n    weight_main=3.0,              # default = 1\n    weight_freq=1.0,       \n)\n\ndiffusion = dict(\n    diff_root = 'diffusion',\n    diff_replace = {'69_0': 'airplane'}    \n)\n\ncoarse_model_and_render = dict(\n    maskout_near_cam_vox = maskout_near_cam_vox,\n    bbox_thres=1e-10,  # display all the bboxes\n)\n\nvoxel_size_density = 300  # default 400\nvoxel_size_rgb = 300  # default 320\nvoxel_size_viewdir = -1\n\nfine_model_and_render = dict(\n    num_voxels_density=voxel_size_density**3,\n    num_voxels_base_density=voxel_size_density**3,\n    num_voxels_rgb=voxel_size_rgb**3,\n    num_voxels_base_rgb=voxel_size_rgb**3,\n    num_voxels_viewdir=voxel_size_viewdir**3,\n    alpha_init=alpha_init,\n    stepsize=stepsize,\n    fast_color_thres=fast_color_thres,\n    world_bound_scale=1,\n    contracted_norm='l2',\n    rgbnet_dim=3, # default\n    rgbnet_direct=True,\n    density_type='DenseGrid',\n    k0_type='DenseGrid',\n    bg_len=0.2,  # very important\n    viewbase_pe=2,\n    maskout_near_cam_vox=False,\n)\n\nvis = dict(\n    height_rate = 0.6 # camera direction frustrum height\n)\n"
  },
  {
    "path": "FourierGrid/cuda/adam_upd.cpp",
    "content": "#include <torch/extension.h>\n\n#include <vector>\n\n// CUDA forward declarations\n\nvoid adam_upd_cuda(\n    torch::Tensor param,\n    torch::Tensor grad,\n    torch::Tensor exp_avg,\n    torch::Tensor exp_avg_sq,\n    int step, float beta1, float beta2, float lr, float eps);\n\nvoid masked_adam_upd_cuda(\n    torch::Tensor param,\n    torch::Tensor grad,\n    torch::Tensor exp_avg,\n    torch::Tensor exp_avg_sq,\n    int step, float beta1, float beta2, float lr, float eps);\n\nvoid adam_upd_with_perlr_cuda(\n    torch::Tensor param,\n    torch::Tensor grad,\n    torch::Tensor exp_avg,\n    torch::Tensor exp_avg_sq,\n    torch::Tensor perlr,\n    int step, float beta1, float beta2, float lr, float eps);\n\n\n// C++ interface\n\n#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x \" must be a CUDA tensor\")\n#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x \" must be contiguous\")\n#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)\n\nvoid adam_upd(\n    torch::Tensor param,\n    torch::Tensor grad,\n    torch::Tensor exp_avg,\n    torch::Tensor exp_avg_sq,\n    int step, float beta1, float beta2, float lr, float eps) {\n  CHECK_INPUT(param);\n  CHECK_INPUT(grad);\n  CHECK_INPUT(exp_avg);\n  CHECK_INPUT(exp_avg_sq);\n  adam_upd_cuda(param, grad, exp_avg, exp_avg_sq,\n          step, beta1, beta2, lr, eps);\n}\n\nvoid masked_adam_upd(\n    torch::Tensor param,\n    torch::Tensor grad,\n    torch::Tensor exp_avg,\n    torch::Tensor exp_avg_sq,\n    int step, float beta1, float beta2, float lr, float eps) {\n  CHECK_INPUT(param);\n  CHECK_INPUT(grad);\n  CHECK_INPUT(exp_avg);\n  CHECK_INPUT(exp_avg_sq);\n  masked_adam_upd_cuda(param, grad, exp_avg, exp_avg_sq,\n          step, beta1, beta2, lr, eps);\n}\n\nvoid adam_upd_with_perlr(\n    torch::Tensor param,\n    torch::Tensor grad,\n    torch::Tensor exp_avg,\n    torch::Tensor exp_avg_sq,\n    torch::Tensor perlr,\n    int step, float beta1, float beta2, float lr, float eps) 
{\n  CHECK_INPUT(param);\n  CHECK_INPUT(grad);\n  CHECK_INPUT(exp_avg);\n  CHECK_INPUT(exp_avg_sq);\n  adam_upd_with_perlr_cuda(param, grad, exp_avg, exp_avg_sq, perlr,\n          step, beta1, beta2, lr, eps);\n}\n\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n  m.def(\"adam_upd\", &adam_upd,\n          \"Adam update\");\n  m.def(\"masked_adam_upd\", &masked_adam_upd,\n          \"Adam update ignoring zero grad\");\n  m.def(\"adam_upd_with_perlr\", &adam_upd_with_perlr,\n          \"Adam update ignoring zero grad with per-voxel lr\");\n}\n\n"
  },
  {
    "path": "FourierGrid/cuda/adam_upd_kernel.cu",
    "content": "#include <torch/extension.h>\n\n#include <cuda.h>\n#include <cuda_runtime.h>\n\n#include <vector>\n\ntemplate <typename scalar_t>\n__global__ void adam_upd_cuda_kernel(\n    scalar_t* __restrict__ param,\n    const scalar_t* __restrict__ grad,\n    scalar_t* __restrict__ exp_avg,\n    scalar_t* __restrict__ exp_avg_sq,\n    const size_t N,\n    const float step_size, const float beta1, const float beta2, const float eps) {\n\n  const size_t index = blockIdx.x * blockDim.x + threadIdx.x;\n  if(index<N) {\n    exp_avg[index] = beta1 * exp_avg[index] + (1-beta1) * grad[index];\n    exp_avg_sq[index] = beta2 * exp_avg_sq[index] + (1-beta2) * grad[index] * grad[index];\n    param[index] -= step_size * exp_avg[index] / (sqrt(exp_avg_sq[index]) + eps);\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void masked_adam_upd_cuda_kernel(\n    scalar_t* __restrict__ param,\n    const scalar_t* __restrict__ grad,\n    scalar_t* __restrict__ exp_avg,\n    scalar_t* __restrict__ exp_avg_sq,\n    const size_t N,\n    const float step_size, const float beta1, const float beta2, const float eps) {\n\n  const size_t index = blockIdx.x * blockDim.x + threadIdx.x;\n  if(index<N && grad[index]!=0) {\n    exp_avg[index] = beta1 * exp_avg[index] + (1-beta1) * grad[index];\n    exp_avg_sq[index] = beta2 * exp_avg_sq[index] + (1-beta2) * grad[index] * grad[index];\n    param[index] -= step_size * exp_avg[index] / (sqrt(exp_avg_sq[index]) + eps);\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void adam_upd_with_perlr_cuda_kernel(\n    scalar_t* __restrict__ param,\n    const scalar_t* __restrict__ grad,\n    scalar_t* __restrict__ exp_avg,\n    scalar_t* __restrict__ exp_avg_sq,\n    scalar_t* __restrict__ perlr,\n    const size_t N,\n    const float step_size, const float beta1, const float beta2, const float eps) {\n\n  const size_t index = blockIdx.x * blockDim.x + threadIdx.x;\n  if(index<N) {\n    exp_avg[index] = beta1 * exp_avg[index] + (1-beta1) * 
grad[index];\n    exp_avg_sq[index] = beta2 * exp_avg_sq[index] + (1-beta2) * grad[index] * grad[index];\n    param[index] -= step_size * perlr[index] * exp_avg[index] / (sqrt(exp_avg_sq[index]) + eps);\n  }\n}\n\nvoid adam_upd_cuda(\n    torch::Tensor param,\n    torch::Tensor grad,\n    torch::Tensor exp_avg,\n    torch::Tensor exp_avg_sq,\n    const int step, const float beta1, const float beta2, const float lr, const float eps) {\n\n  const size_t N = param.numel();\n\n  const int threads = 256;\n  const int blocks = (N + threads - 1) / threads;\n\n  const float step_size = lr * sqrt(1 - pow(beta2, (float)step)) / (1 - pow(beta1, (float)step));\n\n  AT_DISPATCH_FLOATING_TYPES(param.type(), \"adam_upd_cuda\", ([&] {\n    adam_upd_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        param.data<scalar_t>(),\n        grad.data<scalar_t>(),\n        exp_avg.data<scalar_t>(),\n        exp_avg_sq.data<scalar_t>(),\n        N, step_size, beta1, beta2, eps);\n  }));\n}\n\nvoid masked_adam_upd_cuda(\n    torch::Tensor param,\n    torch::Tensor grad,\n    torch::Tensor exp_avg,\n    torch::Tensor exp_avg_sq,\n    const int step, const float beta1, const float beta2, const float lr, const float eps) {\n\n  const size_t N = param.numel();\n\n  const int threads = 256;\n  const int blocks = (N + threads - 1) / threads;\n\n  const float step_size = lr * sqrt(1 - pow(beta2, (float)step)) / (1 - pow(beta1, (float)step));\n\n  AT_DISPATCH_FLOATING_TYPES(param.type(), \"masked_adam_upd_cuda\", ([&] {\n    masked_adam_upd_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        param.data<scalar_t>(),\n        grad.data<scalar_t>(),\n        exp_avg.data<scalar_t>(),\n        exp_avg_sq.data<scalar_t>(),\n        N, step_size, beta1, beta2, eps);\n  }));\n}\n\nvoid adam_upd_with_perlr_cuda(\n    torch::Tensor param,\n    torch::Tensor grad,\n    torch::Tensor exp_avg,\n    torch::Tensor exp_avg_sq,\n    torch::Tensor perlr,\n    const int step, const float beta1, const float beta2, 
const float lr, const float eps) {\n\n  const size_t N = param.numel();\n\n  const int threads = 256;\n  const int blocks = (N + threads - 1) / threads;\n\n  const float step_size = lr * sqrt(1 - pow(beta2, (float)step)) / (1 - pow(beta1, (float)step));\n\n  AT_DISPATCH_FLOATING_TYPES(param.type(), \"adam_upd_with_perlr_cuda\", ([&] {\n    adam_upd_with_perlr_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        param.data<scalar_t>(),\n        grad.data<scalar_t>(),\n        exp_avg.data<scalar_t>(),\n        exp_avg_sq.data<scalar_t>(),\n        perlr.data<scalar_t>(),\n        N, step_size, beta1, beta2, eps);\n  }));\n}\n\n"
  },
  {
    "path": "FourierGrid/cuda/render_utils.cpp",
    "content": "#include <torch/extension.h>\n\n#include <vector>\n\n// CUDA forward declarations\n\nstd::vector<torch::Tensor> infer_t_minmax_cuda(\n        torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor xyz_min, torch::Tensor xyz_max,\n        const float near, const float far);\n\ntorch::Tensor infer_n_samples_cuda(torch::Tensor rays_d, torch::Tensor t_min, torch::Tensor t_max, const float stepdist);\n\nstd::vector<torch::Tensor> infer_ray_start_dir_cuda(torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_min);\n\nstd::vector<torch::Tensor> sample_pts_on_rays_cuda(\n        torch::Tensor rays_o, torch::Tensor rays_d,\n        torch::Tensor xyz_min, torch::Tensor xyz_max,\n        const float near, const float far, const float stepdist);\n\nstd::vector<torch::Tensor> sample_ndc_pts_on_rays_cuda(\n        torch::Tensor rays_o, torch::Tensor rays_d,\n        torch::Tensor xyz_min, torch::Tensor xyz_max,\n        const int N_samples);\n\ntorch::Tensor sample_bg_pts_on_rays_cuda(\n        torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_max,\n        const float bg_preserve, const int N_samples);\n\ntorch::Tensor maskcache_lookup_cuda(torch::Tensor world, torch::Tensor xyz, torch::Tensor xyz2ijk_scale, torch::Tensor xyz2ijk_shift);\n\nstd::vector<torch::Tensor> raw2alpha_cuda(torch::Tensor density, const float shift, const float interval);\nstd::vector<torch::Tensor> raw2alpha_nonuni_cuda(torch::Tensor density, const float shift, torch::Tensor interval);\n\ntorch::Tensor raw2alpha_backward_cuda(torch::Tensor exp, torch::Tensor grad_back, const float interval);\ntorch::Tensor raw2alpha_nonuni_backward_cuda(torch::Tensor exp, torch::Tensor grad_back, torch::Tensor interval);\n\nstd::vector<torch::Tensor> alpha2weight_cuda(torch::Tensor alpha, torch::Tensor ray_id, const int n_rays);\n\ntorch::Tensor alpha2weight_backward_cuda(\n        torch::Tensor alpha, torch::Tensor weight, torch::Tensor T, torch::Tensor alphainv_last,\n        
torch::Tensor i_start, torch::Tensor i_end, const int n_rays,\n        torch::Tensor grad_weights, torch::Tensor grad_last);\n\n// C++ interface\n\n#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x \" must be a CUDA tensor\")\n#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x \" must be contiguous\")\n#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)\n\nstd::vector<torch::Tensor> infer_t_minmax(\n        torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor xyz_min, torch::Tensor xyz_max,\n        const float near, const float far) {\n  CHECK_INPUT(rays_o);\n  CHECK_INPUT(rays_d);\n  CHECK_INPUT(xyz_min);\n  CHECK_INPUT(xyz_max);\n  return infer_t_minmax_cuda(rays_o, rays_d, xyz_min, xyz_max, near, far);\n}\n\ntorch::Tensor infer_n_samples(torch::Tensor rays_d, torch::Tensor t_min, torch::Tensor t_max, const float stepdist) {\n  CHECK_INPUT(rays_d);\n  CHECK_INPUT(t_min);\n  CHECK_INPUT(t_max);\n  return infer_n_samples_cuda(rays_d, t_min, t_max, stepdist);\n}\n\nstd::vector<torch::Tensor> infer_ray_start_dir(torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_min) {\n  CHECK_INPUT(rays_o);\n  CHECK_INPUT(rays_d);\n  CHECK_INPUT(t_min);\n  return infer_ray_start_dir_cuda(rays_o, rays_d, t_min);\n}\n\nstd::vector<torch::Tensor> sample_pts_on_rays(\n        torch::Tensor rays_o, torch::Tensor rays_d,\n        torch::Tensor xyz_min, torch::Tensor xyz_max,\n        const float near, const float far, const float stepdist) {\n  CHECK_INPUT(rays_o);\n  CHECK_INPUT(rays_d);\n  CHECK_INPUT(xyz_min);\n  CHECK_INPUT(xyz_max);\n  assert(rays_o.dim()==2);\n  assert(rays_o.size(1)==3);\n  return sample_pts_on_rays_cuda(rays_o, rays_d, xyz_min, xyz_max, near, far, stepdist);\n}\n\nstd::vector<torch::Tensor> sample_ndc_pts_on_rays(\n        torch::Tensor rays_o, torch::Tensor rays_d,\n        torch::Tensor xyz_min, torch::Tensor xyz_max,\n        const int N_samples) {\n  CHECK_INPUT(rays_o);\n  CHECK_INPUT(rays_d);\n  CHECK_INPUT(xyz_min);\n 
 CHECK_INPUT(xyz_max);\n  assert(rays_o.dim()==2);\n  assert(rays_o.size(1)==3);\n  return sample_ndc_pts_on_rays_cuda(rays_o, rays_d, xyz_min, xyz_max, N_samples);\n}\n\ntorch::Tensor sample_bg_pts_on_rays(\n        torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_max,\n        const float bg_preserve, const int N_samples) {\n  CHECK_INPUT(rays_o);\n  CHECK_INPUT(rays_d);\n  CHECK_INPUT(t_max);\n  return sample_bg_pts_on_rays_cuda(rays_o, rays_d, t_max, bg_preserve, N_samples);\n}\n\ntorch::Tensor maskcache_lookup(torch::Tensor world, torch::Tensor xyz, torch::Tensor xyz2ijk_scale, torch::Tensor xyz2ijk_shift) {\n  CHECK_INPUT(world);\n  CHECK_INPUT(xyz);\n  CHECK_INPUT(xyz2ijk_scale);\n  CHECK_INPUT(xyz2ijk_shift);\n  assert(world.dim()==3);\n  assert(xyz.dim()==2);\n  assert(xyz.size(1)==3);\n  return maskcache_lookup_cuda(world, xyz, xyz2ijk_scale, xyz2ijk_shift);\n}\n\nstd::vector<torch::Tensor> raw2alpha(torch::Tensor density, const float shift, const float interval) {\n  CHECK_INPUT(density);\n  assert(density.dim()==1);\n  return raw2alpha_cuda(density, shift, interval);\n}\nstd::vector<torch::Tensor> raw2alpha_nonuni(torch::Tensor density, const float shift, torch::Tensor interval) {\n  CHECK_INPUT(density);\n  assert(density.dim()==1);\n  return raw2alpha_nonuni_cuda(density, shift, interval);\n}\n\ntorch::Tensor raw2alpha_backward(torch::Tensor exp, torch::Tensor grad_back, const float interval) {\n  CHECK_INPUT(exp);\n  CHECK_INPUT(grad_back);\n  return raw2alpha_backward_cuda(exp, grad_back, interval);\n}\ntorch::Tensor raw2alpha_nonuni_backward(torch::Tensor exp, torch::Tensor grad_back, torch::Tensor interval) {\n  CHECK_INPUT(exp);\n  CHECK_INPUT(grad_back);\n  return raw2alpha_nonuni_backward_cuda(exp, grad_back, interval);\n}\n\nstd::vector<torch::Tensor> alpha2weight(torch::Tensor alpha, torch::Tensor ray_id, const int n_rays) {\n  CHECK_INPUT(alpha);\n  CHECK_INPUT(ray_id);\n  assert(alpha.dim()==1);\n  assert(ray_id.dim()==1);\n  
assert(alpha.sizes()==ray_id.sizes());\n  return alpha2weight_cuda(alpha, ray_id, n_rays);\n}\n\ntorch::Tensor alpha2weight_backward(\n        torch::Tensor alpha, torch::Tensor weight, torch::Tensor T, torch::Tensor alphainv_last,\n        torch::Tensor i_start, torch::Tensor i_end, const int n_rays,\n        torch::Tensor grad_weights, torch::Tensor grad_last) {\n  CHECK_INPUT(alpha);\n  CHECK_INPUT(weight);\n  CHECK_INPUT(T);\n  CHECK_INPUT(alphainv_last);\n  CHECK_INPUT(i_start);\n  CHECK_INPUT(i_end);\n  CHECK_INPUT(grad_weights);\n  CHECK_INPUT(grad_last);\n  return alpha2weight_backward_cuda(\n          alpha, weight, T, alphainv_last,\n          i_start, i_end, n_rays,\n          grad_weights, grad_last);\n}\n\n\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n  m.def(\"infer_t_minmax\", &infer_t_minmax, \"Inference t_min and t_max of ray-bbox intersection\");\n  m.def(\"infer_n_samples\", &infer_n_samples, \"Inference the number of points to sample on each ray\");\n  m.def(\"infer_ray_start_dir\", &infer_ray_start_dir, \"Inference the starting point and shooting direction of each ray\");\n  m.def(\"sample_pts_on_rays\", &sample_pts_on_rays, \"Sample points on rays\");\n  m.def(\"sample_ndc_pts_on_rays\", &sample_ndc_pts_on_rays, \"Sample points on rays\");\n  m.def(\"sample_bg_pts_on_rays\", &sample_bg_pts_on_rays, \"Sample points on bg\");\n  m.def(\"maskcache_lookup\", &maskcache_lookup, \"Lookup to skip know freespace.\");\n  m.def(\"raw2alpha\", &raw2alpha, \"Raw values [-inf, inf] to alpha [0, 1].\");\n  m.def(\"raw2alpha_backward\", &raw2alpha_backward, \"Backward pass of the raw to alpha\");\n  m.def(\"raw2alpha_nonuni\", &raw2alpha_nonuni, \"Raw values [-inf, inf] to alpha [0, 1].\");\n  m.def(\"raw2alpha_nonuni_backward\", &raw2alpha_nonuni_backward, \"Backward pass of the raw to alpha\");\n  m.def(\"alpha2weight\", &alpha2weight, \"Per-point alpha to accumulated blending weight\");\n  m.def(\"alpha2weight_backward\", &alpha2weight_backward, 
\"Backward pass of alpha2weight\");\n}\n\n"
  },
  {
    "path": "FourierGrid/cuda/render_utils_kernel.cu",
    "content": "#include <torch/extension.h>\n\n#include <cuda.h>\n#include <cuda_runtime.h>\n\n#include <vector>\n\n/*\n   Points sampling helper functions.\n */\ntemplate <typename scalar_t>\n__global__ void infer_t_minmax_cuda_kernel(\n        scalar_t* __restrict__ rays_o,\n        scalar_t* __restrict__ rays_d,\n        scalar_t* __restrict__ xyz_min,\n        scalar_t* __restrict__ xyz_max,\n        const float near, const float far, const int n_rays,\n        scalar_t* __restrict__ t_min,\n        scalar_t* __restrict__ t_max) {\n  const int i_ray = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_ray<n_rays) {\n    const int offset = i_ray * 3;\n    float vx = ((rays_d[offset  ]==0) ? 1e-6 : rays_d[offset  ]);\n    float vy = ((rays_d[offset+1]==0) ? 1e-6 : rays_d[offset+1]);\n    float vz = ((rays_d[offset+2]==0) ? 1e-6 : rays_d[offset+2]);\n    float ax = (xyz_max[0] - rays_o[offset  ]) / vx;\n    float ay = (xyz_max[1] - rays_o[offset+1]) / vy;\n    float az = (xyz_max[2] - rays_o[offset+2]) / vz;\n    float bx = (xyz_min[0] - rays_o[offset  ]) / vx;\n    float by = (xyz_min[1] - rays_o[offset+1]) / vy;\n    float bz = (xyz_min[2] - rays_o[offset+2]) / vz;\n    t_min[i_ray] = max(min(max(max(min(ax, bx), min(ay, by)), min(az, bz)), far), near);\n    t_max[i_ray] = max(min(min(min(max(ax, bx), max(ay, by)), max(az, bz)), far), near);\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void infer_n_samples_cuda_kernel(\n        scalar_t* __restrict__ rays_d,\n        scalar_t* __restrict__ t_min,\n        scalar_t* __restrict__ t_max,\n        const float stepdist,\n        const int n_rays,\n        int64_t* __restrict__ n_samples) {\n  const int i_ray = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_ray<n_rays) {\n    const int offset = i_ray * 3;\n    const float rnorm = sqrt(\n            rays_d[offset  ]*rays_d[offset  ] +\\\n            rays_d[offset+1]*rays_d[offset+1] +\\\n            rays_d[offset+2]*rays_d[offset+2]);\n    // at least 1 point for 
easier implementation in the later sample_pts_on_rays_cuda\n    n_samples[i_ray] = max(ceil((t_max[i_ray]-t_min[i_ray]) * rnorm / stepdist), 1.);\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void infer_ray_start_dir_cuda_kernel(\n        scalar_t* __restrict__ rays_o,\n        scalar_t* __restrict__ rays_d,\n        scalar_t* __restrict__ t_min,\n        const int n_rays,\n        scalar_t* __restrict__ rays_start,\n        scalar_t* __restrict__ rays_dir) {\n  const int i_ray = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_ray<n_rays) {\n    const int offset = i_ray * 3;\n    const float rnorm = sqrt(\n            rays_d[offset  ]*rays_d[offset  ] +\\\n            rays_d[offset+1]*rays_d[offset+1] +\\\n            rays_d[offset+2]*rays_d[offset+2]);\n    rays_start[offset  ] = rays_o[offset  ] + rays_d[offset  ] * t_min[i_ray];\n    rays_start[offset+1] = rays_o[offset+1] + rays_d[offset+1] * t_min[i_ray];\n    rays_start[offset+2] = rays_o[offset+2] + rays_d[offset+2] * t_min[i_ray];\n    rays_dir  [offset  ] = rays_d[offset  ] / rnorm;\n    rays_dir  [offset+1] = rays_d[offset+1] / rnorm;\n    rays_dir  [offset+2] = rays_d[offset+2] / rnorm;\n  }\n}\n\n\nstd::vector<torch::Tensor> infer_t_minmax_cuda(\n        torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor xyz_min, torch::Tensor xyz_max,\n        const float near, const float far) {\n  const int n_rays = rays_o.size(0);\n  auto t_min = torch::empty({n_rays}, rays_o.options());\n  auto t_max = torch::empty({n_rays}, rays_o.options());\n\n  const int threads = 256;\n  const int blocks = (n_rays + threads - 1) / threads;\n\n  AT_DISPATCH_FLOATING_TYPES(rays_o.type(), \"infer_t_minmax_cuda\", ([&] {\n    infer_t_minmax_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        rays_o.data<scalar_t>(),\n        rays_d.data<scalar_t>(),\n        xyz_min.data<scalar_t>(),\n        xyz_max.data<scalar_t>(),\n        near, far, n_rays,\n        t_min.data<scalar_t>(),\n        t_max.data<scalar_t>());\n  
}));\n\n  return {t_min, t_max};\n}\n\ntorch::Tensor infer_n_samples_cuda(torch::Tensor rays_d, torch::Tensor t_min, torch::Tensor t_max, const float stepdist) {\n  const int n_rays = t_min.size(0);\n  auto n_samples = torch::empty({n_rays}, torch::dtype(torch::kInt64).device(torch::kCUDA));\n  const int threads = 256;\n  const int blocks = (n_rays + threads - 1) / threads;\n  AT_DISPATCH_FLOATING_TYPES(t_min.type(), \"infer_n_samples_cuda\", ([&] {\n    infer_n_samples_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        rays_d.data<scalar_t>(),\n        t_min.data<scalar_t>(),\n        t_max.data<scalar_t>(),\n        stepdist,\n        n_rays,\n        n_samples.data<int64_t>());\n  }));\n  return n_samples;\n}\n\nstd::vector<torch::Tensor> infer_ray_start_dir_cuda(torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_min) {\n  const int n_rays = rays_o.size(0);\n  const int threads = 256;\n  const int blocks = (n_rays + threads - 1) / threads;\n  auto rays_start = torch::empty_like(rays_o);\n  auto rays_dir = torch::empty_like(rays_o);\n  AT_DISPATCH_FLOATING_TYPES(rays_o.type(), \"infer_ray_start_dir_cuda\", ([&] {\n    infer_ray_start_dir_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        rays_o.data<scalar_t>(),\n        rays_d.data<scalar_t>(),\n        t_min.data<scalar_t>(),\n        n_rays,\n        rays_start.data<scalar_t>(),\n        rays_dir.data<scalar_t>());\n  }));\n  return {rays_start, rays_dir};\n}\n\n/*\n   Sampling query points on rays.\n */\n__global__ void __set_1_at_ray_seg_start(\n        int64_t* __restrict__ ray_id,\n        int64_t* __restrict__ N_steps_cumsum,\n        const int n_rays) {\n  const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n  if(0<idx && idx<n_rays) {\n    ray_id[N_steps_cumsum[idx-1]] = 1;\n  }\n}\n\n__global__ void __set_step_id(\n        int64_t* __restrict__ step_id,\n        int64_t* __restrict__ ray_id,\n        int64_t* __restrict__ N_steps_cumsum,\n        const int total_len) {\n    const int 
idx = blockIdx.x * blockDim.x + threadIdx.x;\n    if(idx<total_len) {\n      const int rid = ray_id[idx];\n      step_id[idx] = idx - ((rid!=0) ? N_steps_cumsum[rid-1] : 0);\n    }\n}\n\ntemplate <typename scalar_t>\n__global__ void sample_pts_on_rays_cuda_kernel(\n        scalar_t* __restrict__ rays_start,\n        scalar_t* __restrict__ rays_dir,\n        scalar_t* __restrict__ xyz_min,\n        scalar_t* __restrict__ xyz_max,\n        int64_t* __restrict__ ray_id,\n        int64_t* __restrict__ step_id,\n        const float stepdist, const int total_len,\n        scalar_t* __restrict__ rays_pts,\n        bool* __restrict__ mask_outbbox) {\n  const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n  if(idx<total_len) {\n    const int i_ray = ray_id[idx];\n    const int i_step = step_id[idx];\n\n    const int offset_p = idx * 3;\n    const int offset_r = i_ray * 3;\n    const float dist = stepdist * i_step;\n    const float px = rays_start[offset_r  ] + rays_dir[offset_r  ] * dist;\n    const float py = rays_start[offset_r+1] + rays_dir[offset_r+1] * dist;\n    const float pz = rays_start[offset_r+2] + rays_dir[offset_r+2] * dist;\n    rays_pts[offset_p  ] = px;\n    rays_pts[offset_p+1] = py;\n    rays_pts[offset_p+2] = pz;\n    mask_outbbox[idx] = (xyz_min[0]>px) | (xyz_min[1]>py) | (xyz_min[2]>pz) | \\\n                        (xyz_max[0]<px) | (xyz_max[1]<py) | (xyz_max[2]<pz);\n  }\n}\n\nstd::vector<torch::Tensor> sample_pts_on_rays_cuda(\n        torch::Tensor rays_o, torch::Tensor rays_d,\n        torch::Tensor xyz_min, torch::Tensor xyz_max,\n        const float near, const float far, const float stepdist) {\n  const int threads = 256;\n  const int n_rays = rays_o.size(0);\n\n  // Compute ray-bbox intersection\n  auto t_minmax = infer_t_minmax_cuda(rays_o, rays_d, xyz_min, xyz_max, near, far);\n  auto t_min = t_minmax[0];\n  auto t_max = t_minmax[1];\n\n  // Compute the number of points required.\n  // Assign ray index and step index to each.\n  auto 
N_steps = infer_n_samples_cuda(rays_d, t_min, t_max, stepdist);\n  auto N_steps_cumsum = N_steps.cumsum(0);\n  const int total_len = N_steps.sum().item<int>();\n  auto ray_id = torch::zeros({total_len}, torch::dtype(torch::kInt64).device(torch::kCUDA));\n  __set_1_at_ray_seg_start<<<(n_rays+threads-1)/threads, threads>>>(\n        ray_id.data<int64_t>(), N_steps_cumsum.data<int64_t>(), n_rays);\n  ray_id.cumsum_(0);\n  auto step_id = torch::empty({total_len}, ray_id.options());\n  __set_step_id<<<(total_len+threads-1)/threads, threads>>>(\n        step_id.data<int64_t>(), ray_id.data<int64_t>(), N_steps_cumsum.data<int64_t>(), total_len);\n\n  // Compute the global xyz of each point\n  auto rays_start_dir = infer_ray_start_dir_cuda(rays_o, rays_d, t_min);\n  auto rays_start = rays_start_dir[0];\n  auto rays_dir = rays_start_dir[1];\n\n  auto rays_pts = torch::empty({total_len, 3}, torch::dtype(rays_o.dtype()).device(torch::kCUDA));\n  auto mask_outbbox = torch::empty({total_len}, torch::dtype(torch::kBool).device(torch::kCUDA));\n\n  AT_DISPATCH_FLOATING_TYPES(rays_o.type(), \"sample_pts_on_rays_cuda\", ([&] {\n    sample_pts_on_rays_cuda_kernel<scalar_t><<<(total_len+threads-1)/threads, threads>>>(\n        rays_start.data<scalar_t>(),\n        rays_dir.data<scalar_t>(),\n        xyz_min.data<scalar_t>(),\n        xyz_max.data<scalar_t>(),\n        ray_id.data<int64_t>(),\n        step_id.data<int64_t>(),\n        stepdist, total_len,\n        rays_pts.data<scalar_t>(),\n        mask_outbbox.data<bool>());\n  }));\n  return {rays_pts, mask_outbbox, ray_id, step_id, N_steps, t_min, t_max};\n}\n\ntemplate <typename scalar_t>\n__global__ void sample_ndc_pts_on_rays_cuda_kernel(\n        const scalar_t* __restrict__ rays_o,\n        const scalar_t* __restrict__ rays_d,\n        const scalar_t* __restrict__ xyz_min,\n        const scalar_t* __restrict__ xyz_max,\n        const int N_samples, const int n_rays,\n        scalar_t* __restrict__ rays_pts,\n        bool* 
__restrict__ mask_outbbox) {\n  const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n  if(idx<N_samples*n_rays) {\n    const int i_ray = idx / N_samples;\n    const int i_step = idx % N_samples;\n\n    const int offset_p = idx * 3;\n    const int offset_r = i_ray * 3;\n    const float dist = ((float)i_step) / (N_samples-1);\n    const float px = rays_o[offset_r  ] + rays_d[offset_r  ] * dist;\n    const float py = rays_o[offset_r+1] + rays_d[offset_r+1] * dist;\n    const float pz = rays_o[offset_r+2] + rays_d[offset_r+2] * dist;\n    rays_pts[offset_p  ] = px;\n    rays_pts[offset_p+1] = py;\n    rays_pts[offset_p+2] = pz;\n    mask_outbbox[idx] = (xyz_min[0]>px) | (xyz_min[1]>py) | (xyz_min[2]>pz) | \\\n                        (xyz_max[0]<px) | (xyz_max[1]<py) | (xyz_max[2]<pz);\n  }\n}\n\nstd::vector<torch::Tensor> sample_ndc_pts_on_rays_cuda(\n        torch::Tensor rays_o, torch::Tensor rays_d,\n        torch::Tensor xyz_min, torch::Tensor xyz_max,\n        const int N_samples) {\n  const int threads = 256;\n  const int n_rays = rays_o.size(0);\n\n  auto rays_pts = torch::empty({n_rays, N_samples, 3}, torch::dtype(rays_o.dtype()).device(torch::kCUDA));\n  auto mask_outbbox = torch::empty({n_rays, N_samples}, torch::dtype(torch::kBool).device(torch::kCUDA));\n\n  AT_DISPATCH_FLOATING_TYPES(rays_o.type(), \"sample_ndc_pts_on_rays_cuda\", ([&] {\n    sample_ndc_pts_on_rays_cuda_kernel<scalar_t><<<(n_rays*N_samples+threads-1)/threads, threads>>>(\n        rays_o.data<scalar_t>(),\n        rays_d.data<scalar_t>(),\n        xyz_min.data<scalar_t>(),\n        xyz_max.data<scalar_t>(),\n        N_samples, n_rays,\n        rays_pts.data<scalar_t>(),\n        mask_outbbox.data<bool>());\n  }));\n  return {rays_pts, mask_outbbox};\n}\n\ntemplate <typename scalar_t>\n__device__ __forceinline__ scalar_t norm3(const scalar_t x, const scalar_t y, const scalar_t z) {\n  return sqrt(x*x + y*y + z*z);\n}\n\ntemplate <typename scalar_t>\n__global__ void 
sample_bg_pts_on_rays_cuda_kernel(\n        const scalar_t* __restrict__ rays_o,\n        const scalar_t* __restrict__ rays_d,\n        const scalar_t* __restrict__ t_max,\n        const float bg_preserve,\n        const int N_samples, const int n_rays,\n        scalar_t* __restrict__ rays_pts) {\n  const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n  if(idx<N_samples*n_rays) {\n    const int i_ray = idx / N_samples;\n    const int i_step = idx % N_samples;\n\n    const int offset_p = idx * 3;\n    const int offset_r = i_ray * 3;\n    /* Original pytorch implementation\n    ori_t_outer = t_max[:,None] - 1 + 1 / torch.linspace(1, 0, N_outer+1)[:-1]\n    ori_ray_pts_outer = (rays_o[:,None,:] + rays_d[:,None,:] * ori_t_outer[:,:,None]).reshape(-1,3)\n    t_outer = ori_ray_pts_outer.norm(dim=-1)\n    R_outer = t_outer / ori_ray_pts_outer.abs().amax(1)\n    # r = R * R / t\n    o2i_p = R_outer.pow(2) / t_outer.pow(2) * (1-self.bg_preserve) + R_outer / t_outer * self.bg_preserve\n    ray_pts_outer = (ori_ray_pts_outer * o2i_p[:,None]).reshape(len(rays_o), -1, 3)\n   */\n    const float t_inner = t_max[i_ray];\n    const float ori_t_outer = t_inner - 1. + 1. / (1. 
- ((float)i_step) / N_samples);\n    const float ori_ray_pts_x =  rays_o[offset_r  ] + rays_d[offset_r  ] * ori_t_outer;\n    const float ori_ray_pts_y =  rays_o[offset_r+1] + rays_d[offset_r+1] * ori_t_outer;\n    const float ori_ray_pts_z =  rays_o[offset_r+2] + rays_d[offset_r+2] * ori_t_outer;\n    const float t_outer = norm3(ori_ray_pts_x, ori_ray_pts_y, ori_ray_pts_z);\n    const float ori_ray_pts_m = max(abs(ori_ray_pts_x), max(abs(ori_ray_pts_y), abs(ori_ray_pts_z)));\n    const float R_outer = t_outer / ori_ray_pts_m;\n    const float o2i_p = R_outer*R_outer / (t_outer*t_outer) * (1.-bg_preserve) + R_outer / t_outer * bg_preserve;\n    const float px = ori_ray_pts_x * o2i_p;\n    const float py = ori_ray_pts_y * o2i_p;\n    const float pz = ori_ray_pts_z * o2i_p;\n    rays_pts[offset_p  ] = px;\n    rays_pts[offset_p+1] = py;\n    rays_pts[offset_p+2] = pz;\n  }\n}\n\ntorch::Tensor sample_bg_pts_on_rays_cuda(\n        torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_max,\n        const float bg_preserve, const int N_samples) {\n  const int threads = 256;\n  const int n_rays = rays_o.size(0);\n\n  auto rays_pts = torch::empty({n_rays, N_samples, 3}, torch::dtype(rays_o.dtype()).device(torch::kCUDA));\n\n  AT_DISPATCH_FLOATING_TYPES(rays_o.type(), \"sample_bg_pts_on_rays_cuda\", ([&] {\n    sample_bg_pts_on_rays_cuda_kernel<scalar_t><<<(n_rays*N_samples+threads-1)/threads, threads>>>(\n        rays_o.data<scalar_t>(),\n        rays_d.data<scalar_t>(),\n        t_max.data<scalar_t>(),\n        bg_preserve,\n        N_samples, n_rays,\n        rays_pts.data<scalar_t>());\n  }));\n  return rays_pts;\n}\n\n\n/*\n   MaskCache lookup to skip known freespace.\n */\n\nstatic __forceinline__ __device__\nbool check_xyz(int i, int j, int k, int sz_i, int sz_j, int sz_k) {\n  return (0 <= i) && (i < sz_i) && (0 <= j) && (j < sz_j) && (0 <= k) && (k < sz_k);\n}\n\n\ntemplate <typename scalar_t>\n__global__ void maskcache_lookup_cuda_kernel(\n    bool* 
__restrict__ world,\n    scalar_t* __restrict__ xyz,\n    bool* __restrict__ out,\n    scalar_t* __restrict__ xyz2ijk_scale,\n    scalar_t* __restrict__ xyz2ijk_shift,\n    const int sz_i, const int sz_j, const int sz_k, const int n_pts) {\n\n  const int i_pt = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_pt<n_pts) {\n    const int offset = i_pt * 3;\n    const int i = round(xyz[offset  ] * xyz2ijk_scale[0] + xyz2ijk_shift[0]);\n    const int j = round(xyz[offset+1] * xyz2ijk_scale[1] + xyz2ijk_shift[1]);\n    const int k = round(xyz[offset+2] * xyz2ijk_scale[2] + xyz2ijk_shift[2]);\n    if(check_xyz(i, j, k, sz_i, sz_j, sz_k)) {\n      out[i_pt] = world[i*sz_j*sz_k + j*sz_k + k];\n    }\n  }\n}\n\ntorch::Tensor maskcache_lookup_cuda(\n        torch::Tensor world,\n        torch::Tensor xyz,\n        torch::Tensor xyz2ijk_scale,\n        torch::Tensor xyz2ijk_shift) {\n\n  const int sz_i = world.size(0);\n  const int sz_j = world.size(1);\n  const int sz_k = world.size(2);\n  const int n_pts = xyz.size(0);\n\n  auto out = torch::zeros({n_pts}, torch::dtype(torch::kBool).device(torch::kCUDA));\n  if(n_pts==0) {\n    return out;\n  }\n\n  const int threads = 256;\n  const int blocks = (n_pts + threads - 1) / threads;\n\n  AT_DISPATCH_FLOATING_TYPES(xyz.type(), \"maskcache_lookup_cuda\", ([&] {\n    maskcache_lookup_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        world.data<bool>(),\n        xyz.data<scalar_t>(),\n        out.data<bool>(),\n        xyz2ijk_scale.data<scalar_t>(),\n        xyz2ijk_shift.data<scalar_t>(),\n        sz_i, sz_j, sz_k, n_pts);\n  }));\n\n  return out;\n}\n\n\n/*\n    Ray marching helper function.\n */\ntemplate <typename scalar_t>\n__global__ void raw2alpha_cuda_kernel(\n    scalar_t* __restrict__ density,\n    const float shift, const float interval, const int n_pts,\n    scalar_t* __restrict__ exp_d,\n    scalar_t* __restrict__ alpha) {\n\n  const int i_pt = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_pt<n_pts) {\n    const 
scalar_t e = exp(density[i_pt] + shift); // can be inf\n    exp_d[i_pt] = e;\n    alpha[i_pt] = 1 - pow(1 + e, -interval);\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void raw2alpha_nonuni_cuda_kernel(\n    scalar_t* __restrict__ density,\n    const float shift, scalar_t* __restrict__ interval, const int n_pts,\n    scalar_t* __restrict__ exp_d,\n    scalar_t* __restrict__ alpha) {\n\n  const int i_pt = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_pt<n_pts) {\n    const scalar_t e = exp(density[i_pt] + shift); // can be inf\n    exp_d[i_pt] = e;\n    alpha[i_pt] = 1 - pow(1 + e, -interval[i_pt]);\n  }\n}\n\nstd::vector<torch::Tensor> raw2alpha_cuda(torch::Tensor density, const float shift, const float interval) {\n\n  const int n_pts = density.size(0);\n  auto exp_d = torch::empty_like(density);\n  auto alpha = torch::empty_like(density);\n  if(n_pts==0) {\n    return {exp_d, alpha};\n  }\n\n  const int threads = 256;\n  const int blocks = (n_pts + threads - 1) / threads;\n\n  AT_DISPATCH_FLOATING_TYPES(density.type(), \"raw2alpha_cuda\", ([&] {\n    raw2alpha_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        density.data<scalar_t>(),\n        shift, interval, n_pts,\n        exp_d.data<scalar_t>(),\n        alpha.data<scalar_t>());\n  }));\n\n  return {exp_d, alpha};\n}\n\nstd::vector<torch::Tensor> raw2alpha_nonuni_cuda(torch::Tensor density, const float shift, torch::Tensor interval) {\n\n  const int n_pts = density.size(0);\n  auto exp_d = torch::empty_like(density);\n  auto alpha = torch::empty_like(density);\n  if(n_pts==0) {\n    return {exp_d, alpha};\n  }\n\n  const int threads = 256;\n  const int blocks = (n_pts + threads - 1) / threads;\n\n  AT_DISPATCH_FLOATING_TYPES(density.type(), \"raw2alpha_cuda\", ([&] {\n    raw2alpha_nonuni_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        density.data<scalar_t>(),\n        shift, interval.data<scalar_t>(), n_pts,\n        exp_d.data<scalar_t>(),\n        alpha.data<scalar_t>());\n  }));\n\n  
return {exp_d, alpha};\n}\n\ntemplate <typename scalar_t>\n__global__ void raw2alpha_backward_cuda_kernel(\n    scalar_t* __restrict__ exp_d,\n    scalar_t* __restrict__ grad_back,\n    const float interval, const int n_pts,\n    scalar_t* __restrict__ grad) {\n\n  const int i_pt = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_pt<n_pts) {\n    grad[i_pt] = min(exp_d[i_pt], 1e10) * pow(1+exp_d[i_pt], -interval-1) * interval * grad_back[i_pt];\n  }\n}\n\ntemplate <typename scalar_t>\n__global__ void raw2alpha_nonuni_backward_cuda_kernel(\n    scalar_t* __restrict__ exp_d,\n    scalar_t* __restrict__ grad_back,\n    scalar_t* __restrict__ interval, const int n_pts,\n    scalar_t* __restrict__ grad) {\n\n  const int i_pt = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_pt<n_pts) {\n    grad[i_pt] = min(exp_d[i_pt], 1e10) * pow(1+exp_d[i_pt], -interval[i_pt]-1) * interval[i_pt] * grad_back[i_pt];\n  }\n}\n\ntorch::Tensor raw2alpha_backward_cuda(torch::Tensor exp_d, torch::Tensor grad_back, const float interval) {\n\n  const int n_pts = exp_d.size(0);\n  auto grad = torch::empty_like(exp_d);\n  if(n_pts==0) {\n    return grad;\n  }\n\n  const int threads = 256;\n  const int blocks = (n_pts + threads - 1) / threads;\n\n  AT_DISPATCH_FLOATING_TYPES(exp_d.type(), \"raw2alpha_backward_cuda\", ([&] {\n    raw2alpha_backward_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        exp_d.data<scalar_t>(),\n        grad_back.data<scalar_t>(),\n        interval, n_pts,\n        grad.data<scalar_t>());\n  }));\n\n  return grad;\n}\n\ntorch::Tensor raw2alpha_nonuni_backward_cuda(torch::Tensor exp_d, torch::Tensor grad_back, torch::Tensor interval) {\n\n  const int n_pts = exp_d.size(0);\n  auto grad = torch::empty_like(exp_d);\n  if(n_pts==0) {\n    return grad;\n  }\n\n  const int threads = 256;\n  const int blocks = (n_pts + threads - 1) / threads;\n\n  AT_DISPATCH_FLOATING_TYPES(exp_d.type(), \"raw2alpha_backward_cuda\", ([&] {\n    
raw2alpha_nonuni_backward_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        exp_d.data<scalar_t>(),\n        grad_back.data<scalar_t>(),\n        interval.data<scalar_t>(), n_pts,\n        grad.data<scalar_t>());\n  }));\n\n  return grad;\n}\n\ntemplate <typename scalar_t>\n__global__ void alpha2weight_cuda_kernel(\n    scalar_t* __restrict__ alpha,\n    const int n_rays,\n    scalar_t* __restrict__ weight,\n    scalar_t* __restrict__ T,\n    scalar_t* __restrict__ alphainv_last,\n    int64_t* __restrict__ i_start,\n    int64_t* __restrict__ i_end) {\n\n  const int i_ray = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_ray<n_rays) {\n    const int i_s = i_start[i_ray];\n    const int i_e_max = i_end[i_ray];\n\n    float T_cum = 1.;\n    int i;\n    for(i=i_s; i<i_e_max; ++i) {\n      T[i] = T_cum;\n      weight[i] = T_cum * alpha[i];\n      T_cum *= (1. - alpha[i]);\n      if(T_cum<1e-3) {\n        i+=1;\n        break;\n      }\n    }\n    i_end[i_ray] = i;\n    alphainv_last[i_ray] = T_cum;\n  }\n}\n\n__global__ void __set_i_for_segment_start_end(\n        int64_t* __restrict__ ray_id,\n        const int n_pts,\n        int64_t* __restrict__ i_start,\n        int64_t* __restrict__ i_end) {\n  const int index = blockIdx.x * blockDim.x + threadIdx.x;\n  if(0<index && index<n_pts && ray_id[index]!=ray_id[index-1]) {\n    i_start[ray_id[index]] = index;\n    i_end[ray_id[index-1]] = index;\n  }\n}\n\nstd::vector<torch::Tensor> alpha2weight_cuda(torch::Tensor alpha, torch::Tensor ray_id, const int n_rays) {\n\n  const int n_pts = alpha.size(0);\n  const int threads = 256;\n\n  auto weight = torch::zeros_like(alpha);\n  auto T = torch::ones_like(alpha);\n  auto alphainv_last = torch::ones({n_rays}, alpha.options());\n  auto i_start = torch::zeros({n_rays}, torch::dtype(torch::kInt64).device(torch::kCUDA));\n  auto i_end = torch::zeros({n_rays}, torch::dtype(torch::kInt64).device(torch::kCUDA));\n  if(n_pts==0) {\n    return {weight, T, alphainv_last, i_start, 
i_end};\n  }\n\n  __set_i_for_segment_start_end<<<(n_pts+threads-1)/threads, threads>>>(\n          ray_id.data<int64_t>(), n_pts, i_start.data<int64_t>(), i_end.data<int64_t>());\n  i_end[ray_id[n_pts-1]] = n_pts;\n\n  const int blocks = (n_rays + threads - 1) / threads;\n\n  AT_DISPATCH_FLOATING_TYPES(alpha.type(), \"alpha2weight_cuda\", ([&] {\n    alpha2weight_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        alpha.data<scalar_t>(),\n        n_rays,\n        weight.data<scalar_t>(),\n        T.data<scalar_t>(),\n        alphainv_last.data<scalar_t>(),\n        i_start.data<int64_t>(),\n        i_end.data<int64_t>());\n  }));\n\n  return {weight, T, alphainv_last, i_start, i_end};\n}\n\ntemplate <typename scalar_t>\n__global__ void alpha2weight_backward_cuda_kernel(\n    scalar_t* __restrict__ alpha,\n    scalar_t* __restrict__ weight,\n    scalar_t* __restrict__ T,\n    scalar_t* __restrict__ alphainv_last,\n    int64_t* __restrict__ i_start,\n    int64_t* __restrict__ i_end,\n    const int n_rays,\n    scalar_t* __restrict__ grad_weights,\n    scalar_t* __restrict__ grad_last,\n    scalar_t* __restrict__ grad) {\n\n  const int i_ray = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_ray<n_rays) {\n    const int i_s = i_start[i_ray];\n    const int i_e = i_end[i_ray];\n\n    float back_cum = grad_last[i_ray] * alphainv_last[i_ray];\n    for(int i=i_e-1; i>=i_s; --i) {\n      grad[i] = grad_weights[i] * T[i] - back_cum / (1-alpha[i] + 1e-10);\n      back_cum += grad_weights[i] * weight[i];\n    }\n  }\n}\n\ntorch::Tensor alpha2weight_backward_cuda(\n        torch::Tensor alpha, torch::Tensor weight, torch::Tensor T, torch::Tensor alphainv_last,\n        torch::Tensor i_start, torch::Tensor i_end, const int n_rays,\n        torch::Tensor grad_weights, torch::Tensor grad_last) {\n\n  auto grad = torch::zeros_like(alpha);\n  if(n_rays==0) {\n    return grad;\n  }\n\n  const int threads = 256;\n  const int blocks = (n_rays + threads - 1) / threads;\n\n  
AT_DISPATCH_FLOATING_TYPES(alpha.type(), \"alpha2weight_backward_cuda\", ([&] {\n    alpha2weight_backward_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        alpha.data<scalar_t>(),\n        weight.data<scalar_t>(),\n        T.data<scalar_t>(),\n        alphainv_last.data<scalar_t>(),\n        i_start.data<int64_t>(),\n        i_end.data<int64_t>(),\n        n_rays,\n        grad_weights.data<scalar_t>(),\n        grad_last.data<scalar_t>(),\n        grad.data<scalar_t>());\n  }));\n\n  return grad;\n}\n\n"
  },
  {
    "path": "FourierGrid/cuda/setup.py",
    "content": "import os\nimport pdb\nfrom setuptools import setup, Extension\nfrom torch.utils import cpp_extension\n\n\ndef setup_sources(name, sources):\n      parent_dir = os.path.dirname(os.path.abspath(__file__))\n      sources = [os.path.join(parent_dir, path) for path in sources]\n      setup(name=name,\n            ext_modules=[cpp_extension.CppExtension(name=name, sources=sources)],\n            cmdclass={'build_ext': cpp_extension.BuildExtension})\n      return\n\nsetup_sources(name='adam_upd_cuda', sources=['adam_upd.cpp', 'adam_upd_kernel.cu'])\nsetup_sources(name='ub360_utils_cuda', sources=['ub360_utils.cpp', 'ub360_utils_kernel.cu'])\nsetup_sources(name='render_utils_cuda', sources=['render_utils.cpp', 'render_utils_kernel.cu'])\nsetup_sources(name='total_variation_cuda', sources=['total_variation.cpp', 'total_variation_kernel.cu'])\n\n# run python setup.py install"
  },
  {
    "path": "FourierGrid/cuda/total_variation.cpp",
    "content": "#include <torch/extension.h>\n\n#include <vector>\n\n// CUDA forward declarations\n\nvoid total_variation_add_grad_cuda(torch::Tensor param, torch::Tensor grad, float wx, float wy, float wz, bool dense_mode);\n\n\n// C++ interface\n\n#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x \" must be a CUDA tensor\")\n#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x \" must be contiguous\")\n#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)\n\nvoid total_variation_add_grad(torch::Tensor param, torch::Tensor grad, float wx, float wy, float wz, bool dense_mode) {\n  CHECK_INPUT(param);\n  CHECK_INPUT(grad);\n  total_variation_add_grad_cuda(param, grad, wx, wy, wz, dense_mode);\n}\n\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n  m.def(\"total_variation_add_grad\", &total_variation_add_grad, \"Add total variation grad\");\n}\n\n"
  },
  {
    "path": "FourierGrid/cuda/total_variation_kernel.cu",
    "content": "#include <torch/extension.h>\n\n#include <cuda.h>\n#include <cuda_runtime.h>\n\n#include <vector>\n\ntemplate <typename scalar_t, typename bound_t>\n__device__ __forceinline__ scalar_t clamp(const scalar_t v, const bound_t lo, const bound_t hi) {\n  return min(max(v, lo), hi);\n}\n\ntemplate <typename scalar_t, bool dense_mode>\n__global__ void total_variation_add_grad_cuda_kernel(\n    const scalar_t* __restrict__ param,\n    scalar_t* __restrict__ grad,\n    float wx, float wy, float wz,\n    const size_t sz_i, const size_t sz_j, const size_t sz_k, const size_t N) {\n\n  const size_t index = blockIdx.x * blockDim.x + threadIdx.x;\n  if(index<N && (dense_mode || grad[index]!=0)) {\n    const size_t k = index % sz_k;\n    const size_t j = index / sz_k % sz_j;\n    const size_t i = index / sz_k / sz_j % sz_i;\n\n    float grad_to_add = 0;\n    grad_to_add += (k==0      ? 0 : wz * clamp(param[index]-param[index-1], -1.f, 1.f));\n    grad_to_add += (k==sz_k-1 ? 0 : wz * clamp(param[index]-param[index+1], -1.f, 1.f));\n    grad_to_add += (j==0      ? 0 : wy * clamp(param[index]-param[index-sz_k], -1.f, 1.f));\n    grad_to_add += (j==sz_j-1 ? 0 : wy * clamp(param[index]-param[index+sz_k], -1.f, 1.f));\n    grad_to_add += (i==0      ? 0 : wx * clamp(param[index]-param[index-sz_k*sz_j], -1.f, 1.f));\n    grad_to_add += (i==sz_i-1 ? 
0 : wx * clamp(param[index]-param[index+sz_k*sz_j], -1.f, 1.f));\n    grad[index] += grad_to_add;\n  }\n}\n\nvoid total_variation_add_grad_cuda(torch::Tensor param, torch::Tensor grad, float wx, float wy, float wz, bool dense_mode) {\n  const size_t N = param.numel();\n  const size_t sz_i = param.size(2);\n  const size_t sz_j = param.size(3);\n  const size_t sz_k = param.size(4);\n  const int threads = 256;\n  const int blocks = (N + threads - 1) / threads;\n\n  wx /= 6;\n  wy /= 6;\n  wz /= 6;\n\n  if(dense_mode) {\n    AT_DISPATCH_FLOATING_TYPES(param.type(), \"total_variation_add_grad_cuda\", ([&] {\n      total_variation_add_grad_cuda_kernel<scalar_t,true><<<blocks, threads>>>(\n          param.data<scalar_t>(),\n          grad.data<scalar_t>(),\n          wx, wy, wz,\n          sz_i, sz_j, sz_k, N);\n    }));\n  }\n  else {\n     AT_DISPATCH_FLOATING_TYPES(param.type(), \"total_variation_add_grad_cuda\", ([&] {\n      total_variation_add_grad_cuda_kernel<scalar_t,false><<<blocks, threads>>>(\n          param.data<scalar_t>(),\n          grad.data<scalar_t>(),\n          wx, wy, wz,\n          sz_i, sz_j, sz_k, N);\n    }));\n  }\n}\n\n"
  },
  {
    "path": "FourierGrid/cuda/ub360_utils.cpp",
    "content": "#include <torch/extension.h>\n\n#include <vector>\n\n// CUDA forward declarations\n\ntorch::Tensor cumdist_thres_cuda(torch::Tensor dist, float thres);\n\n// C++ interface\n\n#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x \" must be a CUDA tensor\")\n#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x \" must be contiguous\")\n#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)\n\ntorch::Tensor cumdist_thres(torch::Tensor dist, float thres) {\n  CHECK_INPUT(dist);\n  return cumdist_thres_cuda(dist, thres);\n}\n\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n  m.def(\"cumdist_thres\", &cumdist_thres, \"Generate mask for cumulative dist.\");\n}\n\n"
  },
  {
    "path": "FourierGrid/cuda/ub360_utils_kernel.cu",
    "content": "#include <torch/extension.h>\n\n#include <cuda.h>\n#include <cuda_runtime.h>\n\n#include <vector>\n\n/*\n   helper function to skip oversampled points,\n   especially near the foreground scene bbox boundary\n   */\ntemplate <typename scalar_t>\n__global__ void cumdist_thres_cuda_kernel(\n        scalar_t* __restrict__ dist,\n        const float thres,\n        const int n_rays,\n        const int n_pts,\n        bool* __restrict__ mask) {\n  const int i_ray = blockIdx.x * blockDim.x + threadIdx.x;\n  if(i_ray<n_rays) {\n    float cum_dist = 0;\n    const int i_s = i_ray * n_pts;\n    const int i_t = i_s + n_pts;\n    int i;\n    for(i=i_s; i<i_t; ++i) {\n      cum_dist += dist[i];\n      bool over = (cum_dist > thres);\n      cum_dist *= float(!over);\n      mask[i] = over;\n    }\n  }\n}\n\ntorch::Tensor cumdist_thres_cuda(torch::Tensor dist, float thres) {\n  const int n_rays = dist.size(0);\n  const int n_pts = dist.size(1);\n  const int threads = 256;\n  const int blocks = (n_rays + threads - 1) / threads;\n  auto mask = torch::zeros({n_rays, n_pts}, torch::dtype(torch::kBool).device(torch::kCUDA));\n  AT_DISPATCH_FLOATING_TYPES(dist.type(), \"cumdist_thres_cuda\", ([&] {\n    cumdist_thres_cuda_kernel<scalar_t><<<blocks, threads>>>(\n        dist.data<scalar_t>(), thres,\n        n_rays, n_pts,\n        mask.data<bool>());\n  }));\n  return mask;\n}\n\n"
  },
  {
    "path": "FourierGrid/dcvgo.py",
    "content": "import os\nimport time\nimport functools\nimport numpy as np\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom torch_scatter import segment_coo\n\nfrom . import grid\nfrom .dvgo import Raw2Alpha, Alphas2Weights\nfrom .dmpigo import create_full_step_id\nimport ub360_utils_cuda\n\n# from torch.utils.cpp_extension import load\n# parent_dir = os.path.dirname(os.path.abspath(__file__))\n# ub360_utils_cuda = load(\n#         name='ub360_utils_cuda',\n#         sources=[\n#             os.path.join(parent_dir, path)\n#             for path in ['cuda/ub360_utils.cpp', 'cuda/ub360_utils_kernel.cu']],\n#         verbose=True)\n\n\n'''Model'''\nclass DirectContractedVoxGO(nn.Module):\n    def __init__(self, xyz_min, xyz_max,\n                 num_voxels=0, num_voxels_base=0,\n                 alpha_init=None,\n                 mask_cache_world_size=None,\n                 fast_color_thres=0, bg_len=0.2,\n                 contracted_norm='inf',\n                 density_type='DenseGrid', k0_type='DenseGrid',\n                 density_config={}, k0_config={},\n                 rgbnet_dim=0,\n                 rgbnet_depth=3, rgbnet_width=128,\n                 viewbase_pe=4,\n                 **kwargs):\n        super(DirectContractedVoxGO, self).__init__()\n        # xyz_min/max are the boundary that separates fg and bg scene\n        xyz_min = torch.Tensor(xyz_min)\n        xyz_max = torch.Tensor(xyz_max)\n        assert len(((xyz_max - xyz_min) * 100000).long().unique()), 'scene bbox must be a cube in DirectContractedVoxGO'\n        self.register_buffer('scene_center', (xyz_min + xyz_max) * 0.5)\n        self.register_buffer('scene_radius', (xyz_max - xyz_min) * 0.5)\n        self.register_buffer('xyz_min', torch.Tensor([-1,-1,-1]) - bg_len)\n        self.register_buffer('xyz_max', torch.Tensor([1,1,1]) + bg_len)\n        if isinstance(fast_color_thres, dict):\n            self._fast_color_thres = fast_color_thres\n      
      self.fast_color_thres = fast_color_thres[0]\n        else:\n            self._fast_color_thres = None\n            self.fast_color_thres = fast_color_thres\n        self.bg_len = bg_len\n        self.contracted_norm = contracted_norm\n\n        # determine based grid resolution\n        self.num_voxels_base = num_voxels_base\n        self.voxel_size_base = ((self.xyz_max - self.xyz_min).prod() / self.num_voxels_base).pow(1/3)\n\n        # determine init grid resolution\n        self._set_grid_resolution(num_voxels)\n\n        # determine the density bias shift\n        self.alpha_init = alpha_init\n        self.register_buffer('act_shift', torch.FloatTensor([np.log(1/(1-alpha_init) - 1)]))\n        print('dcvgo: set density bias shift to', self.act_shift)\n\n        # init density voxel grid\n        self.density_type = density_type\n        self.density_config = density_config\n        self.density = grid.create_grid(\n            density_type, channels=1, world_size=self.world_size,\n            xyz_min=self.xyz_min, xyz_max=self.xyz_max,\n            config=self.density_config)\n\n        # init color representation\n        self.rgbnet_kwargs = {\n            'rgbnet_dim': rgbnet_dim,\n            'rgbnet_depth': rgbnet_depth, \n            'rgbnet_width': rgbnet_width,\n            'viewbase_pe': viewbase_pe,\n        }\n        self.k0_type = k0_type\n        self.k0_config = k0_config\n        if rgbnet_dim <= 0:\n            # color voxel grid  (coarse stage)\n            self.k0_dim = 3\n            self.k0 = grid.create_grid(\n                k0_type, channels=self.k0_dim, world_size=self.world_size,\n                xyz_min=self.xyz_min, xyz_max=self.xyz_max,\n                config=self.k0_config)\n            self.rgbnet = None\n        else:\n            # feature voxel grid + shallow MLP  (fine stage)\n            self.k0_dim = rgbnet_dim\n            self.k0 = grid.create_grid(\n                k0_type, channels=self.k0_dim, 
world_size=self.world_size,\n                xyz_min=self.xyz_min, xyz_max=self.xyz_max,\n                config=self.k0_config)\n            self.register_buffer('viewfreq', torch.FloatTensor([(2**i) for i in range(viewbase_pe)]))\n            dim0 = (3+3*viewbase_pe*2)\n            dim0 += self.k0_dim\n            self.rgbnet = nn.Sequential(\n                nn.Linear(dim0, rgbnet_width), nn.ReLU(inplace=True),\n                *[\n                    nn.Sequential(nn.Linear(rgbnet_width, rgbnet_width), nn.ReLU(inplace=True))\n                    for _ in range(rgbnet_depth-2)\n                ],\n                nn.Linear(rgbnet_width, 3),\n            )\n            nn.init.constant_(self.rgbnet[-1].bias, 0)\n            print('dcvgo: feature voxel grid', self.k0)\n            print('dcvgo: mlp', self.rgbnet)\n\n        # Using the coarse geometry if provided (used to determine known free space and unknown space)\n        # Re-implement as occupancy grid (2021/1/31)\n        if mask_cache_world_size is None:\n            mask_cache_world_size = self.world_size\n        mask = torch.ones(list(mask_cache_world_size), dtype=torch.bool)\n        self.mask_cache = grid.MaskGrid(\n            path=None, mask=mask,\n            xyz_min=self.xyz_min, xyz_max=self.xyz_max)\n\n    def _set_grid_resolution(self, num_voxels):\n        # Determine grid resolution\n        self.num_voxels = num_voxels\n        self.voxel_size = ((self.xyz_max - self.xyz_min).prod() / num_voxels).pow(1/3)\n        self.world_size = ((self.xyz_max - self.xyz_min) / self.voxel_size).long()\n        self.world_len = self.world_size[0].item()\n        self.voxel_size_ratio = self.voxel_size / self.voxel_size_base\n        print('dcvgo: voxel_size      ', self.voxel_size)\n        print('dcvgo: world_size      ', self.world_size)\n        print('dcvgo: voxel_size_base ', self.voxel_size_base)\n        print('dcvgo: voxel_size_ratio', self.voxel_size_ratio)\n\n    def get_kwargs(self):\n        
return {\n            'xyz_min': self.xyz_min.cpu().numpy(),\n            'xyz_max': self.xyz_max.cpu().numpy(),\n            'num_voxels': self.num_voxels,\n            'num_voxels_base': self.num_voxels_base,\n            'alpha_init': self.alpha_init,\n            'voxel_size_ratio': self.voxel_size_ratio,\n            'mask_cache_world_size': list(self.mask_cache.mask.shape),\n            'fast_color_thres': self.fast_color_thres,\n            'contracted_norm': self.contracted_norm,\n            'density_type': self.density_type,\n            'k0_type': self.k0_type,\n            'density_config': self.density_config,\n            'k0_config': self.k0_config,\n            **self.rgbnet_kwargs,\n        }\n\n    @torch.no_grad()\n    def scale_volume_grid(self, num_voxels):\n        print('dcvgo: scale_volume_grid start')\n        ori_world_size = self.world_size\n        self._set_grid_resolution(num_voxels)\n        print('dcvgo: scale_volume_grid scale world_size from', ori_world_size.tolist(), 'to', self.world_size.tolist())\n\n        self.density.scale_volume_grid(self.world_size)\n        self.k0.scale_volume_grid(self.world_size)\n\n        if np.prod(self.world_size.tolist()) <= 256**3:\n            self_grid_xyz = torch.stack(torch.meshgrid(\n                torch.linspace(self.xyz_min[0], self.xyz_max[0], self.world_size[0]),\n                torch.linspace(self.xyz_min[1], self.xyz_max[1], self.world_size[1]),\n                torch.linspace(self.xyz_min[2], self.xyz_max[2], self.world_size[2]),\n            ), -1)\n            self_alpha = F.max_pool3d(self.activate_density(self.density.get_dense_grid()), kernel_size=3, padding=1, stride=1)[0,0]\n            self.mask_cache = grid.MaskGrid(\n                path=None, mask=self.mask_cache(self_grid_xyz) & (self_alpha>self.fast_color_thres),\n                xyz_min=self.xyz_min, xyz_max=self.xyz_max)\n\n        print('dcvgo: scale_volume_grid finish')\n\n    @torch.no_grad()\n    def 
update_occupancy_cache(self):\n        ori_p = self.mask_cache.mask.float().mean().item()\n        cache_grid_xyz = torch.stack(torch.meshgrid(\n            torch.linspace(self.xyz_min[0], self.xyz_max[0], self.mask_cache.mask.shape[0]),\n            torch.linspace(self.xyz_min[1], self.xyz_max[1], self.mask_cache.mask.shape[1]),\n            torch.linspace(self.xyz_min[2], self.xyz_max[2], self.mask_cache.mask.shape[2]),\n        ), -1)\n        cache_grid_density = self.density(cache_grid_xyz)[None,None]\n        cache_grid_alpha = self.activate_density(cache_grid_density)\n        cache_grid_alpha = F.max_pool3d(cache_grid_alpha, kernel_size=3, padding=1, stride=1)[0,0]\n        self.mask_cache.mask &= (cache_grid_alpha > self.fast_color_thres)\n        new_p = self.mask_cache.mask.float().mean().item()\n        print(f'dcvgo: update mask_cache {ori_p:.4f} => {new_p:.4f}')\n\n    def update_occupancy_cache_lt_nviews(self, rays_o_tr, rays_d_tr, imsz, render_kwargs, maskout_lt_nviews):\n        print('dcvgo: update mask_cache lt_nviews start')\n        eps_time = time.time()\n        count = torch.zeros_like(self.density.get_dense_grid()).long()\n        device = count.device\n        for rays_o_, rays_d_ in zip(rays_o_tr.split(imsz), rays_d_tr.split(imsz)):\n            ones = grid.DenseGrid(1, self.world_size, self.xyz_min, self.xyz_max)\n            for rays_o, rays_d in zip(rays_o_.split(8192), rays_d_.split(8192)):\n                ray_pts, inner_mask, t = self.sample_ray(\n                        ori_rays_o=rays_o.to(device), ori_rays_d=rays_d.to(device),\n                        **render_kwargs)\n                ones(ray_pts).sum().backward()\n            count.data += (ones.grid.grad > 1)\n        ori_p = self.mask_cache.mask.float().mean().item()\n        self.mask_cache.mask &= (count >= maskout_lt_nviews)[0,0]\n        new_p = self.mask_cache.mask.float().mean().item()\n        print(f'dcvgo: update mask_cache {ori_p:.4f} => {new_p:.4f}')\n        
eps_time = time.time() - eps_time\n        print(f'dcvgo: update mask_cache lt_nviews finish (eps time:', eps_time, 'sec)')\n\n    def density_total_variation_add_grad(self, weight, dense_mode):\n        w = weight * self.world_size.max() / 128\n        self.density.total_variation_add_grad(w, w, w, dense_mode)\n\n    def k0_total_variation_add_grad(self, weight, dense_mode):\n        w = weight * self.world_size.max() / 128\n        self.k0.total_variation_add_grad(w, w, w, dense_mode)\n\n    def activate_density(self, density, interval=None):\n        interval = interval if interval is not None else self.voxel_size_ratio\n        shape = density.shape\n        return Raw2Alpha.apply(density.flatten(), self.act_shift, interval).reshape(shape)\n\n    def sample_ray(self, ori_rays_o, ori_rays_d, stepsize, is_train=False, **render_kwargs):\n        '''Sample query points on rays.\n        All the output points are sorted from near to far.\n        Input:\n            rays_o, rayd_d:   both in [N, 3] indicating ray configurations.\n            stepsize:         the number of voxels of each sample step.\n        Output:\n            ray_pts:          [M, 3] storing all the sampled points.\n            ray_id:           [M]    the index of the ray of each point.\n            step_id:          [M]    the i'th step on a ray of each point.\n        '''\n        rays_o = (ori_rays_o - self.scene_center) / self.scene_radius\n        rays_d = ori_rays_d / ori_rays_d.norm(dim=-1, keepdim=True)\n        N_inner = int(2 / (2+2*self.bg_len) * self.world_len / stepsize) + 1\n        N_outer = N_inner\n        b_inner = torch.linspace(0, 2, N_inner+1)\n        b_outer = 2 / torch.linspace(1, 1/128, N_outer+1)\n        t = torch.cat([\n            (b_inner[1:] + b_inner[:-1]) * 0.5,\n            (b_outer[1:] + b_outer[:-1]) * 0.5,\n        ])\n        ray_pts = rays_o[:,None,:] + rays_d[:,None,:] * t[None,:,None]\n        if self.contracted_norm == 'inf':\n            norm = 
ray_pts.abs().amax(dim=-1, keepdim=True)\n        elif self.contracted_norm == 'l2':\n            norm = ray_pts.norm(dim=-1, keepdim=True)\n        else:\n            raise NotImplementedError\n        inner_mask = (norm<=1)\n        ray_pts = torch.where(\n            inner_mask,\n            ray_pts,\n            ray_pts / norm * ((1+self.bg_len) - self.bg_len/norm)\n        )\n        return ray_pts, inner_mask.squeeze(-1), t\n\n    def forward(self, rays_o, rays_d, viewdirs, global_step=None, is_train=False, **render_kwargs):\n        '''Volume rendering\n        @rays_o:   [N, 3] the starting point of the N shooting rays.\n        @rays_d:   [N, 3] the shooting direction of the N rays.\n        @viewdirs: [N, 3] viewing direction to compute positional embedding for MLP.\n        '''\n        assert len(rays_o.shape)==2 and rays_o.shape[-1]==3, 'Only support point queries in [N, 3] format'\n        if isinstance(self._fast_color_thres, dict) and global_step in self._fast_color_thres:\n            print(f'dcvgo: update fast_color_thres {self.fast_color_thres} => {self._fast_color_thres[global_step]}')\n            self.fast_color_thres = self._fast_color_thres[global_step]\n\n        ret_dict = {}\n        N = len(rays_o)\n\n        # sample points on rays\n        ray_pts, inner_mask, t = self.sample_ray(\n                ori_rays_o=rays_o, ori_rays_d=rays_d, is_train=global_step is not None, **render_kwargs)\n        n_max = len(t)\n        interval = render_kwargs['stepsize'] * self.voxel_size_ratio\n        ray_id, step_id = create_full_step_id(ray_pts.shape[:2])\n\n        # skip oversampled points outside scene bbox\n        mask = inner_mask.clone()\n        dist_thres = (2+2*self.bg_len) / self.world_len * render_kwargs['stepsize'] * 0.95\n        dist = (ray_pts[:,1:] - ray_pts[:,:-1]).norm(dim=-1)\n        mask[:, 1:] |= ub360_utils_cuda.cumdist_thres(dist, dist_thres)\n        ray_pts = ray_pts[mask]\n        inner_mask = inner_mask[mask]\n        t 
= t[None].repeat(N,1)[mask]\n        ray_id = ray_id[mask.flatten()]\n        step_id = step_id[mask.flatten()]\n\n        # skip known free space\n        mask = self.mask_cache(ray_pts)\n        ray_pts = ray_pts[mask]\n        inner_mask = inner_mask[mask]\n        t = t[mask]\n        ray_id = ray_id[mask]\n        step_id = step_id[mask]\n\n        # query for alpha w/ post-activation\n        density = self.density(ray_pts)\n        alpha = self.activate_density(density, interval)\n        if self.fast_color_thres > 0:\n            mask = (alpha > self.fast_color_thres)\n            ray_pts = ray_pts[mask]\n            inner_mask = inner_mask[mask]\n            t = t[mask]\n            ray_id = ray_id[mask]\n            step_id = step_id[mask]\n            density = density[mask]\n            alpha = alpha[mask]\n\n        # compute accumulated transmittance\n        weights, alphainv_last = Alphas2Weights.apply(alpha, ray_id, N)\n        if self.fast_color_thres > 0:\n            mask = (weights > self.fast_color_thres)\n            ray_pts = ray_pts[mask]\n            inner_mask = inner_mask[mask]\n            t = t[mask]\n            ray_id = ray_id[mask]\n            step_id = step_id[mask]\n            density = density[mask]\n            alpha = alpha[mask]\n            weights = weights[mask]\n\n        # query for color\n        k0 = self.k0(ray_pts)\n        if self.rgbnet is None:\n            # no view-depend effect\n            rgb = torch.sigmoid(k0)\n        else:\n            # view-dependent color emission\n            viewdirs_emb = (viewdirs.unsqueeze(-1) * self.viewfreq).flatten(-2)\n            viewdirs_emb = torch.cat([viewdirs, viewdirs_emb.sin(), viewdirs_emb.cos()], -1)\n            viewdirs_emb = viewdirs_emb.flatten(0,-2)[ray_id]\n            rgb_feat = torch.cat([k0, viewdirs_emb], -1)\n            rgb_logit = self.rgbnet(rgb_feat)\n            rgb = torch.sigmoid(rgb_logit)\n\n        # Ray marching\n        rgb_marched = 
segment_coo(\n                src=(weights.unsqueeze(-1) * rgb),\n                index=ray_id,\n                out=torch.zeros([N, 3]),\n                reduce='sum')\n        if render_kwargs.get('rand_bkgd', False) and is_train:\n            rgb_marched += (alphainv_last.unsqueeze(-1) * torch.rand_like(rgb_marched))\n        else:\n            rgb_marched += (alphainv_last.unsqueeze(-1) * render_kwargs['bg'])\n        wsum_mid = segment_coo(\n                src=weights[inner_mask],\n                index=ray_id[inner_mask],\n                out=torch.zeros([N]),\n                reduce='sum')\n        s = 1 - 1/(1+t)  # [0, inf] => [0, 1]\n        ret_dict.update({\n            'alphainv_last': alphainv_last,\n            'weights': weights,\n            'wsum_mid': wsum_mid,\n            'rgb_marched': rgb_marched,\n            'raw_density': density,\n            'raw_alpha': alpha,\n            'raw_rgb': rgb,\n            'ray_id': ray_id,\n            'step_id': step_id,\n            'n_max': n_max,\n            't': t,\n            's': s,\n        })\n\n        if render_kwargs.get('render_depth', False):\n            with torch.no_grad():\n                depth = segment_coo(\n                        src=(weights * s),\n                        index=ray_id,\n                        out=torch.zeros([N]),\n                        reduce='sum')\n            ret_dict.update({'depth': depth})\n\n        return ret_dict\n\n\nclass DistortionLoss(torch.autograd.Function):\n    @staticmethod\n    def forward(ctx, w, s, n_max, ray_id):\n        n_rays = ray_id.max()+1\n        interval = 1/n_max\n        w_prefix, w_total, ws_prefix, ws_total = ub360_utils_cuda.segment_cumsum(w, s, ray_id)\n        loss_uni = (1/3) * interval * w.pow(2)\n        loss_bi = 2 * w * (s * w_prefix - ws_prefix)\n        ctx.save_for_backward(w, s, w_prefix, w_total, ws_prefix, ws_total, ray_id)\n        ctx.interval = interval\n        return (loss_bi.sum() + loss_uni.sum()) / 
n_rays\n\n    @staticmethod\n    @torch.autograd.function.once_differentiable\n    def backward(ctx, grad_back):\n        w, s, w_prefix, w_total, ws_prefix, ws_total, ray_id = ctx.saved_tensors\n        interval = ctx.interval\n        grad_uni = (1/3) * interval * 2 * w\n        w_suffix = w_total[ray_id] - (w_prefix + w)\n        ws_suffix = ws_total[ray_id] - (ws_prefix + w*s)\n        grad_bi = 2 * (s * (w_prefix - w_suffix) + (ws_suffix - ws_prefix))\n        grad = grad_back * (grad_bi + grad_uni)\n        return grad, None, None, None\n\ndistortion_loss = DistortionLoss.apply\n\n"
  },
  {
    "path": "FourierGrid/dmpigo.py",
    "content": "import os\nimport time\nimport functools\nimport numpy as np\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch import Tensor\nfrom einops import rearrange\nfrom torch_scatter import scatter_add, segment_coo\n\nfrom . import grid\nfrom .dvgo import Raw2Alpha, Alphas2Weights, render_utils_cuda\n\n\n'''Model'''\nclass DirectMPIGO(torch.nn.Module):\n    def __init__(self, xyz_min, xyz_max,\n                 num_voxels=0, mpi_depth=0,\n                 mask_cache_path=None, mask_cache_thres=1e-3, mask_cache_world_size=None,\n                 fast_color_thres=0,\n                 density_type='DenseGrid', k0_type='DenseGrid',\n                 density_config={}, k0_config={},\n                 rgbnet_dim=0,\n                 rgbnet_depth=3, rgbnet_width=128,\n                 viewbase_pe=0,\n                 **kwargs):\n        super(DirectMPIGO, self).__init__()\n        self.register_buffer('xyz_min', torch.Tensor(xyz_min))\n        self.register_buffer('xyz_max', torch.Tensor(xyz_max))\n        self.fast_color_thres = fast_color_thres\n\n        # determine init grid resolution\n        self._set_grid_resolution(num_voxels, mpi_depth)\n\n        # init density voxel grid\n        self.density_type = density_type\n        self.density_config = density_config\n        self.density = grid.create_grid(\n                density_type, channels=1, world_size=self.world_size,\n                xyz_min=self.xyz_min, xyz_max=self.xyz_max,\n                config=self.density_config)\n\n        # init density bias so that the initial contribution (the alpha values)\n        # of each query points on a ray is equal\n        self.act_shift = grid.DenseGrid(\n                channels=1, world_size=[1,1,mpi_depth],\n                xyz_min=xyz_min, xyz_max=xyz_max)\n        self.act_shift.grid.requires_grad = False\n        with torch.no_grad():\n            g = np.full([mpi_depth], 1./mpi_depth - 1e-6)\n            p = [1-g[0]]\n     
       for i in range(1, len(g)):\n                p.append((1-g[:i+1].sum())/(1-g[:i].sum()))\n            for i in range(len(p)):\n                self.act_shift.grid[..., i].fill_(np.log(p[i] ** (-1/self.voxel_size_ratio) - 1))\n\n        # init color representation\n        # feature voxel grid + shallow MLP  (fine stage)\n        self.rgbnet_kwargs = {\n            'rgbnet_dim': rgbnet_dim,\n            'rgbnet_depth': rgbnet_depth, 'rgbnet_width': rgbnet_width,\n            'viewbase_pe': viewbase_pe,\n        }\n        self.k0_type = k0_type\n        self.k0_config = k0_config\n        if rgbnet_dim <= 0:\n            # color voxel grid  (coarse stage)\n            self.k0_dim = 3\n            self.k0 = grid.create_grid(\n                k0_type, channels=self.k0_dim, world_size=self.world_size,\n                xyz_min=self.xyz_min, xyz_max=self.xyz_max,\n                config=self.k0_config)\n            self.rgbnet = None\n        else:\n            self.k0_dim = rgbnet_dim\n            self.k0 = grid.create_grid(\n                    k0_type, channels=self.k0_dim, world_size=self.world_size,\n                    xyz_min=self.xyz_min, xyz_max=self.xyz_max,\n                    config=self.k0_config)\n            self.register_buffer('viewfreq', torch.FloatTensor([(2**i) for i in range(viewbase_pe)]))\n            dim0 = (3+3*viewbase_pe*2) + self.k0_dim\n            self.rgbnet = nn.Sequential(\n                nn.Linear(dim0, rgbnet_width), nn.ReLU(inplace=True),\n                *[\n                    nn.Sequential(nn.Linear(rgbnet_width, rgbnet_width), nn.ReLU(inplace=True))\n                    for _ in range(rgbnet_depth-2)\n                ],\n                nn.Linear(rgbnet_width, 3),\n            )\n            nn.init.constant_(self.rgbnet[-1].bias, 0)\n\n        print('dmpigo: densitye grid', self.density)\n        print('dmpigo: feature grid', self.k0)\n        print('dmpigo: mlp', self.rgbnet)\n\n        # Using the coarse geometry if 
provided (used to determine known free space and unknown space)\n        # Re-implement as occupancy grid (2021/1/31)\n        self.mask_cache_path = mask_cache_path\n        self.mask_cache_thres = mask_cache_thres\n        if mask_cache_world_size is None:\n            mask_cache_world_size = self.world_size\n        if mask_cache_path is not None and mask_cache_path:\n            mask_cache = grid.MaskGrid(\n                    path=mask_cache_path,\n                    mask_cache_thres=mask_cache_thres).to(self.xyz_min.device)\n            self_grid_xyz = torch.stack(torch.meshgrid(\n                torch.linspace(self.xyz_min[0], self.xyz_max[0], mask_cache_world_size[0]),\n                torch.linspace(self.xyz_min[1], self.xyz_max[1], mask_cache_world_size[1]),\n                torch.linspace(self.xyz_min[2], self.xyz_max[2], mask_cache_world_size[2]),\n            ), -1)\n            mask = mask_cache(self_grid_xyz)\n        else:\n            mask = torch.ones(list(mask_cache_world_size), dtype=torch.bool)\n        self.mask_cache = grid.MaskGrid(\n                path=None, mask=mask,\n                xyz_min=self.xyz_min, xyz_max=self.xyz_max)\n\n    def _set_grid_resolution(self, num_voxels, mpi_depth):\n        # Determine grid resolution\n        self.num_voxels = num_voxels\n        self.mpi_depth = mpi_depth\n        r = (num_voxels / self.mpi_depth / (self.xyz_max - self.xyz_min)[:2].prod()).sqrt()\n        self.world_size = torch.zeros(3, dtype=torch.long)\n        self.world_size[:2] = (self.xyz_max - self.xyz_min)[:2] * r\n        self.world_size[2] = self.mpi_depth\n        self.voxel_size_ratio = 256. 
/ mpi_depth\n        print('dmpigo: world_size      ', self.world_size)\n        print('dmpigo: voxel_size_ratio', self.voxel_size_ratio)\n\n    def get_kwargs(self):\n        return {\n            'xyz_min': self.xyz_min.cpu().numpy(),\n            'xyz_max': self.xyz_max.cpu().numpy(),\n            'num_voxels': self.num_voxels,\n            'mpi_depth': self.mpi_depth,\n            'voxel_size_ratio': self.voxel_size_ratio,\n            'mask_cache_path': self.mask_cache_path,\n            'mask_cache_thres': self.mask_cache_thres,\n            'mask_cache_world_size': list(self.mask_cache.mask.shape),\n            'fast_color_thres': self.fast_color_thres,\n            'density_type': self.density_type,\n            'k0_type': self.k0_type,\n            'density_config': self.density_config,\n            'k0_config': self.k0_config,\n            **self.rgbnet_kwargs,\n        }\n\n    @torch.no_grad()\n    def scale_volume_grid(self, num_voxels, mpi_depth):\n        print('dmpigo: scale_volume_grid start')\n        ori_world_size = self.world_size\n        self._set_grid_resolution(num_voxels, mpi_depth)\n        print('dmpigo: scale_volume_grid scale world_size from', ori_world_size.tolist(), 'to', self.world_size.tolist())\n\n        self.density.scale_volume_grid(self.world_size)\n        self.k0.scale_volume_grid(self.world_size)\n\n        if np.prod(self.world_size.tolist()) <= 256**3:\n            self_grid_xyz = torch.stack(torch.meshgrid(\n                torch.linspace(self.xyz_min[0], self.xyz_max[0], self.world_size[0]),\n                torch.linspace(self.xyz_min[1], self.xyz_max[1], self.world_size[1]),\n                torch.linspace(self.xyz_min[2], self.xyz_max[2], self.world_size[2]),\n            ), -1)\n            dens = self.density.get_dense_grid() + self.act_shift.grid\n            self_alpha = F.max_pool3d(self.activate_density(dens), kernel_size=3, padding=1, stride=1)[0,0]\n            self.mask_cache = grid.MaskGrid(\n               
     path=None, mask=self.mask_cache(self_grid_xyz) & (self_alpha>self.fast_color_thres),\n                    xyz_min=self.xyz_min, xyz_max=self.xyz_max)\n\n        print('dmpigo: scale_volume_grid finish')\n\n    @torch.no_grad()\n    def update_occupancy_cache(self):\n        ori_p = self.mask_cache.mask.float().mean().item()\n        cache_grid_xyz = torch.stack(torch.meshgrid(\n            torch.linspace(self.xyz_min[0], self.xyz_max[0], self.mask_cache.mask.shape[0]),\n            torch.linspace(self.xyz_min[1], self.xyz_max[1], self.mask_cache.mask.shape[1]),\n            torch.linspace(self.xyz_min[2], self.xyz_max[2], self.mask_cache.mask.shape[2]),\n        ), -1)\n        cache_grid_density = self.density(cache_grid_xyz)[None,None]\n        cache_grid_alpha = self.activate_density(cache_grid_density)\n        cache_grid_alpha = F.max_pool3d(cache_grid_alpha, kernel_size=3, padding=1, stride=1)[0,0]\n        self.mask_cache.mask &= (cache_grid_alpha > self.fast_color_thres)\n        new_p = self.mask_cache.mask.float().mean().item()\n        print(f'dmpigo: update mask_cache {ori_p:.4f} => {new_p:.4f}')\n\n    def update_occupancy_cache_lt_nviews(self, rays_o_tr, rays_d_tr, imsz, render_kwargs, maskout_lt_nviews):\n        print('dmpigo: update mask_cache lt_nviews start')\n        eps_time = time.time()\n        count = torch.zeros_like(self.density.get_dense_grid()).long()\n        device = count.device\n        for rays_o_, rays_d_ in zip(rays_o_tr.split(imsz), rays_d_tr.split(imsz)):\n            ones = grid.DenseGrid(1, self.world_size, self.xyz_min, self.xyz_max)\n            for rays_o, rays_d in zip(rays_o_.split(8192), rays_d_.split(8192)):\n                ray_pts, ray_id, step_id, N_samples = self.sample_ray(\n                        rays_o=rays_o.to(device), rays_d=rays_d.to(device), **render_kwargs)\n                ones(ray_pts).sum().backward()\n            count.data += (ones.grid.grad > 1)\n        ori_p = 
self.mask_cache.mask.float().mean().item()\n        self.mask_cache.mask &= (count >= maskout_lt_nviews)[0,0]\n        new_p = self.mask_cache.mask.float().mean().item()\n        print(f'dmpigo: update mask_cache {ori_p:.4f} => {new_p:.4f}')\n        torch.cuda.empty_cache()\n        eps_time = time.time() - eps_time\n        print(f'dmpigo: update mask_cache lt_nviews finish (eps time:', eps_time, 'sec)')\n\n    def density_total_variation_add_grad(self, weight, dense_mode):\n        wxy = weight * self.world_size[:2].max() / 128\n        wz = weight * self.mpi_depth / 128\n        self.density.total_variation_add_grad(wxy, wxy, wz, dense_mode)\n\n    def k0_total_variation_add_grad(self, weight, dense_mode):\n        wxy = weight * self.world_size[:2].max() / 128\n        wz = weight * self.mpi_depth / 128\n        self.k0.total_variation_add_grad(wxy, wxy, wz, dense_mode)\n\n    def activate_density(self, density, interval=None):\n        interval = interval if interval is not None else self.voxel_size_ratio\n        shape = density.shape\n        return Raw2Alpha.apply(density.flatten(), 0, interval).reshape(shape)\n\n    def sample_ray(self, rays_o, rays_d, near, far, stepsize, **render_kwargs):\n        '''Sample query points on rays.\n        All the output points are sorted from near to far.\n        Input:\n            rays_o, rayd_d:   both in [N, 3] indicating ray configurations.\n            near, far:        the near and far distance of the rays.\n            stepsize:         the number of voxels of each sample step.\n        Output:\n            ray_pts:          [M, 3] storing all the sampled points.\n            ray_id:           [M]    the index of the ray of each point.\n            step_id:          [M]    the i'th step on a ray of each point.\n        '''\n        assert near==0 and far==1\n        rays_o = rays_o.contiguous()\n        rays_d = rays_d.contiguous()\n        N_samples = int((self.mpi_depth-1)/stepsize) + 1\n        ray_pts, 
mask_outbbox = render_utils_cuda.sample_ndc_pts_on_rays(\n            rays_o, rays_d, self.xyz_min, self.xyz_max, N_samples)\n        mask_inbbox = ~mask_outbbox\n        ray_pts = ray_pts[mask_inbbox]\n        if mask_inbbox.all():\n            ray_id, step_id = create_full_step_id(mask_inbbox.shape)\n        else:\n            ray_id = torch.arange(mask_inbbox.shape[0]).view(-1,1).expand_as(mask_inbbox)[mask_inbbox]\n            step_id = torch.arange(mask_inbbox.shape[1]).view(1,-1).expand_as(mask_inbbox)[mask_inbbox]\n        return ray_pts, ray_id, step_id, N_samples\n\n    def forward(self, rays_o, rays_d, viewdirs, global_step=None, **render_kwargs):\n        '''Volume rendering\n        @rays_o:   [N, 3] the starting point of the N shooting rays.\n        @rays_d:   [N, 3] the shooting direction of the N rays.\n        @viewdirs: [N, 3] viewing direction to compute positional embedding for MLP.\n        '''\n        assert len(rays_o.shape)==2 and rays_o.shape[-1]==3, 'Only suuport point queries in [N, 3] format'\n\n        ret_dict = {}\n        N = len(rays_o)\n\n        # sample points on rays\n        ray_pts, ray_id, step_id, N_samples = self.sample_ray(\n                rays_o=rays_o, rays_d=rays_d, **render_kwargs)\n        interval = render_kwargs['stepsize'] * self.voxel_size_ratio\n\n        # skip known free space\n        if self.mask_cache is not None:\n            mask = self.mask_cache(ray_pts)\n            ray_pts = ray_pts[mask]\n            ray_id = ray_id[mask]\n            step_id = step_id[mask]\n\n        # query for alpha w/ post-activation\n        density = self.density(ray_pts) + self.act_shift(ray_pts)\n        alpha = self.activate_density(density, interval)\n        if self.fast_color_thres > 0:\n            mask = (alpha > self.fast_color_thres)\n            ray_pts = ray_pts[mask]\n            ray_id = ray_id[mask]\n            step_id = step_id[mask]\n            alpha = alpha[mask]\n\n        # compute accumulated 
transmittance\n        weights, alphainv_last = Alphas2Weights.apply(alpha, ray_id, N)\n        if self.fast_color_thres > 0:\n            mask = (weights > self.fast_color_thres)\n            ray_pts = ray_pts[mask]\n            ray_id = ray_id[mask]\n            step_id = step_id[mask]\n            alpha = alpha[mask]\n            weights = weights[mask]\n\n        # query for color\n        vox_emb = self.k0(ray_pts)\n\n        if self.rgbnet is None:\n            # no view-depend effect\n            rgb = torch.sigmoid(vox_emb)\n        else:\n            # view-dependent color emission\n            viewdirs_emb = (viewdirs.unsqueeze(-1) * self.viewfreq).flatten(-2)\n            viewdirs_emb = torch.cat([viewdirs, viewdirs_emb.sin(), viewdirs_emb.cos()], -1)\n            viewdirs_emb = viewdirs_emb[ray_id]\n            rgb_feat = torch.cat([vox_emb, viewdirs_emb], -1)\n            rgb_logit = self.rgbnet(rgb_feat)\n            rgb = torch.sigmoid(rgb_logit)\n\n        # Ray marching\n        rgb_marched = segment_coo(\n                src=(weights.unsqueeze(-1) * rgb),\n                index=ray_id,\n                out=torch.zeros([N, 3]),\n                reduce='sum')\n        if render_kwargs.get('rand_bkgd', False) and global_step is not None:\n            rgb_marched += (alphainv_last.unsqueeze(-1) * torch.rand_like(rgb_marched))\n        else:\n            rgb_marched += (alphainv_last.unsqueeze(-1) * render_kwargs['bg'])\n        s = (step_id+0.5) / N_samples\n        ret_dict.update({\n            'alphainv_last': alphainv_last,\n            'weights': weights,\n            'rgb_marched': rgb_marched,\n            'raw_alpha': alpha,\n            'raw_rgb': rgb,\n            'ray_id': ray_id,\n            'n_max': N_samples,\n            's': s,\n        })\n\n        if render_kwargs.get('render_depth', False):\n            with torch.no_grad():\n                depth = segment_coo(\n                        src=(weights * s),\n                        
index=ray_id,\n                        out=torch.zeros([N]),\n                        reduce='sum')\n            ret_dict.update({'depth': depth})\n\n        return ret_dict\n\n\n@functools.lru_cache(maxsize=128)\ndef create_full_step_id(shape):\n    ray_id = torch.arange(shape[0]).view(-1,1).expand(shape).flatten()\n    step_id = torch.arange(shape[1]).view(1,-1).expand(shape).flatten()\n    return ray_id, step_id\n\n"
  },
  {
    "path": "FourierGrid/dvgo.py",
    "content": "import os\nimport time\nimport functools\nimport numpy as np\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom torch_scatter import segment_coo\nfrom . import grid\nfrom . import FourierGrid_grid\nimport render_utils_cuda\n\n# from torch.utils.cpp_extension import load\n# parent_dir = os.path.dirname(os.path.abspath(__file__))\n# render_utils_cuda = load(\n#         name='render_utils_cuda',\n#         sources=[\n#             os.path.join(parent_dir, path)\n#             for path in ['cuda/render_utils.cpp', 'cuda/render_utils_kernel.cu']],\n#         verbose=True)\n\n\n'''Model'''\nclass DirectVoxGO(torch.nn.Module):\n    def __init__(self, xyz_min, xyz_max,\n                 num_voxels=0, num_voxels_base=0,\n                 alpha_init=None,\n                 mask_cache_path=None, mask_cache_thres=1e-3, mask_cache_world_size=None,\n                 fast_color_thres=0,\n                 density_type='DenseGrid', k0_type='DenseGrid',\n                 density_config={}, k0_config={},\n                 rgbnet_dim=0, rgbnet_direct=False, rgbnet_full_implicit=False,\n                 rgbnet_depth=3, rgbnet_width=128,\n                 viewbase_pe=4,\n                 **kwargs):\n        super(DirectVoxGO, self).__init__()\n        self.register_buffer('xyz_min', torch.Tensor(xyz_min))\n        self.register_buffer('xyz_max', torch.Tensor(xyz_max))\n        self.fast_color_thres = fast_color_thres\n\n        # determine based grid resolution\n        self.num_voxels_base = num_voxels_base\n        self.voxel_size_base = ((self.xyz_max - self.xyz_min).prod() / self.num_voxels_base).pow(1/3)\n\n        # determine the density bias shift\n        self.alpha_init = alpha_init\n        self.register_buffer('act_shift', torch.FloatTensor([np.log(1/(1-alpha_init) - 1)]))\n        print('dvgo: set density bias shift to', self.act_shift)\n\n        # determine init grid resolution\n        
self._set_grid_resolution(num_voxels)\n\n        # init density voxel grid\n        self.density_type = density_type\n        self.density_config = density_config\n        self.fourier_freq_num = 0  # open this to enable FourierGrid\n        self.use_fourier_grid =  self.fourier_freq_num > 1\n        if not self.use_fourier_grid:\n            self.density = grid.create_grid(\n                    density_type, channels=1, world_size=self.world_size,\n                    xyz_min=self.xyz_min, xyz_max=self.xyz_max,\n                    config=self.density_config)\n        else:\n            self.density = FourierGrid_grid.create_grid(\n                    density_type, channels=1, world_size=self.world_size,\n                    xyz_min=self.xyz_min, xyz_max=self.xyz_max, use_nerf_pos=True,\n                    fourier_freq_num=self.fourier_freq_num, config=self.density_config)\n\n        # init color representation\n        self.rgbnet_kwargs = {\n            'rgbnet_dim': rgbnet_dim, 'rgbnet_direct': rgbnet_direct,\n            'rgbnet_full_implicit': rgbnet_full_implicit,\n            'rgbnet_depth': rgbnet_depth, 'rgbnet_width': rgbnet_width,\n            'viewbase_pe': viewbase_pe,\n        }\n        self.k0_type = k0_type\n        self.k0_config = k0_config\n        self.rgbnet_full_implicit = rgbnet_full_implicit\n        if rgbnet_dim <= 0:\n            # color voxel grid  (coarse stage)\n            self.k0_dim = 3\n            if not self.use_fourier_grid:\n                self.k0 = grid.create_grid(\n                    k0_type, channels=self.k0_dim, world_size=self.world_size,\n                    xyz_min=self.xyz_min, xyz_max=self.xyz_max,\n                    config=self.k0_config)\n            else:\n                self.k0 = FourierGrid_grid.create_grid(\n                    k0_type, channels=self.k0_dim, world_size=self.world_size,\n                    xyz_min=self.xyz_min, xyz_max=self.xyz_max, use_nerf_pos=True,\n                    
fourier_freq_num=self.fourier_freq_num, config=self.k0_config)\n            self.rgbnet = None\n        else:\n            # feature voxel grid + shallow MLP  (fine stage)\n            if self.rgbnet_full_implicit:\n                self.k0_dim = 0\n            else:\n                self.k0_dim = rgbnet_dim\n            if not self.use_fourier_grid:\n                self.k0 = grid.create_grid(\n                        k0_type, channels=self.k0_dim, world_size=self.world_size,\n                        xyz_min=self.xyz_min, xyz_max=self.xyz_max,\n                        config=self.k0_config)\n            else:\n                self.k0 = FourierGrid_grid.create_grid(\n                        k0_type, channels=self.k0_dim, world_size=self.world_size,\n                        xyz_min=self.xyz_min, xyz_max=self.xyz_max, use_nerf_pos=True, \n                        fourier_freq_num=self.fourier_freq_num, config=self.k0_config)\n            self.rgbnet_direct = rgbnet_direct\n            self.register_buffer('viewfreq', torch.FloatTensor([(2**i) for i in range(viewbase_pe)]))\n            dim0 = (3+3*viewbase_pe*2)\n            if self.rgbnet_full_implicit:\n                pass\n            elif rgbnet_direct:\n                dim0 += self.k0_dim\n            else:\n                dim0 += self.k0_dim-3\n            self.rgbnet = nn.Sequential(\n                nn.Linear(dim0, rgbnet_width), nn.ReLU(inplace=True),\n                *[\n                    nn.Sequential(nn.Linear(rgbnet_width, rgbnet_width), nn.ReLU(inplace=True))\n                    for _ in range(rgbnet_depth-2)\n                ],\n                nn.Linear(rgbnet_width, 3),\n            )\n            nn.init.constant_(self.rgbnet[-1].bias, 0)\n            print('dvgo: feature voxel grid', self.k0)\n            print('dvgo: mlp', self.rgbnet)\n\n        # Using the coarse geometry if provided (used to determine known free space and unknown space)\n        # Re-implement as occupancy grid (2021/1/31)\n 
       self.mask_cache_path = mask_cache_path\n        self.mask_cache_thres = mask_cache_thres\n        if mask_cache_world_size is None:\n            mask_cache_world_size = self.world_size\n        if mask_cache_path is not None and mask_cache_path:\n            mask_cache = grid.MaskGrid(\n                    path=mask_cache_path,\n                    mask_cache_thres=mask_cache_thres).to(self.xyz_min.device)\n            self_grid_xyz = torch.stack(torch.meshgrid(\n                torch.linspace(self.xyz_min[0], self.xyz_max[0], mask_cache_world_size[0]),\n                torch.linspace(self.xyz_min[1], self.xyz_max[1], mask_cache_world_size[1]),\n                torch.linspace(self.xyz_min[2], self.xyz_max[2], mask_cache_world_size[2]),\n            ), -1)\n            mask = mask_cache(self_grid_xyz)\n        else:\n            mask = torch.ones(list(mask_cache_world_size), dtype=torch.bool)\n        self.mask_cache = grid.MaskGrid(\n                path=None, mask=mask,\n                xyz_min=self.xyz_min, xyz_max=self.xyz_max)\n\n    def _set_grid_resolution(self, num_voxels):\n        # Determine grid resolution\n        self.num_voxels = num_voxels\n        self.voxel_size = ((self.xyz_max - self.xyz_min).prod() / num_voxels).pow(1/3)\n        self.world_size = ((self.xyz_max - self.xyz_min) / self.voxel_size).long()\n        self.voxel_size_ratio = self.voxel_size / self.voxel_size_base\n        print('dvgo: voxel_size      ', self.voxel_size)\n        print('dvgo: world_size      ', self.world_size)\n        print('dvgo: voxel_size_base ', self.voxel_size_base)\n        print('dvgo: voxel_size_ratio', self.voxel_size_ratio)\n\n    def get_kwargs(self):\n        return {\n            'xyz_min': self.xyz_min.cpu().numpy(),\n            'xyz_max': self.xyz_max.cpu().numpy(),\n            'num_voxels': self.num_voxels,\n            'num_voxels_base': self.num_voxels_base,\n            'alpha_init': self.alpha_init,\n            'voxel_size_ratio': 
self.voxel_size_ratio,\n            'mask_cache_path': self.mask_cache_path,\n            'mask_cache_thres': self.mask_cache_thres,\n            'mask_cache_world_size': list(self.mask_cache.mask.shape),\n            'fast_color_thres': self.fast_color_thres,\n            'density_type': self.density_type,\n            'k0_type': self.k0_type,\n            'density_config': self.density_config,\n            'k0_config': self.k0_config,\n            **self.rgbnet_kwargs,\n        }\n\n    @torch.no_grad()\n    def maskout_near_cam_vox(self, cam_o, near_clip):\n        if not self.use_fourier_grid:\n            # maskout grid points that between cameras and their near planes\n            self_grid_xyz = torch.stack(torch.meshgrid(\n                torch.linspace(self.xyz_min[0], self.xyz_max[0], self.world_size[0]),\n                torch.linspace(self.xyz_min[1], self.xyz_max[1], self.world_size[1]),\n                torch.linspace(self.xyz_min[2], self.xyz_max[2], self.world_size[2]),\n            ), -1)\n            nearest_dist = torch.stack([\n                (self_grid_xyz.unsqueeze(-2) - co).pow(2).sum(-1).sqrt().amin(-1)\n                for co in cam_o.split(100)  # for memory saving\n            ]).amin(0)\n            self.density.grid[nearest_dist[None,None] <= near_clip] = -100\n        else:\n            ind_norm = ((cam_o - self.xyz_min) / (self.xyz_max - self.xyz_min)).flip((-1,)) * 2 - 1\n            pos_embed = self.density.nerf_pos(ind_norm).squeeze()\n            # maskout grid points that between cameras and their near planes\n            self_grid_xyz = torch.stack(torch.meshgrid(\n                torch.linspace(-1, 1, self.world_size[0]),\n                torch.linspace(-1, 1, self.world_size[1]),\n                torch.linspace(-1, 1, self.world_size[2]),\n            ), -1)\n            for i in range(self.density.pos_embed_output_dim):\n                cur_pos_embed = pos_embed[:, 3*i:3*(i+1)].unsqueeze(0).unsqueeze(0).unsqueeze(0)\n        
        nearest_dist = torch.stack([(self_grid_xyz.unsqueeze(-2) - co).pow(2).sum(-1).sqrt().amin(-1) for co in cur_pos_embed.split(10)]).amin(0)\n                self.density.grid[0][i][nearest_dist <= near_clip] = -100\n\n    @torch.no_grad()\n    def scale_volume_grid(self, num_voxels):\n        print('dvgo: scale_volume_grid start')\n        ori_world_size = self.world_size\n        self._set_grid_resolution(num_voxels)\n        print('dvgo: scale_volume_grid scale world_size from', ori_world_size.tolist(), 'to', self.world_size.tolist())\n\n        self.density.scale_volume_grid(self.world_size)\n        self.k0.scale_volume_grid(self.world_size)\n\n        if np.prod(self.world_size.tolist()) <= 256**3:\n            self_grid_xyz = torch.stack(torch.meshgrid(\n                torch.linspace(self.xyz_min[0], self.xyz_max[0], self.world_size[0]),\n                torch.linspace(self.xyz_min[1], self.xyz_max[1], self.world_size[1]),\n                torch.linspace(self.xyz_min[2], self.xyz_max[2], self.world_size[2]),\n            ), -1)\n            self_alpha = F.max_pool3d(self.activate_density(self.density.get_dense_grid()), kernel_size=3, padding=1, stride=1)[0,0]\n            self.mask_cache = grid.MaskGrid(\n                    path=None, mask=self.mask_cache(self_grid_xyz) & (self_alpha>self.fast_color_thres),\n                    xyz_min=self.xyz_min, xyz_max=self.xyz_max)\n\n        print('dvgo: scale_volume_grid finish')\n\n    @torch.no_grad()\n    def update_occupancy_cache(self):\n        cache_grid_xyz = torch.stack(torch.meshgrid(\n            torch.linspace(self.xyz_min[0], self.xyz_max[0], self.mask_cache.mask.shape[0]),\n            torch.linspace(self.xyz_min[1], self.xyz_max[1], self.mask_cache.mask.shape[1]),\n            torch.linspace(self.xyz_min[2], self.xyz_max[2], self.mask_cache.mask.shape[2]),\n        ), -1)\n        cache_grid_density = self.density(cache_grid_xyz)[None,None]\n        cache_grid_alpha = 
self.activate_density(cache_grid_density)\n        cache_grid_alpha = F.max_pool3d(cache_grid_alpha, kernel_size=3, padding=1, stride=1)[0,0]\n        self.mask_cache.mask &= (cache_grid_alpha > self.fast_color_thres)\n\n    def voxel_count_views(self, rays_o_tr, rays_d_tr, imsz, near, far, stepsize, downrate=1, irregular_shape=False):\n        print('dvgo: voxel_count_views start')\n        far = 1e9  # the given far can be too small while rays stop when hitting scene bbox\n        eps_time = time.time()\n        N_samples = int(np.linalg.norm(np.array(self.world_size.cpu())+1) / stepsize) + 1\n        rng = torch.arange(N_samples)[None].float()\n        count = torch.zeros_like(self.density.get_dense_grid())\n        device = rng.device\n        for rays_o_, rays_d_ in zip(rays_o_tr.split(imsz), rays_d_tr.split(imsz)):\n            ones = grid.DenseGrid(1, self.world_size, self.xyz_min, self.xyz_max)\n            if irregular_shape:\n                rays_o_ = rays_o_.split(10000)\n                rays_d_ = rays_d_.split(10000)\n            else:\n                rays_o_ = rays_o_[::downrate, ::downrate].to(device).flatten(0,-2).split(10000)\n                rays_d_ = rays_d_[::downrate, ::downrate].to(device).flatten(0,-2).split(10000)\n            for rays_o, rays_d in zip(rays_o_, rays_d_):\n                vec = torch.where(rays_d==0, torch.full_like(rays_d, 1e-6), rays_d)\n                rate_a = (self.xyz_max - rays_o) / vec\n                rate_b = (self.xyz_min - rays_o) / vec\n                t_min = torch.minimum(rate_a, rate_b).amax(-1).clamp(min=near, max=far)\n                t_max = torch.maximum(rate_a, rate_b).amin(-1).clamp(min=near, max=far)\n                step = stepsize * self.voxel_size * rng\n                interpx = (t_min[...,None] + step/rays_d.norm(dim=-1,keepdim=True))\n                rays_pts = rays_o[...,None,:] + rays_d[...,None,:] * interpx[...,None]\n                ones(rays_pts).sum().backward()\n            with 
torch.no_grad():\n                count += (ones.grid.grad > 1)\n        eps_time = time.time() - eps_time\n        print('dvgo: voxel_count_views finish (eps time:', eps_time, 'sec)')\n        return count\n\n    def density_total_variation_add_grad(self, weight, dense_mode):\n        w = weight * self.world_size.max() / 128\n        self.density.total_variation_add_grad(w, w, w, dense_mode)\n\n    def k0_total_variation_add_grad(self, weight, dense_mode):\n        w = weight * self.world_size.max() / 128\n        self.k0.total_variation_add_grad(w, w, w, dense_mode)\n\n    def activate_density(self, density, interval=None):\n        interval = interval if interval is not None else self.voxel_size_ratio\n        shape = density.shape\n        return Raw2Alpha.apply(density.flatten(), self.act_shift, interval).reshape(shape)\n\n    def hit_coarse_geo(self, rays_o, rays_d, near, far, stepsize, **render_kwargs):\n        '''Check whether the rays hit the solved coarse geometry or not'''\n        far = 1e9  # the given far can be too small while rays stop when hitting scene bbox\n        shape = rays_o.shape[:-1]\n        rays_o = rays_o.reshape(-1, 3).contiguous()\n        rays_d = rays_d.reshape(-1, 3).contiguous()\n        stepdist = stepsize * self.voxel_size\n        ray_pts, mask_outbbox, ray_id = render_utils_cuda.sample_pts_on_rays(\n                rays_o, rays_d, self.xyz_min, self.xyz_max, near, far, stepdist)[:3]\n        mask_inbbox = ~mask_outbbox\n        hit = torch.zeros([len(rays_o)], dtype=torch.bool)\n        hit[ray_id[mask_inbbox][self.mask_cache(ray_pts[mask_inbbox])]] = 1\n        return hit.reshape(shape)\n\n    def sample_ray(self, rays_o, rays_d, near, far, stepsize, **render_kwargs):\n        '''Sample query points on rays.\n        All the output points are sorted from near to far.\n        Input:\n            rays_o, rayd_d:   both in [N, 3] indicating ray configurations.\n            near, far:        the near and far distance of the 
rays.\n            stepsize:         the number of voxels of each sample step.\n        Output:\n            ray_pts:          [M, 3] storing all the sampled points.\n            ray_id:           [M]    the index of the ray of each point.\n            step_id:          [M]    the i'th step on a ray of each point.\n        '''\n        far = 1e9  # the given far can be too small while rays stop when hitting scene bbox\n        rays_o = rays_o.contiguous()\n        rays_d = rays_d.contiguous()\n        stepdist = stepsize * self.voxel_size\n        ray_pts, mask_outbbox, ray_id, step_id, N_steps, t_min, t_max = render_utils_cuda.sample_pts_on_rays(\n            rays_o, rays_d, self.xyz_min, self.xyz_max, near, far, stepdist)\n        mask_inbbox = ~mask_outbbox\n        ray_pts = ray_pts[mask_inbbox]\n        ray_id = ray_id[mask_inbbox]\n        step_id = step_id[mask_inbbox]\n        return ray_pts, ray_id, step_id\n\n    def forward(self, rays_o, rays_d, viewdirs, global_step=None, **render_kwargs):\n        '''Volume rendering\n        @rays_o:   [N, 3] the starting point of the N shooting rays.\n        @rays_d:   [N, 3] the shooting direction of the N rays.\n        @viewdirs: [N, 3] viewing direction to compute positional embedding for MLP.\n        '''\n        assert len(rays_o.shape)==2 and rays_o.shape[-1]==3, 'Only suuport point queries in [N, 3] format'\n\n        ret_dict = {}\n        N = len(rays_o)\n\n        # sample points on rays\n        ray_pts, ray_id, step_id = self.sample_ray(\n                rays_o=rays_o, rays_d=rays_d, **render_kwargs)\n        interval = render_kwargs['stepsize'] * self.voxel_size_ratio\n\n        # skip known free space\n        if self.mask_cache is not None and not self.use_fourier_grid:\n            mask = self.mask_cache(ray_pts)\n            ray_pts = ray_pts[mask]\n            ray_id = ray_id[mask]\n            step_id = step_id[mask]\n\n        # query for alpha w/ post-activation\n        density = 
self.density(ray_pts)\n        alpha = self.activate_density(density, interval)\n        if self.fast_color_thres > 0 and not self.use_fourier_grid:\n            mask = (alpha > self.fast_color_thres)\n            ray_pts = ray_pts[mask]\n            ray_id = ray_id[mask]\n            step_id = step_id[mask]\n            density = density[mask]\n            alpha = alpha[mask]\n\n        # compute accumulated transmittance\n        weights, alphainv_last = Alphas2Weights.apply(alpha, ray_id, N)\n        if self.fast_color_thres > 0 and not self.use_fourier_grid:\n            mask = (weights > self.fast_color_thres)\n            weights = weights[mask]\n            alpha = alpha[mask]\n            ray_pts = ray_pts[mask]\n            ray_id = ray_id[mask]\n            step_id = step_id[mask]\n\n        # query for color\n        if self.rgbnet_full_implicit:\n            pass\n        else:\n            k0 = self.k0(ray_pts)\n\n        if self.rgbnet is None:\n            # no view-depend effect\n            rgb = torch.sigmoid(k0)\n        else:\n            # view-dependent color emission\n            if self.rgbnet_direct:\n                k0_view = k0\n            else:\n                k0_view = k0[:, 3:]\n                k0_diffuse = k0[:, :3]\n            viewdirs_emb = (viewdirs.unsqueeze(-1) * self.viewfreq).flatten(-2)\n            viewdirs_emb = torch.cat([viewdirs, viewdirs_emb.sin(), viewdirs_emb.cos()], -1)\n            viewdirs_emb = viewdirs_emb.flatten(0,-2)[ray_id]\n            rgb_feat = torch.cat([k0_view, viewdirs_emb], -1)\n            rgb_logit = self.rgbnet(rgb_feat)\n            if self.rgbnet_direct:\n                rgb = torch.sigmoid(rgb_logit)\n            else:\n                rgb = torch.sigmoid(rgb_logit + k0_diffuse)\n\n        # Ray marching\n        rgb_marched = segment_coo(\n                src=(weights.unsqueeze(-1) * rgb),\n                index=ray_id,\n                out=torch.zeros([N, 3]),\n                
reduce='sum')\n        rgb_marched += (alphainv_last.unsqueeze(-1) * render_kwargs['bg'])\n        ret_dict.update({\n            'alphainv_last': alphainv_last,\n            'weights': weights,\n            'rgb_marched': rgb_marched,\n            'raw_alpha': alpha,\n            'raw_rgb': rgb,\n            'ray_id': ray_id,\n        })\n\n        if render_kwargs.get('render_depth', False):\n            with torch.no_grad():\n                depth = segment_coo(\n                        src=(weights * step_id),\n                        index=ray_id,\n                        out=torch.zeros([N]),\n                        reduce='sum')\n            ret_dict.update({'depth': depth})\n\n        return ret_dict\n\n\n''' Misc\n'''\nclass Raw2Alpha(torch.autograd.Function):\n    @staticmethod\n    def forward(ctx, density, shift, interval):\n        '''\n        alpha = 1 - exp(-softplus(density + shift) * interval)\n              = 1 - exp(-log(1 + exp(density + shift)) * interval)\n              = 1 - exp(log(1 + exp(density + shift)) ^ (-interval))\n              = 1 - (1 + exp(density + shift)) ^ (-interval)\n        '''\n        exp, alpha = render_utils_cuda.raw2alpha(density, shift, interval)\n        if density.requires_grad:\n            ctx.save_for_backward(exp)\n            ctx.interval = interval\n        return alpha\n\n    @staticmethod\n    @torch.autograd.function.once_differentiable\n    def backward(ctx, grad_back):\n        '''\n        alpha' = interval * ((1 + exp(density + shift)) ^ (-interval-1)) * exp(density + shift)'\n               = interval * ((1 + exp(density + shift)) ^ (-interval-1)) * exp(density + shift)\n        '''\n        exp = ctx.saved_tensors[0]\n        interval = ctx.interval\n        return render_utils_cuda.raw2alpha_backward(exp, grad_back.contiguous(), interval), None, None\n\nclass Raw2Alpha_nonuni(torch.autograd.Function):\n    @staticmethod\n    def forward(ctx, density, shift, interval):\n        exp, alpha = 
render_utils_cuda.raw2alpha_nonuni(density, shift, interval)\n        if density.requires_grad:\n            ctx.save_for_backward(exp)\n            ctx.interval = interval\n        return alpha\n\n    @staticmethod\n    @torch.autograd.function.once_differentiable\n    def backward(ctx, grad_back):\n        exp = ctx.saved_tensors[0]\n        interval = ctx.interval\n        return render_utils_cuda.raw2alpha_nonuni_backward(exp, grad_back.contiguous(), interval), None, None\n\nclass Alphas2Weights(torch.autograd.Function):\n    @staticmethod\n    def forward(ctx, alpha, ray_id, N):\n        weights, T, alphainv_last, i_start, i_end = render_utils_cuda.alpha2weight(alpha, ray_id, N)\n        if alpha.requires_grad:\n            ctx.save_for_backward(alpha, weights, T, alphainv_last, i_start, i_end)\n            ctx.n_rays = N\n        return weights, alphainv_last\n\n    @staticmethod\n    @torch.autograd.function.once_differentiable\n    def backward(ctx, grad_weights, grad_last):\n        alpha, weights, T, alphainv_last, i_start, i_end = ctx.saved_tensors\n        grad = render_utils_cuda.alpha2weight_backward(\n                alpha, weights, T, alphainv_last,\n                i_start, i_end, ctx.n_rays, grad_weights, grad_last)\n        return grad, None, None\n\n\n''' Ray and batch\n'''\ndef get_rays(H, W, K, c2w, inverse_y, flip_x, flip_y, mode='center'):\n    i, j = torch.meshgrid(\n        torch.linspace(0, W-1, W, device=c2w.device),\n        torch.linspace(0, H-1, H, device=c2w.device))  # pytorch's meshgrid has indexing='ij'\n    i = i.t().float()\n    j = j.t().float()\n    if mode == 'lefttop':\n        pass\n    elif mode == 'center':\n        i, j = i+0.5, j+0.5\n    elif mode == 'random':\n        i = i+torch.rand_like(i)\n        j = j+torch.rand_like(j)\n    else:\n        raise NotImplementedError\n\n    if flip_x:\n        i = i.flip((1,))\n    if flip_y:\n        j = j.flip((0,))\n    if inverse_y:\n        dirs = 
torch.stack([(i-K[0][2])/K[0][0], (j-K[1][2])/K[1][1], torch.ones_like(i)], -1)\n    else:\n        dirs = torch.stack([(i-K[0][2])/K[0][0], -(j-K[1][2])/K[1][1], -torch.ones_like(i)], -1)\n    # Rotate ray directions from camera frame to the world frame\n    rays_d = torch.sum(dirs[..., np.newaxis, :] * c2w[:3,:3], -1)  # dot product, equals to: [c2w.dot(dir) for dir in dirs]\n    # Translate camera frame's origin to the world frame. It is the origin of all rays.\n    rays_o = c2w[:3,3].expand(rays_d.shape)\n    return rays_o, rays_d\n\n\ndef get_rays_np(H, W, K, c2w):\n    i, j = np.meshgrid(np.arange(W, dtype=np.float32), np.arange(H, dtype=np.float32), indexing='xy')\n    dirs = np.stack([(i-K[0][2])/K[0][0], -(j-K[1][2])/K[1][1], -np.ones_like(i)], -1)\n    # Rotate ray directions from camera frame to the world frame\n    rays_d = np.sum(dirs[..., np.newaxis, :] * c2w[:3,:3], -1)  # dot product, equals to: [c2w.dot(dir) for dir in dirs]\n    # Translate camera frame's origin to the world frame. It is the origin of all rays.\n    rays_o = np.broadcast_to(c2w[:3,3], np.shape(rays_d))\n    return rays_o, rays_d\n\n\ndef ndc_rays(H, W, focal, near, rays_o, rays_d):\n    # Shift ray origins to near plane\n    t = -(near + rays_o[...,2]) / rays_d[...,2]\n    rays_o = rays_o + t[...,None] * rays_d\n\n    # Projection\n    o0 = -1./(W/(2.*focal)) * rays_o[...,0] / rays_o[...,2]\n    o1 = -1./(H/(2.*focal)) * rays_o[...,1] / rays_o[...,2]\n    o2 = 1. + 2. * near / rays_o[...,2]\n\n    d0 = -1./(W/(2.*focal)) * (rays_d[...,0]/rays_d[...,2] - rays_o[...,0]/rays_o[...,2])\n    d1 = -1./(H/(2.*focal)) * (rays_d[...,1]/rays_d[...,2] - rays_o[...,1]/rays_o[...,2])\n    d2 = -2. 
* near / rays_o[...,2]\n\n    rays_o = torch.stack([o0,o1,o2], -1)\n    rays_d = torch.stack([d0,d1,d2], -1)\n\n    return rays_o, rays_d\n\n\ndef get_rays_of_a_view(H, W, K, c2w, ndc, inverse_y, flip_x, flip_y, mode='center'):\n    rays_o, rays_d = get_rays(H, W, K, c2w, inverse_y=inverse_y, flip_x=flip_x, flip_y=flip_y, mode=mode)\n    viewdirs = rays_d / rays_d.norm(dim=-1, keepdim=True)\n    if ndc:\n        rays_o, rays_d = ndc_rays(H, W, K[0][0], 1., rays_o, rays_d)\n    return rays_o, rays_d, viewdirs\n\n\n@torch.no_grad()\ndef get_training_rays(rgb_tr, train_poses, HW, Ks, ndc, inverse_y, flip_x, flip_y):\n    print('get_training_rays: start')\n    assert len(np.unique(HW, axis=0)) == 1\n    assert len(np.unique(Ks.reshape(len(Ks),-1), axis=0)) == 1\n    assert len(rgb_tr) == len(train_poses) and len(rgb_tr) == len(Ks) and len(rgb_tr) == len(HW)\n    H, W = HW[0]\n    K = Ks[0]\n    eps_time = time.time()\n    rays_o_tr = torch.zeros([len(rgb_tr), H, W, 3], device=rgb_tr.device)\n    rays_d_tr = torch.zeros([len(rgb_tr), H, W, 3], device=rgb_tr.device)\n    viewdirs_tr = torch.zeros([len(rgb_tr), H, W, 3], device=rgb_tr.device)\n    imsz = [1] * len(rgb_tr)\n    for i, c2w in enumerate(train_poses):\n        rays_o, rays_d, viewdirs = get_rays_of_a_view(\n                H=H, W=W, K=K, c2w=c2w, ndc=ndc, inverse_y=inverse_y, flip_x=flip_x, flip_y=flip_y)\n        rays_o_tr[i].copy_(rays_o.to(rgb_tr.device))\n        rays_d_tr[i].copy_(rays_d.to(rgb_tr.device))\n        viewdirs_tr[i].copy_(viewdirs.to(rgb_tr.device))\n        del rays_o, rays_d, viewdirs\n    eps_time = time.time() - eps_time\n    print('get_training_rays: finish (eps time:', eps_time, 'sec)')\n    return rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, imsz\n\n\n@torch.no_grad()\ndef get_training_rays_flatten(rgb_tr_ori, train_poses, HW, Ks, ndc, inverse_y, flip_x, flip_y):\n    print('get_training_rays_flatten: start')\n    assert len(rgb_tr_ori) == len(train_poses) and len(rgb_tr_ori) == 
len(Ks) and len(rgb_tr_ori) == len(HW)\n    eps_time = time.time()\n    DEVICE = rgb_tr_ori[0].device\n    N = sum(im.shape[0] * im.shape[1] for im in rgb_tr_ori)\n    rgb_tr = torch.zeros([N,3], device=DEVICE)\n    rays_o_tr = torch.zeros_like(rgb_tr)\n    rays_d_tr = torch.zeros_like(rgb_tr)\n    viewdirs_tr = torch.zeros_like(rgb_tr)\n    imsz = []\n    top = 0\n    for c2w, img, (H, W), K in zip(train_poses, rgb_tr_ori, HW, Ks):\n        assert img.shape[:2] == (H, W)\n        rays_o, rays_d, viewdirs = get_rays_of_a_view(\n                H=H, W=W, K=K, c2w=c2w, ndc=ndc,\n                inverse_y=inverse_y, flip_x=flip_x, flip_y=flip_y)\n        n = H * W\n        rgb_tr[top:top+n].copy_(img.flatten(0,1))\n        rays_o_tr[top:top+n].copy_(rays_o.flatten(0,1).to(DEVICE))\n        rays_d_tr[top:top+n].copy_(rays_d.flatten(0,1).to(DEVICE))\n        viewdirs_tr[top:top+n].copy_(viewdirs.flatten(0,1).to(DEVICE))\n        imsz.append(n)\n        top += n\n\n    assert top == N\n    eps_time = time.time() - eps_time\n    print('get_training_rays_flatten: finish (eps time:', eps_time, 'sec)')\n    return rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, imsz\n\n\n@torch.no_grad()\ndef get_training_rays_in_maskcache_sampling(rgb_tr_ori, train_poses, HW, Ks, ndc, inverse_y, flip_x, flip_y, model, render_kwargs):\n    print('get_training_rays_in_maskcache_sampling: start')\n    assert len(rgb_tr_ori) == len(train_poses) and len(rgb_tr_ori) == len(Ks) and len(rgb_tr_ori) == len(HW)\n    CHUNK = 64\n    DEVICE = rgb_tr_ori[0].device\n    eps_time = time.time()\n    N = sum(im.shape[0] * im.shape[1] for im in rgb_tr_ori)\n    rgb_tr = torch.zeros([N,3], device=DEVICE)\n    rays_o_tr = torch.zeros_like(rgb_tr)\n    rays_d_tr = torch.zeros_like(rgb_tr)\n    viewdirs_tr = torch.zeros_like(rgb_tr)\n    imsz = []\n    top = 0\n    for c2w, img, (H, W), K in zip(train_poses, rgb_tr_ori, HW, Ks):\n        assert img.shape[:2] == (H, W)\n        rays_o, rays_d, viewdirs = 
get_rays_of_a_view(\n                H=H, W=W, K=K, c2w=c2w, ndc=ndc,\n                inverse_y=inverse_y, flip_x=flip_x, flip_y=flip_y)\n        mask = torch.empty(img.shape[:2], device=DEVICE, dtype=torch.bool)\n        for i in range(0, img.shape[0], CHUNK):\n            mask[i:i+CHUNK] = model.hit_coarse_geo(\n                    rays_o=rays_o[i:i+CHUNK], rays_d=rays_d[i:i+CHUNK], **render_kwargs).to(DEVICE)\n        n = mask.sum()\n        rgb_tr[top:top+n].copy_(img[mask])\n        rays_o_tr[top:top+n].copy_(rays_o[mask].to(DEVICE))\n        rays_d_tr[top:top+n].copy_(rays_d[mask].to(DEVICE))\n        viewdirs_tr[top:top+n].copy_(viewdirs[mask].to(DEVICE))\n        imsz.append(n)\n        top += n\n\n    print('get_training_rays_in_maskcache_sampling: ratio', top / N)\n    rgb_tr = rgb_tr[:top]\n    rays_o_tr = rays_o_tr[:top]\n    rays_d_tr = rays_d_tr[:top]\n    viewdirs_tr = viewdirs_tr[:top]\n    eps_time = time.time() - eps_time\n    print('get_training_rays_in_maskcache_sampling: finish (eps time:', eps_time, 'sec)')\n    return rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, imsz\n\n\ndef batch_indices_generator(N, BS):\n    # torch.randperm on cuda produce incorrect results in my machine\n    idx, top = torch.LongTensor(np.random.permutation(N)), 0\n    while True:\n        if top + BS > N:\n            idx, top = torch.LongTensor(np.random.permutation(N)), 0\n        yield idx[top:top+BS]\n        top += BS\n\n"
  },
  {
    "path": "FourierGrid/grid.py",
    "content": "import os\nimport time\nimport functools\nimport numpy as np\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nimport render_utils_cuda\nimport total_variation_cuda\n\n# from torch.utils.cpp_extension import load\n# parent_dir = os.path.dirname(os.path.abspath(__file__))\n# render_utils_cuda = load(\n#         name='render_utils_cuda',\n#         sources=[\n#             os.path.join(parent_dir, path)\n#             for path in ['cuda/render_utils.cpp', 'cuda/render_utils_kernel.cu']],\n#         verbose=True)\n\n# total_variation_cuda = load(\n#         name='total_variation_cuda',\n#         sources=[\n#             os.path.join(parent_dir, path)\n#             for path in ['cuda/total_variation.cpp', 'cuda/total_variation_kernel.cu']],\n#         verbose=True)\n\n\ndef create_grid(type, **kwargs):\n    if type == 'DenseGrid':\n        return DenseGrid(**kwargs)\n    elif type == 'TensoRFGrid':\n        return TensoRFGrid(**kwargs)\n    else:\n        raise NotImplementedError\n\n\n''' Dense 3D grid\n'''\nclass DenseGrid(nn.Module):\n    def __init__(self, channels, world_size, xyz_min, xyz_max, **kwargs):\n        super(DenseGrid, self).__init__()\n        self.channels = channels\n        self.world_size = world_size\n        self.register_buffer('xyz_min', torch.Tensor(xyz_min))\n        self.register_buffer('xyz_max', torch.Tensor(xyz_max))\n        self.grid = nn.Parameter(torch.zeros([1, channels, *world_size]))\n\n    def forward(self, xyz):\n        '''\n        xyz: global coordinates to query\n        '''\n        shape = xyz.shape[:-1]\n        xyz = xyz.reshape(1,1,1,-1,3)\n        ind_norm = ((xyz - self.xyz_min) / (self.xyz_max - self.xyz_min)).flip((-1,)) * 2 - 1\n        out = F.grid_sample(self.grid, ind_norm, mode='bilinear', align_corners=True)\n        out = out.reshape(self.channels,-1).T.reshape(*shape,self.channels)\n        if self.channels == 1:\n            out = out.squeeze(-1)\n        
return out\n\n    def scale_volume_grid(self, new_world_size):\n        if self.channels == 0:\n            self.grid = nn.Parameter(torch.zeros([1, self.channels, *new_world_size]))\n        else:\n            self.grid = nn.Parameter(\n                F.interpolate(self.grid.data, size=tuple(new_world_size), mode='trilinear', align_corners=True))\n\n    def total_variation_add_grad(self, wx, wy, wz, dense_mode):\n        '''Add gradients by total variation loss in-place'''\n        total_variation_cuda.total_variation_add_grad(\n            self.grid, self.grid.grad, wx, wy, wz, dense_mode)\n\n    def get_dense_grid(self):\n        return self.grid\n\n    @torch.no_grad()\n    def __isub__(self, val):\n        self.grid.data -= val\n        return self\n\n    def extra_repr(self):\n        return f'channels={self.channels}, world_size={self.world_size.tolist()}'\n\n\n''' Vector-Matrix decomposited grid\nSee TensoRF: Tensorial Radiance Fields (https://arxiv.org/abs/2203.09517)\n'''\nclass TensoRFGrid(nn.Module):\n    def __init__(self, channels, world_size, xyz_min, xyz_max, config):\n        super(TensoRFGrid, self).__init__()\n        self.channels = channels\n        self.world_size = world_size\n        self.config = config\n        self.register_buffer('xyz_min', torch.Tensor(xyz_min))\n        self.register_buffer('xyz_max', torch.Tensor(xyz_max))\n        X, Y, Z = world_size\n        R = config['n_comp']\n        Rxy = config.get('n_comp_xy', R)\n        self.xy_plane = nn.Parameter(torch.randn([1, Rxy, X, Y]) * 0.1)\n        self.xz_plane = nn.Parameter(torch.randn([1, R, X, Z]) * 0.1)\n        self.yz_plane = nn.Parameter(torch.randn([1, R, Y, Z]) * 0.1)\n        self.x_vec = nn.Parameter(torch.randn([1, R, X, 1]) * 0.1)\n        self.y_vec = nn.Parameter(torch.randn([1, R, Y, 1]) * 0.1)\n        self.z_vec = nn.Parameter(torch.randn([1, Rxy, Z, 1]) * 0.1)\n        if self.channels > 1:\n            self.f_vec = nn.Parameter(torch.ones([R+R+Rxy, 
channels]))\n            nn.init.kaiming_uniform_(self.f_vec, a=np.sqrt(5))\n\n    def forward(self, xyz):\n        '''\n        xyz: global coordinates to query\n        '''\n        shape = xyz.shape[:-1]\n        xyz = xyz.reshape(1,1,-1,3)\n        ind_norm = (xyz - self.xyz_min) / (self.xyz_max - self.xyz_min) * 2 - 1\n        ind_norm = torch.cat([ind_norm, torch.zeros_like(ind_norm[...,[0]])], dim=-1)\n        if self.channels > 1:\n            out = compute_tensorf_feat(\n                    self.xy_plane, self.xz_plane, self.yz_plane,\n                    self.x_vec, self.y_vec, self.z_vec, self.f_vec, ind_norm)\n            out = out.reshape(*shape,self.channels)\n        else:\n            out = compute_tensorf_val(\n                    self.xy_plane, self.xz_plane, self.yz_plane,\n                    self.x_vec, self.y_vec, self.z_vec, ind_norm)\n            out = out.reshape(*shape)\n        return out\n\n    def scale_volume_grid(self, new_world_size):\n        if self.channels == 0:\n            return\n        X, Y, Z = new_world_size\n        self.xy_plane = nn.Parameter(F.interpolate(self.xy_plane.data, size=[X,Y], mode='bilinear', align_corners=True))\n        self.xz_plane = nn.Parameter(F.interpolate(self.xz_plane.data, size=[X,Z], mode='bilinear', align_corners=True))\n        self.yz_plane = nn.Parameter(F.interpolate(self.yz_plane.data, size=[Y,Z], mode='bilinear', align_corners=True))\n        self.x_vec = nn.Parameter(F.interpolate(self.x_vec.data, size=[X,1], mode='bilinear', align_corners=True))\n        self.y_vec = nn.Parameter(F.interpolate(self.y_vec.data, size=[Y,1], mode='bilinear', align_corners=True))\n        self.z_vec = nn.Parameter(F.interpolate(self.z_vec.data, size=[Z,1], mode='bilinear', align_corners=True))\n\n    def total_variation_add_grad(self, wx, wy, wz, dense_mode):\n        '''Add gradients by total variation loss in-place'''\n        loss = wx * F.smooth_l1_loss(self.xy_plane[:,:,1:], self.xy_plane[:,:,:-1], 
reduction='sum') +\\\n               wy * F.smooth_l1_loss(self.xy_plane[:,:,:,1:], self.xy_plane[:,:,:,:-1], reduction='sum') +\\\n               wx * F.smooth_l1_loss(self.xz_plane[:,:,1:], self.xz_plane[:,:,:-1], reduction='sum') +\\\n               wz * F.smooth_l1_loss(self.xz_plane[:,:,:,1:], self.xz_plane[:,:,:,:-1], reduction='sum') +\\\n               wy * F.smooth_l1_loss(self.yz_plane[:,:,1:], self.yz_plane[:,:,:-1], reduction='sum') +\\\n               wz * F.smooth_l1_loss(self.yz_plane[:,:,:,1:], self.yz_plane[:,:,:,:-1], reduction='sum') +\\\n               wx * F.smooth_l1_loss(self.x_vec[:,:,1:], self.x_vec[:,:,:-1], reduction='sum') +\\\n               wy * F.smooth_l1_loss(self.y_vec[:,:,1:], self.y_vec[:,:,:-1], reduction='sum') +\\\n               wz * F.smooth_l1_loss(self.z_vec[:,:,1:], self.z_vec[:,:,:-1], reduction='sum')\n        loss /= 6\n        loss.backward()\n\n    def get_dense_grid(self):\n        if self.channels > 1:\n            feat = torch.cat([\n                torch.einsum('rxy,rz->rxyz', self.xy_plane[0], self.z_vec[0,:,:,0]),\n                torch.einsum('rxz,ry->rxyz', self.xz_plane[0], self.y_vec[0,:,:,0]),\n                torch.einsum('ryz,rx->rxyz', self.yz_plane[0], self.x_vec[0,:,:,0]),\n            ])\n            grid = torch.einsum('rxyz,rc->cxyz', feat, self.f_vec)[None]\n        else:\n            grid = torch.einsum('rxy,rz->xyz', self.xy_plane[0], self.z_vec[0,:,:,0]) + \\\n                   torch.einsum('rxz,ry->xyz', self.xz_plane[0], self.y_vec[0,:,:,0]) + \\\n                   torch.einsum('ryz,rx->xyz', self.yz_plane[0], self.x_vec[0,:,:,0])\n            grid = grid[None,None]\n        return grid\n\n    def extra_repr(self):\n        return f'channels={self.channels}, world_size={self.world_size.tolist()}, n_comp={self.config[\"n_comp\"]}'\n\ndef compute_tensorf_feat(xy_plane, xz_plane, yz_plane, x_vec, y_vec, z_vec, f_vec, ind_norm):\n    # Interp feature (feat shape: [n_pts, n_comp])\n    xy_feat = 
F.grid_sample(xy_plane, ind_norm[:,:,:,[1,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    xz_feat = F.grid_sample(xz_plane, ind_norm[:,:,:,[2,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    yz_feat = F.grid_sample(yz_plane, ind_norm[:,:,:,[2,1]], mode='bilinear', align_corners=True).flatten(0,2).T\n    x_feat = F.grid_sample(x_vec, ind_norm[:,:,:,[3,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    y_feat = F.grid_sample(y_vec, ind_norm[:,:,:,[3,1]], mode='bilinear', align_corners=True).flatten(0,2).T\n    z_feat = F.grid_sample(z_vec, ind_norm[:,:,:,[3,2]], mode='bilinear', align_corners=True).flatten(0,2).T\n    # Aggregate components\n    feat = torch.cat([\n        xy_feat * z_feat,\n        xz_feat * y_feat,\n        yz_feat * x_feat,\n    ], dim=-1)\n    feat = torch.mm(feat, f_vec)\n    return feat\n\ndef compute_tensorf_val(xy_plane, xz_plane, yz_plane, x_vec, y_vec, z_vec, ind_norm):\n    # Interp feature (feat shape: [n_pts, n_comp])\n    xy_feat = F.grid_sample(xy_plane, ind_norm[:,:,:,[1,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    xz_feat = F.grid_sample(xz_plane, ind_norm[:,:,:,[2,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    yz_feat = F.grid_sample(yz_plane, ind_norm[:,:,:,[2,1]], mode='bilinear', align_corners=True).flatten(0,2).T\n    x_feat = F.grid_sample(x_vec, ind_norm[:,:,:,[3,0]], mode='bilinear', align_corners=True).flatten(0,2).T\n    y_feat = F.grid_sample(y_vec, ind_norm[:,:,:,[3,1]], mode='bilinear', align_corners=True).flatten(0,2).T\n    z_feat = F.grid_sample(z_vec, ind_norm[:,:,:,[3,2]], mode='bilinear', align_corners=True).flatten(0,2).T\n    # Aggregate components\n    feat = (xy_feat * z_feat).sum(-1) + (xz_feat * y_feat).sum(-1) + (yz_feat * x_feat).sum(-1)\n    return feat\n\n\n''' Mask grid\nIt supports query for the known free space and unknown space.\n'''\nclass MaskGrid(nn.Module):\n    def __init__(self, path=None, mask_cache_thres=None, mask=None, 
xyz_min=None, xyz_max=None):\n        super(MaskGrid, self).__init__()\n        if path is not None:\n            st = torch.load(path)\n            self.mask_cache_thres = mask_cache_thres\n            density_grid = st['model_state_dict']['density.grid']\n            if density_grid.shape[1] > 1:   # handle FourierGrid, TODO: revise this\n                density_grid = density_grid[0][0].unsqueeze(0).unsqueeze(0)\n            density = F.max_pool3d(density_grid, kernel_size=3, padding=1, stride=1)\n            alpha = 1 - torch.exp(-F.softplus(density + st['model_state_dict']['act_shift']) * st['model_kwargs']['voxel_size_ratio'])\n            mask = (alpha >= self.mask_cache_thres).squeeze(0).squeeze(0)\n            xyz_min = torch.Tensor(st['model_kwargs']['xyz_min'])\n            xyz_max = torch.Tensor(st['model_kwargs']['xyz_max'])\n        else:\n            mask = mask.bool()\n            xyz_min = torch.Tensor(xyz_min)\n            xyz_max = torch.Tensor(xyz_max)\n        self.register_buffer('mask', mask)\n        xyz_len = xyz_max - xyz_min\n        self.register_buffer('xyz2ijk_scale', (torch.Tensor(list(mask.shape)) - 1) / xyz_len)\n        self.register_buffer('xyz2ijk_shift', -xyz_min * self.xyz2ijk_scale)\n\n    @torch.no_grad()\n    def forward(self, xyz):\n        '''Skip known freespace\n        @xyz:   [..., 3] the xyz in global coordinate.\n        '''\n        shape = xyz.shape[:-1]\n        xyz = xyz.reshape(-1, 3)\n        mask = render_utils_cuda.maskcache_lookup(self.mask, xyz, self.xyz2ijk_scale, self.xyz2ijk_shift)\n        mask = mask.reshape(shape)\n        return mask\n\n    def extra_repr(self):\n        return f'mask.shape={list(self.mask.shape)}'\n\n"
  },
  {
    "path": "FourierGrid/load_everything.py",
    "content": "import torch\nfrom FourierGrid.common_data_loaders.load_common_data import load_common_data\nfrom FourierGrid.load_waymo import load_waymo_data\nfrom FourierGrid.load_mega import load_mega_data\nfrom FourierGrid import utils, dvgo, dcvgo, dmpigo\nfrom FourierGrid.FourierGrid_model import FourierGridModel\n\n\ndef load_everything(args, cfg):\n    '''Load images / poses / camera settings / data split.\n    '''\n    if cfg.data.dataset_type == \"waymo\":\n        data_dict = load_waymo_data(args, cfg)\n        return data_dict, args\n    else:\n        data_dict = load_common_data(cfg.data)\n    # remove useless field\n    # kept_keys = {'hwf', 'HW', 'Ks', 'near', 'far', 'near_clip',\n    #         'i_train', 'i_val', 'i_test', 'irregular_shape',\n    #         'poses', 'render_poses', 'images'}  # hwf is not used\n    kept_keys = {'HW', 'Ks', 'near', 'far', 'near_clip',\n            'i_train', 'i_val', 'i_test', 'irregular_shape',\n            'poses', 'render_poses', 'images'}\n    for k in list(data_dict.keys()):\n        if k not in kept_keys:\n            data_dict.pop(k)\n\n    # construct data tensor\n    if data_dict['irregular_shape']:\n        data_dict['images'] = [torch.FloatTensor(im, device='cpu') for im in data_dict['images']]\n    else:\n        data_dict['images'] = torch.FloatTensor(data_dict['images'], device='cpu')\n    data_dict['poses'] = torch.Tensor(data_dict['poses'])\n    if args.sample_num > 0:\n        data_dict['i_train'] = data_dict['i_train'][:args.sample_num]\n    else:\n        args.sample_num = len(data_dict['i_train'])\n    return data_dict, args\n\n\ndef load_existing_model(args, cfg, cfg_train, reload_ckpt_path, device):\n    FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\"]\n    if cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid':\n        model_class = FourierGridModel\n    elif cfg.data.ndc:\n        model_class = dmpigo.DirectMPIGO\n    elif cfg.data.unbounded_inward:\n        
model_class = dcvgo.DirectContractedVoxGO\n    else:\n        model_class = dvgo.DirectVoxGO\n    model = utils.load_model(model_class, reload_ckpt_path).to(device)\n    optimizer = utils.create_optimizer_or_freeze_model(model, cfg_train, global_step=0)\n    model, optimizer, start = utils.load_checkpoint(\n            model, optimizer, reload_ckpt_path, args.no_reload_optimizer)\n    return model, optimizer, start\n"
  },
  {
    "path": "FourierGrid/load_mega.py",
    "content": "'''\nModify from\nhttps://github.com/Kai-46/nerfplusplus/blob/master/data_loader_split.py\n'''\nimport os\nimport pdb\nimport glob\nfrom tkinter.tix import HList\nimport scipy\nimport imageio\nimport numpy as np\nimport torch\nfrom tqdm import tqdm\nimport json\nfrom scipy.spatial.transform import Rotation as R\nfrom FourierGrid.common_data_loaders.load_llff import normalize\nfrom FourierGrid.trajectory_generators.waymo_traj import *\nfrom FourierGrid.trajectory_generators.mega_traj import *\n\n########################################################################################################################\n# camera coordinate system: x-->right, y-->down, z-->scene (opencv/colmap convention)\n# poses is camera-to-world\n########################################################################################################################\ndef find_files(dir, exts):\n    if os.path.isdir(dir):\n        files_grabbed = []\n        for ext in exts:\n            files_grabbed.extend(glob.glob(os.path.join(dir, ext)))\n        if len(files_grabbed) > 0:\n            files_grabbed = sorted(files_grabbed)\n        return files_grabbed\n    else:\n        return []\n\n\ndef waymo_load_img_list(split_dir, skip=1):\n    # img files\n    img_files = find_files('{}'.format(split_dir), exts=['*.png', '*.jpg'])\n    if len(img_files) > 0:\n        img_files = img_files[::skip]\n    else:\n        raise RuntimeError(f\"Cannot find image files at {split_dir}.\")\n\n    return img_files\n\n\ndef sample_list_by_idx(one_list, idxs):\n    # allow idxs to be out of range\n    return [one_list[idx] for idx in idxs if idx < len(one_list)]\n    \n    \ndef sample_metadata_by_cam(metadata, cam_idx):\n    for split in metadata:\n        sample_idxs = []\n        for idx, cam_id in enumerate(metadata[split]['cam_idx']):\n            if cam_id == cam_idx:\n                sample_idxs.append(idx)\n        for one_k in metadata[split]:\n            metadata[split][one_k] 
= sample_list_by_idx(metadata[split][one_k], sample_idxs)\n    return metadata\n    \n\ndef find_most_freq_ele(one_list):\n    most_freq_ele = max(set(one_list), key = one_list.count)\n    freq_count = one_list.count(most_freq_ele)\n    return most_freq_ele, freq_count\n\n\ndef sample_metadata_by_shape(metadata):\n    # only leave images with the same shape\n    w_list, h_list = metadata['train']['width'], metadata['train']['height']\n    wh_list = list(zip(w_list, h_list))\n    wh_most_freq, _ = find_most_freq_ele(wh_list)\n    for split in metadata:\n        cur_wh_list = list(zip(metadata[split]['width'], metadata[split]['height']))\n        filtered_idx = [idx for idx in range(len(cur_wh_list)) if cur_wh_list[idx] == wh_most_freq]\n        for one_k in metadata[split]:\n            metadata[split][one_k] = sample_list_by_idx(metadata[split][one_k], filtered_idx)\n    return metadata\n    \n\ndef sample_metadata_by_idxs(metadata, sample_idxs):\n    if sample_idxs is None:\n        return metadata\n    for split in metadata:\n        for one_k in metadata[split]:\n            metadata[split][one_k] = sample_list_by_idx(metadata[split][one_k], sample_idxs)\n    return metadata\n\n\ndef sort_metadata_by_pos(metadata):\n    # find the central image.\n    train_positions = []\n    for c2w in metadata['train']['cam2world']:\n        pos = np.array(c2w)[:3, 3]\n        train_positions.append(pos)\n    center_pos = np.mean(train_positions, 0)\n    dis = [np.linalg.norm(pos-center_pos) for pos in train_positions]\n    center_id = dis.index(np.min(dis))\n    # sort images by position\n    list_idxs = list(range(len(dis)))\n    sorted_idxs = sorted(zip(list_idxs, dis), key=lambda row: (row[1]))\n    sorted_idxs = [idx[0] for idx in sorted_idxs]\n    for one_k in metadata['train']:\n        metadata['train'][one_k] = sample_list_by_idx(metadata['train'][one_k], sorted_idxs)\n    return metadata\n\n\ndef normalize(x):\n    return x / np.linalg.norm(x)\n\n\ndef viewmatrix(z, 
up, pos):\n    vec2 = normalize(z)\n    vec1_avg = up\n    vec0 = normalize(np.cross(vec1_avg, vec2))\n    vec1 = normalize(np.cross(vec2, vec0))\n    m = np.stack([vec0, vec1, vec2, pos], 1)\n    return m\n\n\ndef poses_avg(poses):\n    hwf = poses[0, :3, -1:]\n    center = poses[:, :3, 3].mean(0)\n    vec2 = normalize(poses[:, :3, 2].sum(0))\n    up = poses[:, :3, 1].sum(0)\n    c2w = np.concatenate([viewmatrix(vec2, up, center), hwf], 1)\n    return c2w\n    \n\ndef sample_metadata_by_training_ids(metadata, training_ids, assign_pos, assign_rot):\n    if training_ids is None:\n        return metadata\n    for split in metadata:\n        if split != 'train':\n            continue\n        else:\n            sample_idxs = []\n            for ele in training_ids:\n                full_path = f'images_train/{ele}.jpg'\n                if full_path in metadata['train']['file_path']:\n                    sample_idxs.append(metadata['train']['file_path'].index(full_path))\n            assert len(sample_idxs) > 0, \"No image is selected by training id!\"\n            for one_k in metadata[split]:\n                metadata[split][one_k] = sample_list_by_idx(metadata[split][one_k], sample_idxs)\n            if assign_pos is not None:\n                for ele in assign_pos:\n                    full_path = f'images_train/{ele}.png'\n                    index = metadata[split]['file_path'].index(full_path)\n                    metadata[split]['position'][index] = assign_pos[ele]\n                    temp_c2w = np.array(metadata[split]['cam2world'][index])\n                    # update position\n                    temp_c2w[:3, -1] = np.array(metadata[split]['position'][index])\n                    trans_rot = R.from_matrix(temp_c2w[:3, :3]).as_euler('yzx', degrees=True)\n                    print(full_path, trans_rot)\n                    new_rot = assign_rot[ele]\n                    r = R.from_euler('yzx', new_rot, degrees=True)\n                    temp_c2w[:3, :3] = 
r.as_matrix()\n                    metadata[split]['cam2world'][index] = temp_c2w.tolist()\n    return metadata\n\n\ndef load_mega(args, cfg, ):\n    data_cfg = cfg.data\n    load_img = False if args.program == \"gen_trace\" else True\n    basedir = data_cfg.datadir\n    with open(os.path.join(basedir, f'metadata.json'), 'r') as fp:\n        metadata = json.load(fp)\n    if 'sample_cam' in data_cfg:\n        metadata = sample_metadata_by_cam(metadata, data_cfg['sample_cam'])\n    if args.sample_num > 0:\n        sample_idxs = list(range(0, args.sample_num * data_cfg['sample_interval'], data_cfg['sample_interval']))\n        assert args.sample_num * data_cfg['sample_interval'] < len(metadata['train']['file_path']), \\\n            f\"Not enough data to train with given sample interval: {data_cfg['sample_interval']}!\"\n    elif 'sample_idxs' in data_cfg:\n        sample_idxs = data_cfg['sample_idxs']\n    else:\n        sample_idxs = None\n    metadata = sort_metadata_by_pos(metadata)\n    metadata = sample_metadata_by_shape(metadata)  # sample the most common shape\n    metadata = sample_metadata_by_idxs(metadata, sample_idxs)\n    if \"training_ids\" in cfg.data:\n        training_ids = cfg.data.training_ids\n        metadata = sample_metadata_by_training_ids(metadata, training_ids, None, None)\n    # The validation datasets are from the official val split, \n    # but the testing splits are hard-coded sequences (completely novel views)\n    tr_im_path, val_im_path = metadata['train']['file_path'], metadata['val']['file_path']\n    tr_c2w, val_c2w = metadata['train']['cam2world'], metadata['val']['cam2world']\n    tr_K, val_K = metadata['train']['K'], metadata['val']['K']\n\n    # Determine split id list\n    i_split = [[], [], []]\n    loop_id = 0\n    for _ in tr_c2w:\n        i_split[0].append(loop_id)\n        loop_id += 1\n    for _ in val_c2w:\n        i_split[1].append(loop_id)\n        loop_id += 1\n\n    # Load camera poses\n    poses = []\n    for c2w in 
tr_c2w:\n        poses.append(np.array(c2w).reshape(4,4))\n    for c2w in val_c2w:\n        poses.append(np.array(c2w).reshape(4,4))\n\n    # Load images\n    if not load_img:\n        imgs = tr_im_path + val_im_path  # do not load all the images\n    else:\n        imgs = []\n        print(f\"Loading all the images to disk.\")\n        for path in tqdm(tr_im_path):\n            imgs.append(imageio.imread(os.path.join(basedir, path)) / 255.)\n        for path in tqdm(val_im_path):\n            imgs.append(imageio.imread(os.path.join(basedir, path)) / 255.) \n\n    train_HW = np.array([[metadata['train']['height'][i], metadata['train']['width'][i]] \n                         for i in range(len(metadata['train']['height']))]).tolist()\n    val_HW = np.array([[metadata['val']['height'][i], metadata['val']['width'][i]] \n                       for i in range(len(metadata['val']['height']))]).tolist()\n\n    te_c2w, test_HW, test_K = \\\n        gen_rotational_trajs(args, cfg, metadata, tr_c2w, train_HW, tr_K, \n                       rotate_angle=data_cfg.test_rotate_angle)\n    # dummy test paths\n    # te_c2w, test_HW, test_K = gen_dummy_trajs(metadata, tr_c2w, train_HW, tr_K)\n    \n    for _ in te_c2w:\n        i_split[2].append(loop_id)\n        loop_id += 1\n    for c2w in te_c2w:\n        poses.append(np.array(c2w).reshape(4,4))\n    \n    # Bundle all the data\n    all_K = np.array(tr_K + val_K + test_K)\n    hws = train_HW + val_HW + test_HW\n    hws = [[int(hw[0]), int(hw[1])] for hw in hws]\n    HW = np.array(hws)\n    poses = np.stack(poses, 0)\n    if load_img:\n        imgs = np.stack(imgs)\n    render_poses = te_c2w\n    return imgs, poses, render_poses, HW, all_K, i_split\n\n\ndef inward_nearfar_heuristic(cam_o, ratio=0.05):\n    dist = np.linalg.norm(cam_o[:,None] - cam_o, axis=-1)\n    far = dist.max()  # could be too small to exist the scene bbox\n                      # it is only used to determined scene bbox\n                      # lib/dvgo use 
1e9 as far\n    near = far * ratio\n    return near, far\n\n\ndef load_mega_data(args, cfg):\n    data_cfg = cfg.data\n    K, depths = None, None\n    near_clip = None\n    images, poses, render_poses, HW, K, i_split = load_mega(args, cfg)\n    print(f\"Loaded MEGA dataset.\")\n    i_train, i_val, i_test = i_split\n    near_clip, far = inward_nearfar_heuristic(poses[i_train, :3, 3], ratio=0.02)  # not used too much in fact\n    \n    # load near and far parameters\n    if \"near_clip\" in data_cfg:\n        near_clip = data_cfg['near_clip']\n    if 'near' in data_cfg:\n        near = data_cfg['near']\n    if 'far' in data_cfg:\n        far = data_cfg['far']\n    Ks = np.array(K)\n    irregular_shape = False\n    data_dict = dict(\n        HW=HW, Ks=Ks, near=near, far=far, near_clip=near_clip,\n        i_train=i_train, i_val=i_val, i_test=i_test,\n        poses=poses, render_poses=render_poses, images=images, depths=depths, irregular_shape=irregular_shape\n    )\n    data_dict['poses'] = torch.tensor(data_dict['poses']).float()\n    # TODO: change to device = cuda to avoid load-on-the-fly costs\n    data_dict['images'] = torch.tensor(data_dict['images'], device='cpu').float()\n    return data_dict\n"
  },
  {
    "path": "FourierGrid/load_waymo.py",
    "content": "'''\nModify from\nhttps://github.com/Kai-46/nerfplusplus/blob/master/data_loader_split.py\n'''\nimport os\nimport pdb\nfrom tkinter import image_names\nimport cv2\nimport glob\nimport scipy\nimport imageio\nimport shutil\nimport numpy as np\nimport torch\nfrom tqdm import tqdm\nimport json\nfrom scipy.spatial.transform import Rotation as R\nfrom FourierGrid.common_data_loaders.load_llff import normalize\nfrom FourierGrid.trajectory_generators.waymo_traj import *\n\n########################################################################################################################\n# camera coordinate system: x-->right, y-->down, z-->scene (opencv/colmap convention)\n# poses is camera-to-world\n########################################################################################################################\ndef find_files(dir, exts):\n    if os.path.isdir(dir):\n        files_grabbed = []\n        for ext in exts:\n            files_grabbed.extend(glob.glob(os.path.join(dir, ext)))\n        if len(files_grabbed) > 0:\n            files_grabbed = sorted(files_grabbed)\n        return files_grabbed\n    else:\n        return []\n\n\ndef waymo_load_img_list(split_dir, skip=1):\n    # img files\n    img_files = find_files('{}'.format(split_dir), exts=['*.png', '*.jpg'])\n    if len(img_files) > 0:\n        img_files = img_files[::skip]\n    else:\n        raise RuntimeError(f\"Cannot find image files at {split_dir}.\")\n    return img_files\n\n\ndef rerotate_poses(poses, render_poses):\n    poses = np.copy(poses)\n    centroid = poses[:,:3,3].mean(0)\n\n    poses[:,:3,3] = poses[:,:3,3] - centroid\n\n    # Find the minimum pca vector with minimum eigen value\n    x = poses[:,:3,3]\n    mu = x.mean(0)\n    cov = np.cov((x-mu).T)\n    ev , eig = np.linalg.eig(cov)\n    cams_up = eig[:,np.argmin(ev)]\n    if cams_up[1] < 0:\n        cams_up = -cams_up\n\n    # Find rotation matrix that align cams_up with [0,1,0]\n    R = 
scipy.spatial.transform.Rotation.align_vectors(\n            [[0,-1,0]], cams_up[None])[0].as_matrix()\n\n    # Apply rotation and add back the centroid position\n    poses[:,:3,:3] = R @ poses[:,:3,:3]\n    poses[:,:3,[3]] = R @ poses[:,:3,[3]]\n    poses[:,:3,3] = poses[:,:3,3] + centroid\n    render_poses = np.copy(render_poses)\n    render_poses[:,:3,3] = render_poses[:,:3,3] - centroid\n    render_poses[:,:3,:3] = R @ render_poses[:,:3,:3]\n    render_poses[:,:3,[3]] = R @ render_poses[:,:3,[3]]\n    render_poses[:,:3,3] = render_poses[:,:3,3] + centroid\n    return poses, render_poses\n\n\ndef sample_list_by_idx(one_list, idxs):\n    # allow idxs to be out of range\n    return [one_list[idx] for idx in idxs if idx < len(one_list)]\n    \n    \ndef sample_metadata_by_cam(metadata, cam_idx):\n    for split in metadata:\n        sample_idxs = []\n        for idx, cam_id in enumerate(metadata[split]['cam_idx']):\n            if cam_id == cam_idx:\n                sample_idxs.append(idx)\n        for one_k in metadata[split]:\n            metadata[split][one_k] = sample_list_by_idx(metadata[split][one_k], sample_idxs)\n    return metadata\n    \n\ndef sample_metadata_by_idxs(metadata, sample_idxs, val_num=5):\n    if sample_idxs is None:\n        for split in metadata:\n            if split != 'train':   # validation is not that important\n                sample_idxs = list(range(val_num))\n                for one_k in metadata[split]:\n                    metadata[split][one_k] = sample_list_by_idx(metadata[split][one_k], sample_idxs)\n        return metadata\n    for split in metadata:\n        if split != 'train':   # validation is not that important\n            sample_idxs = sample_idxs[:val_num]\n        for one_k in metadata[split]:\n            metadata[split][one_k] = sample_list_by_idx(metadata[split][one_k], sample_idxs)\n    return metadata\n\n\ndef sample_metadata_by_training_ids(metadata, training_ids, assign_pos, assign_rot):\n    if training_ids is 
None:\n        return metadata\n    for split in metadata:\n        if split != 'train':\n            continue\n        else:\n            sample_idxs = []\n            for ele in training_ids:\n                full_path = f'images_train/{ele}.png'\n                if full_path in metadata['train']['file_path']:\n                    sample_idxs.append(metadata['train']['file_path'].index(full_path))\n            assert len(sample_idxs) > 0, \"No image is selected by training id!\"\n            for one_k in metadata[split]:\n                metadata[split][one_k] = sample_list_by_idx(metadata[split][one_k], sample_idxs)\n            if assign_pos is not None:\n                for ele in assign_pos:\n                    full_path = f'images_train/{ele}.png'\n                    index = metadata[split]['file_path'].index(full_path)\n                    metadata[split]['position'][index] = assign_pos[ele]\n                    temp_c2w = np.array(metadata[split]['cam2world'][index])\n                    # update position\n                    temp_c2w[:3, -1] = np.array(metadata[split]['position'][index])\n                    trans_rot = R.from_matrix(temp_c2w[:3, :3]).as_euler('yzx', degrees=True)\n                    print(full_path, trans_rot)\n                    new_rot = assign_rot[ele]\n                    r = R.from_euler('yzx', new_rot, degrees=True)\n                    temp_c2w[:3, :3] = r.as_matrix()\n                    metadata[split]['cam2world'][index] = temp_c2w.tolist()\n    return metadata\n\n\ndef sort_metadata_by_pos(metadata):\n    for split in metadata:\n        list_idxs = list(range(len(metadata[split]['position'])))\n        # first sort y, then x\n        sorted_idxs = sorted(zip(list_idxs, metadata[split]['position']), key=lambda row: (row[1][1], row[1][0]))\n        sorted_idxs = [i for i, j in sorted_idxs]\n        for one_k in metadata[split]:\n            metadata[split][one_k] = sample_list_by_idx(metadata[split][one_k], sorted_idxs)\n    
return metadata\n\n\ndef normalize(x):\n    return x / np.linalg.norm(x)\n\n\ndef viewmatrix(z, up, pos):\n    vec2 = normalize(z)\n    vec1_avg = up\n    vec0 = normalize(np.cross(vec1_avg, vec2))\n    vec1 = normalize(np.cross(vec2, vec0))\n    m = np.stack([vec0, vec1, vec2, pos], 1)\n    return m\n\n\ndef ptstocam(pts, c2w):\n    tt = np.matmul(c2w[:3,:3].T, (pts-c2w[:3,3])[...,np.newaxis])[...,0]\n    return tt\n\n\ndef poses_avg(poses):\n    hwf = poses[0, :3, -1:]\n    center = poses[:, :3, 3].mean(0)\n    vec2 = normalize(poses[:, :3, 2].sum(0))\n    up = poses[:, :3, 1].sum(0)\n    c2w = np.concatenate([viewmatrix(vec2, up, center), hwf], 1)\n    return c2w\n\n\ndef recenter_poses(poses):\n    poses_ = poses+0\n    bottom = np.reshape([0,0,0,1.], [1,4])\n    c2w = poses_avg(poses)\n    c2w = np.concatenate([c2w[:3,:4], bottom], -2)\n    bottom = np.tile(np.reshape(bottom, [1,1,4]), [poses.shape[0],1,1])\n    poses = np.concatenate([poses[:,:3,:4], bottom], -2)\n\n    poses = np.linalg.inv(c2w) @ poses\n    poses_[:,:3,:4] = poses[:,:3,:4]\n    poses = poses_\n    return poses\n\n\ndef find_most_freq_ele(one_list):\n    most_freq_ele = max(set(one_list), key = one_list.count)\n    freq_count = one_list.count(most_freq_ele)\n    return most_freq_ele, freq_count\n\n\ndef save_training_imgs_to_disk(args, cfg, metadata):\n    exp_folder = os.path.join(cfg.basedir, cfg.expname)\n    data_folder = cfg.data.datadir\n    train_imgs = metadata['train']['file_path']\n    os.makedirs(exp_folder, exist_ok=True)\n    for idx, train_img in enumerate(tqdm(train_imgs)):\n        full_data_path = os.path.join(data_folder, train_img)\n        assert os.path.exists(full_data_path), f\"{full_data_path} does not exist!\"\n        shutil.copyfile(full_data_path, os.path.join(exp_folder, train_img.split(\"/\")[-1]))\n        print(f\"img file saved at {exp_folder}.\")\n    return\n\n\ndef resize_img(train_HW, val_HW, imgs, tr_K, val_K):\n    target_h, _ = 
find_most_freq_ele([hw[0] for hw in train_HW])\n    target_w, _ = find_most_freq_ele([hw[1] for hw in train_HW])\n    imgs = [cv2.resize(img, dsize=(target_w, target_h), interpolation=cv2.INTER_CUBIC) for img in imgs]\n    for idx, one_k in enumerate(tr_K):\n        h_before, w_before = train_HW[idx]\n        assert h_before == tr_K[idx][1][2] * 2\n        assert w_before == tr_K[idx][0][2] * 2\n        h_ratio = target_h / h_before\n        w_ratio = target_w / w_before\n        # alpha x\n        tr_K[idx][0][0] = tr_K[idx][0][0] * w_ratio\n        # x0\n        tr_K[idx][0][2] = tr_K[idx][0][2] * w_ratio\n        # alpha y\n        tr_K[idx][1][1] = tr_K[idx][1][1] * h_ratio\n        # y0\n        tr_K[idx][1][2] = tr_K[idx][1][2] * h_ratio\n        assert target_w == tr_K[idx][0][2] * 2\n        assert target_h == tr_K[idx][1][2] * 2\n    for idx, one_k in enumerate(val_K):\n        h_before, w_before = val_HW[idx]\n        assert h_before == val_K[idx][1][2] * 2\n        assert w_before == val_K[idx][0][2] * 2\n        h_ratio = target_h / h_before\n        w_ratio = target_w / w_before\n        # alpha x\n        val_K[idx][0][0] = val_K[idx][0][0] * w_ratio\n        # x0\n        val_K[idx][0][2] = val_K[idx][0][2] * w_ratio\n        # alpha y\n        val_K[idx][1][1] = val_K[idx][1][1] * h_ratio\n        # y0\n        val_K[idx][1][2] = val_K[idx][1][2] * h_ratio\n        assert target_w == val_K[idx][0][2] * 2\n        assert target_h == val_K[idx][1][2] * 2\n    train_HW = [[target_h, target_w] for hw in train_HW]\n    val_HW = [[target_h, target_w] for hw in val_HW]\n    return train_HW, val_HW, imgs, tr_K, val_K\n\n\ndef find_rotations_from_meta(metadata):\n    rotations = []\n    for idx, c2w in enumerate(metadata['train']['cam2world']):\n        rot = np.array(c2w)[:3, :3]\n        trans_rot = R.from_matrix(rot).as_euler('yzx', degrees=True)\n        rotations.append(trans_rot)\n    return rotations\n    \n\ndef load_waymo(args, cfg, ):\n    data_cfg 
= cfg.data\n    load_img = False if args.program == \"gen_trace\" else True\n    basedir = data_cfg.datadir\n    with open(os.path.join(basedir, f'metadata.json'), 'r') as fp:\n        metadata = json.load(fp)\n    if 'sample_cam' in data_cfg:\n        metadata = sample_metadata_by_cam(metadata, data_cfg['sample_cam'])\n    if args.sample_num > 0:\n        sample_idxs = list(range(0, args.sample_num * data_cfg['sample_interval'], data_cfg['sample_interval']))\n        assert args.sample_num * data_cfg['sample_interval'] < len(metadata['train']['file_path']), \\\n            f\"Not enough data to train with given sample interval: {data_cfg['sample_interval']}!\"\n    elif 'sample_idxs' in data_cfg:\n        sample_idxs = data_cfg['sample_idxs']\n    else:\n        sample_idxs = None\n    \n    metadata = sort_metadata_by_pos(metadata)\n    metadata = sample_metadata_by_idxs(metadata, sample_idxs)\n\n    if \"training_ids\" in cfg.data:\n        training_ids = cfg.data.training_ids\n        metadata = sample_metadata_by_training_ids(metadata, training_ids, None, None)\n    rotations = find_rotations_from_meta(metadata)\n    if args.diffuse:\n        for idx, fp in enumerate(metadata['train']['file_path']):\n            img_name = fp.split(\"/\")[-1].replace(\".png\", \"\")\n            diffuse_replace = cfg.diffusion.diff_replace\n            if img_name in diffuse_replace:\n                img_path = os.path.join(cfg.diffusion.diff_root, diffuse_replace[img_name] + \".png\")\n                metadata['train']['file_path'][idx] = img_path\n    \n    # The validation datasets are from the official val split, \n    # but the testing splits are hard-coded sequences (completely novel views)\n    tr_cam_idx, val_cam_idx = metadata['train']['cam_idx'], metadata['val']['cam_idx']\n    cam_idxs = tr_cam_idx + val_cam_idx\n    train_pos, val_pos = metadata['train']['position'], metadata['val']['position']\n    positions = train_pos + val_pos\n    tr_im_path, val_im_path = 
metadata['train']['file_path'], metadata['val']['file_path']\n    tr_c2w, val_c2w = metadata['train']['cam2world'], metadata['val']['cam2world']\n    tr_K, val_K = metadata['train']['K'], metadata['val']['K']\n\n    # Determine split id list\n    i_split = [[], [], []]\n    loop_id = 0\n    for _ in tr_c2w:\n        i_split[0].append(loop_id)\n        loop_id += 1\n    for _ in val_c2w:\n        i_split[1].append(loop_id)\n        loop_id += 1\n\n    # Load camera poses\n    poses = []\n    for c2w in tr_c2w:\n        poses.append(np.array(c2w).reshape(4,4))\n    for c2w in val_c2w:\n        poses.append(np.array(c2w).reshape(4,4))\n\n    # Load images\n    if not load_img:\n        imgs = tr_im_path + val_im_path  # do not load all the images\n    else:\n        imgs = []\n        print(f\"Loading all the images to disk.\")\n        for path in tqdm(tr_im_path):\n            imgs.append(imageio.imread(os.path.join(basedir, path)) / 255.)\n        for path in tqdm(val_im_path):\n            imgs.append(imageio.imread(os.path.join(basedir, path)) / 255.) 
\n    \n    train_HW = np.array([[metadata['train']['height'][i], metadata['train']['width'][i]] \n                         for i in range(len(metadata['train']['height']))]).tolist()\n    val_HW = np.array([[metadata['val']['height'][i], metadata['val']['width'][i]] \n                       for i in range(len(metadata['val']['height']))]).tolist()\n\n    if args.save_train_imgs:\n        save_training_imgs_to_disk(args, cfg, metadata)\n    train_HW, val_HW, imgs, tr_K, val_K = resize_img(train_HW, val_HW, imgs, tr_K, val_K)\n\n    # Create the test split\n    te_c2w, test_HW, test_K, test_cam_idxs, test_pos = \\\n        gen_rotational_trajs(args, cfg, metadata, tr_c2w, train_HW, tr_K, tr_cam_idx, train_pos, \n                       rotate_angle=data_cfg.test_rotate_angle)\n    # te_c2w, test_HW, test_K, test_cam_idxs = \\\n    #     gen_straight_trajs(metadata, tr_c2w, train_HW, tr_K, tr_cam_idx, train_pos, \n    #                    rotate_angle=data_cfg.test_rotate_angle)\n    # dummy test paths\n    # te_c2w, test_K, test_HW, test_cam_idxs = val_c2w, val_K, val_HW, val_cam_idx\n    # TODO: consider removing the so-called test split.\n    for _ in te_c2w:\n        i_split[2].append(loop_id)\n        loop_id += 1\n    for c2w in te_c2w:\n        poses.append(np.array(c2w).reshape(4,4))\n    \n    # Bundle all the data\n    all_K = np.array(tr_K + val_K + test_K)\n    HW = np.array(train_HW + val_HW + test_HW)\n    poses = np.stack(poses, 0)\n    if load_img:\n        imgs = np.stack(imgs)\n        \n    # note test_cam_idxs can be inaccurate because it may be varied!\n    cam_idxs += test_cam_idxs\n    render_poses = te_c2w\n    return imgs, poses, render_poses, HW, all_K, cam_idxs, i_split\n\n\ndef inward_nearfar_heuristic(cam_o, ratio=0.05):\n    dist = np.linalg.norm(cam_o[:,None] - cam_o, axis=-1)\n    far = dist.max()  # could be too small to exist the scene bbox\n                      # it is only used to determined scene bbox\n                      # 
lib/dvgo use 1e9 as far\n    near = far * ratio\n    return near, far\n\n\ndef load_waymo_data(args, cfg):\n    data_cfg = cfg.data\n    K, depths = None, None\n    near_clip = None\n    images, poses, render_poses, HW, K, cam_idxs, i_split = load_waymo(args, cfg)\n    print(f\"Loaded waymo dataset.\")\n    i_train, i_val, i_test = i_split\n    near_clip, far = inward_nearfar_heuristic(poses[i_train, :3, 3], ratio=0.02)  # not used too much in fact\n    \n    # load near and far parameters\n    if \"near_clip\" in data_cfg:\n        near_clip = data_cfg['near_clip']\n    if 'near' in data_cfg:\n        near = data_cfg['near']\n    if 'far' in data_cfg:\n        far = data_cfg['far']\n    Ks = np.array(K)\n    irregular_shape = False\n    data_dict = dict(\n        HW=HW, Ks=Ks, near=near, far=far, near_clip=near_clip,\n        i_train=i_train, i_val=i_val, i_test=i_test,\n        poses=poses, render_poses=render_poses, images=images, depths=depths, cam_idxs=cam_idxs, irregular_shape=irregular_shape\n    )\n    data_dict['poses'] = torch.tensor(data_dict['poses']).float()\n    data_dict['images'] = torch.tensor(data_dict['images']).float()\n    return data_dict\n"
  },
  {
    "path": "FourierGrid/masked_adam.py",
    "content": "import os\nimport torch\nimport adam_upd_cuda  # requires pre-build adam\n## online installation as follows\n# from torch.utils.cpp_extension import load\n\n# parent_dir = os.path.dirname(os.path.abspath(__file__))\n# sources = ['cuda/adam_upd.cpp', 'cuda/adam_upd_kernel.cu']\n# print(\"Loading masked adam.\")\n# adam_upd_cuda = load(\n#         name='adam_upd_cuda',\n#         sources=[os.path.join(parent_dir, path) for path in sources],\n#         verbose=False)\n# print(\"CUDA support for masked adam loaded.\")\n\n\n''' Extend Adam optimizer\n1. support per-voxel learning rate\n2. masked update (ignore zero grad) which speeduping training\n'''\nclass MaskedAdam(torch.optim.Optimizer):\n\n    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8):\n        if not 0.0 <= lr:\n            raise ValueError(\"Invalid learning rate: {}\".format(lr))\n        if not 0.0 <= eps:\n            raise ValueError(\"Invalid epsilon value: {}\".format(eps))\n        if not 0.0 <= betas[0] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 0: {}\".format(betas[0]))\n        if not 0.0 <= betas[1] < 1.0:\n            raise ValueError(\"Invalid beta parameter at index 1: {}\".format(betas[1]))\n        defaults = dict(lr=lr, betas=betas, eps=eps)\n        self.per_lr = None\n        super(MaskedAdam, self).__init__(params, defaults)\n\n    def __setstate__(self, state):\n        super(MaskedAdam, self).__setstate__(state)\n\n    def set_pervoxel_lr(self, count):\n        assert self.param_groups[0]['params'][0].shape == count.shape\n        self.per_lr = count.float() / count.max()\n\n    @torch.no_grad()\n    def step(self):\n        for group in self.param_groups:\n            lr = group['lr']\n            beta1, beta2 = group['betas']\n            eps = group['eps']\n            skip_zero_grad = group['skip_zero_grad']\n\n            for param in group['params']:\n                if param.grad is not None:\n                    
state = self.state[param]\n                    # Lazy state initialization\n                    if len(state) == 0:\n                        state['step'] = 0\n                        # Exponential moving average of gradient values\n                        state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)\n                        # Exponential moving average of squared gradient values\n                        state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)\n\n                    state['step'] += 1\n\n                    if self.per_lr is not None and param.shape == self.per_lr.shape:\n                        adam_upd_cuda.adam_upd_with_perlr(\n                                param, param.grad, state['exp_avg'], state['exp_avg_sq'], self.per_lr,\n                                state['step'], beta1, beta2, lr, eps)\n                    elif skip_zero_grad:\n                        adam_upd_cuda.masked_adam_upd(\n                                param, param.grad, state['exp_avg'], state['exp_avg_sq'],\n                                state['step'], beta1, beta2, lr, eps)\n                    else:\n                        adam_upd_cuda.adam_upd(\n                                param, param.grad, state['exp_avg'], state['exp_avg_sq'],\n                                state['step'], beta1, beta2, lr, eps)\n\nif __name__=='__main__':\n    a = torch.nn.Linear(3, 4)\n    optim = MaskedAdam(a.parameters())\n    print(\"Testing masked adam optimizer finished!\")\n"
  },
  {
    "path": "FourierGrid/pose_utils/__init__.py",
    "content": ""
  },
  {
    "path": "FourierGrid/pose_utils/image_operators.py",
    "content": "import cv2\nimport pdb\nimport numpy as np\nfrom scipy import stats\n\n\ndef get_bbox_from_img(image, color_thre=1e-2):\n    # assuming the background is white, crop the center obj out.\n    final_image = image.max() - image.max(-1)\n    final_image[final_image < color_thre] = 0.0\n    mask_image = final_image >= color_thre\n    contours = cv2.findNonZero(final_image)\n    contours = contours.squeeze()\n    xmin, xmax = np.min(contours[:, 0]), np.max(contours[:, 0])\n    ymin, ymax = np.min(contours[:, 1]), np.max(contours[:, 1])\n    return mask_image, xmin, xmax, ymin, ymax\n    \n\ndef change_background_from_black_to_white(image, color_thresh=1e-2):\n    assert image.max() > 2, \"the input image must be in (0-255) scale.\"\n    image[image < color_thresh] = 255\n    return image\n\n\ndef get_bbox_from_mask(label_img):\n    contours = cv2.findNonZero(label_img)\n    contours = contours.squeeze()\n    xmin, xmax = np.min(contours[:, 0]), np.max(contours[:, 0])\n    ymin, ymax = np.min(contours[:, 1]), np.max(contours[:, 1])\n    return xmin, xmax, ymin, ymax\n    \n\ndef apply_mask_on_img(one_img, label_img):\n    assert one_img.max() > 2, \"the input image must be in (0-255) scale.\"\n    one_img[..., 0] = one_img[..., 0] * label_img + 255 * (1 - label_img)\n    one_img[..., 1] = one_img[..., 1] * label_img + 255 * (1 - label_img)\n    one_img[..., 2] = one_img[..., 2] * label_img + 255 * (1 - label_img)\n    return one_img\n\n\ndef image_normalization_for_pose(image):\n    assert image.max() > 2, \"the input image must be in (0-255) scale.\"\n    # image[:, : , 0] = (image[:, : , 0] - image[:, : , 0].mean()) / 255.0\n    # image[:, : , 1] = (image[:, : , 1] - image[:, : , 1].mean()) / 255.0\n    # image[:, : , 2] = (image[:, : , 2] - image[:, : , 2].mean()) / 255.0\n    image = image / image.max()\n    return image\n"
  },
  {
    "path": "FourierGrid/pose_utils/linemod_constants.py",
    "content": "import numpy as np\n\n\ndiameters = {\n    'cat': 15.2633,\n    'ape': 9.74298,\n    'benchvise': 28.6908,\n    'bowl': 17.1185,\n    'cam': 17.1593,\n    'camera': 17.1593,\n    'can': 19.3416,\n    'cup': 12.5961,\n    'driller': 25.9425,\n    'duck': 10.7131,\n    'eggbox': 17.6364,\n    'glue': 16.4857,\n    'holepuncher': 14.8204,\n    'iron': 30.3153,\n    'lamp': 28.5155,\n    'phone': 20.8394\n}\n\nlinemod_cls_names = ['ape', 'cam', 'cat', 'duck', 'glue', 'iron', 'phone', 'benchvise', 'can', 'driller', 'eggbox', 'holepuncher', 'lamp']\n\nlinemod_K = np.array([[572.4114, 0., 325.2611],\n                  [0., 573.57043, 242.04899],\n                  [0., 0., 1.]])"
  },
  {
    "path": "FourierGrid/pose_utils/linemod_evaluator.py",
    "content": "# this code is from rnnpose\nimport pdb\nimport numpy as np\nfrom FourierGrid.pose_utils.linemod_constants import *\nfrom FourierGrid.pose_utils.pose_operators import *\nfrom scipy.spatial.transform import Rotation as R\n\n    \ndef rotation_angle_euler(R1, R2):\n    if len(R2.shape)==2:\n        # http://www.boris-belousov.net/2016/12/01/quat-dist/#:~:text=The%20difference%20rotation%20matrix%20that,matrix%20R%20%3D%20P%20Q%20%E2%88%97%20.\n        rotation_difference = R1 @ np.linalg.inv(R2)\n        theta = np.arccos((np.trace(rotation_difference) - 1) / 2)\n        theta = np.rad2deg(theta)\n        euler = R.from_matrix(rotation_difference).as_euler('zyx', degrees=True)\n        norm_angle = np.linalg.norm(euler)\n        return norm_angle\n    else:  # batch mode\n        rotation_difference = R1 @ np.linalg.inv(R2)\n        batch_norm_angles = [rot_diff_to_norm_angle(rot_d) for rot_d in rotation_difference]\n        batch_norm_angles = np.array(batch_norm_angles)\n        sorted_norm_angles = np.sort(batch_norm_angles)\n        return sorted_norm_angles[0]\n\n\ndef project(xyz, K, RT):\n    \"\"\"\n    xyz: [N, 3]\n    K: [3, 3]\n    RT: [3, 4]\n    \"\"\"\n    xyz = np.dot(xyz, RT[:, :3].T) + RT[:, 3:].T\n    xyz = np.dot(xyz, K.T)\n    xy = xyz[:, :2] / xyz[:, 2:]\n    return xy\n\n\nclass LineMODEvaluator:\n    def __init__(self, class_name, obj_m, icp_refine=False):\n        self.class_name = class_name\n        self.icp_refine = icp_refine\n        # model_path = os.path.join(os.path.dirname(os.path.abspath(\n        #     __file__)), '../../linemod/LM6d_converted/models', class_name, class_name + '.ply')\n        # self.model = pvnet_data_utils.get_ply_model(model_path)\n        self.model = obj_m\n        self.diameter = diameters[class_name] / 100\n\n        self.proj2d = []\n        self.add = []\n        self.adds = [] #force sym\n        self.add2 = []\n        self.add5 = []\n        self.cmd5 = []\n\n        self.icp_proj2d = 
[]\n        self.icp_add = []\n        self.icp_cmd5 = []\n\n        self.mask_ap = []\n        # self.pose_preds=[]\n\n        self.height = 480\n        self.width = 640\n\n        # model = inout.load_ply(model_path)\n        model = obj_m\n        # model['pts'] = model['pts'] * 1000\n        self.icp_refiner = icp_utils.ICPRefiner(\n            model, (self.width, self.height)) if icp_refine else None\n\n    def projection_2d(self, pose_pred, pose_targets, K, icp=False, threshold=5):\n        model_2d_pred = project(self.model, K, pose_pred)\n        model_2d_targets = project(self.model, K, pose_targets)\n        proj_mean_diff = np.mean(np.linalg.norm(\n            model_2d_pred - model_2d_targets, axis=-1))\n        if icp:\n            self.icp_proj2d.append(proj_mean_diff < threshold)\n        else:\n            self.proj2d.append(proj_mean_diff < threshold)\n\n    def projection_2d_sym(self, pose_pred, pose_targets, K, threshold=5):\n        model_2d_pred = project(self.model, K, pose_pred)\n        model_2d_targets = project(self.model, K, pose_targets)\n        proj_mean_diff=np.mean(find_nearest_point_distance(model_2d_pred,model_2d_targets))\n\n        self.proj_mean_diffs.append(proj_mean_diff)\n        self.projection_2d_recorder.append(proj_mean_diff < threshold)\n\n    def add2_metric(self, pose_pred, pose_targets, icp=False, syn=False, percentage=0.02):\n        diameter = self.diameter * percentage\n        model_pred = np.dot(self.model, pose_pred[:, :3].T) + pose_pred[:, 3]\n        model_targets = np.dot(\n            self.model, pose_targets[:, :3].T) + pose_targets[:, 3]\n\n        if syn:\n            from thirdparty.nn import nn_utils  # TODO: solve this reference\n            idxs = nn_utils.find_nearest_point_idx(model_pred, model_targets)\n            # idxs = find_nearest_point_idx(model_pred, model_targets)\n            mean_dist = np.mean(np.linalg.norm(\n                model_pred[idxs] - model_targets, 2, 1))\n        else:\n     
       mean_dist = np.mean(np.linalg.norm(\n                model_pred - model_targets, axis=-1))\n\n        if icp:\n            self.icp_add.append(mean_dist < diameter)\n        else:\n            self.add2.append(mean_dist < diameter)\n\n    def add5_metric(self, pose_pred, pose_targets, icp=False, syn=False, percentage=0.05):\n        diameter = self.diameter * percentage\n        model_pred = np.dot(self.model, pose_pred[:, :3].T) + pose_pred[:, 3]\n        model_targets = np.dot(\n            self.model, pose_targets[:, :3].T) + pose_targets[:, 3]\n\n        if syn:\n            idxs = nn_utils.find_nearest_point_idx(model_pred, model_targets)\n            # idxs = find_nearest_point_idx(model_pred, model_targets)\n            mean_dist = np.mean(np.linalg.norm(\n                model_pred[idxs] - model_targets, 2, 1))\n        else:\n            mean_dist = np.mean(np.linalg.norm(\n                model_pred - model_targets, axis=-1))\n\n        if icp:\n            self.icp_add.append(mean_dist < diameter)\n        else:\n            self.add5.append(mean_dist < diameter)\n\n    def add_metric(self, pose_pred, pose_targets, icp=False, syn=False, percentage=0.1):\n        diameter = self.diameter * percentage\n        def cal_one_add(pose_pred, pose_targets):\n            model_pred = np.dot(self.model, pose_pred[:, :3].T) + pose_pred[:, 3]\n            model_targets = np.dot(\n                self.model, pose_targets[:, :3].T) + pose_targets[:, 3]\n\n            if syn:\n                idxs = nn_utils.find_nearest_point_idx(model_pred, model_targets)\n                # idxs = find_nearest_point_idx(model_pred, model_targets)\n                mean_dist = np.mean(np.linalg.norm(\n                    model_pred[idxs] - model_targets, 2, 1))\n            else:\n                mean_dist = np.mean(np.linalg.norm(\n                    model_pred - model_targets, axis=-1))\n            return mean_dist\n        \n        if len(pose_pred.shape) == 2:\n           
 mean_dist = cal_one_add(pose_pred, pose_targets)\n        else:\n            all_dists = []\n            for idx in range(len(pose_pred)):\n                one_dist = cal_one_add(pose_pred[idx], pose_targets[idx])\n                all_dists.append(one_dist)\n            sorted_dists = np.sort(all_dists)\n            mean_dist = sorted_dists[0]\n        if icp:\n            self.icp_add.append(mean_dist < diameter)\n        else:\n            self.add.append(mean_dist < diameter)\n        return mean_dist, mean_dist < diameter\n\n    def cm_degree_5_metric(self, pose_pred, pose_targets, icp=False):\n        translation_distance = np.linalg.norm(\n            pose_pred[:, 3] - pose_targets[:, 3]) * 100\n        rotation_diff = np.dot(pose_pred[:, :3], pose_targets[:, :3].T)\n        trace = np.trace(rotation_diff)\n        trace = trace if trace <= 3 else 3\n        angular_distance = np.rad2deg(np.arccos((trace - 1.) / 2.))\n        if icp:\n            self.icp_cmd5.append(translation_distance <\n                                 5 and angular_distance < 5)\n        else:\n            self.cmd5.append(translation_distance < 5 and angular_distance < 5)\n\n    def mask_iou(self, output, batch):\n        mask_pred = torch.argmax(output['seg'], dim=1)[\n            0].detach().cpu().numpy()\n        mask_gt = batch['mask'][0].detach().cpu().numpy()\n        iou = (mask_pred & mask_gt).sum() / (mask_pred | mask_gt).sum()\n        self.mask_ap.append(iou > 0.7)\n\n    def icp_refine(self, pose_pred, anno, output, K):\n        depth = read_depth(anno['depth_path'])\n        mask = torch.argmax(output['seg'], dim=1)[0].detach().cpu().numpy()\n        if pose_pred[2, 3] <= 0:\n            return pose_pred\n        depth[mask != 1] = 0\n        pose_pred_tmp = pose_pred.copy()\n        pose_pred_tmp[:3, 3] = pose_pred_tmp[:3, 3] * 1000\n\n        R_refined, t_refined = self.icp_refiner.refine(\n            depth, pose_pred_tmp[:3, :3], pose_pred_tmp[:3, 3], K.copy(), 
depth_only=True, max_mean_dist_factor=5.0)\n        R_refined, _ = self.icp_refiner.refine(\n            depth, R_refined, t_refined, K.copy(), no_depth=True)\n\n        pose_pred = np.hstack((R_refined, t_refined.reshape((3, 1)) / 1000))\n\n        return pose_pred\n\n    def icp_refine_(self, pose, anno, output):\n        depth = read_depth(anno['depth_path']).astype(np.uint16)\n        mask = torch.argmax(output['seg'], dim=1)[0].detach().cpu().numpy()\n        mask = mask.astype(np.int32)\n        pose = pose.astype(np.float32)\n\n        poses = np.zeros([1, 7], dtype=np.float32)\n        poses[0, :4] = mat2quat(pose[:, :3])\n        poses[0, 4:] = pose[:, 3]\n\n        poses_new = np.zeros([1, 7], dtype=np.float32)\n        poses_icp = np.zeros([1, 7], dtype=np.float32)\n\n        fx = 572.41140\n        fy = 573.57043\n        px = 325.26110\n        py = 242.04899\n        zfar = 6.0\n        znear = 0.25\n        factor = 1000.0\n        error_threshold = 0.01\n\n        rois = np.zeros([1, 6], dtype=np.float32)\n        rois[:, :] = 1\n\n        self.icp_refiner.solveICP(mask, depth,\n                                  self.height, self.width,\n                                  fx, fy, px, py,\n                                  znear, zfar,\n                                  factor,\n                                  rois.shape[0], rois,\n                                  poses, poses_new, poses_icp,\n                                  error_threshold\n                                  )\n\n        pose_icp = np.zeros([3, 4], dtype=np.float32)\n        pose_icp[:, :3] = quat2mat(poses_icp[0, :4])\n        pose_icp[:, 3] = poses_icp[0, 4:]\n\n        return pose_icp\n\n    def summarize(self):\n        proj2d = np.mean(self.proj2d)\n        add = np.mean(self.add)\n        # adds = np.mean(self.adds)\n        add2 = np.mean(self.add2)\n        add5 = np.mean(self.add5)\n        cmd5 = np.mean(self.cmd5)\n        ap = np.mean(self.mask_ap)\n        
seq_len=len(self.add)\n        print('2d projections metric: {}'.format(proj2d * 100))\n        print('ADD metric: {}'.format(add * 100))\n        print('ADD2 metric: {}'.format(add2 * 100))\n        print('ADD5 metric: {}'.format(add5 * 100))\n        # print('ADDS metric: {}'.format(adds))\n        print('5 cm 5 degree metric: {}'.format(cmd5 * 100))\n        # print('mask ap70: {}'.format(ap))\n        print('seq_len: {}'.format(seq_len))\n        # if cfg.test.icp:\n        if self.icp_refine:\n            print('2d projections metric after icp: {}'.format(\n                np.mean(self.icp_proj2d)))\n            print('ADD metric after icp: {}'.format(np.mean(self.icp_add)))\n            print('5 cm 5 degree metric after icp: {}'.format(\n                np.mean(self.icp_cmd5)))\n        self.proj2d = []\n        self.add = []\n        self.add2 = []\n        self.add5 = []\n        # self.adds = []\n        self.cmd5 = []\n        self.mask_ap = []\n        self.icp_proj2d = []    \n        self.icp_add = []\n        self.icp_cmd5 = []\n\n        # #save pose predictions\n        # if len(self.pose_preds)> 0:\n        #     np.save(f\"{self.class_name}_pose_preds.npy\",self.pose_preds)\n        # self.pose_preds=[]\n\n        return {'proj2d': proj2d, 'add': add, 'add2': add2, 'add5': add5,'cmd5': cmd5, 'ap': ap, \"seq_len\": seq_len}\n        \n    def evaluate_proposals(self, pose_gt, pose_proposals, cam_k):\n        # copy the gt to batch form\n        proposal_num = len(pose_proposals)\n        pose_gt = np.array([pose_gt for i in range(proposal_num)])\n        ang_err_euler = rotation_angle_euler(pose_gt[:, :3, :3], pose_proposals[:, :3, :3])\n        trans_diff = [np.linalg.norm(pose_proposals[idx][:3, -1:] - pose_gt[idx][:3, -1:]) for idx in range(proposal_num)]\n        trans_diff = np.sort(trans_diff)\n        trans_err = np.min(trans_diff)\n        if self.class_name in ['eggbox', 'glue']:\n            add_value, add_final = 
self.add_metric(pose_proposals, pose_gt, syn=True)\n        else:\n            add_value, add_final = self.add_metric(pose_proposals, pose_gt)\n\n        return {\n            \"ang_err_euler\": ang_err_euler,\n            \"trans_err\": trans_err,\n            \"add_value\": add_value,\n            \"add_final\": add_final\n        }\n    \n    def evaluate_linemod(self, pose_gt, pose_pred, cam_k): # sample_correspondence_pairs=False, direct_align=False, use_cnnpose=True):\n        ang_err_chordal = rotation_angle_chordal(pose_gt[:3, :3], pose_pred[:3, :3])\n        ang_err_euler = rotation_angle_euler(pose_gt[:3, :3], pose_pred[:3, :3])\n        trans_err = np.linalg.norm(pose_pred[:3, -1:] - pose_gt[:3, -1:])  # 3x1\n        if self.class_name in ['eggbox', 'glue']:\n            add_value, add_final = self.add_metric(pose_pred, pose_gt, syn=True)\n            self.add2_metric(pose_pred, pose_gt, syn=True)\n            self.add5_metric(pose_pred, pose_gt, syn=True)\n        else:\n            add_value, add_final = self.add_metric(pose_pred, pose_gt)\n            self.add2_metric(pose_pred, pose_gt)\n            self.add5_metric(pose_pred, pose_gt)\n\n        self.projection_2d(pose_pred, pose_gt, K=cam_k)\n        self.cm_degree_5_metric(pose_pred, pose_gt)\n\n        # vis\n        # pc_proj_vis = vis_pointclouds_cv2((pose_gt[:3, :3]@model_points.cpu().numpy(\n        # ).T+pose_gt[:3, -1:]).T, example[\"K\"].cpu().numpy().squeeze(), [480,640])\n        # pc_proj_vis_pred = vis_pointclouds_cv2((pose_pred[:3, :3]@model_points.cpu().numpy(\n        # ).T+pose_pred[:3, -1:]).T, example[\"K\"].cpu().numpy().squeeze(), [ 480, 640])\n\n        return {\n            \"ang_err_chordal\": ang_err_chordal,\n            \"ang_err_euler\": ang_err_euler,\n            \"trans_err\": trans_err,\n            \"pnp_inliers\": -1,#len(inliers),\n            \"add_value\": add_value,\n            \"add_final\": add_final\n            # \"pc_proj_vis\": pc_proj_vis,\n            
# \"pc_proj_vis_pred\": pc_proj_vis_pred,\n            # \"keypoints_2d_vis\": np.zeros_like(pc_proj_vis_pred) #keypoints_2d_vis\n        }\n"
  },
  {
    "path": "FourierGrid/pose_utils/model_operations.py",
    "content": "import numpy as np\nimport pdb\n\n\ndef get_bb8_of_model(obj_m):\n    xmin, ymin, zmin = obj_m[:, 0].min(), obj_m[:, 1].min(), obj_m[:, 2].min()\n    xmax, ymax, zmax = obj_m[:, 0].max(), obj_m[:, 1].max(), obj_m[:, 2].max()\n    bb8 = np.array([\n        [xmin, ymin, zmin],\n        [xmin, ymax, zmin],\n        [xmax, ymax, zmin],\n        [xmax, ymin, zmin],\n        [xmin, ymin, zmax],\n        [xmin, ymax, zmax],\n        [xmax, ymax, zmax],\n        [xmax, ymin, zmax],\n    ])\n    return bb8\n"
  },
  {
    "path": "FourierGrid/pose_utils/pose_operators.py",
    "content": "import pdb\nimport numpy as np\nfrom scipy.spatial.transform import Rotation as R\n\n\ndef chordal_distance(R1,R2):\n    return np.sqrt(np.sum((R1-R2)*(R1-R2))) \n\n\ndef rotation_angle_chordal(R1, R2):\n    return 2*np.arcsin(chordal_distance(R1,R2)/np.sqrt(8))\n\n\ndef cal_pose_rot_diff(pose1, pose2):\n    ang_err_chordal = rotation_angle_chordal(pose1[:3, :3], pose2[:3, :3])\n    return ang_err_chordal\n\n\ndef rot_diff_to_norm_angle(rotation_difference):\n    theta = np.arccos((np.trace(rotation_difference) - 1) / 2)\n    theta = np.rad2deg(theta)\n    euler = R.from_matrix(rotation_difference).as_euler('zyx', degrees=True)\n    norm_angle = np.linalg.norm(euler)\n    return norm_angle\n\n\ndef cal_one_add(model_points, pose_pred, pose_targets, syn=False):\n    model_pred = np.dot(model_points, pose_pred[:, :3].T) + pose_pred[:, 3]\n    model_targets = np.dot(model_points, pose_targets[:, :3].T) + pose_targets[:, 3]\n    if syn:\n        from thirdparty.nn import nn_utils  # TODO: solve this reference\n        idxs = nn_utils.find_nearest_point_idx(model_pred, model_targets)\n        # idxs = find_nearest_point_idx(model_pred, model_targets)\n        mean_dist = np.mean(np.linalg.norm(\n            model_pred[idxs] - model_targets, 2, 1))\n    else:\n        mean_dist = np.mean(np.linalg.norm(\n            model_pred - model_targets, axis=-1))\n    return mean_dist\n\n\ndef pose_rot_interpolation(pose_a, pose_b, inter_num=100):\n    '''\n    interpolate poses ASSUMING pose = [R, Rt] (in the canonical coordinate system). 
\n    '''\n    pose_a, pose_b = pose_a.cpu().numpy(), pose_b.cpu().numpy()\n    pose_a_rot, pose_b_rot = pose_a[:3, :3], pose_b[:3, :3]\n    pose_a_trans, pose_b_trans = pose_a[:3, -1], pose_b[:3, -1]\n    inv_pose_a_rot, inv_pose_b_rot = np.linalg.inv(pose_a_rot), np.linalg.inv(pose_b_rot)\n    # inv_pose_a_rot@pose_a_trans is equal to inv_pose_b_rot@pose_b_trans, be [0, 0, 400] by default\n    ori_t = (inv_pose_a_rot@pose_a_trans).astype(np.int)\n    # generate rotations\n    rotation_a, rotation_b = R.from_matrix(pose_a_rot), R.from_matrix(pose_b_rot)\n    euler_order = 'xyz'\n    rotation_a_euler, rotation_b_euler = rotation_a.as_euler(euler_order), rotation_b.as_euler(euler_order)\n    all_rotations_euler = [rotation_a_euler + i / inter_num * (rotation_b_euler - rotation_a_euler) for i in range(inter_num)]\n    all_rotations = [R.from_euler(euler_order, rot_euler) for rot_euler in all_rotations_euler]\n    all_rotations = [rot.as_matrix() for rot in all_rotations]\n    # form pose matrixs\n    poses = [pose_a.copy() for i in range(inter_num)]\n    for i in range(inter_num):\n        poses[i][:3, :3] = all_rotations[i]\n        poses[i][:3, -1] = poses[i][:3, :3] @ ori_t\n    return poses\n\n\ndef pose_to_blender(pose):\n    rot = pose[:3, :3]\n    quat = R.from_matrix(rot).as_quat()\n    trans = pose[:3, -1]\n    inv_rot = np.linalg.inv(rot)\n    cam_loc = - inv_rot @ trans \n    return quat, cam_loc"
  },
  {
    "path": "FourierGrid/pose_utils/projection.py",
    "content": "import torch\nimport numpy as np\nfrom FourierGrid.pose_utils.visualization import *\n\n\ndef get_projected_points(cam_pose, cam_k, obj_m, one_img=None, save_root=None, pre_str=\"\", post_str=\"\"):\n    point_num = obj_m.shape[0]\n    homo_points_3d = np.concatenate([obj_m, np.ones((point_num, 1))], axis=-1)\n    batch_cam_pose = torch.tensor(cam_pose).unsqueeze(0).repeat(point_num, 1, 1)\n    batch_cam_k = torch.tensor(cam_k).unsqueeze(0).repeat(point_num, 1, 1)\n    homo_points_2d = torch.bmm(batch_cam_pose, torch.tensor(homo_points_3d).unsqueeze(-1))\n    homo_points_2d = torch.bmm(batch_cam_k, homo_points_2d)\n    points_2d = homo_points_2d.squeeze()\n    points_2d = points_2d[:, :2] / points_2d[:, -1].unsqueeze(-1).repeat(1, 2)\n    points_2d = points_2d.cpu().numpy()\n    if one_img is not None:  # for visualization:\n        visualize_2d_points(points_2d=points_2d, bg_image=one_img, save_root=save_root, pre_str=pre_str, post_str=post_str)\n    return points_2d\n"
  },
  {
    "path": "FourierGrid/pose_utils/visualization.py",
    "content": "import os\nimport cv2\nimport pdb\nimport torch\nimport imageio\nimport numpy as np\n\n\ndef visualize_2d_points(points_2d, bg_image, save_root=None, pre_str=\"\", post_str=\"\"):\n    \"\"\"\n    points_2d: [N, 2] denotes the points in the 2D space\n    bg_image: background image for visualization\n    post_str: adding some description\n    \"\"\"\n    vis_img = np.zeros(bg_image.shape).astype(np.uint8)\n    points_2d = points_2d.astype(np.int)\n    vis_img[points_2d[:, -1], points_2d[:, 0], :] = 255\n    if save_root is None:\n        imageio.imwrite(f'{pre_str}ori{post_str}.png', bg_image.astype(np.uint8))\n        imageio.imwrite(f'{pre_str}projected{post_str}.png', vis_img.astype(np.uint8))\n        imageio.imwrite(f'{pre_str}composed{post_str}.png', np.maximum(bg_image, vis_img).astype(np.uint8))\n    else:\n        imageio.imwrite(os.path.join(save_root, f'{pre_str}ori{post_str}.png'), bg_image.astype(np.uint8))\n        imageio.imwrite(os.path.join(save_root, f'{pre_str}projected{post_str}.png'), vis_img.astype(np.uint8))\n        imageio.imwrite(os.path.join(save_root, f'{pre_str}composed{post_str}.png'), np.maximum(bg_image, vis_img).astype(np.uint8))\n    return\n\n\ndef get_projected_points(cam_pose, cam_k, obj_m, one_img=None, save_root=None, pre_str=\"\", post_str=\"\"):\n    from FourierGrid.pose_utils.projection import get_projected_points\n    return get_projected_points(cam_pose, cam_k, obj_m, one_img, save_root, pre_str, post_str)\n\n\ndef draw_bbox_8_2D(draw_img, bbox_8_2D, color = (0, 255, 0), thickness = 2):\n    \"\"\" Draws the 2D projection of a 3D model's cuboid on an image with a given color.\n    # Arguments\n        draw_img     : The image to draw on.\n        bbox_8_2D    : A [8 or 9, 2] matrix containing the 8 corner points (x, y) and maybe also the centerpoint.\n        color     : The color of the boxes.\n        thickness : The thickness of the lines to draw boxes with.\n    \"\"\"\n    #convert bbox to int and 
tuple\n    bbox = np.copy(bbox_8_2D).astype(np.int32)\n    bbox = tuple(map(tuple, bbox))\n    #lower level\n    cv2.line(draw_img, bbox[0], bbox[1], color, thickness)\n    cv2.line(draw_img, bbox[1], bbox[2], color, thickness)\n    cv2.line(draw_img, bbox[2], bbox[3], color, thickness)\n    cv2.line(draw_img, bbox[0], bbox[3], color, thickness)\n    #upper level\n    cv2.line(draw_img, bbox[4], bbox[5], color, thickness)\n    cv2.line(draw_img, bbox[5], bbox[6], color, thickness)\n    cv2.line(draw_img, bbox[6], bbox[7], color, thickness)\n    cv2.line(draw_img, bbox[4], bbox[7], color, thickness)\n    #sides\n    cv2.line(draw_img, bbox[0], bbox[4], color, thickness)\n    cv2.line(draw_img, bbox[1], bbox[5], color, thickness)\n    cv2.line(draw_img, bbox[2], bbox[6], color, thickness)\n    cv2.line(draw_img, bbox[3], bbox[7], color, thickness)\n    \n    #check if centerpoint is also available to draw\n    if len(bbox) == 9:\n        #draw centerpoint\n        cv2.circle(draw_img, bbox[8], 3, color, -1)\n    \n\ndef visualize_pose_prediction(pose_a, pose_b, cam_k, obj_bb8, bg_img, save_root=None, a_color=(170, 214, 85), b_color=(66, 51, 122), pre_str='', post_str=''):\n    # get projected points only so save_root=None, post_str=\"\"\n    new_bg_img = bg_img.copy()\n    bb8_2d_a = get_projected_points(pose_a, cam_k, obj_bb8, one_img=None, save_root=None, pre_str=\"\", post_str=\"\")\n    bb8_2d_b = get_projected_points(pose_b, cam_k, obj_bb8, one_img=None, save_root=None, pre_str=\"\", post_str=\"\")\n    draw_bbox_8_2D(new_bg_img, bb8_2d_a, color=a_color, thickness=2)\n    draw_bbox_8_2D(new_bg_img, bb8_2d_b, color=b_color, thickness=2)\n    save_path = f'{pre_str}compare_pose{post_str}.png'\n    if save_root is not None:\n        save_path = os.path.join(save_root, save_path)\n    imageio.imwrite(save_path, new_bg_img.astype(np.uint8))\n"
  },
  {
    "path": "FourierGrid/pycolmap/LICENSE.txt",
    "content": "MIT License\n\nCopyright (c) 2018 True Price, UNC Chapel Hill\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "FourierGrid/pycolmap/README.md",
    "content": "# pycolmap\nPython interface for COLMAP reconstructions, plus some convenient scripts for loading/modifying/converting reconstructions.\n\nThis code does not, however, run reconstruction -- it only provides a convenient interface for handling COLMAP's output.\n"
  },
  {
    "path": "FourierGrid/pycolmap/__init__.py",
    "content": ""
  },
  {
    "path": "FourierGrid/pycolmap/pycolmap/__init__.py",
    "content": "from pycolmap.pycolmap.camera import Camera\nfrom pycolmap.pycolmap.database import COLMAPDatabase\nfrom pycolmap.pycolmap.image import Image\nfrom pycolmap.pycolmap.scene_manager import SceneManager\nfrom pycolmap.pycolmap.rotation import Quaternion, DualQuaternion\n"
  },
  {
    "path": "FourierGrid/pycolmap/pycolmap/camera.py",
    "content": "# Author: True Price <jtprice at cs.unc.edu>\n\nimport numpy as np\n\nfrom scipy.optimize import root\n\n\n#-------------------------------------------------------------------------------\n#\n# camera distortion functions for arrays of size (..., 2)\n#\n#-------------------------------------------------------------------------------\n\ndef simple_radial_distortion(camera, x):\n    return x * (1. + camera.k1 * np.square(x).sum(axis=-1, keepdims=True))\n\ndef radial_distortion(camera, x):\n    r_sq = np.square(x).sum(axis=-1, keepdims=True)\n    return x * (1. + r_sq * (camera.k1 + camera.k2 * r_sq))\n\ndef opencv_distortion(camera, x):\n    x_sq = np.square(x)\n    xy = np.prod(x, axis=-1, keepdims=True)\n    r_sq = x_sq.sum(axis=-1, keepdims=True)\n\n    return x * (1. + r_sq * (camera.k1 + camera.k2 * r_sq)) + np.concatenate((\n        2. * camera.p1 * xy + camera.p2 * (r_sq + 2. * x_sq),\n        camera.p1 * (r_sq + 2. * y_sq) + 2. * camera.p2 * xy),\n        axis=-1)\n\n\n#-------------------------------------------------------------------------------\n#\n# Camera\n#\n#-------------------------------------------------------------------------------\n\nclass Camera:\n    @staticmethod\n    def GetNumParams(type_):\n        if type_ == 0 or type_ == 'SIMPLE_PINHOLE':\n            return 3\n        if type_ == 1 or type_ == 'PINHOLE':\n            return 4\n        if type_ == 2 or type_ == 'SIMPLE_RADIAL':\n            return 4\n        if type_ == 3 or type_ == 'RADIAL':\n            return 5\n        if type_ == 4 or type_ == 'OPENCV':\n            return 8\n        #if type_ == 5 or type_ == 'OPENCV_FISHEYE':\n        #    return 8\n        #if type_ == 6 or type_ == 'FULL_OPENCV':\n        #    return 12\n        #if type_ == 7 or type_ == 'FOV':\n        #    return 5\n        #if type_ == 8 or type_ == 'SIMPLE_RADIAL_FISHEYE':\n        #    return 4\n        #if type_ == 9 or type_ == 'RADIAL_FISHEYE':\n        #    return 5\n        #if type_ 
== 10 or type_ == 'THIN_PRISM_FISHEYE':\n        #    return 12\n\n        # TODO: not supporting other camera types, currently\n        raise Exception('Camera type not supported')\n\n\n    #---------------------------------------------------------------------------\n\n    @staticmethod\n    def GetNameFromType(type_):\n        if type_ == 0: return 'SIMPLE_PINHOLE'\n        if type_ == 1: return 'PINHOLE'\n        if type_ == 2: return 'SIMPLE_RADIAL'\n        if type_ == 3: return 'RADIAL'\n        if type_ == 4: return 'OPENCV'\n        #if type_ == 5: return 'OPENCV_FISHEYE'\n        #if type_ == 6: return 'FULL_OPENCV'\n        #if type_ == 7: return 'FOV'\n        #if type_ == 8: return 'SIMPLE_RADIAL_FISHEYE'\n        #if type_ == 9: return 'RADIAL_FISHEYE'\n        #if type_ == 10: return 'THIN_PRISM_FISHEYE'\n\n        raise Exception('Camera type not supported')\n\n\n    #---------------------------------------------------------------------------\n\n    def __init__(self, type_, width_, height_, params):\n        self.width = width_\n        self.height = height_\n\n        if type_ == 0 or type_ == 'SIMPLE_PINHOLE':\n            self.fx, self.cx, self.cy = params\n            self.fy = self.fx\n            self.distortion_func = None\n            self.camera_type = 0\n\n        elif type_ == 1 or type_ == 'PINHOLE':\n            self.fx, self.fy, self.cx, self.cy = params\n            self.distortion_func = None\n            self.camera_type = 1\n\n        elif type_ == 2 or type_ == 'SIMPLE_RADIAL':\n            self.fx, self.cx, self.cy, self.k1 = params\n            self.fy = self.fx\n            self.distortion_func = simple_radial_distortion\n            self.camera_type = 2\n\n        elif type_ == 3 or type_ == 'RADIAL':\n            self.fx, self.cx, self.cy, self.k1, self.k2 = params\n            self.fy = self.fx\n            self.distortion_func = radial_distortion\n            self.camera_type = 3\n\n        elif type_ == 4 or type_ == 
'OPENCV':\n            self.fx, self.fy, self.cx, self.cy = params[:4]\n            self.k1, self.k2, self.p1, self.p2 = params[4:]\n            self.distortion_func = opencv_distortion\n            self.camera_type = 4\n\n        else:\n            raise Exception('Camera type not supported')\n\n\n    #---------------------------------------------------------------------------\n\n    def __str__(self):\n        s = (self.GetNameFromType(self.camera_type) +\n             ' {} {} {}'.format(self.width, self.height, self.fx))\n\n        if self.camera_type in (1, 4): # PINHOLE, OPENCV\n            s += ' {}'.format(self.fy)\n\n        s += ' {} {}'.format(self.cx, self.cy)\n\n        if self.camera_type == 2: # SIMPLE_RADIAL\n            s += ' {}'.format(self.k1)\n\n        elif self.camera_type == 3: # RADIAL\n            s += ' {} {}'.format(self.k1, self.k2)\n\n        elif self.camera_type == 4: # OPENCV\n            s += ' {} {} {} {}'.format(self.k1, self.k2, self.p1, self.p2)\n\n        return s\n\n\n    #---------------------------------------------------------------------------\n\n    # return the camera parameters in the same order as the colmap output format\n    def get_params(self):\n        if self.camera_type == 0:\n            return np.array((self.fx, self.cx, self.cy))\n        if self.camera_type == 1:\n            return np.array((self.fx, self.fy, self.cx, self.cy))\n        if self.camera_type == 2:\n            return np.array((self.fx, self.cx, self.cy, self.k1))\n        if self.camera_type == 3:\n            return np.array((self.fx, self.cx, self.cy, self.k1, self.k2))\n        if self.camera_type == 4:\n            return np.array((self.fx, self.fy, self.cx, self.cy, self.k1,\n                             self.k2, self.p1, self.p2))\n\n\n    #---------------------------------------------------------------------------\n\n    def get_camera_matrix(self):\n        return np.array(\n            ((self.fx, 0, self.cx), (0, self.fy, self.cy), 
(0, 0, 1)))\n\n    def get_inverse_camera_matrix(self):\n        return np.array(\n            ((1. / self.fx, 0, -self.cx / self.fx),\n             (0, 1. / self.fy, -self.cy / self.fy),\n             (0, 0, 1)))\n\n    @property\n    def K(self):\n        return self.get_camera_matrix()\n\n    @property\n    def K_inv(self):\n        return self.get_inverse_camera_matrix()\n\n    #---------------------------------------------------------------------------\n\n    # return the inverse camera matrix\n    def get_inv_camera_matrix(self):\n        inv_fx, inv_fy = 1. / self.fx, 1. / self.fy\n        return np.array(((inv_fx, 0, -inv_fx * self.cx),\n                         (0, inv_fy, -inv_fy * self.cy),\n                         (0, 0, 1)))\n\n\n    #---------------------------------------------------------------------------\n\n    # return an (x, y) pixel coordinate grid for this camera\n    def get_image_grid(self):\n        xmin = (0.5 - self.cx) / self.fx\n        xmax = (self.width - 0.5 - self.cx) / self.fx\n        ymin = (0.5 - self.cy) / self.fy\n        ymax = (self.height - 0.5 - self.cy) / self.fy\n        return np.meshgrid(np.linspace(xmin, xmax, self.width),\n                           np.linspace(ymin, ymax, self.height))\n\n\n    #---------------------------------------------------------------------------\n\n    # x: array of shape (N,2) or (2,)\n    # normalized: False if the input points are in pixel coordinates\n    # denormalize: True if the points should be put back into pixel coordinates\n    def distort_points(self, x, normalized=True, denormalize=True):\n        x = np.atleast_2d(x)\n\n        # put the points into normalized camera coordinates\n        if not normalized:\n            x -= np.array([[self.cx, self.cy]])\n            x /= np.array([[self.fx, self.fy]])\n\n        # distort, if necessary\n        if self.distortion_func is not None:\n            x = self.distortion_func(self, x)\n\n        if denormalize:\n            x *= 
np.array([[self.fx, self.fy]])\n            x += np.array([[self.cx, self.cy]])\n\n        return x\n\n\n    #---------------------------------------------------------------------------\n\n    # x: array of shape (N1,N2,...,2), (N,2), or (2,)\n    # normalized: False if the input points are in pixel coordinates\n    # denormalize: True if the points should be put back into pixel coordinates\n    def undistort_points(self, x, normalized=False, denormalize=True):\n        x = np.atleast_2d(x)\n\n        # put the points into normalized camera coordinates\n        if not normalized:\n            x = x - np.array([self.cx, self.cy]) # creates a copy\n            x /= np.array([self.fx, self.fy])\n\n        # undistort, if necessary\n        if self.distortion_func is not None:\n            def objective(xu):\n                return (x - self.distortion_func(self, xu.reshape(*x.shape))\n                    ).ravel()\n\n            xu = root(objective, x).x.reshape(*x.shape)\n        else:\n            xu = x\n            \n        if denormalize:\n            xu *= np.array([[self.fx, self.fy]])\n            xu += np.array([[self.cx, self.cy]])\n\n        return xu\n"
  },
  {
    "path": "FourierGrid/pycolmap/pycolmap/database.py",
    "content": "import numpy as np\nimport os\nimport sqlite3\n\n\n#-------------------------------------------------------------------------------\n# convert SQLite BLOBs to/from numpy arrays\n\ndef array_to_blob(arr):\n    return np.getbuffer(arr)\n\ndef blob_to_array(blob, dtype, shape=(-1,)):\n    return np.frombuffer(blob, dtype).reshape(*shape)\n\n\n#-------------------------------------------------------------------------------\n# convert to/from image pair ids\n\nMAX_IMAGE_ID = 2**31 - 1\n\ndef get_pair_id(image_id1, image_id2):\n    if image_id1 > image_id2:\n        image_id1, image_id2 = image_id2, image_id1\n    return image_id1 * MAX_IMAGE_ID + image_id2\n\n\ndef get_image_ids_from_pair_id(pair_id):\n    image_id2 = pair_id % MAX_IMAGE_ID\n    return (pair_id - image_id2) / MAX_IMAGE_ID, image_id2\n\n\n#-------------------------------------------------------------------------------\n# create table commands\n\nCREATE_CAMERAS_TABLE = \"\"\"CREATE TABLE IF NOT EXISTS cameras (\n    camera_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,\n    model INTEGER NOT NULL,\n    width INTEGER NOT NULL,\n    height INTEGER NOT NULL,\n    params BLOB,\n    prior_focal_length INTEGER NOT NULL)\"\"\"\n\nCREATE_DESCRIPTORS_TABLE = \"\"\"CREATE TABLE IF NOT EXISTS descriptors (\n    image_id INTEGER PRIMARY KEY NOT NULL,\n    rows INTEGER NOT NULL,\n    cols INTEGER NOT NULL,\n    data BLOB,\n    FOREIGN KEY(image_id) REFERENCES images(image_id) ON DELETE CASCADE)\"\"\"\n\nCREATE_IMAGES_TABLE = \"\"\"CREATE TABLE IF NOT EXISTS images (\n    image_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,\n    name TEXT NOT NULL UNIQUE,\n    camera_id INTEGER NOT NULL,\n    prior_qw REAL,\n    prior_qx REAL,\n    prior_qy REAL,\n    prior_qz REAL,\n    prior_tx REAL,\n    prior_ty REAL,\n    prior_tz REAL,\n    CONSTRAINT image_id_check CHECK(image_id >= 0 and image_id < 2147483647),\n    FOREIGN KEY(camera_id) REFERENCES cameras(camera_id))\"\"\"\n\nCREATE_INLIER_MATCHES_TABLE = 
\"\"\"CREATE TABLE IF NOT EXISTS two_view_geometries (\n    pair_id INTEGER PRIMARY KEY NOT NULL,\n    rows INTEGER NOT NULL,\n    cols INTEGER NOT NULL,\n    data BLOB,\n    config INTEGER NOT NULL,\n    F BLOB,\n    E BLOB,\n    H BLOB)\"\"\"\n\nCREATE_KEYPOINTS_TABLE = \"\"\"CREATE TABLE IF NOT EXISTS keypoints (\n    image_id INTEGER PRIMARY KEY NOT NULL,\n    rows INTEGER NOT NULL,\n    cols INTEGER NOT NULL,\n    data BLOB,\n    FOREIGN KEY(image_id) REFERENCES images(image_id) ON DELETE CASCADE)\"\"\"\n\nCREATE_MATCHES_TABLE = \"\"\"CREATE TABLE IF NOT EXISTS matches (\n    pair_id INTEGER PRIMARY KEY NOT NULL,\n    rows INTEGER NOT NULL,\n    cols INTEGER NOT NULL,\n    data BLOB)\"\"\"\n\nCREATE_NAME_INDEX = \\\n    \"CREATE UNIQUE INDEX IF NOT EXISTS index_name ON images(name)\"\n\nCREATE_ALL = \"; \".join([CREATE_CAMERAS_TABLE, CREATE_DESCRIPTORS_TABLE,\n    CREATE_IMAGES_TABLE, CREATE_INLIER_MATCHES_TABLE, CREATE_KEYPOINTS_TABLE,\n    CREATE_MATCHES_TABLE, CREATE_NAME_INDEX])\n\n\n#-------------------------------------------------------------------------------\n# functional interface for adding objects\n\ndef add_camera(db, model, width, height, params, prior_focal_length=False,\n               camera_id=None):\n    # TODO: Parameter count checks\n    params = np.asarray(params, np.float64)\n    db.execute(\"INSERT INTO cameras VALUES (?, ?, ?, ?, ?, ?)\",\n        (camera_id, model, width, height, array_to_blob(params),\n         prior_focal_length))\n\n\ndef add_descriptors(db, image_id, descriptors):\n    descriptors = np.ascontiguousarray(descriptors, np.uint8)\n    db.execute(\"INSERT INTO descriptors VALUES (?, ?, ?, ?)\",\n        (image_id,) + descriptors.shape + (array_to_blob(descriptors),))\n\n\ndef add_image(db, name, camera_id, prior_q=np.zeros(4), prior_t=np.zeros(3),\n        image_id=None):\n    db.execute(\"INSERT INTO images VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\",\n        (image_id, name, camera_id, prior_q[0], prior_q[1], 
prior_q[2],\n         prior_q[3], prior_t[0], prior_t[1], prior_t[2]))\n\n\n# config: defaults to fundamental matrix\ndef add_inlier_matches(db, image_id1, image_id2, matches, config=2, F=None,\n                       E=None, H=None):\n    assert(len(matches.shape) == 2)\n    assert(matches.shape[1] == 2)\n\n    if image_id1 > image_id2:\n        matches = matches[:,::-1]\n\n    if F is not None:\n        F = np.asarray(F, np.float64)\n    if E is not None:\n        E = np.asarray(E, np.float64)\n    if H is not None:\n        H = np.asarray(H, np.float64)\n\n    pair_id = get_pair_id(image_id1, image_id2)\n    matches = np.asarray(matches, np.uint32)\n    db.execute(\"INSERT INTO inlier_matches VALUES (?, ?, ?, ?, ?, ?, ?, ?)\",\n        (pair_id,) + matches.shape + (array_to_blob(matches), config, F, E, H))\n\n\ndef add_keypoints(db, image_id, keypoints):\n    assert(len(keypoints.shape) == 2)\n    assert(keypoints.shape[1] in [2, 4, 6])\n\n    keypoints = np.asarray(keypoints, np.float32)\n    db.execute(\"INSERT INTO keypoints VALUES (?, ?, ?, ?)\",\n        (image_id,) + keypoints.shape + (array_to_blob(keypoints),))\n\n\n# config: defaults to fundamental matrix\ndef add_matches(db, image_id1, image_id2, matches):\n    assert(len(matches.shape) == 2)\n    assert(matches.shape[1] == 2)\n\n    if image_id1 > image_id2:\n        matches = matches[:,::-1]\n\n    pair_id = get_pair_id(image_id1, image_id2)\n    matches = np.asarray(matches, np.uint32)\n    db.execute(\"INSERT INTO matches VALUES (?, ?, ?, ?)\",\n        (pair_id,) + matches.shape + (array_to_blob(matches),))\n\n\n#-------------------------------------------------------------------------------\n# simple functional interface\n\nclass COLMAPDatabase(sqlite3.Connection):\n    @staticmethod\n    def connect(database_path):\n        return sqlite3.connect(database_path, factory=COLMAPDatabase)\n\n\n    def __init__(self, *args, **kwargs):\n        super(COLMAPDatabase, self).__init__(*args, **kwargs)\n\n 
       self.initialize_tables = lambda: self.executescript(CREATE_ALL)\n\n        self.initialize_cameras = \\\n            lambda: self.executescript(CREATE_CAMERAS_TABLE)\n        self.initialize_descriptors = \\\n            lambda: self.executescript(CREATE_DESCRIPTORS_TABLE)\n        self.initialize_images = \\\n            lambda: self.executescript(CREATE_IMAGES_TABLE)\n        self.initialize_inlier_matches = \\\n            lambda: self.executescript(CREATE_INLIER_MATCHES_TABLE)\n        self.initialize_keypoints = \\\n            lambda: self.executescript(CREATE_KEYPOINTS_TABLE)\n        self.initialize_matches = \\\n            lambda: self.executescript(CREATE_MATCHES_TABLE)\n\n        self.create_name_index = lambda: self.executescript(CREATE_NAME_INDEX)\n\n\n    add_camera = add_camera\n    add_descriptors = add_descriptors\n    add_image = add_image\n    add_inlier_matches = add_inlier_matches\n    add_keypoints = add_keypoints\n    add_matches = add_matches\n\n\n#-------------------------------------------------------------------------------\n\ndef main(args):\n    import os\n\n    if os.path.exists(args.database_path):\n        print(\"Error: database path already exists -- will not modify it.\")\n        exit()\n\n    db = COLMAPDatabase.connect(args.database_path)\n\n    #\n    # for convenience, try creating all the tables upfront\n    #\n\n    db.initialize_tables()\n\n\n    #\n    # create dummy cameras\n    #\n\n    model1, w1, h1, params1 = 0, 1024, 768, np.array((1024., 512., 384.))\n    model2, w2, h2, params2 = 2, 1024, 768, np.array((1024., 512., 384., 0.1))\n\n    db.add_camera(model1, w1, h1, params1)\n    db.add_camera(model2, w2, h2, params2)\n\n\n    #\n    # create dummy images\n    #\n\n    db.add_image(\"image1.png\", 0)\n    db.add_image(\"image2.png\", 0)\n    db.add_image(\"image3.png\", 2)\n    db.add_image(\"image4.png\", 2)\n\n\n    #\n    # create dummy keypoints; note that COLMAP supports 2D keypoints (x, y),\n    # 4D 
keypoints (x, y, theta, scale), and 6D affine keypoints\n    # (x, y, a_11, a_12, a_21, a_22)\n    #\n\n    N = 1000\n    kp1 = np.random.rand(N, 2) * (1024., 768.)\n    kp2 = np.random.rand(N, 2) * (1024., 768.)\n    kp3 = np.random.rand(N, 2) * (1024., 768.)\n    kp4 = np.random.rand(N, 2) * (1024., 768.)\n\n    db.add_keypoints(1, kp1)\n    db.add_keypoints(2, kp2)\n    db.add_keypoints(3, kp3)\n    db.add_keypoints(4, kp4)\n\n\n    #\n    # create dummy matches\n    #\n\n    M = 50\n    m12 = np.random.randint(N, size=(M, 2))\n    m23 = np.random.randint(N, size=(M, 2))\n    m34 = np.random.randint(N, size=(M, 2))\n\n    db.add_matches(1, 2, m12)\n    db.add_matches(2, 3, m23)\n    db.add_matches(3, 4, m34)\n\n\n    #\n    # check cameras\n    #\n\n    rows = db.execute(\"SELECT * FROM cameras\")\n\n    camera_id, model, width, height, params, prior = next(rows)\n    params = blob_to_array(params, np.float32)\n    assert model == model1 and width == w1 and height == h1\n    assert np.allclose(params, params1)\n\n    camera_id, model, width, height, params, prior = next(rows)\n    params = blob_to_array(params, np.float32)\n    assert model == model2 and width == w2 and height == h2\n    assert np.allclose(params, params2)\n\n\n    #\n    # check keypoints\n    #\n\n    kps = dict(\n        (image_id, blob_to_array(data, np.float32, (-1, 2)))\n        for image_id, data in db.execute(\n            \"SELECT image_id, data FROM keypoints\"))\n\n    assert np.allclose(kps[1], kp1)\n    assert np.allclose(kps[2], kp2)\n    assert np.allclose(kps[3], kp3)\n    assert np.allclose(kps[4], kp4)\n\n\n    #\n    # check matches\n    #\n\n    pair_ids = [get_pair_id(*pair) for pair in [(1, 2), (2, 3), (3, 4)]]\n\n    matches = dict(\n        (get_image_ids_from_pair_id(pair_id),\n            blob_to_array(data, np.uint32, (-1, 2)))\n        for pair_id, data in db.execute(\"SELECT pair_id, data FROM matches\"))\n\n    assert np.all(matches[(1, 2)] == m12)\n    assert 
np.all(matches[(2, 3)] == m23)\n    assert np.all(matches[(3, 4)] == m34)\n    \n    #\n    # clean up\n    #\n\n    db.close()\n    os.remove(args.database_path)\n\n#-------------------------------------------------------------------------------\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser(\n        formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n\n    parser.add_argument(\"--database_path\", type=str, default=\"database.db\")\n\n    args = parser.parse_args()\n\n    main(args)\n"
  },
  {
    "path": "FourierGrid/pycolmap/pycolmap/image.py",
    "content": "# Author: True Price <jtprice at cs.unc.edu>\n\nimport numpy as np\n\n#-------------------------------------------------------------------------------\n#\n# Image\n#\n#-------------------------------------------------------------------------------\n\nclass Image:\n    def __init__(self, name_, camera_id_, q_, tvec_):\n        self.name = name_\n        self.camera_id = camera_id_\n        self.q = q_\n        self.tvec = tvec_\n\n        self.points2D = np.empty((0, 2), dtype=np.float64)\n        self.point3D_ids = np.empty((0,), dtype=np.uint64)\n\n    #---------------------------------------------------------------------------\n\n    def R(self):\n        return self.q.ToR()\n\n    #---------------------------------------------------------------------------\n\n    def C(self):\n        return -self.R().T.dot(self.tvec)\n\n    #---------------------------------------------------------------------------\n\n    @property\n    def t(self):\n        return self.tvec\n"
  },
  {
    "path": "FourierGrid/pycolmap/pycolmap/rotation.py",
    "content": "# Author: True Price <jtprice at cs.unc.edu>\n\nimport numpy as np\n\n#-------------------------------------------------------------------------------\n#\n# Axis-Angle Functions\n#\n#-------------------------------------------------------------------------------\n\n# returns the cross product matrix representation of a 3-vector v\ndef cross_prod_matrix(v):\n    return np.array(((0., -v[2], v[1]), (v[2], 0., -v[0]), (-v[1], v[0], 0.)))\n\n#-------------------------------------------------------------------------------\n\n# www.euclideanspace.com/maths/geometry/rotations/conversions/angleToMatrix/\n# if angle is None, assume ||axis|| == angle, in radians\n# if angle is not None, assume that axis is a unit vector\ndef axis_angle_to_rotation_matrix(axis, angle=None):\n    if angle is None:\n        angle = np.linalg.norm(axis)\n        if np.abs(angle) > np.finfo('float').eps:\n            axis = axis / angle\n\n    cp_axis = cross_prod_matrix(axis)\n    return np.eye(3) + (\n        np.sin(angle) * cp_axis + (1. - np.cos(angle)) * cp_axis.dot(cp_axis))\n\n#-------------------------------------------------------------------------------\n\n# after some deliberation, I've decided the easiest way to do this is to use\n# quaternions as an intermediary\ndef rotation_matrix_to_axis_angle(R):\n    return Quaternion.FromR(R).ToAxisAngle()\n\n#-------------------------------------------------------------------------------\n#\n# Quaternion\n#\n#-------------------------------------------------------------------------------\n\nclass Quaternion:\n    # create a quaternion from an existing rotation matrix\n    # euclideanspace.com/maths/geometry/rotations/conversions/matrixToQuaternion/\n    @staticmethod\n    def FromR(R):\n        trace = np.trace(R)\n  \n        if trace > 0:\n            qw = 0.5 * np.sqrt(1. 
+ trace)\n            qx = (R[2,1] - R[1,2]) * 0.25 / qw\n            qy = (R[0,2] - R[2,0]) * 0.25 / qw\n            qz = (R[1,0] - R[0,1]) * 0.25 / qw\n        elif R[0,0] > R[1,1] and R[0,0] > R[2,2]:\n            s = 2. * np.sqrt(1. + R[0,0] - R[1,1] - R[2,2])\n            qw = (R[2,1] - R[1,2]) / s\n            qx = 0.25 * s\n            qy = (R[0,1] + R[1,0]) / s\n            qz = (R[0,2] + R[2,0]) / s\n        elif R[1,1] > R[2,2]:\n            s = 2. * np.sqrt(1. + R[1,1] - R[0,0] - R[2,2])\n            qw = (R[0,2] - R[2,0]) / s\n            qx = (R[0,1] + R[1,0]) / s\n            qy = 0.25 * s\n            qz = (R[1,2] + R[2,1]) / s\n        else:\n            s = 2. * np.sqrt(1. + R[2,2] - R[0,0] - R[1,1])\n            qw = (R[1,0] - R[0,1]) / s\n            qx = (R[0,2] + R[2,0]) / s\n            qy = (R[1,2] + R[2,1]) / s\n            qz = 0.25 * s\n  \n        return Quaternion(np.array((qw, qx, qy, qz)))\n  \n    # if angle is None, assume ||axis|| == angle, in radians\n    # if angle is not None, assume that axis is a unit vector\n    @staticmethod\n    def FromAxisAngle(axis, angle=None):\n        if angle is None:\n            angle = np.linalg.norm(axis)\n            if np.abs(angle) > np.finfo('float').eps:\n                axis = axis / angle\n\n        qw = np.cos(0.5 * angle)\n        axis = axis * np.sin(0.5 * angle)\n\n        return Quaternion(np.array((qw, axis[0], axis[1], axis[2])))\n\n    #---------------------------------------------------------------------------\n  \n    def __init__(self, q=np.array((1., 0., 0., 0.))):\n        if isinstance(q, Quaternion):\n            self.q = q.q.copy()\n        else:\n            q = np.asarray(q)\n            if q.size == 4:\n                self.q = q.copy()\n            elif q.size == 3: # convert from a 3-vector to a quaternion\n                self.q = np.empty(4)\n                self.q[0], self.q[1:] = 0., q.ravel()\n            else:\n                raise Exception('Input quaternion 
should be a 3- or 4-vector')\n  \n    def __add__(self, other):\n        return Quaternion(self.q + other.q)\n  \n    def __iadd__(self, other):\n        self.q += other.q\n        return self\n  \n    # conjugation via the ~ operator\n    def __invert__(self):\n        return Quaternion(\n            np.array((self.q[0], -self.q[1], -self.q[2], -self.q[3])))\n  \n    # returns: self.q * other.q if other is a Quaternion; otherwise performs\n    #          scalar multiplication\n    def __mul__(self, other):\n      if isinstance(other, Quaternion): # quaternion multiplication\n          return Quaternion(np.array((\n              self.q[0] * other.q[0] - self.q[1] * other.q[1] -\n                 self.q[2] * other.q[2] - self.q[3] * other.q[3],\n              self.q[0] * other.q[1] + self.q[1] * other.q[0] +\n                 self.q[2] * other.q[3] - self.q[3] * other.q[2],\n              self.q[0] * other.q[2] - self.q[1] * other.q[3] +\n                 self.q[2] * other.q[0] + self.q[3] * other.q[1],\n              self.q[0] * other.q[3] + self.q[1] * other.q[2] -\n                 self.q[2] * other.q[1] + self.q[3] * other.q[0])))\n      else: # scalar multiplication (assumed)\n          return Quaternion(other * self.q)\n  \n    def __rmul__(self, other):\n        return self * other\n  \n    def __imul__(self, other):\n        self.q[:] = (self * other).q\n        return self\n  \n    def __irmul__(self, other):\n        self.q[:] = (self * other).q\n        return self\n  \n    def __neg__(self):\n        return Quaternion(-self.q)\n  \n    def __sub__(self, other):\n        return Quaternion(self.q - other.q)\n  \n    def __isub__(self, other):\n        self.q -= other.q\n        return self\n  \n    def __str__(self):\n        return str(self.q)\n\n    def copy(self):\n        return Quaternion(self)\n  \n    def dot(self, other):\n        return self.q.dot(other.q)\n  \n    # assume the quaternion is nonzero!\n    def inverse(self):\n        return 
Quaternion((~self).q / self.q.dot(self.q))\n  \n    def norm(self):\n        return np.linalg.norm(self.q)\n  \n    def normalize(self):\n        self.q /= np.linalg.norm(self.q)\n        return self\n  \n    # assume x is a Nx3 numpy array or a numpy 3-vector\n    def rotate_points(self, x):\n        x = np.atleast_2d(x)\n        return x.dot(self.ToR().T)\n  \n    # convert to a rotation matrix\n    def ToR(self):\n        return np.eye(3) + 2 * np.array((\n          (-self.q[2] * self.q[2] - self.q[3] * self.q[3],\n            self.q[1] * self.q[2] - self.q[3] * self.q[0],\n            self.q[1] * self.q[3] + self.q[2] * self.q[0]),\n          ( self.q[1] * self.q[2] + self.q[3] * self.q[0],\n           -self.q[1] * self.q[1] - self.q[3] * self.q[3],\n            self.q[2] * self.q[3] - self.q[1] * self.q[0]),\n          ( self.q[1] * self.q[3] - self.q[2] * self.q[0],\n            self.q[2] * self.q[3] + self.q[1] * self.q[0],\n           -self.q[1] * self.q[1] - self.q[2] * self.q[2])))\n  \n    # convert to axis-angle representation, with angle encoded by the length\n    def ToAxisAngle(self):\n        # recall that for axis-angle representation (a, angle), with \"a\" unit:\n        #   q = (cos(angle/2), a * sin(angle/2))\n        # below, for readability, \"theta\" actually means half of the angle\n\n        sin_sq_theta = self.q[1:].dot(self.q[1:])\n    \n        # if theta is non-zero, then we can compute a unique rotation\n        if np.abs(sin_sq_theta) > np.finfo('float').eps:\n            sin_theta = np.sqrt(sin_sq_theta)\n            cos_theta = self.q[0]\n    \n            # atan2 is more stable, so we use it to compute theta\n            # note that we multiply by 2 to get the actual angle\n            angle = 2. * (\n                np.arctan2(-sin_theta, -cos_theta) if cos_theta < 0. 
else\n                np.arctan2(sin_theta, cos_theta))\n    \n            return self.q[1:] * (angle / sin_theta)\n\n        # otherwise, the result is singular, and we avoid dividing by\n        # sin(angle/2) = 0\n        return np.zeros(3)\n\n    # euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler\n    # this assumes the quaternion is non-zero\n    # returns yaw, pitch, roll, with application in that order\n    def ToEulerAngles(self):\n        qsq = self.q**2\n        k = 2. * (self.q[0] * self.q[3] + self.q[1] * self.q[2]) / qsq.sum()\n\n        if (1. - k) < np.finfo('float').eps: # north pole singularity\n            return  2. * np.arctan2(self.q[1], self.q[0]), 0.5 * np.pi, 0.\n        if (1. + k) < np.finfo('float').eps: # south pole singularity\n            return -2. * np.arctan2(self.q[1], self.q[0]), -0.5 * np.pi, 0.\n\n        yaw = np.arctan2(2. * (self.q[0] * self.q[2] - self.q[1] * self.q[3]),\n                         qsq[0] + qsq[1] - qsq[2] - qsq[3])\n        pitch = np.arcsin(k)\n        roll = np.arctan2(2. 
* (self.q[0] * self.q[1] - self.q[2] * self.q[3]),\n                          qsq[0] - qsq[1] + qsq[2] - qsq[3])\n\n        return yaw, pitch, roll\n  \n#-------------------------------------------------------------------------------\n#\n# DualQuaternion\n#\n#-------------------------------------------------------------------------------\n\nclass DualQuaternion:\n    # DualQuaternion from an existing rotation + translation\n    @staticmethod\n    def FromQT(q, t):\n        return DualQuaternion(qe=(0.5 * np.asarray(t))) * DualQuaternion(q)\n  \n    def __init__(self, q0=np.array((1., 0., 0., 0.)), qe=np.zeros(4)):\n        self.q0, self.qe = Quaternion(q0), Quaternion(qe)\n  \n    def __add__(self, other):\n        return DualQuaternion(self.q0 + other.q0, self.qe + other.qe)\n  \n    def __iadd__(self, other):\n        self.q0 += other.q0\n        self.qe += other.qe\n        return self\n  \n    # conguation via the ~ operator\n    def __invert__(self):\n        return DualQuaternion(~self.q0, ~self.qe)\n  \n    def __mul__(self, other):\n        if isinstance(other, DualQuaternion):\n            return DualQuaternion(\n                self.q0 * other.q0,\n                self.q0 * other.qe + self.qe * other.q0)\n        elif isinstance(other, complex): # multiplication by a dual number\n            return DualQuaternion(\n                self.q0 * other.real,\n                self.q0 * other.imag + self.qe * other.real)\n        else: # scalar multiplication (assumed)\n            return DualQuaternion(other * self.q0, other * self.qe)\n  \n    def __rmul__(self, other):\n        return self.__mul__(other)\n  \n    def __imul__(self, other):\n        tmp = self * other\n        self.q0, self.qe = tmp.q0, tmp.qe\n        return self\n  \n    def __neg__(self):\n        return DualQuaternion(-self.q0, -self.qe)\n  \n    def __sub__(self, other):\n        return DualQuaternion(self.q0 - other.q0, self.qe - other.qe)\n  \n    def __isub__(self, other):\n        
self.q0 -= other.q0\n        self.qe -= other.qe\n        return self\n  \n    # q^-1 = q* / ||q||^2\n    # assume that q0 is nonzero!\n    def inverse(self):\n        normsq = complex(q0.dot(q0), 2. * self.q0.q.dot(self.qe.q))\n        inv_len_real = 1. / normsq.real\n        return ~self * complex(\n            inv_len_real, -normsq.imag * inv_len_real * inv_len_real)\n  \n    # returns a complex representation of the real and imaginary parts of the norm\n    # assume that q0 is nonzero!\n    def norm(self):\n        q0_norm = self.q0.norm()\n        return complex(q0_norm, self.q0.dot(self.qe) / q0_norm)\n  \n    # assume that q0 is nonzero!\n    def normalize(self):\n        # current length is ||q0|| + eps * (<q0, qe> / ||q0||)\n        # writing this as a + eps * b, the inverse is\n        #   1/||q|| = 1/a - eps * b / a^2\n        norm = self.norm()\n        inv_len_real = 1. / norm.real\n        self *= complex(inv_len_real, -norm.imag * inv_len_real * inv_len_real)\n        return self\n  \n    # return the translation vector for this dual quaternion\n    def getT(self):\n        return 2 * (self.qe * ~self.q0).q[1:]\n  \n    def ToQT(self):\n        return self.q0, self.getT()\n"
  },
  {
    "path": "FourierGrid/pycolmap/pycolmap/scene_manager.py",
    "content": "# Author: True Price <jtprice at cs.unc.edu>\n\nimport array\nimport numpy as np\nimport os\nimport struct\n\nfrom collections import OrderedDict\nfrom itertools import combinations\n\nfrom pycolmap.pycolmap.camera import Camera\nfrom pycolmap.pycolmap.image import Image\nfrom pycolmap.pycolmap.rotation import Quaternion\n\n#-------------------------------------------------------------------------------\n#\n# SceneManager\n#\n#-------------------------------------------------------------------------------\n\nclass SceneManager:\n    INVALID_POINT3D = np.uint64(-1)\n\n    def __init__(self, colmap_results_folder, image_path=None):\n        self.folder = colmap_results_folder\n        if not self.folder.endswith('/'):\n            self.folder += '/'\n\n        self.image_path = None\n        self.load_colmap_project_file(image_path=image_path)\n\n        self.cameras = OrderedDict()\n        self.images = OrderedDict()\n        self.name_to_image_id = dict()\n\n        self.last_camera_id = 0\n        self.last_image_id = 0\n\n        # Nx3 array of point3D xyz's\n        self.points3D = np.zeros((0, 3))\n\n        # for each element in points3D, stores the id of the point\n        self.point3D_ids = np.empty(0)\n\n        # point3D_id => index in self.points3D\n        self.point3D_id_to_point3D_idx = dict()\n\n        # point3D_id => [(image_id, point2D idx in image)]\n        self.point3D_id_to_images = dict()\n\n        self.point3D_colors = np.zeros((0, 3), dtype=np.uint8)\n        self.point3D_errors = np.zeros(0)\n\n    #---------------------------------------------------------------------------\n\n    def load_colmap_project_file(self, project_file=None, image_path=None):\n        if project_file is None:\n            project_file = self.folder + 'project.ini'\n\n        self.image_path = image_path\n\n        if self.image_path is None:\n            try:\n                with open(project_file, 'r') as f:\n                    for line in 
iter(f.readline, ''):\n                        if line.startswith('image_path'):\n                            self.image_path = line[11:].strip()\n                            break\n            except:\n                pass\n\n        if self.image_path is None:\n            print('Warning: image_path not found for reconstruction')\n        elif not self.image_path.endswith('/'):\n            self.image_path += '/'\n\n    #---------------------------------------------------------------------------\n\n    def load(self):\n        self.load_cameras()\n        self.load_images()\n        self.load_points3D()\n\n    #---------------------------------------------------------------------------\n\n    def load_cameras(self, input_file=None):\n        if input_file is None:\n            input_file = self.folder + 'cameras.bin'\n            if os.path.exists(input_file):\n                self._load_cameras_bin(input_file)\n            else:\n                input_file = self.folder + 'cameras.txt'\n                if os.path.exists(input_file):\n                    self._load_cameras_txt(input_file)\n                else:\n                    raise IOError('no cameras file found')\n    \n    def _load_cameras_bin(self, input_file):\n        self.cameras = OrderedDict()\n\n        with open(input_file, 'rb') as f:\n            num_cameras = struct.unpack('L', f.read(8))[0]\n\n            for _ in range(num_cameras):\n                camera_id, camera_type, w, h = struct.unpack('IiLL', f.read(24))\n                num_params = Camera.GetNumParams(camera_type)\n                params = struct.unpack('d' * num_params, f.read(8 * num_params))\n                self.cameras[camera_id] = Camera(camera_type, w, h, params)\n                self.last_camera_id = max(self.last_camera_id, camera_id)\n\n    def _load_cameras_txt(self, input_file):\n        self.cameras = OrderedDict()\n\n        with open(input_file, 'r') as f:\n            for line in iter(lambda: f.readline().strip(), 
''):\n                if not line or line.startswith('#'):\n                    continue\n\n                data = line.split()\n                camera_id = int(data[0])\n                self.cameras[camera_id] = Camera(\n                    data[1], int(data[2]), int(data[3]), map(float, data[4:]))\n                self.last_camera_id = max(self.last_camera_id, camera_id)\n\n    #---------------------------------------------------------------------------\n\n    def load_images(self, input_file=None):\n        if input_file is None:\n            input_file = self.folder + 'images.bin'\n            if os.path.exists(input_file):\n                self._load_images_bin(input_file)\n            else:\n                input_file = self.folder + 'images.txt'\n                if os.path.exists(input_file):\n                    self._load_images_txt(input_file)\n                else:\n                    raise IOError('no images file found')\n\n    def _load_images_bin(self, input_file):\n        self.images = OrderedDict()\n\n        with open(input_file, 'rb') as f:\n            num_images = struct.unpack('L', f.read(8))[0]\n            image_struct = struct.Struct('<I 4d 3d I')\n            for _ in range(num_images):\n                data = image_struct.unpack(f.read(image_struct.size))\n                image_id = data[0]\n                q = Quaternion(np.array(data[1:5]))\n                t = np.array(data[5:8])\n                camera_id = data[8]\n                name = b''.join(c for c in iter(lambda: f.read(1), b'\\x00')).decode()\n\n                image = Image(name, camera_id, q, t)\n                num_points2D = struct.unpack('Q', f.read(8))[0]\n\n                # Optimized code below.\n                # Read all elements as double first, then convert to array, slice it\n                # into points2d and ids, and convert ids back to unsigned long longs\n                # ('Q'). 
This is significantly faster than using O(num_points2D) f.read\n                # calls, experiments show >7x improvements in 60 image model, 23s -> 3s.\n                points_array = array.array('d')\n                points_array.fromfile(f, 3 * num_points2D)\n                points_elements = np.array(points_array).reshape((num_points2D, 3))\n                image.points2D = points_elements[:, :2]\n\n                ids_array = array.array('Q')\n                ids_array.frombytes(points_elements[:, 2].tobytes())\n                image.point3D_ids = np.array(ids_array, dtype=np.uint64).reshape(\n                    (num_points2D,))\n\n                # automatically remove points without an associated 3D point\n                #mask = (image.point3D_ids != SceneManager.INVALID_POINT3D)\n                #image.points2D = image.points2D[mask]\n                #image.point3D_ids = image.point3D_ids[mask]\n\n                self.images[image_id] = image\n                self.name_to_image_id[image.name] = image_id\n\n                self.last_image_id = max(self.last_image_id, image_id)\n\n    def _load_images_txt(self, input_file):\n        self.images = OrderedDict()\n\n        with open(input_file, 'r') as f:\n            is_camera_description_line = False\n\n            for line in iter(lambda: f.readline().strip(), ''):\n                if not line or line.startswith('#'):\n                    continue\n\n                is_camera_description_line = not is_camera_description_line\n\n                data = line.split()\n\n                if is_camera_description_line:\n                    image_id = int(data[0])\n                    image = Image(data[-1], int(data[-2]),\n                                  Quaternion(np.array(map(float, data[1:5]))),\n                                  np.array(map(float, data[5:8])))\n                else:\n                    image.points2D = np.array(\n                        [map(float, data[::3]), map(float, data[1::3])]).T\n 
                   image.point3D_ids = np.array(map(np.uint64, data[2::3]))\n\n                    # automatically remove points without an associated 3D point\n                    #mask = (image.point3D_ids != SceneManager.INVALID_POINT3D)\n                    #image.points2D = image.points2D[mask]\n                    #image.point3D_ids = image.point3D_ids[mask]\n\n                    self.images[image_id] = image\n                    self.name_to_image_id[image.name] = image_id\n\n                    self.last_image_id = max(self.last_image_id, image_id)\n\n    #---------------------------------------------------------------------------\n\n    def load_points3D(self, input_file=None):\n        if input_file is None:\n            input_file = self.folder + 'points3D.bin'\n            if os.path.exists(input_file):\n                self._load_points3D_bin(input_file)\n            else:\n                input_file = self.folder + 'points3D.txt'\n                if os.path.exists(input_file):\n                    self._load_points3D_txt(input_file)\n                else:\n                    raise IOError('no points3D file found')\n\n    def _load_points3D_bin(self, input_file):\n        with open(input_file, 'rb') as f:\n            num_points3D = struct.unpack('L', f.read(8))[0]\n\n            self.points3D = np.empty((num_points3D, 3))\n            self.point3D_ids = np.empty(num_points3D, dtype=np.uint64)\n            self.point3D_colors = np.empty((num_points3D, 3), dtype=np.uint8)\n            self.point3D_id_to_point3D_idx = dict()\n            self.point3D_id_to_images = dict()\n            self.point3D_errors = np.empty(num_points3D)\n\n            data_struct = struct.Struct('<Q 3d 3B d Q')\n\n            for i in range(num_points3D):\n                data = data_struct.unpack(f.read(data_struct.size))\n                self.point3D_ids[i] = data[0]\n                self.points3D[i] = data[1:4]\n                self.point3D_colors[i] = data[4:7]\n           
     self.point3D_errors[i] = data[7]\n                track_len = data[8]\n\n                self.point3D_id_to_point3D_idx[self.point3D_ids[i]] = i\n\n                data = struct.unpack(f'{2*track_len}I', f.read(2 * track_len * 4))\n\n                self.point3D_id_to_images[self.point3D_ids[i]] = \\\n                    np.array(data, dtype=np.uint32).reshape(track_len, 2)\n\n    def _load_points3D_txt(self, input_file):\n        self.points3D = []\n        self.point3D_ids = []\n        self.point3D_colors = []\n        self.point3D_id_to_point3D_idx = dict()\n        self.point3D_id_to_images = dict()\n        self.point3D_errors = []\n\n        with open(input_file, 'r') as f:\n            for line in iter(lambda: f.readline().strip(), ''):\n                if not line or line.startswith('#'):\n                    continue\n\n                data = line.split()\n                point3D_id = np.uint64(data[0])\n\n                self.point3D_ids.append(point3D_id)\n                self.point3D_id_to_point3D_idx[point3D_id] = len(self.points3D)\n                self.points3D.append(map(np.float64, data[1:4]))\n                self.point3D_colors.append(map(np.uint8, data[4:7]))\n                self.point3D_errors.append(np.float64(data[7]))\n\n                # load (image id, point2D idx) pairs\n                self.point3D_id_to_images[point3D_id] = \\\n                    np.array(map(np.uint32, data[8:])).reshape(-1, 2)\n\n        self.points3D = np.array(self.points3D)\n        self.point3D_ids = np.array(self.point3D_ids)\n        self.point3D_colors = np.array(self.point3D_colors)\n        self.point3D_errors = np.array(self.point3D_errors)\n\n    #---------------------------------------------------------------------------\n\n    def save(self, output_folder, binary=True):\n        self.save_cameras(output_folder, binary=binary)\n        self.save_images(output_folder, binary=binary)\n        self.save_points3D(output_folder, binary=binary)\n\n    
#---------------------------------------------------------------------------\n\n    def save_cameras(self, output_folder, output_file=None, binary=True):\n        if not os.path.exists(output_folder):\n            os.makedirs(output_folder)\n\n        if output_file is None:\n            output_file = 'cameras.bin' if binary else 'cameras.txt'\n\n        output_file = os.path.join(output_folder, output_file)\n\n        if binary:\n            self._save_cameras_bin(output_file)\n        else:\n            self._save_cameras_txt(output_file)\n    \n    def _save_cameras_bin(self, output_file):\n        with open(output_file, 'wb') as fid:\n            fid.write(struct.pack('L', len(self.cameras)))\n\n            camera_struct = struct.Struct('IiLL')\n\n            for camera_id, camera in sorted(self.cameras.iteritems()):\n                fid.write(camera_struct.pack(\n                    camera_id, camera.camera_type, camera.width, camera.height))\n                # TODO (True): should move this into the Camera class\n                fid.write(camera.get_params().tobytes())\n\n    def _save_cameras_txt(self, output_file):\n        with open(output_file, 'w') as fid:\n            print>>fid, '# Camera list with one line of data per camera:'\n            print>>fid, '#   CAMERA_ID, MODEL, WIDTH, HEIGHT, PARAMS[]'\n            print>>fid, '# Number of cameras:', len(self.cameras)\n\n            for camera_id, camera in sorted(self.cameras.iteritems()):\n                print>>fid, camera_id, camera\n\n    #---------------------------------------------------------------------------\n\n    def save_images(self, output_folder, output_file=None, binary=True):\n        if not os.path.exists(output_folder):\n            os.makedirs(output_folder)\n\n        if output_file is None:\n            output_file = 'images.bin' if binary else 'images.txt'\n\n        output_file = os.path.join(output_folder, output_file)\n\n        if binary:\n            
self._save_images_bin(output_file)\n        else:\n            self._save_images_txt(output_file)\n\n    def _save_images_bin(self, output_file):\n        with open(output_file, 'wb') as fid:\n            fid.write(struct.pack('L', len(self.images)))\n\n            for image_id, image in self.images.iteritems():\n                fid.write(struct.pack('I', image_id))\n                fid.write(image.q.q.tobytes())\n                fid.write(image.tvec.tobytes())\n                fid.write(struct.pack('I', image.camera_id))\n                fid.write(image.name + '\\0')\n                fid.write(struct.pack('L', len(image.points2D)))\n                data = np.rec.fromarrays(\n                    (image.points2D[:,0], image.points2D[:,1], image.point3D_ids))\n                fid.write(data.tobytes())\n\n    def _save_images_txt(self, output_file):\n        with open(output_file, 'w') as fid:\n            print>>fid, '# Image list with two lines of data per image:'\n            print>>fid, '#   IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME'\n            print>>fid, '#   POINTS2D[] as (X, Y, POINT3D_ID)'\n            print>>fid, '# Number of images: {},'.format(len(self.images)),\n            print>>fid, 'mean observations per image: unknown'\n\n            for image_id, image in self.images.iteritems():\n                print>>fid, image_id,\n                print>>fid, ' '.join(str(qi) for qi in image.q.q),\n                print>>fid, ' '.join(str(ti) for ti in image.tvec),\n                print>>fid, image.camera_id, image.name\n\n                data = np.rec.fromarrays(\n                    (image.points2D[:,0], image.points2D[:,1],\n                     image.point3D_ids.astype(np.int64)))\n                if len(data) > 0:\n                    np.savetxt(fid, data, '%.2f %.2f %d', newline=' ')\n                    fid.seek(-1, os.SEEK_CUR)\n                fid.write('\\n')\n\n    
#---------------------------------------------------------------------------\n\n    def save_points3D(self, output_folder, output_file=None, binary=True):\n        if not os.path.exists(output_folder):\n            os.makedirs(output_folder)\n\n        if output_file is None:\n            output_file = 'points3D.bin' if binary else 'points3D.txt'\n\n        output_file = os.path.join(output_folder, output_file)\n\n        if binary:\n            self._save_points3D_bin(output_file)\n        else:\n            self._save_points3D_txt(output_file)\n\n    def _save_points3D_bin(self, output_file):\n        num_valid_points3D = sum(\n            1 for point3D_idx in self.point3D_id_to_point3D_idx.itervalues()\n            if point3D_idx != SceneManager.INVALID_POINT3D)\n\n        iter_point3D_id_to_point3D_idx = \\\n            self.point3D_id_to_point3D_idx.iteritems()\n\n        with open(output_file, 'wb') as fid:\n            fid.write(struct.pack('L', num_valid_points3D))\n\n            for point3D_id, point3D_idx in iter_point3D_id_to_point3D_idx:\n                if point3D_idx == SceneManager.INVALID_POINT3D:\n                    continue\n\n                fid.write(struct.pack('L', point3D_id))\n                fid.write(self.points3D[point3D_idx].tobytes())\n                fid.write(self.point3D_colors[point3D_idx].tobytes())\n                fid.write(self.point3D_errors[point3D_idx].tobytes())\n                fid.write(\n                    struct.pack('L', len(self.point3D_id_to_images[point3D_id])))\n                fid.write(self.point3D_id_to_images[point3D_id].tobytes())\n\n    def _save_points3D_txt(self, output_file):\n        num_valid_points3D = sum(\n            1 for point3D_idx in self.point3D_id_to_point3D_idx.itervalues()\n            if point3D_idx != SceneManager.INVALID_POINT3D)\n\n        array_to_string = lambda arr: ' '.join(str(x) for x in arr)\n\n        iter_point3D_id_to_point3D_idx = \\\n            
self.point3D_id_to_point3D_idx.iteritems()\n\n        with open(output_file, 'w') as fid:\n            print>>fid, '# 3D point list with one line of data per point:'\n            print>>fid, '#   POINT3D_ID, X, Y, Z, R, G, B, ERROR, TRACK[] as ',\n            print>>fid, '(IMAGE_ID, POINT2D_IDX)'\n            print>>fid, '# Number of points: {},'.format(num_valid_points3D),\n            print>>fid, 'mean track length: unknown'\n\n            for point3D_id, point3D_idx in iter_point3D_id_to_point3D_idx:\n                if point3D_idx == SceneManager.INVALID_POINT3D:\n                    continue\n\n                print>>fid, point3D_id,\n                print>>fid, array_to_string(self.points3D[point3D_idx]),\n                print>>fid, array_to_string(self.point3D_colors[point3D_idx]),\n                print>>fid, self.point3D_errors[point3D_idx],\n                print>>fid, array_to_string(\n                    self.point3D_id_to_images[point3D_id].flat)\n\n    #---------------------------------------------------------------------------\n\n    # return the image id associated with a given image file\n    def get_image_from_name(self, image_name):\n        image_id = self.name_to_image_id[image_name]\n        return image_id, self.images[image_id]\n\n    #---------------------------------------------------------------------------\n\n    def get_camera(self, camera_id):\n        return self.cameras[camera_id]\n\n    #---------------------------------------------------------------------------\n\n    def get_points3D(self, image_id, return_points2D=True, return_colors=False):\n        image = self.images[image_id]\n\n        mask = (image.point3D_ids != SceneManager.INVALID_POINT3D)\n\n        point3D_idxs = np.array([\n            self.point3D_id_to_point3D_idx[point3D_id]\n            for point3D_id in image.point3D_ids[mask]])\n        # detect filtered points\n        filter_mask = (point3D_idxs != SceneManager.INVALID_POINT3D)\n        point3D_idxs = 
point3D_idxs[filter_mask]\n        result = [self.points3D[point3D_idxs,:]]\n\n        if return_points2D:\n            mask[mask] &= filter_mask\n            result += [image.points2D[mask]]\n        if return_colors:\n            result += [self.point3D_colors[point3D_idxs,:]]\n\n        return result if len(result) > 1 else result[0]\n\n    #---------------------------------------------------------------------------\n\n    def point3D_valid(self, point3D_id):\n        return (self.point3D_id_to_point3D_idx[point3D_id] !=\n                SceneManager.INVALID_POINT3D)\n\n    #---------------------------------------------------------------------------\n\n    def get_filtered_points3D(self, return_colors=False):\n        point3D_idxs = [\n            idx for idx in self.point3D_id_to_point3D_idx.values()\n            if idx != SceneManager.INVALID_POINT3D]\n        result = [self.points3D[point3D_idxs,:]]\n        \n        if return_colors:\n            result += [self.point3D_colors[point3D_idxs,:]]\n\n        return result if len(result) > 1 else result[0]\n\n    #---------------------------------------------------------------------------\n\n    # return 3D points shared by two images\n    def get_shared_points3D(self, image_id1, image_id2):\n        point3D_ids = (\n                set(self.images[image_id1].point3D_ids) &\n                set(self.images[image_id2].point3D_ids))\n        point3D_ids.discard(SceneManager.INVALID_POINT3D)\n\n        point3D_idxs = np.array([self.point3D_id_to_point3D_idx[point3D_id]\n            for point3D_id in point3D_ids])\n\n        return self.points3D[point3D_idxs,:]\n\n    #---------------------------------------------------------------------------\n\n    # project *all* 3D points into image, return their projection coordinates,\n    # as well as their 3D positions\n    def get_viewed_points(self, image_id):\n        image = self.images[image_id]\n\n        # get unfiltered points\n        point3D_idxs = 
set(self.point3D_id_to_point3D_idx.itervalues())\n        point3D_idxs.discard(SceneManager.INVALID_POINT3D)\n        point3D_idxs = list(point3D_idxs)\n        points3D = self.points3D[point3D_idxs,:]\n\n        # orient points relative to camera\n        R = image.q.ToR()\n        points3D = points3D.dot(R.T) + image.tvec[np.newaxis,:]\n        points3D = points3D[points3D[:,2] > 0,:] # keep points with positive z\n\n        # put points into image coordinates\n        camera = self.cameras[image.camera_id]\n        points2D = points3D.dot(camera.get_camera_matrix().T)\n        points2D = points2D[:,:2] / points2D[:,2][:,np.newaxis]\n\n        # keep points that are within the image\n        mask = (\n            (points2D[:,0] >= 0) &\n            (points2D[:,1] >= 0) &\n            (points2D[:,0] < camera.width - 1) &\n            (points2D[:,1] < camera.height - 1))\n\n        return points2D[mask,:], points3D[mask,:]\n\n    #---------------------------------------------------------------------------\n\n    def add_camera(self, camera):\n        self.last_camera_id += 1\n        self.cameras[self.last_camera_id] = camera\n        return self.last_camera_id\n\n    #---------------------------------------------------------------------------\n\n    def add_image(self, image):\n        self.last_image_id += 1\n        self.images[self.last_image_id] = image\n        return self.last_image_id\n\n    #---------------------------------------------------------------------------\n\n    def delete_images(self, image_list):\n        # delete specified images\n        for image_id in image_list:\n            if image_id in self.images:\n                del self.images[image_id]\n\n        keep_set = set(self.images.iterkeys())\n\n        # delete references to specified images, and ignore any points that are\n        # invalidated\n        iter_point3D_id_to_point3D_idx = \\\n            self.point3D_id_to_point3D_idx.iteritems()\n\n        for point3D_id, point3D_idx in 
iter_point3D_id_to_point3D_idx:\n            if point3D_idx == SceneManager.INVALID_POINT3D:\n                continue\n\n            mask = np.array([\n                image_id in keep_set\n                for image_id in self.point3D_id_to_images[point3D_id][:,0]])\n            if np.any(mask):\n                self.point3D_id_to_images[point3D_id] = \\\n                    self.point3D_id_to_images[point3D_id][mask]\n            else:\n                self.point3D_id_to_point3D_idx[point3D_id] = \\\n                    SceneManager.INVALID_POINT3D\n\n    #---------------------------------------------------------------------------\n\n    # camera_list: set of cameras whose points we'd like to keep\n    # min/max triangulation angle: in degrees\n    def filter_points3D(self,\n            min_track_len=0, max_error=np.inf, min_tri_angle=0,\n            max_tri_angle=180, image_set=set()):\n\n        image_set = set(image_set)\n\n        check_triangulation_angles = (min_tri_angle > 0 or max_tri_angle < 180)\n        if check_triangulation_angles:\n            max_tri_prod = np.cos(np.radians(min_tri_angle))\n            min_tri_prod = np.cos(np.radians(max_tri_angle))\n\n        iter_point3D_id_to_point3D_idx = \\\n            self.point3D_id_to_point3D_idx.iteritems()\n\n        image_ids = []\n\n        for point3D_id, point3D_idx in iter_point3D_id_to_point3D_idx:\n            if point3D_idx == SceneManager.INVALID_POINT3D:\n                continue\n\n            if image_set or min_track_len > 0:\n                image_ids = set(self.point3D_id_to_images[point3D_id][:,0])\n            \n            # check if error and min track length are sufficient, or if none of\n            # the selected cameras see the point\n            if (len(image_ids) < min_track_len or\n                      self.point3D_errors[point3D_idx] > max_error or\n                      image_set and image_set.isdisjoint(image_ids)):\n                
self.point3D_id_to_point3D_idx[point3D_id] = \\\n                    SceneManager.INVALID_POINT3D\n\n            # find dot product between all camera viewing rays\n            elif check_triangulation_angles:\n                xyz = self.points3D[point3D_idx,:]\n                tvecs = np.array(\n                    [(self.images[image_id].tvec - xyz)\n                     for image_id in image_ids])\n                tvecs /= np.linalg.norm(tvecs, axis=-1)[:,np.newaxis]\n\n                cos_theta = np.array(\n                    [u.dot(v) for u,v in combinations(tvecs, 2)])\n\n                # min_prod = cos(maximum viewing angle), and vice versa\n                # if maximum viewing angle is too small or too large,\n                # don't add this point\n                if (np.min(cos_theta) > max_tri_prod or\n                    np.max(cos_theta) < min_tri_prod):\n                    self.point3D_id_to_point3D_idx[point3D_id] = \\\n                        SceneManager.INVALID_POINT3D\n\n        # apply the filters to the image point3D_ids\n        for image in self.images.itervalues():\n            mask = np.array([\n                self.point3D_id_to_point3D_idx.get(point3D_id, 0) \\\n                    == SceneManager.INVALID_POINT3D\n                for point3D_id in image.point3D_ids])\n            image.point3D_ids[mask] = SceneManager.INVALID_POINT3D\n\n    #---------------------------------------------------------------------------\n\n    # scene graph: {image_id: [image_id: #shared points]}\n    def build_scene_graph(self):\n        self.scene_graph = defaultdict(lambda: defaultdict(int))\n        point3D_iter = self.point3D_id_to_images.iteritems()\n\n        for i, (point3D_id, images) in enumerate(point3D_iter):\n            if not self.point3D_valid(point3D_id):\n                continue\n\n            for image_id1, image_id2 in combinations(images[:,0], 2):\n                self.scene_graph[image_id1][image_id2] += 1\n                
self.scene_graph[image_id2][image_id1] += 1\n"
  },
  {
    "path": "FourierGrid/pycolmap/tools/colmap_to_nvm.py",
    "content": "import itertools\nimport sys\nsys.path.append(\"..\")\nimport numpy as np\n\nfrom pycolmap import Quaternion, SceneManager\n\n\n#-------------------------------------------------------------------------------\n\ndef main(args):\n    scene_manager = SceneManager(args.input_folder)\n    scene_manager.load()\n\n    with open(args.output_file, \"w\") as fid:\n        fid.write(\"NVM_V3\\n \\n{:d}\\n\".format(len(scene_manager.images)))\n\n        image_fmt_str = \" {:.3f} \" + 7 * \"{:.7f} \"\n        for image_id, image in scene_manager.images.iteritems():\n            camera = scene_manager.cameras[image.camera_id]\n            f = 0.5 * (camera.fx + camera.fy)\n            fid.write(args.image_name_prefix + image.name)\n            fid.write(image_fmt_str.format(\n               *((f,) + tuple(image.q.q) + tuple(image.C()))))\n            if camera.distortion_func is None:\n                fid.write(\"0 0\\n\")\n            else:\n                fid.write(\"{:.7f} 0\\n\".format(-camera.k1))\n\n        image_id_to_idx = dict(\n            (image_id, i) for i, image_id in enumerate(scene_manager.images))\n\n        fid.write(\"{:d}\\n\".format(len(scene_manager.points3D)))\n        for i, point3D_id in enumerate(scene_manager.point3D_ids):\n            fid.write(\n                \"{:.7f} {:.7f} {:.7f} \".format(*scene_manager.points3D[i]))\n            fid.write(\n                \"{:d} {:d} {:d} \".format(*scene_manager.point3D_colors[i]))\n            keypoints = [\n                (image_id_to_idx[image_id], kp_idx) +\n                    tuple(scene_manager.images[image_id].points2D[kp_idx])\n                for image_id, kp_idx in\n                    scene_manager.point3D_id_to_images[point3D_id]]\n            fid.write(\"{:d}\".format(len(keypoints)))\n            fid.write(\n                (len(keypoints) * \" {:d} {:d} {:.3f} {:.3f}\" + \"\\n\").format(\n                    
*itertools.chain(*keypoints)))\n\n\n#-------------------------------------------------------------------------------\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser(\n        description=\"Save a COLMAP reconstruction in the NVM format \"\n            \"(http://ccwu.me/vsfm/doc.html#nvm).\",\n        formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n\n    parser.add_argument(\"input_folder\")\n    parser.add_argument(\"output_file\")\n\n    parser.add_argument(\"--image_name_prefix\", type=str, default=\"\",\n        help=\"prefix image names with this string (e.g., 'images/')\")\n\n    args = parser.parse_args()\n\n    main(args)\n"
  },
  {
    "path": "FourierGrid/pycolmap/tools/delete_images.py",
    "content": "import sys\nsys.path.append(\"..\")\n\nimport numpy as np\n\nfrom pycolmap import DualQuaternion, Image, SceneManager\n\n\n#-------------------------------------------------------------------------------\n\ndef main(args):\n    scene_manager = SceneManager(args.input_folder)\n    scene_manager.load()\n\n    image_ids = map(scene_manager.get_image_from_name,\n                    iter(lambda: sys.stdin.readline().strip(), \"\"))\n    scene_manager.delete_images(image_ids)\n\n    scene_manager.save(args.output_folder)\n\n\n#-------------------------------------------------------------------------------\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser(\n        description=\"Deletes images (filenames read from stdin) from a model.\",\n        formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n\n    parser.add_argument(\"input_folder\")\n    parser.add_argument(\"output_folder\")\n\n    args = parser.parse_args()\n\n    main(args)\n"
  },
  {
    "path": "FourierGrid/pycolmap/tools/impute_missing_cameras.py",
    "content": "import sys\nsys.path.append(\"..\")\n\nimport numpy as np\n\nfrom pycolmap import DualQuaternion, Image, SceneManager\n\n\n#-------------------------------------------------------------------------------\n\nimage_to_idx = lambda im: int(im.name[:im.name.rfind(\".\")])\n\n\n#-------------------------------------------------------------------------------\n\ndef interpolate_linear(images, camera_id, file_format):\n    if len(images) < 2:\n        raise ValueError(\"Need at least two images for linear interpolation!\")\n\n    prev_image = images[0]\n    prev_idx = image_to_idx(prev_image)\n    prev_dq = DualQuaternion.FromQT(prev_image.q, prev_image.t)\n    start = prev_idx\n\n    new_images = []\n\n    for image in images[1:]:\n        curr_idx = image_to_idx(image)\n        curr_dq = DualQuaternion.FromQT(image.q, image.t)\n        T = curr_idx - prev_idx\n        Tinv = 1. / T\n\n        # like quaternions, dq(x) = -dq(x), so we'll need to pick the one more\n        # appropriate for interpolation by taking -dq if the dot product of the\n        # two q-vectors is negative\n        if prev_dq.q0.dot(curr_dq.q0) < 0:\n            curr_dq = -curr_dq\n\n        for i in xrange(1, T):\n            t = i * Tinv\n            dq = (1. 
- t) * prev_dq + t * curr_dq\n            q, t = dq.ToQT()\n            new_images.append(\n                Image(file_format.format(prev_idx + i), args.camera_id, q, t))\n\n        prev_idx = curr_idx\n        prev_dq = curr_dq\n\n    return new_images\n\n\n#-------------------------------------------------------------------------------\n\ndef interpolate_hermite(images, camera_id, file_format):\n    if len(images) < 4:\n        raise ValueError(\n            \"Need at least four images for Hermite spline interpolation!\")\n\n    new_images = []\n\n    # linear blending for the first frames\n    T0 = image_to_idx(images[0])\n    dq0 = DualQuaternion.FromQT(images[0].q, images[0].t)\n    T1 = image_to_idx(images[1])\n    dq1 = DualQuaternion.FromQT(images[1].q, images[1].t)\n\n    if dq0.q0.dot(dq1.q0) < 0:\n        dq1 = -dq1\n    dT = 1. / float(T1 - T0)\n    for j in xrange(1, T1 - T0):\n        t = j * dT\n        dq = ((1. - t) * dq0 + t * dq1).normalize()\n        new_images.append(\n            Image(file_format.format(T0 + j), camera_id, *dq.ToQT()))\n\n    T2 = image_to_idx(images[2])\n    dq2 = DualQuaternion.FromQT(images[2].q, images[2].t)\n    if dq1.q0.dot(dq2.q0) < 0:\n        dq2 = -dq2\n\n    # Hermite spline interpolation of dual quaternions\n    # pdfs.semanticscholar.org/05b1/8ede7f46c29c2722fed3376d277a1d286c55.pdf\n    for i in xrange(1, len(images) - 2):\n        T3 = image_to_idx(images[i + 2])\n        dq3 = DualQuaternion.FromQT(images[i + 2].q, images[i + 2].t)\n        if dq2.q0.dot(dq3.q0) < 0:\n            dq3 = -dq3\n\n        prev_duration = T1 - T0\n        current_duration = T2 - T1\n        next_duration = T3 - T2\n\n        # approximate the derivatives at dq1 and dq2 using weighted central\n        # differences\n        dt1 = 1. / float(T2 - T0)\n        dt2 = 1. 
/ float(T3 - T1)\n\n        m1 = (current_duration * dt1) * (dq2 - dq1) + \\\n             (prev_duration * dt1) * (dq1 - dq0) \n        m2 = (next_duration * dt2) * (dq3 - dq2) + \\\n             (current_duration * dt2) * (dq2 - dq1) \n\n        dT = 1. / float(current_duration)\n\n        for j in xrange(1, current_duration):\n            t = j * dT # 0 to 1\n            t2 = t * t # t squared\n            t3 = t2 * t # t cubed\n\n            # coefficients of the Hermite spline (a=>dq and b=>m)\n            a1 = 2. * t3 - 3. * t2 + 1.\n            b1 = t3 - 2. * t2 + t\n            a2 = -2. * t3 + 3. * t2\n            b2 = t3 - t2\n\n            dq = (a1 * dq1 + b1 * m1 + a2 * dq2 + b2 * m2).normalize()\n\n            new_images.append(\n                Image(file_format.format(T1 + j), camera_id, *dq.ToQT()))\n\n        T0, T1, T2 = T1, T2, T3\n        dq0, dq1, dq2 = dq1, dq2, dq3\n\n    # linear blending for the last frames\n    dT = 1. / float(T2 - T1)\n    for j in xrange(1, T2 - T1):\n        t = j * dT # 0 to 1\n        dq = ((1. 
- t) * dq1 + t * dq2).normalize()\n        new_images.append(\n            Image(file_format.format(T1 + j), camera_id, *dq.ToQT()))\n    \n    return new_images\n\n\n#-------------------------------------------------------------------------------\n\ndef main(args):\n    scene_manager = SceneManager(args.input_folder)\n    scene_manager.load()\n\n    images = sorted(scene_manager.images.itervalues(), key=image_to_idx)\n\n    if args.method.lower() == \"linear\":\n        new_images = interpolate_linear(images, args.camera_id, args.format)\n    else:\n        new_images = interpolate_hermite(images, args.camera_id, args.format)\n\n    map(scene_manager.add_image, new_images)\n\n    scene_manager.save(args.output_folder)\n\n\n#-------------------------------------------------------------------------------\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser(\n        description=\"Given a reconstruction with ordered images *with integer \"\n        \"filenames* like '000100.png', fill in missing camera positions for \"\n        \"intermediate frames.\",\n        formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n\n    parser.add_argument(\"input_folder\")\n    parser.add_argument(\"output_folder\")\n\n    parser.add_argument(\"--camera_id\", type=int, default=1,\n        help=\"camera id to use for the missing images\")\n\n    parser.add_argument(\"--format\", type=str, default=\"{:06d}.png\",\n        help=\"filename format to use for added images\")\n\n    parser.add_argument(\n        \"--method\", type=str.lower, choices=(\"linear\", \"hermite\"),\n        default=\"hermite\",\n        help=\"Pose imputation method\")\n\n    args = parser.parse_args()\n\n    main(args)\n"
  },
  {
    "path": "FourierGrid/pycolmap/tools/save_cameras_as_ply.py",
    "content": "import sys\nsys.path.append(\"..\")\n\nimport numpy as np\nimport os\n\nfrom pycolmap import SceneManager\n\n\n#-------------------------------------------------------------------------------\n\n# Saves the cameras as a mesh\n#\n# inputs:\n# - ply_file: output file\n# - images: ordered array of pycolmap Image objects\n# - color: color string for the camera\n# - scale: amount to shrink/grow the camera model\ndef save_camera_ply(ply_file, images, scale):\n    points3D = scale * np.array((\n        (0., 0., 0.),\n        (-1., -1., 1.),\n        (-1., 1., 1.),\n        (1., -1., 1.),\n        (1., 1., 1.)))\n\n    faces = np.array(((0, 2, 1),\n                      (0, 4, 2),\n                      (0, 3, 4),\n                      (0, 1, 3),\n                      (1, 2, 4),\n                      (1, 4, 3)))\n\n    r = np.linspace(0, 255, len(images), dtype=np.uint8)\n    g = 255 - r\n    b = r - np.linspace(0, 128, len(images), dtype=np.uint8)\n    color = np.column_stack((r, g, b))\n\n    with open(ply_file, \"w\") as fid:\n        print>>fid, \"ply\"\n        print>>fid, \"format ascii 1.0\"\n        print>>fid, \"element vertex\", len(points3D) * len(images)\n        print>>fid, \"property float x\"\n        print>>fid, \"property float y\"\n        print>>fid, \"property float z\"\n        print>>fid, \"property uchar red\"\n        print>>fid, \"property uchar green\"\n        print>>fid, \"property uchar blue\"\n        print>>fid, \"element face\", len(faces) * len(images)\n        print>>fid, \"property list uchar int vertex_index\"\n        print>>fid, \"end_header\"\n\n        for image, c in zip(images, color):\n            for p3D in (points3D.dot(image.R()) + image.C()):\n                print>>fid, p3D[0], p3D[1], p3D[2], c[0], c[1], c[2]\n\n        for i in xrange(len(images)):\n            for f in (faces + len(points3D) * i):\n                print>>fid, \"3 {} {} 
{}\".format(*f)\n\n\n#-------------------------------------------------------------------------------\n\ndef main(args):\n    scene_manager = SceneManager(args.input_folder)\n    scene_manager.load_images()\n\n    images = sorted(scene_manager.images.itervalues(),\n                    key=lambda image: image.name)\n\n    save_camera_ply(args.output_file, images, args.scale)\n\n\n#-------------------------------------------------------------------------------\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser(\n        description=\"Saves camera positions to a PLY for easy viewing outside \"\n        \"of COLMAP. Currently, camera FoV is not reflected in the output.\",\n        formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n\n    parser.add_argument(\"input_folder\")\n    parser.add_argument(\"output_file\")\n\n    parser.add_argument(\"--scale\", type=float, default=1.,\n        help=\"Scaling factor for the camera mesh.\")\n\n    args = parser.parse_args()\n\n    main(args)\n"
  },
  {
    "path": "FourierGrid/pycolmap/tools/transform_model.py",
    "content": "import sys\nsys.path.append(\"..\")\n\nimport numpy as np\n\nfrom pycolmap import Quaternion, SceneManager\n\n\n#-------------------------------------------------------------------------------\n\ndef main(args):\n    scene_manager = SceneManager(args.input_folder)\n    scene_manager.load()\n\n    # expect each line of input corresponds to one row\n    P = np.array([\n        map(float, sys.stdin.readline().strip().split()) for _ in xrange(3)])\n\n    scene_manager.points3D[:] = scene_manager.points3D.dot(P[:,:3].T) + P[:,3]\n\n    # get rotation without any global scaling (assuming isotropic scaling)\n    scale = np.cbrt(np.linalg.det(P[:,:3]))\n    q_old_from_new = ~Quaternion.FromR(P[:,:3] / scale)\n\n    for image in scene_manager.images.itervalues():\n        image.q *= q_old_from_new\n        image.tvec = scale * image.tvec - image.R().dot(P[:,3])\n\n    scene_manager.save(args.output_folder)\n\n\n#-------------------------------------------------------------------------------\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser(\n        description=\"Apply a 3x4 transformation matrix to a COLMAP model and \"\n        \"save the result as a new model. Row-major input can be piped in from \"\n        \"a file or entered via the command line.\",\n        formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n\n    parser.add_argument(\"input_folder\")\n    parser.add_argument(\"output_folder\")\n\n    args = parser.parse_args()\n\n    main(args)\n"
  },
  {
    "path": "FourierGrid/pycolmap/tools/write_camera_track_to_bundler.py",
    "content": "import sys\nsys.path.append(\"..\")\n\nimport numpy as np\n\nfrom pycolmap import SceneManager\n\n\n#-------------------------------------------------------------------------------\n\ndef main(args):\n    scene_manager = SceneManager(args.input_folder)\n    scene_manager.load_cameras()\n    scene_manager.load_images()\n\n    if args.sort:\n        images = sorted(\n            scene_manager.images.itervalues(), key=lambda im: im.name)\n    else:\n        images = scene_manager.images.values()\n\n    fid = open(args.output_file, \"w\")\n    fid_filenames = open(args.output_file + \".list.txt\", \"w\")\n\n    print>>fid, \"# Bundle file v0.3\"\n    print>>fid, len(images), 0\n\n    for image in images:\n        print>>fid_filenames, image.name\n        camera = scene_manager.cameras[image.camera_id]\n        print>>fid, 0.5 * (camera.fx + camera.fy), 0, 0\n        R, t = image.R(), image.t\n        print>>fid, R[0, 0], R[0, 1], R[0, 2]\n        print>>fid, -R[1, 0], -R[1, 1], -R[1, 2]\n        print>>fid, -R[2, 0], -R[2, 1], -R[2, 2]\n        print>>fid, t[0], -t[1], -t[2]\n\n    fid.close()\n    fid_filenames.close()\n\n\n#-------------------------------------------------------------------------------\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser(\n        description=\"Saves the camera positions in the Bundler format. Note \"\n        \"that 3D points are not saved.\",\n        formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n\n    parser.add_argument(\"input_folder\")\n    parser.add_argument(\"output_file\")\n\n    parser.add_argument(\"--sort\", default=False, action=\"store_true\",\n        help=\"sort the images by their filename\")\n\n    args = parser.parse_args()\n\n    main(args)\n"
  },
  {
    "path": "FourierGrid/pycolmap/tools/write_depthmap_to_ply.py",
    "content": "import sys\nsys.path.append(\"..\")\n\nimport imageio\nimport numpy as np\nimport os\n\nfrom plyfile import PlyData, PlyElement\nfrom pycolmap import SceneManager\nfrom scipy.ndimage.interpolation import zoom\n\n\n#-------------------------------------------------------------------------------\n\ndef main(args):\n    suffix = \".photometric.bin\" if args.photometric else \".geometric.bin\"\n\n    image_file = os.path.join(args.dense_folder, \"images\", args.image_filename)\n    depth_file = os.path.join(\n        args.dense_folder, args.stereo_folder, \"depth_maps\",\n        args.image_filename + suffix)\n    if args.save_normals:\n        normals_file = os.path.join(\n            args.dense_folder, args.stereo_folder, \"normal_maps\",\n            args.image_filename + suffix)\n\n    # load camera intrinsics from the COLMAP reconstruction\n    scene_manager = SceneManager(os.path.join(args.dense_folder, \"sparse\"))\n    scene_manager.load_cameras()\n    scene_manager.load_images()\n\n    image_id, image = scene_manager.get_image_from_name(args.image_filename)\n    camera = scene_manager.cameras[image.camera_id]\n    rotation_camera_from_world = image.R()\n    camera_center = image.C()\n\n    # load image, depth map, and normal map\n    image = imageio.imread(image_file)\n\n    with open(depth_file, \"rb\") as fid:\n        w = int(\"\".join(iter(lambda: fid.read(1), \"&\")))\n        h = int(\"\".join(iter(lambda: fid.read(1), \"&\")))\n        c = int(\"\".join(iter(lambda: fid.read(1), \"&\")))\n        depth_map = np.fromfile(fid, np.float32).reshape(h, w)\n        if (h, w) != image.shape[:2]:\n            depth_map = zoom(\n                depth_map,\n                (float(image.shape[0]) / h, float(image.shape[1]) / w),\n                order=0)\n\n    if args.save_normals:\n        with open(normals_file, \"rb\") as fid:\n            w = int(\"\".join(iter(lambda: fid.read(1), \"&\")))\n            h = int(\"\".join(iter(lambda: 
fid.read(1), \"&\")))\n            c = int(\"\".join(iter(lambda: fid.read(1), \"&\")))\n            normals = np.fromfile(\n                fid, np.float32).reshape(c, h, w).transpose([1, 2, 0])\n            if (h, w) != image.shape[:2]:\n                normals = zoom(\n                    normals,\n                    (float(image.shape[0]) / h, float(image.shape[1]) / w, 1.),\n                    order=0)\n\n    if args.min_depth is not None:\n        depth_map[depth_map < args.min_depth] = 0.\n    if args.max_depth is not None:\n        depth_map[depth_map > args.max_depth] = 0.\n\n    # create 3D points\n    #depth_map = np.minimum(depth_map, 100.)\n    points3D = np.dstack(camera.get_image_grid() + [depth_map])\n    points3D[:,:,:2] *= depth_map[:,:,np.newaxis]\n\n    # save\n    points3D = points3D.astype(np.float32).reshape(-1, 3)\n    if args.save_normals:\n        normals = normals.astype(np.float32).reshape(-1, 3)\n    image = image.reshape(-1, 3)\n    if image.dtype != np.uint8:\n        if image.max() <= 1:\n            image = (image * 255.).astype(np.uint8)\n        else:\n            image = image.astype(np.uint8)\n\n    if args.world_space:\n        points3D = points3D.dot(rotation_camera_from_world) + camera_center\n        if args.save_normals:\n            normals = normals.dot(rotation_camera_from_world)\n\n    if args.save_normals:\n        vertices = np.rec.fromarrays(\n            tuple(points3D.T) + tuple(normals.T) + tuple(image.T),\n            names=\"x,y,z,nx,ny,nz,red,green,blue\")\n    else:\n        vertices = np.rec.fromarrays(\n            tuple(points3D.T) + tuple(image.T), names=\"x,y,z,red,green,blue\")\n    vertices = PlyElement.describe(vertices, \"vertex\")\n    PlyData([vertices]).write(args.output_filename)\n\n\n#-------------------------------------------------------------------------------\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser(\n        
formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n\n    parser.add_argument(\"dense_folder\", type=str)\n    parser.add_argument(\"image_filename\", type=str)\n    parser.add_argument(\"output_filename\", type=str)\n\n    parser.add_argument(\n        \"--photometric\", default=False, action=\"store_true\",\n        help=\"use photometric depthmap instead of geometric\")\n\n    parser.add_argument(\n        \"--world_space\", default=False, action=\"store_true\",\n        help=\"apply the camera->world extrinsic transformation to the result\")\n\n    parser.add_argument(\n        \"--save_normals\", default=False, action=\"store_true\",\n        help=\"load the estimated normal map and save as part of the PLY\")\n\n    parser.add_argument(\n        \"--stereo_folder\", type=str, default=\"stereo\",\n        help=\"folder in the dense workspace containing depth and normal maps\")\n\n    parser.add_argument(\n        \"--min_depth\", type=float, default=None,\n        help=\"set pixels with depth less than this value to zero depth\")\n\n    parser.add_argument(\n        \"--max_depth\", type=float, default=None,\n        help=\"set pixels with depth greater than this value to zero depth\")\n\n    args = parser.parse_args()\n\n    main(args)\n"
  },
  {
    "path": "FourierGrid/run_colmap2standard.py",
    "content": "from os.path import join as pjoin\nfrom copy import deepcopy\nfrom glob import glob\nimport click\nimport pdb\nimport numpy as np\nimport camera_utils\nimport cv2 as cv\nfrom pycolmap.pycolmap.scene_manager import SceneManager\nfrom typing import Mapping, Optional, Sequence, Text, Tuple, Union\n\n\n# This implementation is from MipNeRF360\nclass NeRFSceneManager(SceneManager):\n    \"\"\"COLMAP pose loader.\n\n    Minor NeRF-specific extension to the third_party Python COLMAP loader:\n    google3/third_party/py/pycolmap/scene_manager.py\n    \"\"\"\n\n    def __init__(self, data_dir):\n        super(NeRFSceneManager, self).__init__(pjoin(data_dir, 'sparse', '0'))\n\n    def process(\n            self\n    ) -> Tuple[Sequence[Text], np.ndarray, np.ndarray, Optional[Mapping[\n        Text, float]], camera_utils.ProjectionType]:\n        \"\"\"Applies NeRF-specific postprocessing to the loaded pose data.\n\n        Returns:\n          a tuple [image_names, poses, pixtocam, distortion_params].\n          image_names:  contains the only the basename of the images.\n          poses: [N, 4, 4] array containing the camera to world matrices.\n          pixtocam: [N, 3, 3] array containing the camera to pixel space matrices.\n          distortion_params: mapping of distortion param name to distortion\n            parameters. Cameras share intrinsics. 
Valid keys are k1, k2, p1 and p2.\n        \"\"\"\n\n        self.load_cameras()\n        self.load_images()\n        self.load_points3D()\n\n        # Assume shared intrinsics between all cameras.\n        cam = self.cameras[1]\n\n        # Extract focal lengths and principal point parameters.\n        fx, fy, cx, cy = cam.fx, cam.fy, cam.cx, cam.cy\n        pixtocam = np.linalg.inv(camera_utils.intrinsic_matrix(fx, fy, cx, cy))\n\n        # Extract extrinsic matrices in world-to-camera format.\n        imdata = self.images\n        w2c_mats = []\n        bottom = np.array([0, 0, 0, 1]).reshape(1, 4)\n        for k in imdata:\n            im = imdata[k]\n            rot = im.R()\n            trans = im.tvec.reshape(3, 1)\n            w2c = np.concatenate([np.concatenate([rot, trans], 1), bottom], axis=0)\n            w2c_mats.append(w2c)\n        w2c_mats = np.stack(w2c_mats, axis=0)\n\n        # Convert extrinsics to camera-to-world.\n        c2w_mats = np.linalg.inv(w2c_mats)\n        poses = c2w_mats[:, :3, :4]\n\n        # Image names from COLMAP. No need for permuting the poses according to\n        # image names anymore.\n        names = [imdata[k].name for k in imdata]\n        \n        # Switch from COLMAP (right, down, fwd) to NeRF (right, up, back) frame.\n        poses = poses @ np.diag([1, -1, -1, 1])\n        # pixtocam = np.diag([1, -1, -1]) @ pixtocam\n\n        # Get distortion parameters.\n        type_ = cam.camera_type\n\n        if type_ == 0 or type_ == 'SIMPLE_PINHOLE':\n            params = None\n            camtype = camera_utils.ProjectionType.PERSPECTIVE\n\n        elif type_ == 1 or type_ == 'PINHOLE':\n            params = None\n            camtype = camera_utils.ProjectionType.PERSPECTIVE\n\n        if type_ == 2 or type_ == 'SIMPLE_RADIAL':\n            params = {k: 0. 
for k in ['k1', 'k2', 'k3', 'p1', 'p2']}\n            params['k1'] = cam.k1\n            camtype = camera_utils.ProjectionType.PERSPECTIVE\n\n        elif type_ == 3 or type_ == 'RADIAL':\n            params = {k: 0. for k in ['k1', 'k2', 'k3', 'p1', 'p2']}\n            params['k1'] = cam.k1\n            params['k2'] = cam.k2\n            camtype = camera_utils.ProjectionType.PERSPECTIVE\n\n        elif type_ == 4 or type_ == 'OPENCV':\n            params = {k: 0. for k in ['k1', 'k2', 'k3', 'p1', 'p2']}\n            params['k1'] = cam.k1\n            params['k2'] = cam.k2\n            params['p1'] = cam.p1\n            params['p2'] = cam.p2\n            camtype = camera_utils.ProjectionType.PERSPECTIVE\n\n        elif type_ == 5 or type_ == 'OPENCV_FISHEYE':\n            params = {k: 0. for k in ['k1', 'k2', 'k3', 'k4']}\n            params['k1'] = cam.k1\n            params['k2'] = cam.k2\n            params['k3'] = cam.k3\n            params['k4'] = cam.k4\n            camtype = camera_utils.ProjectionType.FISHEYE\n\n        return names, poses, pixtocam, params, camtype\n\n\nclass Dataset:\n    def __init__(self, data_dir):\n        scene_manager = NeRFSceneManager(data_dir)\n        self.names, self.poses, self.pix2cam, self.params, self.camtype = scene_manager.process()\n        self.cam2pix = np.linalg.inv(self.pix2cam)\n        self.n_images = len(self.poses)\n\n        # re-permute images by name\n        sorted_image_names = sorted(deepcopy(self.names))\n        sort_img_idx = []\n        for i in range(self.n_images):\n            sort_img_idx.append(self.names.index(sorted_image_names[i]))\n        img_idx = np.array(sort_img_idx, dtype=np.int32)\n        self.poses = self.poses[sort_img_idx]\n\n        # calc near-far bounds\n        self.bounds = np.zeros([self.n_images, 2], dtype=np.float32)\n        name_to_ids = scene_manager.name_to_image_id\n        points3D = scene_manager.points3D\n        points3D_ids = scene_manager.point3D_ids\n        
point3D_id_to_images = scene_manager.point3D_id_to_images\n        image_id_to_image_idx = np.zeros(self.n_images + 10, dtype=np.int32)\n        for image_name in self.names:\n            image_id_to_image_idx[name_to_ids[image_name]] = sorted_image_names.index(image_name)\n\n        vis_arr = []\n        for pts_i in range(len(points3D)):\n            cams = np.zeros([self.n_images], dtype=np.uint8)\n            images_ids = point3D_id_to_images[points3D_ids[pts_i]]\n            for image_info in images_ids:\n                image_id = image_info[0]\n                image_idx = image_id_to_image_idx[image_id]\n                cams[image_idx] = 1\n            vis_arr.append(cams)\n\n        vis_arr = np.stack(vis_arr, 1)     # [n_images, n_pts ]\n\n        for img_i in range(self.n_images):\n            vis = vis_arr[img_i]\n            pts = points3D[vis == 1]\n            c2w = np.diag([1., 1., 1., 1.])\n            c2w[:3, :4] = self.poses[img_i]\n            w2c = np.linalg.inv(c2w)\n            z_vals = (w2c[None, 2, :3] * pts).sum(-1) + w2c[None, 2, 3]\n            depth = -z_vals\n            near_depth, far_depth = np.percentile(depth, 1.), np.percentile(depth, 99.)\n            near_depth = near_depth * .5\n            far_depth = far_depth * 5.\n            self.bounds[img_i, 0], self.bounds[img_i, 1] = near_depth, far_depth\n\n        # Move all to numpy\n        def proc(x):\n            return np.ascontiguousarray(np.array(x).astype(np.float64))\n\n        self.poses = proc(self.poses)\n        self.cam2pix = proc(np.tile(self.cam2pix[None], (len(self.poses), 1, 1)))\n        self.bounds = proc(self.bounds)\n        if self.params is not None:\n            dist_params = [ self.params['k1'], self.params['k2'], self.params['p1'], self.params['p2'] ]\n        else:\n            dist_params = [0., 0., 0., 0.]\n        dist_params = np.tile(np.array(dist_params), len(self.poses)).reshape([len(self.poses), -1])\n        self.dist_params = 
proc([dist_params])\n\n    def export(self, data_dir, out_mode):\n        n = len(self.poses)\n        if out_mode == 'cams_meta':\n            data = np.concatenate([self.poses.reshape([n, -1]),\n                                   self.cam2pix.reshape([n, -1]),\n                                   self.dist_params.reshape([n, -1]),\n                                   self.bounds.reshape([n, -1])], axis=-1)\n            data = np.ascontiguousarray(np.array(data).astype(np.float64))\n            np.save(pjoin(data_dir, 'cams_meta.npy'), data)\n        elif 'poses_bounds' in out_mode :\n            poses = deepcopy(self.poses)\n            image_list = []\n            suffs = ['*.png', '*.PNG', '*.jpg', '*.JPG']\n            for suff in suffs:\n                image_list += glob(pjoin(data_dir, 'images', suff))\n            h, w, _ = cv.imread(image_list[0]).shape\n            focal = (self.cam2pix[0, 0, 0] + self.cam2pix[0, 1, 1]) * .5\n\n            # poses_ = torch::cat({ poses_.index({Slc(), Slc(), Slc(1, 2)}),\n            # -poses_.index({Slc(), Slc(), Slc(0, 1)}),\n            # poses_.index({Slc(), Slc(), Slc(2, None)})}, 2);\n\n            if out_mode == 'poses_bounds_raw':\n                poses = np.concatenate([-poses[:, :, 1:2], poses[:, :, 0:1], poses[:, :, 2:]], 2)\n\n            hwf = np.zeros([n, 3])\n            hwf[:, 0] = h\n            hwf[:, 1] = w\n            hwf[:, 2] = focal\n            bounds = self.bounds\n            poses_hwf = np.concatenate([poses, hwf[:, :, None]], -1)\n            data = np.concatenate([poses_hwf.reshape([n, -1]), bounds.reshape([n, -1])], -1)\n            data = np.ascontiguousarray(np.array(data).astype(np.float64))\n            np.save(pjoin(data_dir, '{}.npy'.format(out_mode)), data)\n\n\n@click.command()\n@click.option('--data_dir', type=str)\n@click.option('--out_mode', type=str, default='cams_meta')\ndef main(data_dir, out_mode):\n    dataset = Dataset(data_dir)\n    dataset.export(data_dir, out_mode)\n\n\nif 
__name__ == '__main__':\n    main()\n"
  },
  {
    "path": "FourierGrid/run_export_bbox.py",
    "content": "import numpy as np\nimport os\nimport pdb\nfrom pathlib import Path\nfrom FourierGrid import utils, dvgo, dcvgo, dmpigo\nfrom FourierGrid.bbox_compute import compute_bbox_by_cam_frustrm\n\n\ndef run_export_bbox_cams(args, cfg, data_dict, save_path=None): \n    verbose = args.block_num <= 1\n    if verbose:\n        print('Export bbox and cameras...')\n    if save_path is None:\n        save_path = args.export_bbox_and_cams_only\n    xyz_min, xyz_max = compute_bbox_by_cam_frustrm(args=args, cfg=cfg, **data_dict)\n    poses, HW, Ks, i_train = data_dict['poses'], data_dict['HW'], data_dict['Ks'], data_dict['i_train']\n    near, far = data_dict['near'], data_dict['far']\n    if data_dict['near_clip'] is not None:\n        near = data_dict['near_clip']\n    cam_lst = []\n    for c2w, (H, W), K in zip(poses[i_train], HW[i_train], Ks[i_train]):\n        rays_o, rays_d, viewdirs = dvgo.get_rays_of_a_view(\n                H, W, K, c2w, cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y,)\n        cam_o = rays_o[0,0].cpu().numpy()\n        cam_d = rays_d[[0,0,-1,-1],[0,-1,0,-1]].cpu().numpy()\n        frustrum_height = max(near, far*0.05) * cfg.vis.height_rate\n        cam_lst.append(np.array([cam_o, *(cam_o+cam_d*frustrum_height)]))\n    dir_name = os.path.dirname(save_path)\n    Path(dir_name).mkdir(parents=True, exist_ok=True)\n    np.savez_compressed(save_path,\n        xyz_min=xyz_min.cpu().numpy(), xyz_max=xyz_max.cpu().numpy(),\n        cam_lst=np.array(cam_lst))\n"
  },
  {
    "path": "FourierGrid/run_export_coarse.py",
    "content": "import torch\nimport os\nimport numpy as np\nfrom FourierGrid.load_everything import load_existing_model\n\n\ndef run_export_coarse(args, cfg, device, save_path=None):\n    verbose = args.block_num <= 1\n    if verbose:\n        print('Export coarse visualization...')\n    with torch.no_grad():\n        ckpt_path = os.path.join(cfg.basedir, cfg.expname, 'coarse_last.tar')\n        # model = utils.load_model(dvgo.DirectVoxGO, ckpt_path).to(device)\n        model, _, _ = load_existing_model(args, cfg, cfg.fine_train, ckpt_path, device=device)\n        model.to(device)\n        alpha = model.activate_density(model.density.get_dense_grid()).squeeze().cpu().numpy()\n        rgb = torch.sigmoid(model.k0.get_dense_grid()).squeeze().permute(1,2,3,0).cpu().numpy()\n    if save_path is None:\n        save_path = args.export_coarse_only\n    np.savez_compressed(save_path, alpha=alpha, rgb=rgb)\n"
  },
  {
    "path": "FourierGrid/run_gen_cam_paths.py",
    "content": "import pdb, os, cv2\nfrom turtle import onkey\nfrom random import sample\nimport imageio\nfrom FourierGrid import utils\nimport numpy as np\nfrom scipy.spatial.distance import cdist\nfrom scipy.spatial.transform import Rotation as R\nfrom pathlib import Path\nfrom FourierGrid import utils, dvgo, dcvgo, dmpigo\nfrom FourierGrid.bbox_compute import compute_bbox_by_cam_frustrm\nimport shutil\n\n\ndef select_k_nearest_points(idx, positions, k):\n    positions = np.array(positions)\n    distances = [np.linalg.norm(a-positions[idx]) for a in positions]\n    sorted_idxs = sorted(zip(range(len(distances)), distances), key=lambda row: row[1])\n    sorted_idxs = [i for i, j in sorted_idxs[:1+k]]  # the first one is itself\n    return sorted_idxs\n\n\ndef move_idxs_to_folder(data_dict, sampled_idxs, save_path, data_root=\"data/sep19_ordered_dataset\"):\n    images = data_dict['images']\n    rgb_paths = [images[idx] for idx in sampled_idxs]\n    full_paths = [os.path.join(data_root, path) for path in rgb_paths]\n    for one_p in full_paths:\n        image_name = one_p.split(\"/\")[-1]\n        shutil.copyfile(one_p, os.path.join(save_path, image_name))\n    return\n\n\ndef render_idxs(data_dict, sampled_idxs, save_path, data_root=\"data/sep19_ordered_dataset\", fps=15, output_res=(800, 608)):\n    images = data_dict['images']\n    rgb_paths = [images[idx] for idx in sampled_idxs]\n    rgb_images = [imageio.imread(os.path.join(data_root, path)) / 255. 
for path in rgb_paths]\n    rgb_images = [cv2.resize(img, output_res) for img in rgb_images]\n    shapes = [img.shape for img in rgb_images]\n    imageio.mimwrite(save_path, utils.to8b(rgb_images), fps=fps, quality=8)\n    print(f\"Demo video is saved at {save_path}.\")\n    return\n\n\ndef get_rotation_kp_2d(args, cfg, data_dict, sample_idxs):\n    # get rotation vector in 2D planes\n    poses, HW, Ks, near, far = data_dict['poses'][sample_idxs], data_dict['HW'][sample_idxs], data_dict['Ks'][sample_idxs], data_dict['near'], data_dict['far']\n    xyz_min, xyz_max = compute_bbox_by_cam_frustrm(args=args, cfg=cfg, HW=HW, Ks=Ks, poses=poses, i_train=list(range(len(poses))), near=near, far=far, near_clip=data_dict['near_clip'])\n    near, far = data_dict['near'], data_dict['far']\n    if data_dict['near_clip'] is not None:\n        near = data_dict['near_clip']\n    rotations = []\n    for c2w, (H, W), K in zip(poses, HW, Ks):\n        rays_o, rays_d, viewdirs = dvgo.get_rays_of_a_view(\n                H, W, K, c2w, cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y,)\n        # cam_o = rays_o[0, 0].cpu().numpy()\n        # cam_d = rays_d[[0,0,-1,-1],[0,-1,0,-1]].cpu().numpy()\n        cam_d = rays_d[rays_d.shape[0]//2, rays_d.shape[1]//2].cpu().numpy()\n        rotations.append(cam_d)\n    return rotations\n    \n    \ndef run_export_bbox_cams(args, cfg, data_dict, sample_idxs, save_path): \n    # save the sampled camera in order to visualize it\n    print('Export bbox and cameras...')\n    poses, HW, Ks, near, far = data_dict['poses'][sample_idxs], data_dict['HW'][sample_idxs], data_dict['Ks'][sample_idxs], data_dict['near'], data_dict['far']\n    xyz_min, xyz_max = compute_bbox_by_cam_frustrm(args=args, cfg=cfg, HW=HW, Ks=Ks, poses=poses, i_train=list(range(len(poses))), near=near, far=far, near_clip=data_dict['near_clip'])\n    near, far = data_dict['near'], data_dict['far']\n    if data_dict['near_clip'] is 
not None:\n        near = data_dict['near_clip']\n    cam_lst = []\n    for c2w, (H, W), K in zip(poses, HW, Ks):\n        rays_o, rays_d, viewdirs = dvgo.get_rays_of_a_view(\n                H, W, K, c2w, cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y,)\n        cam_o = rays_o[0, 0].cpu().numpy()\n        cam_d = rays_d[[0,0,-1,-1],[0,-1,0,-1]].cpu().numpy()\n        cam_lst.append(np.array([cam_o, *(cam_o+cam_d*max(near, far*0.05))]))\n    dir_name = os.path.dirname(save_path)\n    Path(dir_name).mkdir(parents=True, exist_ok=True)\n    np.savez_compressed(save_path,\n        xyz_min=xyz_min.cpu().numpy(), xyz_max=xyz_max.cpu().numpy(),\n        cam_lst=np.array(cam_lst))\n    print(f\"The cam path has been saved at {save_path}.\")\n\n\ndef run_gen_cam_paths(args, cfg, data_dict, core_cam=None, straight_length=100):\n    print(\"Generating camera paths ...\")\n    # retrieve set of image lists for rendering videos\n    images = data_dict['images']\n    val_imgs = images[data_dict['i_val'][0]:data_dict['i_val'][-1]]\n    whole_cam_idxs = data_dict['cam_idxs']\n    whole_positions = data_dict['positions']\n    whole_poses = data_dict['poses']\n    # generate straight videos\n    idxs_all = list(range(len(data_dict['positions'])))\n    if core_cam is None:  # used as the core camera for video rendering\n        core_cam = max(set(whole_cam_idxs), key=whole_cam_idxs.count)\n    core_idxs = [idx for idx in idxs_all if whole_cam_idxs[idx] == core_cam]\n    sampled_positions = [whole_positions[idx] for idx in core_idxs]\n    sorted_idxs = sorted(zip(core_idxs, sampled_positions), key=lambda row: (row[1][1], row[1][0]))\n    sorted_idxs = [i for i, j in sorted_idxs]\n    sample_start = len(sorted_idxs) // 2 - straight_length // 2  # sample from mid\n    straight_idxs = sorted_idxs[sample_start: sample_start + straight_length]\n    save_p = 'data/samples/demo_video'\n    os.makedirs(save_p, exist_ok=True)\n    # 
render_idxs(data_dict, straight_idxs, save_path=os.path.join(save_p, 'straight.mp4'))\n    # run_export_bbox_cams(args, cfg, data_dict=data_dict, sample_idxs=straight_idxs, save_path=os.path.join(save_p, 'straight_cam.npz'))\n\n    # visualizing cameras in different rotations\n    close_idxs = select_k_nearest_points(sample_start, whole_positions, k=15)\n    rotations = get_rotation_kp_2d(args, cfg, data_dict, close_idxs)\n    sorted_idxs = sorted(zip(close_idxs, rotations), key=lambda row: (row[1][1], row[1][0]))\n    sorted_idxs = [i for i, j in sorted_idxs]\n    cam2idxs = {}\n    # save sorted indexes in to the disk.\n    save_idxs = []\n    for one_idx in sorted_idxs:\n        cam_idx = whole_cam_idxs[one_idx]\n        if cam_idx not in cam2idxs:\n            cam2idxs[cam_idx] = [one_idx] + straight_idxs\n            print(f'cam_id:{cam_idx}, image path: {images[one_idx]}, original idx: {one_idx}.')\n            save_idxs.append(one_idx)\n            run_export_bbox_cams(args, cfg, data_dict=data_dict, sample_idxs=cam2idxs[cam_idx], save_path=os.path.join(save_p, f'cam_{cam_idx}.npz'))\n    # move_idxs_to_folder(data_dict, save_idxs, save_path=save_p)\n\n    # close_poses = [whole_poses[idx] for idx in close_idxs]\n    # rotations = [R.from_matrix(pose[:3, :3]) for pose in close_poses]\n    # rot_degrees = [r.as_euler('zxy', degrees=True) for r in rotations]\n    # sorted_idxs = sorted(zip(close_idxs, rot_degrees), key=lambda row: row[1][0])\n    # sorted_idxs = [i for i, j in sorted_idxs]\n    final_rot_idxs = sorted_idxs\n    \n    # start_pos_in_sort = sorted_idxs.index(sample_start)\n    # if start_pos_in_sort > len(sorted_idxs) // 2:\n    #     final_rot_idxs = sorted_idxs[:start_pos_in_sort]  # later\n    # else:\n    #     final_rot_idxs = sorted_idxs[start_pos_in_sort:]\n    combined_idxs = final_rot_idxs + straight_idxs\n    render_idxs(data_dict, combined_idxs, save_path=os.path.join(save_p, 'rot.mp4'))\n    run_export_bbox_cams(args, cfg, 
data_dict=data_dict, sample_idxs=combined_idxs, save_path=os.path.join(save_p, 'rot_cam.npz'))\n"
  },
  {
    "path": "FourierGrid/run_gtk_analysis.py",
    "content": "import matplotlib\nimport matplotlib.pylab as pylab\nimport matplotlib.pyplot as plt\nfrom matplotlib.lines import Line2D\nfrom tqdm import tqdm\nimport pdb\nimport time\nimport torch\n# import random\n# from jax import random\nimport torch.nn as nn\nimport numpy as np\nfrom scipy.special import jv\nfrom scipy.ndimage import gaussian_filter1d\nfrom mpl_toolkits.mplot3d import axes3d\n\n\nclass VoxelGrid(nn.Module):\n    # the V o x e l G ri d operator\n    def __init__(self, grid_len=1000, data_point_num=100):\n        super(VoxelGrid, self).__init__()\n        self.grid_len = grid_len\n        self.data_point_num = data_point_num\n        self.interval_num = grid_len - 1\n        axis_coord = np.array([0 + i * 1 / grid_len for i in range(grid_len)])\n        self.ms_x, self.ms_t = np.meshgrid(axis_coord, axis_coord)  # x and t of the grid\n        x_coord = np.ravel(self.ms_x).reshape(-1,1)\n        t_coord = np.ravel(self.ms_t).reshape(-1,1)\n        self.x_coord = torch.tensor(x_coord).float()\n        self.t_coord = torch.tensor(t_coord).float()\n        axis_index = np.array([i for i in range(grid_len)])\n        ms_x, ms_t = np.meshgrid(axis_index, axis_index)\n        x_ind = np.ravel(ms_x).reshape(-1,1)\n        t_ind = np.ravel(ms_t).reshape(-1,1)\n        self.x_ind = torch.tensor(x_ind).long()\n        self.t_ind = torch.tensor(t_ind).long()\n        self.voxel = nn.Parameter(torch.rand((grid_len)), requires_grad=True)\n    \n    def forward(self,):  # calculate GTK\n        # the data is [0, 1/data_point_num, 2/data_point_num, ..., 1]\n        jacobian_y_w = np.zeros((self.data_point_num, self.grid_len))\n        for idx in range(self.data_point_num):\n            real_x = idx / self.data_point_num\n            left_grid = int(real_x // (1 / self.grid_len))\n            right_grid = left_grid + 1\n            if left_grid >= 0:\n                jacobian_y_w[idx][left_grid] = abs(real_x - right_grid * 1 / self.grid_len) * self.grid_len\n  
          if right_grid < self.grid_len:\n                jacobian_y_w[idx][right_grid] = abs(real_x - left_grid * 1 / self.grid_len) * self.grid_len\n        jacobian_y_w_transpose = np.transpose(jacobian_y_w)\n        result_matrix = np.matmul(jacobian_y_w, jacobian_y_w_transpose)\n        return result_matrix\n\n    def one_d_regress(self, x_train, x_test, y_train, y_test_gt):\n        train_loss = 0\n        for idx, one_x in enumerate(x_train):\n            left_grid = int(one_x // (1 / self.interval_num))\n            right_grid = left_grid + 1\n            left_value = self.voxel[left_grid]\n            right_value = self.voxel[right_grid]\n            left_weight = abs(one_x - right_grid * 1 / self.interval_num) * self.interval_num\n            right_weight = abs(one_x - left_grid * 1 / self.interval_num) * self.interval_num\n            y_pred = left_value * left_weight + right_value * right_weight\n            y_pred = torch.sigmoid(y_pred)\n            train_loss += torch.nn.functional.mse_loss(y_pred, torch.tensor(y_train[idx]).float())\n        \n        y_test = []\n        test_loss = []\n        for idx, one_x in enumerate(x_test):\n            left_grid = int(one_x // (1 / self.interval_num))\n            right_grid = left_grid + 1\n            left_value = self.voxel[left_grid]\n            right_value = self.voxel[right_grid]\n            left_weight = abs(one_x - right_grid * 1 / self.interval_num) * self.interval_num\n            right_weight = abs(one_x - left_grid * 1 / self.interval_num) * self.interval_num\n            y_pred = left_value * left_weight + right_value * right_weight\n            y_pred = torch.sigmoid(y_pred)\n            y_test.append(y_pred.item())\n            test_loss.append((y_pred.item() - y_test_gt[idx]) ** 2)\n        test_loss = np.mean(test_loss)\n        return train_loss, test_loss, y_test\n\n\nclass FourierGrid(nn.Module):\n    # the FourierGrid operator\n    def __init__(self, grid_len=1000, band_num=10, 
data_point_num=100):\n        super(FourierGrid, self).__init__()\n        self.grid_len = grid_len\n        self.data_point_num = data_point_num\n        self.interval_num = self.grid_len - 1\n        self.band_num = band_num\n        axis_coord = np.array([0 + i * 1 / grid_len for i in range(grid_len)])\n        self.ms_x, self.ms_t = np.meshgrid(axis_coord, axis_coord)  # x and t of the grid\n        x_coord = np.ravel(self.ms_x).reshape(-1,1)\n        t_coord = np.ravel(self.ms_t).reshape(-1,1)\n        self.x_coord = torch.tensor(x_coord).float()\n        self.t_coord = torch.tensor(t_coord).float()\n        axis_index = np.array([i for i in range(grid_len)])\n        ms_x, ms_t = np.meshgrid(axis_index, axis_index)\n        x_ind = np.ravel(ms_x).reshape(-1,1)\n        t_ind = np.ravel(ms_t).reshape(-1,1)\n        self.x_ind = torch.tensor(x_ind).long()\n        self.t_ind = torch.tensor(t_ind).long()\n        self.voxel = nn.Parameter(torch.rand((grid_len * (self.band_num + 1))), requires_grad=True)\n    \n    def gamma_x_i(self, x, i):\n        if i%2 == 0:\n            raw_fourier = np.sin((2^(i // 2)) * np.pi * x)\n        else:\n            raw_fourier = np.cos((2^(i // 2)) * np.pi * x)\n        fourier = (raw_fourier + 1) / 2   # to [0, 1]\n        return fourier\n        \n    def forward(self,):  # calculate GTK\n        jacobian_y_w = np.zeros((self.data_point_num, self.grid_len * self.band_num))\n        for idx in range(self.data_point_num):  # for all data points\n            real_x = idx / self.data_point_num   # the real x value\n            for jdx in range(self.band_num):\n                fourier = self.gamma_x_i(real_x, jdx)\n                left_grid = int(fourier // (1 / self.grid_len))\n                right_grid = left_grid + 1\n                if left_grid > 0:\n                    jacobian_y_w[idx][self.grid_len * jdx + left_grid] = abs(fourier - right_grid * 1 / self.grid_len) * self.grid_len\n                if right_grid 
< self.grid_len:\n                    jacobian_y_w[idx][self.grid_len * jdx + right_grid] = abs(fourier - left_grid * 1 / self.grid_len) * self.grid_len\n        jacobian_y_w_transpose = np.transpose(jacobian_y_w)\n        result_matrix = np.matmul(jacobian_y_w, jacobian_y_w_transpose)\n        return result_matrix\n    \n    def one_d_regress(self, x_train, x_test, y_train, y_test_gt):\n        train_loss = 0\n        for idx, one_x in enumerate(x_train):\n            y_pred = 0\n            for jdx in range(self.band_num):\n                fourier = self.gamma_x_i(one_x, jdx)\n                left_grid = int(fourier * self.interval_num)\n                right_grid = left_grid + 1\n                left_value = self.voxel[self.grid_len * jdx + left_grid]\n                right_value = self.voxel[self.grid_len * jdx + right_grid]\n                left_weight = abs(fourier - right_grid * 1 / self.interval_num) * self.interval_num\n                right_weight = abs(fourier - left_grid * 1 / self.interval_num) * self.interval_num\n                assert abs(left_weight + right_weight - 1) < 0.0001\n                y_pred += left_value * left_weight + right_value * right_weight\n            y_pred /= self.band_num\n            y_pred = torch.sigmoid(y_pred)\n            train_loss += torch.nn.functional.mse_loss(y_pred, torch.tensor(y_train[idx]).float())\n        \n        y_test = []\n        test_loss = []\n        for idx, one_x in enumerate(x_test):\n            y_pred = 0\n            for jdx in range(self.band_num):\n                fourier = self.gamma_x_i(one_x, jdx)\n                left_grid = int(fourier * self.interval_num)\n                right_grid = left_grid + 1\n                left_value = self.voxel[self.grid_len * jdx + left_grid]\n                right_value = self.voxel[self.grid_len * jdx + right_grid]\n                left_weight = abs(fourier - right_grid * 1 / self.interval_num) * self.interval_num\n                right_weight = abs(fourier 
- left_grid * 1 / self.interval_num) * self.interval_num\n                y_pred += left_value * left_weight + right_value * right_weight\n            y_pred /= self.band_num\n            y_pred = torch.sigmoid(y_pred)\n            y_test.append(y_pred.item())\n            test_loss.append((y_pred.item() - y_test_gt[idx]) ** 2)\n        test_loss = np.mean(test_loss)\n        return train_loss, test_loss, y_test\n\n\n# build models and train them\ndef train_model(one_model):\n    # training and testing VoxelGrid\n    optimizer = torch.optim.Adam(one_model.parameters(), lr=lr)\n    iterations = 150\n    epoch_iter = tqdm(range(iterations))\n    for epoch in epoch_iter:\n        optimizer.zero_grad() # to make the gradients zero\n        train_loss, test_loss, test_y = one_model.one_d_regress(x_train, x_test, y_train, y_test_gt)\n        train_loss.backward() # This is for computing gradients using backward propagation\n        optimizer.step()      # This is equivalent to: theta_new = theta_old - alpha * derivative of J w.r.t theta\n        epoch_iter.set_description(f\"Training loss: {train_loss.item()}; Testing Loss: {test_loss}\")\n    return train_loss, test_loss, test_y\n\n\ndef get_fg_gtk_spectrum_by_band_num(band_num):\n    test_fg = FourierGrid(grid_len=grid_len, band_num=band_num * 2)\n    fg_gtk = test_fg()\n    # fg_gtk = (fg_gtk - fg_gtk.min()) / (fg_gtk.max() - fg_gtk.min())\n    fg_gtk_spectrum = 10**fplot(fg_gtk)\n    fg_plot = gaussian_filter1d(fg_gtk_spectrum[0], sigma=2)\n    return fg_plot\n\n\n# hyperparameters\ntitle_offset = -0.29\nbbox_offset = 1.44\ndata_point_num = 100\ngrid_len = 10\nfreq_num = 10\ncolors_k = np.array([[0.8872, 0.4281, 0.1875],\n    [0.8136, 0.6844, 0.0696],\n    [0.2634, 0.6634, 0.4134],\n    [0.0943, 0.5937, 0.8793],\n    [0.3936, 0.2946, 0.6330],\n    [0.7123, 0.2705, 0.3795]])\nlinewidth = 1.0\nline_alpha = .8\ntitle_font_size = 7.4\nlegend_font_size = 6\nlabel_size = 7\n# matplotlib.rcParams[\"font.family\"] = 
'Arial'\nmatplotlib.rcParams['xtick.labelsize'] = label_size \nmatplotlib.rcParams['ytick.labelsize'] = label_size \n\n# begin plot \nfig3 = plt.figure(constrained_layout=True, figsize=(4, 4))\ngs = fig3.add_gridspec(2, 2, width_ratios=[1, 1], height_ratios=[1, 1])\n# 100 * 100 datapoints, 10*10 params (grid_len=10)\ntest_vg = VoxelGrid(grid_len=grid_len * freq_num)\nvg_gtk = test_vg()\nvg_gtk_normalized = (vg_gtk - vg_gtk.min()) / (vg_gtk.max() - vg_gtk.min())\nax = fig3.add_subplot(gs[0, 0])\nax.imshow(vg_gtk_normalized)\nax.set_xticks([*range(0, 100, 20)] + [100])\nax.set_yticks([*range(0, 100, 20)] + [100])\nax.grid(linestyle = '--', linewidth = 0.3)\nax.set_title('(a) VoxelGrid GTK', y=title_offset, fontsize=title_font_size)\n\nax = fig3.add_subplot(gs[0, 1])\ntest_fg = FourierGrid(grid_len=grid_len, band_num=freq_num)\nfg_gtk = test_fg()\nfg_gtk = (fg_gtk - fg_gtk.min()) / (fg_gtk.max() - fg_gtk.min())\nax.imshow(fg_gtk)\nax.set_xticks([*range(0, 100, 20)] + [100])\nax.set_yticks([*range(0, 100, 20)] + [100])\nax.grid(linestyle = '--', linewidth = 0.3)\nax.set_title('(b) FourierGrid GTK', y=title_offset, fontsize=title_font_size)\n\nax = fig3.add_subplot(gs[1, 0])\nw_vg, v_vg = np.linalg.eig(vg_gtk)\nw_fg, v_fg = np.linalg.eig(fg_gtk)\nfplot = lambda x : np.fft.fftshift(np.log10(np.abs(np.fft.fft(x))))\nvg_gtk_spectrum = 10**fplot(vg_gtk)\nvg_plot = gaussian_filter1d(vg_gtk_spectrum[0], sigma=2)\n\nfg_gtk_plot_1 = get_fg_gtk_spectrum_by_band_num(band_num=1)\nfg_gtk_plot_5 = get_fg_gtk_spectrum_by_band_num(band_num=5)\nfg_gtk_plot_10 = get_fg_gtk_spectrum_by_band_num(band_num=10)\nplt.autoscale(enable=True, axis='x', tight=True)\n# plt.plot(vg_plot, label='VoxelGrid', color=colors_k[0], alpha=line_alpha, linewidth=linewidth)\nplt.semilogy(np.append(vg_plot, vg_plot[0]), label='VoxelGrid', color=colors_k[0], alpha=line_alpha, linewidth=linewidth)\n# plt.semilogy(fg_gtk_plot_1, label='FourierGrid (l=1)', color=colors_k[2], alpha=line_alpha, 
linewidth=linewidth)\nplt.semilogy(np.append(fg_gtk_plot_1, fg_gtk_plot_1[0]), label='FourierGrid (l=1)', color=colors_k[2], alpha=line_alpha, linewidth=linewidth)\n# plt.semilogy(fg_gtk_plot_5, label='FourierGrid (l=5)', color=colors_k[3], alpha=line_alpha, linewidth=linewidth)\nplt.semilogy(np.append(fg_gtk_plot_5, fg_gtk_plot_5[0]), label='FourierGrid (l=5)', color=colors_k[3], alpha=line_alpha, linewidth=linewidth)\n# plt.semilogy(fg_gtk_plot_10, label='FourierGrid (l=10)', color=colors_k[4], alpha=line_alpha, linewidth=linewidth)\nplt.semilogy(np.append(fg_gtk_plot_10, fg_gtk_plot_10[0]), label='FourierGrid (l=10)', color=colors_k[4], alpha=line_alpha, linewidth=linewidth)\nplt.xticks([0,25,50,75,100], ['$-\\pi$','$-\\pi/2$','$0$','$\\pi/2$','$\\pi$'])\nax.set_yticks([0.1, 1, 10, 100])\nax.legend(loc='upper left', bbox_to_anchor=(-0.01, bbox_offset), handlelength=1, fontsize=legend_font_size, fancybox=False, ncol=1)\nax.set_title('(c) GTK Fourier Spectrum', y=title_offset, fontsize=title_font_size)\n\n\ndef sample_random_signal(key, decay_vec):\n  N = decay_vec.shape[0]\n#   raw = random.normal(key, [N, 2]) @ np.array([1, 1j])\n  raw = np.random.normal(size=[N, 2]) @ np.array([1, 1j])\n  signal_f = raw * decay_vec\n  signal = np.real(np.fft.ifft(signal_f))\n  return signal\n\n\ndef sample_random_powerlaw(key, N, power):\n  coords = np.float32(np.fft.ifftshift(1 + N//2 - np.abs(np.fft.fftshift(np.arange(N)) - N//2)))\n  decay_vec = coords ** (-power)\n  decay_vec = np.array(decay_vec)\n  decay_vec[N//4:] = 0\n  return sample_random_signal(key, decay_vec)\n\n\ndef get_sine_signal():\n    return np.array([np.sin(x / (train_num*sample_interval) * 2 * np.pi) for x in range(train_num*sample_interval)])\n\n\ndef get_bessel_signal():\n    # return np.array([np.exp(x / train_num*sample_interval) for x in range(train_num*sample_interval)])\n    return np.array([jv(1, x / 4) for x in range(train_num*sample_interval)])\n\n##  Fitting experiments\n# 
hyperparameters\nrand_key = np.array([0, 0], dtype=np.uint32)\ntrain_num = 7\nsample_interval = 4\ndata_power = 0.5\nlr = 1\n\n\n# setup data\nx_test = np.float32(np.linspace(0, 1., train_num * sample_interval, endpoint=False))\nx_train = x_test[0:len(x_test):sample_interval]\n\n# signal = get_sine_signal()\nsignal = get_bessel_signal()\nsignal = (signal-signal.min()) / (signal.max()-signal.min())\n\ny_train = signal[0:len(x_test):sample_interval]\ny_test_gt = signal\n\n\nfreq_num = 3\ntest_vg_small = VoxelGrid(grid_len=10 * freq_num)\ntest_vg_large = VoxelGrid(grid_len=100 * freq_num)\ntest_fg_small = FourierGrid(grid_len=10, band_num=freq_num)\ntest_fg_large = FourierGrid(grid_len=100, band_num=freq_num)\ntrain_loss, test_loss, test_y_vg_small = train_model(test_vg_small)\ntrain_loss, test_loss, test_y_fg_small = train_model(test_fg_small)\n\nax = fig3.add_subplot(gs[1, 1])\nax.plot(x_test, signal, label='Target signal', color='k', linewidth=1, alpha=line_alpha, zorder=1)\nax.scatter(x_train, y_train, color='w', edgecolors='k', linewidths=1, s=20, linewidth=1, label='Training points', zorder=2)\nax.plot(x_test, test_y_vg_small, label='Learned by VoxelGrid', color=colors_k[0], linewidth=1, alpha=line_alpha, zorder=1)\nax.plot(x_test, test_y_fg_small, label='Learned by FourierGrid', color=colors_k[3], linewidth=1, alpha=line_alpha, zorder=1)\nax.set_title('(d) 1D Regression', y=title_offset, fontsize=title_font_size)\nax.set_xticks(np.linspace(0.0, 1.0, num=5, endpoint=True))\nax.legend(loc='upper left', bbox_to_anchor=(-0.01, bbox_offset), handlelength=1, fontsize=legend_font_size, fancybox=False, ncol=1)\n\nprint(\"Plotting figures 1\")\nplt.savefig(\"figures/vg_fg_gtk.jpg\", dpi=300) # for example\nplt.savefig(\"figures/vg_fg_gtk.pdf\", format=\"pdf\")\n\n####################################################################\n# plotting a new diagram, figure 2\n####################################################################\ndef calculate_Delta(gtk, y1_data, 
y2_data):\n    # batch_y: [1 * 2 * B]\n    batch_y_transpose = np.expand_dims(np.stack([y1_data, y2_data]).reshape(2, -1).transpose(), axis=2)\n    batch_y = np.expand_dims(np.stack([y1_data, y2_data]).reshape(2, -1).transpose(), axis=1)\n    batch_size = batch_y.shape[0]\n    batch_gtk = np.array([np.linalg.inv(gtk) for i in range(batch_size)])\n    result = np.matmul(batch_y, np.matmul(batch_gtk, batch_y_transpose)).squeeze()\n    return result\n\n\none_vg = VoxelGrid(grid_len=grid_len, data_point_num=2)\nvg_gtk = one_vg()\none_fg = FourierGrid(grid_len=grid_len, data_point_num=2)\nfg_gtk = one_fg()\ny1_data, y2_data, Z = axes3d.get_test_data(0.05)\ny1_data = y1_data / y1_data.max()\ny2_data = y2_data / y2_data.max()\nvg_values = calculate_Delta(vg_gtk, y1_data, y2_data).reshape(Z.shape)\nvg_values /= vg_values.max()\nfg_values = calculate_Delta(fg_gtk, y1_data, y2_data).reshape(Z.shape) \nfg_values /= fg_values.max()\n\n# begin plot \nfig4 = plt.figure(constrained_layout=True, figsize=(4, 2))\ngs = fig4.add_gridspec(1, 2, width_ratios=[1.04, 0.96])\n# 100 * 100 datapoints, 10*10 params (grid_len=10)\nax = fig4.add_subplot(gs[0, 0])\n\nX=np.array([0.15, 0.44, 0.3, 0.56, 0.78, 0.72])\nY=np.array([0.18, 0.34, 0.51, 0.72, 0.81, 0.93])\nannotations=[\"Synthetic-NeRF\", \"NSVF\", \"BlendedMVS\", \"UT&T\", \"M360\", \"SFMB\"]\n\nax.scatter(X[:3], Y[:3], s=30, color=colors_k[0], marker=\"s\")\nax.scatter(X[3:], Y[3:], s=30, color=colors_k[3], marker=\"o\")\nax.annotate(annotations[0], (X[0], Y[0]), fontsize=title_font_size)\nax.annotate(annotations[1], (X[1], Y[1]), fontsize=title_font_size)\nax.annotate(annotations[2], (X[2], Y[2]), fontsize=title_font_size)\nax.annotate(annotations[3], (X[3], Y[3]), fontsize=title_font_size)\nax.annotate(annotations[4], (X[4], Y[4]), fontsize=title_font_size)\nax.annotate(annotations[5], (X[5], Y[5]), fontsize=title_font_size)\nax.set_xlabel(\"Density Norms\", size=title_font_size)\nax.set_ylabel(\"Generalization Gap\", 
size=title_font_size)\n\nax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])\nax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])\nax.grid(linestyle='--', linewidth = 0.3)\nax.set_title('(a) Bounded vs. Unbounded Scenes',  y=-0.4, fontsize=title_font_size)\n\n\nax = fig4.add_subplot(gs[0, 1])\ndiffrences = vg_values - fg_values\nax.imshow(diffrences, cmap='coolwarm', )\nax.set_xticks([0,24,48,72,96, 120], [-1,-0.6,-0.2,0.2,0.6,1])\nax.set_yticks([0,24,48,72,96, 120], [-1,-0.6,-0.2,0.2,0.6,1])\nax.grid(linestyle = '--', linewidth = 0.3)\nax.set_xlabel(\"y1\", size=title_font_size, labelpad=1)\nax.set_ylabel(\"y2\", size=title_font_size, labelpad=-1)\nax.set_title('(b) Generalization Bound Diff.',  y=-0.491, fontsize=title_font_size)\nplot = ax.pcolor(diffrences)\nplt.colorbar(plot)\n\nprint(\"Plotting figures 2\")\nplt.savefig(\"figures/unbounded.jpg\", dpi=300) # for example\nplt.savefig(\"figures/unbounded.pdf\", format=\"pdf\")\npdb.set_trace()\n\n\n# ax = fig3.add_subplot(gs[0, 1])\n# test_fg = FourierGrid(grid_len=grid_len, band_num=freq_num)\n# fg_gtk = test_fg()\n# fg_gtk = (fg_gtk - fg_gtk.min()) / (fg_gtk.max() - fg_gtk.min())\n# ax.imshow(fg_gtk)\n# ax.set_xticks([*range(0, 100, 20)] + [100])\n# ax.set_yticks([*range(0, 100, 20)] + [100])\n# ax.grid(linestyle = '--', linewidth = 0.3)\n# ax.set_title('(b) FourierGrid GTK', y=title_offset, fontsize=title_font_size)\n\npdb.set_trace()\n\n######################################################################\n# 3D plotting\n######################################################################\n\nax = plt.figure(constrained_layout=True).add_subplot(projection='3d')\n\n# Plot the 3D surface\nax.plot_surface(y1_data, y2_data, vg_values, color=colors_k[0], edgecolor=colors_k[0], lw=0.5, rstride=8, cstride=8,\n                alpha=0.3)\nax.plot_surface(y1_data, y2_data, fg_values, color=colors_k[3], edgecolor=colors_k[3], lw=0.5, rstride=8, cstride=8,\n                alpha=0.3)\n# Plot projections of the contours for each 
dimension.  By choosing offsets\n# that match the appropriate axes limits, the projected contours will sit on\n# the 'walls' of the graph\ncolor_shift = 0.9\nax.contourf(y1_data, y2_data, fg_values - vg_values + color_shift, zdir='z', offset=0.0, cmap='coolwarm', vmin=0.2, vmax=1.6)\n# ax.contourf(y1_data, y2_data, Z, zdir='x', offset=-40, cmap='coolwarm')\n# ax.contourf(y1_data, y2_data, Z, zdir='y', offset=40, cmap='coolwarm')\nax.margins(x=0, y=0, z=0)\n\nax.set(xlim=(-1, 1), ylim=(-1, 1), zlim=(0, 1),\n       xlabel='y1', ylabel='y2', zlabel='Gen. Bound')\n\nprint(\"Plotting figures 3, supplementary\")\nplt.savefig(\"figures/compare_generalization.jpg\", dpi=300) # for example\nplt.savefig(\"figures/compare_generalization.pdf\", format=\"pdf\")\n\n"
  },
  {
    "path": "FourierGrid/run_render.py",
    "content": "import os\nimport pdb\nimport imageio\nimport torch\nfrom tqdm import tqdm, trange\nimport numpy as np\nfrom FourierGrid import utils, dvgo, dcvgo, dmpigo\nfrom FourierGrid.FourierGrid_model import FourierGridModel\nfrom FourierGrid.utils import resize_and_to_8b\nfrom FourierGrid.arf import ARF\nimport matplotlib.pyplot as plt\n\n\n@torch.no_grad()\ndef render_viewpoints(cfg, model, render_poses, HW, Ks, ndc, render_kwargs,\n                      gt_imgs=None, savedir=None, dump_images=False,\n                      render_factor=0, render_video_flipy=False, render_video_rot90=0,\n                      eval_ssim=False, eval_lpips_alex=False, eval_lpips_vgg=False, verbose=True):\n    '''Render images for the given viewpoints; run evaluation if gt given.\n    '''\n    assert len(render_poses) == len(HW) and len(HW) == len(Ks)\n    if render_factor!=0:\n        HW = np.copy(HW)\n        Ks = np.copy(Ks)\n        HW = (HW/render_factor).astype(int)\n        Ks[:, :2, :3] /= render_factor\n\n    rgbs = []\n    depths = []\n    bgmaps = []\n    psnrs = []\n    ssims = []\n    lpips_alex = []\n    lpips_vgg = []\n    if verbose:\n        tqdm_bar = tqdm(render_poses)\n    else:\n        tqdm_bar = render_poses\n    for i, c2w in enumerate(tqdm_bar):\n        H, W = HW[i]\n        K = Ks[i]\n        c2w = torch.Tensor(c2w)\n        rays_o, rays_d, viewdirs = dvgo.get_rays_of_a_view(\n                H, W, K, c2w, ndc, inverse_y=render_kwargs['inverse_y'],\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y)\n        keys = ['rgb_marched', 'depth', 'alphainv_last']\n        rays_o = rays_o.flatten(0,-2)\n        rays_d = rays_d.flatten(0,-2)\n        viewdirs = viewdirs.flatten(0,-2)\n        FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\"]\n        if cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid':\n            indexs = torch.zeros_like(rays_o)\n            
indexs.copy_(torch.tensor(i).long().to(rays_o.device))  # add image index\n            render_result_chunks = [\n                {k: v for k, v in model(ro, rd, vd, **{**render_kwargs, \"indexs\": ind}).items() if k in keys}\n                for ro, rd, vd, ind in zip(rays_o.split(8192, 0), rays_d.split(8192, 0), \n                                           viewdirs.split(8192, 0), indexs.split(8192, 0))\n            ]\n        else:\n            render_result_chunks = [\n                {k: v for k, v in model(ro, rd, vd, **render_kwargs).items() if k in keys}\n                for ro, rd, vd in zip(rays_o.split(8192, 0), rays_d.split(8192, 0), viewdirs.split(8192, 0))\n            ]\n        render_result = {\n            k: torch.cat([ret[k] for ret in render_result_chunks]).reshape(H,W,-1)\n            for k in render_result_chunks[0].keys()\n        }\n        rgb = render_result['rgb_marched'].cpu().numpy()\n        depth = render_result['depth'].cpu().numpy()\n        bgmap = render_result['alphainv_last'].cpu().numpy()\n\n        rgbs.append(rgb)\n        depths.append(depth)\n        bgmaps.append(bgmap)\n        if verbose:\n            tqdm_bar.set_description(f\"Rendering video with frame shape: {rgb.shape}.\")\n        if gt_imgs is not None and render_factor==0:\n            p = -10. 
* np.log10(np.mean(np.square(rgb - gt_imgs[i])))\n            psnrs.append(p)\n            if eval_ssim:\n                ssims.append(utils.rgb_ssim(rgb, gt_imgs[i], max_val=1))\n            if eval_lpips_alex:\n                lpips_alex.append(utils.rgb_lpips(rgb, gt_imgs[i], net_name='alex', device=c2w.device))\n            if eval_lpips_vgg:\n                lpips_vgg.append(utils.rgb_lpips(rgb, gt_imgs[i], net_name='vgg', device=c2w.device))\n\n    if len(psnrs):\n        print('Psnr', np.mean(psnrs), '(avg)')\n        if eval_ssim: print('Ssim', np.mean(ssims), '(avg)')\n        if eval_lpips_vgg: print('Lpips (vgg)', np.mean(lpips_vgg), '(avg)')\n        if eval_lpips_alex: print('Lpips (alex)', np.mean(lpips_alex), '(avg)')\n\n    if render_video_flipy:\n        for i in range(len(rgbs)):\n            rgbs[i] = np.flip(rgbs[i], axis=0)\n            depths[i] = np.flip(depths[i], axis=0)\n            bgmaps[i] = np.flip(bgmaps[i], axis=0)\n\n    if render_video_rot90 != 0:\n        for i in range(len(rgbs)):\n            rgbs[i] = np.rot90(rgbs[i], k=render_video_rot90, axes=(0,1))\n            depths[i] = np.rot90(depths[i], k=render_video_rot90, axes=(0,1))\n            bgmaps[i] = np.rot90(bgmaps[i], k=render_video_rot90, axes=(0,1))\n\n    if savedir is not None and dump_images:\n        for i in trange(len(rgbs)):\n            rgb8 = utils.to8b(rgbs[i])\n            filename = os.path.join(savedir, '{:03d}.png'.format(i))\n            imageio.imwrite(filename, rgb8)\n\n    rgbs = np.array(rgbs)\n    depths = np.array(depths)\n    bgmaps = np.array(bgmaps)\n    return rgbs, depths, bgmaps\n\n\ndef run_render(args, cfg, data_dict, device, debug=True, add_info=\"\", visualize_geometry=False):\n    # initilize stylizer when needed\n    if 'arf' in cfg:\n        stylizer = ARF(cfg, data_dict, device)\n    else:\n        stylizer = None\n    model_class = FourierGridModel                 # only support FourierGridModel currently\n    merged_ckpt_path = 
os.path.join(cfg.basedir, cfg.expname, f'fine_last_merged.tar')\n    use_merged = os.path.exists(merged_ckpt_path)\n    if use_merged:\n        merged_model = utils.load_model(model_class, merged_ckpt_path).to(device)\n    else:\n        merged_model = None\n    render_viewpoints_kwargs = {\n        'model': None,\n        'ndc': cfg.data.ndc,\n        'render_kwargs': {\n            'near': data_dict['near'],\n            'far': data_dict['far'],\n            'bg': 1 if cfg.data.white_bkgd else 0,\n            'stepsize': cfg.fine_model_and_render.stepsize,\n            'inverse_y': cfg.data.inverse_y,\n            'flip_x': cfg.data.flip_x,\n            'flip_y': cfg.data.flip_y,\n            'render_depth': True,\n        }\n    }\n    \n    # block-by-block rendering\n    if args.block_num > 1 and not use_merged:\n        print(\"Merging trained blocks ...\")\n        ckpt_paths = [os.path.join(cfg.basedir, cfg.expname, f'fine_last_{i}.tar') for i in range(args.block_num)]\n        if args.render_train:\n            model_class = FourierGridModel                 # only support FourierGridModel currently\n            ckpt_paths = [os.path.join(cfg.basedir, cfg.expname, f'fine_last_{i}.tar') for i in range(args.block_num)]\n            train_save_dir = os.path.join(cfg.basedir, cfg.expname, f'render_train_fine_last')\n            os.makedirs(train_save_dir, exist_ok=True)\n            print('All results are dumped into', train_save_dir)\n            all_rgbs = []\n            all_training_indexs = data_dict['i_train'].copy()\n            for idx, cp in enumerate(ckpt_paths):\n                args.running_block_id = idx\n                s, e = idx * args.num_per_block, (idx + 1) * args.num_per_block\n                # Here we assume the i_train's order follows the block order.\n                data_dict['i_train'] = all_training_indexs[s:e]\n                model = utils.load_model(model_class, cp).to(device)\n                render_viewpoints_kwargs['model'] = 
model\n                rgbs, depths, bgmaps = render_viewpoints(cfg=cfg, render_poses=data_dict['poses'][data_dict['i_train']],\n                HW=data_dict['HW'][data_dict['i_train']], Ks=data_dict['Ks'][data_dict['i_train']],\n                gt_imgs=[data_dict['images'][i].cpu().numpy() for i in data_dict['i_train']],\n                savedir=train_save_dir, dump_images=args.dump_images, eval_ssim=args.eval_ssim, \n                eval_lpips_alex=args.eval_lpips_alex, eval_lpips_vgg=args.eval_lpips_vgg,\n                **render_viewpoints_kwargs)\n                if stylizer is not None:\n                    rgbs, _ = stylizer.match_colors_for_image_set(rgbs, train_save_dir)\n                all_rgbs += rgbs.tolist()\n            save_all_rgbs = np.array(all_rgbs)\n            save_name = 'video.rgb.mp4'\n            if stylizer is not None:\n                save_name = f'video.rgb.style.{cfg.arf.style_id}.mp4'\n            imageio.mimwrite(os.path.join(train_save_dir, save_name), utils.to8b(save_all_rgbs), fps=15, quality=8)\n\n        if args.render_test:\n            model_class = FourierGridModel                 # only support FourierGridModel currently\n            testsavedir = os.path.join(cfg.basedir, cfg.expname, f'render_test_fine_last')\n            os.makedirs(testsavedir, exist_ok=True)\n            print('All results are dumped into', testsavedir)\n            if data_dict['i_test'][0] >= len(data_dict['images']):  # gt images are not provided\n                gt_imgs = None\n            else:\n                gt_imgs = [data_dict['images'][i].cpu().numpy() for i in data_dict['i_test']]\n            ckpt_paths = [os.path.join(cfg.basedir, cfg.expname, f'fine_last_{i}.tar') for i in range(args.block_num)]\n            all_rgbs = []\n            all_test_indexs = data_dict['i_test'].copy()\n            for idx, cp in enumerate(ckpt_paths):\n                args.running_block_id = idx\n                s, e = idx * args.num_per_block, (idx + 1) * 
args.num_per_block\n                # Here we assume the i_test's order follows the block order.\n                data_dict['i_test'] = all_test_indexs[s:e]\n                model = utils.load_model(model_class, cp).to(device)\n                render_viewpoints_kwargs['model'] = model\n                rgbs, depths, bgmaps = render_viewpoints(\n                cfg=cfg, render_poses=data_dict['poses'][data_dict['i_test']], HW=data_dict['HW'][data_dict['i_test']], \n                Ks=data_dict['Ks'][data_dict['i_test']], gt_imgs=gt_imgs,\n                savedir=testsavedir, dump_images=args.dump_images,\n                eval_ssim=args.eval_ssim, eval_lpips_alex=args.eval_lpips_alex, eval_lpips_vgg=args.eval_lpips_vgg,\n                **render_viewpoints_kwargs)\n                all_rgbs += rgbs.tolist()\n            save_all_rgbs = np.array(all_rgbs)\n            imageio.mimwrite(os.path.join(testsavedir, 'video.rgb.mp4'), utils.to8b(save_all_rgbs), fps=15, quality=8)\n        return\n    # rendering merged model or normal cases\n    if args.render_test or args.render_train or args.render_video:\n        if args.ft_path:\n            ckpt_path = args.ft_path\n        else:\n            ckpt_path = os.path.join(cfg.basedir, cfg.expname, 'fine_last.tar')\n        ckpt_name = ckpt_path.split('/')[-1][:-4]\n        FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\"]\n        if cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid':\n            model_class = FourierGridModel\n        elif cfg.data.ndc:\n            model_class = dmpigo.DirectMPIGO\n        elif cfg.data.unbounded_inward:\n            model_class = dcvgo.DirectContractedVoxGO\n        else:\n            model_class = dvgo.DirectVoxGO\n        if use_merged:\n            model = merged_model\n        else:\n            model = utils.load_model(model_class, ckpt_path).to(device)\n        stepsize = cfg.fine_model_and_render.stepsize\n        render_viewpoints_kwargs = {\n        
    'model': model,\n            'ndc': cfg.data.ndc,\n            'render_kwargs': {\n                'near': data_dict['near'],\n                'far': data_dict['far'],\n                'bg': 1 if cfg.data.white_bkgd else 0,\n                'stepsize': stepsize,\n                'inverse_y': cfg.data.inverse_y,\n                'flip_x': cfg.data.flip_x,\n                'flip_y': cfg.data.flip_y,\n                'render_depth': True,\n            },\n        }\n        if visualize_geometry and model_class == FourierGridModel:\n            geometry_path = os.path.join(cfg.basedir, cfg.expname, f'geometry.npz')\n            model.export_geometry_for_visualize(geometry_path)\n\n    # render trainset and eval\n    if args.render_train:\n        testsavedir = os.path.join(cfg.basedir, cfg.expname, f'render_train_{ckpt_name}')\n        os.makedirs(testsavedir, exist_ok=True)\n        print('All results are dumped into', testsavedir)\n        \n        rgbs, depths, bgmaps = render_viewpoints(cfg=cfg,\n                render_poses=data_dict['poses'][data_dict['i_train']],\n                HW=data_dict['HW'][data_dict['i_train']],\n                Ks=data_dict['Ks'][data_dict['i_train']],\n                gt_imgs=[data_dict['images'][i].cpu().numpy() for i in data_dict['i_train']],\n                savedir=testsavedir, dump_images=args.dump_images,\n                eval_ssim=args.eval_ssim, eval_lpips_alex=args.eval_lpips_alex, eval_lpips_vgg=args.eval_lpips_vgg,\n                **render_viewpoints_kwargs)\n        imageio.mimwrite(os.path.join(testsavedir, 'video.rgb.mp4'), utils.to8b(rgbs), fps=30, quality=8)\n        # imageio.mimwrite(os.path.join(testsavedir, 'video.rgb.mp4'), resize_and_to_8b(rgbs, res=(800, 608)), fps=30, quality=8)\n        # TODO: make the depth visualization work with resize\n        # imageio.mimwrite(os.path.join(testsavedir, 'video.depth.mp4'), resize_and_to_8b(1 - depths / np.max(depths), res=(800, 608)), fps=30, quality=8)\n\n    # 
render testset and eval\n    if args.render_test:\n        testsavedir = os.path.join(cfg.basedir, cfg.expname, f'render_test_{ckpt_name}')\n        os.makedirs(testsavedir, exist_ok=True)\n        print('All results are dumped into', testsavedir)\n        if data_dict['i_test'][0] >= len(data_dict['images']):  # gt images are not provided\n            gt_imgs = None\n        else:\n            gt_imgs = [data_dict['images'][i].cpu().numpy() for i in data_dict['i_test']]\n        rgbs, depths, bgmaps = render_viewpoints(\n                cfg=cfg, render_poses=data_dict['poses'][data_dict['i_test']],\n                HW=data_dict['HW'][data_dict['i_test']],\n                Ks=data_dict['Ks'][data_dict['i_test']],\n                gt_imgs=gt_imgs,\n                savedir=testsavedir, dump_images=args.dump_images,\n                eval_ssim=args.eval_ssim, eval_lpips_alex=args.eval_lpips_alex, eval_lpips_vgg=args.eval_lpips_vgg,\n                **render_viewpoints_kwargs)\n        imageio.mimwrite(os.path.join(testsavedir, 'video.rgb.mp4'), utils.to8b(rgbs), fps=30, quality=8)\n        imageio.mimwrite(os.path.join(testsavedir, 'video.depth.mp4'), utils.to8b(1 - depths / np.max(depths)), fps=30, quality=8)\n\n    # render video\n    if args.render_video:\n        testsavedir = os.path.join(cfg.basedir, cfg.expname, f'render_video_{ckpt_name}')\n        os.makedirs(testsavedir, exist_ok=True)\n        print('All results are dumped into', testsavedir)\n        rgbs, depths, bgmaps = render_viewpoints(\n                cfg=cfg,\n                render_poses=data_dict['render_poses'],\n                # use the hw and ks from the test splits\n                HW=data_dict['HW'][data_dict['i_test']][[0]].repeat(len(data_dict['render_poses']), 0),\n                Ks=data_dict['Ks'][data_dict['i_test']][[0]].repeat(len(data_dict['render_poses']), 0),\n                render_factor=args.render_video_factor,\n                render_video_flipy=args.render_video_flipy,\n     
           render_video_rot90=args.render_video_rot90,\n                savedir=testsavedir, dump_images=args.dump_images,\n                **render_viewpoints_kwargs)\n        if 'running_rot' in args:\n            vid_name = add_info[:5] + \"_\" + str(args.running_rot) + '.rgb.mp4'\n        else:\n            vid_name = 'video.rgb.mp4'\n        print(f\"Rendering video saved at: {os.path.join(testsavedir, vid_name)}.\")\n        imageio.mimwrite(os.path.join(testsavedir, vid_name), utils.to8b(rgbs), fps=30, quality=8)\n        depths_vis = depths * (1-bgmaps) + bgmaps\n        mask = bgmaps < 0.1\n        if not mask.max():\n            print(\"depth img cannot be rendered because of the threshold!\")\n        else:\n            dmin, dmax = np.percentile(depths_vis[bgmaps < 0.1], q=[5, 95])\n            depth_vis = plt.get_cmap('rainbow')(1 - np.clip((depths_vis - dmin) / (dmax - dmin), 0, 1)).squeeze()[..., :3]\n            imageio.mimwrite(os.path.join(testsavedir, 'video.depth.mp4'), utils.to8b(depth_vis), fps=30, quality=8)\n    return\n"
  },
  {
    "path": "FourierGrid/run_train.py",
    "content": "import time\nimport torch\nimport torch.nn.functional as F\nimport os,pdb\nimport copy\nimport numpy as np\nfrom tqdm import tqdm, trange\nfrom FourierGrid.bbox_compute import compute_bbox_by_cam_frustrm, compute_bbox_by_coarse_geo\nfrom FourierGrid import utils, dvgo, dcvgo, dmpigo\nfrom FourierGrid.FourierGrid_model import FourierGridModel\nfrom FourierGrid.load_everything import load_existing_model\nfrom FourierGrid.run_export_bbox import run_export_bbox_cams\nfrom FourierGrid.run_export_coarse import run_export_coarse\nfrom FourierGrid.FourierGrid_model import FourierMSELoss\nfrom torch_efficient_distloss import flatten_eff_distloss\n\n\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\ndef create_new_model(args, cfg, cfg_model, cfg_train, xyz_min, xyz_max, stage, coarse_ckpt_path, device):\n    model_kwargs = copy.deepcopy(cfg_model)\n    num_voxels_density = model_kwargs.pop('num_voxels_density')\n    num_voxels_rgb = model_kwargs.pop('num_voxels_rgb')\n    if len(cfg_train.pg_scale):\n        num_voxels_density = int(num_voxels_density / (2**len(cfg_train.pg_scale)))\n        num_voxels_rgb = int(num_voxels_rgb / (2**len(cfg_train.pg_scale)))\n    verbose = False\n    model_kwargs['sample_num'] = args.sample_num\n    # todo: change this line, only conditioned on model\n    if cfg.data.dataset_type == \"waymo\" or cfg.data.dataset_type == \"nerfpp\" or cfg.model == 'FourierGrid':\n        if verbose:\n            print(f'Waymo scene_rep_reconstruction ({stage}): \\033[96m Use FourierGrid model. 
\\033[0m')\n        model = FourierGridModel(\n            xyz_min=xyz_min, xyz_max=xyz_max,\n            num_voxels_density=num_voxels_density, num_voxels_rgb=num_voxels_rgb, verbose=verbose,\n            **model_kwargs)\n    elif cfg.data.ndc:\n        model = dmpigo.DirectMPIGO(\n            xyz_min=xyz_min, xyz_max=xyz_max,\n            # num_voxels=num_voxels_rgb,\n            **model_kwargs)\n    elif cfg.data.unbounded_inward:\n        model = dcvgo.DirectContractedVoxGO(\n            xyz_min=xyz_min, xyz_max=xyz_max,\n            num_voxels=num_voxels_rgb, num_voxels_base=model_kwargs['num_voxels_base_rgb'],\n            **model_kwargs)\n    else:\n        model = dvgo.DirectVoxGO(\n            xyz_min=xyz_min, xyz_max=xyz_max,\n            num_voxels=num_voxels_rgb, num_voxels_base=model_kwargs['num_voxels_base_rgb'],\n            mask_cache_path=coarse_ckpt_path,\n            **model_kwargs)\n    model = model.to(device)\n    verbose = args.block_num <= 1\n    optimizer = utils.create_optimizer_or_freeze_model(model, cfg_train, global_step=0, verbose=verbose)\n    return model, optimizer\n\n\n# init batch rays sampler\ndef gather_training_rays(data_dict, images, cfg, i_train, cfg_train, poses, HW, Ks, model, render_kwargs):\n    if data_dict['irregular_shape']:\n        rgb_tr_ori = [images[i].to('cpu' if cfg.data.load2gpu_on_the_fly else device) for i in i_train]\n    else:\n        rgb_tr_ori = images[i_train].to('cpu' if cfg.data.load2gpu_on_the_fly else device)\n    indexs_train = None\n    FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\"]\n    # if cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid':\n    #     rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, indexs_train, imsz = model.FourierGrid_get_training_rays(\n    #     rgb_tr_ori=rgb_tr_ori, train_poses=poses[i_train], HW=HW[i_train], Ks=Ks[i_train], \n    #     ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n    #     flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y, 
)\n    # el\n    # TODO: validate the above lines.\n    if cfg_train.ray_sampler == 'in_maskcache':\n        rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, imsz = dvgo.get_training_rays_in_maskcache_sampling(\n                rgb_tr_ori=rgb_tr_ori,\n                train_poses=poses[i_train],\n                HW=HW[i_train], Ks=Ks[i_train],\n                ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n                flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y,\n                model=model, render_kwargs=render_kwargs)\n    elif cfg_train.ray_sampler == 'flatten':\n        rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, imsz = dvgo.get_training_rays_flatten(\n            rgb_tr_ori=rgb_tr_ori,\n            train_poses=poses[i_train],\n            HW=HW[i_train], Ks=Ks[i_train], ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n            flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y)\n    else:\n        rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, imsz = dvgo.get_training_rays(\n            rgb_tr=rgb_tr_ori,\n            train_poses=poses[i_train],\n            HW=HW[i_train], Ks=Ks[i_train], ndc=cfg.data.ndc, inverse_y=cfg.data.inverse_y,\n            flip_x=cfg.data.flip_x, flip_y=cfg.data.flip_y)\n    index_generator = dvgo.batch_indices_generator(len(rgb_tr), cfg_train.N_rand)\n    batch_index_sampler = lambda: next(index_generator)\n    return rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, indexs_train, imsz, batch_index_sampler\n\n\ndef scene_rep_reconstruction(args, cfg, cfg_model, cfg_train, xyz_min, xyz_max, data_dict, stage, coarse_ckpt_path=None):\n    # init\n    if abs(cfg_model.world_bound_scale - 1) > 1e-9:\n        xyz_shift = (xyz_max - xyz_min) * (cfg_model.world_bound_scale - 1) / 2\n        xyz_min -= xyz_shift\n        xyz_max += xyz_shift\n    \n    # render_poses are removed because they are unused\n    HW, Ks, near, far, i_train, i_val, i_test, poses, images = [\n        data_dict[k] for k in [\n            'HW', 'Ks', 'near', 'far', 'i_train', 
'i_val', 'i_test', 'poses', 'images'\n        ]\n    ]\n\n    # find whether there is existing checkpoint path\n    last_ckpt_path = os.path.join(cfg.basedir, cfg.expname, f'{stage}_last.tar')\n    if args.no_reload:\n        reload_ckpt_path = None\n    elif args.ft_path:\n        reload_ckpt_path = args.ft_path\n    elif os.path.isfile(last_ckpt_path):\n        reload_ckpt_path = last_ckpt_path\n    else:\n        reload_ckpt_path = None\n\n    # init model and optimizer\n    FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\"]\n    if reload_ckpt_path is None:\n        print(f'scene_rep_reconstruction ({stage}): train from scratch')\n        model, optimizer = create_new_model(args, cfg, cfg_model, cfg_train, xyz_min, xyz_max, stage, coarse_ckpt_path, device)\n        start = 0\n        if cfg_model.maskout_near_cam_vox:\n            model.maskout_near_cam_vox(poses[i_train,:3,3], near)\n    elif cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid':\n        print(f'scene_rep_reconstruction ({stage}): reload FourierGrid model from {reload_ckpt_path}')\n        model, optimizer, start = args.ckpt_manager.load_existing_model(args, cfg, cfg_train, reload_ckpt_path, device=device)\n    else:\n        print(f'scene_rep_reconstruction ({stage}): reload from {reload_ckpt_path}')\n        model, optimizer, start = load_existing_model(args, cfg, cfg_train, reload_ckpt_path, device=device)\n    \n    # init loss\n    fourier_mse_loss = FourierMSELoss(num_freqs=7, logscale=True)\n    \n    # init rendering setup\n    render_kwargs = {\n        'near': data_dict['near'],\n        'far': data_dict['far'],\n        'bg': 1 if cfg.data.white_bkgd else 0,\n        'rand_bkgd': cfg.data.rand_bkgd,\n        'stepsize': cfg_model.stepsize,\n        'inverse_y': cfg.data.inverse_y,\n        'flip_x': cfg.data.flip_x,\n        'flip_y': cfg.data.flip_y,\n    }\n   \n    psnr_lst = []\n    time0 = time.time()\n    global_step = -1\n    psnr = 
torch.tensor(0)\n    training_steps = cfg_train.N_iters\n    FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\"]\n    if cfg.data.dataset_type != 'tankstemple' and (cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid'):\n        rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, indexs_tr, imsz, batch_index_sampler = \\\n            model.gather_training_rays(data_dict, images, cfg, i_train, cfg_train, poses, HW, Ks, render_kwargs)\n    else:\n        rgb_tr, rays_o_tr, rays_d_tr, viewdirs_tr, indexs_tr, imsz, batch_index_sampler = gather_training_rays(\n            data_dict, images, cfg, i_train, cfg_train, poses, HW, Ks, model, render_kwargs\n        )\n        \n    # view-count-based learning rate, FourierGrid does not support this operation\n    if cfg_train.pervoxel_lr:\n        def per_voxel_init():\n            cnt = model.voxel_count_views(\n                    rays_o_tr=rays_o_tr, rays_d_tr=rays_d_tr, imsz=imsz, near=near, far=far,\n                    stepsize=cfg_model.stepsize, downrate=cfg_train.pervoxel_lr_downrate,\n                    irregular_shape=data_dict['irregular_shape'])\n            optimizer.set_pervoxel_lr(cnt)\n            if cfg.model == 'FourierGrid':  #TODO: merge the following two lines\n                model.mask_cache.mask[cnt[0][0].squeeze() <= 2] = False\n            else:\n                model.mask_cache.mask[cnt.squeeze() <= 2] = False\n        per_voxel_init()\n\n    if cfg_train.maskout_lt_nviews > 0:\n        model.update_occupancy_cache_lt_nviews(\n                rays_o_tr, rays_d_tr, imsz, render_kwargs, cfg_train.maskout_lt_nviews)\n        \n    for global_step in trange(1 + start, 1 + training_steps):\n        # progress scaling checkpoint\n        if global_step in cfg_train.pg_scale:\n            n_rest_scales = len(cfg_train.pg_scale)-cfg_train.pg_scale.index(global_step)-1\n            cur_voxels_density = int(cfg_model.num_voxels_density / (2**n_rest_scales))\n            cur_voxels_rgb = 
int(cfg_model.num_voxels_rgb / (2**n_rest_scales))\n            if isinstance(model, FourierGridModel):\n                model.scale_volume_grid(cur_voxels_density, cur_voxels_rgb)\n            elif isinstance(model, (dvgo.DirectVoxGO, dcvgo.DirectContractedVoxGO)):\n                model.scale_volume_grid(cur_voxels_rgb)\n            elif isinstance(model, dmpigo.DirectMPIGO):\n                model.scale_volume_grid(cur_voxels_rgb, model.mpi_depth)\n            else:\n                raise NotImplementedError\n            optimizer = utils.create_optimizer_or_freeze_model(model, cfg_train, global_step=0)\n            model.act_shift -= cfg_train.decay_after_scale\n            torch.cuda.empty_cache()  # todo: validate the effects of this\n\n        # random sample rays\n        if cfg_train.ray_sampler in ['flatten', 'in_maskcache']:\n            sel_i = batch_index_sampler()\n            target = rgb_tr[sel_i]\n            rays_o = rays_o_tr[sel_i]\n            rays_d = rays_d_tr[sel_i]\n            viewdirs = viewdirs_tr[sel_i]\n            if indexs_tr is not None:\n                indexs = indexs_tr[sel_i]\n            else:\n                indexs = None\n        elif cfg_train.ray_sampler == 'random':  # fixed function\n            if len(rgb_tr.shape) != 2:\n                sel_b = torch.randint(rgb_tr.shape[0], [cfg_train.N_rand], device=rgb_tr.device)\n                sel_r = torch.randint(rgb_tr.shape[1], [cfg_train.N_rand], device=rgb_tr.device)\n                sel_c = torch.randint(rgb_tr.shape[2], [cfg_train.N_rand], device=rgb_tr.device)\n                target = rgb_tr[sel_b, sel_r, sel_c]\n                rays_o = rays_o_tr[sel_b, sel_r, sel_c]\n                rays_d = rays_d_tr[sel_b, sel_r, sel_c]\n                viewdirs = viewdirs_tr[sel_b, sel_r, sel_c]\n                if indexs_tr is not None:\n                    indexs = indexs_tr[sel_b, sel_r, sel_c]\n                else:\n                    indexs = None\n            else:\n        
        assert len(rgb_tr.shape) == 2, \"rgb_tr's shape is not correct.\"\n                sel_b = torch.randint(rgb_tr.shape[0], [cfg_train.N_rand], device=rgb_tr.device)\n                sel_r = torch.randint(rgb_tr.shape[1], [cfg_train.N_rand], device=rgb_tr.device)\n                target = rgb_tr[sel_b]\n                rays_o = rays_o_tr[sel_b]\n                rays_d = rays_d_tr[sel_b]\n                viewdirs = viewdirs_tr[sel_b]\n                if indexs_tr is not None:\n                    indexs = indexs_tr[sel_b]\n                else:\n                    indexs = None\n        else:\n            raise NotImplementedError\n\n        if cfg.data.load2gpu_on_the_fly:\n            target = target.to(device)\n            rays_o = rays_o.to(device)\n            rays_d = rays_d.to(device)\n            viewdirs = viewdirs.to(device)\n            if indexs is not None:\n                indexs = indexs.to(device)\n        render_kwargs['indexs'] = indexs  # to avoid change the model interface\n        # forward model here and get rendered results\n        render_result = model(rays_o, rays_d, viewdirs, global_step=global_step, is_train=True,\n            **render_kwargs)\n        optimizer.zero_grad(set_to_none=True)\n        mse_loss = F.mse_loss(render_result['rgb_marched'], target)\n        freq_loss = fourier_mse_loss(render_result['rgb_marched'], target)\n        psnr = utils.mse2psnr(mse_loss.detach())\n        loss = cfg_train.weight_main * mse_loss + cfg_train.weight_freq * freq_loss\n        if cfg_train.weight_entropy_last > 0:\n            pout = render_result['alphainv_last'].clamp(1e-6, 1-1e-6)\n            entropy_last_loss = -(pout*torch.log(pout) + (1-pout)*torch.log(1-pout)).mean()\n            loss += cfg_train.weight_entropy_last * entropy_last_loss\n        if cfg_train.weight_nearclip > 0:\n            near_thres = data_dict['near_clip'] / model.scene_radius[0].item()\n            near_mask = (render_result['t'] < near_thres)\n            
density = render_result['raw_density'][near_mask]\n            if len(density):\n                nearclip_loss = (density - density.detach()).sum()\n                loss += cfg_train.weight_nearclip * nearclip_loss\n        if cfg_train.weight_distortion > 0:\n            n_max = render_result['n_max']\n            s = render_result['s']\n            w = render_result['weights']\n            ray_id = render_result['ray_id']\n            loss_distortion = flatten_eff_distloss(w, s, 1/n_max, ray_id)\n            loss += cfg_train.weight_distortion * loss_distortion\n        if cfg_train.weight_rgbper > 0:\n            rgbper = (render_result['raw_rgb'] - target[render_result['ray_id']]).pow(2).sum(-1)\n            rgbper_loss = (rgbper * render_result['weights'].detach()).sum() / len(rays_o)\n            loss += cfg_train.weight_rgbper * rgbper_loss\n        loss.backward()\n        if global_step<cfg_train.tv_before and global_step>cfg_train.tv_after and global_step%cfg_train.tv_every==0:\n            if cfg_train.weight_tv_density>0:\n                model.density_total_variation_add_grad(\n                    cfg_train.weight_tv_density/len(rays_o), global_step<cfg_train.tv_dense_before)\n            if cfg_train.weight_tv_k0>0:\n                model.k0_total_variation_add_grad(\n                    cfg_train.weight_tv_k0/len(rays_o), global_step<cfg_train.tv_dense_before)\n        optimizer.step()\n        psnr_lst.append(psnr.item())\n\n        # update lr, continuously decaying, optional\n        if cfg.model != 'FourierGrid' or cfg.data.dataset_type != 'tankstemple':\n            decay_steps = cfg_train.lrate_decay * 1000\n            decay_factor = 0.1 ** (1/decay_steps)\n            for i_opt_g, param_group in enumerate(optimizer.param_groups):\n                param_group['lr'] = param_group['lr'] * decay_factor\n\n        # check log & save\n        if global_step%args.i_print==0:\n            eps_time = time.time() - time0\n            eps_time_str = 
f'{eps_time//3600:02.0f}:{eps_time//60%60:02.0f}:{eps_time%60:02.0f}'\n            tqdm.write(f'training iter {global_step:6d} / '\n                       f'Loss: {loss.item():.9f} / PSNR: {np.mean(psnr_lst):5.2f} / '\n                       f'Eps: {eps_time_str}')\n            psnr_lst = []\n        \n        if global_step==1+start:  # test saving function at start\n            path = os.path.join(cfg.basedir, cfg.expname, f'{stage}_{global_step:06d}.tar')\n            FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\"]\n            if cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid':\n                args.ckpt_manager.save_model(global_step, model, optimizer, path)\n            else:\n                torch.save({\n                    'global_step': global_step,\n                    'model_kwargs': model.get_kwargs(),\n                    'model_state_dict': model.state_dict(),\n                    'optimizer_state_dict': optimizer.state_dict(),\n                }, path)\n                print(f'scene_rep_reconstruction ({stage}): saved checkpoints at', path)\n\n    # final save\n    if global_step != -1:\n        if cfg.data.dataset_type == \"waymo\" or cfg.model == 'FourierGrid':\n            args.ckpt_manager.save_model(global_step, model, optimizer, last_ckpt_path)\n        else:               \n            torch.save({\n                'global_step': global_step,\n                'model_kwargs': model.get_kwargs(),\n                'model_state_dict': model.state_dict(),\n                'optimizer_state_dict': optimizer.state_dict(),\n            }, last_ckpt_path)\n            print(f'scene_rep_reconstruction ({stage}): saved checkpoints at', last_ckpt_path)\n    return psnr.item()\n\n\ndef run_train(args, cfg, data_dict, export_cam=False, export_geometry=False):\n    # init\n    running_block_id = args.running_block_id\n    if running_block_id >= 0:\n        print(f\"Training block id: {running_block_id}.\")\n    else:\n    
    print('Training: start.')\n    eps_time = time.time()\n    os.makedirs(os.path.join(cfg.basedir, cfg.expname), exist_ok=True)\n    with open(os.path.join(cfg.basedir, cfg.expname, 'args.txt'), 'w') as file:\n        for arg in sorted(vars(args)):\n            attr = getattr(args, arg)\n            file.write('{} = {}\\n'.format(arg, attr))\n    cfg.dump(os.path.join(cfg.basedir, cfg.expname, 'config.py'))\n    \n    # coarse geometry searching (originally only for inward bounded scenes, extended to support waymo)\n    eps_coarse = time.time()\n    xyz_min_coarse, xyz_max_coarse = compute_bbox_by_cam_frustrm(args=args, cfg=cfg, **data_dict)\n    if cfg.coarse_train.N_iters > 0:\n        scene_rep_reconstruction(\n                args=args, cfg=cfg,\n                cfg_model=cfg.coarse_model_and_render, cfg_train=cfg.coarse_train,\n                xyz_min=xyz_min_coarse, xyz_max=xyz_max_coarse,\n                data_dict=data_dict, stage='coarse')\n        eps_coarse = time.time() - eps_coarse\n        eps_time_str = f'{eps_coarse//3600:02.0f}:{eps_coarse//60%60:02.0f}:{eps_coarse%60:02.0f}'\n        print('train: coarse geometry searching in', eps_time_str)\n        coarse_ckpt_path = os.path.join(cfg.basedir, cfg.expname, f'coarse_last.tar')\n    else:\n        print('train: skip coarse geometry searching')\n        coarse_ckpt_path = None\n\n    # export cameras and geometries for debugging\n    if export_cam:\n        run_export_bbox_cams(args=args, cfg=cfg, data_dict=data_dict, save_path=os.path.join(cfg.basedir, cfg.expname, 'cam.npz'))\n    if export_geometry and cfg.coarse_train.N_iters > 0:\n        run_export_coarse(args=args, cfg=cfg, device=device, save_path=os.path.join(cfg.basedir, cfg.expname, 'cam_coarse.npz'))\n\n    # fine detail reconstruction\n    eps_fine = time.time()\n    if cfg.coarse_train.N_iters == 0 or cfg.data.dataset_type == \"waymo\":\n        xyz_min_fine, xyz_max_fine = xyz_min_coarse.clone(), xyz_max_coarse.clone()\n    else:\n  
      xyz_min_fine, xyz_max_fine = compute_bbox_by_coarse_geo(\n                model_class=dvgo.DirectVoxGO, model_path=coarse_ckpt_path,\n                thres=cfg.fine_model_and_render.bbox_thres, device=device, args=args, cfg=cfg)\n    psnr = scene_rep_reconstruction(\n            args=args, cfg=cfg,\n            cfg_model=cfg.fine_model_and_render, cfg_train=cfg.fine_train,\n            xyz_min=xyz_min_fine, xyz_max=xyz_max_fine,\n            data_dict=data_dict, stage='fine',\n            coarse_ckpt_path=coarse_ckpt_path)\n    eps_fine = time.time() - eps_fine\n    eps_time_str = f'{eps_fine//3600:02.0f}:{eps_fine//60%60:02.0f}:{eps_fine%60:02.0f}'\n    if running_block_id >= 0:\n        print('train: fine detail reconstruction in', eps_time_str)\n\n    eps_time = time.time() - eps_time\n    eps_time_str = f'{eps_time//3600:02.0f}:{eps_time//60%60:02.0f}:{eps_time%60:02.0f}'\n    if running_block_id >= 0:\n        print('train: finish (eps time', eps_time_str, ')')\n\n    return psnr\n"
  },
  {
    "path": "FourierGrid/tools/__init__.py",
    "content": ""
  },
  {
    "path": "FourierGrid/tools/colmap_utils/__init__.py",
    "content": ""
  },
  {
    "path": "FourierGrid/tools/colmap_utils/colmap_read_model.py",
    "content": "# Source: https://github.com/Fyusion/LLFF\n# Copyright (c) 2018, ETH Zurich and UNC Chapel Hill.\n# All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions are met:\n#\n#     * Redistributions of source code must retain the above copyright\n#       notice, this list of conditions and the following disclaimer.\n#\n#     * Redistributions in binary form must reproduce the above copyright\n#       notice, this list of conditions and the following disclaimer in the\n#       documentation and/or other materials provided with the distribution.\n#\n#     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of\n#       its contributors may be used to endorse or promote products derived\n#       from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE\n# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\n# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n# POSSIBILITY OF SUCH DAMAGE.\n#\n# Author: Johannes L. 
Schoenberger (jsch at inf.ethz.ch)\n\nimport os\nimport sys\nimport collections\nimport numpy as np\nimport struct\n\n\nCameraModel = collections.namedtuple(\n    \"CameraModel\", [\"model_id\", \"model_name\", \"num_params\"])\nCamera = collections.namedtuple(\n    \"Camera\", [\"id\", \"model\", \"width\", \"height\", \"params\"])\nBaseImage = collections.namedtuple(\n    \"Image\", [\"id\", \"qvec\", \"tvec\", \"camera_id\", \"name\", \"xys\", \"point3D_ids\"])\nPoint3D = collections.namedtuple(\n    \"Point3D\", [\"id\", \"xyz\", \"rgb\", \"error\", \"image_ids\", \"point2D_idxs\"])\n\nclass Image(BaseImage):\n    def qvec2rotmat(self):\n        return qvec2rotmat(self.qvec)\n\n\nCAMERA_MODELS = {\n    CameraModel(model_id=0, model_name=\"SIMPLE_PINHOLE\", num_params=3),\n    CameraModel(model_id=1, model_name=\"PINHOLE\", num_params=4),\n    CameraModel(model_id=2, model_name=\"SIMPLE_RADIAL\", num_params=4),\n    CameraModel(model_id=3, model_name=\"RADIAL\", num_params=5),\n    CameraModel(model_id=4, model_name=\"OPENCV\", num_params=8),\n    CameraModel(model_id=5, model_name=\"OPENCV_FISHEYE\", num_params=8),\n    CameraModel(model_id=6, model_name=\"FULL_OPENCV\", num_params=12),\n    CameraModel(model_id=7, model_name=\"FOV\", num_params=5),\n    CameraModel(model_id=8, model_name=\"SIMPLE_RADIAL_FISHEYE\", num_params=4),\n    CameraModel(model_id=9, model_name=\"RADIAL_FISHEYE\", num_params=5),\n    CameraModel(model_id=10, model_name=\"THIN_PRISM_FISHEYE\", num_params=12)\n}\nCAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) \\\n                         for camera_model in CAMERA_MODELS])\n\n\ndef read_next_bytes(fid, num_bytes, format_char_sequence, endian_character=\"<\"):\n    \"\"\"Read and unpack the next bytes from a binary file.\n    :param fid:\n    :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 
2, 6, 16, 30, etc.\n    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.\n    :param endian_character: Any of {@, =, <, >, !}\n    :return: Tuple of read and unpacked values.\n    \"\"\"\n    data = fid.read(num_bytes)\n    return struct.unpack(endian_character + format_char_sequence, data)\n\n\ndef read_cameras_text(path):\n    \"\"\"\n    see: src/base/reconstruction.cc\n        void Reconstruction::WriteCamerasText(const std::string& path)\n        void Reconstruction::ReadCamerasText(const std::string& path)\n    \"\"\"\n    cameras = {}\n    with open(path, \"r\") as fid:\n        while True:\n            line = fid.readline()\n            if not line:\n                break\n            line = line.strip()\n            if len(line) > 0 and line[0] != \"#\":\n                elems = line.split()\n                camera_id = int(elems[0])\n                model = elems[1]\n                width = int(elems[2])\n                height = int(elems[3])\n                params = np.array(tuple(map(float, elems[4:])))\n                cameras[camera_id] = Camera(id=camera_id, model=model,\n                                            width=width, height=height,\n                                            params=params)\n    return cameras\n\n\ndef read_cameras_binary(path_to_model_file):\n    \"\"\"\n    see: src/base/reconstruction.cc\n        void Reconstruction::WriteCamerasBinary(const std::string& path)\n        void Reconstruction::ReadCamerasBinary(const std::string& path)\n    \"\"\"\n    cameras = {}\n    with open(path_to_model_file, \"rb\") as fid:\n        num_cameras = read_next_bytes(fid, 8, \"Q\")[0]\n        for camera_line_index in range(num_cameras):\n            camera_properties = read_next_bytes(\n                fid, num_bytes=24, format_char_sequence=\"iiQQ\")\n            camera_id = camera_properties[0]\n            model_id = camera_properties[1]\n            model_name = 
CAMERA_MODEL_IDS[camera_properties[1]].model_name\n            width = camera_properties[2]\n            height = camera_properties[3]\n            num_params = CAMERA_MODEL_IDS[model_id].num_params\n            params = read_next_bytes(fid, num_bytes=8*num_params,\n                                     format_char_sequence=\"d\"*num_params)\n            cameras[camera_id] = Camera(id=camera_id,\n                                        model=model_name,\n                                        width=width,\n                                        height=height,\n                                        params=np.array(params))\n        assert len(cameras) == num_cameras\n    return cameras\n\n\ndef read_images_text(path):\n    \"\"\"\n    see: src/base/reconstruction.cc\n        void Reconstruction::ReadImagesText(const std::string& path)\n        void Reconstruction::WriteImagesText(const std::string& path)\n    \"\"\"\n    images = {}\n    with open(path, \"r\") as fid:\n        while True:\n            line = fid.readline()\n            if not line:\n                break\n            line = line.strip()\n            if len(line) > 0 and line[0] != \"#\":\n                elems = line.split()\n                image_id = int(elems[0])\n                qvec = np.array(tuple(map(float, elems[1:5])))\n                tvec = np.array(tuple(map(float, elems[5:8])))\n                camera_id = int(elems[8])\n                image_name = elems[9]\n                elems = fid.readline().split()\n                xys = np.column_stack([tuple(map(float, elems[0::3])),\n                                       tuple(map(float, elems[1::3]))])\n                point3D_ids = np.array(tuple(map(int, elems[2::3])))\n                images[image_id] = Image(\n                    id=image_id, qvec=qvec, tvec=tvec,\n                    camera_id=camera_id, name=image_name,\n                    xys=xys, point3D_ids=point3D_ids)\n    return images\n\n\ndef 
read_images_binary(path_to_model_file):\n    \"\"\"\n    see: src/base/reconstruction.cc\n        void Reconstruction::ReadImagesBinary(const std::string& path)\n        void Reconstruction::WriteImagesBinary(const std::string& path)\n    \"\"\"\n    images = {}\n    with open(path_to_model_file, \"rb\") as fid:\n        num_reg_images = read_next_bytes(fid, 8, \"Q\")[0]\n        for image_index in range(num_reg_images):\n            binary_image_properties = read_next_bytes(\n                fid, num_bytes=64, format_char_sequence=\"idddddddi\")\n            image_id = binary_image_properties[0]\n            qvec = np.array(binary_image_properties[1:5])\n            tvec = np.array(binary_image_properties[5:8])\n            camera_id = binary_image_properties[8]\n            image_name = \"\"\n            current_char = read_next_bytes(fid, 1, \"c\")[0]\n            while current_char != b\"\\x00\":   # look for the ASCII 0 entry\n                image_name += current_char.decode(\"utf-8\")\n                current_char = read_next_bytes(fid, 1, \"c\")[0]\n            num_points2D = read_next_bytes(fid, num_bytes=8,\n                                           format_char_sequence=\"Q\")[0]\n            x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D,\n                                       format_char_sequence=\"ddq\"*num_points2D)\n            xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])),\n                                   tuple(map(float, x_y_id_s[1::3]))])\n            point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))\n            images[image_id] = Image(\n                id=image_id, qvec=qvec, tvec=tvec,\n                camera_id=camera_id, name=image_name,\n                xys=xys, point3D_ids=point3D_ids)\n    return images\n\n\ndef read_points3D_text(path):\n    \"\"\"\n    see: src/base/reconstruction.cc\n        void Reconstruction::ReadPoints3DText(const std::string& path)\n        void 
Reconstruction::WritePoints3DText(const std::string& path)\n    \"\"\"\n    points3D = {}\n    with open(path, \"r\") as fid:\n        while True:\n            line = fid.readline()\n            if not line:\n                break\n            line = line.strip()\n            if len(line) > 0 and line[0] != \"#\":\n                elems = line.split()\n                point3D_id = int(elems[0])\n                xyz = np.array(tuple(map(float, elems[1:4])))\n                rgb = np.array(tuple(map(int, elems[4:7])))\n                error = float(elems[7])\n                image_ids = np.array(tuple(map(int, elems[8::2])))\n                point2D_idxs = np.array(tuple(map(int, elems[9::2])))\n                points3D[point3D_id] = Point3D(id=point3D_id, xyz=xyz, rgb=rgb,\n                                               error=error, image_ids=image_ids,\n                                               point2D_idxs=point2D_idxs)\n    return points3D\n\n\ndef read_points3d_binary(path_to_model_file):\n    \"\"\"\n    see: src/base/reconstruction.cc\n        void Reconstruction::ReadPoints3DBinary(const std::string& path)\n        void Reconstruction::WritePoints3DBinary(const std::string& path)\n    \"\"\"\n    points3D = {}\n    with open(path_to_model_file, \"rb\") as fid:\n        num_points = read_next_bytes(fid, 8, \"Q\")[0]\n        for point_line_index in range(num_points):\n            binary_point_line_properties = read_next_bytes(\n                fid, num_bytes=43, format_char_sequence=\"QdddBBBd\")\n            point3D_id = binary_point_line_properties[0]\n            xyz = np.array(binary_point_line_properties[1:4])\n            rgb = np.array(binary_point_line_properties[4:7])\n            error = np.array(binary_point_line_properties[7])\n            track_length = read_next_bytes(\n                fid, num_bytes=8, format_char_sequence=\"Q\")[0]\n            track_elems = read_next_bytes(\n                fid, num_bytes=8*track_length,\n                
format_char_sequence=\"ii\"*track_length)\n            image_ids = np.array(tuple(map(int, track_elems[0::2])))\n            point2D_idxs = np.array(tuple(map(int, track_elems[1::2])))\n            points3D[point3D_id] = Point3D(\n                id=point3D_id, xyz=xyz, rgb=rgb,\n                error=error, image_ids=image_ids,\n                point2D_idxs=point2D_idxs)\n    return points3D\n\n\ndef read_model(path, ext):\n    if ext == \".txt\":\n        cameras = read_cameras_text(os.path.join(path, \"cameras\" + ext))\n        images = read_images_text(os.path.join(path, \"images\" + ext))\n        points3D = read_points3D_text(os.path.join(path, \"points3D\") + ext)\n    else:\n        cameras = read_cameras_binary(os.path.join(path, \"cameras\" + ext))\n        images = read_images_binary(os.path.join(path, \"images\" + ext))\n        points3D = read_points3d_binary(os.path.join(path, \"points3D\") + ext)\n    return cameras, images, points3D\n\n\ndef qvec2rotmat(qvec):\n    return np.array([\n        [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2,\n         2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],\n         2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]],\n        [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],\n         1 - 2 * qvec[1]**2 - 2 * qvec[3]**2,\n         2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]],\n        [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],\n         2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],\n         1 - 2 * qvec[1]**2 - 2 * qvec[2]**2]])\n\n\ndef rotmat2qvec(R):\n    Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat\n    K = np.array([\n        [Rxx - Ryy - Rzz, 0, 0, 0],\n        [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],\n        [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],\n        [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0\n    eigvals, eigvecs = np.linalg.eigh(K)\n    qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]\n    if qvec[0] < 0:\n        qvec *= -1\n    return qvec\n\n\ndef main():\n    if 
len(sys.argv) != 3:\n        print(\"Usage: python read_model.py path/to/model/folder [.txt,.bin]\")\n        return\n\n    cameras, images, points3D = read_model(path=sys.argv[1], ext=sys.argv[2])\n\n    print(\"num_cameras:\", len(cameras))\n    print(\"num_images:\", len(images))\n    print(\"num_points3D:\", len(points3D))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "FourierGrid/tools/colmap_utils/colmap_wrapper.py",
    "content": "# Source: https://github.com/Fyusion/LLFF\nimport os\nimport subprocess\nimport pdb\n\n\n# $ DATASET_PATH=/path/to/dataset\n\n# $ colmap feature_extractor \\\n#    --database_path $DATASET_PATH/database.db \\\n#    --image_path $DATASET_PATH/images\n\n# $ colmap exhaustive_matcher \\\n#    --database_path $DATASET_PATH/database.db\n\n# $ mkdir $DATASET_PATH/sparse\n\n# $ colmap mapper \\\n#     --database_path $DATASET_PATH/database.db \\\n#     --image_path $DATASET_PATH/images \\\n#     --output_path $DATASET_PATH/sparse\n\n# $ mkdir $DATASET_PATH/dense\ndef run_colmap(basedir, match_type):\n    logfile_name = os.path.join(basedir, 'colmap_output.txt')\n    logfile = open(logfile_name, 'w')\n    \n    feature_extractor_args = [\n        'colmap', 'feature_extractor', \n            '--database_path', os.path.join(basedir, 'database.db'), \n            '--image_path', os.path.join(basedir, 'source'),\n            '--ImageReader.single_camera', '1',\n            '--SiftExtraction.use_gpu', 'false'\n    ]\n    joined_args = ' '.join(feature_extractor_args)\n    os.system(joined_args)\n    # feat_output = ( subprocess.check_output(feature_extractor_args, universal_newlines=True) )\n    # logfile.write(feat_output)\n    print('Features extracted, the next step would cost several hours, don\\'t worry and get a coffee.')\n\n    exhaustive_matcher_args = [\n        'colmap', match_type, \n            '--database_path', os.path.join(basedir, 'database.db'), \n            '--SiftMatching.use_gpu', 'false' # comment this line to use gpus, but a desktop is required\n    ]\n    joined_args = ' '.join(exhaustive_matcher_args)\n    print(\"Executing: \", joined_args)\n    os.system(joined_args)\n    # match_output = ( subprocess.check_output(exhaustive_matcher_args, universal_newlines=True) )\n    # logfile.write(match_output)\n    print('Features matched')\n    p = os.path.join(basedir, 'sparse')\n    if not os.path.exists(p):\n        os.makedirs(p)\n\n    
mapper_args = [\n        'colmap', 'mapper',\n            '--database_path', os.path.join(basedir, 'database.db'),\n            '--image_path', os.path.join(basedir, 'source'),\n            '--export_path', os.path.join(basedir, 'sparse'), # --export_path changed to --output_path in colmap 3.6\n            '--Mapper.num_threads', '16',\n            '--Mapper.init_min_tri_angle', '4',\n            '--Mapper.multiple_models', '0',\n            '--Mapper.extract_colors', '0',\n    ]\n    joined_args = ' '.join(mapper_args)\n    os.system(joined_args)\n    # map_output = ( subprocess.check_output(mapper_args, universal_newlines=True) )\n    # logfile.write(map_output)\n    print('Sparse map created')\n    undistorter = [\n        'colmap', 'image_undistorter',\n        '--image_path', os.path.join(basedir, 'source'),\n        '--input_path', os.path.join(basedir, 'sparse', '0'),\n        '--output_path', os.path.join(basedir, 'dense'),\n        '--output_type', 'COLMAP',\n    ]\n    joined_args = ' '.join(undistorter)\n    os.system(joined_args)\n    # undistort_output = subprocess.check_output(undistorter, universal_newlines=True)\n    # logfile.write(undistort_output)\n    print('Undistort images')\n    print( 'Finished running COLMAP! Congrats!')\n\n    # logfile.close()\n    # print( 'Finished running COLMAP, see {} for logs'.format(logfile_name) )\n\n\n"
  },
  {
    "path": "FourierGrid/tools/colmap_utils/pose_utils.py",
    "content": "# Source: https://github.com/Fyusion/LLFF\nimport numpy as np\nimport os\nimport sys\nimport pdb\nimport json\nimport imageio\nimport skimage.transform\n\nfrom .colmap_wrapper import run_colmap\nfrom . import colmap_read_model as read_model\n\n\ndef load_colmap_data_nerfstudio(basedir):\n    colmap_base_dir = os.path.join(basedir, 'colmap')\n    camerasfile = os.path.join(colmap_base_dir, 'sparse/0/cameras.bin')\n    camdata = read_model.read_cameras_binary(camerasfile)\n    list_of_keys = list(camdata.keys())\n    cam = camdata[list_of_keys[0]]\n    print( 'Cameras', len(cam))\n\n    h, w, f = cam.height, cam.width, cam.params[0]\n    hwf = np.array([h, w, f]).reshape([3, 1])\n    imagesfile = os.path.join(colmap_base_dir, 'sparse/0/images.bin')\n    imdata = read_model.read_images_binary(imagesfile)\n    \n    w2c_mats = []\n    bottom = np.array([0,0,0,1.]).reshape([1,4])\n    \n    names = [imdata[k].name for k in imdata]\n    print( 'Images #', len(names))\n    sorted_names = np.argsort(names)\n    for k in imdata:\n        im = imdata[k]\n        R = im.qvec2rotmat()\n        t = im.tvec.reshape([3,1])\n        m = np.concatenate([np.concatenate([R, t], 1), bottom], 0)\n        w2c_mats.append(m)\n    \n    w2c_mats = np.stack(w2c_mats, 0)\n    c2w_mats = np.linalg.inv(w2c_mats)\n    poses = c2w_mats[:, :3, :4].transpose([1, 2, 0])\n    poses = np.concatenate([poses, np.tile(hwf[..., np.newaxis], [1, 1, poses.shape[-1]])], 1)\n    \n    # load render poses\n    render_cam_paths = json.load(open(os.path.join(basedir, 'camera_path.json')))\n    h, w = render_cam_paths['render_height'], render_cam_paths['render_width']\n    render_poses = np.array([p['camera_to_world'] for p in render_cam_paths['camera_path']]).reshape(-1, 4, 4)\n    hwf = np.array([h, w, f]).reshape([3, 1])\n    render_poses = render_poses[:, :3, :4].transpose([1, 2, 0])\n    render_poses = np.concatenate([render_poses, np.tile(hwf[..., np.newaxis], [1, 1, 
render_poses.shape[-1]])], 1)\n    \n    train_num = len(c2w_mats)\n    # poses = np.concatenate([poses, render_poses], 2)\n    points3dfile = os.path.join(colmap_base_dir, 'sparse/0/points3D.bin')\n    pts3d = read_model.read_points3d_binary(points3dfile)\n    \n    # todo: validate this effect, enabled by default, commented out by zelin\n    # must switch to [-u, r, -t] from [r, -u, t], NOT [r, u, -t]\n    poses = np.concatenate([poses[:, 1:2, :], poses[:, 0:1, :], -poses[:, 2:3, :], poses[:, 3:4, :], poses[:, 4:5, :]], 1)\n    return poses, train_num, pts3d, sorted_names, names\n\n\ndef load_colmap_data(realdir):\n    camerasfile = os.path.join(realdir, 'dense/sparse/cameras.bin')\n    # camerasfile = os.path.join(realdir, 'sparse/0/cameras.bin')\n    camdata = read_model.read_cameras_binary(camerasfile)\n    \n    # cam = camdata[camdata.keys()[0]]\n    list_of_keys = list(camdata.keys())\n    cam = camdata[list_of_keys[0]]\n    print( 'Cameras', len(cam))\n\n    h, w, f = cam.height, cam.width, cam.params[0]\n    # w, h, f = factor * w, factor * h, factor * f\n    hwf = np.array([h,w,f]).reshape([3,1])\n    \n    imagesfile = os.path.join(realdir, 'dense/sparse/images.bin')\n    # imagesfile = os.path.join(realdir, 'sparse/0/images.bin')\n    imdata = read_model.read_images_binary(imagesfile)\n    \n    w2c_mats = []\n    bottom = np.array([0,0,0,1.]).reshape([1,4])\n    \n    names = [imdata[k].name for k in imdata]\n    print( 'Images #', len(names))\n    perm = np.argsort(names)\n    for k in imdata:\n        im = imdata[k]\n        R = im.qvec2rotmat()\n        t = im.tvec.reshape([3,1])\n        m = np.concatenate([np.concatenate([R, t], 1), bottom], 0)\n        w2c_mats.append(m)\n    \n    w2c_mats = np.stack(w2c_mats, 0)\n    c2w_mats = np.linalg.inv(w2c_mats)\n    \n    poses = c2w_mats[:, :3, :4].transpose([1,2,0])\n    poses = np.concatenate([poses, np.tile(hwf[..., np.newaxis], [1,1,poses.shape[-1]])], 1)\n    \n    # points3dfile = 
os.path.join(realdir, 'dense/sparse/points3D.bin')\n    points3dfile = os.path.join(realdir, 'sparse/0/points3D.bin')\n    pts3d = read_model.read_points3d_binary(points3dfile)\n    \n    # must switch to [-u, r, -t] from [r, -u, t], NOT [r, u, -t]\n    poses = np.concatenate([poses[:, 1:2, :], poses[:, 0:1, :], -poses[:, 2:3, :], poses[:, 3:4, :], poses[:, 4:5, :]], 1)\n    return poses, pts3d, perm, names\n\n\ndef save_poses(basedir, poses, pts3d, perm, names):\n    pts_arr = []\n    vis_arr = []\n    for k in pts3d:\n        pts_arr.append(pts3d[k].xyz)\n        cams = [0] * poses.shape[-1]\n        for ind in pts3d[k].image_ids:\n            if len(cams) <= ind - 1:\n                print('ERROR: the correct camera poses for current points cannot be accessed')\n                return\n            cams[ind-1] = 1\n        vis_arr.append(cams)\n\n    pts_arr = np.array(pts_arr)\n    vis_arr = np.array(vis_arr)\n    print( 'Points', pts_arr.shape, 'Visibility', vis_arr.shape )\n    if len(pts_arr) < 1:\n        raise RuntimeError(\"Points has zero shape!\")\n    zvals = np.sum(-(pts_arr[:, np.newaxis, :].transpose([2,0,1]) - poses[:3, 3:4, :]) * poses[:3, 2:3, :], 0)\n    valid_z = zvals[vis_arr==1]\n    print( 'Depth stats', valid_z.min(), valid_z.max(), valid_z.mean() )\n    \n    save_arr = []\n    for i in perm:\n        vis = vis_arr[:, i]\n        zs = zvals[:, i]\n        zs = zs[vis==1]\n        close_depth, inf_depth = np.percentile(zs, .1), np.percentile(zs, 99.9)\n        # print( i, close_depth, inf_depth )\n        save_arr.append(np.concatenate([poses[..., i].ravel(), np.array([close_depth, inf_depth])], 0))\n    save_arr = np.array(save_arr)\n    \n    np.save(os.path.join(basedir, 'poses_bounds.npy'), save_arr)\n    np.save(os.path.join(basedir, 'poses_names.npy'), sorted(names))\n\n\ndef minify(basedir, factors=[], resolutions=[]):\n    needtoload = False\n    for r in factors:\n        imgdir = os.path.join(basedir, 'images_{}'.format(r))\n       
 if not os.path.exists(imgdir):\n            needtoload = True\n    for r in resolutions:\n        imgdir = os.path.join(basedir, 'images_{}x{}'.format(r[1], r[0]))\n        if not os.path.exists(imgdir):\n            needtoload = True\n    if not needtoload:\n        return\n\n    from shutil import copy\n    from subprocess import check_output\n\n    imgdir = os.path.join(basedir, 'images')\n    imgs = [os.path.join(imgdir, f) for f in sorted(os.listdir(imgdir))]\n    imgs = [f for f in imgs if any([f.endswith(ex) for ex in ['JPG', 'jpg', 'png', 'jpeg', 'PNG']])]\n    imgdir_orig = imgdir\n\n    wd = os.getcwd()\n\n    for r in factors + resolutions:\n        if isinstance(r, int):\n            name = 'images_{}'.format(r)\n            resizearg = '{}%'.format(int(100./r))\n        else:\n            name = 'images_{}x{}'.format(r[1], r[0])\n            resizearg = '{}x{}'.format(r[1], r[0])\n        imgdir = os.path.join(basedir, name)\n        if os.path.exists(imgdir):\n            continue\n\n        print('Minifying', r, basedir)\n\n        os.makedirs(imgdir)\n        check_output('cp {}/* {}'.format(imgdir_orig, imgdir), shell=True)\n\n        ext = imgs[0].split('.')[-1]\n        args = ' '.join(['magick mogrify', '-resize', resizearg, '-format', 'png', '*.{}'.format(ext)])\n        print(args)\n        os.chdir(imgdir)\n        os.system(args)\n        # check_output(args, shell=True)\n        os.chdir(wd)\n\n        if ext != 'png':\n            check_output('rm {}/*.{}'.format(imgdir, ext), shell=True)\n            print('Removed duplicates')\n        print('Done')\n\n\ndef gen_poses(basedir, match_type, factors=None):\n    print(\"Force run COLMAP!\")\n    run_colmap(basedir, match_type)  # comment this line if you have run the colmap software offline\n\n    # files_needed = ['{}.bin'.format(f) for f in ['cameras', 'images', 'points3D']]\n    # if os.path.exists(os.path.join(basedir, 'sparse/0')):\n    #     files_had = 
os.listdir(os.path.join(basedir, 'sparse/0'))\n    # else:\n    #     files_had = []\n    # if not all([f in files_had for f in files_needed]):\n    #     print( 'Need to run COLMAP' )\n    #     run_colmap(basedir, match_type)\n    # else:\n    #     print('Don\\'t need to run COLMAP')\n\n    print( 'Post-colmap')\n\n    poses, pts3d, perm, names = load_colmap_data(basedir)\n\n    densedir = os.path.join(basedir, 'dense')\n\n    save_poses(densedir, poses, pts3d, perm, names)\n\n    if factors is not None:\n        print( 'Factors:', factors)\n        minify(densedir, factors)\n\n    print( 'Done with imgs2poses' )\n\n    return True\n\n"
  },
  {
    "path": "FourierGrid/tools/imgs2poses.py",
    "content": "# Modified from https://github.com/Fyusion/LLFF\nimport os\nimport sys\nimport glob\n\nfrom colmap_utils.pose_utils import gen_poses\n\n\ndef check_structure(scenedir):\n    source = os.path.join(scenedir, 'source')\n    if not os.path.isdir(source):\n        print('Invalid directory structure.')\n        print('Please put all your images under', source, '!')\n        sys.exit()\n    if len(glob.glob(f'{source}/*[JPG\\|jpg\\|png\\|jpeg\\|PNG]')) == 0:\n        print('Invalid directory structure.')\n        print('No image in', source, '!')\n        sys.exit()\n    print('Directory structure check: PASS.')\n\n\nif __name__=='__main__':\n\n    import argparse\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--match_type', type=str,\n                        default='exhaustive_matcher', help='type of matcher used.  Valid options: \\\n                        exhaustive_matcher sequential_matcher.  Other matchers not supported at this time')\n    parser.add_argument('scenedir', type=str,\n                        help='input scene directory')\n    args = parser.parse_args()\n\n    if args.match_type != 'exhaustive_matcher' and args.match_type != 'sequential_matcher':\n        print('ERROR: matcher type ' + args.match_type + ' is not valid.  Aborting')\n        sys.exit()\n\n    check_structure(args.scenedir)\n\n    gen_poses(args.scenedir, args.match_type, factors=[2,4,8])\n\n"
  },
  {
    "path": "FourierGrid/tools/vis_train.py",
    "content": "import argparse\nimport numpy as np\nimport open3d as o3d\n\nparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)\nparser.add_argument('path')\nargs = parser.parse_args()\n\ndata = np.load(args.path)\nxyz_min = data['xyz_min']\nxyz_max = data['xyz_max']\ncam_lst = data['cam_lst']\n\n# Outer aabb\naabb_01 = np.array([[0, 0, 0],\n                    [0, 0, 1],\n                    [0, 1, 1],\n                    [0, 1, 0],\n                    [1, 0, 0],\n                    [1, 0, 1],\n                    [1, 1, 1],\n                    [1, 1, 0]])\nout_bbox = o3d.geometry.LineSet()\nout_bbox.points = o3d.utility.Vector3dVector(xyz_min + aabb_01 * (xyz_max - xyz_min))\nout_bbox.colors = o3d.utility.Vector3dVector([[1,0,0] for i in range(12)])\nout_bbox.lines = o3d.utility.Vector2iVector([[0,1],[1,2],[2,3],[3,0],[4,5],[5,6],[6,7],[7,4],[0,4],[1,5],[2,6],[3,7]])\n\n# Cameras\ncam_frustrm_lst = []\nfor cam in cam_lst:\n    cam_frustrm = o3d.geometry.LineSet()\n    cam_frustrm.points = o3d.utility.Vector3dVector(cam)\n    if len(cam) == 5:\n        cam_frustrm.colors = o3d.utility.Vector3dVector([[0,0,0] for i in range(8)])\n        cam_frustrm.lines = o3d.utility.Vector2iVector([[0,1],[0,2],[0,3],[0,4],[1,2],[2,4],[4,3],[3,1]])\n    elif len(cam) == 8:\n        cam_frustrm.colors = o3d.utility.Vector3dVector([[0,0,0] for i in range(12)])\n        cam_frustrm.lines = o3d.utility.Vector2iVector([\n            [0,1],[1,3],[3,2],[2,0],\n            [4,5],[5,7],[7,6],[6,4],\n            [0,4],[1,5],[3,7],[2,6],\n        ])\n    else:\n        raise NotImplementedError\n    cam_frustrm_lst.append(cam_frustrm)\n\n# Show\no3d.visualization.draw_geometries([\n    o3d.geometry.TriangleMesh.create_coordinate_frame(size=1.0, origin=xyz_min),\n    out_bbox, *cam_frustrm_lst])\n\n"
  },
  {
    "path": "FourierGrid/tools/vis_volume.py",
    "content": "import argparse\nimport numpy as np\nimport open3d as o3d\n\nparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)\nparser.add_argument('path')\nparser.add_argument('thres', type=float)\nparser.add_argument('--cam')\nargs = parser.parse_args()\n\ndata = np.load(args.path)\nalpha = data['alpha']\nrgb = data['rgb']\nif rgb.shape[0] < rgb.shape[-1]:\n    alpha = np.transpose(alpha, (1,2,0))\n    rgb = np.transpose(rgb, (1,2,3,0))\nprint('Shape', alpha.shape, rgb.shape)\nprint('Active rate', (alpha > args.thres).mean())\nprint('Active nums', (alpha > args.thres).sum())\nxyz_min = np.array([0,0,0])\nxyz_max = np.array(alpha.shape)\n\nif args.cam:\n    data = np.load(args.cam)\n    xyz_min = data['xyz_min']\n    xyz_max = data['xyz_max']\n    cam_lst = data['cam_lst']\n    cam_frustrm_lst = []\n    for cam in cam_lst:\n        cam_frustrm = o3d.geometry.LineSet()\n        cam_frustrm.points = o3d.utility.Vector3dVector(cam)\n        if len(cam) == 5:\n            cam_frustrm.colors = o3d.utility.Vector3dVector([[0.5,0.5,0.5] for i in range(8)])\n            cam_frustrm.lines = o3d.utility.Vector2iVector([[0,1],[0,2],[0,3],[0,4],[1,2],[2,4],[4,3],[3,1]])\n        elif len(cam) == 8:\n            cam_frustrm.colors = o3d.utility.Vector3dVector([[0.5,0.5,0.5] for i in range(12)])\n            cam_frustrm.lines = o3d.utility.Vector2iVector([\n                [0,1],[1,3],[3,2],[2,0],\n                [4,5],[5,7],[7,6],[6,4],\n                [0,4],[1,5],[3,7],[2,6],\n            ])\n        cam_frustrm_lst.append(cam_frustrm)\nelse:\n    cam_frustrm_lst = []\n\n\naabb_01 = np.array([[0, 0, 0],\n                    [0, 0, 1],\n                    [0, 1, 1],\n                    [0, 1, 0],\n                    [1, 0, 0],\n                    [1, 0, 1],\n                    [1, 1, 1],\n                    [1, 1, 0]])\nout_bbox = o3d.geometry.LineSet()\nout_bbox.points = o3d.utility.Vector3dVector(xyz_min + aabb_01 * (xyz_max - 
xyz_min))\nout_bbox.colors = o3d.utility.Vector3dVector([[1,0,0] for i in range(12)])\nout_bbox.lines = o3d.utility.Vector2iVector([[0,1],[1,2],[2,3],[3,0],[4,5],[5,6],[6,7],[7,4],[0,4],[1,5],[2,6],[3,7]])\n\nxyz = np.stack((alpha > args.thres).nonzero(), -1)\ncolor = rgb[xyz[:,0], xyz[:,1], xyz[:,2]]\npcd = o3d.geometry.PointCloud()\npcd.points = o3d.utility.Vector3dVector(xyz / alpha.shape * (xyz_max - xyz_min) + xyz_min)\npcd.colors = o3d.utility.Vector3dVector(color[:, :3])\nvoxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud(pcd, voxel_size=max((xyz_max - xyz_min) / alpha.shape))\n\ndef change_background_to_black(vis):\n    opt = vis.get_render_option()\n    opt.background_color = np.asarray([0, 0, 0])\n    return False\n\no3d.visualization.draw_geometries_with_key_callbacks([\n    o3d.geometry.TriangleMesh.create_coordinate_frame(size=(xyz_max-xyz_min).min()*0.1, origin=xyz_min),\n    out_bbox, voxel_grid, *cam_frustrm_lst,\n], {ord(\"K\"): change_background_to_black})\n\n"
  },
  {
    "path": "FourierGrid/trajectory_generators/__init__.py",
    "content": ""
  },
  {
    "path": "FourierGrid/trajectory_generators/interp_traj.py",
    "content": "import click\nimport os\nimport numpy as np\nimport cv2 as cv\nfrom os.path import join as pjoin\nfrom glob import glob\nfrom scipy.spatial.transform import Rotation as R\nfrom scipy.spatial.transform import Slerp\n\n\ndef inter_two_poses(pose_a, pose_b, alpha):\n    ret = np.zeros([3, 4], dtype=np.float64)\n    rot_a = R.from_matrix(pose_a[:3, :3])\n    rot_b = R.from_matrix(pose_b[:3, :3])\n    key_rots = R.from_matrix(np.stack([pose_a[:3, :3], pose_b[:3, :3]], 0))\n    key_times = [0, 1]\n    slerp = Slerp(key_times, key_rots)\n    rot = slerp(1. - alpha)\n    ret[:3, :3] = rot.as_matrix()\n    ret[:3, 3] = (pose_a * alpha + pose_b * (1. - alpha))[:3, 3]\n    return ret\n\n\ndef inter_poses(key_poses, n_out_poses, sigma=1.):\n    n_key_poses = len(key_poses)\n    out_poses = []\n    for i in range(n_out_poses):\n        w = np.linspace(0, n_key_poses - 1, n_key_poses)\n        w = np.exp(-(np.abs(i / n_out_poses * n_key_poses - w) / sigma)**2)\n        w = w + 1e-6\n        w /= np.sum(w)\n        cur_pose = key_poses[0]\n        cur_w = w[0]\n        for j in range(0, n_key_poses - 1):\n            cur_pose = inter_two_poses(cur_pose, key_poses[j + 1], cur_w / (cur_w + w[j + 1]))\n            cur_w += w[j + 1]\n\n        out_poses.append(cur_pose)\n\n    return np.stack(out_poses)\n\n@click.command()\n@click.option('--data_dir', type=str)\n@click.option('--key_poses', type=str)\n@click.option('--n_out_poses', type=int, default=240)\ndef hello(data_dir, n_out_poses, key_poses):\n    poses = np.load(pjoin(data_dir, 'cams_meta.npy')).reshape(-1, 27)[:, :12].reshape(-1, 3, 4)\n    n_poses = len(poses)\n    key_poses = np.array([int(_) for _ in key_poses.split(',')])\n    key_poses = poses[key_poses]\n\n    out_poses = inter_poses(key_poses, n_out_poses)\n    out_poses = np.ascontiguousarray(out_poses.astype(np.float64))\n    np.save(pjoin(data_dir, 'poses_render.npy'), out_poses)\n\n\nif __name__ == '__main__':\n    hello()\n"
  },
  {
    "path": "FourierGrid/trajectory_generators/mega_traj.py",
    "content": "import numpy as np\nfrom scipy.spatial.transform import Rotation as R\nimport pdb\nfrom itertools import groupby\n\n\ndef all_equal(iterable):\n    g = groupby(iterable)\n    return next(g, True) and not next(g, False)\n\n\ndef rotate_rot_matrix_by_degree(rot, rot_degree, axis='y'):\n    rotate_r = R.from_euler(axis, -rot_degree, degrees=True)\n    rot_matrix_new = np.matmul(rot, rotate_r.as_matrix())\n    return rot_matrix_new\n\n\ndef gen_dummy_trajs(metadata, tr_c2w, train_HW, tr_K, test_num=100):\n    # assert all_equal(train_HW), \"image shapes are not all the same.\"\n    test_HW = [train_HW[0] for i in range(test_num)]\n    # assert all_equal(tr_K), \"Ks are not all the same.\"\n    test_K = [tr_K[0] for i in range(test_num)]\n    all_c2ws = tr_c2w.copy()[:test_num]  # initialize\n    all_c2ws = [np.array(c2w) for c2w in all_c2ws]\n    return all_c2ws, test_HW, test_K\n\n\ndef gen_straight_trajs(metadata, tr_c2w, train_HW, tr_K, tr_cam_idx, train_pos, test_num=100, rotate_angle=2, rot_freq=20):\n    assert all_equal(train_HW), \"image shapes are not all the same.\"\n    test_HW = [train_HW[0] for i in range(test_num)]\n    assert all_equal(tr_K), \"Ks are not all the same.\"\n    test_K = [tr_K[0] for i in range(test_num)]\n    assert all_equal(tr_cam_idx), \"Cameras are not all the same.\"\n    test_cam_idxs = [tr_cam_idx[0] for i in range(test_num)]\n    all_c2ws = tr_c2w.copy()[:test_num]  # initialize\n    all_c2ws = [np.array(c2w) for c2w in all_c2ws]\n    average_z = np.mean([c2w[2, 3] for c2w in all_c2ws])\n    for i, c2w in enumerate(all_c2ws):\n        final_rot = rotate_angle * np.sin(i / rot_freq * 2 * np.pi)\n        all_c2ws[i][:3, :3] = rotate_rot_matrix_by_degree(all_c2ws[i][:3, :3], final_rot, axis='y')\n    return all_c2ws, test_HW, test_K, test_cam_idxs\n\n\ndef gen_rotational_trajs(args, cfg, metadata, tr_c2w, train_HW, tr_K, rotate_angle=9):\n    # We assume the metadata has been sorted here. 
\n    # Assume the first one is the center image.\n    start_c2w, end_c2w = np.array(tr_c2w[0]), np.array(tr_c2w[-1])\n    start_rot, end_rot = start_c2w[:3, :3], end_c2w[:3, :3]\n    # forward to see the turning effect\n    base_rot = R.from_matrix(start_rot)\n    # base_rot = R.from_matrix(end_rot)\n    base_pos = start_c2w[:3,3]\n    # base_pos = end_c2w[:3,3]\n    # generate rotating matries\n    # rotate_interval = rotate_angle / test_num\n    test_num = 5\n    rotate_interval = 6\n        # test_num = 200\n        # rotate_interval = 0.1\n    forward_dis_max = 0.0  # default is 0.0\n    all_rot_yzx = [base_rot.as_euler('yzx', degrees=True)] \n    for i in range(test_num - 1):\n        if all_rot_yzx:\n            prev_rot = all_rot_yzx[-1]\n        else:\n            prev_rot = base_rot.as_euler('yzx', degrees=True)\n        new_rot = [prev_rot[0], prev_rot[1] + rotate_interval, prev_rot[2]]\n        all_rot_yzx.append(new_rot)\n    all_rot = [R.from_euler('yzx', rot, degrees=True).as_matrix() for rot in all_rot_yzx]\n    all_c2ws = [start_c2w.copy() for i in range(test_num)]  # initialize\n    for i, c2w in enumerate(all_c2ws):\n        all_c2ws[i][:3, :3] = all_rot[i]\n        # forward_dis = (1 - np.cos(i / len(all_c2ws) * np.pi / 2)) * forward_dis_max\n        forward_dis = forward_dis_max\n        cur_pos = [base_pos[0] - forward_dis, base_pos[1], base_pos[2]]\n        all_c2ws[i][:3, 3] = cur_pos\n    assert train_HW[0] == train_HW[-1], \"image shapes are not the same for the first and the last frame.\"\n    test_HW = [train_HW[0] for i in range(test_num)]\n    test_K = [tr_K[0] for i in range(test_num)]\n    return all_c2ws, test_HW, test_K"
  },
  {
    "path": "FourierGrid/trajectory_generators/waymo_traj.py",
    "content": "import numpy as np\nfrom scipy.spatial.transform import Rotation as R\nimport pdb\nfrom itertools import groupby\n\n\ndef all_equal(iterable):\n    g = groupby(iterable)\n    return next(g, True) and not next(g, False)\n\n\ndef rotate_rot_matrix_by_degree(rot, rot_degree, axis='y'):\n    rotate_r = R.from_euler(axis, -rot_degree, degrees=True)\n    rot_matrix_new = np.matmul(rot, rotate_r.as_matrix())\n    return rot_matrix_new\n\n\ndef gen_dummy_trajs(metadata, tr_c2w, train_HW, tr_K, test_num=100):\n    # assert all_equal(train_HW), \"image shapes are not all the same.\"\n    test_HW = [train_HW[0] for i in range(test_num)]\n    # assert all_equal(tr_K), \"Ks are not all the same.\"\n    test_K = [tr_K[0] for i in range(test_num)]\n    all_c2ws = tr_c2w.copy()[:test_num]  # initialize\n    all_c2ws = [np.array(c2w) for c2w in all_c2ws]\n    return all_c2ws, test_HW, test_K\n\n\ndef gen_straight_trajs(metadata, tr_c2w, train_HW, tr_K, tr_cam_idx, train_pos, test_num=100, rotate_angle=2, rot_freq=20):\n    assert all_equal(train_HW), \"image shapes are not all the same.\"\n    test_HW = [train_HW[0] for i in range(test_num)]\n    assert all_equal(tr_K), \"Ks are not all the same.\"\n    test_K = [tr_K[0] for i in range(test_num)]\n    assert all_equal(tr_cam_idx), \"Cameras are not all the same.\"\n    test_cam_idxs = [tr_cam_idx[0] for i in range(test_num)]\n    all_c2ws = tr_c2w.copy()[:test_num]  # initialize\n    all_c2ws = [np.array(c2w) for c2w in all_c2ws]\n    average_z = np.mean([c2w[2, 3] for c2w in all_c2ws])\n    for i, c2w in enumerate(all_c2ws):\n        final_rot = rotate_angle * np.sin(i / rot_freq * 2 * np.pi)\n        all_c2ws[i][:3, :3] = rotate_rot_matrix_by_degree(all_c2ws[i][:3, :3], final_rot, axis='y')\n    return all_c2ws, test_HW, test_K, test_cam_idxs\n\n\ndef gen_rotational_trajs(args, cfg, metadata, tr_c2w, train_HW, tr_K, tr_cam_idx, train_pos, rotate_angle=9):\n    # We assume the metadata has been sorted here.\n    
start_c2w, end_c2w = np.array(tr_c2w[0]), np.array(tr_c2w[-1])\n    start_rot, end_rot = start_c2w[:3, :3], end_c2w[:3, :3]\n    # get base information, this is where we started\n    # base_pos = np.array(train_pos).mean(0)\n    base_pos = train_pos[0]\n    # forward to see the turning effect\n    base_rot = R.from_matrix(start_rot)\n    # generate rotating matries\n    # rotate_interval = rotate_angle / test_num\n    test_num = 200\n    rotate_interval = -0.3\n    forward_dis_max = 0.03\n    all_rot_yzx = [base_rot.as_euler('yzx', degrees=True)] \n    for i in range(test_num - 1):\n        if all_rot_yzx:\n            prev_rot = all_rot_yzx[-1]\n        else:\n            prev_rot = base_rot.as_euler('yzx', degrees=True)\n        new_rot = [prev_rot[0] + rotate_interval, prev_rot[1], prev_rot[2]]\n        all_rot_yzx.append(new_rot)\n    all_rot = [R.from_euler('yzx', rot, degrees=True).as_matrix() for rot in all_rot_yzx]\n    # cur_R = R.from_quat(base_quat).as_matrix()\n    # all_rot = [cur_R]\n    # for i in range(test_num - 1):\n    #     rotate_r = R.from_euler('y', rotate_interval, degrees=True)\n    #     cur_R = np.matmul(cur_R, rotate_r.as_matrix())\n    #     all_rot.append(cur_R)\n    # last_r = all_rot[-1]\n    # last_r = R.from_matrix(last_r).as_euler('yzx', degrees=True)\n    all_c2ws = [start_c2w.copy() for i in range(test_num)]  # initialize\n    test_pos = []\n    for i, c2w in enumerate(all_c2ws):\n        all_c2ws[i][:3, :3] = all_rot[i]\n        forward_dis = (1 - np.cos(i / len(all_c2ws) * np.pi / 2)) * forward_dis_max\n        cur_pos = [base_pos[0] - forward_dis, base_pos[1], base_pos[2]]\n        all_c2ws[i][:3, 3] = cur_pos\n        test_pos.append(cur_pos)\n    assert train_HW[0] == train_HW[-1], \"image shapes are not the same for the first and the last frame.\"\n    test_HW = [train_HW[0] for i in range(test_num)]\n    test_K = [tr_K[0] for i in range(test_num)]\n    test_cam_idxs = [tr_cam_idx[0] for i in range(test_num)]\n    return 
all_c2ws, test_HW, test_K, test_cam_idxs, test_pos\n"
  },
  {
    "path": "FourierGrid/utils.py",
    "content": "import os, math, cv2\nimport pdb\nimport numpy as np\nimport scipy.signal\nfrom typing import List, Optional\n\nfrom torch import Tensor\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom FourierGrid.masked_adam import MaskedAdam\nimport torch.optim\n\n\n''' Misc\n'''\nmse2psnr = lambda x : -10. * torch.log10(x)\nto8b = lambda x : (255*np.clip(x,0,1)).astype(np.uint8)\n\ndef resize_and_to_8b(input_images, res):\n    rgb_images = np.array([cv2.resize(input_image, res) for input_image in input_images])\n    b_images = to8b(rgb_images)\n    return b_images\n\ndef create_optimizer_or_freeze_model(model, cfg_train, global_step, verbose=False):\n    decay_steps = cfg_train.lrate_decay * 1000\n    decay_factor = 0.1 ** (global_step/decay_steps)\n\n    param_group = []\n    for k in cfg_train.keys():\n        if not k.startswith('lrate_'):\n            continue\n        k = k[len('lrate_'):]\n\n        if not hasattr(model, k):\n            continue\n\n        param = getattr(model, k)\n        if param is None:\n            if verbose:\n                print(f'create_optimizer_or_freeze_model: param {k} not exist')\n            continue\n\n        lr = getattr(cfg_train, f'lrate_{k}') * decay_factor\n        if lr > 0:\n            if verbose:\n                print(f'create_optimizer_or_freeze_model: param {k} lr {lr}')\n            if isinstance(param, nn.Module):\n                param = param.parameters()\n            param_group.append({'params': param, 'lr': lr, 'skip_zero_grad': (k in cfg_train.skip_zero_grad_fields)})\n        else:\n            if verbose:\n                print(f'create_optimizer_or_freeze_model: param {k} freeze')\n            param.requires_grad = False\n    return MaskedAdam(param_group)\n\n\n''' Checkpoint utils\n'''\ndef load_checkpoint(model, optimizer, ckpt_path, no_reload_optimizer):\n    ckpt = torch.load(ckpt_path)\n    start = ckpt['global_step']\n    
model.load_state_dict(ckpt['model_state_dict'])\n    if not no_reload_optimizer:\n        optimizer.load_state_dict(ckpt['optimizer_state_dict'])\n    return model, optimizer, start\n\n\ndef load_model(model_class, ckpt_path):\n    ckpt = torch.load(ckpt_path)\n    model = model_class(**ckpt['model_kwargs'])\n    model.load_state_dict(ckpt['model_state_dict'])\n    return model\n\n\n''' Evaluation metrics (ssim, lpips)\n'''\ndef rgb_ssim(img0, img1, max_val,\n             filter_size=11,\n             filter_sigma=1.5,\n             k1=0.01,\n             k2=0.03,\n             return_map=False):\n    # Modified from https://github.com/google/mipnerf/blob/16e73dfdb52044dcceb47cda5243a686391a6e0f/internal/math.py#L58\n    assert len(img0.shape) == 3\n    assert img0.shape[-1] == 3\n    assert img0.shape == img1.shape\n\n    # Construct a 1D Gaussian blur filter.\n    hw = filter_size // 2\n    shift = (2 * hw - filter_size + 1) / 2\n    f_i = ((np.arange(filter_size) - hw + shift) / filter_sigma)**2\n    filt = np.exp(-0.5 * f_i)\n    filt /= np.sum(filt)\n\n    # Blur in x and y (faster than the 2D convolution).\n    def convolve2d(z, f):\n        return scipy.signal.convolve2d(z, f, mode='valid')\n\n    filt_fn = lambda z: np.stack([\n        convolve2d(convolve2d(z[...,i], filt[:, None]), filt[None, :])\n        for i in range(z.shape[-1])], -1)\n    mu0 = filt_fn(img0)\n    mu1 = filt_fn(img1)\n    mu00 = mu0 * mu0\n    mu11 = mu1 * mu1\n    mu01 = mu0 * mu1\n    sigma00 = filt_fn(img0**2) - mu00\n    sigma11 = filt_fn(img1**2) - mu11\n    sigma01 = filt_fn(img0 * img1) - mu01\n\n    # Clip the variances and covariances to valid values.\n    # Variance must be non-negative:\n    sigma00 = np.maximum(0., sigma00)\n    sigma11 = np.maximum(0., sigma11)\n    sigma01 = np.sign(sigma01) * np.minimum(\n        np.sqrt(sigma00 * sigma11), np.abs(sigma01))\n    c1 = (k1 * max_val)**2\n    c2 = (k2 * max_val)**2\n    numer = (2 * mu01 + c1) * (2 * sigma01 + c2)\n    
denom = (mu00 + mu11 + c1) * (sigma00 + sigma11 + c2)\n    ssim_map = numer / denom\n    ssim = np.mean(ssim_map)\n    return ssim_map if return_map else ssim\n\n\n__LPIPS__ = {}\ndef init_lpips(net_name, device):\n    assert net_name in ['alex', 'vgg']\n    import lpips\n    print(f'init_lpips: lpips_{net_name}')\n    return lpips.LPIPS(net=net_name, version='0.1').eval().to(device)\n\ndef rgb_lpips(np_gt, np_im, net_name, device):\n    if net_name not in __LPIPS__:\n        __LPIPS__[net_name] = init_lpips(net_name, device)\n    gt = torch.from_numpy(np_gt).permute([2, 0, 1]).contiguous().to(device)\n    im = torch.from_numpy(np_im).permute([2, 0, 1]).contiguous().to(device)\n    return __LPIPS__[net_name](gt, im, normalize=True).item()\n\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2022 Zelin Zhao\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Unbounded Neural Radiance Fields in Pytorch\n\n<!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->\n[![All Contributors](https://img.shields.io/badge/all_contributors-6-orange.svg?style=flat-square)](#contributors-)\n<!-- ALL-CONTRIBUTORS-BADGE:END -->\n\n## 1. Introduction\n\n**This is still a research project in progress.**\n\nThis project aims for benchmarking several state-of-the-art large-scale radiance fields algorithms. We exchangely use terms \"unbounded NeRF\" and \"large-scale NeRF\" because we find the techniques behind them are closely related.\n\nInstead of pursuing a big and complicated code system, we pursue a simple code repo with SOTA performance for unbounded NeRFs.\n\nYou are expected to get the following results in this repository:\n\n| Benchmark                     | Methods      | PSNR      |\n|-------------------------------|--------------|-----------|\n| Unbounded Tanks & Temples     | NeRF++       | 20.49     |\n| Unbounded Tanks & Temples     | Plenoxels    | 20.40     |\n| Unbounded Tanks & Temples     | DVGO         | 20.10     |\n| **Unbounded Tanks & Temples** | **Ours**     | **20.85** |\n| Mip-NeRF-360 Benchmark          | NeRF         | 24.85     |\n| Mip-NeRF-360 Benchmark          | NeRF++       | 26.21     |\n| Mip-NeRF-360 Benchmark          | Mip-NeRF-360 | 28.94     |\n| Mip-NeRF-360 Benchmark          | DVGO         | 25.42     |\n| **Mip-NeRF-360 Benchmark**      | **Ours**     | **28.98** |\n\n<details> \n\n<summary> Expand / collapse qualitative results. 
</summary>\n\n## Tanks and Temples:\n\n* Playground:\n\nhttps://user-images.githubusercontent.com/31123348/220946729-d88db335-0618-4b75-9fc2-8de577e1ddb5.mp4\n\n* Truck:\n\nhttps://user-images.githubusercontent.com/31123348/220946857-0f4b7239-8be6-4fca-9bba-2f2425e857a5.mp4\n\n* M60:\n\nhttps://user-images.githubusercontent.com/31123348/220947063-068b94f6-3afb-421d-8746-43bcf9643a37.mp4\n\n* Train:\n\nhttps://user-images.githubusercontent.com/31123348/220947239-6528d542-b2b8-45e3-8e69-6e0eff869720.mp4\n\n## Mip-NeRF-360 Benchmark:\n\n* Bicycle:\n\nhttps://user-images.githubusercontent.com/31123348/220947385-ab31c646-c671-4522-8e4f-a1982d98c753.mp4\n\n* Stump:\n\nhttps://user-images.githubusercontent.com/31123348/220947472-47dc4716-095b-45ec-890b-d6afd97de9e9.mp4\n\n* Kitchen:\n\nhttps://user-images.githubusercontent.com/31123348/220947597-68f7ec32-c761-4253-955a-a2acc6a2eb25.mp4\n\n* Bonsai:\n\nhttps://user-images.githubusercontent.com/31123348/220947686-d8957a2e-ef52-46cf-b437-28de91f55871.mp4\n\n* Garden:\n\nhttps://user-images.githubusercontent.com/31123348/220947771-bbd249c0-3d0b-4d25-9b79-d4de9af17c4a.mp4\n\n* Counter:\n\nhttps://user-images.githubusercontent.com/31123348/220947818-e5c6b07f-c930-48b2-8aa7-363182dea6be.mp4\n\n* Room:\n\nhttps://user-images.githubusercontent.com/31123348/220948025-25ce5cc1-3c9a-450c-920d-98a8f153a0fa.mp4\n\n## San Francisco Mission Bay (dataset released by [Block-NeRF](https://waymo.com/research/block-nerf/)):\n* Training splits:\n\n  https://user-images.githubusercontent.com/31123348/200509378-4b9fe63f-4fa4-40b1-83a9-b8950d981a3b.mp4\n\n* Rotation: \n\n  https://user-images.githubusercontent.com/31123348/200509910-a5d8f820-143a-4e03-8221-b04d0db2d050.mov\n\n</details>\n\nHope our efforts could help your research or projects!\n\n## 2. 
News\n- [2023.3.20] This project is renamed to \"UnboundedNeRFPytorch\" because we find our work is not large enough (e.g., at city level), rigorously speaking.\n\n<details>\n<summary> Expand / collapse older news. </summary>\n\n- [2023.2.27] **A major update of our repository with better performance and full code release**. \n- [2022.12.23] Released several weeks' NeRF. Too many papers pop out these days so the update speed is slow.\n- [2022.9.12] Training Block-NeRF on the Waymo dataset, reaching PSNR 24.3.\n- [2022.8.31] Training Mega-NeRF on the Waymo dataset, loss still NAN.\n- [2022.8.24] Support the full Mega-NeRF pipeline.\n- [2022.8.18] Support all previous papers in weekly classified NeRF.\n- [2022.8.17] Support classification in weekly NeRF.\n- [2022.8.16] Support evaluation scripts and data format standard. Getting some results.\n- [2022.8.13] Add estimated camera pose and release a better dataset.\n- [2022.8.12] Add weekly NeRF functions.\n- [2022.8.8] Add the NeRF reconstruction code and doc for custom purposes.\n- [2022.7.28] The data preprocess script is finished.\n- [2022.7.20] This project started!\n</details>\n\n## 3. Installation\n<details>\n<summary>Expand / collapse installation steps.</summary>\n\n1. Clone this repository. Use depth == 1 to avoid download a large history.\n   ```bash\n   git clone --depth=1 git@github.com:sjtuytc/LargeScaleNeRFPytorch.git\n   mkdir data\n   mkdir logs\n   ```\n\n2. Create conda environment.\n   ```bash\n   conda create -n large-scale-nerf python=3.9\n   conda activate large-scale-nerf\n   ```\n3. Install pytorch and other libs. Make sure your Pytorch version is compatible with your CUDA.\n   ```bash\n   pip install --upgrade pip\n   conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.6 -c pytorch -c nvidia\n   pip install -r requirements.txt\n   ```\n\n4. Install grid-based operators to avoid running them every time, cuda lib required. 
(Check via \"nvcc -V\" to ensure that you have a recent CUDA.)\n   ```bash\n   apt-get install g++ build-essential  # ensure you have g++ and other build essentials, sudo access required.\n   cd FourierGrid/cuda\n   python setup.py install\n   cd ../../\n   ```\n5. Install other libs used for reconstructing **custom** scenes. **This is only needed when you need to build your scenes.**\n   ```bash\n   sudo apt-get install colmap\n   sudo apt-get install imagemagick  # requires sudo access\n   conda install pytorch-scatter -c pyg  # or install via https://github.com/rusty1s/pytorch_scatter\n   ```\n   You can use the laptop version of COLMAP as well if you do not have sudo access on your server. However, we found that if you do not set up COLMAP parameters properly, you would not get the SOTA performance.\n</details>\n\n## 4. Unbounded NeRF on the public datasets\n\nClick the following sub-section titles to expand / collapse steps.\n\n<details>\n<summary> 4.1 Download processed data.</summary>\n\n- **Disclaimer**: users are required to get permission from the original dataset provider. Any usage of the data must obey the license of the dataset owner.\n\n(1) [Unbounded Tanks & Temples](https://www.tanksandtemples.org/). Download data from [here](https://drive.google.com/file/d/11KRfN91W1AxAW6lOFs4EeYDbeoQZCi87/view). Then unzip the data.\n\n```bash\ncd data\ngdown --id 11KRfN91W1AxAW6lOFs4EeYDbeoQZCi87\nunzip tanks_and_temples.zip\ncd ../\n```\n\t\n(2) The [Mip-NeRF-360](https://jonbarron.info/mipnerf360/) dataset.\n\n```bash\ncd data\nwget http://storage.googleapis.com/gresearch/refraw360/360_v2.zip\nmkdir 360_v2\nunzip 360_v2.zip -d 360_v2\ncd ../\n```\n\n(3) [San Francisco Mission Bay](https://waymo.com/research/block-nerf/).\nWhat you should know before downloading the data:\n\n- Our processed waymo data is significantly **smaller** than the original version (19.1GB vs. 191GB) because we store the camera poses instead of raw ray directions. 
Besides, our processed data is more friendly for Pytorch dataloaders. Download [the data](https://drive.google.com/drive/folders/1Lcc6MF35EnXGyUy0UZPkUx7SfeLsv8u9?usp=sharing) in the Google Drive. You may use [gdown](https://stackoverflow.com/questions/65001496/how-to-download-a-google-drive-folder-using-link-in-linux) to download the files via command lines. If you are interested in processing the raw waymo data on your own, please refer to [this doc](./docs/get_pytorch_waymo_dataset.md).\n\nThe downloaded data would look like this:\n\n   ```\n   data\n      |\n      |——————360_v2                                    // the root folder for the Mip-NeRF-360 benchmark\n      |        └——————bicycle                          // one scene under the Mip-NeRF-360 benchmark\n      |        |         └——————images                 // rgb images\n      |        |         └——————images_2               // rgb images downscaled by 2\n      |        |         └——————sparse                 // camera poses\n      |        ...\n      |——————tanks_and_temples                         // the root folder for Tanks&Temples\n      |        └——————tat_intermediate_M60             // one scene under Tanks&Temples\n      |        |         └——————camera_path            // render split camera poses, intrinsics and extrinsics\n      |        |         └——————test                   // test split\n      |        |         └——————train                  // train split\n      |        |         └——————validation             // validation split\n      |        ...\n      |——————pytorch_waymo_dataset                     // the root folder for San Fran Cisco Mission Bay\n      |        └——————cam_info.json                    // extracted cam2img information in dict.\n      |        └——————coordinates.pt                   // global camera information used in Mega-NeRF, deprecated\n      |        └——————train                            // train data\n      |        |         └——————metadata               
// meta data per image (camera information, etc)\n      |        |         └——————rgbs                   // rgb images\n      |        |         └——————split_block_train.json // split block information\n      |        |         └——————train_all_meta.json    // all meta information in the train folder\n      |        └——————val                              // val data with the same structure as train\n   ```\n</details>\n\n<details>\n<summary> 4.2 Train models and see the results!</summary>\n\nYou only need to run \"python run_FourierGrid.py\" to finish the train-test-render cycle. Explanations of some arguments: \n```bash\n--program: the program to run, normally --program train will be all you need.\n--config: the config pointing to the scene file, e.g., --config FourierGrid/configs/tankstemple_unbounded/truck_single.py.\n--num_per_block: number of blocks used in large-scale NeRFs, normally this is set to -1, unless specially needed.\n--render_train: render the trained model on the train split.\n--render_test: render the trained model on the test split.\n--render_video: render the trained model on the render split.\n--exp_id: add some experimental ids to identify different experiments. 
E.g., --exp_id 5.\n--eval_ssim / eval_lpips_vgg: report SSIM / LPIPS(VGG) scores.\n```\n\nWhile we list major of the commands in scripts/train_FourierGrid.sh, we list some of commands below for better reproducibility.\n\n```bash\n# Unbounded tanks and temples\npython run_FourierGrid.py --program train --config FourierGrid/configs/tankstemple_unbounded/playground_single.py --num_per_block -1 --render_train --render_test --render_video --exp_id 57\npython run_FourierGrid.py --program train --config FourierGrid/configs/tankstemple_unbounded/train_single.py --num_per_block -1 --render_train --render_test --render_video --exp_id 12\npython run_FourierGrid.py --program train --config FourierGrid/configs/tankstemple_unbounded/truck_single.py --num_per_block -1 --render_train --render_test --render_video --exp_id 4\npython run_FourierGrid.py --program train --config FourierGrid/configs/tankstemple_unbounded/m60_single.py --num_per_block -1 --render_train --render_test --render_video --exp_id 6\n\n# 360 degree dataset\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/room_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 9\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/stump_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 10\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/bicycle_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 11\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/bonsai_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 3\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/garden_single.py --num_per_block -1 --eval_ssim 
--eval_lpips_vgg --render_train --render_test --render_video --exp_id 2\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/kitchen_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 2\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/counter_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 2\n\n# San Francisco Mission Bay dataset\npython run_FourierGrid.py --program train --config FourierGrid/configs/waymo/waymo_no_block.py --num_per_block 100 --render_video --exp_id 30\n```\n\nThe old version of Block-NeRF is still provided to serve as a baseline, but it will be deprecated soon. We will mainly work on grid-based models later because they are simple and fast. Run the following command to reproduce the old Block-NeRF experiments:\n\n```bash\nbash scripts/block_nerf_train.sh\nbash scripts/block_nerf_eval.sh\n```\n\n</details>\n\n\n## 5. Build your custom unbounded NeRF (deprecated)\n\n<details>\n<summary>Expand / collapse steps for building custom NeRF world.</summary>\n\n1. Put your images under data folder. The structure should be like:\n\n\t```bash\n\tdata\n\t   |——————Madoka          // Your folder name here.\n\t   |        └——————source // Source images should be put here.\n\t   |                 └——————---|1.png\n\t   |                 └——————---|2.png\n\t   |                 └——————---|...\n\t```\n   The sample data is provided in [our Google drive folder](https://drive.google.com/drive/folders/1JyX0VNf0R58s46Abj8HDO1NwZqmGOVRS?usp=sharing). The Madoka and Otobai can be found [at this link](https://sunset1995.github.io/dvgo/tutor_forward_facing.html). \n\n2. Run COLMAP to reconstruct scenes. 
This would probably cost a long time.\n\n\t```bash\n\tpython FourierGrid/tools/imgs2poses.py data/Madoka\n\t```\n   You can replace data/Madoka by your data folder.\n   If your COLMAP version is larger than 3.6 (which should not happen if you use apt-get), you need to change export_path to output_path in the colmap_wrapper.py.\n\n3. Training NeRF scenes.\n\n\t```bash\n\tpython FourierGrid/run_FourierGrid.py --config configs/custom/Madoka.py\n\t```\n   You can replace configs/custom/Madoka.py by other configs.\n\n4. Validating the training results to generate a fly-through video.\n\n\t```bash\n\tpython FourierGrid/run_FourierGrid.py --config configs/custom/Madoka.py --render_only --render_video --render_video_factor 8\n\t```\n</details>\n\n\n## 6. Citations & acknowledgements\n\nOur latest theoretical work about grid-based models (**Oral & Best Paper Award Candidate & Full Review Score (5/5/5)** at CVPR24):\n```\n@misc{zhao2024grounding,\n      title={Grounding and Enhancing Grid-based Models for Neural Fields}, \n      author={Zelin Zhao and Fenglei Fan and Wenlong Liao and Junchi Yan},\n      year={2024},\n      eprint={2403.20002},\n      archivePrefix={arXiv},\n      primaryClass={cs.CV}\n}\n```\n\nConsider citing the following great works:\n\n```\n@inproceedings{dvgo,\n  title={Direct voxel grid optimization: Super-fast convergence for radiance fields reconstruction},\n  author={Sun, Cheng and Sun, Min and Chen, Hwann-Tzong},\n  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},\n  pages={5459--5469},\n  year={2022}\n}\n\n @InProceedings{Tancik_2022_CVPR,\n    author    = {Tancik, Matthew and Casser, Vincent and Yan, Xinchen and Pradhan, Sabeek and Mildenhall, Ben and Srinivasan, Pratul P. and Barron, Jonathan T. 
and Kretzschmar, Henrik},\n    title     = {Block-NeRF: Scalable Large Scene Neural View Synthesis},\n    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},\n    month     = {June},\n    year      = {2022},\n    pages     = {8248-8258}\n}\n```\n\nWe refer to the code and data from [DVGO](https://github.com/sunset1995/DirectVoxGO), [nerf-pl](https://github.com/kwea123/nerf_pl) and [SVOX2](https://github.com/sxyu/svox2), thanks for their great work!\n\n## [Weekly classified NeRF](docs/weekly_nerf.md)\nWe track weekly NeRF papers and classify them. All previous published NeRF papers have been added to the list. We provide an [English version](docs/weekly_nerf.md) and a [Chinese version](docs/weekly_nerf_cn.md). We welcome [contributions and corrections](docs/contribute_weekly_nerf.md) via PR.\n\nWe also provide an [excel version](docs/weekly_nerf_meta_data.xlsx) (the meta data) of all NeRF papers, you can add your own comments or make your own paper analysis tools based on the structured meta data.\n\n## Contributors ✨\n\nThanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):\n\n<!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->\n<!-- prettier-ignore-start -->\n<!-- markdownlint-disable -->\n<table>\n  <tbody>\n    <tr>\n      <td align=\"center\" valign=\"top\" width=\"14.28%\"><a href=\"https://sjtuytc.github.io/\"><img src=\"https://avatars.githubusercontent.com/u/31123348?v=4?s=100\" width=\"100px;\" alt=\"Zelin Zhao\"/><br /><sub><b>Zelin Zhao</b></sub></a><br /><a href=\"https://github.com/sjtuytc/LargeScaleNeRFPytorch/commits?author=sjtuytc\" title=\"Code\">💻</a> <a href=\"#maintenance-sjtuytc\" title=\"Maintenance\">🚧</a></td>\n      <td align=\"center\" valign=\"top\" width=\"14.28%\"><a href=\"https://github.com/SEUleaderYang\"><img src=\"https://avatars.githubusercontent.com/u/55042050?v=4?s=100\" width=\"100px;\" alt=\"EZ-Yang\"/><br 
/><sub><b>EZ-Yang</b></sub></a><br /><a href=\"https://github.com/sjtuytc/LargeScaleNeRFPytorch/commits?author=SEUleaderYang\" title=\"Code\">💻</a></td>\n      <td align=\"center\" valign=\"top\" width=\"14.28%\"><a href=\"https://github.com/Alex-Alison-Zhang\"><img src=\"https://avatars.githubusercontent.com/u/71915735?v=4?s=100\" width=\"100px;\" alt=\"Alex-Zhang\"/><br /><sub><b>Alex-Zhang</b></sub></a><br /><a href=\"https://github.com/sjtuytc/LargeScaleNeRFPytorch/issues?q=author%3AAlex-Alison-Zhang\" title=\"Bug reports\">🐛</a></td>\n      <td align=\"center\" valign=\"top\" width=\"14.28%\"><a href=\"https://fanlu97.github.io/\"><img src=\"https://avatars.githubusercontent.com/u/45007531?v=4?s=100\" width=\"100px;\" alt=\"Fan Lu\"/><br /><sub><b>Fan Lu</b></sub></a><br /><a href=\"https://github.com/sjtuytc/LargeScaleNeRFPytorch/issues?q=author%3AFanLu97\" title=\"Bug reports\">🐛</a></td>\n      <td align=\"center\" valign=\"top\" width=\"14.28%\"><a href=\"https://maybeshewill-cv.github.io\"><img src=\"https://avatars.githubusercontent.com/u/15725187?v=4?s=100\" width=\"100px;\" alt=\"MaybeShewill-CV\"/><br /><sub><b>MaybeShewill-CV</b></sub></a><br /><a href=\"https://github.com/sjtuytc/LargeScaleNeRFPytorch/issues?q=author%3AMaybeShewill-CV\" title=\"Bug reports\">🐛</a></td>\n      <td align=\"center\" valign=\"top\" width=\"14.28%\"><a href=\"https://github.com/buer1121\"><img src=\"https://avatars.githubusercontent.com/u/48516434?v=4?s=100\" width=\"100px;\" alt=\"buer1121\"/><br /><sub><b>buer1121</b></sub></a><br /><a href=\"https://github.com/sjtuytc/LargeScaleNeRFPytorch/issues?q=author%3Abuer1121\" title=\"Bug reports\">🐛</a></td>\n    </tr>\n  </tbody>\n</table>\n\n<!-- markdownlint-restore -->\n<!-- prettier-ignore-end -->\n\n<!-- ALL-CONTRIBUTORS-LIST:END -->\n\nThis project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!\n"
  },
  {
    "path": "block_nerf/__init__.py",
    "content": ""
  },
  {
    "path": "block_nerf/block_nerf_lightning.py",
    "content": "from pytorch_lightning import LightningModule, Trainer\nimport torch\nimport os \nfrom collections import defaultdict\nfrom torch.utils.data import DataLoader\nfrom block_nerf.waymo_dataset import *\nfrom block_nerf.block_nerf_model import *\nfrom block_nerf.rendering import *\nfrom block_nerf.metrics import *\nfrom block_nerf.block_visualize import *\nfrom block_nerf.learning_utils import *\n\nclass Block_NeRF_System(LightningModule):\n    def __init__(self, hparams):\n        super(Block_NeRF_System, self).__init__()\n        self.hyper_params = hparams\n        self.save_hyperparameters(hparams)\n        self.loss = BlockNeRFLoss(1e-2)  #hparams['Visi_loss']\n\n        self.xyz_IPE = InterPosEmbedding(hparams['N_IPE_xyz'])  # xyz的L=10\n        self.dir_exposure_PE = PosEmbedding(\n            hparams['N_PE_dir_exposure'])  # dir的L=4\n        self.embedding_appearance = torch.nn.Embedding(\n            hparams['N_vocab'], hparams['N_appearance'])\n\n        self.Embedding = {'IPE': self.xyz_IPE,\n                          'PE': self.dir_exposure_PE,\n                          'appearance': self.embedding_appearance}\n\n        self.Block_NeRF = Block_NeRF(in_channel_xyz=6 * hparams['N_IPE_xyz'],\n                                     in_channel_dir=6 *\n                                                    hparams['N_PE_dir_exposure'],\n                                     in_channel_exposure=2 *\n                                                         hparams['N_PE_dir_exposure'],\n                                     in_channel_appearance=hparams['N_appearance'])\n\n        self.Visibility = Visibility(in_channel_xyz=6 * hparams['N_IPE_xyz'],\n                                     in_channel_dir=6 * hparams['N_PE_dir_exposure'])\n\n        self.models_to_train = []\n        self.models_to_train += [self.embedding_appearance]\n        self.models_to_train += [self.Block_NeRF]\n        self.models_to_train += [self.Visibility]\n\n    def 
forward(self, rays, ts):\n        B = rays.shape[0]\n        model = {\n            \"block_model\": self.Block_NeRF,\n            \"visibility_model\": self.Visibility\n        }\n\n        results = defaultdict(list)\n        for i in range(0, B, self.hparams['chunk']):\n            rendered_ray_chunks = render_rays(model, self.Embedding,\n                                              rays[i:i + self.hparams['chunk']],\n                                              ts[i:i + self.hparams['chunk']],\n                                              N_samples=self.hparams['N_samples'],\n                                              N_importance=self.hparams['N_importance'],\n                                              chunk=self.hparams['chunk'],\n                                              type=\"train\",\n                                              use_disp=self.hparams['use_disp']\n                                              )\n            for k, v in rendered_ray_chunks.items():\n                results[k] += [v]\n\n        for k, v in results.items():\n            results[k] = torch.cat(v, 0)\n\n        return results\n\n    def setup(self, stage):\n        self.train_dataset = WaymoDataset(root_dir=self.hparams['root_dir'],\n                                          split='train',\n                                          block=self.hparams['block_index'],\n                                          img_downscale=self.hparams['img_downscale'],\n                                          near=self.hparams['near'],\n                                          far=self.hparams['far'])\n        self.val_dataset = WaymoDataset(root_dir=self.hparams['root_dir'],\n                                        split='val',\n                                        block=self.hparams['block_index'],\n                                        img_downscale=self.hparams['img_downscale'],\n                                        near=self.hparams['near'],\n                       
                 far=self.hparams['far'])\n\n    def configure_optimizers(self):\n        self.optimizer = get_optimizer(self.hparams, self.models_to_train)\n        scheduler = get_scheduler(self.hparams, self.optimizer)\n        return [self.optimizer], [scheduler]\n\n    def train_dataloader(self):\n        return DataLoader(self.train_dataset,\n                          shuffle=True,\n                          num_workers=8,\n                          batch_size=self.hparams['batch_size'],\n                          pin_memory=True)\n\n    def val_dataloader(self):\n        return DataLoader(self.val_dataset,\n                          shuffle=False,\n                          num_workers=8,\n                          batch_size=1,\n                          pin_memory=True)\n\n    def training_step(self, batch, batch_nb):\n        rays, rgbs, ts = batch['rays'], batch['rgbs'], batch['ts']\n        results = self(rays, ts)\n        loss_d = self.loss(results, rgbs)\n        loss = sum(l for l in loss_d.values())\n\n        with torch.no_grad():\n            psnr_ = psnr(results['rgb_fine'], rgbs)\n\n        self.log('lr', get_learning_rate(self.optimizer))\n        self.log('train/loss', loss)\n        for k, v in loss_d.items():\n            self.log(f'train/{k}', v, prog_bar=True)\n        self.log('train/psnr', psnr_, prog_bar=True)\n\n        return loss\n\n    def validation_step(self, batch, batch_nb):  # validate at each epoch\n        rays, rgbs, ts = batch['rays'].squeeze(), batch['rgbs'].squeeze(), batch['ts'].squeeze()\n        W,H=batch['w_h']\n        results = self(rays, ts)\n        loss_d = self.loss(results, rgbs)\n        loss = sum(l for l in loss_d.values())\n\n        if batch_nb == 0:\n            img = results[f'rgb_fine'].view(H, W, 3).permute(2, 0, 1).cpu() # (3, H, W)\n            img_gt = rgbs.view(H, W, 3).permute(2, 0, 1).cpu() # (3, H, W)\n            depth = visualize_depth(results[f'depth_fine'].view(H, W)) # (3, H, W)\n          
  stack = torch.stack([img_gt, img, depth]) # (3, 3, H, W)\n            #stack = torch.stack([img_gt, img])  # (3, 3, H, W)\n            # todo: recheck this, * 255?\n            self.logger.experiment.add_images('val/GT_pred_depth',\n                                               stack, self.global_step)\n\n        psnr_ = psnr(results['rgb_fine'], rgbs)\n\n        log = {'val_loss': loss}\n        for k, v in loss_d.items():\n            log[f'val_{k}']= v\n        log['val_psnr']= psnr_\n\n        return log\n\n    def validation_epoch_end(self, outputs):\n        mean_loss = torch.stack([x['val_loss'] for x in outputs]).mean()\n        mean_psnr = torch.stack([x['val_psnr'] for x in outputs]).mean()\n\n        self.log('val/loss', mean_loss)\n        self.log('val/psnr', mean_psnr, prog_bar=True)"
  },
  {
    "path": "block_nerf/block_nerf_model.py",
    "content": "import torch\nfrom torch import nn\n\n\nclass BlockNeRFLoss(nn.Module):\n    def __init__(self, lambda_mu=0.01, Visi_loss=1e-2):\n        super(BlockNeRFLoss, self).__init__()\n        self.lambda_mu = lambda_mu\n        self.Visi_loss = Visi_loss\n\n    def forward(self, inputs, targets):\n        loss = {}\n        # RGB\n        loss['rgb_coarse'] = self.lambda_mu * ((inputs['rgb_coarse'] - targets[..., :3]) ** 2).mean()\n        loss['rgb_fine'] = ((inputs['rgb_fine'] - targets[..., :3]) ** 2).mean()\n        # visibility\n        loss[\"transmittance_coarse\"] = self.lambda_mu * self.Visi_loss * ((inputs['transmittance_coarse_real'].detach() -\n                                                                     inputs['transmittance_coarse_vis'].squeeze()) ** 2).mean()\n        loss[\"transmittance_fine\"] = self.Visi_loss * ((inputs['transmittance_fine_real'].detach() - inputs[\n            'transmittance_fine_vis'].squeeze()) ** 2).mean()\n\n        return loss\n\n\nclass InterPosEmbedding(nn.Module):\n    def __init__(self, N_freqs=10):\n        super(InterPosEmbedding, self).__init__()\n        self.N_freqs = N_freqs\n        self.funcs = [torch.sin, torch.cos]\n\n        # [2^0,2^1,...,2^(n-1)]: for sin\n        self.freq_band_1 = 2 ** torch.linspace(0, N_freqs - 1, N_freqs)\n        # [4^0,4^1,...,4^(n-1)]: for diag(∑)\n        self.freq_band_2 = self.freq_band_1 ** 2\n\n    def forward(self, mu, diagE):\n        sin_out = []\n        sin_cos = []\n        for freq in self.freq_band_1:\n            for func in self.funcs:\n                sin_cos.append(func(freq * mu))\n            sin_out.append(sin_cos)\n            sin_cos = []\n        # sin_out:list:[sin(mu),cos(mu)]\n        diag_out = []\n        for freq in self.freq_band_2:\n            diag_out.append(freq * diagE)\n        # diag_out:list:[4^(L-1)*diag(∑)]\n        out = []\n        for sc_γ, diag_Eγ in zip(sin_out, diag_out):\n            # torch.exp(-0.5 * x_var) * 
torch.sin(x)\n            for sin_cos in sc_γ:  # [sin,cos]\n                out.append(sin_cos * torch.exp(-0.5 * diag_Eγ))\n        return torch.cat(out, -1)\n\n\nclass PosEmbedding(nn.Module):\n    def __init__(self, N_freqs):\n        \"\"\"\n        Defines a function that embeds x to (x, sin(2^k x), cos(2^k x), ...)\n        in_channels: number of input channels (3 for both xyz and direction)\n        \"\"\"\n        super().__init__()\n        self.N_freqs = N_freqs\n        self.funcs = [torch.sin, torch.cos]\n        # [2^0,2^1,...,2^(n-1)]\n        self.freq_bands = 2 ** torch.linspace(0, N_freqs - 1, N_freqs)\n\n    def forward(self, x):\n        out = []\n        for freq in self.freq_bands:  # [2^0,2^1,...,2^(n-1)]\n            for func in self.funcs:\n                out += [func(freq * x)]\n        # xyz——>63,dir——>27\n        return torch.cat(out, -1)\n\nclass Block_NeRF(nn.Module):\n    def __init__(self, D=8, W=256, skips=[4],\n                 in_channel_xyz=60, in_channel_dir=24,\n                 in_channel_exposure=8,  # exposure is in 1d and dirs are in 3d\n                 in_channel_appearance=32,\n                 add_apperance=True,\n                 add_exposure=True):\n        # input：[xyz60,dir24,exposure24,appearance24]\n        super(Block_NeRF, self).__init__()\n        self.D = D\n        self.W = W\n        self.skips = skips\n        self.in_channel_xyz = in_channel_xyz\n        self.in_channel_dir = in_channel_dir\n        self.in_channel_exposure = in_channel_exposure\n        self.in_channel_appearance = in_channel_appearance\n        self.add_appearance = add_apperance\n        self.add_exposure = add_exposure\n\n        for i in range(D):\n            if i == 0:\n                layer = nn.Linear(in_channel_xyz, W)\n            elif i in skips:\n                layer = nn.Linear(W + in_channel_xyz, W)\n            else:\n                layer = nn.Linear(W, W)\n            layer = nn.Sequential(layer, nn.ReLU(True))\n        
    setattr(self, f'xyz_encoding_{i + 1}', layer)\n        self.xyz_encoding_final = nn.Linear(W, W)\n\n        input_channel = W + in_channel_dir\n        if add_apperance:\n            input_channel += in_channel_appearance\n        if add_exposure:\n            input_channel += in_channel_exposure\n        self.dir_encoding = nn.Sequential(\n            nn.Linear(\n                input_channel,\n                W // 2\n            ), nn.ReLU(True),\n            nn.Linear(W // 2, W // 2), nn.ReLU(True),\n            nn.Linear(W // 2, W // 2), nn.ReLU(True)\n        )\n\n        self.static_sigma = nn.Sequential(nn.Linear(W, 1), nn.Softplus())\n        self.static_rgb = nn.Sequential(nn.Linear(W // 2, 3), nn.Sigmoid())\n\n    def forward(self, x, sigma_only=False):\n        if sigma_only:\n            input_xyz = x\n        else:\n            input_xyz, input_dir, input_exp, input_appear = torch.split(x, [self.in_channel_xyz, self.in_channel_dir,\n                                                                        self.in_channel_exposure,\n                                                                        self.in_channel_appearance], dim=-1)\n        xyz = input_xyz\n        for i in range(self.D):\n            if i in self.skips:\n                xyz = torch.cat([xyz, input_xyz], dim=-1)\n            xyz = getattr(self, f'xyz_encoding_{i + 1}')(xyz)\n\n        static_sigma = self.static_sigma(xyz)\n        if sigma_only:\n            return static_sigma\n\n        xyz_feature = self.xyz_encoding_final(xyz)\n        input_xyz_feature = torch.cat([xyz_feature, input_dir], dim=-1)\n        if self.add_exposure:\n            input_xyz_feature = torch.cat([input_xyz_feature, input_exp], dim=-1)\n        if self.add_appearance:\n            input_xyz_feature = torch.cat([input_xyz_feature, input_appear], dim=-1)\n        \n        dir_encoding = self.dir_encoding(input_xyz_feature)\n\n        static_rgb = self.static_rgb(dir_encoding)\n        
static_rgb_sigma = torch.cat([static_rgb, static_sigma], dim=-1)\n\n        return static_rgb_sigma\n\n\nclass Visibility(nn.Module):\n    def __init__(self,\n                 in_channel_xyz=60, in_channel_dir=24,\n                 W=128):\n        super(Visibility, self).__init__()\n        self.in_channel_xyz = in_channel_xyz\n        self.in_channel_dir = in_channel_dir\n\n        self.vis_encoding = nn.Sequential(\n            nn.Linear(in_channel_xyz + in_channel_dir, W), nn.ReLU(True),\n            nn.Linear(W, W), nn.ReLU(True),\n            nn.Linear(W, W), nn.ReLU(True),\n            nn.Linear(W, W), nn.ReLU(True),\n        )\n        self.visibility = nn.Sequential(nn.Linear(W, 1), nn.Softplus())\n\n    def forward(self, x):\n        vis_encode = self.vis_encoding(x)\n        visibility = self.visibility(vis_encode)\n        return visibility\n"
  },
  {
    "path": "block_nerf/block_visualize.py",
    "content": "import torchvision.transforms as T\nimport numpy as np\nimport cv2\nfrom PIL import Image\n\ndef visualize_depth(depth, cmap=cv2.COLORMAP_JET):\n    \"\"\"\n    depth: (H, W)\n    \"\"\"\n    x = depth.cpu().numpy()\n    x = np.nan_to_num(x) # change nan to 0\n    mi = np.min(x) # get minimum depth\n    ma = np.max(x)\n    x = (x-mi)/max(ma-mi, 1e-8) # normalize to 0~1\n    x = (255*x).astype(np.uint8)\n    x_ = Image.fromarray(cv2.applyColorMap(x, cmap))\n    x_ = T.ToTensor()(x_) # (3, H, W)\n    return x_\n"
  },
  {
    "path": "block_nerf/learning_utils.py",
    "content": "import torch\n# optimizer\nfrom torch.optim import SGD, Adam\nimport torch_optimizer as optim\n# scheduler\nfrom torch.optim.lr_scheduler import CosineAnnealingLR, MultiStepLR, LambdaLR\nfrom block_nerf.block_visualize import *\nfrom torch.optim.lr_scheduler import _LRScheduler\nfrom torch.optim.lr_scheduler import ReduceLROnPlateau\n\n\nclass GradualWarmupScheduler(_LRScheduler):\n    \"\"\" Gradually warm-up(increasing) learning rate in optimizer.\n    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.\n    Args:\n        optimizer (Optimizer): Wrapped optimizer.\n        multiplier: target learning rate = base lr * multiplier\n        total_epoch: target learning rate is reached at total_epoch, gradually\n        after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)\n    \"\"\"\n\n    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):\n        self.multiplier = multiplier\n        if self.multiplier < 1.:\n            raise ValueError('multiplier should be greater than or equal to 1.')\n        self.total_epoch = total_epoch\n        self.after_scheduler = after_scheduler\n        self.finished = False\n        super().__init__(optimizer)\n\n    def get_lr(self):\n        if self.last_epoch > self.total_epoch:\n            if self.after_scheduler:\n                if not self.finished:\n                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]\n                    self.finished = True\n                return self.after_scheduler.get_lr()\n            return [base_lr * self.multiplier for base_lr in self.base_lrs]\n\n        return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) 
for base_lr in self.base_lrs]\n\n    def step_ReduceLROnPlateau(self, metrics, epoch=None):\n        if epoch is None:\n            epoch = self.last_epoch + 1\n        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning\n        if self.last_epoch <= self.total_epoch:\n            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]\n            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):\n                param_group['lr'] = lr\n        else:\n            if epoch is None:\n                self.after_scheduler.step(metrics, None)\n            else:\n                self.after_scheduler.step(metrics, epoch - self.total_epoch)\n\n    def step(self, epoch=None, metrics=None):\n        if type(self.after_scheduler) != ReduceLROnPlateau:\n            if self.finished and self.after_scheduler:\n                if epoch is None:\n                    self.after_scheduler.step(None)\n                else:\n                    self.after_scheduler.step(epoch - self.total_epoch)\n            else:\n                return super(GradualWarmupScheduler, self).step(epoch)\n        else:\n            self.step_ReduceLROnPlateau(metrics, epoch)\n\n\ndef get_parameters(models):\n    \"\"\"Get all model parameters recursively.\"\"\"\n    parameters = []\n    if isinstance(models, list):\n        for model in models:\n            parameters += get_parameters(model)\n    elif isinstance(models, dict):\n        for model in models.values():\n            parameters += get_parameters(model)\n    else: # models is actually a single pytorch model\n        parameters += list(models.parameters())\n    return parameters\n\n\ndef get_optimizer(hparams, models):\n    eps = 1e-8\n    parameters = get_parameters(models)\n    if hparams['optimizer'] == 'sgd':\n        optimizer = SGD(parameters, lr=hparams['lr'],\n            
            momentum=hparams['momentum'], weight_decay=hparams['weight_decay'])\n    elif hparams['optimizer'] == 'adam':\n        optimizer = Adam(parameters, lr=hparams['lr'], eps=eps,\n                         weight_decay=hparams['weight_decay'])\n    elif hparams['optimizer'] == 'radam':\n        optimizer = optim.RAdam(parameters, lr=hparams['lr'], eps=eps,\n                                weight_decay=hparams['weight_decay'])\n    elif hparams['optimizer'] == 'ranger':\n        optimizer = optim.Ranger(parameters, lr=hparams['lr'], eps=eps,\n                                 weight_decay=hparams['weight_decay'])\n    else:\n        raise ValueError('optimizer not recognized!')\n\n    return optimizer\n\ndef get_scheduler(hparams, optimizer):\n    eps = 1e-8\n    if hparams['lr_scheduler'] == 'steplr':\n        scheduler = MultiStepLR(optimizer, milestones=hparams['decay_step'],\n                                gamma=hparams['decay_gamma'])\n    elif hparams['lr_scheduler'] == 'cosine':\n        scheduler = CosineAnnealingLR(optimizer, T_max=hparams['num_epochs'], eta_min=eps)\n    elif hparams['lr_scheduler'] == 'poly':\n        scheduler = LambdaLR(optimizer,\n                             lambda epoch: (1-epoch/hparams['num_epochs'])**hparams['poly_exp'])\n    else:\n        raise ValueError('scheduler not recognized!')\n\n    if hparams['warmup_epochs'] > 0 and hparams['optimizer'] not in ['radam', 'ranger']:\n        scheduler = GradualWarmupScheduler(optimizer, multiplier=hparams['warmup_multiplier'],\n                                           total_epoch=hparams['warmup_epochs'], after_scheduler=scheduler)\n\n    return scheduler\n\ndef get_learning_rate(optimizer):\n    for param_group in optimizer.param_groups:\n        return param_group['lr']\n\ndef extract_model_state_dict(ckpt_path, model_name='model', prefixes_to_ignore=[]):\n    checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))\n    checkpoint_ = {}\n    if 'state_dict' in checkpoint: # 
if it's a pytorch-lightning checkpoint\n        checkpoint = checkpoint['state_dict']\n    for k, v in checkpoint.items():\n        if not k.startswith(model_name):\n            continue\n        k = k[len(model_name)+1:]\n        for prefix in prefixes_to_ignore:\n            if k.startswith(prefix):\n                print('ignore', k)\n                break\n        else:\n            checkpoint_[k] = v\n    return checkpoint_\n\ndef load_ckpt(model, ckpt_path, model_name='model', prefixes_to_ignore=[]):\n    if not ckpt_path:\n        return\n    model_dict = model.state_dict()\n    checkpoint_ = extract_model_state_dict(ckpt_path, model_name, prefixes_to_ignore)\n    model_dict.update(checkpoint_)\n    model.load_state_dict(model_dict)\n"
  },
  {
    "path": "block_nerf/metrics.py",
    "content": "import torch\nfrom kornia.losses import ssim as dssim\n\ndef mse(image_pred, image_gt, valid_mask=None, reduction='mean'):\n    value = (image_pred-image_gt)**2\n    if valid_mask is not None:\n        value = value[valid_mask]\n    if reduction == 'mean':\n        return torch.mean(value)\n    return value\n\ndef psnr(image_pred, image_gt, valid_mask=None, reduction='mean'):\n    return -10*torch.log10(mse(image_pred, image_gt, valid_mask, reduction))\n\ndef ssim(image_pred, image_gt, reduction='mean'):\n    \"\"\"\n    image_pred and image_gt: (1, 3, H, W)\n    \"\"\"\n    dssim_ = dssim(image_pred, image_gt, 3, reduction) # dissimilarity in [0, 1]\n    return 1-2*dssim_ # in [-1, 1]"
  },
  {
    "path": "block_nerf/rendering.py",
    "content": "import torch\nfrom einops import rearrange, reduce, repeat\nimport pdb\nfrom block_nerf.block_nerf_model import *\nfrom block_nerf.block_nerf_lightning import *\n\n\ndef get_cone_mean_conv(\n    t_samples,\n    rays_o,\n    rays_d,\n    radii,\n    ):\n    t0 = t_samples[..., :-1]  # left side\n    t1 = t_samples[..., 1:]  # right side\n    middle_t = (t0 + t1) / 2\n    difference_t = (t1 - t0) / 2\n    mean_t = middle_t + 2 * middle_t * difference_t ** 2 / (3\n            * middle_t ** 2 + difference_t ** 2)\n    variance_t = difference_t ** 2 / 3 - 4 / 15 * (difference_t ** 4\n            * (12 * middle_t ** 2 - difference_t ** 2) / (3 * middle_t\n            ** 2 + difference_t ** 2) ** 2)\n    variance_r = radii ** 2 * (middle_t ** 2 / 4 + 5 / 12\n                               * difference_t ** 2 - 4 / 15\n                               * difference_t ** 4 / (3 * middle_t ** 2\n                               + difference_t ** 2))\n    rays_d = rearrange(rays_d, 'n1 c -> n1 1 c')\n    rays_o = rearrange(rays_o, 'n1 c -> n1 1 c')\n    mean = rays_o + rays_d * rearrange(mean_t, 'n1 n2 -> n1 n2 1')\n    rays_d = rays_d.squeeze()  # [1024,3]\n    rays_o = rays_o.squeeze()  # [1024,3]\n    dod = rays_d ** 2\n    direct_norm = torch.sum(dod, dim=-1, keepdim=True) + 1e-10\n    diagE = rearrange(variance_t, 'n1 c -> n1 c 1') * rearrange(dod,\n            'n1 c -> n1 1 c') + rearrange(variance_r, 'n1 c -> n1 c 1') \\\n        * rearrange(1 - dod / direct_norm, 'n1 c -> n1 1 c')\n    return (mean_t, mean, diagE)  # [1024,64,3] [1024,64,3]\n\n\n\ndef sample_pdf(\n    bins,\n    weights,\n    N_importance,\n    alpha=1e-2,\n    ):\n    (N_rays, N_samples_) = weights.shape\n    weights_pad = torch.cat([weights[..., :1], weights, weights[..., -1:\n                            ]], dim=-1)\n    weights_max = torch.maximum(weights_pad[..., :-1], weights_pad[...,\n                                
1:])\n    weights_blur = 0.5 * (weights_max[..., :-1] + weights_max[..., 1:])\n\n    weights = weights_blur + alpha\n    pdf = weights / reduce(weights, 'n1 n2 -> n1 1', 'sum')\n\n    cdf = torch.cumsum(pdf, -1)\n\n    # (N_rays, N_samples_+1)\n\n    cdf = torch.cat([torch.zeros_like(cdf[:, :1]), cdf], -1)\n\n    # padded to 0~1 inclusive\n\n    u = torch.linspace(0, 1, N_importance + 1, device=bins.device)\n    u = u.expand(N_rays, N_importance + 1)\n    u = u.contiguous()\n    inds = torch.searchsorted(cdf, u, right=True)\n    below = torch.clamp_min(inds - 1, 0)\n    above = torch.clamp_max(inds, N_samples_)\n\n    inds_sampled = rearrange(torch.stack([below, above], -1),\n                             'n1 n2 c -> n1 (n2 c)', c=2)\n    cdf_g = rearrange(torch.gather(cdf, 1, inds_sampled),\n                      'n1 (n2 c) -> n1 n2 c', c=2)\n    bins_g = rearrange(torch.gather(bins, 1, inds_sampled),\n                       'n1 (n2 c) -> n1 n2 c', c=2)\n\n    denom = cdf_g[..., 1] - cdf_g[..., 0]\n    denom[denom < alpha] = 1  # denom equals 0 means a bin has weight 0,\n\n    # in which case it will not be sampled\n    # anyway, therefore any value for it is fine (set to 1 here)\n\n    samples = bins_g[..., 0] + (u - cdf_g[..., 0]) / denom \\\n        * (bins_g[..., 1] - bins_g[..., 0])\n    return samples\n\n\ndef volume_rendering(\n    rgbs=None,\n    sigmas=None,\n    z_vals=None,\n    mean_t=None,\n    type='train',\n    ):\n    deltas = z_vals[:, 1:] - z_vals[:, :-1]\n    noise = torch.randn_like(sigmas)\n    alphas = 1 - torch.exp(-deltas * torch.relu(sigmas + noise))\n    alphas_shifted = torch.cat([torch.ones_like(alphas[:, :1]), 1\n                               - alphas + 1e-10], -1)\n    Ti = torch.cumprod(alphas_shifted[:, :-1], -1)\n    weights = alphas * Ti\n    weights_sum = reduce(weights, 'n1 n2 -> n1', 'sum')\n\n    # volumetric rendering\n\n    results = {}\n    results['transmittance'] = Ti\n    results['weights'] = weights\n    results['opacity'] = 
weights_sum\n    results['z_vals'] = z_vals\n\n    if type == 'test_coarse':\n        return results\n    rgb_map = reduce(rearrange(weights, 'n1 n2 -> n1 n2 1') * rgbs,\n                     'n1 n2 c -> n1 c', 'sum')\n    depth_map = reduce(weights * mean_t, 'n1 n2 -> n1', 'sum')\n\n    results['rgb'] = rgb_map\n    results['depth'] = depth_map\n    return results\n\n#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\n\ndef render_rays(\n    models,\n    embedding,\n    rays,\n    ts,\n    N_samples=64,\n    N_importance=64,\n    chunk=1024,\n    type='train',\n    use_disp=False,\n    ):\n\n    N_rays = rays.shape[0]\n    (\n        rays_o,\n        rays_d,\n        radii,\n        exposure,\n        near,\n        far,\n        ) = torch.split(rays, [\n        3,\n        3,\n        1,\n        1,\n        1,\n        1,\n        ], dim=-1)\n\n    # first handle the coarse network\n\n    z_steps = torch.linspace(0, 1, N_samples + 1, device=rays.device)  # sample N_samples+1 points to form N_samples regions\n\n    if not use_disp:  # use linear sampling in depth space\n        z_vals = near * (1 - z_steps) + far * z_steps\n    else:\n\n           # use linear sampling in disparity space\n        # z_vals = 1 / (1 / near * (1 - z_steps) + 1 / far * z_steps)\n\n        z_vals = torch.exp(torch.log(near) * (1 - z_steps)\n                           + torch.log(far) * z_steps)\n\n    # z_vals = near + (far - near) * z_steps\n\n    z_vals_coarse = z_vals.expand(N_rays, N_samples + 1)\n\n    z_vals_mid = 0.5 * (z_vals_coarse[:, :-1] + z_vals_coarse[:, 1:])  # (N_rays, N_samples-1) interval mid points\n\n    # get intervals between samples\n\n    upper = torch.cat([z_vals_mid, z_vals[:, -1:]], -1)\n    lower = torch.cat([z_vals[:, :1], z_vals_mid], -1)\n\n    perturb_rand = 1 * torch.rand_like(z_vals)\n    z_vals_coarse = lower + (upper - lower) * perturb_rand\n\n    (mean_t_coarse, mean_coarse, diagE_coarse) = \\\n        get_cone_mean_conv(z_vals_coarse, rays_o, rays_d, 
radii)\n    if type == 'train':\n        IPE = embedding['IPE']\n        PE = embedding['PE']\n        appearance_encoding = embedding['appearance']\n\n        # ########\n\n        sample_coarse_encode = IPE(mean_coarse, diagE_coarse)\n        sample_coarse_encode = rearrange(sample_coarse_encode,\n                'n1 n2 c -> (n1 n2) c')\n\n        # ########\n\n        dir_coarse_encode = PE(rays_d)\n        dir_coarse_encode = repeat(dir_coarse_encode,\n                                   'n1 c -> (n1 n2) c', n2=N_samples)\n\n        # ########\n\n        exp_encode = PE(exposure)\n        exp_coarse_encode = repeat(exp_encode, 'n1 c -> (n1 n2) c',\n                                   n2=N_samples)\n        appearance_encode = appearance_encoding(ts)  # [1024,32]\n        appearance_coarse_encode = repeat(appearance_encode,\n                'n1 c -> (n1 n2) c', n2=N_samples)\n\n        xyzdir_coarse_encode_f_variance = \\\n            torch.cat([sample_coarse_encode, dir_coarse_encode,\n                      exp_coarse_encode, appearance_coarse_encode],\n                      dim=-1)\n        xyzdir_coarse_encode_f_v = torch.cat([sample_coarse_encode,\n                dir_coarse_encode], dim=-1)\n\n        out_coarse_rgb_sigma = []\n        out_coarse_visibility = []\n        for i in range(0, xyzdir_coarse_encode_f_variance.shape[0],\n                       chunk):\n            result = models['block_model'\n                            ](xyzdir_coarse_encode_f_variance[i:i\n                              + chunk])\n            out_coarse_rgb_sigma.append(result)\n        out_coarse_rgb_sigma = torch.cat(out_coarse_rgb_sigma, 0)\n        out_coarse_rgb_sigma = rearrange(out_coarse_rgb_sigma,\n                '(n1 n2) c -> n1 n2 c', n1=N_rays, n2=N_samples, c=4)\n\n        for i in range(0, xyzdir_coarse_encode_f_v.shape[0], chunk):\n            result = models['visibility_model'\n                            ](xyzdir_coarse_encode_f_v[i:i + chunk])\n            
out_coarse_visibility.append(result)\n        out_coarse_visibility = torch.cat(out_coarse_visibility, 0)\n        out_coarse_visibility = rearrange(out_coarse_visibility,\n                '(n1 n2) c -> n1 n2 c', n1=N_rays, n2=N_samples, c=1)\n\n        rgbs_coarse = out_coarse_rgb_sigma[..., :3]\n        sigmas_coarse = out_coarse_rgb_sigma[..., 3]\n        results_coarse = volume_rendering(rgbs_coarse, sigmas_coarse,\n                z_vals_coarse, mean_t_coarse)\n\n        # #################################\n        # handling the fine network\n        # inverse sampling\n\n        z_vals_mid = 0.5 * (z_vals_coarse[:, :-1] + z_vals_coarse[:, 1:\n                            ])\n        z_vals_fine = sample_pdf(z_vals_mid, results_coarse['weights'][:\n                                 , 1:-1].detach(), N_importance)\n        z_vals_fine = torch.sort(torch.cat([z_vals_coarse,\n                                 z_vals_fine], -1), -1)[0]\n        (mean_t_fine, mean_fine, diagE_fine) = \\\n            get_cone_mean_conv(z_vals_fine, rays_o, rays_d, radii)\n\n        sample_fine_encode = IPE(mean_fine, diagE_fine)\n        sample_fine_encode = rearrange(sample_fine_encode,\n                'n1 n2 c -> (n1 n2) c')\n        dir_fine_encode = PE(rays_d)\n        dir_fine_encode = repeat(dir_fine_encode, 'n1 c -> (n1 n2) c',\n                                 n2=N_samples + N_importance + 1)\n        appearance_fine_encode = repeat(appearance_encode,\n                'n1 c -> (n1 n2) c', n2=N_samples + N_importance + 1)\n        exp_fine_encode = repeat(exp_encode, 'n1 c -> (n1 n2) c',\n                                 n2=N_samples + N_importance + 1)\n\n        xyzdir_fine_encode_f_variance = torch.cat([sample_fine_encode,\n                dir_fine_encode, exp_fine_encode,\n                appearance_fine_encode], dim=-1)\n        xyzdir_fine_encode_f_v = torch.cat([sample_fine_encode,\n                dir_fine_encode], dim=-1)\n\n        out_fine_rgb_sigma = []\n        
out_fine_visibility = []\n        for i in range(0, xyzdir_fine_encode_f_variance.shape[0],\n                       chunk):\n            result = models['block_model'\n                            ](xyzdir_fine_encode_f_variance[i:i\n                              + chunk])\n            out_fine_rgb_sigma.append(result)\n        out_fine_rgb_sigma = torch.cat(out_fine_rgb_sigma, 0)\n        out_fine_rgb_sigma = rearrange(out_fine_rgb_sigma,\n                '(n1 n2) c -> n1 n2 c', n1=N_rays, n2=N_samples\n                + N_importance + 1, c=4)\n\n        for i in range(0, xyzdir_fine_encode_f_v.shape[0], chunk):\n            result = models['visibility_model'\n                            ](xyzdir_fine_encode_f_v[i:i + chunk])\n            out_fine_visibility.append(result)\n        out_fine_visibility = torch.cat(out_fine_visibility, 0)\n        out_fine_visibility = rearrange(out_fine_visibility,\n                '(n1 n2) c -> n1 n2 c', n1=N_rays, n2=N_samples\n                + N_importance + 1, c=1)\n\n        rgbs_fine = out_fine_rgb_sigma[..., :3]\n        sigmas_fine = out_fine_rgb_sigma[..., 3]\n        results_fine = volume_rendering(rgbs_fine, sigmas_fine,\n                z_vals_fine, mean_t_fine)\n\n        result = {}\n        result['rgb_coarse'] = results_coarse['rgb']\n        result['rgb_fine'] = results_fine['rgb']\n        result['depth_fine'] = results_fine['depth']\n        result['transmittance_coarse_real'] = \\\n            results_coarse['transmittance']\n        result['transmittance_fine_real'] = results_fine['transmittance'\n                ]\n        result['transmittance_coarse_vis'] = \\\n            out_coarse_visibility.squeeze()\n        result['transmittance_fine_vis'] = out_fine_visibility.squeeze()\n\n        # rearrange(results_fine['transmittance'],\"n1 n2 -> n1 n2 1\").shape\n\n        return result\n    else:\n\n           # for test and val\n\n        IPE = embedding['IPE']\n        PE = embedding['PE']\n        
appearance_encoding = embedding['appearance']\n        exp_encode = PE(exposure)\n        appearance_encode = appearance_encoding(ts)  # [1024,32]\n        sample_coarse_encode = IPE(mean_coarse, diagE_coarse)\n        sample_coarse_encode = rearrange(sample_coarse_encode,\n                'n1 n2 c -> (n1 n2) c')\n\n        xyzdir_coarse_encode_f_variance = sample_coarse_encode\n\n        out_coarse_sigma = []\n        for i in range(0, xyzdir_coarse_encode_f_variance.shape[0],\n                       chunk):\n            result = models['block_model'\n                            ](xyzdir_coarse_encode_f_variance[i:i\n                              + chunk], sigma_only=True)\n            out_coarse_sigma.append(result)\n        out_coarse_sigma = torch.cat(out_coarse_sigma, 0)\n        out_coarse_sigma = rearrange(out_coarse_sigma,\n                '(n1 n2) c -> n1 n2 c', n1=N_rays, n2=N_samples, c=1)\n\n        sigmas_coarse = out_coarse_sigma.squeeze()\n        results_coarse = volume_rendering(sigmas=sigmas_coarse,\n                z_vals=z_vals_coarse, type='test_coarse')\n\n        # #################################\n\n        z_vals_mid = 0.5 * (z_vals_coarse[:, :-1] + z_vals_coarse[:, 1:\n                            ])\n        z_vals_fine = sample_pdf(z_vals_mid, results_coarse['weights'][:\n                                 , 1:-1].detach(), N_importance)\n        z_vals_fine = torch.sort(torch.cat([z_vals_coarse,\n                                 z_vals_fine], -1), -1)[0]\n        (mean_t_fine, mean_fine, diagE_fine) = \\\n            get_cone_mean_conv(z_vals_fine, rays_o, rays_d, radii)\n\n        sample_fine_encode = IPE(mean_fine, diagE_fine)\n        sample_fine_encode = rearrange(sample_fine_encode,\n                'n1 n2 c -> (n1 n2) c')\n        dir_fine_encode = PE(rays_d)\n        dir_fine_encode = repeat(dir_fine_encode, 'n1 c -> (n1 n2) c',\n                                 n2=N_samples + N_importance + 1)\n        appearance_fine_encode = 
repeat(appearance_encode,\n                'n1 c -> (n1 n2) c', n2=N_samples + N_importance + 1)\n        exp_fine_encode = repeat(exp_encode, 'n1 c -> (n1 n2) c',\n                                 n2=N_samples + N_importance + 1)\n\n        xyzdir_fine_encode_f_variance = torch.cat([sample_fine_encode,\n                dir_fine_encode, exp_fine_encode,\n                appearance_fine_encode], dim=-1)\n        xyzdir_fine_encode_f_v = torch.cat([sample_fine_encode,\n                dir_fine_encode], dim=-1)\n\n        out_fine_rgb_sigma = []\n        out_fine_visibility = []\n        for i in range(0, xyzdir_fine_encode_f_variance.shape[0],\n                       chunk):\n            result = models['block_model'\n                            ](xyzdir_fine_encode_f_variance[i:i\n                              + chunk])\n            out_fine_rgb_sigma.append(result)\n        out_fine_rgb_sigma = torch.cat(out_fine_rgb_sigma, 0)\n        out_fine_rgb_sigma = rearrange(out_fine_rgb_sigma,\n                '(n1 n2) c -> n1 n2 c', n1=N_rays, n2=N_samples\n                + N_importance + 1, c=4)\n\n        for i in range(0, xyzdir_fine_encode_f_v.shape[0], chunk):\n            result = models['visibility_model'\n                            ](xyzdir_fine_encode_f_v[i:i + chunk])\n            out_fine_visibility.append(result)\n        out_fine_visibility = torch.cat(out_fine_visibility, 0)\n        out_fine_visibility = rearrange(out_fine_visibility,\n                '(n1 n2) c -> n1 n2 c', n1=N_rays, n2=N_samples\n                + N_importance + 1, c=1)\n\n        rgbs_fine = out_fine_rgb_sigma[..., :3]\n        sigmas_fine = out_fine_rgb_sigma[..., 3]\n        results_fine = volume_rendering(rgbs_fine, sigmas_fine,\n                z_vals_fine, mean_t_fine)\n\n        result = {}\n        result['rgb_fine'] = results_fine['rgb']\n        result['depth_fine'] = results_fine['depth']\n        result['transmittance_fine_vis'] = out_fine_visibility.squeeze()\n        
return result\n"
  },
  {
    "path": "block_nerf/waymo_dataset.py",
    "content": "import numpy as np\nimport os\nfrom PIL import Image\nfrom torchvision import transforms\nimport torch\nfrom torch.utils.data import Dataset\nimport json\nfrom kornia import create_meshgrid\nfrom tqdm import tqdm\nimport pdb\n\n\ndef get_ray_directions(H, W, K):\n    grid = create_meshgrid(H, W, normalized_coordinates=False)[0]\n    i, j = grid.unbind(-1)\n    # the direction here is without +0.5 pixel centering as calibration is not so accurate\n    # see https://github.com/bmild/nerf/issues/24\n    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]\n    directions = \\\n        torch.stack([(i - cx) / fx, -(j - cy) / fy, -\n        torch.ones_like(i)], -1)  # (H, W, 3)\n    # normalized camera space to pixel space\n    directions = directions / torch.norm(directions, dim=-1, keepdim=True)\n    return directions\n\n\ndef get_rays(directions, c2w):\n    # Rotate ray directions from camera coordinate to the world coordinate\n    rays_d = directions @ c2w[:, :3].T  # (H, W, 3)\n\n    # normalized before directions\n    # rays_d = rays_d / torch.norm(rays_d, dim=-1, keepdim=True)\n    # The origin of all rays is the camera origin in world coordinate\n    rays_o = c2w[:, 3].expand(rays_d.shape)  # (H, W, 3)\n\n    return rays_o, rays_d\n\n\ndef find_idx_name(elements, img_name):\n    #  image name to index\n    for element in elements:\n        if img_name in element:\n            return element[1]\n    return None\n\ndef find_nearest_idx(img_source,block_elements,train_meta):\n    # fetch te closest image according to figure.\n    cam_idx=img_source['cam_idx']\n    distance = 1000\n    img_idx=None\n    img_nearest=None\n    for element in block_elements:\n        if train_meta[element[0]]['cam_idx']==cam_idx:  # if this is the same camera\n            distance_temp = np.linalg.norm(np.array(img_source['origin_pos']) - np.array(train_meta[element[0]]['origin_pos']))\n            if distance_temp < distance:\n                distance = distance_temp\n  
              img_idx=element[1]\n                img_nearest=train_meta[element[0]]\n    return img_idx\n\n\nclass WaymoDataset(Dataset):\n    def __init__(self, root_dir, split='train', block='block_0',\n                 img_downscale=4,\n                 near=0.01, far=15,\n                 test_img_name=None,\n                 cam_begin=None,\n                 cam_end=None):\n        self.root_dir = root_dir\n        self.split = split\n        self.block = block\n        self.img_downscale = img_downscale\n        self.near = near\n        self.far = far\n        self.test_img_name = test_img_name\n        \n        # cam_begin is the start location of the compose phase\n        self.cam_begin=cam_begin \n        self.cam_end=cam_end\n        self.transform = transforms.ToTensor()\n        self.read_json()\n\n    def read_json(self):\n        if self.split == \"test\" or self.split==\"compose\":  # test stage use the data in train.json\n            # with open(os.path.join(self.root_dir, f'train/train_all_meta.json'), 'r') as fp:\n            #     self.meta = json.load(fp)\n            self.meta = torch.load(os.path.join(self.root_dir, f'train/train_all_meta.pt'))\n            with open(os.path.join(self.root_dir, f'train/split_block_train.json'), 'r') as fp:\n                self.block_split_info = json.load(fp)\n        else:\n            # with open(os.path.join(self.root_dir, f'{self.split}/{self.split}_all_meta.json'), 'r') as fp:\n            #     self.meta = json.load(fp)\n            self.meta = torch.load(os.path.join(self.root_dir, f'{self.split}/{self.split}_all_meta.pt'))\n            with open(os.path.join(self.root_dir, f'{self.split}/split_block_{self.split}.json'), 'r') as fp:\n                self.block_split_info = json.load(fp)\n\n        if self.split == \"train\":\n            self.image_path = []\n            self.all_rays = []\n            self.all_rgbs = []\n            self.c2w = {}\n\n            print(\"Loading the image...\")\n\n  
          for img_idx in tqdm(self.block_split_info[self.block]['elements']):\n                '''\n                img_idx:[image_name,index]\n                '''\n                img_info = self.meta[img_idx[0]]\n                self.image_path.append(img_info['image_name'])\n                exposure = torch.tensor(img_info['equivalent_exposure'])\n                c2w = torch.FloatTensor(img_info['c2w'].float())\n                self.c2w[img_idx[0]] = c2w\n\n                width = img_info['W'] // self.img_downscale\n                height = img_info['H'] // self.img_downscale\n\n                # img = Image.open(os.path.join(\n                #     self.root_dir, 'images', img_info['image_name'])).convert('RGB')\n                img = Image.open(os.path.join(\n                    self.root_dir, 'train', 'rgbs', img_info['image_name'] + \".png\")).convert('RGB')\n                if self.img_downscale != 1:\n                    img = img.resize((width, height), Image.Resampling.LANCZOS)\n                img = self.transform(img)  # (3,h,w)\n                img = img.view(3, -1).permute(1, 0)\n                self.all_rgbs.append(img)\n\n                K = np.zeros((3, 3), dtype=np.float32)\n                # fx=focal,fy=focal,cx=img_w/2,cy=img_h/2\n                K[0, 0] = img_info['intrinsics'][0] // self.img_downscale\n                K[1, 1] = img_info['intrinsics'][1] // self.img_downscale\n                K[0, 2] = width * 0.5\n                K[1, 2] = height * 0.5\n                K[2, 2] = 1\n\n                directions = get_ray_directions(height, width, K)\n                rays_o, rays_d = get_rays(directions, c2w)\n\n                # calculate the radius\n                dx_1 = torch.sqrt(\n                    torch.sum((rays_d[:-1, :, :] - rays_d[1:, :, :]) ** 2, -1))\n                dx = torch.cat([dx_1, dx_1[-2:-1, :]], 0)\n                radii = dx[..., None] * 2 / torch.sqrt(torch.tensor(12))\n                '''\n                rays_d = 
torch.tensor(\n                    np.load(os.path.join(self.root_dir, \"images\", f\"{img_idx[0]}_ray_dirs.npy\"))\n                )\n                rays_o = torch.tensor(\n                    np.load(os.path.join(self.root_dir, \"images\", f\"{img_idx[0]}_ray_origins.npy\"))\n                )\n                '''\n                # rays_d has been normalized\n                rays_d = rays_d.view(-1, 3)\n                rays_o = rays_o.view(-1, 3)\n                radii = radii.view(-1, 1)\n\n                rays_t = img_idx[1] * torch.ones(len(rays_o), 1)\n\n                self.all_rays.append(\n                    torch.cat([\n                        rays_o, rays_d,\n                        radii,\n                        exposure * torch.ones_like(rays_o[:, :1]),\n                        self.near * torch.ones_like(rays_o[:, :1]),\n                        self.far * torch.ones_like(rays_o[:, :1]),\n                        rays_t], -1)\n                )\n\n            # ((N_images-1)*h*w, 8)\n            self.all_rays = torch.cat(self.all_rays, 0)\n            # ((N_images-1)*h*w, 3)\n            self.all_rgbs = torch.cat(self.all_rgbs, 0)\n\n            print(\"Has  totally loaded {0} images and {1} rays!\".format(\n                len(self.image_path), len(self.all_rays)))\n\n        elif self.split == \"test\":\n            self.N_frames = 10\n            self.dy = np.linspace(0,0.2, self.N_frames)\n\n        elif self.split == \"compose\": # input two views with distances\n            print(f\"Now is inferencing the images between {self.cam_begin} and {self.cam_end} ...\")\n            self.img_info = self.meta[self.cam_begin]\n            self.img_info_end = self.meta[self.cam_end]\n\n            origin_begin = self.img_info[\"origin_pos\"]\n            origin_end = self.img_info_end[\"origin_pos\"]\n            self.dx_dy_dz = np.array(origin_begin) - np.array(origin_end)\n            print(f\"The distance between {self.cam_begin} and {self.cam_end} 
is {self.dx_dy_dz}\")\n\n            if self.dx_dy_dz[1]<0.01:\n                self.N_frames=1\n            else:\n                self.N_frames=self.dx_dy_dz[1]//0.01\n\n\n    def __len__(self):\n        if self.split == 'train':\n            return len(self.all_rays)\n        if self.split == 'val':\n            if len(self.block_split_info[self.block]) > 5:\n                return 5\n            else:\n                return len(self.block_split_info[self.block])  # only validate 8 images\n\n        return self.N_frames # test return the num of frames\n\n    def __getitem__(self, idx):\n        if self.split == 'train':\n            sample = {'rays': self.all_rays[idx, :10],\n                      'rgbs': self.all_rgbs[idx],\n                      'ts': self.all_rays[idx, 10].long()}\n        elif self.split==\"val\":  # self.split == 'val':\n            block_info = self.block_split_info[self.block]\n            img_name, img_idx = block_info[idx]\n            print(\"Basic image is {0}\".format(img_name))\n            img_info = self.meta[img_name]\n            exposure = torch.tensor(img_info['equivalent_exposure'])\n\n            c2w = torch.FloatTensor(img_info['c2w'].float())\n\n            width = img_info['W'] // self.img_downscale\n            height = img_info['H'] // self.img_downscale\n\n            if self.split == 'val':\n                img = Image.open(os.path.join(\n                    self.root_dir, 'val', 'rgbs', img_info['image_name'] + \".png\")).convert('RGB')\n                if self.img_downscale != 1:\n                    img = img.resize((width, height),\n                                     Image.Resampling.LANCZOS)  # cv2.imshow(\"123.png\",cv2.cvtColor(np.array(img),cv2.COLOR_BGR2RGB)),cv2.waitKey()\n                img = self.transform(img)  # (3,h,w)\n                img = img.view(3, -1).permute(1, 0)\n\n            self.K = {}\n            K = np.zeros((3, 3), dtype=np.float32)\n            # 
fx=focal,fy=focal,cx=img_w/2,cy=img_h/2\n            K[0, 0] = img_info['intrinsics'][0] // self.img_downscale\n            K[1, 1] = img_info['intrinsics'][1] // self.img_downscale\n            K[0, 2] = width * 0.5\n            K[1, 2] = height * 0.5\n            K[2, 2] = 1\n\n            directions = get_ray_directions(height, width, K)\n            rays_o, rays_d = get_rays(directions, c2w)\n\n            # calculate radius\n            dx_1 = torch.sqrt(\n                torch.sum((rays_d[:-1, :, :] - rays_d[1:, :, :]) ** 2, -1))\n            dx = torch.cat([dx_1, dx_1[-2:-1, :]], 0)\n            radii = dx[..., None] * 2 / torch.sqrt(torch.tensor(12))\n            '''\n            rays_d = torch.tensor(\n                np.load(os.path.join(self.root_dir, \"images\", f\"{img_idx[0]}_ray_dirs.npy\"))\n            )\n            rays_o = torch.tensor(\n                np.load(os.path.join(self.root_dir, \"images\", f\"{img_idx[0]}_ray_origins.npy\"))\n            )\n            '''\n            rays_d = rays_d.view(-1, 3)\n            rays_o = rays_o.view(-1, 3)\n            radii = radii.view(-1, 1)\n\n            rays_t = img_idx * torch.ones(len(rays_o), 1)\n            rays = torch.cat([\n                rays_o, rays_d,\n                radii,\n                exposure * torch.ones_like(rays_o[:, :1]),\n                self.near * torch.ones_like(rays_o[:, :1]),\n                self.far * torch.ones_like(rays_o[:, :1]),\n                rays_t], -1)\n            sample = {\"rays\": rays[:, :10],\n                      \"ts\": rays[:, 10].long(),\n                      \"w_h\": [width, height]}\n            sample[\"rgbs\"]=img\n\n        elif self.split==\"test\": #test\n            img_info = self.meta[self.test_img_name]\n            exposure = torch.tensor(img_info['equivalent_exposure'])\n            c2w = torch.FloatTensor(img_info['c2w'])\n            c2w[1, 3] += self.dy[idx]\n\n            width = img_info['width'] // self.img_downscale\n          
  height = img_info['height'] // self.img_downscale\n\n            self.K = {}\n            K = np.zeros((3, 3), dtype=np.float32)\n            # fx=focal,fy=focal,cx=img_w/2,cy=img_h/2\n            K[0, 0] = img_info['intrinsics'][0] // self.img_downscale\n            K[1, 1] = img_info['intrinsics'][1] // self.img_downscale\n            K[0, 2] = width * 0.5\n            K[1, 2] = height * 0.5\n            K[2, 2] = 1\n\n            directions = get_ray_directions(height, width, K)\n            rays_o, rays_d = get_rays(directions, c2w)\n\n            # calculate radius\n            dx_1 = torch.sqrt(\n                torch.sum((rays_d[:-1, :, :] - rays_d[1:, :, :]) ** 2, -1))\n            dx = torch.cat([dx_1, dx_1[-2:-1, :]], 0)\n            radii = dx[..., None] * 2 / torch.sqrt(torch.tensor(12))\n\n            # rays_d have been normalized\n            rays_d = rays_d.view(-1, 3)\n            rays_o = rays_o.view(-1, 3)\n            radii = radii.view(-1, 1)\n\n\n            img_idx=find_idx_name(self.block_split_info[self.block]['elements'],self.test_img_name)\n\n            if img_idx==None:\n                print(\"It seems that the {0} doesn't belong to {1}\".format(self.test_img_name,self.block))\n                img_idx=find_nearest_idx(img_info,self.block_split_info[self.block]['elements'],self.meta)\n\n\n            rays_t = img_idx * torch.ones(len(rays_o), 1)\n            rays = torch.cat([\n                rays_o, rays_d,\n                radii,\n                exposure * torch.ones_like(rays_o[:, :1]),\n                self.near * torch.ones_like(rays_o[:, :1]),\n                self.far * torch.ones_like(rays_o[:, :1]),\n                rays_t], -1)\n            sample = {\"rays\": rays[:, :10],\n                      \"ts\": rays[:, 10].long(),\n                      \"w_h\": [width, height]}\n\n\n        else: #compose\n            exposure = torch.tensor(self.img_info['equivalent_exposure'])\n            c2w = self.img_info['c2w'].float()\n\n 
           dx = np.linspace(0, self.dx_dy_dz[0], self.N_frames)\n            dy = np.linspace(0, self.dx_dy_dz[1], self.N_frames)\n            dz = np.linspace(0, self.dx_dy_dz[2], self.N_frames)\n\n            c2w[0, 3] += dx[idx]\n            c2w[1, 3] += dy[idx]\n            c2w[2, 3] += dz[idx]\n\n            width = self.img_info['W'] // self.img_downscale\n            height = self.img_info['H'] // self.img_downscale\n\n            self.K = {}\n            K = np.zeros((3, 3), dtype=np.float32)\n            # fx=focal,fy=focal,cx=img_w/2,cy=img_h/2\n            K[0, 0] = self.img_info['intrinsics'][0] // self.img_downscale\n            K[1, 1] = self.img_info['intrinsics'][1] // self.img_downscale\n            K[0, 2] = width * 0.5\n            K[1, 2] = height * 0.5\n            K[2, 2] = 1\n\n            directions = get_ray_directions(height, width, K)\n            rays_o, rays_d = get_rays(directions, c2w)\n\n            # calculare radius\n            dx_1 = torch.sqrt(\n                torch.sum((rays_d[:-1, :, :] - rays_d[1:, :, :]) ** 2, -1))\n            dx = torch.cat([dx_1, dx_1[-2:-1, :]], 0)\n            radii = dx[..., None] * 2 / torch.sqrt(torch.tensor(12))\n\n            rays_d = rays_d.view(-1, 3)\n            rays_o = rays_o.view(-1, 3)\n            radii = radii.view(-1, 1)\n\n            #   todo: change this because the default block is block 0\n            img_idx = 0\n            rays_t = img_idx * torch.ones(len(rays_o), 1)\n            rays = torch.cat([\n                rays_o, rays_d,\n                radii,\n                exposure * torch.ones_like(rays_o[:, :1]),\n                self.near * torch.ones_like(rays_o[:, :1]),\n                self.far * torch.ones_like(rays_o[:, :1]),\n                rays_t], -1)\n            sample = {\"rays\": rays[:, :10],\n                      \"ts\": rays[:, 10].long(),\n                      \"w_h\": [width, height]}\n\n\n        return sample\n"
  },
  {
    "path": "data_preprocess/__init__.py",
    "content": ""
  },
  {
    "path": "data_preprocess/comvog_organize_by_cam_pos.py",
    "content": "## organize the images and ground-truths by camera positions and indexes\nimport os\nimport json\nimport numpy as np\nimport random\nimport copy\nimport pdb\nimport shutil\nimport torch\nfrom tqdm import tqdm\nfrom pathlib import Path\n\n\nCOPYFILE = True  # change it to true to rename and copy image files\n\n\ndef get_pix2cam(focals, width, height):\n    fx = np.array(focals)\n    fy = np.array(focals)\n    cx = np.array(width) * .5\n    cy = np.array(height) * .5\n    arr0 = np.zeros_like(cx)\n    arr1 = np.ones_like(cx)\n    k_inv = np.array([\n        [arr1 / fx, arr0, -cx / fx],\n        [arr0, -arr1 / fy, cy / fy],\n        [arr0, arr0, -arr1],\n    ])\n    k_inv = np.moveaxis(k_inv, -1, 0)\n    return k_inv.tolist()\n\n\nroot_dir = \"data/pytorch_waymo_dataset\"\n\nwith open(os.path.join(root_dir, \"train\", f'train_all_meta.json'), 'r') as fp:\n    train_all_meta = json.load(fp)\n\nwith open(os.path.join(root_dir, \"val\", f'val_all_meta.json'), 'r') as fp:\n    val_all_meta = json.load(fp)\n\nsave_path = Path(f\"data/sep19_ordered_dataset\")\nsave_path.mkdir(parents=True, exist_ok=True)\nos.makedirs(os.path.join(save_path, 'images_train'), exist_ok=True)\nos.makedirs(os.path.join(save_path, 'images_val'), exist_ok=True)\nos.makedirs(os.path.join(save_path, 'images_test'), exist_ok=True)\n\n\ndef reorder_meta(meta_dict):\n    cam2images_pos = {}\n    old_name_2_new_name = {}\n    # collect images by cams\n    for one_key in tqdm(meta_dict):\n        cur_value = meta_dict[one_key]\n        cam_idx = cur_value['cam_idx']\n        if cam_idx not in cam2images_pos:\n            cam2images_pos[cam_idx] = [[one_key, cur_value['origin_pos']]]\n        else:\n            cam2images_pos[cam_idx].append([one_key, cur_value['origin_pos']])\n    for one_cam in cam2images_pos:\n        pos_list = cam2images_pos[one_cam]\n        pos_list.sort(key=lambda row: (row[1][1], row[1][0]))\n        for idx, ele in enumerate(pos_list):\n            old_name = 
ele[0]\n            new_name = str(one_cam) + \"_\" + str(idx)\n            old_name_2_new_name[old_name] = new_name\n    return old_name_2_new_name\n    \n    \ndef form_unified_dict(old_to_new, metas, save_prefix='images_train', split_prefix='train'):\n    file_paths = []\n    c2ws = []\n    widths = []\n    heights = []\n    focals = []\n    nears = []\n    fars = []\n    return_metas = []\n    positions = []\n    intrisincs = []\n    cam_idxs = []\n    for idx, one_img in enumerate(tqdm(metas)):\n        ori_path = os.path.join('data', 'pytorch_waymo_dataset', split_prefix, 'rgbs/' + one_img + \".png\")\n        new_name = old_to_new[one_img]\n        if 'train' in split_prefix:\n            cur_meta = train_all_meta[one_img]\n        else:\n            cur_meta = val_all_meta[one_img]\n        cam_idxs.append(cur_meta['cam_idx'])\n        final_path = os.path.join(save_prefix, new_name + \".png\")\n        full_save_path = os.path.join(save_path, final_path)\n        if COPYFILE:\n            shutil.copyfile(ori_path, full_save_path)\n        file_paths.append(final_path)\n        return_metas.append(cur_meta)\n        positions.append(cur_meta['origin_pos'])\n        if len(cur_meta['c2w']) < 4:\n            cur_meta['c2w'].append([0.0, 0.0, 0.0, 1.0])\n        c2ws.append(cur_meta['c2w'])\n        widths.append(cur_meta['W'])\n        heights.append(cur_meta['H'])\n        focals.append(cur_meta['intrinsics'][0])\n        nears.append(0.01)\n        fars.append(15.)\n        K = np.array([\n            [cur_meta['intrinsics'][0], 0, 0.5*cur_meta['W']],\n            [0, cur_meta['intrinsics'][0], 0.5*cur_meta['H']],\n            [0, 0, 1]\n        ]).tolist()\n        intrisincs.append(K)\n\n    lossmult = np.ones(np.array(heights).shape).tolist()\n    pix2cam = get_pix2cam(focals=np.array(focals), width=np.array(widths), height=np.array(heights))\n    positions = np.array(positions).tolist()\n    return_metas = {'file_path': file_paths, 'cam2world': 
np.array(c2ws).tolist(), 'width': np.array(widths).tolist(),\n    'height': np.array(heights).tolist(), 'focal': np.array(focals).tolist(), 'pix2cam': pix2cam, 'lossmult': lossmult, \n    'near':nears, 'far': fars, 'K': intrisincs, 'cam_idx': cam_idxs, 'position': positions}\n    return return_metas\n\n\ntrain_old_to_new = reorder_meta(train_all_meta)\nval_old_to_new = reorder_meta(val_all_meta)\n\ntrain_dict = form_unified_dict(train_old_to_new, train_all_meta, save_prefix='images_train', split_prefix='train')\nval_dict = form_unified_dict(val_old_to_new, val_all_meta, save_prefix='images_val', split_prefix='val')\nval_dict = form_unified_dict(val_old_to_new, val_all_meta, save_prefix='images_test', split_prefix='val')\n\n# waymo does not have a test split\nunified_meta = {'train': train_dict, 'val': val_dict, 'test': val_dict}\n\nwith open(os.path.join(save_path, 'metadata.json'), \"w\") as fp:\n    json.dump(unified_meta, fp)\n    fp.close()\n\nprint(\"All done!\")\n\n"
  },
  {
    "path": "data_preprocess/download_waymo.sh",
    "content": "gdown --id 1paqYra8mbryVsoXAiJCImXXE0qFGxC0N # pytorch_waymo_dataset, https://drive.google.com/file/d/1paqYra8mbryVsoXAiJCImXXE0qFGxC0N/view?usp=sharing\ngdown --id 1O7uzcPBQHNAcmAcmcS6TRbLqiIDE3D0y # ckpts, will be updated at next time\n"
  },
  {
    "path": "data_preprocess/fetch_data_from_tf_record.py",
    "content": "import numpy as np\nimport datetime, os\nimport cv2\nimport os\nimport pdb\nimport glob\nimport tensorflow as tf\nimport torch\nimport glob\nimport os\nimport numpy as np\nimport torch\nfrom kornia import create_meshgrid\nimport pdb\nfrom tqdm import tqdm\n\n\ndef test_rays_dir_radii(ray_dirs):\n    if type(ray_dirs) != torch.Tensor:\n        ray_dirs = torch.Tensor(ray_dirs)\n    dx_1 = torch.sqrt(torch.sum((ray_dirs[:-1, :, :] - ray_dirs[1:, :, :]) ** 2, -1))\n    dx = torch.cat([dx_1, dx_1[-2:-1, :]], 0)\n    radii = dx[..., None] * 2 / torch.sqrt(torch.tensor(12))\n\n    return radii\n\n\ndef decode_fn(record_bytes):\n    return tf.io.parse_single_example(\n        record_bytes,\n        {\n            \"image_hash\": tf.io.FixedLenFeature([], dtype=tf.int64),\n            \"cam_idx\": tf.io.FixedLenFeature([], dtype=tf.int64),  # 0~12\n            \"equivalent_exposure\": tf.io.FixedLenFeature([], dtype=tf.float32),\n            \"height\": tf.io.FixedLenFeature([], dtype=tf.int64),\n            \"width\": tf.io.FixedLenFeature([], dtype=tf.int64),\n            \"image\": tf.io.FixedLenFeature([], dtype=tf.string),\n            \"ray_origins\": tf.io.VarLenFeature(tf.float32),\n            \"ray_dirs\": tf.io.VarLenFeature(tf.float32),\n            \"intrinsics\": tf.io.VarLenFeature(tf.float32),\n        }\n    )\n\n\ndef get_cam_rays(H, W, K):\n    grid = create_meshgrid(H, W, normalized_coordinates=False)[0]\n    i, j = grid.unbind(-1)\n    # the direction here is without +0.5 pixel centering as calibration is not so accurate\n    # see https://github.com/bmild/nerf/issues/24\n    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]\n    directions = \\\n        torch.stack([(i - cx) / fx, -(j - cy) / fy, -torch.ones_like(i)], -1)  # (H, W, 3)\n    directions = directions / torch.norm(directions, dim=-1, keepdim=True)\n    return directions.numpy()\n\n\ndef get_rotate_one_image(cam_ray_dir, world_ray_dir):\n    b_matrix = cam_ray_dir = 
cam_ray_dir.reshape(-1, 3)\n    A_matrix = world_ray_dir = world_ray_dir.reshape(-1, 3)\n\n    world_r123 = np.mat(world_ray_dir[:, :1]).reshape(-1, 1)\n    world_r456 = np.mat(world_ray_dir[:, 1:2]).reshape(-1, 1)\n    world_r789 = np.mat(world_ray_dir[:, 2:3]).reshape(-1, 1)\n\n    cam_dir = np.mat(cam_ray_dir)\n    r123 = np.linalg.lstsq(cam_dir, world_r123, rcond=None)[0]\n    r456 = np.linalg.lstsq(cam_dir, world_r456, rcond=None)[0]\n    r789 = np.linalg.lstsq(cam_dir, world_r789, rcond=None)[0]\n\n    R = np.zeros([3, 3])\n    R[0:1, :] = r123.T\n    R[1:2, :] = r456.T\n    R[2:3, :] = r789.T\n\n    R_loss = world_ray_dir - cam_ray_dir @ R.T\n    # print(f\"Pose loss:\\t{np.absolute(R_loss).mean()}\")  # should < 0.01\n    return R.tolist()\n\n\ndef handle_one_record(tfrecord, train_index, val_index):\n    dataset = tf.data.TFRecordDataset(\n        tfrecord,\n        compression_type=\"GZIP\",\n    )\n    dataset_map = dataset.map(decode_fn)\n\n    train_or_val = 'train' in tfrecord\n    \n    if train_or_val:\n        image_folder = train_img_folder\n        meta_folder = train_meta_folder\n        index = train_index\n        train_index += 1\n    else:\n        image_folder = val_img_folder\n        meta_folder = val_meta_folder\n        index = val_index\n        val_index += 1\n    \n    for batch in dataset_map:\n        image_name = str(int(batch[\"image_hash\"]))\n\n        imagestr = batch[\"image\"]\n        image = tf.io.decode_png(imagestr, channels=0, dtype=tf.dtypes.uint8, name=None)\n        image = np.array(image)\n        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n        cv2.imwrite(os.path.join(image_folder, f\"{image_name}.png\"), image)\n\n        cam_idx = int(batch[\"cam_idx\"])\n        equivalent_exposure = float(batch[\"equivalent_exposure\"])\n        height, width = int(batch[\"height\"]), int(batch[\"width\"])\n        intrinsics = tf.sparse.to_dense(batch[\"intrinsics\"]).numpy().tolist()\n        ray_origins = 
tf.sparse.to_dense(batch[\"ray_origins\"]).numpy().reshape(height, width, 3)\n        ray_dirs = tf.sparse.to_dense(batch[\"ray_dirs\"]).numpy().reshape(height, width, 3)\n\n        K = np.zeros((3, 3), dtype=np.float32)\n        # fx=focal,fy=focal,cx=img_w/2,cy=img_h/2\n        K[0, 0] = intrinsics[0]\n        K[1, 1] = intrinsics[1]\n        K[0, 2] = width * 0.5\n        K[1, 2] = height * 0.5\n        K[2, 2] = 1\n        cam_ray_dir = get_cam_rays(height, width, K) # get normalized rays\n        world_ray_dir = np.array(ray_dirs)\n        rotate_m = get_rotate_one_image(cam_ray_dir, world_ray_dir)\n        c2w_matrix = np.zeros([3, 4])\n        c2w_matrix[:, :3] = rotate_m\n        # average the origin as the final origin\n        cur_origin = torch.mean(torch.mean(torch.tensor(ray_origins), dim=0), dim=0).tolist()\n        c2w_matrix[:, 3:] = np.array(cur_origin).reshape(3, 1)\n\n        meta_data_dict = {\n            'W': width,\n            'H': height,\n            \"intrinsics\": torch.tensor(intrinsics),\n            \"c2w\": torch.tensor(c2w_matrix),\n            'image_name': image_name,\n            \"cam_idx\": cam_idx,\n            \"equivalent_exposure\": equivalent_exposure,\n            \"origin_pos\": torch.tensor(ray_origins[0][0].tolist()),\n            \"index\": index\n        }\n        if train_or_val:\n            train_meta[image_name] = meta_data_dict\n        else:\n            val_meta[image_name] = meta_data_dict\n        torch.save(train_meta, os.path.join(train_folder, \"train_all_meta.pt\"))\n        torch.save(val_meta, os.path.join(val_folder, \"val_all_meta.pt\"))\n        torch.save(meta_data_dict, os.path.join(meta_folder, image_name + \".pt\"))\n    return train_index, val_index\n\nif __name__ == \"__main__\":\n    waymo_root_p = \"data/v1.0\"\n    result_root_folder = \"data/pytorch_waymo_dataset\"\n    os.makedirs(result_root_folder, exist_ok=True)\n\n    coordinate_info = {\n        'origin_drb': [0.0, 0.0, 0.0],\n      
  'pose_scale_factor': 1.0\n    }\n    torch.save(coordinate_info, os.path.join(result_root_folder, \"coordinates.pt\"))\n    train_folder = os.path.join(result_root_folder, 'train')\n    os.makedirs(train_folder, exist_ok=True)\n    val_folder = os.path.join(result_root_folder, 'val')\n    os.makedirs(val_folder, exist_ok=True)\n\n    train_img_folder = os.path.join(train_folder, 'rgbs')\n    train_meta_folder = os.path.join(train_folder, 'metadata')\n    val_img_folder = os.path.join(val_folder, 'rgbs')\n    val_meta_folder = os.path.join(val_folder, 'metadata')\n\n    os.makedirs(train_img_folder, exist_ok=True)\n    os.makedirs(val_img_folder, exist_ok=True)\n    os.makedirs(train_meta_folder, exist_ok=True)\n    os.makedirs(val_meta_folder, exist_ok=True)\n\n    train_meta, val_meta = {}, {}\n    train_index = 0\n    val_index = 0\n    ori_waymo_data = sorted(glob.glob(os.path.join(waymo_root_p, \"*\")))\n    for idx, tfrecord in enumerate(tqdm(ori_waymo_data)):\n        train_index, val_index = handle_one_record(tfrecord, train_index, val_index)\n"
  },
  {
    "path": "data_preprocess/gdown_download_folder.py",
    "content": "import gdown\n\nurl = r'https://drive.google.com/drive/folders/1Lcc6MF35EnXGyUy0UZPkUx7SfeLsv8u9'\ngdown.download_folder(url)\n\n"
  },
  {
    "path": "data_preprocess/get_one_block_meta_and_visualize.py",
    "content": "import os\nimport json\nimport numpy as np\nimport random\nimport copy\nimport pdb\nimport shutil\nimport torch\nfrom tqdm import tqdm\nfrom pathlib import Path\n\ndef get_pix2cam(focals, width, height):\n    fx = np.array(focals)\n    fy = np.array(focals)\n    cx = np.array(width) * .5\n    cy = np.array(height) * .5\n    arr0 = np.zeros_like(cx)\n    arr1 = np.ones_like(cx)\n    k_inv = np.array([\n        [arr1 / fx, arr0, -cx / fx],\n        [arr0, -arr1 / fy, cy / fy],\n        [arr0, arr0, -arr1],\n    ])\n    k_inv = np.moveaxis(k_inv, -1, 0)\n    return k_inv.tolist()\n\n\nroot_dir = \"data/pytorch_waymo_dataset\"\nblock_index = 0\n\nwith open(os.path.join(root_dir, \"train\", f'split_block_train.json'), 'r') as fp:\n    train_block_split = json.load(fp)\n\nwith open(os.path.join(root_dir, \"train\", f'train_all_meta.json'), 'r') as fp:\n    train_all_meta = json.load(fp)\n\nwith open(os.path.join(root_dir, \"val\", f'split_block_val.json'), 'r') as fp:\n    val_block_split = json.load(fp)\n\nwith open(os.path.join(root_dir, \"val\", f'val_all_meta.json'), 'r') as fp:\n    val_all_meta = json.load(fp)\n\nsave_path = Path(f\"data/samples/block_{block_index}\")\nsave_path.mkdir(parents=True, exist_ok=True)\nos.makedirs(os.path.join(save_path, 'images_train'), exist_ok=True)\nos.makedirs(os.path.join(save_path, 'images_val'), exist_ok=True)\nos.makedirs(os.path.join(save_path, 'images_test'), exist_ok=True)\n\ntrain_imgs = train_block_split['block_' + str(block_index)]['elements']\nval_imgs = val_block_split['block_' + str(block_index)]\n\n\ndef form_unified_dict(images, metas, save_prefix='images_train', split_prefix='train'):\n    file_paths = []\n    c2ws = []\n    widths = []\n    heights = []\n    focals = []\n    nears = []\n    fars = []\n    metas = []\n    positions = []\n    for idx, one_img in enumerate(images):\n        ori_path = os.path.join('data', 'pytorch_waymo_dataset', split_prefix, 'rgbs/' + one_img[0] + \".png\")\n        
if 'train' in split_prefix:\n            cur_meta = train_all_meta[one_img[0]]\n        else:\n            cur_meta = val_all_meta[one_img[0]]\n        final_path = os.path.join(save_prefix, str(cur_meta['cam_idx']) + \"_\" + str(idx) + \".png\")\n        full_save_path = os.path.join(save_path, final_path)\n        shutil.copyfile(ori_path, full_save_path)\n        file_paths.append(final_path)\n        metas.append(cur_meta)\n        positions.append(cur_meta['origin_pos'])\n        cur_meta['c2w'].append([0.0, 0.0, 0.0, 1.0])\n        c2ws.append(cur_meta['c2w'])\n        widths.append(cur_meta['W'])\n        heights.append(cur_meta['H'])\n        focals.append(cur_meta['intrinsics'][0])\n        nears.append(0.01)\n        fars.append(15.)\n    lossmult = np.ones(np.array(heights).shape).tolist()\n    pix2cam = get_pix2cam(focals=np.array(focals), width=np.array(widths), height=np.array(heights))\n    positions = np.array(positions)\n    metas = {'file_path': file_paths, 'cam2world': np.array(c2ws).tolist(), 'width': np.array(widths).tolist(),\n    'height': np.array(heights).tolist(), 'focal': np.array(focals).tolist(), 'pix2cam': pix2cam, 'lossmult': lossmult, \n    'near':nears, 'far': fars}\n    return metas\n\n\ntrain_dict = form_unified_dict(train_imgs, train_all_meta, save_prefix='images_train', split_prefix='train')\nval_dict = form_unified_dict(val_imgs, val_all_meta, save_prefix='images_val', split_prefix='val')\nval_dict = form_unified_dict(val_imgs, val_all_meta, save_prefix='images_test', split_prefix='val')\n\n# waymo does not have a test split\nunified_meta = {'train': train_dict, 'val': val_dict, 'test': val_dict}\n\nwith open(os.path.join(save_path, 'metadata.json'), \"w\") as fp:\n    json.dump(unified_meta, fp)\n    fp.close()\n\nprint(\"All done!\")\n"
  },
  {
    "path": "data_preprocess/process_mega.sh",
    "content": "cd data/mega/\ntar xopf ArtsQuad_dataset.tar\nmkdir ArtsQuad\nmv ArtsQuad_* ArtsQuad\ntar -zxvf building-pixsfm.tgz\ntar -zxvf building-pixsfm-grid-8.tgz\ntar -zxvf rubble-pixsfm.tgz\ntar -zxvf rubble-pixsfm-grid-8.tgz\ntar -zxvf quad-pixsfm.tgz\ntar -zxvf quad-pixsfm-grid-8.tgz\ntar -zxvf residence-pixsfm.tgz\ntar -zxvf residence-pixsfm-grid-8.tgz\ntar -zxvf sci-art-pixsfm.tgz\ntar -zxvf sci-art-pixsfm-grid-25.tgz\ntar -zxvf campus-pixsfm.tgz\ntar -zxvf campus-pixsfm-grid-8.tgz\nmkdir building\nmv building-* building\nmkdir campus\nmv campus-* campus\nmkdir rubble\nmv rubble-* rubble\nmkdir residence\nmv residence-* residence\nmkdir campus\nmv campus-* campus\nmkdir sci-art\nmv sci-art-* sci-art\nmkdir quad\nmv quad-* quad\ncd ../../"
  },
  {
    "path": "data_preprocess/split_block.py",
    "content": "import glob\nimport os\nimport json\nfrom turtle import onkey\nimport numpy as np\nimport argparse\nfrom collections import defaultdict\nimport open3d as o3d\nimport random\nimport copy\nimport pdb\nimport torch\n\n\ndef get_hparams():\n    parser = argparse.ArgumentParser()\n\n    parser.add_argument('--radius', type=float, default=0.3,\n                        help='The radius of a block')\n    parser.add_argument('--overlap', type=float,\n                        default=0.5, help='overlap each block')\n    parser.add_argument('--visualization', type=bool, default=True,\n                        help=\"Whether visualize the split results\")\n    parser.add_argument('--visual_Block', type=bool, default=False,\n                        help=\"When visualize whether visualize the split result\")\n\n    return vars(parser.parse_args())\n\n\ndef extract_imgname_origins(meta):\n    img_origins = {}\n\n    for idx, img_name in enumerate(meta):\n        img_info = meta[img_name]\n        origin = img_info['origin_pos']\n        img_origins[img_name] = origin\n    return img_origins\n\n\ndef resort_origins(img_origins, positions):\n    # switch key and values in img_origins\n    origin2name = {}\n    for img_orin in img_origins:\n        origin = img_origins[img_orin]\n        origin2name[tuple(np.array(origin))] = img_orin\n\n    sorted_origins = {}\n    for pos in positions:\n        sorted_origins[origin2name[tuple(np.array(pos))]] = pos\n\n    return sorted_origins\n\n\ndef get_the_distance(r=2, overlap=0.5):  \n    x = r * 0.9\n    x0 = x\n    # fd is the derivation of f\n    f = 2 * np.arccos(x0 / r) * (r ** 2) - 2 * x0 * \\\n        np.sqrt(r ** 2 - x0 ** 2) - overlap * np.pi * r ** 2\n    fd = (2 * x0 ** 2 - 2 * r ** 2) / np.sqrt(r **\n                                              2 - x0 ** 2) - 2 * np.sqrt(r ** 2 - x0 ** 2)\n    h = f / fd\n    x = x0 - h\n    # optimize the function using Newton Optimization\n    while abs(x - x0) >= 1e-6:\n        
x0 = x\n        f = 2 * np.arccos(x0 / r) * (r ** 2) - 2 * x0 * \\\n            np.sqrt(r ** 2 - x0 ** 2) - overlap * np.pi * r ** 2\n        fd = (2 * x0 ** 2 - 2 * r ** 2) / np.sqrt(r **\n                                                  2 - x0 ** 2) - 2 * np.sqrt(r ** 2 - x0 ** 2)\n        h = f / fd\n        x = x0 - h\n    return 2 * x\n\n\ndef get_each_block_element_train(img_train_origins, centroid, radius):\n    block_train_element = []\n\n    index = 0\n    for img_origin in img_train_origins:\n        if np.linalg.norm(img_train_origins[centroid] - img_train_origins[img_origin]) <= radius:\n            img_element = [img_origin, index]\n            block_train_element.append(img_element)\n            index += 1\n\n    return block_train_element\n\n\ndef extract_img_base_camidx(cam_idx, train_meta):\n    img_name = []\n    for meta in train_meta:\n        img_info = train_meta[meta]\n        if img_info['cam_idx'] == cam_idx:\n            img_name.append(meta)\n    return img_name\n\n\ndef get_block_idx(img_name, split_train_results):\n    for block in split_train_results:\n        elements = split_train_results[block][\"elements\"]\n        for element in elements:\n            if element[0] == img_name:\n                return [block, element[1]]\n    return None\n\n\ndef get_val_block_index(img_val_origins, train_meta, val_meta, img_train_origins, split_train_results):\n    split_val_results = defaultdict(list)\n    for origin in img_val_origins:\n        # find the corresponding camera first\n        img_info = val_meta[origin]\n        cam_idx = img_info['cam_idx']\n        # fetch all the same index in train_meta\n        img_list = extract_img_base_camidx(cam_idx, train_meta)\n        # calculate the nearest img\n        distance = 1000\n        img_nearest = None\n        for img in img_list:\n            distance_temp = np.linalg.norm(img_val_origins[origin] - img_train_origins[img])\n            if distance_temp < distance:\n                
distance = distance_temp\n                img_nearest = img\n        block_name, index = get_block_idx(img_nearest, split_train_results)\n        '''\n        block_info = {\n            'elements': block_train_element,\n            \"centroid\": [centroid, img_train_origins_resort[centroid].tolist()]\n        }\n        split_train_results[f'block_{idx}'] = block_info\n        '''\n        # fetch the nearest img_name and find its block and corresponding index\n        split_val_results[block_name].append([origin, index])\n\n    return split_val_results\n\n\ndef split_dataset(train_meta, val_meta, radius=0.5, overlap=0.5):\n    img_train_origins = extract_imgname_origins(train_meta)\n    img_val_origins = extract_imgname_origins(val_meta)\n\n    train_positions = np.array([np.array(value) for value in img_train_origins.values()])\n    val_positions = np.array([np.array(value) for value in img_val_origins.values()])\n\n    train_indices = np.argsort(train_positions[:, 1])\n    val_indices = np.argsort(val_positions[:, 1])\n\n    train_positions = train_positions[train_indices, :]\n    val_positions = val_positions[val_indices, :]\n\n    img_train_origins_resort = resort_origins(img_train_origins, train_positions)\n    img_val_origins_resort = resort_origins(img_val_origins, val_positions)\n\n    distance = get_the_distance(r=radius, overlap=overlap)\n    print(f\"The block distance is: {distance}.\")\n\n    origin_1 = train_positions[0]\n    centroids = []\n\n    # find the first centroid\n    temp_origin = {}\n    for index, origin in enumerate(img_train_origins_resort):\n        if np.linalg.norm(origin_1 - img_train_origins_resort[origin]) > radius:\n            centroids.append(temp_origin)\n            break\n        temp_origin = origin\n\n    # get a new centroid since the beggining\n    temp_origin = {}\n    judge = False\n    for idx, origin in enumerate(img_train_origins_resort):\n        if origin != centroids[-1] and judge == False: # have not reached 
the first centroid\n            continue\n        else:\n            judge = True\n        if np.linalg.norm(img_train_origins_resort[centroids[-1]] - img_train_origins_resort[origin]) > distance:\n            centroids.append(temp_origin)\n        temp_origin = origin\n\n    split_train_results = {}\n    for idx, centroid in enumerate(centroids):\n        # find all points within the range\n        block_train_element = get_each_block_element_train(img_train_origins_resort, centroid, radius)\n        block_info = {\n            'elements': block_train_element,\n            \"centroid\": [centroid, img_train_origins_resort[centroid].tolist()]\n        }\n        split_train_results[f'block_{idx}'] = block_info\n    # find the closest val_origin under the same camera settings\n    split_val_results = get_val_block_index(img_val_origins_resort, train_meta, val_meta, img_train_origins_resort,\n                                            split_train_results)\n\n    return split_train_results, split_val_results\n\n\ndef extract_origins(meta):\n    origins = []\n    cam_index = defaultdict(int)\n    for img_name in meta:\n        img_info = meta[img_name]\n        origins.append(img_info['origin_pos'])\n        cam_index[img_info['cam_idx']] += 1\n\n    print(cam_index)\n    origins = np.array(origins)\n    return origins\n\n\ndef extract_cam_idx(train_meta):\n    cam_idx=[]\n    for meta in train_meta:\n        img_info=train_meta[meta]\n        if img_info['cam_idx'] not in cam_idx:\n            cam_idx.append(img_info['cam_idx'])\n    return sorted(cam_idx)\n\n\ndef extract_img_each_idx(idx,train_meta,train_split_meta):\n    imgs=[]\n    for block in train_split_meta:\n        for element in train_split_meta[block]['elements']:\n            if train_meta[element[0]]['cam_idx']==idx:\n                if element[0] not in imgs:\n                    imgs.append(element[0])\n    return imgs\n\n\ndef transfer_pt_to_json(pt_meta):\n    new_dict = copy.deepcopy(pt_meta)\n    
for one_key in new_dict:\n        new_dict[one_key]['intrinsics'] = np.array(new_dict[one_key]['intrinsics']).tolist()\n        new_dict[one_key]['c2w'] = np.array(new_dict[one_key]['c2w']).tolist()\n        new_dict[one_key]['origin_pos'] = np.array(new_dict[one_key]['origin_pos']).tolist()\n    return new_dict\n\nif __name__ == \"__main__\":\n    args = get_hparams()\n    print(args)\n    root_dir = \"data/pytorch_waymo_dataset\"\n    os.makedirs(root_dir, exist_ok=True)\n    os.makedirs(os.path.join(root_dir, 'train'), exist_ok=True)\n    os.makedirs(os.path.join(root_dir, 'val'), exist_ok=True)\n\n    train_meta_path = os.path.join(root_dir, 'train', \"split_block_train.json\")\n    val_meta_path = os.path.join(root_dir, 'val', \"split_block_val.json\")\n    \n    train_meta = torch.load(os.path.join(root_dir, 'train', 'train_all_meta.pt'))\n    val_meta = torch.load(os.path.join(root_dir, 'val', 'val_all_meta.pt'))\n\n    # rewrite to json following the convention\n    with open(os.path.join(root_dir, 'train', 'train_all_meta.json'), \"w\") as fp:\n        new_train_meta = transfer_pt_to_json(train_meta)\n        json.dump(new_train_meta, fp)\n        fp.close()\n    with open(os.path.join(root_dir, 'val', 'val_all_meta.json'), \"w\") as fp:\n        new_val_meta = transfer_pt_to_json(val_meta)\n        json.dump(new_val_meta, fp)\n        fp.close()\n\n    print(\n        f\"Before splitting, there are {len(train_meta)} images for train and {len(val_meta)} images for val!\")\n    split_train_results, split_val_results = split_dataset(train_meta, val_meta, radius=args['radius'],\n                                                           overlap=args['overlap'])\n    print(\"Complete the split work!\")\n\n    block_train_json = {}\n    block_val_json = {}\n\n    for block in split_train_results:\n        block_train_json[block] = split_train_results[block]\n        print(f\"{block} has {len(split_train_results[block]['elements'])}\")\n        with 
open(train_meta_path, \"w\") as fp:\n            json.dump(block_train_json, fp)\n            fp.close()\n\n    for block in split_val_results:\n        block_val_json[block] = split_val_results[block]\n        print(f\"{block} has {len(split_val_results[block])}\")\n        with open(val_meta_path, \"w\") as fp:\n            json.dump(block_val_json, fp)\n            fp.close()\n\n    print(f\"The split results have been stored in the {train_meta_path} and {val_meta_path}\")\n\n    if args['visualization']:\n        train_origins = extract_origins(train_meta)\n        val_origins = extract_origins(val_meta)\n        # 11269,3\n        # block_json\n    \n    cam_idxes=extract_cam_idx(train_meta)\n    print(f\"There are {len(cam_idxes)} cameras. \")\n    cam_save_path = os.path.join(root_dir, \"cam_info.json\")\n    cam_imgs={}\n    for idx in cam_idxes:\n        cam_imgs[idx]=extract_img_each_idx(idx, train_meta, split_train_results)\n        with open(cam_save_path, \"w\") as fp:\n            json.dump(cam_imgs, fp)\n            fp.close()\n    print(f\"The camera information has been saved in the path of {cam_save_path}.\")\n    print(\"All done.\")\n"
  },
  {
    "path": "data_preprocess/vis.py",
    "content": "# This code (visualization utils) is originated from [nerf plus plus](https://github.com/Kai-46/nerfplusplus)\nimport torch\nimport os\nimport torchvision\nimport torchvision.transforms as T\nimport numpy as np\nimport cv2\nfrom PIL import Image\nimport matplotlib.pyplot as plt\nfrom scipy.spatial.transform import Rotation as R\n\n\ndef normalize(v):\n    \"\"\"Normalize a vector.\"\"\"\n    return v / np.linalg.norm(v)\n\n\ndef vis_lr(lr_init: float = 5e-4,\n           lr_final: float = 5e-6,\n           max_steps: int = 2000000,\n           lr_delay_steps: int = 2500,\n           lr_delay_mult: float = 0.01):\n    def get_lr(last_epoch):\n        if lr_delay_steps > 0:\n            # A kind of reverse cosine decay.\n            delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin(\n                0.5 * np.pi * np.clip(last_epoch / lr_delay_steps, 0, 1))\n\n        else:\n            delay_rate = 1.\n        t = np.clip(last_epoch / max_steps, 0, 1)\n        log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t)\n        return [delay_rate * log_lerp]\n\n    lr = []\n    step = []\n    for i in range(max_steps):\n        step += [i]\n        lr += get_lr(i)\n    p = np.stack([step, lr], 1)\n    plt.plot(p[:, 0], p[:, 1])\n    plt.xscale('log')\n    plt.yscale('log')\n    plt.show()\n\n\ndef save_image_tensor(image: torch.tensor, height: int, width: int, save_path: str, nhwc: bool = True):\n    image = image.detach().cpu().clamp(0.0, 1.0)\n    if image.dim() == 3:\n        image = image[None, ...]\n        if nhwc:  # nhwc -> nchw\n            image = image.permute(0, 3, 1, 2)\n        torchvision.utils.save_image(image, save_path)\n    elif image.dim() == 4:\n        if nhwc:  # nhwc -> nchw\n            image = image.permute(0, 3, 1, 2)\n        torchvision.utils.save_image(image, save_path)\n    elif image.dim() == 2:  # flatten\n        image = image.reshape(1, height, width, 1)\n        if nhwc:  # nhwc -> nchw\n            
image = image.permute(0, 3, 1, 2)\n        torchvision.utils.save_image(image, save_path)\n    else:\n        raise NotImplementedError\n\n\ndef save_images(rgb, dist, acc, path, idx):\n    B, H, W, C = rgb.shape\n    color_dist = visualize_depth(dist)\n    color_acc = visualize_depth(acc)\n    save_image_tensor(rgb, H, W, os.path.join(path, str('{:05d}'.format(idx)) + '_rgb' + '.png'))\n    save_image_tensor(color_dist, H, W, os.path.join(path, str('{:05d}'.format(idx)) + '_dist' + '.png'), False)\n    save_image_tensor(color_acc, H, W, os.path.join(path, str('{:05d}'.format(idx)) + '_acc' + '.png'), False)\n\n\ndef visualize_depth(depth, cmap=cv2.COLORMAP_JET):\n    \"\"\"\n    depth: (H, W)\n    \"\"\"\n    x = depth.cpu().numpy()\n    if len(x.shape) > 2:\n        x = np.squeeze(x)\n    x = np.nan_to_num(x)  # change nan to 0\n    mi = np.min(x)  # get minimum depth\n    ma = np.max(x)\n    x = (x - mi) / max(ma - mi, 1e-8)  # normalize to 0~1\n    x = (255 * x).astype(np.uint8)\n    x_ = Image.fromarray(cv2.applyColorMap(x, cmap))\n    x_ = T.ToTensor()(x_)  # (3, H, W)\n    return x_\n\n\ndef gen_render_path(c2ws, N_views=30):\n    N = len(c2ws)\n    rotvec, positions = [], []\n    rotvec_inteplat, positions_inteplat = [], []\n    weight = np.linspace(1.0, .0, N_views // 3, endpoint=False).reshape(-1, 1)\n    for i in range(N):\n        r = R.from_matrix(c2ws[i, :3, :3])\n        euler_ange = r.as_euler('xyz', degrees=True).reshape(1, 3)\n        if i:\n            mask = np.abs(euler_ange - rotvec[0]) > 180\n            euler_ange[mask] += 360.0\n        rotvec.append(euler_ange)\n        positions.append(c2ws[i, :3, 3:].reshape(1, 3))\n\n        if i:\n            rotvec_inteplat.append(weight * rotvec[i - 1] + (1.0 - weight) * rotvec[i])\n            positions_inteplat.append(weight * positions[i - 1] + (1.0 - weight) * positions[i])\n\n    rotvec_inteplat.append(weight * rotvec[-1] + (1.0 - weight) * rotvec[0])\n    positions_inteplat.append(weight * 
positions[-1] + (1.0 - weight) * positions[0])\n\n    c2ws_render = []\n    angles_inteplat, positions_inteplat = np.concatenate(rotvec_inteplat), np.concatenate(positions_inteplat)\n    for rotvec, position in zip(angles_inteplat, positions_inteplat):\n        c2w = np.eye(4)\n        c2w[:3, :3] = R.from_euler('xyz', rotvec, degrees=True).as_matrix()\n        c2w[:3, 3:] = position.reshape(3, 1)\n        c2ws_render.append(c2w.copy())\n    c2ws_render = np.stack(c2ws_render)\n    return c2ws_render\n\n\ndef create_spiral_poses(radii, focus_depth, n_poses=120):\n    \"\"\"\n    Computes poses that follow a spiral path for rendering purpose.\n    See https://github.com/Fyusion/LLFF/issues/19\n    In particular, the path looks like:\n    https://tinyurl.com/ybgtfns3\n\n    Inputs:\n        radii: (3) radii of the spiral for each axis\n        focus_depth: float, the depth that the spiral poses look at\n        n_poses: int, number of poses to create along the path\n\n    Outputs:\n        poses_spiral: (n_poses, 3, 4) the poses in the spiral path\n    \"\"\"\n\n    poses_spiral = []\n    for t in np.linspace(0, 4 * np.pi, n_poses + 1)[:-1]:  # rotate 4pi (2 rounds)\n        # the parametric function of the spiral (see the interactive web)\n        center = np.array([np.cos(t), -np.sin(t), -np.sin(0.5 * t)]) * radii\n\n        # the viewing z axis is the vector pointing from the @focus_depth plane\n        # to @center\n        z = normalize(center - np.array([0, 0, -focus_depth]))\n\n        # compute other axes as in @average_poses\n        y_ = np.array([0, 1, 0])  # (3)\n        x = normalize(np.cross(y_, z))  # (3)\n        y = np.cross(z, x)  # (3)\n\n        poses_spiral += [np.stack([x, y, z, center], 1)]  # (3, 4)\n\n    return np.stack(poses_spiral, 0)  # (n_poses, 3, 4)\n\n\ndef create_spheric_poses(radius, n_poses=120):\n    \"\"\"\n    Create circular poses around z axis.\n    Inputs:\n        radius: the (negative) height and the radius of the 
circle.\n\n    Outputs:\n        spheric_poses: (n_poses, 3, 4) the poses in the circular path\n    \"\"\"\n\n    def spheric_pose(theta, phi, radius):\n        trans_t = lambda t: np.array([\n            [1, 0, 0, 0],\n            [0, 1, 0, 0],\n            [0, 0, 1, t],\n            [0, 0, 0, 1],\n        ])\n\n        rot_phi = lambda phi: np.array([\n            [1, 0, 0, 0],\n            [0, np.cos(phi), -np.sin(phi), 0],\n            [0, np.sin(phi), np.cos(phi), 0],\n            [0, 0, 0, 1],\n        ])\n\n        rot_theta = lambda th: np.array([\n            [np.cos(th), 0, -np.sin(th), 0],\n            [0, 1, 0, 0],\n            [np.sin(th), 0, np.cos(th), 0],\n            [0, 0, 0, 1],\n        ])\n\n        c2w = rot_theta(theta) @ rot_phi(phi) @ trans_t(radius)\n        c2w = np.array([[-1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]) @ c2w\n        return c2w[:3]\n\n    spheric_poses = []\n    for th in np.linspace(0, 2 * np.pi, n_poses + 1)[:-1]:\n        spheric_poses += [spheric_pose(th, -np.pi / 5, radius)]  # 36 degree view downwards\n    return np.stack(spheric_poses, 0)\n\n\ndef stack_rgb(rgb_gt, coarse_rgb, fine_rgb):\n    img_gt = rgb_gt.squeeze(0).permute(2, 0, 1).cpu()  # (3, H, W)\n    coarse_rgb = coarse_rgb.squeeze(0).permute(2, 0, 1).cpu()\n    fine_rgb = fine_rgb.squeeze(0).permute(2, 0, 1).cpu()\n\n    stack = torch.stack([img_gt, coarse_rgb, fine_rgb])  # (3, 3, H, W)\n    return stack\n"
  },
  {
    "path": "data_preprocess/visualize_cameras.py",
    "content": "# This code is originated from [nerf plus plus](https://github.com/Kai-46/nerfplusplus)\nimport open3d as o3d\nimport json\nimport numpy as np\nimport cv2\nimport os\nimport pdb\nimport argparse\nfrom vis import create_spheric_poses\n\nnp.random.seed(11)\n\n\ndef get_camera_frustum(img_size, K, W2C, frustum_length=0.5, color=[0.0, 1.0, 0.0]):\n    W, H = img_size\n    hfov = np.rad2deg(np.arctan(W / 2.0 / K[0, 0]) * 2.0)\n    vfov = np.rad2deg(np.arctan(H / 2.0 / K[1, 1]) * 2.0)\n    half_w = frustum_length * np.tan(np.deg2rad(hfov / 2.0))\n    half_h = frustum_length * np.tan(np.deg2rad(vfov / 2.0))\n\n    # build view frustum for camera (I, 0)\n    frustum_points = np.array(\n        [\n            [0.0, 0.0, 0.0],  # frustum origin\n            [-half_w, -half_h, frustum_length],  # top-left image corner\n            [half_w, -half_h, frustum_length],  # top-right image corner\n            [half_w, half_h, frustum_length],  # bottom-right image corner\n            [-half_w, half_h, frustum_length],\n        ]\n    )  # bottom-left image corner\n    frustum_lines = np.array(\n        [[0, i] for i in range(1, 5)] + [[i, (i + 1)] for i in range(1, 4)] + [[4, 1]]\n    )\n    frustum_colors = np.tile(\n        np.array(color).reshape((1, 3)), (frustum_lines.shape[0], 1)\n    )\n\n    # frustum_colors = np.vstack((np.tile(np.array([[1., 0., 0.]]), (4, 1)),\n    #                            np.tile(np.array([[0., 1., 0.]]), (4, 1))))\n\n    # transform view frustum from (I, 0) to (R, t)\n    C2W = np.linalg.inv(W2C)\n    frustum_points = np.dot(\n        np.hstack((frustum_points, np.ones_like(frustum_points[:, 0:1]))), C2W.T\n    )\n    frustum_points = frustum_points[:, :3] / frustum_points[:, 3:4]\n\n    return frustum_points, frustum_lines, frustum_colors\n\n\ndef frustums2lineset(frustums):\n    N = len(frustums)\n    merged_points = np.zeros((N * 5, 3))  # 5 vertices per frustum\n    merged_lines = np.zeros((N * 8, 2))  # 8 lines per frustum\n    
merged_colors = np.zeros((N * 8, 3))  # each line gets a color\n\n    for i, (frustum_points, frustum_lines, frustum_colors) in enumerate(frustums):\n        merged_points[i * 5 : (i + 1) * 5, :] = frustum_points\n        merged_lines[i * 8 : (i + 1) * 8, :] = frustum_lines + i * 5\n        merged_colors[i * 8 : (i + 1) * 8, :] = frustum_colors\n\n    lineset = o3d.geometry.LineSet()\n    lineset.points = o3d.utility.Vector3dVector(merged_points)\n    lineset.lines = o3d.utility.Vector2iVector(merged_lines)\n    lineset.colors = o3d.utility.Vector3dVector(merged_colors)\n\n    return lineset\n\n\ndef visualize_cameras(\n    colored_camera_dicts, sphere_radius, geometry_file=None, geometry_type=\"mesh\"\n):\n    sphere = o3d.geometry.TriangleMesh.create_sphere(\n        radius=sphere_radius, resolution=10\n    )\n    sphere = o3d.geometry.LineSet.create_from_triangle_mesh(sphere)\n    sphere.paint_uniform_color((1, 0, 0))\n\n    coord_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(\n        size=0.5, origin=[0.0, 0.0, 0.0]\n    )\n    things_to_draw = [sphere, coord_frame]\n\n    idx = 0\n    for color, camera_dict in colored_camera_dicts:\n        idx += 1\n\n        cnt = 0\n        frustums = []\n        for img_name in sorted(camera_dict.keys()):\n            K = np.array(camera_dict[img_name][\"K\"]).reshape((4, 4))\n            W2C = np.array(camera_dict[img_name][\"W2C\"]).reshape((4, 4))\n            img_size = camera_dict[img_name][\"img_size\"]\n            camera_size = camera_dict[img_name][\"camera_size\"]\n            frustums.append(\n                get_camera_frustum(\n                    img_size, K, W2C, frustum_length=camera_size, color=color\n                )\n            )\n            cnt += 1\n        cameras = frustums2lineset(frustums)\n        things_to_draw.append(cameras)\n\n    if geometry_file is not None:\n        if geometry_type == \"mesh\":\n            geometry = o3d.io.read_triangle_mesh(geometry_file)\n            
geometry.compute_vertex_normals()\n        elif geometry_type == \"pointcloud\":\n            geometry = o3d.io.read_point_cloud(geometry_file)\n        else:\n            raise Exception(\"Unknown geometry_type: \", geometry_type)\n\n        things_to_draw.append(geometry)\n\n    o3d.visualization.draw_geometries(things_to_draw)\n\n\ndef convert_pose(C2W):\n    flip_yz = np.eye(4)\n    flip_yz[1, 1] = -1\n    flip_yz[2, 2] = -1\n    C2W = np.matmul(C2W, flip_yz)\n    return C2W\n\n\ndef read_single_scale_cam(path):\n    with open(path, \"r\") as fp:\n        meta = json.load(fp)\n    frame = meta[\"frames\"][0]\n    fname = os.path.join(\n        os.path.join(*path.split(\"/\")[:-1]), frame[\"file_path\"] + \".png\"\n    )\n    if not fname.startswith(\"/\"):\n        fname = \"/\" + fname\n    h, w, _ = cv2.imread(fname, -1).shape\n    focal = 0.5 * w / np.tan(0.5 * float(meta[\"camera_angle_x\"]))\n    K = [[focal, 0, w / 2, 0], [0, focal, h / 2, 0], [0, 0, 1, 0], [0, 0, 0, 1]]\n    cams_dict = {}\n    z = []\n    for i in range(len(meta[\"frames\"])):\n        frame = meta[\"frames\"][i]\n        C2W = np.array(frame[\"transform_matrix\"], dtype=np.float32)\n        # convert to right-down-forward coordinate\n        z.append(C2W[2, 3])\n        C2W = convert_pose(C2W)\n        W2C = np.eye(4)\n        W2C[:3, :3] = C2W[:3, :3].T\n        W2C[:3, 3] = -C2W[:3, :3].T @ C2W[:3, 3]\n        cams_dict[frame[\"file_path\"].split(\"/\")[-1]] = {\n            \"K\": K,\n            \"W2C\": W2C.flatten(),\n            \"img_size\": [w, h],\n            \"camera_size\": 1,\n        }\n    return cams_dict, np.percentile(z, 90)\n\n\ndef read_multi_scale_cam(path):\n    splits = [\"train\", \"val\", \"test\"]\n    with open(path, \"r\") as fp:\n        meta = json.load(fp)\n    colored_camera_dicts = []\n    render_color = np.random.rand(3)\n    for split in splits:\n        file_paths = meta[split][\"file_path\"]\n        c2ws = np.array(meta[split][\"cam2world\"])\n    
    focals = np.array(meta[split][\"focal\"])\n        widths = np.array(meta[split][\"width\"])\n        heights = np.array(meta[split][\"height\"])\n        color = np.random.rand(3)\n        camera_size = 1\n        for i in range(4):\n            file_path = file_paths[i::4]\n            c2w = c2ws[i::4]\n            focal = focals[i::4]\n            width = widths[i::4]\n            height = heights[i::4]\n            K = [\n                [focal[0], 0, width[0] / 2, 0],\n                [0, focal[0], height[0] / 2, 0],\n                [0, 0, 1, 0],\n                [0, 0, 0, 1],\n            ]\n            cams_dict = {}\n            z = []\n            for j in range(len(file_path)):\n                C2W = c2w[j]\n                z.append(C2W[2, 3])\n                C2W = convert_pose(C2W)\n                W2C = np.eye(4)\n                W2C[:3, :3] = C2W[:3, :3].T\n                W2C[:3, 3] = -C2W[:3, :3].T @ C2W[:3, 3]\n                cams_dict[file_path[j]] = {\n                    \"K\": K,\n                    \"W2C\": W2C.flatten(),\n                    \"img_size\": [width[j], height[j]],\n                    \"camera_size\": camera_size,\n                }\n            if split == \"train\":\n                colored_camera_dicts.append(\n                    (\n                        render_color,\n                        create_spheric_cam(\n                            focal[0],\n                            [width[0], height[0]],\n                            np.percentile(z, 90),\n                            camera_size,\n                        ),\n                    )\n                )\n            camera_size /= 2\n            colored_camera_dicts.append((color, cams_dict))\n\n    return colored_camera_dicts\n\n\ndef create_spheric_cam(focal, img_size, radius, camera_size=1):\n    w, h = img_size\n    K = [[focal, 0, w / 2, 0], [0, focal, h / 2, 0], [0, 0, 1, 0], [0, 0, 0, 1]]\n    cams_dict = {}\n    C2Ws = create_spheric_poses(radius)\n  
  for i in range(len(C2Ws)):\n        C2W = C2Ws[i, ...]\n        # convert to right-down-forward coordinate\n        C2W = convert_pose(C2W)\n        W2C = np.eye(4)\n        W2C[:3, :3] = C2W[:3, :3].T\n        W2C[:3, 3] = -C2W[:3, :3].T @ C2W[:3, 3]\n        cams_dict[str(i)] = {\n            \"K\": K,\n            \"W2C\": W2C.flatten(),\n            \"img_size\": [w, h],\n            \"camera_size\": camera_size,\n        }\n    return cams_dict\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"--data_path\",\n        help=\"Path to data.\",\n    )\n    parser.add_argument(\n        \"--multi_scale\", help=\"Whether vis multi camera.\", action=\"store_true\"\n    )\n    args = parser.parse_args()\n    sphere_radius = 1.0\n    if not args.multi_scale:\n        train_cam_dict, radius = read_single_scale_cam(\n            os.path.join(args.data_path, \"transforms_train.json\")\n        )\n        val_cam_dict, _ = read_single_scale_cam(\n            os.path.join(args.data_path, \"transforms_val.json\")\n        )\n        test_cam_dict, _ = read_single_scale_cam(\n            os.path.join(args.data_path, \"transforms_test.json\")\n        )\n        cam = train_cam_dict[list(train_cam_dict.keys())[0]]\n        render_dict = create_spheric_cam(cam[\"K\"][0][0], cam[\"img_size\"], radius)\n        colored_camera_dicts = [\n            ([0, 1, 0], train_cam_dict),\n            ([0, 0, 1], val_cam_dict),\n            ([1, 1, 0], test_cam_dict),\n            ([0, 1, 1], render_dict),\n        ]\n    else:\n        colored_camera_dicts = read_multi_scale_cam(\n            os.path.join(args.data_path, \"metadata.json\")\n        )\n\n    visualize_cameras(colored_camera_dicts, sphere_radius)\n"
  },
  {
    "path": "docs/classified_weekly_nerf/dynamic.md",
    "content": "\nWeekly Classified Neural Radiance Fields - dynamic ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n===================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n  - [Tensor4D : Efficient Neural 4D Decomposition for High-fidelity Dynamic Reconstruction and Rendering](https://arxiv.org/abs/2211.11610) | [code]\n    > We present Tensor4D, an efficient yet effective approach to dynamic scene modeling. The key of our solution is an efficient 4D tensor decomposition method so that the dynamic scene can be directly represented as a 4D spatio-temporal tensor. To tackle the accompanying memory issue, we decompose the 4D tensor hierarchically by projecting it first into three time-aware volumes and then nine compact feature planes. In this way, spatial information over time can be simultaneously captured in a compact and memory-efficient manner. When applying Tensor4D for dynamic scene reconstruction and rendering, we further factorize the 4D fields to different scales in the sense that structural motions and dynamic detailed changes can be learned from coarse to fine. The effectiveness of our method is validated on both synthetic and real-world scenes. 
Extensive experiments show that our method is able to achieve high-quality dynamic reconstruction and rendering from sparse-view camera rigs or even a monocular camera. The code and dataset will be released at this https URL.\n  - [DynIBaR: Neural Dynamic Image-Based Rendering, -](https://arxiv.org/abs/2211.11082) | [code]\n    > We address the problem of synthesizing novel views from a monocular video depicting a complex dynamic scene. State-of-the-art methods based on temporally varying Neural Radiance Fields (aka dynamic NeRFs) have shown impressive results on this task. However, for long videos with complex object motions and uncontrolled camera trajectories, these methods can produce blurry or inaccurate renderings, hampering their use in real-world applications. Instead of encoding the entire dynamic scene within the weights of an MLP, we present a new approach that addresses these limitations by adopting a volumetric image-based rendering framework that synthesizes new viewpoints by aggregating features from nearby views in a scene-motion-aware manner. Our system retains the advantages of prior methods in its ability to model complex scenes and view-dependent effects, but also enables synthesizing photo-realistic novel views from long videos featuring complex scene dynamics with unconstrained camera trajectories. We demonstrate significant improvements over state-of-the-art methods on dynamic scene datasets, and also apply our approach to in-the-wild videos with challenging camera and object motion, where prior methods fail to produce high-quality renderings. Our project webpage is at this http URL.\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n  - [ParticleNeRF: A Particle-Based Encoding for Online Neural Radiance Fields in Dynamic Scenes](https://arxiv.org/abs/2211.04041) | [code]\n    > Neural Radiance Fields (NeRFs) learn implicit representations of - typically static - environments from images. 
Our paper extends NeRFs to handle dynamic scenes in an online fashion. We propose ParticleNeRF that adapts to changes in the geometry of the environment as they occur, learning a new up-to-date representation every 350 ms. ParticleNeRF can represent the current state of dynamic environments with much higher fidelity as other NeRF frameworks. To achieve this, we introduce a new particle-based parametric encoding, which allows the intermediate NeRF features - now coupled to particles in space - to move with the dynamic geometry. This is possible by backpropagating the photometric reconstruction loss into the position of the particles. The position gradients are interpreted as particle velocities and integrated into positions using a position-based dynamics (PBS) physics system. Introducing PBS into the NeRF formulation allows us to add collision constraints to the particle motion and creates future opportunities to add other movement priors into the system, such as rigid and deformable body\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n  - [An Exploration of Neural Radiance Field Scene Reconstruction: Synthetic, Real-world and Dynamic Scenes](https://arxiv.org/abs/2210.12268) | [code]\n    > This project presents an exploration into 3D scene reconstruction of synthetic and real-world scenes using Neural Radiance Field (NeRF) approaches. We primarily take advantage of the reduction in training and rendering time of neural graphic primitives multi-resolution hash encoding, to reconstruct static video game scenes and real-world scenes, comparing and observing reconstruction detail and limitations. Additionally, we explore dynamic scene reconstruction using Neural Radiance Fields for Dynamic Scenes(D-NeRF). 
Finally, we extend the implementation of D-NeRF, originally constrained to handle synthetic scenes to also handle real-world dynamic scenes.\n## Oct9 - Oct15, 2022\n  - [Towards Efficient Neural Scene Graphs by Learning Consistency Fields, BMVC2022](https://arxiv.org/abs/2210.04127) | [***``[code]``***](https://github.com/ldynx/CF-NSG)\n    > Neural Radiance Fields (NeRF) achieves photo-realistic image rendering from novel views, and the Neural Scene Graphs (NSG) \\cite{ost2021neural} extends it to dynamic scenes (video) with multiple objects. Nevertheless, computationally heavy ray marching for every image frame becomes a huge burden. In this paper, taking advantage of significant redundancy across adjacent frames in videos, we propose a feature-reusing framework. From the first try of naively reusing the NSG features, however, we learn that it is crucial to disentangle object-intrinsic properties consistent across frames from transient ones. Our proposed method, \\textit{Consistency-Field-based NSG (CF-NSG)}, reformulates neural radiance fields to additionally consider \\textit{consistency fields}. With disentangled representations, CF-NSG takes full advantage of the feature-reusing scheme and performs an extended degree of scene manipulation in a more controllable manner. We empirically verify that CF-NSG greatly improves the inference efficiency by using 85\\% less queries than NSG without notable degradation in rendering quality. Code will be available at: this https URL\n## Oct2 - Oct8, 2022\n## Sep25 - Oct1, 2022\n## Sep18 - Sep24, 2022\n  - [PREF: Predictability Regularized Neural Motion Fields, ECCV2022(oral)](https://arxiv.org/abs/2209.10691) | [code]\n    > Knowing the 3D motions in a dynamic scene is essential to many vision applications. Recent progress is mainly focused on estimating the activity of some specific elements like humans. In this paper, we leverage a neural motion field for estimating the motion of all points in a multiview setting. 
Modeling the motion from a dynamic scene with multiview data is challenging due to the ambiguities in points of similar color and points with time-varying color. We propose to regularize the estimated motion to be predictable. If the motion from previous frames is known, then the motion in the near future should be predictable. Therefore, we introduce a predictability regularization by first conditioning the estimated motion on latent embeddings, then by adopting a predictor network to enforce predictability on the embeddings. The proposed framework PREF (Predictability REgularized Fields) achieves on par or better results than state-of-the-art neural motion field-based dynamic scene representation methods, while requiring no prior knowledge of the scene.\n## Sep11 - Sep17, 2022\n## Sep4 - Sep10, 2022\n  - [Neural Feature Fusion Fields: 3D Distillation of Self-Supervised 2D Image Representations, 3DV2022(oral)](https://arxiv.org/abs/2209.03494) | [***``[code]``***](https://github.com/dichotomies/N3F)\n    > We present Neural Feature Fusion Fields (N3F), a method that improves dense 2D image feature extractors when the latter are applied to the analysis of multiple images reconstructible as a 3D scene. Given an image feature extractor, for example pre-trained using self-supervision, N3F uses it as a teacher to learn a student network defined in 3D space. The 3D student network is similar to a neural radiance field that distills said features and can be trained with the usual differentiable rendering machinery. As a consequence, N3F is readily applicable to most neural rendering formulations, including vanilla NeRF and its extensions to complex dynamic scenes. We show that our method not only enables semantic understanding in the context of scene-specific neural fields without the use of manual labels, but also consistently improves over the self-supervised 2D baselines. 
This is demonstrated by considering various tasks, such as 2D object retrieval, 3D segmentation, and scene editing, in diverse sequences, including long egocentric videos in the EPIC-KITCHENS benchmark.\n  - [MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model](https://arxiv.org/abs/2208.15001) | [***``[code]``***](https://github.com/mingyuan-zhang/MotionDiffuse)\n    > Human motion modeling is important for many modern graphics applications, which typically require professional skills. In order to remove the skill barriers for laymen, recent motion generation methods can directly generate human motions conditioned on natural languages. However, it remains challenging to achieve diverse and fine-grained motion generation with various text inputs. To address this problem, we propose MotionDiffuse, the first diffusion model-based text-driven motion generation framework, which demonstrates several desired properties over existing methods. 1) Probabilistic Mapping. Instead of a deterministic language-motion mapping, MotionDiffuse generates motions through a series of denoising steps in which variations are injected. 2) Realistic Synthesis. MotionDiffuse excels at modeling complicated data distribution and generating vivid motion sequences. 3) Multi-Level Manipulation. MotionDiffuse responds to fine-grained instructions on body parts, and arbitrary-length motion synthesis with time-varied text prompts. Our experiments show MotionDiffuse outperforms existing SoTA methods by convincing margins on text-driven motion generation and action-conditioned motion generation. A qualitative analysis further demonstrates MotionDiffuse's controllability for comprehensive motion generation. 
Homepage: this https URL\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n  - [E-NeRF: Neural Radiance Fields from a Moving Event Camera](https://arxiv.org/abs/2208.11300) | [code]\n    > Estimating neural radiance fields (NeRFs) from ideal images has been extensively studied in the computer vision community. Most approaches assume optimal illumination and slow camera motion. These assumptions are often violated in robotic applications, where images contain motion blur and the scene may not have suitable illumination. This can cause significant problems for downstream tasks such as navigation, inspection or visualization of the scene. To alleviate these problems we present E-NeRF, the first method which estimates a volumetric scene representation in the form of a NeRF from a fast-moving event camera. Our method can recover NeRFs during very fast motion and in high dynamic range conditions, where frame-based approaches fail. We show that rendering high-quality frames is possible by only providing an event stream as input. Furthermore, by combining events and frames, we can estimate NeRFs of higher quality than state-of-the-art approaches under severe motion blur. We also show that combining events and frames can overcome failure cases of NeRF estimation in scenarios where only few input views are available, without requiring additional regularization.\n## Aug14 - Aug20, 2022\n  - [Neural Capture of Animatable 3D Human from Monocular Video, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > We present a novel paradigm of building an animatable 3D human representation from a monocular video input, such that it can be rendered in any unseen poses and views. Our method is based on a dynamic Neural Radiance Field (NeRF) rigged by a mesh-based parametric 3D human model serving as a geometry proxy. 
Previous methods usually rely on multi-view videos or accurate 3D geometry information as additional inputs; besides, most methods suffer from degraded quality when generalized to unseen poses. We identify that the key to generalization is a good input embedding for querying dynamic NeRF: A good input embedding should define an injective mapping in the full volumetric space, guided by surface mesh deformation under pose variation. Based on this observation, we propose to embed the input query with its relationship to local surface regions spanned by a set of geodesic nearest neighbors on mesh vertices. By including both position and relative distance information, our embedding defines a distance-preserved deformation mapping and generalizes well to unseen poses. To reduce the dependency on additional inputs, we first initialize per-frame 3D meshes using off-the-shelf tools and then propose a pipeline to jointly optimize NeRF and refine the initial mesh. Extensive experiments show our method can synthesize plausible human rendering results under unseen poses and views.\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n  - [NFOMP: Neural Field for Optimal Motion Planner of Differential Drive Robots With Nonholonomic Constraints, IEEE Robotics and Automation Letters](https://ieeexplore.ieee.org/abstract/document/9851532/) | [code]\n    > Optimal motion planning is one of the most critical problems in mobile robotics. On the one hand, classical sampling-based methods propose asymptotically optimal solutions to this problem. However, these planners cannot achieve smooth and short trajectories in reasonable calculation time. On the other hand, optimization-based methods are able to generate smooth and plain trajectories in a variety of scenarios, including a dense human crowd. 
However, modern optimization-based methods use the precomputed signed distance function for collision loss estimation, and it limits the application of these methods for general configuration spaces, including a differential drive non-circular robot with non-holonomic constraints. Moreover, optimization-based methods lack the ability to handle U-shaped or thin obstacles accurately. We propose to improve the optimization methods in two aspects. Firstly, we developed an obstacle neural field model to estimate collision loss; training this model together with trajectory optimization allows improving collision loss continuously, while achieving more feasible and smoother trajectories. Secondly, we forced the trajectory to consider non-holonomic constraints by adding Lagrange multipliers to the trajectory loss function. We applied our method for solving the optimal motion planning problem for differential drive robots with non-holonomic constraints, benchmarked our solution, and proved that the novel planner generates smooth, short, and plain trajectories perfectly suitable for a robot to follow, and outperforms the state-of-the-art approaches by 25% on normalized curvature and by 75% on the number of cusps in the MovingAI environment.\n  - [Controllable Free Viewpoint Video Reconstruction Based on Neural Radiance Fields and Motion Graphs, IEEE Transactions on Visualization and Computer Graphics](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > In this paper, we propose a controllable high-quality free viewpoint video generation method based on the motion graph and neural radiance fields (NeRF). Different from existing pose-driven NeRF or time/structure conditioned NeRF works, we propose to first construct a directed motion graph of the captured sequence. 
Such a sequence-motion-parameterization strategy not only enables flexible pose control for free viewpoint video rendering but also avoids redundant calculation of similar poses and thus improves the overall reconstruction efficiency. Moreover, to support body shape control without losing the realistic free viewpoint rendering performance, we improve the vanilla NeRF by combining explicit surface deformation and implicit neural scene representations. Specifically, we train a local surface-guided NeRF for each valid frame on the motion graph, and the volumetric rendering was only performed in the local space around the real surface, thus enabling plausible shape control ability. As far as we know, our method is the first method that supports both realistic free viewpoint video reconstruction and motion graph-based user-guided motion traversal. The results and comparisons further demonstrate the effectiveness of the proposed method.\n  - [Robust Change Detection Based on Neural Descriptor Fields, IROS2022](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > The ability to reason about changes in the environment is crucial for robots operating over extended periods of time. Agents are expected to capture changes during operation so that actions can be followed to ensure a smooth progression of the working session. However, varying viewing angles and accumulated localization errors make it easy for robots to falsely detect changes in the surrounding world due to low observation overlap and drifted object associations. In this paper, based on the recently proposed category-level Neural Descriptor Fields (NDFs), we develop an object-level online change detection approach that is robust to partially overlapping observations and noisy localization results. Utilizing the shape completion capability and SE(3)-equivariance of NDFs, we represent objects with compact shape codes encoding full object shapes from partial observations. 
The objects are then organized in a spatial tree structure based on object centers recovered from NDFs for fast queries of object neighborhoods. By associating objects via shape code similarity and comparing local object-neighbor spatial layout, our proposed approach demonstrates robustness to low observation overlap and localization noises. We conduct experiments on both synthetic and real-world sequences and achieve improved change detection results compared to multiple baseline methods. Project webpage: this https URL\n## Jul24 - Jul30, 2022\n  - [Deforming Radiance Fields with Cages, ECCV2022](https://arxiv.org/abs/2207.12298) | [code]\n    > Recent advances in radiance fields enable photorealistic rendering of static or dynamic 3D scenes, but still do not support explicit deformation that is used for scene manipulation or animation. In this paper, we propose a method that enables a new type of deformation of the radiance field: free-form radiance field deformation. We use a triangular mesh that encloses the foreground object called cage as an interface, and by manipulating the cage vertices, our approach enables the free-form deformation of the radiance field. The core of our approach is cage-based deformation which is commonly used in mesh deformation. We propose a novel formulation to extend it to the radiance field, which maps the position and the view direction of the sampling points from the deformed space to the canonical space, thus enabling the rendering of the deformed scene. 
The deformation results of the synthetic datasets and the real-world datasets demonstrate the effectiveness of our approach.\n## Previous weeks\n  - [D-NeRF: Neural Radiance Fields for Dynamic Scenes, CVPR2021](https://arxiv.org/abs/2011.13961) | [***``[code]``***](https://github.com/albertpumarola/D-NeRF)\n    > Neural rendering techniques combining machine learning with geometric reasoning have arisen as one of the most promising approaches for synthesizing novel views of a scene from a sparse set of images. Among these, stands out the Neural radiance fields (NeRF), which trains a deep network to map 5D input coordinates (representing spatial location and viewing direction) into a volume density and view-dependent emitted radiance. However, despite achieving an unprecedented level of photorealism on the generated images, NeRF is only applicable to static scenes, where the same spatial location can be queried from different images. In this paper we introduce D-NeRF, a method that extends neural radiance fields to a dynamic domain, allowing to reconstruct and render novel images of objects under rigid and non-rigid motions from a \\emph{single} camera moving around the scene. For this purpose we consider time as an additional input to the system, and split the learning process in two main stages: one that encodes the scene into a canonical space and another that maps this canonical representation into the deformed scene at a particular time. Both mappings are simultaneously learned using fully-connected networks. Once the networks are trained, D-NeRF can render novel images, controlling both the camera view and the time variable, and thus, the object movement. We demonstrate the effectiveness of our approach on scenes with objects under rigid, articulated and non-rigid motions. 
Code, model weights and the dynamic scenes dataset will be released.\n  - [Dynamic Neural Radiance Fields for Monocular 4D Facial Avatar Reconstruction, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > We present dynamic neural radiance fields for modeling the appearance and dynamics of a human face. Digitally modeling and reconstructing a talking human is a key building-block for a variety of applications. Especially, for telepresence applications in AR or VR, a faithful reproduction of the appearance including novel viewpoint or head-poses is required. In contrast to state-of-the-art approaches that model the geometry and material properties explicitly, or are purely image-based, we introduce an implicit representation of the head based on scene representation networks. To handle the dynamics of the face, we combine our scene representation network with a low-dimensional morphable model which provides explicit control over pose and expressions. We use volumetric rendering to generate images from this hybrid representation and demonstrate that such a dynamic neural scene representation can be learned from monocular input data only, without the need of a specialized capture setup. In our experiments, we show that this learned volumetric representation allows for photo-realistic image generation that surpasses the quality of state-of-the-art video-based reenactment methods.\n  - [Non-Rigid Neural Radiance Fields: Reconstruction and Novel View Synthesis of a Deforming Scene from Monocular Video, ICCV2021](https://vcai.mpi-inf.mpg.de/projects/nonrigid_nerf/) | [***``[code]``***](https://github.com/facebookresearch/nonrigid_nerf)\n    > We present Non-Rigid Neural Radiance Fields (NR-NeRF), a reconstruction and novel view synthesis approach for general non-rigid dynamic scenes. 
Our approach takes RGB images of a dynamic scene as input (e.g., from a monocular video recording), and creates a high-quality space-time geometry and appearance representation. We show that a single handheld consumer-grade camera is sufficient to synthesize sophisticated renderings of a dynamic scene from novel virtual camera views, e.g. a `bullet-time' video effect. NR-NeRF disentangles the dynamic scene into a canonical volume and its deformation. Scene deformation is implemented as ray bending, where straight rays are deformed non-rigidly. We also propose a novel rigidity network to better constrain rigid regions of the scene, leading to more stable results. The ray bending and rigidity network are trained without explicit supervision. Our formulation enables dense correspondence estimation across views and time, and compelling video editing applications such as motion exaggeration. Our code will be open sourced.\n  - [Neural Body: Implicit Neural Representations with Structured Latent Codes for Novel View Synthesis of Dynamic Humans, CVPR2021](https://zju3dv.github.io/neuralbody/) | [***``[code]``***](https://github.com/zju3dv/neuralbody)\n    > This paper addresses the challenge of novel view synthesis for a human performer from a very sparse set of camera views. Some recent works have shown that learning implicit neural representations of 3D scenes achieves remarkable view synthesis quality given dense input views. However, the representation learning will be ill-posed if the views are highly sparse. To solve this ill-posed problem, our key idea is to integrate observations over video frames. To this end, we propose Neural Body, a new human body representation which assumes that the learned neural representations at different frames share the same set of latent codes anchored to a deformable mesh, so that the observations across frames can be naturally integrated. 
The deformable mesh also provides geometric guidance for the network to learn 3D representations more efficiently. To evaluate our approach, we create a multi-view dataset named ZJU-MoCap that captures performers with complex motions. Experiments on ZJU-MoCap show that our approach outperforms prior works by a large margin in terms of novel view synthesis quality. We also demonstrate the capability of our approach to reconstruct a moving person from a monocular video on the People-Snapshot dataset.\n  - [Dynamic View Synthesis from Dynamic Monocular Video, ICCV2021](https://free-view-video.github.io/) | [***``[code]``***](https://github.com/gaochen315/DynamicNeRF)\n    > We present an algorithm for generating novel views at arbitrary viewpoints and any input time step given a monocular video of a dynamic scene. Our work builds upon recent advances in neural implicit representation and uses continuous and differentiable functions for modeling the time-varying structure and the appearance of the scene. We jointly train a time-invariant static NeRF and a time-varying dynamic NeRF, and learn how to blend the results in an unsupervised manner. However, learning this implicit function from a single video is highly ill-posed (with infinitely many solutions that match the input video). To resolve the ambiguity, we introduce regularization losses to encourage a more physically plausible solution. We show extensive quantitative and qualitative results of dynamic view synthesis from casually captured videos.\n  - [TöRF: Time-of-Flight Radiance Fields for Dynamic Scene View Synthesis, NeurIPS2021](https://imaging.cs.cmu.edu/torf/) | [***``[code]``***](https://github.com/breuckelen/torf)\n    > Neural networks can represent and accurately reconstruct radiance fields for static 3D scenes (e.g., NeRF). Several works extend these to dynamic scenes captured with monocular video, with promising performance. 
However, the monocular setting is known to be an under-constrained problem, and so methods rely on data-driven priors for reconstructing dynamic content. We replace these priors with measurements from a time-of-flight (ToF) camera, and introduce a neural representation based on an image formation model for continuous-wave ToF cameras. Instead of working with processed depth maps, we model the raw ToF sensor measurements to improve reconstruction quality and avoid issues with low reflectance regions, multi-path interference, and a sensor's limited unambiguous depth range. We show that this approach improves robustness of dynamic scene reconstruction to erroneous calibration and large motions, and discuss the benefits and limitations of integrating RGB+ToF sensors that are now available on modern smartphones.\n  - [Object-Centric Neural Scene Rendering](https://shellguo.com/osf/) | [***``[code]``***](https://shellguo.com/osf/)\n    > We present a method for composing photorealistic scenes from captured images of objects. Our work builds upon neural radiance fields (NeRFs), which implicitly model the volumetric density and directionally-emitted radiance of a scene. While NeRFs synthesize realistic pictures, they only model static scenes and are closely tied to specific imaging conditions. This property makes NeRFs hard to generalize to new scenarios, including new lighting or new arrangements of objects. Instead of learning a scene radiance field as a NeRF does, we propose to learn object-centric neural scattering functions (OSFs), a representation that models per-object light transport implicitly using a lighting- and view-dependent neural network. This enables rendering scenes even when objects or lights move, without retraining. Combined with a volumetric path tracing procedure, our framework is capable of rendering both intra- and inter-object light transport effects including occlusions, specularities, shadows, and indirect illumination. 
We evaluate our approach on scene composition and show that it generalizes to novel illumination conditions, producing photorealistic, physically accurate renderings of multi-object scenes.\n  - [Learning Compositional Radiance Fields of Dynamic Human Heads, CVPR2021(oral)](https://ziyanw1.github.io/hybrid_nerf/) | [code]\n    > Photorealistic rendering of dynamic humans is an important ability for telepresence systems, virtual shopping, synthetic data generation, and more. Recently, neural rendering methods, which combine techniques from computer graphics and machine learning, have created high-fidelity models of humans and objects. Some of these methods do not produce results with high-enough fidelity for driveable human models (Neural Volumes) whereas others have extremely long rendering times (NeRF). We propose a novel compositional 3D representation that combines the best of previous methods to produce both higher-resolution and faster results. Our representation bridges the gap between discrete and continuous volumetric representations by combining a coarse 3D-structure-aware grid of animation codes with a continuous learned scene function that maps every position and its corresponding local animation code to its view-dependent emitted radiance and local volume density. Differentiable volume rendering is employed to compute photo-realistic novel views of the human head and upper body as well as to train our novel representation end-to-end using only 2D supervision. In addition, we show that the learned dynamic radiance field can be used to synthesize novel unseen expressions based on a global animation code. 
Our approach achieves state-of-the-art results for synthesizing novel views of dynamic human heads and the upper body.\n  - [Neural Scene Graphs for Dynamic Scenes, CVPR2021(oral)](https://arxiv.org/abs/2011.10379) | [***``[code]``***](https://github.com/princeton-computational-imaging/neural-scene-graphs)\n    > Recent implicit neural rendering methods have demonstrated that it is possible to learn accurate view synthesis for complex scenes by predicting their volumetric density and color supervised solely by a set of RGB images. However, existing methods are restricted to learning efficient representations of static scenes that encode all scene objects into a single neural network, and lack the ability to represent dynamic scenes and decompositions into individual scene objects. In this work, we present the first neural rendering method that decomposes dynamic scenes into scene graphs. We propose a learned scene graph representation, which encodes object transformation and radiance, to efficiently render novel arrangements and views of the scene. To this end, we learn implicitly encoded scenes, combined with a jointly learned latent representation to describe objects with a single implicit function. We assess the proposed method on synthetic and real automotive data, validating that our approach learns dynamic scenes -- only by observing a video of this scene -- and allows for rendering novel photo-realistic views of novel scene compositions with unseen sets of objects at unseen poses.\n  - [3D Neural Scene Representations for Visuomotor Control, CoRL2021(oral)](https://3d-representation-learning.github.io/nerf-dy/) | [code]\n    > Humans have a strong intuitive understanding of the 3D environment around us. The mental model of the physics in our brain applies to objects of different materials and enables us to perform a wide range of manipulation tasks that are far beyond the reach of current robots. 
In this work, we desire to learn models for dynamic 3D scenes purely from 2D visual observations. Our model combines Neural Radiance Fields (NeRF) and time contrastive learning with an autoencoding framework, which learns viewpoint-invariant 3D-aware scene representations. We show that a dynamics model, constructed over the learned representation space, enables visuomotor control for challenging manipulation tasks involving both rigid bodies and fluids, where the target is specified in a viewpoint different from what the robot operates on. When coupled with an auto-decoding framework, it can even support goal specification from camera viewpoints that are outside the training distribution. We further demonstrate the richness of the learned 3D dynamics model by performing future prediction and novel view synthesis. Finally, we provide detailed ablation studies regarding different system designs and qualitative analysis of the learned representations.\n  - [Vision-Only Robot Navigation in a Neural Radiance World](https://arxiv.org/abs/2110.00168) | [code]\n    > Neural Radiance Fields (NeRFs) have recently emerged as a powerful paradigm for the representation of natural, complex 3D scenes. NeRFs represent continuous volumetric density and RGB values in a neural network, and generate photo-realistic images from unseen camera viewpoints through ray tracing. We propose an algorithm for navigating a robot through a 3D environment represented as a NeRF using only an on-board RGB camera for localization. We assume the NeRF for the scene has been pre-trained offline, and the robot's objective is to navigate through unoccupied space in the NeRF to reach a goal pose. We introduce a trajectory optimization algorithm that avoids collisions with high-density regions in the NeRF based on a discrete time version of differential flatness that is amenable to constraining the robot's full pose and control inputs. 
We also introduce an optimization based filtering method to estimate 6DoF pose and velocities for the robot in the NeRF given only an onboard RGB camera. We combine the trajectory planner with the pose filter in an online replanning loop to give a vision-based robot navigation pipeline. We present simulation results with a quadrotor robot navigating through a jungle gym environment, the inside of a church, and Stonehenge using only an RGB camera. We also demonstrate an omnidirectional ground robot navigating through the church, requiring it to reorient to fit through the narrow gap. Videos of this work can be found at this https URL .\n"
  },
  {
    "path": "docs/classified_weekly_nerf/editing.md",
    "content": "\nWeekly Classified Neural Radiance Fields - editing ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n===================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n  - [Removing Objects From Neural Radiance Fields](https://arxiv.org/abs/2212.11966) | [code]\n    > Neural Radiance Fields (NeRFs) are emerging as a ubiquitous scene representation that allows for novel view synthesis. Increasingly, NeRFs will be shareable with other people. Before sharing a NeRF, though, it might be desirable to remove personal information or unsightly objects. Such removal is not easily achieved with the current NeRF editing frameworks. We propose a framework to remove objects from a NeRF representation created from an RGB-D sequence. Our NeRF inpainting method leverages recent work in 2D image inpainting and is guided by a user-provided mask. Our algorithm is underpinned by a confidence based view selection procedure. It chooses which of the individual 2D inpainted images to use in the creation of the NeRF, so that the resulting inpainted NeRF is 3D consistent. We show that our method for NeRF editing is effective for synthesizing plausible inpaintings in a multi-view coherent manner. 
We validate our approach using a new and still-challenging dataset for the task of NeRF inpainting.\n## Dec11 - Dec17, 2022\n  - [NeRF-Art: Text-Driven Neural Radiance Fields Stylization](https://arxiv.org/abs/2212.08070) | [***``[code]``***](https://cassiepython.github.io/nerfart/)\n    > As a powerful representation of 3D scenes, the neural radiance field (NeRF) enables high-quality novel view synthesis from multi-view images. Stylizing NeRF, however, remains challenging, especially on simulating a text-guided style with both the appearance and the geometry altered simultaneously. In this paper, we present NeRF-Art, a text-guided NeRF stylization approach that manipulates the style of a pre-trained NeRF model with a simple text prompt. Unlike previous approaches that either lack sufficient geometry deformations and texture details or require meshes to guide the stylization, our method can shift a 3D scene to the target style characterized by desired geometry and appearance variations without any mesh guidance. This is achieved by introducing a novel global-local contrastive learning strategy, combined with the directional constraint to simultaneously control both the trajectory and the strength of the target style. Moreover, we adopt a weight regularization method to effectively suppress cloudy artifacts and geometry noises which arise easily when the density field is transformed during geometry stylization. Through extensive experiments on various styles, we demonstrate that our method is effective and robust regarding both single-view stylization quality and cross-view consistency. 
The code and more results can be found in our project page: this https URL.\n## Dec4 - Dec10, 2022\n  - [Ref-NPR: Reference-Based Non-Photorealistic Radiance Fields](https://arxiv.org/abs/2212.02766) | [code]\n    > Existing 3D scene stylization methods employ an arbitrary style reference to transfer textures and colors as styles without establishing meaningful semantic correspondences. We present Reference-Based Non-Photorealistic Radiance Fields, i.e., Ref-NPR. It is a controllable scene stylization method utilizing radiance fields to stylize a 3D scene, with a single stylized 2D view taken as reference. To achieve decent results, we propose a ray registration process based on the stylized reference view to obtain pseudo-ray supervision in novel views, and exploit the semantic correspondence in content images to fill occluded regions with perceptually similar styles. Combining these operations, Ref-NPR generates non-photorealistic and continuous novel view sequences with a single reference while obtaining reasonable stylization in occluded regions. Experiments show that Ref-NPR significantly outperforms other scene and video stylization methods in terms of both visual quality and semantic correspondence. Code and data will be made publicly available.\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n  - [Learning-based Inverse Rendering of Complex Indoor Scenes with Differentiable Monte Carlo Raytracing, SIGGRAPH-Asia2022](https://jingsenzhu.github.io/invrend/) | [code]\n    > We present a learning-based approach for inverse rendering of complex indoor scenes with differentiable Monte Carlo raytracing. Our method takes a single indoor scene RGB image as input and automatically infers its underlying surface reflectance, geometry, and spatially-varying illumination. 
This enables us to perform photorealistic editing of the scene, such as inserting multiple complex virtual objects and editing surface materials faithfully with global illumination.\n## Oct30 - Nov5, 2022\n  - [gCoRF: Generative Compositional Radiance Fields, 3DV2022](https://vcai.mpi-inf.mpg.de/projects/gCoRF/) | [code]\n    > 3D generative models of objects enable photorealistic image synthesis with 3D control. Existing methods model the scene as a global scene representation, ignoring the compositional aspect of the scene. Compositional reasoning can enable a wide variety of editing applications, in addition to enabling generalizable 3D reasoning. In this paper, we present a compositional generative model, where each semantic part of the object is represented as an independent 3D representation learnt from only in-the-wild 2D data. We start with a global generative model (GAN) and learn to decompose it into different semantic parts using supervision from 2D segmentation masks. We then learn to composite independently sampled parts in order to create coherent global scenes. Different parts can be independently sampled, while keeping rest of the object fixed. We evaluate our method on a wide variety of objects and parts, and demonstrate editing applications.\n## Oct23 - Oct29, 2022\n  - [Boosting Point Clouds Rendering via Radiance Mapping](https://arxiv.org/abs/2210.15107) | [code]\n    > Recent years we have witnessed rapid development in NeRF-based image rendering due to its high quality. However, point clouds rendering is somehow less explored. Compared to NeRF-based rendering which suffers from dense spatial sampling, point clouds rendering is naturally less computation intensive, which enables its deployment in mobile computing device. In this work, we focus on boosting the image quality of point clouds rendering with a compact model design. We first analyze the adaption of the volume rendering formulation on point clouds. 
Based on the analysis, we simplify the NeRF representation to a spatial mapping function which only requires single evaluation per pixel. Further, motivated by ray marching, we rectify the noisy raw point clouds to the estimated intersection between rays and surfaces as queried coordinates, which could avoid spatial frequency collapse and neighbor point disturbance. Composed of rasterization, spatial mapping and the refinement stages, our method achieves the state-of-the-art performance on point clouds rendering, outperforming prior works by notable margins, with a smaller model size. We obtain a PSNR of 31.74 on NeRF-Synthetic, 25.88 on ScanNet and 30.81 on DTU. Code and data would be released soon.\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n  - [LB-NERF: Light Bending Neural Radiance Fields for Transparent Medium, ICIP2022](https://ieeexplore.ieee.org/abstract/document/9897642) | [code]\n    > Neural radiance fields (NeRFs) have been proposed as methods of novel view synthesis and have been used to address various problems because of its versatility. NeRF can represent colors and densities in 3D space using neural rendering assuming a straight light path. However, a medium with a different refractive index in the scene, such as a transparent medium, causes light refraction and breaks the assumption of the straight path of light. Therefore, the NeRFs cannot be learned consistently across multi-view images. To solve this problem, this study proposes a method to learn consistent radiance fields across multiple viewpoints by introducing the light refraction effect as an offset from the straight line originating from the camera center. 
The experimental results quantitatively and qualitatively verified that our method can interpolate viewpoints better than the conventional NeRF method when considering the refraction of transparent objects.\n  - [Controllable Style Transfer via Test-time Training of Implicit Neural Representation](https://arxiv.org/abs/2210.07762) | [code]\n    > We propose a controllable style transfer framework based on Implicit Neural Representation that pixel-wisely controls the stylized output via test-time training. Unlike traditional image optimization methods that often suffer from unstable convergence and learning-based methods that require intensive training and have limited generalization ability, we present a model optimization framework that optimizes the neural networks during test-time with explicit loss functions for style transfer. After being test-time trained once, thanks to the flexibility of the INR-based model, our framework can precisely control the stylized images in a pixel-wise manner and freely adjust image resolution without further optimization or training. We demonstrate several applications.\n  - [Neural Shape Deformation Priors, NeurIPS2022](https://arxiv.org/abs/2210.05616) | [code]\n    > We present Neural Shape Deformation Priors, a novel method for shape manipulation that predicts mesh deformations of non-rigid objects from user-provided handle movements. State-of-the-art methods cast this problem as an optimization task, where the input source mesh is iteratively deformed to minimize an objective function according to hand-crafted regularizers such as ARAP. In this work, we learn the deformation behavior based on the underlying geometric properties of a shape, while leveraging a large-scale dataset containing a diverse set of non-rigid deformations. 
Specifically, given a source mesh and desired target locations of handles that describe the partial surface deformation, we predict a continuous deformation field that is defined in 3D space to describe the space deformation. To this end, we introduce transformer-based deformation networks that represent a shape deformation as a composition of local surface deformations. It learns a set of local latent codes anchored in 3D space, from which we can learn a set of continuous deformation functions for local surfaces. Our method can be applied to challenging deformations and generalizes well to unseen deformations. We validate our approach in experiments using the DeformingThing4D dataset, and compare to both classic optimization-based and recent neural network-based methods.\n## Oct2 - Oct8, 2022\n  - [Unsupervised Multi-View Object Segmentation Using Radiance Field Propagation, NeurIPS2022](https://arxiv.org/abs/2210.00489) | [code]\n    > We present radiance field propagation (RFP), a novel approach to segmenting objects in 3D during reconstruction given only unlabeled multi-view images of a scene. RFP is derived from emerging neural radiance field-based techniques, which jointly encodes semantics with appearance and geometry. The core of our method is a novel propagation strategy for individual objects' radiance fields with a bidirectional photometric loss, enabling an unsupervised partitioning of a scene into salient or meaningful regions corresponding to different object instances. To better handle complex scenes with multiple objects and occlusions, we further propose an iterative expectation-maximization algorithm to refine object masks. To the best of our knowledge, RFP is the first unsupervised approach for tackling 3D scene object segmentation for neural radiance field (NeRF) without any supervision, annotations, or other cues such as 3D bounding boxes and prior knowledge of object class. 
Experiments demonstrate that RFP achieves feasible segmentation results that are more accurate than previous unsupervised image/scene segmentation approaches, and are comparable to existing supervised NeRF-based methods. The segmented object representations enable individual 3D object editing operations.\n## Sep25 - Oct1, 2022\n## Sep18 - Sep24, 2022\n## Sep11 - Sep17, 2022\n  - [3DMM-RF: Convolutional Radiance Fields for 3D Face Modeling](https://arxiv.org/abs/2209.07366) | [code]\n    > Facial 3D Morphable Models are a main computer vision subject with countless applications and have been highly optimized in the last two decades. The tremendous improvements of deep generative networks have created various possibilities for improving such models and have attracted wide interest. Moreover, the recent advances in neural radiance fields, are revolutionising novel-view synthesis of known scenes. In this work, we present a facial 3D Morphable Model, which exploits both of the above, and can accurately model a subject's identity, pose and expression and render it in arbitrary illumination. This is achieved by utilizing a powerful deep style-based generator to overcome two main weaknesses of neural radiance fields, their rigidity and rendering speed. We introduce a style-based generative network that synthesizes in one pass all and only the required rendering samples of a neural radiance field. We create a vast labelled synthetic dataset of facial renders, and train the network on these data, so that it can accurately model and generalize on facial identity, pose and appearance. 
Finally, we show that this model can accurately be fit to \"in-the-wild\" facial images of arbitrary pose and illumination, extract the facial characteristics, and be used to re-render the face in controllable conditions.\n## Sep4 - Sep10, 2022\n  - [SIRA: Relightable Avatars from a Single Image](https://arxiv.org/abs/2209.03027) | [code]\n    > Recovering the geometry of a human head from a single image, while factorizing the materials and illumination is a severely ill-posed problem that requires prior information to be solved. Methods based on 3D Morphable Models (3DMM), and their combination with differentiable renderers, have shown promising results. However, the expressiveness of 3DMMs is limited, and they typically yield over-smoothed and identity-agnostic 3D shapes limited to the face region. Highly accurate full head reconstructions have recently been obtained with neural fields that parameterize the geometry using multilayer perceptrons. The versatility of these representations has also proved effective for disentangling geometry, materials and lighting. However, these methods require several tens of input images. In this paper, we introduce SIRA, a method which, from a single image, reconstructs human head avatars with high fidelity geometry and factorized lights and surface materials. Our key ingredients are two data-driven statistical models based on neural fields that resolve the ambiguities of single-view 3D surface reconstruction and appearance factorization. Experiments show that SIRA obtains state of the art results in 3D head reconstruction while at the same time it successfully disentangles the global illumination, and the diffuse and specular albedos. 
Furthermore, our reconstructions are amenable to physically-based appearance editing and head model relighting.\n## Aug28 - Sep3, 2022\n  - [NerfCap: Human Performance Capture With Dynamic Neural Radiance Fields, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9870173) | [code]\n    > This paper addresses the challenge of human performance capture from sparse multi-view or monocular videos. Given a template mesh of the performer, previous methods capture the human motion by non-rigidly registering the template mesh to images with 2D silhouettes or dense photometric alignment. However, the detailed surface deformation cannot be recovered from the silhouettes, while the photometric alignment suffers from instability caused by appearance variation in the videos. To solve these problems, we propose NerfCap, a novel performance capture method based on the dynamic neural radiance field (NeRF) representation of the performer. Specifically, a canonical NeRF is initialized from the template geometry and registered to the video frames by optimizing the deformation field and the appearance model of the canonical NeRF. To capture both large body motion and detailed surface deformation, NerfCap combines linear blend skinning with embedded graph deformation. In contrast to the mesh-based methods that suffer from fixed topology and texture, NerfCap is able to flexibly capture complex geometry and appearance variation across the videos, and synthesize more photo-realistic images. In addition, NerfCap can be pre-trained end to end in a self-supervised manner by matching the synthesized videos with the input videos. 
Experimental results on various datasets show that NerfCap outperforms prior works in terms of both surface reconstruction accuracy and novel-view synthesis quality.\n## Aug21 - Aug27, 2022\n  - [Training and Tuning Generative Neural Radiance Fields for Attribute-Conditional 3D-Aware Face Generation](https://arxiv.org/abs/2208.12550) | [***``[code]``***](https://github.com/zhangqianhui/TT-GNeRF)\n    > 3D-aware GANs based on generative neural radiance fields (GNeRF) have achieved impressive high-quality image generation, while preserving strong 3D consistency. The most notable achievements are made in the face generation domain. However, most of these models focus on improving view consistency but neglect a disentanglement aspect, thus these models cannot provide high-quality semantic/attribute control over generation. To this end, we introduce a conditional GNeRF model that uses specific attribute labels as input in order to improve the controllabilities and disentangling abilities of 3D-aware generative models. We utilize the pre-trained 3D-aware model as the basis and integrate a dual-branches attribute-editing module (DAEM), that utilize attribute labels to provide control over generation. Moreover, we propose a TRIOT (TRaining as Init, and Optimizing for Tuning) method to optimize the latent vector to improve the precision of the attribute-editing further. Extensive experiments on the widely used FFHQ show that our model yields high-quality editing with better view consistency while preserving the non-target regions. The code is available at this https URL.\n  - [DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation](https://dreambooth.github.io/) | [code]\n    > Large text-to-image models achieved a remarkable leap in the evolution of AI, enabling high-quality and diverse synthesis of images from a given text prompt. 
However, these models lack the ability to mimic the appearance of subjects in a given reference set and synthesize novel renditions of them in different contexts. In this work, we present a new approach for \"personalization\" of text-to-image diffusion models (specializing them to users' needs). Given as input just a few images of a subject, we fine-tune a pretrained text-to-image model (Imagen, although our method is not limited to a specific model) such that it learns to bind a unique identifier with that specific subject. Once the subject is embedded in the output domain of the model, the unique identifier can then be used to synthesize fully-novel photorealistic images of the subject contextualized in different scenes. By leveraging the semantic prior embedded in the model with a new autogenous class-specific prior preservation loss, our technique enables synthesizing the subject in diverse scenes, poses, views, and lighting conditions that do not appear in the reference images. We apply our technique to several previously-unassailable tasks, including subject recontextualization, text-guided view synthesis, appearance modification, and artistic rendering (all while preserving the subject's key features). Project page: this https URL\n  - [FurryGAN: High Quality Foreground-aware Image Synthesis, ECCV2022](https://jeongminb.github.io/FurryGAN/) | [***``[code]``***](https://jeongminb.github.io/FurryGAN/)\n    > Foreground-aware image synthesis aims to generate images as well as their foreground masks. A common approach is to formulate an image as a masked blending of a foreground image and a background image. It is a challenging problem because it is prone to reach the trivial solution where either image overwhelms the other, i.e., the masks become completely full or empty, and the foreground and background are not meaningfully separated. 
We present FurryGAN with three key components: 1) imposing both the foreground image and the composite image to be realistic, 2) designing a mask as a combination of coarse and fine masks, and 3) guiding the generator by an auxiliary mask predictor in the discriminator. Our method produces realistic images with remarkably detailed alpha masks which cover hair, fur, and whiskers in a fully unsupervised manner.\n## Aug14 - Aug20, 2022\n  - [Vox-Surf: Voxel-based Implicit Surface Representation](https://arxiv.org/abs/2208.10925) | [code]\n    > Virtual content creation and interaction play an important role in modern 3D applications such as AR and VR. Recovering detailed 3D models from real scenes can significantly expand the scope of its applications and has been studied for decades in the computer vision and computer graphics community. We propose Vox-Surf, a voxel-based implicit surface representation. Our Vox-Surf divides the space into finite bounded voxels. Each voxel stores geometry and appearance information in its corner vertices. Vox-Surf is suitable for almost any scenario thanks to sparsity inherited from voxel representation and can be easily trained from multiple view images. 
We leverage the progressive training procedure to extract important voxels gradually for further optimization so that only valid voxels are preserved, which greatly reduces the number of sampling points and increases rendering speed. The fine voxels can also be considered as the bounding volume for collision detection. The experiments show that Vox-Surf representation can learn delicate surface details and accurate color with less memory and faster rendering speed than other methods. We also show that Vox-Surf can be more practical in scene editing and AR applications.\n  - [DM-NeRF: 3D Scene Geometry Decomposition and Manipulation from 2D Images](https://arxiv.org/abs/2208.07227) | [***``[code]``***](https://github.com/vLAR-group/DM-NeRF)\n    > In this paper, we study the problem of 3D scene geometry decomposition and manipulation from 2D views. By leveraging the recent implicit neural representation techniques, particularly the appealing neural radiance fields, we introduce an object field component to learn unique codes for all individual objects in 3D space only from 2D supervision. The key to this component is a series of carefully designed loss functions to enable every 3D point, especially in non-occupied space, to be effectively optimized even without 3D labels. In addition, we introduce an inverse query algorithm to freely manipulate any specified 3D object shape in the learned scene representation. Notably, our manipulation algorithm can explicitly tackle key issues such as object collisions and visual occlusions. Our method, called DM-NeRF, is among the first to simultaneously reconstruct, decompose, manipulate and render complex 3D scenes in a single pipeline. 
Extensive experiments on three datasets clearly show that our method can accurately decompose all 3D objects from 2D views, allowing any interested object to be freely manipulated in 3D space such as translation, rotation, size adjustment, and deformation.\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n  - [VolTeMorph: Realtime, Controllable and Generalisable Animation of Volumetric Representations](https://arxiv.org/pdf/2208.00949) | [code]\n    > The recent increase in popularity of volumetric representations for scene reconstruction and novel view synthesis has put renewed focus on animating volumetric content at high visual quality and in real-time. While implicit deformation methods based on learned functions can produce impressive results, they are `black boxes' to artists and content creators, they require large amounts of training data to generalise meaningfully, and they do not produce realistic extrapolations outside the training data. In this work we solve these issues by introducing a volume deformation method which is real-time, easy to edit with off-the-shelf software and can extrapolate convincingly. To demonstrate the versatility of our method, we apply it in two scenarios: physics-based object deformation and telepresence where avatars are controlled using blendshapes. We also perform thorough experiments showing that our method compares favourably to both volumetric approaches combined with implicit deformation and methods based on mesh deformation.\n  - [Controllable Free Viewpoint Video Reconstruction Based on Neural Radiance Fields and Motion Graphs, IEEE Transactions on Visualization and Computer Graphics](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > In this paper, we propose a controllable high-quality free viewpoint video generation method based on the motion graph and neural radiance fields (NeRF). 
Different from existing pose-driven NeRF or time/structure conditioned NeRF works, we propose to first construct a directed motion graph of the captured sequence. Such a sequence-motion-parameterization strategy not only enables flexible pose control for free viewpoint video rendering but also avoids redundant calculation of similar poses and thus improves the overall reconstruction efficiency. Moreover, to support body shape control without losing the realistic free viewpoint rendering performance, we improve the vanilla NeRF by combining explicit surface deformation and implicit neural scene representations. Specifically, we train a local surface-guided NeRF for each valid frame on the motion graph, and the volumetric rendering was only performed in the local space around the real surface, thus enabling plausible shape control ability. As far as we know, our method is the first method that supports both realistic free viewpoint video reconstruction and motion graph-based user-guided motion traversal. The results and comparisons further demonstrate the effectiveness of the proposed method.\n  - [Robust Change Detection Based on Neural Descriptor Fields, IROS2022](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > The ability to reason about changes in the environment is crucial for robots operating over extended periods of time. Agents are expected to capture changes during operation so that actions can be followed to ensure a smooth progression of the working session. However, varying viewing angles and accumulated localization errors make it easy for robots to falsely detect changes in the surrounding world due to low observation overlap and drifted object associations. In this paper, based on the recently proposed category-level Neural Descriptor Fields (NDFs), we develop an object-level online change detection approach that is robust to partially overlapping observations and noisy localization results. 
Utilizing the shape completion capability and SE(3)-equivariance of NDFs, we represent objects with compact shape codes encoding full object shapes from partial observations. The objects are then organized in a spatial tree structure based on object centers recovered from NDFs for fast queries of object neighborhoods. By associating objects via shape code similarity and comparing local object-neighbor spatial layout, our proposed approach demonstrates robustness to low observation overlap and localization noises. We conduct experiments on both synthetic and real-world sequences and achieve improved change detection results compared to multiple baseline methods. Project webpage: this https URL\n## Jul24 - Jul30, 2022\n  - [MobileNeRF: Exploiting the Polygon Rasterization Pipeline for Efficient Neural Field Rendering on Mobile Architectures](https://arxiv.org/abs/2208.00277) | [***``[code]``***](https://github.com/google-research/jax3d/tree/main/jax3d/projects/mobilenerf)\n    > Neural Radiance Fields (NeRFs) have demonstrated amazing ability to synthesize images of 3D scenes from novel views. However, they rely upon specialized volumetric rendering algorithms based on ray marching that are mismatched to the capabilities of widely deployed graphics hardware. This paper introduces a new NeRF representation based on textured polygons that can synthesize novel images efficiently with standard rendering pipelines. The NeRF is represented as a set of polygons with textures representing binary opacities and feature vectors. Traditional rendering of the polygons with a z-buffer yields an image with features at every pixel, which are interpreted by a small, view-dependent MLP running in a fragment shader to produce a final pixel color. 
This approach enables NeRFs to be rendered with the traditional polygon rasterization pipeline, which provides massive pixel-level parallelism, achieving interactive frame rates on a wide range of compute platforms, including mobile phones.\n  - [Deforming Radiance Fields with Cages, ECCV2022](https://arxiv.org/abs/2207.12298) | [code]\n    > Recent advances in radiance fields enable photorealistic rendering of static or dynamic 3D scenes, but still do not support explicit deformation that is used for scene manipulation or animation. In this paper, we propose a method that enables a new type of deformation of the radiance field: free-form radiance field deformation. We use a triangular mesh that encloses the foreground object called cage as an interface, and by manipulating the cage vertices, our approach enables the free-form deformation of the radiance field. The core of our approach is cage-based deformation which is commonly used in mesh deformation. We propose a novel formulation to extend it to the radiance field, which maps the position and the view direction of the sampling points from the deformed space to the canonical space, thus enabling the rendering of the deformed scene. The deformation results of the synthetic datasets and the real-world datasets demonstrate the effectiveness of our approach.\n  - [NeuMesh: Learning Disentangled Neural Mesh-based Implicit Field for Geometry and Texture Editing, ECCV2022(oral)](https://arxiv.org/abs/2207.11911) | [code]\n    > Very recently neural implicit rendering techniques have been rapidly evolved and shown great advantages in novel view synthesis and 3D scene reconstruction. However, existing neural rendering methods for editing purposes offer limited functionality, e.g., rigid transformation, or not applicable for fine-grained editing for general objects from daily lives. 
In this paper, we present a novel mesh-based representation by encoding the neural implicit field with disentangled geometry and texture codes on mesh vertices, which facilitates a set of editing functionalities, including mesh-guided geometry editing, designated texture editing with texture swapping, filling and painting operations. To this end, we develop several techniques including learnable sign indicators to magnify spatial distinguishability of mesh-based representation, distillation and fine-tuning mechanism to make a steady convergence, and the spatial-aware optimization strategy to realize precise texture editing. Extensive experiments and editing examples on both real and synthetic data demonstrate the superiority of our method on representation quality and editing ability. Code is available on the project webpage: this https URL.\n## Previous weeks\n  - [Neural Sparse Voxel Fields, NeurIPS2020](https://lingjie0206.github.io/papers/NSVF/) | [***``[code]``***](https://github.com/facebookresearch/NSVF)\n    > We introduce Neural Sparse Voxel Fields (NSVF), a new neural scene representation for fast and high-quality free-viewpoint rendering. NSVF defines a set of voxel-bounded implicit fields organized in a sparse voxel octree to model local properties in each cell. We progressively learn the underlying voxel structures with a differentiable ray-marching operation from only a set of posed RGB images. With the sparse voxel octree structure, rendering novel views can be accelerated by skipping the voxels containing no relevant scene content. Our method is over 10 times faster than the state-of-the-art (namely, NeRF (Mildenhall et al., 2020)) at inference time while achieving higher quality results. Furthermore, by utilizing an explicit sparse voxel representation, our method can easily be applied to scene editing and scene composition. 
We also demonstrate several challenging tasks, including multi-scene learning, free-viewpoint rendering of a moving human, and large-scale scene rendering.\n  - [CAMPARI: Camera-Aware Decomposed Generative Neural Radiance Fields](https://arxiv.org/pdf/2103.17269.pdf) | [code]\n    > Tremendous progress in deep generative models has led to photorealistic image synthesis. While achieving compelling results, most approaches operate in the two-dimensional image domain, ignoring the three-dimensional nature of our world. Several recent works therefore propose generative models which are 3D-aware, i.e., scenes are modeled in 3D and then rendered differentiably to the image plane. This leads to impressive 3D consistency, but incorporating such a bias comes at a price: the camera needs to be modeled as well. Current approaches assume fixed intrinsics and a predefined prior over camera pose ranges. As a result, parameter tuning is typically required for real-world data, and results degrade if the data distribution is not matched. Our key hypothesis is that learning a camera generator jointly with the image generator leads to a more principled approach to 3D-aware image synthesis. Further, we propose to decompose the scene into a background and foreground model, leading to more efficient and disentangled scene representations. While training from raw, unposed image collections, we learn a 3D- and camera-aware generative model which faithfully recovers not only the image but also the camera data distribution. At test time, our model generates images with explicit control over the camera as well as the shape and appearance of the scene.\n  - [NeRFactor: Neural Factorization of Shape and Reflectance Under an Unknown Illumination, TOG 2021 (Proc. 
SIGGRAPH Asia)](https://xiuming.info/projects/nerfactor/) | [code]\n    > We address the problem of recovering the shape and spatially-varying reflectance of an object from multi-view images (and their camera poses) of an object illuminated by one unknown lighting condition. This enables the rendering of novel views of the object under arbitrary environment lighting and editing of the object's material properties. The key to our approach, which we call Neural Radiance Factorization (NeRFactor), is to distill the volumetric geometry of a Neural Radiance Field (NeRF) [Mildenhall et al. 2020] representation of the object into a surface representation and then jointly refine the geometry while solving for the spatially-varying reflectance and environment lighting. Specifically, NeRFactor recovers 3D neural fields of surface normals, light visibility, albedo, and Bidirectional Reflectance Distribution Functions (BRDFs) without any supervision, using only a re-rendering loss, simple smoothness priors, and a data-driven BRDF prior learned from real-world BRDF measurements. By explicitly modeling light visibility, NeRFactor is able to separate shadows from albedo and synthesize realistic soft or hard shadows under arbitrary lighting conditions. NeRFactor is able to recover convincing 3D models for free-viewpoint relighting in this challenging and underconstrained capture setup for both synthetic and real scenes. Qualitative and quantitative experiments show that NeRFactor outperforms classic and deep learning-based state of the art across various tasks. Our videos, code, and data are available at people.csail.mit.edu/xiuming/projects/nerfactor/.\n  - [Object-Centric Neural Scene Rendering](https://shellguo.com/osf/) | [***``[code]``***](https://shellguo.com/osf/)\n    > We present a method for composing photorealistic scenes from captured images of objects. 
Our work builds upon neural radiance fields (NeRFs), which implicitly model the volumetric density and directionally-emitted radiance of a scene. While NeRFs synthesize realistic pictures, they only model static scenes and are closely tied to specific imaging conditions. This property makes NeRFs hard to generalize to new scenarios, including new lighting or new arrangements of objects. Instead of learning a scene radiance field as a NeRF does, we propose to learn object-centric neural scattering functions (OSFs), a representation that models per-object light transport implicitly using a lighting- and view-dependent neural network. This enables rendering scenes even when objects or lights move, without retraining. Combined with a volumetric path tracing procedure, our framework is capable of rendering both intra- and inter-object light transport effects including occlusions, specularities, shadows, and indirect illumination. We evaluate our approach on scene composition and show that it generalizes to novel illumination conditions, producing photorealistic, physically accurate renderings of multi-object scenes.\n  - [Unsupervised Discovery of Object Radiance Fields, ICLR2022](https://arxiv.org/abs/2107.07905) | [code]\n    > We study the problem of inferring an object-centric scene representation from a single image, aiming to derive a representation that explains the image formation process, captures the scene's 3D nature, and is learned without supervision. Most existing methods on scene decomposition lack one or more of these characteristics, due to the fundamental challenge in integrating the complex 3D-to-2D image formation process into powerful inference schemes like deep networks. In this paper, we propose unsupervised discovery of Object Radiance Fields (uORF), integrating recent progresses in neural 3D scene representations and rendering with deep inference networks for unsupervised 3D scene decomposition. 
Trained on multi-view RGB images without annotations, uORF learns to decompose complex scenes with diverse, textured background from a single image. We show that uORF performs well on unsupervised 3D scene segmentation, novel view synthesis, and scene editing on three datasets.\n  - [Learning Object-Compositional Neural Radiance Field for Editable Scene Rendering, ICCV2021](https://zju3dv.github.io/object_nerf/) | [***``[code]``***](https://github.com/zju3dv/object_nerf)\n    > Implicit neural rendering techniques have shown promising results for novel view synthesis. However, existing methods usually encode the entire scene as a whole, which is generally not aware of the object identity and limits the ability to the high-level editing tasks such as moving or adding furniture. In this paper, we present a novel neural scene rendering system, which learns an object-compositional neural radiance field and produces realistic rendering with editing capability for a clustered and real-world scene. Specifically, we design a novel two-pathway architecture, in which the scene branch encodes the scene geometry and appearance, and the object branch encodes each standalone object conditioned on learnable object activation codes. To survive the training in heavily cluttered scenes, we propose a scene-guided training strategy to solve the 3D space ambiguity in the occluded regions and learn sharp boundaries for each object. Extensive experiments demonstrate that our system not only achieves competitive performance for static scene novel-view synthesis, but also produces realistic rendering for object-level editing.\n  - [Editing Conditional Radiance Fields, ICCV2021](http://editnerf.csail.mit.edu/) | [***``[code]``***](https://github.com/stevliu/editnerf)\n    > A neural radiance field (NeRF) is a scene model supporting high-quality view synthesis, optimized per scene. 
In this paper, we explore enabling user editing of a category-level NeRF - also known as a conditional radiance field - trained on a shape category. Specifically, we introduce a method for propagating coarse 2D user scribbles to the 3D space, to modify the color or shape of a local region. First, we propose a conditional radiance field that incorporates new modular network components, including a shape branch that is shared across object instances. Observing multiple instances of the same category, our model learns underlying part semantics without any supervision, thereby allowing the propagation of coarse 2D user scribbles to the entire 3D region (e.g., chair seat). Next, we propose a hybrid network update strategy that targets specific network components, which balances efficiency and accuracy. During user interaction, we formulate an optimization problem that both satisfies the user's constraints and preserves the original object structure. We demonstrate our approach on various editing tasks over three shape datasets and show that it outperforms prior neural editing approaches. Finally, we edit the appearance and shape of a real photograph and show that the edit propagates to extrapolated novel views.\n  - [Editable Free-Viewpoint Video using a Layered Neural Representation, SIGGRAPH2021](https://jiakai-zhang.github.io/st-nerf/) | [***``[code]``***](https://jiakai-zhang.github.io/st-nerf/#code)\n    > Generating free-viewpoint videos is critical for immersive VR/AR experience but recent neural advances still lack the editing ability to manipulate the visual perception for large dynamic scenes. To fill this gap, in this paper we propose the first approach for editable photo-realistic free-viewpoint video generation for large-scale dynamic scenes using only sparse 16 cameras. 
The core of our approach is a new layered neural representation, where each dynamic entity including the environment itself is formulated into a space-time coherent neural layered radiance representation called ST-NeRF. Such layered representation supports full perception and realistic manipulation of the dynamic scene whilst still supporting a free viewing experience in a wide range. In our ST-NeRF, the dynamic entity/layer is represented as continuous functions, which achieves the disentanglement of location, deformation as well as the appearance of the dynamic entity in a continuous and self-supervised manner. We propose a scene parsing 4D label map tracking to disentangle the spatial information explicitly, and a continuous deform module to disentangle the temporal motion implicitly. An object-aware volume rendering scheme is further introduced for the re-assembling of all the neural layers. We adopt a novel layered loss and motion-aware ray sampling strategy to enable efficient training for a large dynamic scene with multiple performers. Our framework further enables a variety of editing functions, i.e., manipulating the scale and location, duplicating or retiming individual neural layers to create numerous visual effects while preserving high realism. Extensive experiments demonstrate the effectiveness of our approach to achieve high-quality, photo-realistic, and editable free-viewpoint video generation for dynamic scenes.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/fast.md",
    "content": "\nWeekly Classified Neural Radiance Fields - fast ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n  - [GARF:Geometry-Aware Generalized Neural Radiance Field](https://arxiv.org/abs/2212.02280) | [code]\n    > Neural Radiance Field (NeRF) has revolutionized free viewpoint rendering tasks and achieved impressive results. However, the efficiency and accuracy problems hinder its wide applications. To address these issues, we propose Geometry-Aware Generalized Neural Radiance Field (GARF) with a geometry-aware dynamic sampling (GADS) strategy to perform real-time novel view rendering and unsupervised depth estimation on unseen scenes without per-scene optimization. Distinct from most existing generalized NeRFs, our framework infers the unseen scenes on both pixel-scale and geometry-scale with only a few input images. More specifically, our method learns common attributes of novel-view synthesis by an encoder-decoder structure and a point-level learnable multi-view feature fusion module which helps avoid occlusion. 
To preserve scene characteristics in the generalized model, we introduce an unsupervised depth estimation module to derive the coarse geometry, narrow down the ray sampling interval to proximity space of the estimated surface and sample in expectation maximum position, constituting Geometry-Aware Dynamic Sampling strategy (GADS). Moreover, we introduce a Multi-level Semantic Consistency loss (MSC) to assist more informative representation learning. Extensive experiments on indoor and outdoor datasets show that comparing with state-of-the-art generalized NeRF methods, GARF reduces samples by more than 25\\%, while improving rendering quality and 3D geometry estimation.\n## Nov27 - Dec3, 2022\n  - [QFF: Quantized Fourier Features for Neural Field Representations](https://arxiv.org/abs/2212.00914) | [code]\n    > Multilayer perceptrons (MLPs) learn high frequencies slowly. Recent approaches encode features in spatial bins to improve speed of learning details, but at the cost of larger model size and loss of continuity. Instead, we propose to encode features in bins of Fourier features that are commonly used for positional encoding. We call these Quantized Fourier Features (QFF). As a naturally multiresolution and periodic representation, our experiments show that using QFF can result in smaller model size, faster training, and better quality outputs for several applications, including Neural Image Representations (NIR), Neural Radiance Field (NeRF) and Signed Distance Function (SDF) modeling. QFF are easy to code, fast to compute, and serve as a simple drop-in addition to many neural field representations.\n  - [Mixed Neural Voxels for Fast Multi-view Video Synthesis](https://arxiv.org/abs/2212.00190) | [code]\n    > Synthesizing high-fidelity videos from real-world multi-view input is challenging because of the complexities of real-world environments and highly dynamic motions. 
Previous works based on neural radiance fields have demonstrated high-quality reconstructions of dynamic scenes. However, training such models on real-world scenes is time-consuming, usually taking days or weeks. In this paper, we present a novel method named MixVoxels to better represent the dynamic scenes with fast training speed and competitive rendering qualities. The proposed MixVoxels represents the 4D dynamic scenes as a mixture of static and dynamic voxels and processes them with different networks. In this way, the computation of the required modalities for static voxels can be processed by a lightweight model, which essentially reduces the amount of computation, especially for many daily dynamic scenes dominated by the static background. To separate the two kinds of voxels, we propose a novel variation field to estimate the temporal variance of each voxel. For the dynamic voxels, we design an inner-product time query method to efficiently query multiple time steps, which is essential to recover the high-dynamic motions. As a result, with 15 minutes of training for dynamic scenes with inputs of 300-frame videos, MixVoxels achieves better PSNR than previous methods. Codes and trained models are available at this https URL\n  - [Differentiable Rendering Using RGBXY Derivatives and Optimal Transport, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555479) | [code]\n    > Traditional differentiable rendering approaches are usually hard to converge in inverse rendering optimizations, especially when initial and target object locations are not so close. Inspired by Lagrangian fluid simulation, we present a novel differentiable rendering method to address this problem. We associate each screen-space pixel with the visible 3D geometric point covered by the center of the pixel and compute derivatives on geometric points rather than on pixels. We refer to the associated geometric points as point proxies of pixels. 
For each point proxy, we compute its 5D RGBXY derivatives which measure how its 3D RGB color and 2D projected screen-space position change with respect to scene parameters. Furthermore, in order to capture global and long-range object motions, we utilize optimal transport based pixel matching to design a more sophisticated loss function. We have conducted experiments to evaluate the effectiveness of our proposed method on various inverse rendering applications and have demonstrated superior convergence behavior compared to state-of-the-art baselines.\n  - [QuadStream: A Quad-Based Scene Streaming Architecture for Novel Viewpoint Reconstruction, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555524) | [code]\n    > Streaming rendered 3D content over a network to a thin client device, such as a phone or a VR/AR headset, brings high-fidelity graphics to platforms where it would not normally be possible due to thermal, power, or cost constraints. Streamed 3D content must be transmitted with a representation that is both robust to latency and potential network dropouts. Transmitting a video stream and reprojecting to correct for changing viewpoints fails in the presence of disocclusion events; streaming scene geometry and performing high-quality rendering on the client is not possible on limited-power mobile GPUs. To balance the competing goals of disocclusion robustness and minimal client workload, we introduce QuadStream, a new streaming content representation that reduces motion-to-photon latency by allowing clients to efficiently render novel views without artifacts caused by disocclusion events. Motivated by traditional macroblock approaches to video codec design, we decompose the scene seen from positions in a view cell into a series of quad proxies, or view-aligned quads from multiple views. 
By operating on a rasterized G-Buffer, our approach is independent of the representation used for the scene itself; the resulting QuadStream is an approximate geometric representation of the scene that can be reconstructed by a thin client to render both the current view and nearby adjacent views. Our technical contributions are an efficient parallel quad generation, merging, and packing strategy for proxy views covering potential client movement in a scene; a packing and encoding strategy that allows masked quads with depth information to be transmitted as a frame-coherent stream; and an efficient rendering approach for rendering our QuadStream representation into entirely novel views on thin clients. We show that our approach achieves superior quality compared both to video data streaming methods, and to geometry-based streaming.\n  - [Lightweight Neural Basis Functions for All-Frequency Shading, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555386) | [code]\n    > Basis functions provide both the abilities for compact representation and the properties for efficient computation. Therefore, they are pervasively used in rendering to perform all-frequency shading. However, common basis functions, including spherical harmonics (SH), wavelets, and spherical Gaussians (SG) all have their own limitations, such as low-frequency for SH, not rotationally invariant for wavelets, and no multiple product support for SG. In this paper, we present neural basis functions, an implicit and data-driven set of basis functions that circumvents the limitations with all desired properties. We first introduce a representation neural network that takes any general 2D spherical function (e.g. environment lighting, BRDF, and visibility) as input and projects it onto the latent space as coefficients of our neural basis functions. 
Then, we design several lightweight neural networks that perform different types of computation, giving our basis functions different computational properties such as double/triple product integrals and rotations. We demonstrate the practicality of our neural basis functions by integrating them into all-frequency shading applications, showing that our method not only achieves a compression rate of 10 × -40 × and better performance than wavelets at equal quality, but also renders all-frequency lighting effects in real-time without the aforementioned limitations from classic basis functions.\n## Nov20 - Nov26, 2022\n  - [ScanNeRF: a Scalable Benchmark for Neural Radiance Fields, WACV2023](https://arxiv.org/abs/2211.13762) | [code]\n    > In this paper, we propose the first-ever real benchmark thought for evaluating Neural Radiance Fields (NeRFs) and, in general, Neural Rendering (NR) frameworks. We design and implement an effective pipeline for scanning real objects in quantity and effortlessly. Our scan station is built with less than 500$ hardware budget and can collect roughly 4000 images of a scanned object in just 5 minutes. Such a platform is used to build ScanNeRF, a dataset characterized by several train/val/test splits aimed at benchmarking the performance of modern NeRF methods under different conditions. Accordingly, we evaluate three cutting-edge NeRF variants on it to highlight their strengths and weaknesses. The dataset is available on our project page, together with an online benchmark to foster the development of better and better NeRFs.\n  - [Immersive Neural Graphics Primitives](https://arxiv.org/abs/2211.13494) | [code]\n    > Neural radiance field (NeRF), in particular its extension by instant neural graphics primitives, is a novel rendering method for view synthesis that uses real-world images to build photo-realistic immersive virtual scenes. Despite its potential, research on the combination of NeRF and virtual reality (VR) remains sparse. 
Currently, there is no integration into typical VR systems available, and the performance and suitability of NeRF implementations for VR have not been evaluated, for instance, for different scene complexities or screen resolutions. In this paper, we present and evaluate a NeRF-based framework that is capable of rendering scenes in immersive VR allowing users to freely move their heads to explore complex real-world scenes. We evaluate our framework by benchmarking three different NeRF scenes concerning their rendering performance at different scene complexities and resolutions. Utilizing super-resolution, our approach can yield a frame rate of 30 frames per second with a resolution of 1280x720 pixels per eye. We discuss potential applications of our framework and provide an open source implementation online.\n  - [Shape, Pose, and Appearance from a Single Image via Bootstrapped Radiance Field Inversion](https://arxiv.org/abs/2211.11674) | [code]\n    > Neural Radiance Fields (NeRF) coupled with GANs represent a promising direction in the area of 3D reconstruction from a single view, owing to their ability to efficiently model arbitrary topologies. Recent work in this area, however, has mostly focused on synthetic datasets where exact ground-truth poses are known, and has overlooked pose estimation, which is important for certain downstream applications such as augmented reality (AR) and robotics. We introduce a principled end-to-end reconstruction framework for natural images, where accurate ground-truth poses are not available. Our approach recovers an SDF-parameterized 3D shape, pose, and appearance from a single image of an object, without exploiting multiple views during training. More specifically, we leverage an unconditional 3D-aware generator, to which we apply a hybrid inversion scheme where a model produces a first guess of the solution which is then refined via optimization. 
Our framework can de-render an image in as few as 10 steps, enabling its use in practical scenarios. We demonstrate state-of-the-art results on a variety of real and synthetic benchmarks.\n## Nov13 - Nov19, 2022\n  - [DINER: Disorder-Invariant Implicit Neural Representation](https://arxiv.org/abs/2211.07871) | [code]\n    > Implicit neural representation (INR) characterizes the attributes of a signal as a function of corresponding coordinates which emerges as a sharp weapon for solving inverse problems. However, the capacity of INR is limited by the spectral bias in the network training. In this paper, we find that such a frequency-related problem could be largely solved by re-arranging the coordinates of the input signal, for which we propose the disorder-invariant implicit neural representation (DINER) by augmenting a hash-table to a traditional INR backbone. Given discrete signals sharing the same histogram of attributes and different arrangement orders, the hash-table could project the coordinates into the same distribution for which the mapped signal can be better modeled using the subsequent INR network, leading to significantly alleviated spectral bias. Experiments not only reveal the generalization of the DINER for different INR backbones (MLP vs. SIREN) and various tasks (image/video representation, phase retrieval, and refractive index recovery) but also show the superiority over the state-of-the-art algorithms both in quality and speed.\n## Nov6 - Nov12, 2022\n  - [Temporal Coherence-Based Distributed Ray Tracing of Massive Scenes, ToG2022](https://ieeexplore.ieee.org/abstract/document/9940545) | [code]\n    > Distributed ray tracing algorithms are widely used when rendering massive scenes, where data utilization and load balancing are the keys to improving performance. One essential observation is that rays are temporally coherent, which indicates that temporal information can be used to improve computational efficiency. 
In this paper, we use temporal coherence to optimize the performance of distributed ray tracing. First, we propose a temporal coherence-based scheduling algorithm to guide the task/data assignment and scheduling. Then, we propose a virtual portal structure to predict the radiance of rays based on the previous frame, and send the rays with low radiance to a precomputed simplified model for further tracing, which can dramatically reduce the traversal complexity and the overhead of network data transmission. The approach was validated on scenes of sizes up to 355 GB. Our algorithm can achieve a speedup of up to 81% compared to previous algorithms, with a very small mean squared error.\n  - [QRF: Implicit Neural Representations with Quantum Radiance Fields](https://arxiv.org/abs/2211.03418) | [code]\n    > Photorealistic rendering of real-world scenes is a tremendous challenge with a wide range of applications, including mixed reality (MR), and virtual reality (VR). Neural networks, which have long been investigated in the context of solving differential equations, have previously been introduced as implicit representations for photorealistic rendering. However, realistic rendering using classic computing is challenging because it requires time-consuming optical ray marching, and suffer computational bottlenecks due to the curse of dimensionality. In this paper, we propose Quantum Radiance Fields (QRF), which integrate the quantum circuit, quantum activation function, and quantum volume rendering for implicit scene representation. 
The results indicate that QRF not only exploits the advantage of quantum computing, such as high speed, fast convergence, and high parallelism, but also ensure high quality of volume rendering.\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n  - [NeX360: Real-time All-around View Synthesis with Neural Basis Expansion, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9931981) | [code]\n    > We present NeX, a new approach to novel view synthesis based on enhancements of multiplane images (MPI) that can reproduce view-dependent effects in real time. Unlike traditional MPI, our technique parameterizes each pixel as a linear combination of spherical basis functions learned from a neural network to model view-dependent effects and uses a hybrid implicit-explicit modeling strategy to improve fine detail. Moreover, we also present an extension to NeX, which leverages knowledge distillation to train multiple MPIs for unbounded 360 ∘ scenes. Our method is evaluated on several benchmark datasets: NeRF-Synthetic dataset, Light Field dataset, Real Forward-Facing dataset, Space dataset, as well as Shiny , our new dataset that contains significantly more challenging view-dependent effects, such as the rainbow reflections on the CD. Our method outperforms other real-time rendering approaches on PSNR, SSIM, and LPIPS and can render unbounded 360 ∘ scenes in real time.\n  - [NeRFPlayer: A Streamable Dynamic Scene Representation with Decomposed Neural Radiance Fields](https://arxiv.org/abs/2210.15947) | [code]\n    > Visually exploring in a real-world 4D spatiotemporal space freely in VR has been a long-term quest. The task is especially appealing when only a few or even single RGB cameras are used for capturing the dynamic scene. To this end, we present an efficient framework capable of fast reconstruction, compact modeling, and streamable rendering. First, we propose to decompose the 4D spatiotemporal space according to temporal characteristics. 
Points in the 4D space are associated with probabilities of belonging to three categories: static, deforming, and new areas. Each area is represented and regularized by a separate neural field. Second, we propose a hybrid representations based feature streaming scheme for efficiently modeling the neural fields. Our approach, coined NeRFPlayer, is evaluated on dynamic scenes captured by single hand-held cameras and multi-camera arrays, achieving comparable or superior rendering performance in terms of quality and speed comparable to recent state-of-the-art methods, achieving reconstruction in 10 seconds per frame and real-time rendering.\n  - [Streaming Radiance Fields for 3D Video Synthesis, NeurIPS2022](https://arxiv.org/abs/2210.14831) | [code]\n    > We present an explicit-grid based method for efficiently reconstructing streaming radiance fields for novel view synthesis of real world dynamic scenes. Instead of training a single model that combines all the frames, we formulate the dynamic modeling problem with an incremental learning paradigm in which per-frame model difference is trained to complement the adaption of a base model on the current frame. By exploiting the simple yet effective tuning strategy with narrow bands, the proposed method realizes a feasible framework for handling video sequences on-the-fly with high training efficiency. The storage overhead induced by using explicit grid representations can be significantly reduced through the use of model difference based compression. We also introduce an efficient strategy to further accelerate model optimization for each frame. Experiments on challenging video sequences demonstrate that our approach is capable of achieving a training speed of 15 seconds per-frame with competitive rendering quality, which attains 1000× speedup over the state-of-the-art implicit methods. 
Code is available at this https URL.\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n  - [Lightweight Stepless Super-Resolution of Remote Sensing Images via Saliency-Aware Dynamic Routing Strategy](https://arxiv.org/abs/2210.07598) | [***``[code]``***](https://github.com/hanlinwu/SalDRN)\n    > Deep learning-based algorithms have greatly improved the performance of remote sensing image (RSI) super-resolution (SR). However, increasing network depth and parameters cause a huge burden of computing and storage. Directly reducing the depth or width of existing models results in a large performance drop. We observe that the SR difficulty of different regions in an RSI varies greatly, and existing methods use the same deep network to process all regions in an image, resulting in a waste of computing resources. In addition, existing SR methods generally predefine integer scale factors and cannot perform stepless SR, i.e., a single model can deal with any potential scale factor. Retraining the model on each scale factor wastes considerable computing resources and model storage space. To address the above problems, we propose a saliency-aware dynamic routing network (SalDRN) for lightweight and stepless SR of RSIs. First, we introduce visual saliency as an indicator of region-level SR difficulty and integrate a lightweight saliency detector into the SalDRN to capture pixel-level visual characteristics. Then, we devise a saliency-aware dynamic routing strategy that employs path selection switches to adaptively select feature extraction paths of appropriate depth according to the SR difficulty of sub-image patches. Finally, we propose a novel lightweight stepless upsampling module whose core is an implicit feature function for realizing mapping from low-resolution feature space to high-resolution feature space. Comprehensive experiments verify that the SalDRN can achieve a good trade-off between performance and complexity. 
The code is available at \\url{this https URL}.\n  - [Scalable Neural Video Representations with Learnable Positional Features, NeurIPS2022](https://arxiv.org/abs/2210.06823) | [***``[code]``***](https://github.com/subin-kim-cv/NVP)\n    > Succinct representation of complex signals using coordinate-based neural representations (CNRs) has seen great progress, and several recent efforts focus on extending them for handling videos. Here, the main challenge is how to (a) alleviate a compute-inefficiency in training CNRs to (b) achieve high-quality video encoding while (c) maintaining the parameter-efficiency. To meet all requirements (a), (b), and (c) simultaneously, we propose neural video representations with learnable positional features (NVP), a novel CNR by introducing \"learnable positional features\" that effectively amortize a video as latent codes. Specifically, we first present a CNR architecture based on designing 2D latent keyframes to learn the common video contents across each spatio-temporal axis, which dramatically improves all of those three requirements. Then, we propose to utilize existing powerful image and video codecs as a compute-/memory-efficient compression procedure of latent codes. We demonstrate the superiority of NVP on the popular UVG benchmark; compared with prior arts, NVP not only trains 2 times faster (less than 5 minutes) but also exceeds their encoding quality as 34.07→34.57 (measured with the PSNR metric), even using >8 times fewer parameters. We also show intriguing properties of NVP, e.g., video inpainting, video frame interpolation, etc.\n  - [CUF: Continuous Upsampling Filters](https://arxiv.org/abs/2210.06965) | [code]\n    > Neural fields have rapidly been adopted for representing 3D signals, but their application to more classical 2D image-processing has been relatively limited. In this paper, we consider one of the most important operations in image processing: upsampling. 
In deep learning, learnable upsampling layers have extensively been used for single image super-resolution. We propose to parameterize upsampling kernels as neural fields. This parameterization leads to a compact architecture that obtains a 40-fold reduction in the number of parameters when compared with competing arbitrary-scale super-resolution architectures. When upsampling images of size 256x256 we show that our architecture is 2x-10x more efficient than competing arbitrary-scale super-resolution architectures, and more efficient than sub-pixel convolutions when instantiated to a single-scale model. In the general setting, these gains grow polynomially with the square of the target scale. We validate our method on standard benchmarks showing such efficiency gains can be achieved without sacrifices in super-resolution performance.\n  - [NerfAcc: A General NeRF Acceleration Toolbox](https://arxiv.org/abs/2210.04847) | [***``[code]``***](https://github.com/KAIR-BAIR/nerfacc)\n    > We propose NerfAcc, a toolbox for efficient volumetric rendering of radiance fields. We build on the techniques proposed in Instant-NGP, and extend these techniques to not only support bounded static scenes, but also for dynamic scenes and unbounded scenes. NerfAcc comes with a user-friendly Python API, and is ready for plug-and-play acceleration of most NeRFs. Various examples are provided to show how to use this toolbox. Code can be found here: this https URL.\n## Oct2 - Oct8, 2022\n  - [Learning Perception-Aware Agile Flight in Cluttered Environments](https://arxiv.org/abs/2210.01841) | [code]\n    > Recently, neural control policies have outperformed existing model-based planning-and-control methods for autonomously navigating quadrotors through cluttered environments in minimum time. However, they are not perception aware, a crucial requirement in vision-based navigation due to the camera's limited field of view and the underactuated nature of a quadrotor. 
We propose a method to learn neural network policies that achieve perception-aware, minimum-time flight in cluttered environments. Our method combines imitation learning and reinforcement learning (RL) by leveraging a privileged learning-by-cheating framework. Using RL, we first train a perception-aware teacher policy with full-state information to fly in minimum time through cluttered environments. Then, we use imitation learning to distill its knowledge into a vision-based student policy that only perceives the environment via a camera. Our approach tightly couples perception and control, showing a significant advantage in computation speed (10x faster) and success rate. We demonstrate the closed-loop control performance using a physical quadrotor and hardware-in-the-loop simulation at speeds up to 50km/h.\n## Sep25 - Oct1, 2022\n  - [Understanding Pure CLIP Guidance for Voxel Grid NeRF Models](https://arxiv.org/abs/2209.15172) | [code]\n    > We explore the task of text to 3D object generation using CLIP. Specifically, we use CLIP for guidance without access to any datasets, a setting we refer to as pure CLIP guidance. While prior work has adopted this setting, there is no systematic study of mechanics for preventing adversarial generations within CLIP. We illustrate how different image-based augmentations prevent the adversarial generation problem, and how the generated results are impacted. We test different CLIP model architectures and show that ensembling different models for guidance can prevent adversarial generations within bigger models and generate sharper results. Furthermore, we implement an implicit voxel grid model to show how neural networks provide an additional layer of regularization, resulting in better geometrical structure and coherency of generated objects. 
Compared to prior work, we achieve more coherent results with higher memory efficiency and faster training speeds.\n## Sep18 - Sep24, 2022\n  - [Fast Disparity Estimation from a Single Compressed Light Field Measurement](https://arxiv.org/abs/2209.11342) | [code]\n    > The abundant spatial and angular information from light fields has allowed the development of multiple disparity estimation approaches. However, the acquisition of light fields requires high storage and processing cost, limiting the use of this technology in practical applications. To overcome these drawbacks, the compressive sensing (CS) theory has allowed the development of optical architectures to acquire a single coded light field measurement. This measurement is decoded using an optimization algorithm or deep neural network that requires high computational costs. The traditional approach for disparity estimation from compressed light fields requires first recovering the entire light field and then a post-processing step, thus requiring long times. In contrast, this work proposes a fast disparity estimation from a single compressed measurement by omitting the recovery step required in traditional approaches. Specifically, we propose to jointly optimize an optical architecture for acquiring a single coded light field snapshot and a convolutional neural network (CNN) for estimating the disparity maps. Experimentally, the proposed method estimates disparity maps comparable with those obtained from light fields reconstructed using deep learning approaches. 
Furthermore, the proposed method is 20 times faster in training and inference than the best method that estimates the disparity from reconstructed light fields.\n  - [wildNeRF: Complete view synthesis of in-the-wild dynamic scenes captured using sparse monocular data](https://arxiv.org/abs/2209.10399) | [code]\n    > We present a novel neural radiance model that is trainable in a self-supervised manner for novel-view synthesis of dynamic unstructured scenes. Our end-to-end trainable algorithm learns highly complex, real-world static scenes within seconds and dynamic scenes with both rigid and non-rigid motion within minutes. By differentiating between static and motion-centric pixels, we create high-quality representations from a sparse set of images. We perform extensive qualitative and quantitative evaluation on existing benchmarks and set the state-of-the-art on performance measures on the challenging NVIDIA Dynamic Scenes Dataset. Additionally, we evaluate our model performance on challenging real-world datasets such as Cholec80 and SurgicalActions160.\n## Sep11 - Sep17, 2022\n## Sep4 - Sep10, 2022\n## Aug28 - Sep3, 2022\n  - [FoV-NeRF: Foveated Neural Radiance Fields for Virtual Reality, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9872532) | [code]\n    > Virtual Reality (VR) is becoming ubiquitous with the rise of consumer displays and commercial VR platforms. Such displays require low latency and high quality rendering of synthetic imagery with reduced compute overheads. Recent advances in neural rendering showed promise of unlocking new possibilities in 3D computer graphics via image-based representations of virtual or physical environments. Specifically, the neural radiance fields (NeRF) demonstrated that photo-realistic quality and continuous view changes of 3D scenes can be achieved without loss of view-dependent effects. 
While NeRF can significantly benefit rendering for VR applications, it faces unique challenges posed by high field-of-view, high resolution, and stereoscopic/egocentric viewing, typically causing low quality and high latency of the rendered images. In VR, this not only harms the interaction experience but may also cause sickness. To tackle these problems toward six-degrees-of-freedom, egocentric, and stereo NeRF in VR, we present the first gaze-contingent 3D neural representation and view synthesis method. We incorporate the human psychophysics of visual- and stereo-acuity into an egocentric neural representation of 3D scenery. We then jointly optimize the latency/performance and visual quality while mutually bridging human perception and neural scene synthesis to achieve perceptually high-quality immersive interaction. We conducted both objective analysis and subjective studies to evaluate the effectiveness of our approach. We find that our method significantly reduces latency (up to 99% time reduction compared with NeRF) without loss of high-fidelity rendering (perceptually identical to full-resolution ground truth). The presented approach may serve as the first step toward future VR/AR systems that capture, teleport, and visualize remote environments in real-time.\n  - [CLONeR: Camera-Lidar Fusion for Occupancy Grid-aided Neural Representations](https://arxiv.org/abs/2209.01194) | [code]\n    > This paper proposes CLONeR, which significantly improves upon NeRF by allowing it to model large outdoor driving scenes that are observed from sparse input sensor views. This is achieved by decoupling occupancy and color learning within the NeRF framework into separate Multi-Layer Perceptrons (MLPs) trained using LiDAR and camera data, respectively. 
In addition, this paper proposes a novel method to build differentiable 3D Occupancy Grid Maps (OGM) alongside the NeRF model, and leverage this occupancy grid for improved sampling of points along a ray for volumetric rendering in metric space.\n## Aug21 - Aug27, 2022\n  - [Voxurf: Voxel-based Efficient and Accurate Neural Surface Reconstruction](https://arxiv.org/abs/2208.12697) | [code]\n    > Neural surface reconstruction aims to reconstruct accurate 3D surfaces based on multi-view images. Previous methods based on neural volume rendering mostly train a fully implicit model, and they require hours of training for a single scene. Recent efforts explore the explicit volumetric representation, which substantially accelerates the optimization process by memorizing significant information in learnable voxel grids. However, these voxel-based methods often struggle in reconstructing fine-grained geometry. Through empirical studies, we found that high-quality surface reconstruction hinges on two key factors: the capability of constructing a coherent shape and the precise modeling of color-geometry dependency. In particular, the latter is the key to the accurate reconstruction of fine details. Inspired by these findings, we develop Voxurf, a voxel-based approach for efficient and accurate neural surface reconstruction, which consists of two stages: 1) leverage a learnable feature grid to construct the color field and obtain a coherent coarse shape, and 2) refine detailed geometry with a dual color network that captures precise color-geometry dependency. We further introduce a hierarchical geometry feature to enable information sharing across voxels. Our experiments show that Voxurf achieves high efficiency and high quality at the same time. 
On the DTU benchmark, Voxurf achieves higher reconstruction quality compared to state-of-the-art methods, with 20x speedup in training.\n  - [E-NeRF: Neural Radiance Fields from a Moving Event Camera](https://arxiv.org/abs/2208.11300) | [code]\n    > Estimating neural radiance fields (NeRFs) from ideal images has been extensively studied in the computer vision community. Most approaches assume optimal illumination and slow camera motion. These assumptions are often violated in robotic applications, where images contain motion blur and the scene may not have suitable illumination. This can cause significant problems for downstream tasks such as navigation, inspection or visualization of the scene. To alleviate these problems we present E-NeRF, the first method which estimates a volumetric scene representation in the form of a NeRF from a fast-moving event camera. Our method can recover NeRFs during very fast motion and in high dynamic range conditions, where frame-based approaches fail. We show that rendering high-quality frames is possible by only providing an event stream as input. Furthermore, by combining events and frames, we can estimate NeRFs of higher quality than state-of-the-art approaches under severe motion blur. We also show that combining events and frames can overcome failure cases of NeRF estimation in scenarios where only few input views are available, without requiring additional regularization.\n## Aug14 - Aug20, 2022\n  - [PDRF: Progressively Deblurring Radiance Field for Fast and Robust Scene Reconstruction from Blurry Images](https://arxiv.org/abs/2208.08049) | [code]\n    > We present Progressively Deblurring Radiance Field (PDRF), a novel approach to efficiently reconstruct high quality radiance fields from blurry images. 
While current State-of-The-Art (SoTA) scene reconstruction methods achieve photo-realistic rendering results from clean source views, their performances suffer when the source views are affected by blur, which is commonly observed for images in the wild. Previous deblurring methods either do not account for 3D geometry, or are computationally intense. To address these issues, PDRF, a progressively deblurring scheme in radiance field modeling, accurately models blur by incorporating 3D scene context. PDRF further uses an efficient importance sampling scheme, which results in fast scene optimization. Specifically, PDRF proposes a Coarse Ray Renderer to quickly estimate voxel density and feature; a Fine Voxel Renderer is then used to achieve high quality ray tracing. We perform extensive experiments and show that PDRF is 15X faster than previous SoTA while achieving better performance on both synthetic and real scenes.\n  - [HDR-Plenoxels: Self-Calibrating High Dynamic Range Radiance Fields, ECCV2022](https://arxiv.org/abs/2208.06787) | [code]\n    > We propose high dynamic range radiance (HDR) fields, HDR-Plenoxels, that learn a plenoptic function of 3D HDR radiance fields, geometry information, and varying camera settings inherent in 2D low dynamic range (LDR) images. Our voxel-based volume rendering pipeline reconstructs HDR radiance fields with only multi-view LDR images taken from varying camera settings in an end-to-end manner and has a fast convergence speed. To deal with various cameras in real-world scenarios, we introduce a tone mapping module that models the digital in-camera imaging pipeline (ISP) and disentangles radiometric settings. Our tone mapping module allows us to render by controlling the radiometric settings of each novel view. Finally, we build a multi-view dataset with varying camera conditions, which fits our problem setting. 
Our experiments show that HDR-Plenoxels can express detail and high-quality HDR novel views from only LDR images with various cameras.\n## Aug7 - Aug13, 2022\n  - [OmniVoxel: A Fast and Precise Reconstruction Method of Omnidirectional Neural Radiance Field, GCCE 2022](https://arxiv.org/abs/2208.06335) | [code]\n    > This paper proposes a method to reconstruct the neural radiance field with equirectangular omnidirectional images. Implicit neural scene representation with a radiance field can reconstruct the 3D shape of a scene continuously within a limited spatial area. However, training a fully implicit representation on commercial PC hardware requires a lot of time and computing resources (15 ∼ 20 hours per scene). Therefore, we propose a method to accelerate this process significantly (20 ∼ 40 minutes per scene). Instead of using a fully implicit representation of rays for radiance field reconstruction, we adopt feature voxels that contain density and color features in tensors. Considering omnidirectional equirectangular input and the camera layout, we use spherical voxelization for representation instead of cubic representation. Our voxelization method could balance the reconstruction quality of the inner scene and outer scene. In addition, we adopt the axis-aligned positional encoding method on the color features to increase the total image quality. Our method achieves satisfying empirical performance on synthetic datasets with random camera poses. Moreover, we test our method with real scenes which contain complex geometries and also achieve state-of-the-art performance. Our code and complete dataset will be released at the same time as the paper publication.\n  - [HRF-Net: Holistic Radiance Fields from Sparse Inputs](https://arxiv.org/abs/2208.04717) | [code]\n    > We present HRF-Net, a novel view synthesis method based on holistic radiance fields that renders novel views using a set of sparse inputs. 
Recent generalizing view synthesis methods also leverage the radiance fields but the rendering speed is not real-time. There are existing methods that can train and render novel views efficiently but they can not generalize to unseen scenes. Our approach addresses the problem of real-time rendering for generalizing view synthesis and consists of two main stages: a holistic radiance fields predictor and a convolutional-based neural renderer. This architecture infers not only consistent scene geometry based on the implicit neural fields but also renders new views efficiently using a single GPU. We first train HRF-Net on multiple 3D scenes of the DTU dataset and the network can produce plausible novel views on unseen real and synthetics data using only photometric losses. Moreover, our method can leverage a denser set of reference images of a single scene to produce accurate novel views without relying on additional explicit representations and still maintains the high-speed rendering of the pre-trained model. Experimental results show that HRF-Net outperforms state-of-the-art generalizable neural rendering methods on various synthetic and real datasets.\n## Jul31 - Aug6, 2022\n  - [End-to-end learning of 3D phase-only holograms for holographic display](https://www.nature.com/articles/s41377-022-00894-6) | [code]\n    > Computer-generated holography (CGH) provides volumetric control of coherent wavefront and is fundamental to applications such as volumetric 3D displays, lithography, neural photostimulation, and optical/acoustic trapping. Recently, deep learning-based methods emerged as promising computational paradigms for CGH synthesis that overcome the quality-runtime tradeoff in conventional simulation/optimization-based methods. Yet, the quality of the predicted hologram is intrinsically bounded by the dataset’s quality. 
Here we introduce a new hologram dataset, MIT-CGH-4K-V2, that uses a layered depth image as a data-efficient volumetric 3D input and a two-stage supervised+unsupervised training protocol for direct synthesis of high-quality 3D phase-only holograms. The proposed system also corrects vision aberration, allowing customization for end-users. We experimentally show photorealistic 3D holographic projections and discuss relevant spatial light modulator calibration procedures. Our method runs in real-time on a consumer GPU and 5 FPS on an iPhone 13 Pro, promising drastically enhanced performance for the applications above.\n## Jul24 - Jul30, 2022\n  - [MobileNeRF: Exploiting the Polygon Rasterization Pipeline for Efficient Neural Field Rendering on Mobile Architectures](https://arxiv.org/abs/2208.00277) | [***``[code]``***](https://github.com/google-research/jax3d/tree/main/jax3d/projects/mobilenerf)\n    > Neural Radiance Fields (NeRFs) have demonstrated amazing ability to synthesize images of 3D scenes from novel views. However, they rely upon specialized volumetric rendering algorithms based on ray marching that are mismatched to the capabilities of widely deployed graphics hardware. This paper introduces a new NeRF representation based on textured polygons that can synthesize novel images efficiently with standard rendering pipelines. The NeRF is represented as a set of polygons with textures representing binary opacities and feature vectors. Traditional rendering of the polygons with a z-buffer yields an image with features at every pixel, which are interpreted by a small, view-dependent MLP running in a fragment shader to produce a final pixel color. 
This approach enables NeRFs to be rendered with the traditional polygon rasterization pipeline, which provides massive pixel-level parallelism, achieving interactive frame rates on a wide range of compute platforms, including mobile phones.\n  - [End-to-end View Synthesis via NeRF Attention](https://arxiv.org/abs/2207.14741) | [code]\n    > In this paper, we present a simple seq2seq formulation for view synthesis where we take a set of ray points as input and output colors corresponding to the rays. Directly applying a standard transformer on this seq2seq formulation has two limitations. First, the standard attention cannot successfully fit the volumetric rendering procedure, and therefore high-frequency components are missing in the synthesized views. Second, applying global attention to all rays and pixels is extremely inefficient. Inspired by the neural radiance field (NeRF), we propose the NeRF attention (NeRFA) to address the above problems. On the one hand, NeRFA considers the volumetric rendering equation as a soft feature modulation procedure. In this way, the feature modulation enhances the transformers with the NeRF-like inductive bias. On the other hand, NeRFA performs multi-stage attention to reduce the computational overhead. Furthermore, the NeRFA model adopts the ray and pixel transformers to learn the interactions between rays and pixels. NeRFA demonstrates superior performance over NeRF and NerFormer on four datasets: DeepVoxels, Blender, LLFF, and CO3D. Besides, NeRFA establishes a new state-of-the-art under two settings: the single-scene view synthesis and the category-centric novel view synthesis. The code will be made publicly available.\n  - [Going Off-Grid: Continuous Implicit Neural Representations for 3D Vascular Modeling, MICCAI STACOM 2022](https://arxiv.org/abs/2207.14663) | [code]\n    > Personalised 3D vascular models are valuable for diagnosis, prognosis and treatment planning in patients with cardiovascular disease. 
Traditionally, such models have been constructed with explicit representations such as meshes and voxel masks, or implicit representations such as radial basis functions or atomic (tubular) shapes. Here, we propose to represent surfaces by the zero level set of their signed distance function (SDF) in a differentiable implicit neural representation (INR). This allows us to model complex vascular structures with a representation that is implicit, continuous, light-weight, and easy to integrate with deep learning algorithms. We here demonstrate the potential of this approach with three practical examples. First, we obtain an accurate and watertight surface for an abdominal aortic aneurysm (AAA) from CT images and show robust fitting from as little as 200 points on the surface. Second, we simultaneously fit nested vessel walls in a single INR without intersections. Third, we show how 3D models of individual arteries can be smoothly blended into a single watertight surface. Our results show that INRs are a flexible representation with potential for minimally interactive annotation and manipulation of complex vascular structures.\n## Previous weeks\n  - [Plenoxels: Radiance Fields without Neural Networks, CVPR2022(oral)](https://arxiv.org/abs/2112.05131) | [***``[code]``***](https://alexyu.net/plenoxels)\n    > We introduce Plenoxels (plenoptic voxels), a system for photorealistic view synthesis. Plenoxels represent a scene as a sparse 3D grid with spherical harmonics. This representation can be optimized from calibrated images via gradient methods and regularization without any neural components. 
On standard benchmark tasks, Plenoxels are optimized two orders of magnitude faster than Neural Radiance Fields with no loss in visual quality.\n  - [Neural Sparse Voxel Fields, NeurIPS2020](https://lingjie0206.github.io/papers/NSVF/) | [***``[code]``***](https://github.com/facebookresearch/NSVF)\n    > We introduce Neural Sparse Voxel Fields (NSVF), a new neural scene representation for fast and high-quality free-viewpoint rendering. NSVF defines a set of voxel-bounded implicit fields organized in a sparse voxel octree to model local properties in each cell. We progressively learn the underlying voxel structures with a differentiable ray-marching operation from only a set of posed RGB images. With the sparse voxel octree structure, rendering novel views can be accelerated by skipping the voxels containing no relevant scene content. Our method is over 10 times faster than the state-of-the-art (namely, NeRF (Mildenhall et al., 2020)) at inference time while achieving higher quality results. Furthermore, by utilizing an explicit sparse voxel representation, our method can easily be applied to scene editing and scene composition. We also demonstrate several challenging tasks, including multi-scene learning, free-viewpoint rendering of a moving human, and large-scale scene rendering.\n  - [AutoInt: Automatic Integration for Fast Neural Volume Rendering, CVPR2021](http://www.computationalimaging.org/publications/automatic-integration/) | [***``[code]``***](https://github.com/computational-imaging/automatic-integration)\n    > Numerical integration is a foundational technique in scientific computing and is at the core of many computer vision applications. Among these applications, implicit neural volume rendering has recently been proposed as a new paradigm for view synthesis, achieving photorealistic image quality. 
However, a fundamental obstacle to making these methods practical is the extreme computational and memory requirements caused by the required volume integrations along the rendered rays during training and inference. Millions of rays, each requiring hundreds of forward passes through a neural network are needed to approximate those integrations with Monte Carlo sampling. Here, we propose automatic integration, a new framework for learning efficient, closed-form solutions to integrals using implicit neural representation networks. For training, we instantiate the computational graph corresponding to the derivative of the implicit neural representation. The graph is fitted to the signal to integrate. After optimization, we reassemble the graph to obtain a network that represents the antiderivative. By the fundamental theorem of calculus, this enables the calculation of any definite integral in two evaluations of the network. Using this approach, we demonstrate a greater than 10× improvement in computation requirements, enabling fast neural volume rendering.\n  - [DeRF: Decomposed Radiance Fields](https://arxiv.org/abs/2011.12490) | [code]\n    > With the advent of Neural Radiance Fields (NeRF), neural networks can now render novel views of a 3D scene with quality that fools the human eye. Yet, generating these images is very computationally intensive, limiting their applicability in practical scenarios. In this paper, we propose a technique based on spatial decomposition capable of mitigating this issue. Our key observation is that there are diminishing returns in employing larger (deeper and/or wider) networks. Hence, we propose to spatially decompose a scene and dedicate smaller networks for each decomposed part. When working together, these networks can render the whole scene. This allows us near-constant inference time regardless of the number of decomposed parts. 
Moreover, we show that a Voronoi spatial decomposition is preferable for this purpose, as it is provably compatible with the Painter's Algorithm for efficient and GPU-friendly rendering. Our experiments show that for real-world scenes, our method provides up to 3x more efficient inference than NeRF (with the same rendering quality), or an improvement of up to 1.0~dB in PSNR (for the same inference cost).\n  - [DONeRF: Towards Real-Time Rendering of Compact Neural Radiance Fields using Depth Oracle Networks, CGF2021](https://depthoraclenerf.github.io/) | [***``[code]``***](https://github.com/facebookresearch/DONERF)\n    > The recent research explosion around Neural Radiance Fields (NeRFs) shows that there is immense potential for implicitly storing scene and lighting information in neural networks, e.g., for novel view generation. However, one major limitation preventing the widespread use of NeRFs is the prohibitive computational cost of excessive network evaluations along each view ray, requiring dozens of petaFLOPS when aiming for real-time rendering on current devices. We show that the number of samples required for each view ray can be significantly reduced when local samples are placed around surfaces in the scene. To this end, we propose a depth oracle network, which predicts ray sample locations for each view ray with a single network evaluation. We show that using a classification network around logarithmically discretized and spherically warped depth values is essential to encode surface locations rather than directly estimating depth. The combination of these techniques leads to DONeRF, a dual network design with a depth oracle network as a first step and a locally sampled shading network for ray accumulation. With our design, we reduce the inference costs by up to 48x compared to NeRF. 
Using an off-the-shelf inference API in combination with simple compute kernels, we are the first to render raymarching-based neural representations at interactive frame rates (15 frames per second at 800x800) on a single GPU. At the same time, since we focus on the important parts of the scene around surfaces, we achieve equal or better quality compared to NeRF.\n  - [FastNeRF: High-Fidelity Neural Rendering at 200FPS, ICCV2021](https://arxiv.org/abs/2103.10380) | [code]\n    > Recent work on Neural Radiance Fields (NeRF) showed how neural networks can be used to encode complex 3D environments that can be rendered photorealistically from novel viewpoints. Rendering these images is very computationally demanding and recent improvements are still a long way from enabling interactive rates, even on high-end hardware. Motivated by scenarios on mobile and mixed reality devices, we propose FastNeRF, the first NeRF-based system capable of rendering high fidelity photorealistic images at 200Hz on a high-end consumer GPU. The core of our method is a graphics-inspired factorization that allows for (i) compactly caching a deep radiance map at each position in space, (ii) efficiently querying that map using ray directions to estimate the pixel values in the rendered image. Extensive experiments show that the proposed method is 3000 times faster than the original NeRF algorithm and at least an order of magnitude faster than existing work on accelerating NeRF, while maintaining visual quality and extensibility.\n  - [KiloNeRF: Speeding up Neural Radiance Fields with Thousands of Tiny MLPs, ICCV2021](https://arxiv.org/abs/2103.13744) | [***``[code]``***](https://github.com/creiser/kilonerf/)\n    > NeRF synthesizes novel views of a scene with unprecedented quality by fitting a neural radiance field to RGB images. However, NeRF requires querying a deep Multi-Layer Perceptron (MLP) millions of times, leading to slow rendering times, even on modern GPUs. 
In this paper, we demonstrate that real-time rendering is possible by utilizing thousands of tiny MLPs instead of one single large MLP. In our setting, each individual MLP only needs to represent parts of the scene, thus smaller and faster-to-evaluate MLPs can be used. By combining this divide-and-conquer strategy with further optimizations, rendering is accelerated by three orders of magnitude compared to the original NeRF model without incurring high storage costs. Further, using teacher-student distillation for training, we show that this speed-up can be achieved without sacrificing visual quality.\n  - [PlenOctrees for Real-time Rendering of Neural Radiance Fields, ICCV2021(oral)](https://alexyu.net/plenoctrees/) | [***``[code]``***](https://github.com/sxyu/volrend)\n    > Real-time performance is achieved by pre-tabulating the NeRF into an octree-based radiance field that we call PlenOctrees. In order to preserve view-dependent effects such as specularities, we propose to encode appearances via closed-form spherical basis functions. Specifically, we show that it is possible to train NeRFs to predict a spherical harmonic representation of radiance, removing the viewing direction as input to the neural network. Furthermore, we show that our PlenOctrees can be directly optimized to further minimize the reconstruction loss, which leads to equal or better quality than competing methods. We further show that this octree optimization step can be used to accelerate the training time, as we no longer need to wait for the NeRF training to converge fully. Our real-time neural rendering approach may potentially enable new applications such as 6-DOF industrial and product visualizations, as well as next generation AR/VR systems.\n  - [Mixture of Volumetric Primitives for Efficient Neural Rendering, SIGGRAPH2021](https://arxiv.org/abs/2103.01954) | [code]\n    > Real-time rendering and animation of humans is a core function in games, movies, and telepresence applications. 
Existing methods have a number of drawbacks we aim to address with our work. Triangle meshes have difficulty modeling thin structures like hair, volumetric representations like Neural Volumes are too low-resolution given a reasonable memory budget, and high-resolution implicit representations like Neural Radiance Fields are too slow for use in real-time applications. We present Mixture of Volumetric Primitives (MVP), a representation for rendering dynamic 3D content that combines the completeness of volumetric representations with the efficiency of primitive-based rendering, e.g., point-based or mesh-based methods. Our approach achieves this by leveraging spatially shared computation with a deconvolutional architecture and by minimizing computation in empty regions of space with volumetric primitives that can move to cover only occupied regions. Our parameterization supports the integration of correspondence and tracking constraints, while being robust to areas where classical tracking fails, such as around thin or translucent structures and areas with large topological variability. MVP is a hybrid that generalizes both volumetric and primitive-based representations. Through a series of extensive experiments we demonstrate that it inherits the strengths of each, while avoiding many of their limitations. We also compare our approach to several state-of-the-art methods and demonstrate that MVP produces superior results in terms of quality and runtime performance.\n  - [Light Field Networks: Neural Scene Representations with Single-Evaluation Rendering, NeurIPS2021(spotlight)](https://www.vincentsitzmann.com/lfns/) | [***``[code]``***](https://github.com/vsitzmann/light-field-networks)\n    > Inferring representations of 3D scenes from 2D observations is a fundamental problem of computer graphics, computer vision, and artificial intelligence. Emerging 3D-structured neural scene representations are a promising approach to 3D scene understanding. 
In this work, we propose a novel neural scene representation, Light Field Networks or LFNs, which represent both geometry and appearance of the underlying 3D scene in a 360-degree, four-dimensional light field parameterized via a neural implicit representation. Rendering a ray from an LFN requires only a *single* network evaluation, as opposed to hundreds of evaluations per ray for ray-marching or volumetric based renderers in 3D-structured neural scene representations. In the setting of simple scenes, we leverage meta-learning to learn a prior over LFNs that enables multi-view consistent light field reconstruction from as little as a single image observation. This results in dramatic reductions in time and memory complexity, and enables real-time rendering. The cost of storing a 360-degree light field via an LFN is two orders of magnitude lower than conventional methods such as the Lumigraph. Utilizing the analytical differentiability of neural implicit representations and a novel parameterization of light space, we further demonstrate the extraction of sparse depth maps from LFNs.\n  - [Depth-supervised NeRF: Fewer Views and Faster Training for Free, CVPR2022](https://arxiv.org/abs/2107.02791) | [***``[code]``***](https://github.com/dunbar12138/DSNeRF)\n    > A commonly observed failure mode of Neural Radiance Field (NeRF) is fitting incorrect geometries when given an insufficient number of input views. One potential reason is that standard volumetric rendering does not enforce the constraint that most of a scene's geometry consist of empty space and opaque surfaces. We formalize the above assumption through DS-NeRF (Depth-supervised Neural Radiance Fields), a loss for learning radiance fields that takes advantage of readily-available depth supervision. We leverage the fact that current NeRF pipelines require images with known camera poses that are typically estimated by running structure-from-motion (SFM). 
Crucially, SFM also produces sparse 3D points that can be used as \"free\" depth supervision during training: we add a loss to encourage the distribution of a ray's terminating depth matches a given 3D keypoint, incorporating depth uncertainty. DS-NeRF can render better images given fewer training views while training 2-3x faster. Further, we show that our loss is compatible with other recently proposed NeRF methods, demonstrating that depth is a cheap and easily digestible supervisory signal. And finally, we find that DS-NeRF can support other types of depth supervision such as scanned depth sensors and RGB-D reconstruction outputs.\n  - [Direct Voxel Grid Optimization: Super-fast Convergence for Radiance Fields Reconstruction, CVPR2022(oral)](https://arxiv.org/abs/2111.11215) | [***``[code]``***](https://github.com/sunset1995/DirectVoxGO)\n    > We present a super-fast convergence approach to reconstructing the per-scene radiance field from a set of images that capture the scene with known poses. This task, which is often applied to novel view synthesis, is recently revolutionized by Neural Radiance Field (NeRF) for its state-of-the-art quality and flexibility. However, NeRF and its variants require a lengthy training time ranging from hours to days for a single scene. In contrast, our approach achieves NeRF-comparable quality and converges rapidly from scratch in less than 15 minutes with a single GPU. We adopt a representation consisting of a density voxel grid for scene geometry and a feature voxel grid with a shallow network for complex view-dependent appearance. Modeling with explicit and discretized volume representations is not new, but we propose two simple yet non-trivial techniques that contribute to fast convergence speed and high-quality output. First, we introduce the post-activation interpolation on voxel density, which is capable of producing sharp surfaces in lower grid resolution. 
Second, direct voxel density optimization is prone to suboptimal geometry solutions, so we robustify the optimization process by imposing several priors. Finally, evaluation on five inward-facing benchmarks shows that our method matches, if not surpasses, NeRF's quality, yet it only takes about 15 minutes to train from scratch for a new scene.\n  - [Implicit Mapping and Positioning in Real-Time, ICCV2021](https://arxiv.org/abs/2103.12352) | [code]\n    > We show for the first time that a multilayer perceptron (MLP) can serve as the only scene representation in a real-time SLAM system for a handheld RGB-D camera. Our network is trained in live operation without prior data, building a dense, scene-specific implicit 3D model of occupancy and colour which is also immediately used for tracking.\n  - [Mip-NeRF: A Multiscale Representation for Anti-Aliasing Neural Radiance Fields, ICCV2021(oral)](https://jonbarron.info/mipnerf/) | [***``[code]``***](https://github.com/google/mipnerf)\n    > The rendering procedure used by neural radiance fields (NeRF) samples a scene with a single ray per pixel and may therefore produce renderings that are excessively blurred or aliased when training or testing images observe scene content at different resolutions. The straightforward solution of supersampling by rendering with multiple rays per pixel is impractical for NeRF, because rendering each ray requires querying a multilayer perceptron hundreds of times. Our solution, which we call \"mip-NeRF\" (à la \"mipmap\"), extends NeRF to represent the scene at a continuously-valued scale. By efficiently rendering anti-aliased conical frustums instead of rays, mip-NeRF reduces objectionable aliasing artifacts and significantly improves NeRF's ability to represent fine details, while also being 7% faster than NeRF and half the size. 
Compared to NeRF, mip-NeRF reduces average error rates by 17% on the dataset presented with NeRF and by 60% on a challenging multiscale variant of that dataset that we present. mip-NeRF is also able to match the accuracy of a brute-force supersampled NeRF on our multiscale dataset while being 22x faster.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/generalization.md",
    "content": "\nWeekly Classified Neural Radiance Fields - generalization ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n==========================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n  - [Neural Radiance Fields from Sparse RGB-D Images for High-Quality View Synthesis, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9999509) | [code]\n    > The recently proposed neural radiance fields (NeRF) use a continuous function formulated as a multi-layer perceptron (MLP) to model the appearance and geometry of a 3D scene. This enables realistic synthesis of novel views, even for scenes with view dependent appearance. Many follow-up works have since extended NeRFs in different ways. However, a fundamental restriction of the method remains that it requires a large number of images captured from densely placed viewpoints for high-quality synthesis and the quality of the results quickly degrades when the number of captured views is insufficient. To address this problem, we propose a novel NeRF-based framework capable of high-quality view synthesis using only a sparse set of RGB-D images, which can be easily captured using cameras and LiDAR sensors on current consumer devices. First, a geometric proxy of the scene is reconstructed from the captured RGB-D images. 
Renderings of the reconstructed scene along with precise camera parameters can then be used to pre-train a network. Finally, the network is fine-tuned with a small number of real captured images. We further introduce a patch discriminator to supervise the network under novel views during fine-tuning, as well as a 3D color prior to improve synthesis quality. We demonstrate that our method can generate arbitrary novel views of a 3D scene from as few as 6 RGB-D images. Extensive experiments show the improvements of our method compared with the existing NeRF-based methods, including approaches that also aim to reduce the number of input images.\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n  - [Diffusion Guided Domain Adaptation of Image Generators](https://arxiv.org/abs/2212.04473) | [code]\n    > Can a text-to-image diffusion model be used as a training objective for adapting a GAN generator to another domain? In this paper, we show that the classifier-free guidance can be leveraged as a critic and enable generators to distill knowledge from large-scale text-to-image diffusion models. Generators can be efficiently shifted into new domains indicated by text prompts without access to groundtruth samples from target domains. We demonstrate the effectiveness and controllability of our method through extensive experiments. Although not trained to minimize CLIP loss, our model achieves equally high CLIP scores and significantly lower FID than prior work on short prompts, and outperforms the baseline qualitatively and quantitatively on long and complicated prompts. To our best knowledge, the proposed method is the first attempt at incorporating large-scale pre-trained diffusion models and distillation sampling for text-driven image generator domain adaptation and gives a quality previously beyond possible. 
Moreover, we extend our work to 3D-aware style-based generators and DreamBooth guidance.\n  - [NeRDi: Single-View NeRF Synthesis with Language-Guided Diffusion as General Image Priors](https://arxiv.org/abs/2212.03267) | [code]\n    > 2D-to-3D reconstruction is an ill-posed problem, yet humans are good at solving this problem due to their prior knowledge of the 3D world developed over years. Driven by this observation, we propose NeRDi, a single-view NeRF synthesis framework with general image priors from 2D diffusion models. Formulating single-view reconstruction as an image-conditioned 3D generation problem, we optimize the NeRF representations by minimizing a diffusion loss on its arbitrary view renderings with a pretrained image diffusion model under the input-view constraint. We leverage off-the-shelf vision-language models and introduce a two-section language guidance as conditioning inputs to the diffusion model. This is essentially helpful for improving multiview content coherence as it narrows down the general image prior conditioned on the semantic and visual features of the single-view input image. Additionally, we introduce a geometric loss based on estimated depth maps to regularize the underlying 3D geometry of the NeRF. Experimental results on the DTU MVS dataset show that our method can synthesize novel views with higher quality even compared to existing methods trained on this dataset. We also demonstrate our generalizability in zero-shot NeRF synthesis for in-the-wild images.\n## Nov27 - Dec3, 2022\n  - [StegaNeRF: Embedding Invisible Information within Neural Radiance Fields](https://arxiv.org/abs/2212.01602) | [***``[code]``***](https://github.com/XGGNet/StegaNeRF)\n    > Recent advances in neural rendering imply a future of widespread visual data distributions through sharing NeRF model weights. 
However, while common visual data (images and videos) have standard approaches to embed ownership or copyright information explicitly or subtly, the problem remains unexplored for the emerging NeRF format. We present StegaNeRF, a method for steganographic information embedding in NeRF renderings. We design an optimization framework allowing accurate hidden information extractions from images rendered by NeRF, while preserving its original visual quality. We perform experimental evaluations of our method under several potential deployment scenarios, and we further discuss the insights discovered through our analysis. StegaNeRF signifies an initial exploration into the novel problem of instilling customizable, imperceptible, and recoverable information to NeRF renderings, with minimal impact to rendered images. Project page: this https URL.\n  - [LatentSwap3D: Semantic Edits on 3D Image GANs](https://arxiv.org/abs/2212.01381) | [***``[code]``***](https://github.com/enisimsar/latentswap3d)\n    > Recent 3D-aware GANs rely on volumetric rendering techniques to disentangle the pose and appearance of objects, de facto generating entire 3D volumes rather than single-view 2D images from a latent code. Complex image editing tasks can be performed in standard 2D-based GANs (e.g., StyleGAN models) as manipulation of latent dimensions. However, to the best of our knowledge, similar properties have only been partially explored for 3D-aware GAN models. This work aims to fill this gap by showing the limitations of existing methods and proposing LatentSwap3D, a model-agnostic approach designed to enable attribute editing in the latent space of pre-trained 3D-aware GANs. We first identify the most relevant dimensions in the latent space of the model controlling the targeted attribute by relying on the feature importance ranking of a random forest classifier. 
Then, to apply the transformation, we swap the top-K most relevant latent dimensions of the image being edited with an image exhibiting the desired attribute. Despite its simplicity, LatentSwap3D provides remarkable semantic edits in a disentangled manner and outperforms alternative approaches both qualitatively and quantitatively. We demonstrate our semantic edit approach on various 3D-aware generative models such as pi-GAN, GIRAFFE, StyleSDF, MVCGAN, EG3D and VolumeGAN, and on diverse datasets, such as FFHQ, AFHQ, Cats, MetFaces, and CompCars. The project page can be found: \\url{this https URL}.\n  - [DiffRF: Rendering-Guided 3D Radiance Field Diffusion](https://arxiv.org/abs/2212.01206) | [code]\n    > We introduce DiffRF, a novel approach for 3D radiance field synthesis based on denoising diffusion probabilistic models. While existing diffusion-based methods operate on images, latent codes, or point cloud data, we are the first to directly generate volumetric radiance fields. To this end, we propose a 3D denoising model which directly operates on an explicit voxel grid representation. However, as radiance fields generated from a set of posed images can be ambiguous and contain artifacts, obtaining ground truth radiance field samples is non-trivial. We address this challenge by pairing the denoising formulation with a rendering loss, enabling our model to learn a deviated prior that favours good image quality instead of trying to replicate fitting errors like floating artifacts. In contrast to 2D-diffusion models, our model learns multi-view consistent priors, enabling free-view synthesis and accurate shape generation. 
Compared to 3D GANs, our diffusion-based approach naturally enables conditional generation such as masked completion or single-view 3D synthesis at inference time.\n  - [SparseFusion: Distilling View-conditioned Diffusion for 3D Reconstruction](https://arxiv.org/abs/2212.00792) | [code]\n    > We propose SparseFusion, a sparse view 3D reconstruction approach that unifies recent advances in neural rendering and probabilistic image generation. Existing approaches typically build on neural rendering with re-projected features but fail to generate unseen regions or handle uncertainty under large viewpoint changes. Alternate methods treat this as a (probabilistic) 2D synthesis task, and while they can generate plausible 2D images, they do not infer a consistent underlying 3D. However, we find that this trade-off between 3D consistency and probabilistic image generation does not need to exist. In fact, we show that geometric consistency and generative inference can be complementary in a mode-seeking behavior. By distilling a 3D consistent scene representation from a view-conditioned latent diffusion model, we are able to recover a plausible 3D representation whose renderings are both accurate and realistic. We evaluate our approach across 51 categories in the CO3D dataset and show that it outperforms existing methods, in both distortion and perception metrics, for sparse-view novel view synthesis.\n  - [Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation](https://arxiv.org/abs/2212.00774) | [code]\n    > A diffusion model learns to predict a vector field of gradients. We propose to apply chain rule on the learned gradients, and back-propagate the score of a diffusion model through the Jacobian of a differentiable renderer, which we instantiate to be a voxel radiance field. This setup aggregates 2D scores at multiple camera viewpoints into a 3D score, and repurposes a pretrained 2D model for 3D data generation. 
We identify a technical challenge of distribution mismatch that arises in this application, and propose a novel estimation mechanism to resolve it. We run our algorithm on several off-the-shelf diffusion image generative models, including the recently released Stable Diffusion trained on the large-scale LAION dataset.\n  - [3D-LDM: Neural Implicit 3D Shape Generation with Latent Diffusion Models](https://arxiv.org/abs/2212.00842) | [code]\n    > Diffusion models have shown great promise for image generation, beating GANs in terms of generation diversity, with comparable image quality. However, their application to 3D shapes has been limited to point or voxel representations that can in practice not accurately represent a 3D surface. We propose a diffusion model for neural implicit representations of 3D shapes that operates in the latent space of an auto-decoder. This allows us to generate diverse and high quality 3D surfaces. We additionally show that we can condition our model on images or text to enable image-to-3D generation and text-to-3D generation using CLIP embeddings. Furthermore, adding noise to the latent codes of existing shapes allows us to explore shape variations.\n  - [Efficient Neural Radiance Fields for Interactive Free-viewpoint Video, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555376) | [code]\n    > This paper aims to tackle the challenge of efficiently producing interactive free-viewpoint videos. Some recent works equip neural radiance fields with image encoders, enabling them to generalize across scenes. When processing dynamic scenes, they can simply treat each video frame as an individual scene and perform novel view synthesis to generate free-viewpoint videos. However, their rendering process is slow and cannot support interactive applications. A major factor is that they sample lots of points in empty space when inferring radiance fields. 
We propose a novel scene representation, called ENeRF, for the fast creation of interactive free-viewpoint videos. Specifically, given multi-view images at one frame, we first build the cascade cost volume to predict the coarse geometry of the scene. The coarse geometry allows us to sample few points near the scene surface, thereby significantly improving the rendering speed. This process is fully differentiable, enabling us to jointly learn the depth prediction and radiance field networks from RGB images. Experiments on multiple benchmarks show that our approach exhibits competitive performance while being at least 60 times faster than previous generalizable radiance field methods.\n  - [A Light Touch Approach to Teaching Transformers Multi-view Geometry](https://arxiv.org/abs/2211.15107) | [code]\n    > Transformers are powerful visual learners, in large part due to their conspicuous lack of manually-specified priors. This flexibility can be problematic in tasks that involve multiple-view geometry, due to the near-infinite possible variations in 3D shapes and viewpoints (requiring flexibility), and the precise nature of projective geometry (obeying rigid laws). To resolve this conundrum, we propose a \"light touch\" approach, guiding visual Transformers to learn multiple-view geometry but allowing them to break free when needed. We achieve this by using epipolar lines to guide the Transformer's cross-attention maps, penalizing attention values outside the epipolar lines and encouraging higher attention along these lines since they contain geometrically plausible matches. Unlike previous methods, our proposal does not require any camera pose information at test-time. We focus on pose-invariant object instance retrieval, where standard Transformer networks struggle, due to the large differences in viewpoint between query and retrieved images. 
Experimentally, our method outperforms state-of-the-art approaches at object retrieval, without needing pose information at test-time.\n  - [High-fidelity 3D GAN Inversion by Pseudo-multi-view Optimization](https://arxiv.org/abs/2211.15662) | [***``[code]``***](https://github.com/jiaxinxie97/HFGI3D)\n    > We present a high-fidelity 3D generative adversarial network (GAN) inversion framework that can synthesize photo-realistic novel views while preserving specific details of the input image. High-fidelity 3D GAN inversion is inherently challenging due to the geometry-texture trade-off in high-fidelity 3D inversion, where overfitting to a single view input image often damages the estimated geometry during the latent optimization. To solve this challenge, we propose a novel pipeline that builds on the pseudo-multi-view estimation with visibility analysis. We keep the original textures for the visible parts and utilize generative priors for the occluded parts. Extensive experiments show that our approach achieves advantageous reconstruction and novel view synthesis quality over state-of-the-art methods, even for images with out-of-distribution textures. The proposed pipeline also enables image attribute editing with the inverted latent code and 3D-aware texture modification. Our approach enables high-fidelity 3D rendering from a single image, which is promising for various applications of AI-generated 3D content.\n## Nov20 - Nov26, 2022\n  - [Unsupervised Continual Semantic Adaptation through Neural Rendering](https://arxiv.org/abs/2211.13969) | [code]\n    > An increasing amount of applications rely on data-driven models that are deployed for perception tasks across a sequence of scenes. Due to the mismatch between training and deployment data, adapting the model on the new scenes is often crucial to obtain good performance. 
In this work, we study continual multi-scene adaptation for the task of semantic segmentation, assuming that no ground-truth labels are available during deployment and that performance on the previous scenes should be maintained. We propose training a Semantic-NeRF network for each scene by fusing the predictions of a segmentation model and then using the view-consistent rendered semantic labels as pseudo-labels to adapt the model. Through joint training with the segmentation model, the Semantic-NeRF model effectively enables 2D-3D knowledge transfer. Furthermore, due to its compact size, it can be stored in a long-term memory and subsequently used to render data from arbitrary viewpoints to reduce forgetting. We evaluate our approach on ScanNet, where we outperform both a voxel-based baseline and a state-of-the-art unsupervised domain adaptation method.\n  - [ShadowNeuS: Neural SDF Reconstruction by Shadow Ray Supervision](https://arxiv.org/abs/2211.14086) | [code]\n    > By supervising camera rays between a scene and multi-view image planes, NeRF reconstructs a neural scene representation for the task of novel view synthesis. On the other hand, shadow rays between the light source and the scene have yet to be considered. Therefore, we propose a novel shadow ray supervision scheme that optimizes both the samples along the ray and the ray location. By supervising shadow rays, we successfully reconstruct a neural SDF of the scene from single-view pure shadow or RGB images under multiple lighting conditions. Given single-view binary shadows, we train a neural network to reconstruct a complete scene not limited by the camera's line of sight. By further modeling the correlation between the image colors and the shadow rays, our technique can also be effectively extended to RGB inputs. We compare our method with previous works on challenging tasks of shape reconstruction from single-view binary shadow or RGB images and observe significant improvements. 
The code and data will be released.\n  - [Peekaboo: Text to Image Diffusion Models are Zero-Shot Segmentors](https://arxiv.org/abs/2211.13224) | [code]\n    > Recent diffusion-based generative models combined with vision-language models are capable of creating realistic images from natural language prompts. While these models are trained on large internet-scale datasets, such pre-trained models are not directly introduced to any semantic localization or grounding. Most current approaches for localization or grounding rely on human-annotated localization information in the form of bounding boxes or segmentation masks. The exceptions are a few unsupervised methods that utilize architectures or loss functions geared towards localization, but they need to be trained separately. In this work, we explore how off-the-shelf diffusion models, trained with no exposure to such localization information, are capable of grounding various semantic phrases with no segmentation-specific re-training. An inference time optimization process is introduced, that is capable of generating segmentation masks conditioned on natural language. We evaluate our proposal Peekaboo for unsupervised semantic segmentation on the Pascal VOC dataset. In addition, we evaluate for referring segmentation on the RefCOCO dataset. In summary, we present a first zero-shot, open-vocabulary, unsupervised (no localization information), semantic grounding technique leveraging diffusion-based generative models with no re-training. Our code will be released publicly.\n  - [PANeRF: Pseudo-view Augmentation for Improved Neural Radiance Fields Based on Few-shot Inputs](https://arxiv.org/abs/2211.12758) | [code]\n    > The method of neural radiance fields (NeRF) has been developed in recent years, and this technology has promising applications for synthesizing novel views of complex scenes. However, NeRF requires dense input views, typically numbering in the hundreds, for generating high-quality images. 
With a decrease in the number of input views, the rendering quality of NeRF for unseen viewpoints tends to degenerate drastically. To overcome this challenge, we propose pseudo-view augmentation of NeRF, a scheme that expands a sufficient amount of data by considering the geometry of few-shot inputs. We first initialized the NeRF network by leveraging the expanded pseudo-views, which efficiently minimizes uncertainty when rendering unseen views. Subsequently, we fine-tuned the network by utilizing sparse-view inputs containing precise geometry and color information. Through experiments under various settings, we verified that our model faithfully synthesizes novel-view images of superior quality and outperforms existing methods for multi-view datasets.\n  - [Zero NeRF: Registration with Zero Overlap](https://arxiv.org/abs/2211.12544) | [code]\n    > We present Zero-NeRF, a projective surface registration method that, to the best of our knowledge, offers the first general solution capable of alignment between scene representations with minimal or zero visual correspondence. To do this, we enforce consistency between visible surfaces of partial and complete reconstructions, which allows us to constrain occluded geometry. We use a NeRF as our surface representation and the NeRF rendering pipeline to perform this alignment. To demonstrate the efficacy of our method, we register real-world scenes from opposite sides with infinitesimal overlaps that cannot be accurately registered using prior methods, and we compare these results against widely used registration methods.\n  - [SPARF: Neural Radiance Fields from Sparse and Noisy Poses](https://arxiv.org/abs/2211.11738) | [code]\n    > Neural Radiance Field (NeRF) has recently emerged as a powerful representation to synthesize photorealistic novel views. 
While showing impressive performance, it relies on the availability of dense input views with highly accurate camera poses, thus limiting its application in real-world scenarios. In this work, we introduce Sparse Pose Adjusting Radiance Field (SPARF), to address the challenge of novel-view synthesis given only few wide-baseline input images (as low as 3) with noisy camera poses. Our approach exploits multi-view geometry constraints in order to jointly learn the NeRF and refine the camera poses. By relying on pixel matches extracted between the input views, our multi-view correspondence objective enforces the optimized scene and camera poses to converge to a global and geometrically accurate solution. Our depth consistency loss further encourages the reconstructed scene to be consistent from any viewpoint. Our approach sets a new state of the art in the sparse-view regime on multiple challenging datasets.\n## Nov13 - Nov19, 2022\n  - [Magic3D: High-Resolution Text-to-3D Content Creation](https://arxiv.org/abs/2211.10440) | [code]\n    > DreamFusion has recently demonstrated the utility of a pre-trained text-to-image diffusion model to optimize Neural Radiance Fields (NeRF), achieving remarkable text-to-3D synthesis results. However, the method has two inherent limitations: (a) extremely slow optimization of NeRF and (b) low-resolution image space supervision on NeRF, leading to low-quality 3D models with a long processing time. In this paper, we address these limitations by utilizing a two-stage optimization framework. First, we obtain a coarse model using a low-resolution diffusion prior and accelerate with a sparse 3D hash grid structure. Using the coarse representation as the initialization, we further optimize a textured 3D mesh model with an efficient differentiable renderer interacting with a high-resolution latent diffusion model. 
Our method, dubbed Magic3D, can create high quality 3D mesh models in 40 minutes, which is 2x faster than DreamFusion (reportedly taking 1.5 hours on average), while also achieving higher resolution. User studies show 61.7% raters to prefer our approach over DreamFusion. Together with the image-conditioned generation capabilities, we provide users with new ways to control 3D synthesis, opening up new avenues to various creative applications.\n  - [RenderDiffusion: Image Diffusion for 3D Reconstruction, Inpainting and Generation](https://arxiv.org/abs/2211.09869) | [code]\n    > Diffusion models currently achieve state-of-the-art performance for both conditional and unconditional image generation. However, so far, image diffusion models do not support tasks required for 3D understanding, such as view-consistent 3D generation or single-view object reconstruction. In this paper, we present RenderDiffusion as the first diffusion model for 3D generation and inference that can be trained using only monocular 2D supervision. At the heart of our method is a novel image denoising architecture that generates and renders an intermediate three-dimensional representation of a scene in each denoising step. This enforces a strong inductive structure into the diffusion process that gives us a 3D consistent representation while only requiring 2D supervision. The resulting 3D representation can be rendered from any viewpoint. We evaluate RenderDiffusion on ShapeNet and Clevr datasets and show competitive performance for generation of 3D scenes and inference of 3D scenes from 2D images. Additionally, our diffusion-based approach allows us to use 2D inpainting to edit 3D scenes. 
We believe that our work promises to enable full 3D generation at scale when trained on massive image collections, thus circumventing the need to have large-scale 3D model collections for supervision.\n## Nov6 - Nov12, 2022\n  - [Common Pets in 3D: Dynamic New-View Synthesis of Real-Life Deformable Categories](https://arxiv.org/abs/2211.03889) | [code]\n    > Obtaining photorealistic reconstructions of objects from sparse views is inherently ambiguous and can only be achieved by learning suitable reconstruction priors. Earlier works on sparse rigid object reconstruction successfully learned such priors from large datasets such as CO3D. In this paper, we extend this approach to dynamic objects. We use cats and dogs as a representative example and introduce Common Pets in 3D (CoP3D), a collection of crowd-sourced videos showing around 4,200 distinct pets. CoP3D is one of the first large-scale datasets for benchmarking non-rigid 3D reconstruction \"in the wild\". We also propose Tracker-NeRF, a method for learning 4D reconstruction from our dataset. At test time, given a small number of video frames of an unseen object, Tracker-NeRF predicts the trajectories of its 3D points and generates new views, interpolating viewpoint and time. Results on CoP3D reveal significantly better non-rigid new-view synthesis performance than existing baselines.\n## Oct30 - Nov5, 2022\n  - [Neural Grasp Distance Fields for Robot Manipulation](https://arxiv.org/abs/2211.02647) | [code]\n    > We formulate grasp learning as a neural field and present Neural Grasp Distance Fields (NGDF). Here, the input is a 6D pose of a robot end effector and output is a distance to a continuous manifold of valid grasps for an object. In contrast to current approaches that predict a set of discrete candidate grasps, the distance-based NGDF representation is easily interpreted as a cost, and minimizing this cost produces a successful grasp pose. 
This grasp distance cost can be incorporated directly into a trajectory optimizer for joint optimization with other costs such as trajectory smoothness and collision avoidance. During optimization, as the various costs are balanced and minimized, the grasp target is allowed to smoothly vary, as the learned grasp field is continuous. In simulation benchmarks with a Franka arm, we find that joint grasping and planning with NGDF outperforms baselines by 63% execution success while generalizing to unseen query poses and unseen object shapes. Project page: this https URL.\n## Oct23 - Oct29, 2022\n  - [Compressing Explicit Voxel Grid Representations: fast NeRFs become also small](https://arxiv.org/abs/2210.12782) | [code]\n    > NeRFs have revolutionized the world of per-scene radiance field reconstruction because of their intrinsic compactness. One of the main limitations of NeRFs is their slow rendering speed, both at training and inference time. Recent research focuses on the optimization of an explicit voxel grid (EVG) that represents the scene, which can be paired with neural networks to learn radiance fields. This approach significantly enhances the speed both at train and inference time, but at the cost of large memory occupation. In this work we propose Re:NeRF, an approach that specifically targets EVG-NeRFs compressibility, aiming to reduce memory storage of NeRF models while maintaining comparable performance. We benchmark our approach with three different EVG-NeRF architectures on four popular benchmarks, showing Re:NeRF's broad usability and effectiveness.\n## Oct16 - Oct22, 2022\n  - [TANGO: Text-driven Photorealistic and Robust 3D Stylization via Lighting Decomposition, NeurIPS2022](https://arxiv.org/abs/2210.11277) | [***``[code]``***](https://cyw-3d.github.io/tango/)\n    > Creation of 3D content by stylization is a promising yet challenging problem in computer vision and graphics research. 
In this work, we focus on stylizing photorealistic appearance renderings of a given surface mesh of arbitrary topology. Motivated by the recent surge of cross-modal supervision of the Contrastive Language-Image Pre-training (CLIP) model, we propose TANGO, which transfers the appearance style of a given 3D shape according to a text prompt in a photorealistic manner. Technically, we propose to disentangle the appearance style as the spatially varying bidirectional reflectance distribution function, the local geometric variation, and the lighting condition, which are jointly optimized, via supervision of the CLIP loss, by a spherical Gaussians based differentiable renderer. As such, TANGO enables photorealistic 3D style transfer by automatically predicting reflectance effects even for bare, low-quality meshes, without training on a task-specific dataset. Extensive experiments show that TANGO outperforms existing methods of text-driven 3D style transfer in terms of photorealistic quality, consistency of 3D geometry, and robustness when stylizing low-quality meshes. Our codes and results are available at our project webpage this https URL.\n  - [Coordinates Are NOT Lonely -- Codebook Prior Helps Implicit Neural 3D Representations, NeurIPS2022](https://arxiv.org/abs/2210.11170) | [code]\n    > Implicit neural 3D representation has achieved impressive results in surface or scene reconstruction and novel view synthesis, which typically uses the coordinate-based multi-layer perceptrons (MLPs) to learn a continuous scene representation. However, existing approaches, such as Neural Radiance Field (NeRF) and its variants, usually require dense input views (i.e. 50-150) to obtain decent results. To relieve the over-dependence on massive calibrated images and enrich the coordinate-based feature representation, we explore injecting the prior information into the coordinate-based network and introduce a novel coordinate-based model, CoCo-INR, for implicit neural 3D representation. 
The cores of our method are two attention modules: codebook attention and coordinate attention. The former extracts the useful prototypes containing rich geometry and appearance information from the prior codebook, and the latter propagates such prior information into each coordinate and enriches its feature representation for a scene or object surface. With the help of the prior information, our method can render 3D views with more photo-realistic appearance and geometries than the current methods using fewer calibrated images available. Experiments on various scene reconstruction datasets, including DTU and BlendedMVS, and the full 3D head reconstruction dataset, H3DS, demonstrate the robustness under fewer input views and fine detail-preserving capability of our proposed method.\n## Oct9 - Oct15, 2022\n  - [AniFaceGAN: Animatable 3D-Aware Face Image Generation for Video Avatars, NeurIPS2022](https://arxiv.org/abs/2210.06465) | [***``[code]``***](https://yuewuhkust.github.io/AniFaceGAN/files/github_icon.jpeg)\n    > Although 2D generative models have made great progress in face image generation and animation, they often suffer from undesirable artifacts such as 3D inconsistency when rendering images from different camera viewpoints. This prevents them from synthesizing video animations indistinguishable from real ones. Recently, 3D-aware GANs extend 2D GANs for explicit disentanglement of camera pose by leveraging 3D scene representations. These methods can well preserve the 3D consistency of the generated images across different views, yet they cannot achieve fine-grained control over other attributes, among which facial expression control is arguably the most useful and desirable for face animation. In this paper, we propose an animatable 3D-aware GAN for multiview consistent face animation generation. 
The key idea is to decompose the 3D representation of the 3D-aware GAN into a template field and a deformation field, where the former represents different identities with a canonical expression, and the latter characterizes expression variations of each identity. To achieve meaningful control over facial expressions via deformation, we propose a 3D-level imitative learning scheme between the generator and a parametric 3D face model during adversarial training of the 3D-aware GAN. This helps our method achieve high-quality animatable face image generation with strong visual 3D consistency, even though trained with only unstructured 2D images. Extensive experiments demonstrate our superior performance over prior works. Project page: this https URL\n  - [LION: Latent Point Diffusion Models for 3D Shape Generation, NeurIPS2022](https://arxiv.org/abs/2210.06978) | [***``[code]``***](https://nv-tlabs.github.io/LION)\n    > Denoising diffusion models (DDMs) have shown promising results in 3D point cloud synthesis. To advance 3D DDMs and make them useful for digital artists, we require (i) high generation quality, (ii) flexibility for manipulation and applications such as conditional synthesis and shape interpolation, and (iii) the ability to output smooth surfaces or meshes. To this end, we introduce the hierarchical Latent Point Diffusion Model (LION) for 3D shape generation. LION is set up as a variational autoencoder (VAE) with a hierarchical latent space that combines a global shape latent representation with a point-structured latent space. For generation, we train two hierarchical DDMs in these latent spaces. The hierarchical VAE approach boosts performance compared to DDMs that operate on point clouds directly, while the point-structured latents are still ideally suited for DDM-based modeling. Experimentally, LION achieves state-of-the-art generation performance on multiple ShapeNet benchmarks. 
Furthermore, our VAE framework allows us to easily use LION for different relevant tasks: LION excels at multimodal shape denoising and voxel-conditioned synthesis, and it can be adapted for text- and image-driven 3D generation. We also demonstrate shape autoencoding and latent shape interpolation, and we augment LION with modern surface reconstruction techniques to generate smooth 3D meshes. We hope that LION provides a powerful tool for artists working with 3D shapes due to its high-quality generation, flexibility, and surface reconstruction. Project page and code: this https URL.\n  - [CLIP-Fields: Weakly Supervised Semantic Fields for Robotic Memory](https://mahis.life/clip-fields/) | [code]\n    > We propose CLIP-Fields, an implicit scene model that can be trained with no direct human supervision. This model learns a mapping from spatial locations to semantic embedding vectors. The mapping can then be used for a variety of tasks, such as segmentation, instance identification, semantic search over space, and view localization. Most importantly, the mapping can be trained with supervision coming only from web-image and web-text trained models such as CLIP, Detic, and Sentence-BERT. When compared to baselines like Mask-RCNN, our method outperforms on few-shot instance identification or semantic segmentation on the HM3D dataset with only a fraction of the examples. Finally, we show that using CLIP-Fields as a scene memory, robots can perform semantic navigation in real-world environments. Our code and demonstrations are available here: https://mahis.life/clip-fields/\n## Oct2 - Oct8, 2022\n  - [Self-improving Multiplane-to-layer Images for Novel View Synthesis, WACV2023](https://samsunglabs.github.io/MLI/) | [***``[code]``***](https://github.com/SamsungLabs/MLI)\n    > We present a new method for lightweight novel-view synthesis that generalizes to an arbitrary forward-facing scene. 
Recent approaches are computationally expensive, require per-scene optimization, or produce a memory-expensive representation. We start by representing the scene with a set of fronto-parallel semitransparent planes and afterward convert them to deformable layers in an end-to-end manner. Additionally, we employ a feed-forward refinement procedure that corrects the estimated representation by aggregating information from input views. Our method does not require fine-tuning when a new scene is processed and can handle an arbitrary number of views without restrictions. Experimental results show that our approach surpasses recent models in terms of common metrics and human evaluation, with the noticeable advantage in inference speed and compactness of the inferred layered geometry, see this https URL\n  - [Uncertainty-Driven Active Vision for Implicit Scene Reconstruction](https://arxiv.org/abs/2210.00978) | [code]\n    > Multi-view implicit scene reconstruction methods have become increasingly popular due to their ability to represent complex scene details. Recent efforts have been devoted to improving the representation of input information and to reducing the number of views required to obtain high quality reconstructions. Yet, perhaps surprisingly, the study of which views to select to maximally improve scene understanding remains largely unexplored. We propose an uncertainty-driven active vision approach for implicit scene reconstruction, which leverages occupancy uncertainty accumulated across the scene using volume rendering to select the next view to acquire. To this end, we develop an occupancy-based reconstruction method which accurately represents scenes using either 2D or 3D supervision. 
We evaluate our proposed approach on the ABC dataset and the in the wild CO3D dataset, and show that: (1) we are able to obtain high quality state-of-the-art occupancy reconstructions; (2) our perspective conditioned uncertainty definition is effective to drive improvements in next best view selection and outperforms strong baseline approaches; and (3) we can further improve shape understanding by performing a gradient-based search on the view selection candidates. Overall, our results highlight the importance of view selection for implicit scene reconstruction, making it a promising avenue to explore further.\n  - [SinGRAV: Learning a Generative Radiance Volume from a Single Natural Scene](https://arxiv.org/abs/2210.01202) | [code]\n    > We present a 3D generative model for general natural scenes. Lacking necessary volumes of 3D data characterizing the target scene, we propose to learn from a single scene. Our key insight is that a natural scene often contains multiple constituents whose geometry, texture, and spatial arrangements follow some clear patterns, but still exhibit rich variations over different regions within the same scene. This suggests localizing the learning of a generative model on substantial local regions. Hence, we exploit a multi-scale convolutional network, which possesses the spatial locality bias in nature, to learn from the statistics of local regions at multiple scales within a single scene. In contrast to existing methods, our learning setup bypasses the need to collect data from many homogeneous 3D scenes for learning common features. We coin our method SinGRAV, for learning a Generative RAdiance Volume from a Single natural scene. 
We demonstrate the ability of SinGRAV in generating plausible and diverse variations from a single scene, the merits of SinGRAV over state-of-the-art generative neural scene methods, as well as the versatility of SinGRAV by its use in a variety of applications, spanning 3D scene editing, composition, and animation. Code and data will be released to facilitate further research.\n  - [IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable Novel View Synthesis](https://arxiv.org/abs/2210.00647) | [***``[code]``***](https://github.com/zju3dv/IntrinsicNeRF)\n    > We present intrinsic neural radiance fields, dubbed IntrinsicNeRF, that introduce intrinsic decomposition into the NeRF-based~\\cite{mildenhall2020nerf} neural rendering method and can perform editable novel view synthesis in room-scale scenes while existing inverse rendering combined with neural rendering methods~\\cite{zhang2021physg, zhang2022modeling} can only work on object-specific scenes. Given that intrinsic decomposition is a fundamentally ambiguous and under-constrained inverse problem, we propose a novel distance-aware point sampling and adaptive reflectance iterative clustering optimization method that enables IntrinsicNeRF with traditional intrinsic decomposition constraints to be trained in an unsupervised manner, resulting in temporally consistent intrinsic decomposition results. To cope with the problem of different adjacent instances of similar reflectance in a scene being incorrectly clustered together, we further propose a hierarchical clustering method with coarse-to-fine optimization to obtain a fast hierarchical indexing representation. It enables compelling real-time augmented reality applications such as scene recoloring, material editing, and illumination variation. 
Extensive experiments on Blender Object and Replica Scene demonstrate that we can obtain high-quality, consistent intrinsic decomposition results and high-fidelity novel view synthesis even for challenging sequences. Code and data are available on the project webpage: this https URL.\n## Sep25 - Oct1, 2022\n  - [Structure-Aware NeRF without Posed Camera via Epipolar Constraint](https://arxiv.org/abs/2210.00183) | [***``[code]``***](https://github.com/XTU-PR-LAB/SaNerf)\n    > The neural radiance field (NeRF) for realistic novel view synthesis requires camera poses to be pre-acquired by a structure-from-motion (SfM) approach. This two-stage strategy is not convenient to use and degrades the performance because the error in the pose extraction can propagate to the view synthesis. We integrate the pose extraction and view synthesis into a single end-to-end procedure so they can benefit from each other. For training NeRF models, only RGB images are given, without pre-known camera poses. The camera poses are obtained by the epipolar constraint in which the identical feature in different views has the same world coordinates transformed from the local camera coordinates according to the extracted poses. The epipolar constraint is jointly optimized with pixel color constraint. The poses are represented by a CNN-based deep network, whose input is the related frames. This joint optimization enables NeRF to be aware of the scene's structure that has an improved generalization performance. Extensive experiments on a variety of scenes demonstrate the effectiveness of the proposed approach. Code is available at this https URL.\n  - [Improving 3D-aware Image Synthesis with A Geometry-aware Discriminator, NeurIPS2022](https://arxiv.org/abs/2209.15637) | [***``[code]``***](https://github.com/vivianszf/geod)\n    > 3D-aware image synthesis aims at learning a generative model that can render photo-realistic 2D images while capturing decent underlying 3D shapes. 
A popular solution is to adopt the generative adversarial network (GAN) and replace the generator with a 3D renderer, where volume rendering with neural radiance field (NeRF) is commonly used. Despite the advancement of synthesis quality, existing methods fail to obtain moderate 3D shapes. We argue that, considering the two-player game in the formulation of GANs, only making the generator 3D-aware is not enough. In other words, displacing the generative mechanism only offers the capability, but not the guarantee, of producing 3D-aware images, because the supervision of the generator primarily comes from the discriminator. To address this issue, we propose GeoD through learning a geometry-aware discriminator to improve 3D-aware GANs. Concretely, besides differentiating real and fake samples from the 2D image space, the discriminator is additionally asked to derive the geometry information from the inputs, which is then applied as the guidance of the generator. Such a simple yet effective design facilitates learning substantially more accurate 3D shapes. Extensive experiments on various generator architectures and training datasets verify the superiority of GeoD over state-of-the-art alternatives. Moreover, our approach is registered as a general framework such that a more capable discriminator (i.e., with a third task of novel view synthesis beyond domain classification and geometry extraction) can further assist the generator with a better multi-view consistency.\n  - [MonoNeuralFusion: Online Monocular Neural 3D Reconstruction with Geometric Priors](https://arxiv.org/abs/2209.15153) | [code]\n    > High-fidelity 3D scene reconstruction from monocular videos continues to be challenging, especially for complete and fine-grained geometry reconstruction. 
The previous 3D reconstruction approaches with neural implicit representations have shown a promising ability for complete scene reconstruction, while their results are often over-smooth and lack enough geometric details. This paper introduces a novel neural implicit scene representation with volume rendering for high-fidelity online 3D scene reconstruction from monocular videos. For fine-grained reconstruction, our key insight is to incorporate geometric priors into both the neural implicit scene representation and neural volume rendering, thus leading to an effective geometry learning mechanism based on volume rendering optimization. Benefiting from this, we present MonoNeuralFusion to perform the online neural 3D reconstruction from monocular videos, by which the 3D scene geometry is efficiently generated and optimized during the on-the-fly 3D monocular scanning. The extensive comparisons with state-of-the-art approaches show that our MonoNeuralFusion consistently generates much better complete and fine-grained reconstruction results, both quantitatively and qualitatively.\n  - [SymmNeRF: Learning to Explore Symmetry Prior for Single-View View Synthesis, ACCV2022](https://arxiv.org/abs/2209.14819) | [***``[code]``***](https://github.com/xingyi-li/SymmNeRF)\n    > We study the problem of novel view synthesis of objects from a single image. Existing methods have demonstrated the potential in single-view view synthesis. However, they still fail to recover the fine appearance details, especially in self-occluded areas. This is because a single view only provides limited information. We observe that manmade objects usually exhibit symmetric appearances, which introduce additional prior knowledge. Motivated by this, we investigate the potential performance gains of explicitly embedding symmetry into the scene representation. 
In this paper, we propose SymmNeRF, a neural radiance field (NeRF) based framework that combines local and global conditioning under the introduction of symmetry priors. In particular, SymmNeRF takes the pixel-aligned image features and the corresponding symmetric features as extra inputs to the NeRF, whose parameters are generated by a hypernetwork. As the parameters are conditioned on the image-encoded latent codes, SymmNeRF is thus scene-independent and can generalize to new scenes. Experiments on synthetic and real-world datasets show that SymmNeRF synthesizes novel views with more details regardless of the pose transformation, and demonstrates good generalization when applied to unseen objects. Code is available at: this https URL.\n  - [360FusionNeRF: Panoramic Neural Radiance Fields with Joint Guidance](https://arxiv.org/abs/2209.14265) | [code]\n    > We present a method to synthesize novel views from a single 360∘ panorama image based on the neural radiance field (NeRF). Prior studies in a similar setting rely on the neighborhood interpolation capability of multi-layer perceptrons to complete missing regions caused by occlusion, which leads to artifacts in their predictions. We propose 360FusionNeRF, a semi-supervised learning framework where we introduce geometric supervision and semantic consistency to guide the progressive training process. Firstly, the input image is re-projected to 360∘ images, and auxiliary depth maps are extracted at other camera positions. The depth supervision, in addition to the NeRF color guidance, improves the geometry of the synthesized views. Additionally, we introduce a semantic consistency loss that encourages realistic renderings of novel views. We extract these semantic features using a pre-trained visual encoder such as CLIP, a Vision Transformer trained on hundreds of millions of diverse 2D photographs mined from the web with natural language supervision. 
Experiments indicate that our proposed method can produce plausible completions of unobserved regions while preserving the features of the scene. When trained across various scenes, 360FusionNeRF consistently achieves the state-of-the-art performance when transferring to synthetic Structured3D dataset (PSNR~5%, SSIM~3% LPIPS~13%), real-world Matterport3D dataset (PSNR~3%, SSIM~3% LPIPS~9%) and Replica360 dataset (PSNR~8%, SSIM~2% LPIPS~18%).\n## Sep18 - Sep24, 2022\n  - [PNeRF: Probabilistic Neural Scene Representations for Uncertain 3D Visual Mapping, ICRA2023](https://arxiv.org/abs/2209.11677) | [code]\n    > Recently neural scene representations have provided very impressive results for representing 3D scenes visually, however, their study and progress have mainly been limited to visualization of virtual models in computer graphics or scene reconstruction in computer vision without explicitly accounting for sensor and pose uncertainty. Using this novel scene representation in robotics applications, however, would require accounting for this uncertainty in the neural map. The aim of this paper is therefore to propose a novel method for training {\\em probabilistic neural scene representations} with uncertain training data that could enable the inclusion of these representations in robotics applications. Acquiring images using cameras or depth sensors contains inherent uncertainty, and furthermore, the camera poses used for learning a 3D model are also imperfect. If these measurements are used for training without accounting for their uncertainty, then the resulting models are non-optimal, and the resulting scene representations are likely to contain artifacts such as blur and un-even geometry. In this work, the problem of uncertainty integration to the learning process is investigated by focusing on training with uncertain information in a probabilistic manner. 
The proposed method involves explicitly augmenting the training likelihood with an uncertainty term such that the learnt probability distribution of the network is minimized with respect to the training uncertainty. It will be shown that this leads to more accurate image rendering quality, in addition to more precise and consistent geometry. Validation has been carried out on both synthetic and real datasets showing that the proposed approach outperforms state-of-the-art methods. The results show notably that the proposed method is capable of rendering novel high-quality views even when the training data is limited.\n  - [ActiveNeRF: Learning where to See with Uncertainty Estimation](https://arxiv.org/abs/2209.08546) | [***``[code]``***](https://github.com/LeapLabTHU/ActiveNeRF)\n    > Recently, Neural Radiance Fields (NeRF) has shown promising performances on reconstructing 3D scenes and synthesizing novel views from a sparse set of 2D images. Albeit effective, the performance of NeRF is highly influenced by the quality of training samples. With limited posed images from the scene, NeRF fails to generalize well to novel views and may collapse to trivial solutions in unobserved regions. This makes NeRF impractical under resource-constrained scenarios. In this paper, we present a novel learning framework, ActiveNeRF, aiming to model a 3D scene with a constrained input budget. Specifically, we first incorporate uncertainty estimation into a NeRF model, which ensures robustness under few observations and provides an interpretation of how NeRF understands the scene. On this basis, we propose to supplement the existing training set with newly captured samples based on an active learning scheme. By evaluating the reduction of uncertainty given new inputs, we select the samples that bring the most information gain. In this way, the quality of novel view synthesis can be improved with minimal additional resources. 
Extensive experiments validate the performance of our model on both realistic and synthetic scenes, especially with scarcer training data. Code will be released at \\url{this https URL}.\n## Sep11 - Sep17, 2022\n  - [Learning A Unified 3D Point Cloud for View Synthesis](https://arxiv.org/abs/2209.05013) | [code]\n    > 3D point cloud representation-based view synthesis methods have demonstrated effectiveness. However, existing methods usually synthesize novel views only from a single source view, and it is non-trivial to generalize them to handle multiple source views for pursuing higher reconstruction quality. In this paper, we propose a new deep learning-based view synthesis paradigm, which learns a unified 3D point cloud from different source views. Specifically, we first construct sub-point clouds by projecting source views to 3D space based on their depth maps. Then, we learn the unified 3D point cloud by adaptively fusing points at a local neighborhood defined on the union of the sub-point clouds. Besides, we also propose a 3D geometry-guided image restoration module to fill the holes and recover high-frequency details of the rendered novel views. Experimental results on three benchmark datasets demonstrate that our method outperforms state-of-the-art view synthesis methods to a large extent both quantitatively and visually.\n## Sep4 - Sep10, 2022\n## Aug28 - Sep3, 2022\n  - [Dual-Space NeRF: Learning Animatable Avatars and Scene Lighting in Separate Spaces, 3DV2022](https://arxiv.org/abs/2208.14851) | [code]\n    > Modeling the human body in a canonical space is a common practice for capturing and animation. But when involving the neural radiance field (NeRF), learning a static NeRF in the canonical space is not enough because the lighting of the body changes when the person moves even though the scene lighting is constant. 
Previous methods alleviate the inconsistency of lighting by learning a per-frame embedding, but this operation does not generalize to unseen poses. Given that the lighting condition is static in the world space while the human body is consistent in the canonical space, we propose a dual-space NeRF that models the scene lighting and the human body with two MLPs in two separate spaces. To bridge these two spaces, previous methods mostly rely on the linear blend skinning (LBS) algorithm. However, the blending weights for LBS of a dynamic neural field are intractable and thus are usually memorized with another MLP, which does not generalize to novel poses. Although it is possible to borrow the blending weights of a parametric mesh such as SMPL, the interpolation operation introduces more artifacts. In this paper, we propose to use the barycentric mapping, which can directly generalize to unseen poses and surprisingly achieves superior results than LBS with neural blending weights. Quantitative and qualitative results on the Human3.6M and the ZJU-MoCap datasets show the effectiveness of our method.\n## Aug21 - Aug27, 2022\n  - [DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation](https://dreambooth.github.io/) | [code]\n    > Large text-to-image models achieved a remarkable leap in the evolution of AI, enabling high-quality and diverse synthesis of images from a given text prompt. However, these models lack the ability to mimic the appearance of subjects in a given reference set and synthesize novel renditions of them in different contexts. In this work, we present a new approach for \"personalization\" of text-to-image diffusion models (specializing them to users' needs). Given as input just a few images of a subject, we fine-tune a pretrained text-to-image model (Imagen, although our method is not limited to a specific model) such that it learns to bind a unique identifier with that specific subject. 
Once the subject is embedded in the output domain of the model, the unique identifier can then be used to synthesize fully-novel photorealistic images of the subject contextualized in different scenes. By leveraging the semantic prior embedded in the model with a new autogenous class-specific prior preservation loss, our technique enables synthesizing the subject in diverse scenes, poses, views, and lighting conditions that do not appear in the reference images. We apply our technique to several previously-unassailable tasks, including subject recontextualization, text-guided view synthesis, appearance modification, and artistic rendering (all while preserving the subject's key features). Project page: this https URL\n  - [E-NeRF: Neural Radiance Fields from a Moving Event Camera](https://arxiv.org/abs/2208.11300) | [code]\n    > Estimating neural radiance fields (NeRFs) from ideal images has been extensively studied in the computer vision community. Most approaches assume optimal illumination and slow camera motion. These assumptions are often violated in robotic applications, where images contain motion blur and the scene may not have suitable illumination. This can cause significant problems for downstream tasks such as navigation, inspection or visualization of the scene. To alleviate these problems we present E-NeRF, the first method which estimates a volumetric scene representation in the form of a NeRF from a fast-moving event camera. Our method can recover NeRFs during very fast motion and in high dynamic range conditions, where frame-based approaches fail. We show that rendering high-quality frames is possible by only providing an event stream as input. Furthermore, by combining events and frames, we can estimate NeRFs of higher quality than state-of-the-art approaches under severe motion blur. 
We also show that combining events and frames can overcome failure cases of NeRF estimation in scenarios where only few input views are available, without requiring additional regularization.\n  - [FurryGAN: High Quality Foreground-aware Image Synthesis, ECCV2022](https://jeongminb.github.io/FurryGAN/) | [***``[code]``***](https://jeongminb.github.io/FurryGAN/)\n    > Foreground-aware image synthesis aims to generate images as well as their foreground masks. A common approach is to formulate an image as a masked blending of a foreground image and a background image. It is a challenging problem because it is prone to reach the trivial solution where either image overwhelms the other, i.e., the masks become completely full or empty, and the foreground and background are not meaningfully separated. We present FurryGAN with three key components: 1) imposing both the foreground image and the composite image to be realistic, 2) designing a mask as a combination of coarse and fine masks, and 3) guiding the generator by an auxiliary mask predictor in the discriminator. Our method produces realistic images with remarkably detailed alpha masks which cover hair, fur, and whiskers in a fully unsupervised manner.\n## Aug14 - Aug20, 2022\n  - [Text-to-Image Generation via Implicit Visual Guidance and Hypernetwork](https://arxiv.org/abs/2208.08493) | [code]\n    > We develop an approach for text-to-image generation that embraces additional retrieval images, driven by a combination of implicit visual guidance loss and generative objectives. Unlike most existing text-to-image generation methods which merely take the text as input, our method dynamically feeds cross-modal search results into a unified training stage, hence improving the quality, controllability and diversity of generation results. We propose a novel hypernetwork modulated visual-text encoding scheme to predict the weight update of the encoding layer, enabling effective transfer from visual information (e.g. 
layout, content) into the corresponding latent domain. Experimental results show that our model guided with additional retrieval visual data outperforms existing GAN-based models. On COCO dataset, we achieve better FID of 9.13 with up to 3.5× fewer generator parameters, compared with the state-of-the-art method.\n  - [UPST-NeRF: Universal Photorealistic Style Transfer of Neural Radiance Fields for 3D Scene](https://arxiv.org/abs/2208.07059) | [***``[code]``***](https://github.com/semchan/UPST-NeRF)\n    > 3D scenes photorealistic stylization aims to generate photorealistic images from arbitrary novel views according to a given style image while ensuring consistency when rendering from different viewpoints. Some existing stylization methods with neural radiance fields can effectively predict stylized scenes by combining the features of the style image with multi-view images to train 3D scenes. However, these methods generate novel view images that contain objectionable artifacts. Besides, they cannot achieve universal photorealistic stylization for a 3D scene. Therefore, a styling image must retrain a 3D scene representation network based on a neural radiation field. We propose a novel 3D scene photorealistic style transfer framework to address these issues. It can realize photorealistic 3D scene style transfer with a 2D style image. We first pre-trained a 2D photorealistic style transfer network, which can meet the photorealistic style transfer between any given content image and style image. Then, we use voxel features to optimize a 3D scene and get the geometric representation of the scene. Finally, we jointly optimize a hyper network to realize the scene photorealistic style transfer of arbitrary style images. In the transfer stage, we use a pre-trained 2D photorealistic network to constrain the photorealistic style of different views and different style images in the 3D scene. 
The experimental results show that our method not only realizes the 3D photorealistic style transfer of arbitrary style images but also outperforms the existing methods in terms of visual quality and consistency. Project page:this https URL.\n## Aug7 - Aug13, 2022\n  - [HRF-Net: Holistic Radiance Fields from Sparse Inputs](https://arxiv.org/abs/2208.04717) | [code]\n    > We present HRF-Net, a novel view synthesis method based on holistic radiance fields that renders novel views using a set of sparse inputs. Recent generalizing view synthesis methods also leverage the radiance fields but the rendering speed is not real-time. There are existing methods that can train and render novel views efficiently but they can not generalize to unseen scenes. Our approach addresses the problem of real-time rendering for generalizing view synthesis and consists of two main stages: a holistic radiance fields predictor and a convolutional-based neural renderer. This architecture infers not only consistent scene geometry based on the implicit neural fields but also renders new views efficiently using a single GPU. We first train HRF-Net on multiple 3D scenes of the DTU dataset and the network can produce plausible novel views on unseen real and synthetics data using only photometric losses. Moreover, our method can leverage a denser set of reference images of a single scene to produce accurate novel views without relying on additional explicit representations and still maintains the high-speed rendering of the pre-trained model. Experimental results show that HRF-Net outperforms state-of-the-art generalizable neural rendering methods on various synthetic and real datasets.\n## Jul31 - Aug6, 2022\n  - [NeSF: Neural Semantic Fields for Generalizable Semantic Segmentation of 3D Scenes](https://research.google/pubs/pub51563/) | [code]\n    > We present NeSF, a method for producing 3D semantic fields from pre-trained density fields and sparse 2D semantic supervision. 
Our method side-steps traditional scene representations by leveraging neural representations where 3D information is stored within neural fields. In spite of being supervised by 2D signals alone, our method is able to generate 3D-consistent semantic maps from novel camera poses and can be queried at arbitrary 3D points. Notably, NeSF is compatible with any method producing a density field, and its accuracy improves as the quality of the pre-trained density fields improve. Our empirical analysis demonstrates comparable quality to competitive 2D and 3D semantic segmentation baselines on convincing synthetic scenes while also offering features unavailable to existing methods.\n  - [Transformers as Meta-Learners for Implicit Neural Representations, ECCV2022](https://arxiv.org/abs/2208.02801) | [***``[code]``***](https://yinboc.github.io/trans-inr/)\n    > Implicit Neural Representations (INRs) have emerged and shown their benefits over discrete representations in recent years. However, fitting an INR to the given observations usually requires optimization with gradient descent from scratch, which is inefficient and does not generalize well with sparse observations. To address this problem, most of the prior works train a hypernetwork that generates a single vector to modulate the INR weights, where the single vector becomes an information bottleneck that limits the reconstruction precision of the output INR. Recent work shows that the whole set of weights in INR can be precisely inferred without the single-vector bottleneck by gradient-based meta-learning. Motivated by a generalized formulation of gradient-based meta-learning, we propose a formulation that uses Transformers as hypernetworks for INRs, where it can directly build the whole set of INR weights with Transformers specialized as set-to-set mapping. We demonstrate the effectiveness of our method for building INRs in different tasks and domains, including 2D image regression and view synthesis for 3D objects. 
Our work draws connections between the Transformer hypernetworks and gradient-based meta-learning algorithms and we provide further analysis for understanding the generated INRs.\n  - [VolTeMorph: Realtime, Controllable and Generalisable Animation of Volumetric Representations](https://arxiv.org/pdf/2208.00949) | [code]\n    > The recent increase in popularity of volumetric representations for scene reconstruction and novel view synthesis has put renewed focus on animating volumetric content at high visual quality and in real-time. While implicit deformation methods based on learned functions can produce impressive results, they are `black boxes' to artists and content creators, they require large amounts of training data to generalise meaningfully, and they do not produce realistic extrapolations outside the training data. In this work we solve these issues by introducing a volume deformation method which is real-time, easy to edit with off-the-shelf software and can extrapolate convincingly. To demonstrate the versatility of our method, we apply it in two scenarios: physics-based object deformation and telepresence where avatars are controlled using blendshapes. We also perform thorough experiments showing that our method compares favourably to both volumetric approaches combined with implicit deformation and methods based on mesh deformation.\n## Jul24 - Jul30, 2022\n  - [ZEPI-Net: Light Field Super Resolution via Internal Cross-Scale Epipolar Plane Image Zero-Shot Learning, Neural Processing Letters (2022)](https://link.springer.com/article/10.1007/s11063-022-10955-x) | [code]\n    > Many applications of light field (LF) imaging have been limited by the spatial-angular resolution problem, hence the need for efficient super-resolution techniques. Recently, learning-based solutions have achieved remarkably better performances than traditional super-resolution (SR) techniques. 
Unfortunately, the learning or training process relies heavily on the training dataset, which could be limited for most LF imaging applications. In this paper, we propose a novel LF spatial-angular SR algorithm based on zero-shot learning. We suggest learning cross-scale reusable features in the epipolar plane image (EPI) space, and avoiding explicitly modeling scene priors or implicitly learning that from a large number of LFs. Most importantly, without using any external LFs, the proposed algorithm can simultaneously super-resolve a LF in both spatial and angular domains. Moreover, the proposed solution is free of depth or disparity estimation, which is usually employed by existing LF spatial and angular SR. By using a simple 8-layers fully convolutional network, we show that the proposed algorithm can generate comparable results to the state-of-the-art spatial SR. Our algorithm outperforms the existing methods in terms of angular SR on multiple groups of public LF datasets. The experiment results indicate that the cross-scale features can be well learned and be reused for LF SR in the EPI space.\n  - [ObjectFusion: Accurate object-level SLAM with neural object priors, Graphical Models, Volume 123, September 2022](https://www.sciencedirect.com/science/article/pii/S1524070322000418) | [code]\n    > Previous object-level Simultaneous Localization and Mapping (SLAM) approaches still fail to create high quality object-oriented 3D map in an efficient way. The main challenges come from how to represent the object shape effectively and how to apply such object representation to accurate online camera tracking efficiently. In this paper, we provide ObjectFusion as a novel object-level SLAM in static scenes which efficiently creates object-oriented 3D map with high-quality object reconstruction, by leveraging neural object priors. 
We propose a neural object representation with only a single encoder–decoder network to effectively express the object shape across various categories, which benefits high quality reconstruction of object instance. More importantly, we propose to convert such neural object representation as precise measurements to jointly optimize the object shape, object pose and camera pose for the final accurate 3D object reconstruction. With extensive evaluations on synthetic and real-world RGB-D datasets, we show that our ObjectFusion outperforms previous approaches, with better object reconstruction quality, using much less memory footprint, and in a more efficient way, especially at the object level.\n  - [End-to-end View Synthesis via NeRF Attention](https://arxiv.org/abs/2207.14741) | [code]\n    > In this paper, we present a simple seq2seq formulation for view synthesis where we take a set of ray points as input and output colors corresponding to the rays. Directly applying a standard transformer on this seq2seq formulation has two limitations. First, the standard attention cannot successfully fit the volumetric rendering procedure, and therefore high-frequency components are missing in the synthesized views. Second, applying global attention to all rays and pixels is extremely inefficient. Inspired by the neural radiance field (NeRF), we propose the NeRF attention (NeRFA) to address the above problems. On the one hand, NeRFA considers the volumetric rendering equation as a soft feature modulation procedure. In this way, the feature modulation enhances the transformers with the NeRF-like inductive bias. On the other hand, NeRFA performs multi-stage attention to reduce the computational overhead. Furthermore, the NeRFA model adopts the ray and pixel transformers to learn the interactions between rays and pixels. NeRFA demonstrates superior performance over NeRF and NerFormer on four datasets: DeepVoxels, Blender, LLFF, and CO3D. 
Besides, NeRFA establishes a new state-of-the-art under two settings: the single-scene view synthesis and the category-centric novel view synthesis. The code will be made publicly available.\n## Previous weeks\n  - [CLA-NeRF: Category-Level Articulated Neural Radiance Field, ICRA2022](https://arxiv.org/abs/2202.00181) | [code]\n    > We propose CLA-NeRF -- a Category-Level Articulated Neural Radiance Field that can perform view synthesis, part segmentation, and articulated pose estimation. CLA-NeRF is trained at the object category level using no CAD models and no depth, but a set of RGB images with ground truth camera poses and part segments. During inference, it only takes a few RGB views (i.e., few-shot) of an unseen 3D object instance within the known category to infer the object part segmentation and the neural radiance field. Given an articulated pose as input, CLA-NeRF can perform articulation-aware volume rendering to generate the corresponding RGB image at any camera pose. Moreover, the articulated pose of an object can be estimated via inverse rendering. In our experiments, we evaluate the framework across five categories on both synthetic and real-world data. In all cases, our method shows realistic deformation results and accurate articulated pose estimation. We believe that both few-shot articulated object rendering and articulated pose estimation open doors for robots to perceive and interact with unseen articulated objects.\n  - [GRAF: Generative Radiance Fields for 3D-Aware Image Synthesis, NeurIPS2020](https://avg.is.mpg.de/publications/schwarz2020NeurIPS) | [***``[code]``***](https://github.com/autonomousvision/graf)\n    > While 2D generative adversarial networks have enabled high-resolution image synthesis, they largely lack an understanding of the 3D world and the image formation process. Thus, they do not provide precise control over camera viewpoint or object pose. 
To address this problem, several recent approaches leverage intermediate voxel-based representations in combination with differentiable rendering. However, existing methods either produce low image resolution or fall short in disentangling camera and scene properties, eg, the object identity may vary with the viewpoint. In this paper, we propose a generative model for radiance fields which have recently proven successful for novel view synthesis of a single scene. In contrast to voxel-based representations, radiance fields are not confined to a coarse discretization of the 3D space, yet allow for disentangling camera and scene properties while degrading gracefully in the presence of reconstruction ambiguity. By introducing a multi-scale patch-based discriminator, we demonstrate synthesis of high-resolution images while training our model from unposed 2D images alone. We systematically analyze our approach on several challenging synthetic and real-world datasets. Our experiments reveal that radiance fields are a powerful representation for generative image synthesis, leading to 3D consistent models that render with high fidelity.\n  - [GRF: Learning a General Radiance Field for 3D Scene Representation and Rendering, ICCV2021(oral)](https://arxiv.org/abs/2010.04595) | [***``[code]``***](https://github.com/alextrevithick/GRF)\n    > We present a simple yet powerful neural network that implicitly represents and renders 3D objects and scenes only from 2D observations. The network models 3D geometries as a general radiance field, which takes a set of 2D images with camera poses and intrinsics as input, constructs an internal representation for each point of the 3D space, and then renders the corresponding appearance and geometry of that point viewed from an arbitrary position. The key to our approach is to learn local features for each pixel in 2D images and to then project these features to 3D points, thus yielding general and rich point representations. 
We additionally integrate an attention mechanism to aggregate pixel features from multiple 2D views, such that visual occlusions are implicitly taken into account. Extensive experiments demonstrate that our method can generate high-quality and realistic novel views for novel objects, unseen categories and challenging real-world scenes.\n  - [pixelNeRF: Neural Radiance Fields from One or Few Images, CVPR2021](https://arxiv.org/abs/2012.02190) | [***``[code]``***](https://github.com/sxyu/pixel-nerf)\n    > We propose pixelNeRF, a learning framework that predicts a continuous neural scene representation conditioned on one or few input images. The existing approach for constructing neural radiance fields involves optimizing the representation to every scene independently, requiring many calibrated views and significant compute time. We take a step towards resolving these shortcomings by introducing an architecture that conditions a NeRF on image inputs in a fully convolutional manner. This allows the network to be trained across multiple scenes to learn a scene prior, enabling it to perform novel view synthesis in a feed-forward manner from a sparse set of views (as few as one). Leveraging the volume rendering approach of NeRF, our model can be trained directly from images with no explicit 3D supervision. We conduct extensive experiments on ShapeNet benchmarks for single image novel view synthesis tasks with held-out objects as well as entire unseen categories. We further demonstrate the flexibility of pixelNeRF by demonstrating it on multi-object ShapeNet scenes and real scenes from the DTU dataset. In all cases, pixelNeRF outperforms current state-of-the-art baselines for novel view synthesis and single image 3D reconstruction. 
For the video and code, please visit the project website: this https URL\n  - [Learned Initializations for Optimizing Coordinate-Based Neural Representations, CVPR2021](https://www.matthewtancik.com/learnit) | [***``[code]``***](https://github.com/tancik/learnit)\n    > Coordinate-based neural representations have shown significant promise as an alternative to discrete, array-based representations for complex low dimensional signals. However, optimizing a coordinate-based network from randomly initialized weights for each new signal is inefficient. We propose applying standard meta-learning algorithms to learn the initial weight parameters for these fully-connected networks based on the underlying class of signals being represented (e.g., images of faces or 3D models of chairs). Despite requiring only a minor change in implementation, using these learned initial weights enables faster convergence during optimization and can serve as a strong prior over the signal class being modeled, resulting in better generalization when only partial observations of a given signal are available. We explore these benefits across a variety of tasks, including representing 2D images, reconstructing CT scans, and recovering 3D shapes and scenes from 2D image observations.\n  - [pi-GAN: Periodic Implicit Generative Adversarial Networks for 3D-Aware Image Synthesis, CVPR2021(oral)](https://marcoamonteiro.github.io/pi-GAN-website/) | [***``[code]``***](https://github.com/marcoamonteiro/pi-GAN)\n    > We have witnessed rapid progress on 3D-aware image synthesis, leveraging recent advances in generative visual models and neural rendering. Existing approaches however fall short in two ways: first, they may lack an underlying 3D representation or rely on view-inconsistent rendering, hence synthesizing images that are not multi-view consistent; second, they often depend upon representation network architectures that are not expressive enough, and their results thus lack in image quality. 
We propose a novel generative model, named Periodic Implicit Generative Adversarial Networks (π-GAN or pi-GAN), for high-quality 3D-aware image synthesis. π-GAN leverages neural representations with periodic activation functions and volumetric rendering to represent scenes as view-consistent 3D representations with fine detail. The proposed approach obtains state-of-the-art results for 3D-aware image synthesis with multiple real and synthetic datasets.\n  - [Portrait Neural Radiance Fields from a Single Image](https://portrait-nerf.github.io/) | [code]\n    > We present a method for estimating Neural Radiance Fields (NeRF) from a single headshot portrait. While NeRF has demonstrated high-quality view synthesis, it requires multiple images of static scenes and thus impractical for casual captures and moving subjects. In this work, we propose to pretrain the weights of a multilayer perceptron (MLP), which implicitly models the volumetric density and colors, with a meta-learning framework using a light stage portrait dataset. To improve the generalization to unseen faces, we train the MLP in the canonical coordinate space approximated by 3D face morphable models. We quantitatively evaluate the method using controlled captures and demonstrate the generalization to real portrait images, showing favorable results against state-of-the-arts.\n  - [CAMPARI: Camera-Aware Decomposed Generative Neural Radiance Fields](https://arxiv.org/pdf/2103.17269.pdf) | [code]\n    > Tremendous progress in deep generative models has led to photorealistic image synthesis. While achieving compelling results, most approaches operate in the two-dimensional image domain, ignoring the three-dimensional nature of our world. Several recent works therefore propose generative models which are 3D-aware, i.e., scenes are modeled in 3D and then rendered differentiably to the image plane. 
This leads to impressive 3D consistency, but incorporating such a bias comes at a price: the camera needs to be modeled as well. Current approaches assume fixed intrinsics and a predefined prior over camera pose ranges. As a result, parameter tuning is typically required for real-world data, and results degrade if the data distribution is not matched. Our key hypothesis is that learning a camera generator jointly with the image generator leads to a more principled approach to 3D-aware image synthesis. Further, we propose to decompose the scene into a background and foreground model, leading to more efficient and disentangled scene representations. While training from raw, unposed image collections, we learn a 3D- and camera-aware generative model which faithfully recovers not only the image but also the camera data distribution. At test time, our model generates images with explicit control over the camera as well as the shape and appearance of the scene.\n  - [NeRF-VAE: A Geometry Aware 3D Scene Generative Model](https://arxiv.org/abs/2104.00587) | [code]\n    > We propose NeRF-VAE, a 3D scene generative model that incorporates geometric structure via NeRF and differentiable volume rendering. In contrast to NeRF, our model takes into account shared structure across scenes, and is able to infer the structure of a novel scene -- without the need to re-train -- using amortized inference. NeRF-VAE's explicit 3D rendering process further contrasts previous generative models with convolution-based rendering which lacks geometric structure. Our model is a VAE that learns a distribution over radiance fields by conditioning them on a latent scene representation. We show that, once trained, NeRF-VAE is able to infer and render geometrically-consistent scenes from previously unseen 3D environments using very few input images. We further demonstrate that NeRF-VAE generalizes well to out-of-distribution cameras, while convolutional models do not. 
Finally, we introduce and study an attention-based conditioning mechanism of NeRF-VAE's decoder, which improves model performance.\n  - [Unconstrained Scene Generation with Locally Conditioned Radiance Fields, ICCV2021](https://apple.github.io/ml-gsn/) | [***``[code]``***](https://github.com/apple/ml-gsn)\n    > We follow an adversarial learning framework, where the generator models scenes via their radiance field, and the discriminator attempts to distinguish between images rendered from those radiance fields and images of real scenes. Conceptually, our model decomposes the radiance field of a scene into many small local radiance fields that result from conditioning on a 2D grid of latent codes W. W can be interpreted as a latent floorplan representing the scene.\n  - [MVSNeRF: Fast Generalizable Radiance Field Reconstruction from Multi-View Stereo, ICCV2021](https://apchenstu.github.io/mvsnerf/) | [***``[code]``***](https://github.com/apchenstu/mvsnerf)\n    > We present MVSNeRF, a novel neural rendering approach that can efficiently reconstruct neural radiance fields for view synthesis. Unlike prior works on neural radiance fields that consider per-scene optimization on densely captured images, we propose a generic deep neural network that can reconstruct radiance fields from only three nearby input views via fast network inference. Our approach leverages plane-swept cost volumes (widely used in multi-view stereo) for geometry-aware scene reasoning, and combines this with physically based volume rendering for neural radiance field reconstruction. We train our network on real objects in the DTU dataset, and test it on three different datasets to evaluate its effectiveness and generalizability. 
Our approach can generalize across scenes (even indoor scenes, completely different from our training scenes of objects) and generate realistic view synthesis results using only three input images, significantly outperforming concurrent works on generalizable radiance field reconstruction. Moreover, if dense images are captured, our estimated radiance field representation can be easily fine-tuned; this leads to fast per-scene reconstruction with higher rendering quality and substantially less optimization time than NeRF.\n  - [Stereo Radiance Fields (SRF): Learning View Synthesis from Sparse Views of Novel Scenes, CVPR2021](https://arxiv.org/abs/2104.06935) | [***``[code]``***](https://virtualhumans.mpi-inf.mpg.de/srf/)\n    > Recent neural view synthesis methods have achieved impressive quality and realism, surpassing classical pipelines which rely on multi-view reconstruction. State-of-the-Art methods, such as NeRF, are designed to learn a single scene with a neural network and require dense multi-view inputs. Testing on a new scene requires re-training from scratch, which takes 2-3 days. In this work, we introduce Stereo Radiance Fields (SRF), a neural view synthesis approach that is trained end-to-end, generalizes to new scenes, and requires only sparse views at test time. The core idea is a neural architecture inspired by classical multi-view stereo methods, which estimates surface points by finding similar image regions in stereo images. In SRF, we predict color and density for each 3D point given an encoding of its stereo correspondence in the input images. The encoding is implicitly learned by an ensemble of pair-wise similarities -- emulating classical stereo. Experiments show that SRF learns structure instead of overfitting on a scene. We train on multiple scenes of the DTU dataset and generalize to new ones without re-training, requiring only 10 sparse and spread-out views as input. 
We show that 10-15 minutes of fine-tuning further improve the results, achieving significantly sharper, more detailed results than scene-specific models. The code, model, and videos are available at this https URL.\n  - [Neural Rays for Occlusion-aware Image-based Rendering, CVPR2022](https://liuyuan-pal.github.io/NeuRay/) | [***``[code]``***](https://github.com/liuyuan-pal/NeuRay)\n    > We present a new neural representation, called Neural Ray (NeuRay), for the novel view synthesis task. Recent works construct radiance fields from image features of input views to render novel view images, which enables the generalization to new scenes. However, due to occlusions, a 3D point may be invisible to some input views. On such a 3D point, these generalization methods will include inconsistent image features from invisible views, which interfere with the radiance field construction. To solve this problem, we predict the visibility of 3D points to input views within our NeuRay representation. This visibility enables the radiance field construction to focus on visible image features, which significantly improves its rendering quality. Meanwhile, a novel consistency loss is proposed to refine the visibility in NeuRay when finetuning on a specific scene. Experiments demonstrate that our approach achieves state-of-the-art performance on the novel view synthesis task when generalizing to unseen scenes and outperforms per-scene optimization methods after finetuning.\n  - [Putting NeRF on a Diet: Semantically Consistent Few-Shot View Synthesis, ICCV2021](https://www.ajayj.com/dietnerf) | [***``[code]``***](https://github.com/ajayjain/DietNeRF)\n    > We present DietNeRF, a 3D neural scene representation estimated from a few images. Neural Radiance Fields (NeRF) learn a continuous volumetric representation of a scene through multi-view consistency, and can be rendered from novel viewpoints by ray casting. 
While NeRF has an impressive ability to reconstruct geometry and fine details given many images, up to 100 for challenging 360° scenes, it often finds a degenerate solution to its image reconstruction objective when only a few input views are available. To improve few-shot quality, we propose DietNeRF. We introduce an auxiliary semantic consistency loss that encourages realistic renderings at novel poses. DietNeRF is trained on individual scenes to (1) correctly render given input views from the same pose, and (2) match high-level semantic attributes across different, random poses. Our semantic loss allows us to supervise DietNeRF from arbitrary poses. We extract these semantics using a pre-trained visual encoder such as CLIP, a Vision Transformer trained on hundreds of millions of diverse single-view, 2D photographs mined from the web with natural language supervision. In experiments, DietNeRF improves the perceptual quality of few-shot view synthesis when learned from scratch, can render novel views with as few as one observed image when pre-trained on a multi-view dataset, and produces plausible completions of completely unobserved regions.\n  - [CodeNeRF: Disentangled Neural Radiance Fields for Object Categories, ICCV2021(oral)](https://www.google.com/url?q=https%3A%2F%2Farxiv.org%2Fpdf%2F2109.01750.pdf&sa=D&sntz=1&usg=AOvVaw1Fnir0e4aRa22Nt0HoXDWh) | [***``[code]``***](https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fwbjang%2Fcode-nerf&sa=D&sntz=1&usg=AOvVaw2eD5ZoRbk2aWFuwUSHlh5_)\n    > CodeNeRF is an implicit 3D neural representation that learns the variation of object shapes and textures across a category and can be trained, from a set of posed images, to synthesize novel views of unseen objects. Unlike the original NeRF, which is scene specific, CodeNeRF learns to disentangle shape and texture by learning separate embeddings. 
At test time, given a single unposed image of an unseen object, CodeNeRF jointly estimates camera viewpoint, and shape and appearance codes via optimization. Unseen objects can be reconstructed from a single image, and then rendered from new viewpoints or their shape and texture edited by varying the latent codes. We conduct experiments on the SRN benchmark, which show that CodeNeRF generalises well to unseen objects and achieves on-par performance with methods that require known camera pose at test time. Our results on real-world images demonstrate that CodeNeRF can bridge the sim-to-real gap. \n  - [StyleNeRF: A Style-based 3D-Aware Generator for High-resolution Image Synthesis, ICLR2022](https://jiataogu.me/style_nerf/) | [***``[code]``***](https://github.com/facebookresearch/StyleNeRF)\n    > We propose StyleNeRF, a 3D-aware generative model for photo-realistic high-resolution image synthesis with high multi-view consistency, which can be trained on unstructured 2D images. Existing approaches either cannot synthesize high-resolution images with fine details or yield noticeable 3D-inconsistent artifacts. In addition, many of them lack control over style attributes and explicit 3D camera poses. StyleNeRF integrates the neural radiance field (NeRF) into a style-based generator to tackle the aforementioned challenges, i.e., improving rendering efficiency and 3D consistency for high-resolution image generation. We perform volume rendering only to produce a low-resolution feature map and progressively apply upsampling in 2D to address the first issue. To mitigate the inconsistencies caused by 2D upsampling, we propose multiple designs, including a better upsampler and a new regularization loss. With these designs, StyleNeRF can synthesize high-resolution images at interactive rates while preserving 3D consistency at high quality. StyleNeRF also enables control of camera poses and different levels of styles, which can generalize to unseen views. 
It also supports challenging tasks, including zoom-in and-out, style mixing, inversion, and semantic editing.\n  - [GNeRF: GAN-based Neural Radiance Field without Posed Camera, ICCV2021(oral)](https://arxiv.org/abs/2103.15606) | [code]\n    > We introduce GNeRF, a framework to marry Generative Adversarial Networks (GAN) with Neural Radiance Field (NeRF) reconstruction for the complex scenarios with unknown and even randomly initialized camera poses. Recent NeRF-based advances have gained popularity for remarkable realistic novel view synthesis. However, most of them heavily rely on accurate camera poses estimation, while few recent methods can only optimize the unknown camera poses in roughly forward-facing scenes with relatively short camera trajectories and require rough camera poses initialization. Differently, our GNeRF only utilizes randomly initialized poses for complex outside-in scenarios. We propose a novel two-phases end-to-end framework. The first phase takes the use of GANs into the new realm for optimizing coarse camera poses and radiance fields jointly, while the second phase refines them with additional photometric loss. We overcome local minima using a hybrid and iterative optimization scheme. Extensive experiments on a variety of synthetic and natural scenes demonstrate the effectiveness of GNeRF. More impressively, our approach outperforms the baselines favorably in those scenes with repeated patterns or even low textures that are regarded as extremely challenging before.\n  - [NeRD: Neural Reflectance Decomposition from Image Collections, ICCV2021](https://markboss.me/publication/2021-nerd/#:~:text=NeRD%20is%20a%20novel%20method,can%20turn%20around%20the%20object.) | [***``[code]``***](https://github.com/cgtuebingen/NeRD-Neural-Reflectance-Decomposition)\n    > Decomposing a scene into its shape, reflectance, and illumination is a challenging but important problem in computer vision and graphics. 
This problem is inherently more challenging when the illumination is not a single light source under laboratory conditions but is instead an unconstrained environmental illumination. Though recent work has shown that implicit representations can be used to model the radiance field of an object, most of these techniques only enable view synthesis and not relighting. Additionally, evaluating these radiance fields is resource and time-intensive. We propose a neural reflectance decomposition (NeRD) technique that uses physically-based rendering to decompose the scene into spatially varying BRDF material properties. In contrast to existing techniques, our input images can be captured under different illumination conditions. In addition, we also propose techniques to convert the learned reflectance volume into a relightable textured mesh enabling fast real-time rendering with novel illuminations. We demonstrate the potential of the proposed approach with experiments on both synthetic and real datasets, where we are able to obtain high-quality relightable 3D assets from image collections.\n  - [NeRF++: Analyzing and Improving Neural Radiance Fields](https://arxiv.org/abs/2010.07492) | [***``[code]``***](https://github.com/Kai-46/nerfplusplus)\n    > Neural Radiance Fields (NeRF) achieve impressive view synthesis results for a variety of capture settings, including 360 capture of bounded scenes and forward-facing capture of bounded and unbounded scenes. NeRF fits multi-layer perceptrons (MLPs) representing view-invariant opacity and view-dependent color volumes to a set of training images, and samples novel views based on volume rendering techniques. In this technical report, we first remark on radiance fields and their potential ambiguities, namely the shape-radiance ambiguity, and analyze NeRF's success in avoiding such ambiguities. Second, we address a parametrization issue involved in applying NeRF to 360 captures of objects within large-scale, unbounded 3D scenes. 
Our method improves view synthesis fidelity in this challenging scenario. Code is available at this https URL.\n  - [GIRAFFE: Representing Scenes as Compositional Generative Neural Feature Fields, CVPR2021(oral)](https://arxiv.org/abs/2011.12100) | [***``[code]``***](https://github.com/autonomousvision/giraffe)\n    > Deep generative models allow for photorealistic image synthesis at high resolutions. But for many applications, this is not enough: content creation also needs to be controllable. While several recent works investigate how to disentangle underlying factors of variation in the data, most of them operate in 2D and hence ignore that our world is three-dimensional. Further, only few works consider the compositional nature of scenes. Our key hypothesis is that incorporating a compositional 3D scene representation into the generative model leads to more controllable image synthesis. Representing scenes as compositional generative neural feature fields allows us to disentangle one or multiple objects from the background as well as individual objects' shapes and appearances while learning from unstructured and unposed image collections without any additional supervision. Combining this scene representation with a neural rendering pipeline yields a fast and realistic image synthesis model. As evidenced by our experiments, our model is able to disentangle individual objects and allows for translating and rotating them in the scene as well as changing the camera pose.\n  - [FiG-NeRF: Figure Ground Neural Radiance Fields for 3D Object Category Modelling, 3DV2021](https://fig-nerf.github.io/) | [code]\n    > We investigate the use of Neural Radiance Fields (NeRF) to learn high quality 3D object category models from collections of input images. In contrast to previous work, we are able to do this whilst simultaneously separating foreground objects from their varying backgrounds. 
We achieve this via a 2-component NeRF model, FiG-NeRF, that prefers explanation of the scene as a geometrically constant background and a deformable foreground that represents the object category. We show that this method can learn accurate 3D object category models using only photometric supervision and casually captured images of the objects. Additionally, our 2-part decomposition allows the model to perform accurate and crisp amodal segmentation. We quantitatively evaluate our method with view synthesis and image fidelity metrics, using synthetic, lab-captured, and in-the-wild data. Our results demonstrate convincing 3D object category modelling that exceed the performance of existing methods.\n  - [NerfingMVS: Guided Optimization of Neural Radiance Fields for Indoor Multi-view Stereo, ICCV2021(oral)](https://arxiv.org/abs/2109.01129) | [***``[code]``***](https://github.com/weiyithu/NerfingMVS)\n    > In this work, we present a new multi-view depth estimation method that utilizes both conventional SfM reconstruction and learning-based priors over the recently proposed neural radiance fields (NeRF). Unlike existing neural network based optimization method that relies on estimated correspondences, our method directly optimizes over implicit volumes, eliminating the challenging step of matching pixels in indoor scenes. The key to our approach is to utilize the learning-based priors to guide the optimization process of NeRF. Our system firstly adapts a monocular depth network over the target scene by finetuning on its sparse SfM reconstruction. Then, we show that the shape-radiance ambiguity of NeRF still exists in indoor environments and propose to address the issue by employing the adapted depth priors to monitor the sampling process of volume rendering. Finally, a per-pixel confidence map acquired by error computation on the rendered image can be used to further improve the depth quality. 
Experiments show that our proposed framework significantly outperforms state-of-the-art methods on indoor scenes, with surprising findings presented on the effectiveness of correspondence-based optimization and NeRF-based optimization over the adapted depth priors. In addition, we show that the guided optimization scheme does not sacrifice the original synthesis capability of neural radiance fields, improving the rendering quality on both seen and novel views.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/human.md",
    "content": "\nWeekly Classified Neural Radiance Fields - human ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n=================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n  - [NeuWigs: A Neural Dynamic Model for Volumetric Hair Capture and Animation](https://arxiv.org/abs/2212.00613) | [code]\n    > The capture and animation of human hair are two of the major challenges in the creation of realistic avatars for the virtual reality. Both problems are highly challenging, because hair has complex geometry and appearance, as well as exhibits challenging motion. In this paper, we present a two-stage approach that models hair independently from the head to address these challenges in a data-driven manner. The first stage, state compression, learns a low-dimensional latent space of 3D hair states containing motion and appearance, via a novel autoencoder-as-a-tracker strategy. To better disentangle the hair and head in appearance learning, we employ multi-view hair segmentation masks in combination with a differentiable volumetric renderer. The second stage learns a novel hair dynamics model that performs temporal hair transfer based on the discovered latent codes. 
To enforce higher stability while driving our dynamics model, we employ the 3D point-cloud autoencoder from the compression stage for de-noising of the hair state. Our model outperforms the state of the art in novel view synthesis and is capable of creating novel hair animations without having to rely on hair observations as a driving signal. Project page is here this https URL.\n  - [NeRFInvertor: High Fidelity NeRF-GAN Inversion for Single-shot Real Image Animation](https://arxiv.org/abs/2211.17235) | [code]\n    > Nerf-based Generative models have shown impressive capacity in generating high-quality images with consistent 3D geometry. Despite successful synthesis of fake identity images randomly sampled from latent space, adopting these models for generating face images of real subjects is still a challenging task due to its so-called inversion issue. In this paper, we propose a universal method to surgically fine-tune these NeRF-GAN models in order to achieve high-fidelity animation of real subjects only by a single image. Given the optimized latent code for an out-of-domain real image, we employ 2D loss functions on the rendered image to reduce the identity gap. Furthermore, our method leverages explicit and implicit 3D regularizations using the in-domain neighborhood samples around the optimized latent code to remove geometrical and visual artifacts. Our experiments confirm the effectiveness of our method in realistic, high-fidelity, and 3D consistent animation of real faces on multiple NeRF-GAN models across different datasets.\n  - [LaplacianFusion: Detailed 3D Clothed-Human Body Reconstruction, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555511) | [code]\n    > We propose LaplacianFusion, a novel approach that reconstructs detailed and controllable 3D clothed-human body shapes from an input depth or 3D point cloud sequence. 
The key idea of our approach is to use Laplacian coordinates, well-known differential coordinates that have been used for mesh editing, for representing the local structures contained in the input scans, instead of implicit 3D functions or vertex displacements used previously. Our approach reconstructs a controllable base mesh using SMPL, and learns a surface function that predicts Laplacian coordinates representing surface details on the base mesh. For a given pose, we first build and subdivide a base mesh, which is a deformed SMPL template, and then estimate Laplacian coordinates for the mesh vertices using the surface function. The final reconstruction for the pose is obtained by integrating the estimated Laplacian coordinates as a whole. Experimental results show that our approach based on Laplacian coordinates successfully reconstructs more visually pleasing shape details than previous methods. The approach also enables various surface detail manipulations, such as detail transfer and enhancement.\n  - [DINER: Depth-aware Image-based NEural Radiance Fields](https://arxiv.org/abs/2211.16630) | [code]\n    > We present Depth-aware Image-based NEural Radiance fields (DINER). Given a sparse set of RGB input views, we predict depth and feature maps to guide the reconstruction of a volumetric scene representation that allows us to render 3D objects under novel views. Specifically, we propose novel techniques to incorporate depth information into feature fusion and efficient scene sampling. In comparison to the previous state of the art, DINER achieves higher synthesis quality and can process input views with greater disparity. This allows us to capture scenes more completely without changing capturing hardware requirements and ultimately enables larger viewpoint changes during novel view synthesis. 
We evaluate our method by synthesizing novel views, both for human heads and for general objects, and observe significantly improved qualitative results and increased perceptual metrics compared to the previous state of the art. The code will be made publicly available for research purposes.\n  - [Reconstructing Hand-Held Objects from Monocular Video, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555401) | [code]\n    > This paper presents an approach that reconstructs a hand-held object from a monocular video. In contrast to many recent methods that directly predict object geometry by a trained network, the proposed approach does not require any learned prior about the object and is able to recover more accurate and detailed object geometry. The key idea is that the hand motion naturally provides multiple views of the object and the motion can be reliably estimated by a hand pose tracker. Then, the object geometry can be recovered by solving a multi-view reconstruction problem. We devise an implicit neural representation-based method to solve the reconstruction problem and address the issues of imprecise hand pose estimation, relative hand-object motion, and insufficient geometry optimization for small objects. We also provide a newly collected dataset with 3D ground truth to validate the proposed approach. The dataset and code will be released at https://dihuangdh.github.io/hhor.\n  - [Dr.3D: Adapting 3D GANs to Artistic Drawings, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555422) | [code]\n    > While 3D GANs have recently demonstrated the high-quality synthesis of multi-view consistent images and 3D shapes, they are mainly restricted to photo-realistic human portraits. This paper aims to extend 3D GANs to a different, but meaningful visual form: artistic portrait drawings. However, extending existing 3D GANs to drawings is challenging due to the inevitable geometric ambiguity present in drawings. 
To tackle this, we present Dr.3D, a novel adaptation approach that adapts an existing 3D GAN to artistic drawings. Dr.3D is equipped with three novel components to handle the geometric ambiguity: a deformation-aware 3D synthesis network, an alternating adaptation of pose estimation and image synthesis, and geometric priors. Experiments show that our approach can successfully adapt 3D GANs to drawings and enable multi-view consistent semantic editing of drawings.\n  - [Fast-SNARF: A Fast Deformer for Articulated Neural Fields](https://arxiv.org/abs/2211.15601) | [code]\n    > Neural fields have revolutionized the area of 3D reconstruction and novel view synthesis of rigid scenes. A key challenge in making such methods applicable to articulated objects, such as the human body, is to model the deformation of 3D locations between the rest pose (a canonical space) and the deformed space. We propose a new articulation module for neural fields, Fast-SNARF, which finds accurate correspondences between canonical space and posed space via iterative root finding. Fast-SNARF is a drop-in replacement in functionality to our previous work, SNARF, while significantly improving its computational efficiency. We contribute several algorithmic and implementation improvements over SNARF, yielding a speed-up of 150×. These improvements include voxel-based correspondence search, pre-computing the linear blend skinning function, and an efficient software implementation with CUDA kernels. Fast-SNARF enables efficient and simultaneous optimization of shape and skinning weights given deformed observations without correspondences (e.g. 3D meshes). 
Because learning of deformation maps is a crucial component in many 3D human avatar methods and since Fast-SNARF provides a computationally efficient solution, we believe that this work represents a significant step towards the practical creation of 3D virtual humans.\n## Nov20 - Nov26, 2022\n  - [Dynamic Neural Portraits, WACV2023](https://arxiv.org/abs/2211.13994) | [code]\n    > We present Dynamic Neural Portraits, a novel approach to the problem of full-head reenactment. Our method generates photo-realistic video portraits by explicitly controlling head pose, facial expressions and eye gaze. Our proposed architecture is different from existing methods that rely on GAN-based image-to-image translation networks for transforming renderings of 3D faces into photo-realistic images. Instead, we build our system upon a 2D coordinate-based MLP with controllable dynamics. Our intuition to adopt a 2D-based representation, as opposed to recent 3D NeRF-like systems, stems from the fact that video portraits are captured by monocular stationary cameras, therefore, only a single viewpoint of the scene is available. Primarily, we condition our generative model on expression blendshapes, nonetheless, we show that our system can be successfully driven by audio features as well. Our experiments demonstrate that the proposed method is 270 times faster than recent NeRF-based reenactment methods, with our networks achieving speeds of 24 fps for resolutions up to 1024 x 1024, while outperforming prior works in terms of visual quality.\n  - [FLNeRF: 3D Facial Landmarks Estimation in Neural Radiance Fields](https://arxiv.org/abs/2211.11202) | [code]\n    > This paper presents the first significant work on directly predicting 3D face landmarks on neural radiance fields (NeRFs), without using intermediate representations such as 2D images, depth maps, or point clouds. 
Our 3D coarse-to-fine Face Landmarks NeRF (FLNeRF) model efficiently samples from the NeRF on the whole face with individual facial features for accurate landmarks. To mitigate the limited number of facial expressions in the available data, local and non-linear NeRF warp is applied at facial features in fine scale to simulate large emotions range, including exaggerated facial expressions (e.g., cheek blowing, wide opening mouth, eye blinking), for training FLNeRF. With such expression augmentation, our model can predict 3D landmarks not limited to the 20 discrete expressions given in the data. Robust 3D NeRF facial landmarks contribute to many downstream tasks. As an example, we modify MoFaNeRF to enable high-quality face editing and swapping using face landmarks on NeRF, allowing more direct control and wider range of complex expressions. Experiments show that the improved model using landmarks achieves comparable to better results.\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n  - [NeARportation: A Remote Real-time Neural Rendering Framework, VRST22](https://arxiv.org/abs/2210.12398) | [code]\n    > While the presentation of photo-realistic appearance plays a major role in immersion in an augmented virtuality environment, displaying the photo-realistic appearance of real objects remains a challenging problem. Recent developments in photogrammetry have facilitated the incorporation of real objects into virtual space. However, photo-realistic photogrammetry requires a dedicated measurement environment, and there is a trade-off between measurement cost and quality. Furthermore, even with photo-realistic appearance measurements, there is a trade-off between rendering quality and framerate. There is no framework that could resolve these trade-offs and easily provide a photo-realistic appearance in real-time. 
Our NeARportation framework combines server-client bidirectional communication and neural rendering to resolve these trade-offs. Neural rendering on the server receives the client's head posture and generates a novel-view image with realistic appearance reproduction, which is streamed onto the client's display. By applying our framework to a stereoscopic display, we confirmed that it could display a high-fidelity appearance on full-HD stereo videos at 35-40 frames-per-second (fps), according to the user's head motion.\n  - [HDHumans: A Hybrid Approach for High-fidelity Digital Humans](https://arxiv.org/abs/2210.12003) | [code]\n    > Photo-real digital human avatars are of enormous importance in graphics, as they enable immersive communication over the globe, improve gaming and entertainment experiences, and can be particularly beneficial for AR and VR settings. However, current avatar generation approaches either fall short in high-fidelity novel view synthesis, generalization to novel motions, reproduction of loose clothing, or they cannot render characters at the high resolution offered by modern displays. To this end, we propose HDHumans, which is the first method for HD human character synthesis that jointly produces an accurate and temporally coherent 3D deforming surface and highly photo-realistic images of arbitrary novel views and of motions not seen at training time. At the technical core, our method tightly integrates a classical deforming character template with neural radiance fields (NeRF). Our method is carefully designed to achieve a synergy between classical surface deformation and NeRF. First, the template guides the NeRF, which allows synthesizing novel views of a highly dynamic and articulated character and even enables the synthesis of novel motions. Second, we also leverage the dense pointclouds resulting from NeRF to further improve the deforming surface via 3D-to-3D supervision. 
We outperform the state of the art quantitatively and qualitatively in terms of synthesis quality and resolution, as well as the quality of 3D surface reconstruction.\n## Oct9 - Oct15, 2022\n  - [AniFaceGAN: Animatable 3D-Aware Face Image Generation for Video Avatars, NeurIPS2022](https://arxiv.org/abs/2210.06465) | [***``[code]``***](https://yuewuhkust.github.io/AniFaceGAN/files/github_icon.jpeg)\n    > Although 2D generative models have made great progress in face image generation and animation, they often suffer from undesirable artifacts such as 3D inconsistency when rendering images from different camera viewpoints. This prevents them from synthesizing video animations indistinguishable from real ones. Recently, 3D-aware GANs extend 2D GANs for explicit disentanglement of camera pose by leveraging 3D scene representations. These methods can well preserve the 3D consistency of the generated images across different views, yet they cannot achieve fine-grained control over other attributes, among which facial expression control is arguably the most useful and desirable for face animation. In this paper, we propose an animatable 3D-aware GAN for multiview consistent face animation generation. The key idea is to decompose the 3D representation of the 3D-aware GAN into a template field and a deformation field, where the former represents different identities with a canonical expression, and the latter characterizes expression variations of each identity. To achieve meaningful control over facial expressions via deformation, we propose a 3D-level imitative learning scheme between the generator and a parametric 3D face model during adversarial training of the 3D-aware GAN. This helps our method achieve high-quality animatable face image generation with strong visual 3D consistency, even though trained with only unstructured 2D images. Extensive experiments demonstrate our superior performance over prior works. 
Project page: this https URL\n  - [Reconstructing Personalized Semantic Facial NeRF Models From Monocular Video, SIGGRAPH-Asia2022](https://arxiv.org/abs/2210.06108) | [***``[code]``***](https://github.com/USTC3DV/NeRFBlendShape-code)\n    > We present a novel semantic model for human head defined with neural radiance field. The 3D-consistent head model consists of a set of disentangled and interpretable bases, and can be driven by low-dimensional expression coefficients. Thanks to the powerful representation ability of neural radiance field, the constructed model can represent complex facial attributes including hair, wearings, which can not be represented by traditional mesh blendshape. To construct the personalized semantic facial model, we propose to define the bases as several multi-level voxel fields. With a short monocular RGB video as input, our method can construct the subject's semantic facial NeRF model with only ten to twenty minutes, and can render a photo-realistic human head image in tens of milliseconds with a given expression coefficient and view direction. With this novel representation, we apply it to many tasks like facial retargeting and expression editing. Experimental results demonstrate its strong representation ability and training/inference speed. Demo videos and released code are provided in our project page: this https URL\n  - [Controllable Radiance Fields for Dynamic Face Synthesis, 3DV2022](https://arxiv.org/abs/2210.05825) | [code]\n    > Recent work on 3D-aware image synthesis has achieved compelling results using advances in neural rendering. However, 3D-aware synthesis of face dynamics hasn't received much attention. Here, we study how to explicitly control generative model synthesis of face dynamics exhibiting non-rigid motion (e.g., facial expression change), while simultaneously ensuring 3D-awareness. 
For this we propose a Controllable Radiance Field (CoRF): 1) Motion control is achieved by embedding motion features within the layered latent motion space of a style-based generator; 2) To ensure consistency of background, motion features and subject-specific attributes such as lighting, texture, shapes, albedo, and identity, a face parsing net, a head regressor and an identity encoder are incorporated. On head image/video data we show that CoRFs are 3D-aware while enabling editing of identity, viewing directions, and motion.\n  - [Self-Supervised 3D Human Pose Estimation in Static Video Via Neural Rendering](https://arxiv.org/abs/2210.04514) | [code]\n    > Inferring 3D human pose from 2D images is a challenging and long-standing problem in the field of computer vision with many applications including motion capture, virtual reality, surveillance or gait analysis for sports and medicine. We present preliminary results for a method to estimate 3D pose from 2D video containing a single person and a static background without the need for any manual landmark annotations. We achieve this by formulating a simple yet effective self-supervision task: our model is required to reconstruct a random frame of a video given a frame from another timepoint and a rendered image of a transformed human shape template. Crucially for optimisation, our ray casting based rendering pipeline is fully differentiable, enabling end to end training solely based on the reconstruction task.\n  - [ReFu: Refine and Fuse the Unobserved View for Detail-Preserving Single-Image 3D Human Reconstruction](https://dl.acm.org/doi/abs/10.1145/3503161.3547971) | [code]\n    > Single-image 3D human reconstruction aims to reconstruct the 3D textured surface of the human body given a single image. While implicit function-based methods recently achieved reasonable reconstruction performance, they still bear limitations showing degraded quality in both surface geometry and texture from an unobserved view. 
In response, to generate a realistic textured surface, we propose ReFu, a coarse-to-fine approach that refines the projected backside view image and fuses the refined image to predict the final human body. To suppress the diffused occupancy that causes noise in projection images and reconstructed meshes, we propose to train occupancy probability by simultaneously utilizing 2D and 3D supervisions with occupancy-based volume rendering. We also introduce a refinement architecture that generates detail-preserving backside-view images with front-to-back warping. Extensive experiments demonstrate that our method achieves state-of-the-art performance in 3D human reconstruction from a single image, showing enhanced geometry and texture quality from an unobserved view.\n## Oct2 - Oct8, 2022\n  - [A Keypoint Based Enhancement Method for Audio Driven Free View Talking Head Synthesis](https://arxiv.org/abs/2210.03335) | [code]\n    > Audio driven talking head synthesis is a challenging task that attracts increasing attention in recent years. Although existing methods based on 2D landmarks or 3D face models can synthesize accurate lip synchronization and rhythmic head pose for arbitrary identity, they still have limitations, such as the cut feeling in the mouth mapping and the lack of skin highlights. The morphed region is blurry compared to the surrounding face. A Keypoint Based Enhancement (KPBE) method is proposed for audio driven free view talking head synthesis to improve the naturalness of the generated video. Firstly, existing methods were used as the backend to synthesize intermediate results. Then we used keypoint decomposition to extract video synthesis controlling parameters from the backend output and the source image. After that, the controlling parameters were composited to the source keypoints and the driving keypoints. A motion field based method was used to generate the final image from the keypoint representation. 
With keypoint representation, we overcame the cut feeling in the mouth mapping and the lack of skin highlights. Experiments show that our proposed enhancement method improved the quality of talking-head videos in terms of mean opinion score.\n  - [SelfNeRF: Fast Training NeRF for Human from Monocular Self-rotating Video](https://arxiv.org/abs/2210.01651) | [code]\n    > In this paper, we propose SelfNeRF, an efficient neural radiance field based novel view synthesis method for human performance. Given monocular self-rotating videos of human performers, SelfNeRF can train from scratch and achieve high-fidelity results in about twenty minutes. Some recent works have utilized the neural radiance field for dynamic human reconstruction. However, most of these methods need multi-view inputs and require hours of training, making it still difficult for practical use. To address this challenging problem, we introduce a surface-relative representation based on multi-resolution hash encoding that can greatly improve the training speed and aggregate inter-frame information. Extensive experimental results on several different datasets demonstrate the effectiveness and efficiency of SelfNeRF to challenging monocular videos.\n  - [Capturing and Animation of Body and Clothing from Monocular Video](https://arxiv.org/abs/2210.01868) | [code]\n    > While recent work has shown progress on extracting clothed 3D human avatars from a single image, video, or a set of 3D scans, several limitations remain. Most methods use a holistic representation to jointly model the body and clothing, which means that the clothing and body cannot be separated for applications like virtual try-on. Other methods separately model the body and clothing, but they require training from a large set of 3D clothed human meshes obtained from 3D/4D scanners or physics simulations. Our insight is that the body and clothing have different modeling requirements. 
While the body is well represented by a mesh-based parametric 3D model, implicit representations and neural radiance fields are better suited to capturing the large variety in shape and appearance present in clothing. Building on this insight, we propose SCARF (Segmented Clothed Avatar Radiance Field), a hybrid model combining a mesh-based body with a neural radiance field. Integrating the mesh into the volumetric rendering in combination with a differentiable rasterizer enables us to optimize SCARF directly from monocular videos, without any 3D supervision. The hybrid modeling enables SCARF to (i) animate the clothed body avatar by changing body poses (including hand articulation and facial expressions), (ii) synthesize novel views of the avatar, and (iii) transfer clothing between avatars in virtual try-on applications. We demonstrate that SCARF reconstructs clothing with higher visual quality than existing methods, that the clothing deforms with changing body pose and body shape, and that clothing can be successfully transferred between avatars of different subjects. The code and models are available at this https URL.\n  - [MonoNHR: Monocular Neural Human Renderer](https://arxiv.org/abs/2210.00627) | [code]\n    > Existing neural human rendering methods struggle with a single image input due to the lack of information in invisible areas and the depth ambiguity of pixels in visible areas. In this regard, we propose Monocular Neural Human Renderer (MonoNHR), a novel approach that renders robust free-viewpoint images of an arbitrary human given only a single image. MonoNHR is the first method that (i) renders human subjects never seen during training in a monocular setup, and (ii) is trained in a weakly-supervised manner without geometry supervision. First, we propose to disentangle 3D geometry and texture features and to condition the texture inference on the 3D geometry features. 
Second, we introduce a Mesh Inpainter module that inpaints the occluded parts exploiting human structural priors such as symmetry. Experiments on ZJU-MoCap, AIST, and HUMBI datasets show that our approach significantly outperforms the recent methods adapted to the monocular case.\n## Sep25 - Oct1, 2022\n## Sep18 - Sep24, 2022\n  - [FNeVR: Neural Volume Rendering for Face Animation](https://arxiv.org/abs/2209.10340) | [code]\n    > Face animation, one of the hottest topics in computer vision, has achieved a promising performance with the help of generative models. However, it remains a critical challenge to generate identity preserving and photo-realistic images due to the sophisticated motion deformation and complex facial detail modeling. To address these problems, we propose a Face Neural Volume Rendering (FNeVR) network to fully explore the potential of 2D motion warping and 3D volume rendering in a unified framework. In FNeVR, we design a 3D Face Volume Rendering (FVR) module to enhance the facial details for image rendering. Specifically, we first extract 3D information with a well-designed architecture, and then introduce an orthogonal adaptive ray-sampling module for efficient rendering. We also design a lightweight pose editor, enabling FNeVR to edit the facial pose in a simple yet effective way. Extensive experiments show that our FNeVR obtains the best overall quality and performance on widely used talking-head benchmarks.\n  - [Human Performance Modeling and Rendering via Neural Animated Mesh](https://arxiv.org/abs/2209.08468) | [code]\n    > We have recently seen tremendous progress in the neural advances for photo-real human modeling and rendering. However, it's still challenging to integrate them into an existing mesh-based pipeline for downstream applications. In this paper, we present a comprehensive neural approach for high-quality reconstruction, compression, and rendering of human performances from dense multi-view videos. 
Our core intuition is to bridge the traditional animated mesh workflow with a new class of highly efficient neural techniques. We first introduce a neural surface reconstructor for high-quality surface generation in minutes. It marries the implicit volumetric rendering of the truncated signed distance field (TSDF) with multi-resolution hash encoding. We further propose a hybrid neural tracker to generate animated meshes, which combines explicit non-rigid tracking with implicit dynamic deformation in a self-supervised framework. The former provides the coarse warping back into the canonical space, while the latter implicit one further predicts the displacements using the 4D hash encoding as in our reconstructor. Then, we discuss the rendering schemes using the obtained animated meshes, ranging from dynamic texturing to lumigraph rendering under various bandwidth settings. To strike an intricate balance between quality and bandwidth, we propose a hierarchical solution by first rendering 6 virtual views covering the performer and then conducting occlusion-aware neural texture blending. We demonstrate the efficacy of our approach in a variety of mesh-based applications and photo-realistic free-view experiences on various platforms, i.e., inserting virtual human performances into real environments through mobile AR or immersively watching talent shows with VR headsets.\n## Sep11 - Sep17, 2022\n  - [3DMM-RF: Convolutional Radiance Fields for 3D Face Modeling](https://arxiv.org/abs/2209.07366) | [code]\n    > Facial 3D Morphable Models are a main computer vision subject with countless applications and have been highly optimized in the last two decades. The tremendous improvements of deep generative networks have created various possibilities for improving such models and have attracted wide interest. Moreover, the recent advances in neural radiance fields, are revolutionising novel-view synthesis of known scenes. 
In this work, we present a facial 3D Morphable Model, which exploits both of the above, and can accurately model a subject's identity, pose and expression and render it in arbitrary illumination. This is achieved by utilizing a powerful deep style-based generator to overcome two main weaknesses of neural radiance fields, their rigidity and rendering speed. We introduce a style-based generative network that synthesizes in one pass all and only the required rendering samples of a neural radiance field. We create a vast labelled synthetic dataset of facial renders, and train the network on these data, so that it can accurately model and generalize on facial identity, pose and appearance. Finally, we show that this model can accurately be fit to \"in-the-wild\" facial images of arbitrary pose and illumination, extract the facial characteristics, and be used to re-render the face in controllable conditions.\n  - [Explicitly Controllable 3D-Aware Portrait Generation](https://arxiv.org/abs/2209.05434) | [code]\n    > In contrast to the traditional avatar creation pipeline which is a costly process, contemporary generative approaches directly learn the data distribution from photographs. While plenty of works extend unconditional generative models and achieve some levels of controllability, it is still challenging to ensure multi-view consistency, especially in large poses. In this work, we propose a network that generates 3D-aware portraits while being controllable according to semantic parameters regarding pose, identity, expression and illumination. Our network uses neural scene representation to model 3D-aware portraits, whose generation is guided by a parametric face model that supports explicit control. While the latent disentanglement can be further enhanced by contrasting images with partially different attributes, there still exists noticeable inconsistency in non-face areas, e.g., hair and background, when animating expressions. 
We solve this by proposing a volume blending strategy in which we form a composite output by blending dynamic and static areas, with two parts segmented from the jointly learned semantic field. Our method outperforms prior arts in extensive experiments, producing realistic portraits with vivid expression in natural lighting when viewed from free viewpoints. It also demonstrates generalization ability to real images as well as out-of-domain data, showing great promise in real applications.\n## Sep4 - Sep10, 2022\n  - [SIRA: Relightable Avatars from a Single Image](https://arxiv.org/abs/2209.03027) | [code]\n    > Recovering the geometry of a human head from a single image, while factorizing the materials and illumination is a severely ill-posed problem that requires prior information to be solved. Methods based on 3D Morphable Models (3DMM), and their combination with differentiable renderers, have shown promising results. However, the expressiveness of 3DMMs is limited, and they typically yield over-smoothed and identity-agnostic 3D shapes limited to the face region. Highly accurate full head reconstructions have recently been obtained with neural fields that parameterize the geometry using multilayer perceptrons. The versatility of these representations has also proved effective for disentangling geometry, materials and lighting. However, these methods require several tens of input images. In this paper, we introduce SIRA, a method which, from a single image, reconstructs human head avatars with high fidelity geometry and factorized lights and surface materials. Our key ingredients are two data-driven statistical models based on neural fields that resolve the ambiguities of single-view 3D surface reconstruction and appearance factorization. Experiments show that SIRA obtains state of the art results in 3D head reconstruction while at the same time it successfully disentangles the global illumination, and the diffuse and specular albedos. 
Furthermore, our reconstructions are amenable to physically-based appearance editing and head model relighting.\n  - [MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model](https://arxiv.org/abs/2208.15001) | [***``[code]``***](https://github.com/mingyuan-zhang/MotionDiffuse)\n    > Human motion modeling is important for many modern graphics applications, which typically require professional skills. In order to remove the skill barriers for laymen, recent motion generation methods can directly generate human motions conditioned on natural languages. However, it remains challenging to achieve diverse and fine-grained motion generation with various text inputs. To address this problem, we propose MotionDiffuse, the first diffusion model-based text-driven motion generation framework, which demonstrates several desired properties over existing methods. 1) Probabilistic Mapping. Instead of a deterministic language-motion mapping, MotionDiffuse generates motions through a series of denoising steps in which variations are injected. 2) Realistic Synthesis. MotionDiffuse excels at modeling complicated data distribution and generating vivid motion sequences. 3) Multi-Level Manipulation. MotionDiffuse responds to fine-grained instructions on body parts, and arbitrary-length motion synthesis with time-varied text prompts. Our experiments show MotionDiffuse outperforms existing SoTA methods by convincing margins on text-driven motion generation and action-conditioned motion generation. A qualitative analysis further demonstrates MotionDiffuse's controllability for comprehensive motion generation. Homepage: this https URL\n## Aug28 - Sep3, 2022\n  - [Dual-Space NeRF: Learning Animatable Avatars and Scene Lighting in Separate Spaces, 3DV2022](https://arxiv.org/abs/2208.14851) | [code]\n    > Modeling the human body in a canonical space is a common practice for capturing and animation. 
But when involving the neural radiance field (NeRF), learning a static NeRF in the canonical space is not enough because the lighting of the body changes when the person moves even though the scene lighting is constant. Previous methods alleviate the inconsistency of lighting by learning a per-frame embedding, but this operation does not generalize to unseen poses. Given that the lighting condition is static in the world space while the human body is consistent in the canonical space, we propose a dual-space NeRF that models the scene lighting and the human body with two MLPs in two separate spaces. To bridge these two spaces, previous methods mostly rely on the linear blend skinning (LBS) algorithm. However, the blending weights for LBS of a dynamic neural field are intractable and thus are usually memorized with another MLP, which does not generalize to novel poses. Although it is possible to borrow the blending weights of a parametric mesh such as SMPL, the interpolation operation introduces more artifacts. In this paper, we propose to use the barycentric mapping, which can directly generalize to unseen poses and surprisingly achieves superior results than LBS with neural blending weights. Quantitative and qualitative results on the Human3.6M and the ZJU-MoCap datasets show the effectiveness of our method.\n  - [NerfCap: Human Performance Capture With Dynamic Neural Radiance Fields, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9870173) | [code]\n    > This paper addresses the challenge of human performance capture from sparse multi-view or monocular videos. Given a template mesh of the performer, previous methods capture the human motion by non-rigidly registering the template mesh to images with 2D silhouettes or dense photometric alignment. However, the detailed surface deformation cannot be recovered from the silhouettes, while the photometric alignment suffers from instability caused by appearance variation in the videos. 
To solve these problems, we propose NerfCap, a novel performance capture method based on the dynamic neural radiance field (NeRF) representation of the performer. Specifically, a canonical NeRF is initialized from the template geometry and registered to the video frames by optimizing the deformation field and the appearance model of the canonical NeRF. To capture both large body motion and detailed surface deformation, NerfCap combines linear blend skinning with embedded graph deformation. In contrast to the mesh-based methods that suffer from fixed topology and texture, NerfCap is able to flexibly capture complex geometry and appearance variation across the videos, and synthesize more photo-realistic images. In addition, NerfCap can be pre-trained end to end in a self-supervised manner by matching the synthesized videos with the input videos. Experimental results on various datasets show that NerfCap outperforms prior works in terms of both surface reconstruction accuracy and novel-view synthesis quality.\n## Aug21 - Aug27, 2022\n  - [Neural Novel Actor: Learning a Generalized Animatable Neural Representation for Human Actors](https://arxiv.org/abs/2208.11905) | [code]\n    > We propose a new method for learning a generalized animatable neural human representation from a sparse set of multi-view imagery of multiple persons. The learned representation can be used to synthesize novel view images of an arbitrary person from a sparse set of cameras, and further animate them with the user's pose control. While existing methods can either generalize to new persons or synthesize animations with user control, none of them can achieve both at the same time. 
We attribute this accomplishment to the employment of a 3D proxy for a shared multi-person human model, and further the warping of the spaces of different poses to a shared canonical pose space, in which we learn a neural field and predict the person- and pose-dependent deformations, as well as appearance with the features extracted from input images. To cope with the complexity of the large variations in body shapes, poses, and clothing deformations, we design our neural human model with disentangled geometry and appearance. Furthermore, we utilize the image features both at the spatial point and on the surface points of the 3D proxy for predicting person- and pose-dependent properties. Experiments show that our method significantly outperforms the state-of-the-arts on both tasks. The video and code are available at this https URL.\n## Aug14 - Aug20, 2022\n  - [Temporal View Synthesis of Dynamic Scenes through 3D Object Motion Estimation with Multi-Plane Images, ISMAR2022](https://arxiv.org/abs/2208.09463) | [***``[code]``***](https://github.com/NagabhushanSN95/DeCOMPnet)\n    > The challenge of graphically rendering high frame-rate videos on low compute devices can be addressed through periodic prediction of future frames to enhance the user experience in virtual reality applications. This is studied through the problem of temporal view synthesis (TVS), where the goal is to predict the next frames of a video given the previous frames and the head poses of the previous and the next frames. In this work, we consider the TVS of dynamic scenes in which both the user and objects are moving. We design a framework that decouples the motion into user and object motion to effectively use the available user motion while predicting the next frames. We predict the motion of objects by isolating and estimating the 3D object motion in the past frames and then extrapolating it. 
We employ multi-plane images (MPI) as a 3D representation of the scenes and model the object motion as the 3D displacement between the corresponding points in the MPI representation. In order to handle the sparsity in MPIs while estimating the motion, we incorporate partial convolutions and masked correlation layers to estimate corresponding points. The predicted object motion is then integrated with the given user or camera motion to generate the next frame. Using a disocclusion infilling module, we synthesize the regions uncovered due to the camera and object motion. We develop a new synthetic dataset for TVS of dynamic scenes consisting of 800 videos at full HD resolution. We show through experiments on our dataset and the MPI Sintel dataset that our model outperforms all the competing methods in the literature.\n  - [LoRD: Local 4D Implicit Representation for High-Fidelity Dynamic Human Modeling, ECCV2022](https://arxiv.org/abs/2208.08622) | [code]\n    > Recent progress in 4D implicit representation focuses on globally controlling the shape and motion with low dimensional latent vectors, which is prone to missing surface details and accumulating tracking error. While many deep local representations have shown promising results for 3D shape modeling, their 4D counterpart does not exist yet. In this paper, we fill this blank by proposing a novel Local 4D implicit Representation for Dynamic clothed human, named LoRD, which has the merits of both 4D human modeling and local representation, and enables high-fidelity reconstruction with detailed surface deformations, such as clothing wrinkles. Particularly, our key insight is to encourage the network to learn the latent codes of local part-level representation, capable of explaining the local geometry and temporal deformations. 
To make the inference at test-time, we first estimate the inner body skeleton motion to track local parts at each time step, and then optimize the latent codes for each part via auto-decoding based on different types of observed data. Extensive experiments demonstrate that the proposed method has strong capability for representing 4D human, and outperforms state-of-the-art methods on practical applications, including 4D reconstruction from sparse points, non-rigid depth fusion, both qualitatively and quantitatively.\n  - [Neural Capture of Animatable 3D Human from Monocular Video, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > We present a novel paradigm of building an animatable 3D human representation from a monocular video input, such that it can be rendered in any unseen poses and views. Our method is based on a dynamic Neural Radiance Field (NeRF) rigged by a mesh-based parametric 3D human model serving as a geometry proxy. Previous methods usually rely on multi-view videos or accurate 3D geometry information as additional inputs; besides, most methods suffer from degraded quality when generalized to unseen poses. We identify that the key to generalization is a good input embedding for querying dynamic NeRF: A good input embedding should define an injective mapping in the full volumetric space, guided by surface mesh deformation under pose variation. Based on this observation, we propose to embed the input query with its relationship to local surface regions spanned by a set of geodesic nearest neighbors on mesh vertices. By including both position and relative distance information, our embedding defines a distance-preserved deformation mapping and generalizes well to unseen poses. To reduce the dependency on additional inputs, we first initialize per-frame 3D meshes using off-the-shelf tools and then propose a pipeline to jointly optimize NeRF and refine the initial mesh. 
Extensive experiments show our method can synthesize plausible human rendering results under unseen poses and views.\n## Aug7 - Aug13, 2022\n  - [Progressive Multi-scale Light Field Networks, 3DV2022](https://arxiv.org/abs/2208.06710) | [code]\n    > Neural representations have shown great promise in their ability to represent radiance and light fields while being very compact compared to the image set representation. However, current representations are not well suited for streaming as decoding can only be done at a single level of detail and requires downloading the entire neural network model. Furthermore, high-resolution light field networks can exhibit flickering and aliasing as neural networks are sampled without appropriate filtering. To resolve these issues, we present a progressive multi-scale light field network that encodes a light field with multiple levels of detail. Lower levels of detail are encoded using fewer neural network weights enabling progressive streaming and reducing rendering time. Our progressive multi-scale light field network addresses aliasing by encoding smaller anti-aliased representations at its lower levels of detail. Additionally, per-pixel level of detail enables our representation to support dithered transitions and foveated rendering.\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n  - [Neural Strands: Learning Hair Geometry and Appearance from Multi-View Images, ECCV2022](https://arxiv.org/pdf/2207.14067) | [***``[code]``***](https://radualexandru.github.io/neural_strands/)\n    > We present Neural Strands, a novel learning framework for modeling accurate hair geometry and appearance from multi-view image inputs. The learned hair model can be rendered in real-time from any viewpoint with high-fidelity view-dependent effects. Our model achieves intuitive shape and style control unlike volumetric counterparts. 
To enable these properties, we propose a novel hair representation based on a neural scalp texture that encodes the geometry and appearance of individual strands at each texel location. Furthermore, we introduce a novel neural rendering framework based on rasterization of the learned hair strands. Our neural rendering is strand-accurate and anti-aliased, making the rendering view-consistent and photorealistic. Combining appearance with a multi-view geometric prior, we enable, for the first time, the joint learning of appearance and explicit hair geometry from a multi-view setup. We demonstrate the efficacy of our approach in terms of fidelity and efficiency for various hairstyles.\n## Previous weeks\n  - [Dynamic Neural Radiance Fields for Monocular 4D Facial Avatar Reconstruction, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > We present dynamic neural radiance fields for modeling the appearance and dynamics of a human face. Digitally modeling and reconstructing a talking human is a key building-block for a variety of applications. Especially, for telepresence applications in AR or VR, a faithful reproduction of the appearance including novel viewpoint or head-poses is required. In contrast to state-of-the-art approaches that model the geometry and material properties explicitly, or are purely image-based, we introduce an implicit representation of the head based on scene representation networks. To handle the dynamics of the face, we combine our scene representation network with a low-dimensional morphable model which provides explicit control over pose and expressions. We use volumetric rendering to generate images from this hybrid representation and demonstrate that such a dynamic neural scene representation can be learned from monocular input data only, without the need of a specialized capture setup. 
In our experiments, we show that this learned volumetric representation allows for photo-realistic image generation that surpasses the quality of state-of-the-art video-based reenactment methods.\n  - [PVA: Pixel-aligned Volumetric Avatars, CVPR2021](https://volumetric-avatars.github.io/) | [code]\n    > Acquisition and rendering of photorealistic human heads is a highly challenging research problem of particular importance for virtual telepresence. Currently, the highest quality is achieved by volumetric approaches trained in a person-specific manner on multi-view data. These models better represent fine structure, such as hair, compared to simpler mesh-based models. Volumetric models typically employ a global code to represent facial expressions, such that they can be driven by a small set of animation parameters. While such architectures achieve impressive rendering quality, they can not easily be extended to the multi-identity setting. In this paper, we devise a novel approach for predicting volumetric avatars of the human head given just a small number of inputs. We enable generalization across identities by a novel parameterization that combines neural radiance fields with local, pixel-aligned features extracted directly from the inputs, thus side-stepping the need for very deep or complex networks. Our approach is trained in an end-to-end manner solely based on a photometric rerendering loss without requiring explicit 3D supervision. We demonstrate that our approach outperforms the existing state of the art in terms of quality and is able to generate faithful facial expressions in a multi-identity setting.\n  - [Animatable Neural Radiance Fields for Human Body Modeling, ICCV2021](https://zju3dv.github.io/animatable_nerf/) | [***``[code]``***](https://github.com/zju3dv/animatable_nerf)\n    > This paper addresses the challenge of reconstructing an animatable human model from a multi-view video. 
Some recent works have proposed to decompose a non-rigidly deforming scene into a canonical neural radiance field and a set of deformation fields that map observation-space points to the canonical space, thereby enabling them to learn the dynamic scene from images. However, they represent the deformation field as translational vector field or SE(3) field, which makes the optimization highly under-constrained. Moreover, these representations cannot be explicitly controlled by input motions. Instead, we introduce neural blend weight fields to produce the deformation fields. Based on the skeleton-driven deformation, blend weight fields are used with 3D human skeletons to generate observation-to-canonical and canonical-to-observation correspondences. Since 3D human skeletons are more observable, they can regularize the learning of deformation fields. Moreover, the learned blend weight fields can be combined with input skeletal motions to generate new deformation fields to animate the human model. Experiments show that our approach significantly outperforms recent human synthesis methods. The code will be available at https://zju3dv.github.io/animatable_nerf/.\n  - [Neural Actor: Neural Free-view Synthesis of Human Actors with Pose Control, SIGGRAPH Asia 2021](https://vcai.mpi-inf.mpg.de/projects/NeuralActor/) | [***``[code]``***](https://people.mpi-inf.mpg.de/~lliu/projects/NeuralActor/)\n    > We propose Neural Actor (NA), a new method for high-quality synthesis of humans from arbitrary viewpoints and under arbitrary controllable poses. Our method is built upon recent neural scene representation and rendering works which learn representations of geometry and appearance from only 2D images. While existing works demonstrated compelling rendering of static scenes and playback of dynamic scenes, photo-realistic reconstruction and rendering of humans with neural implicit methods, in particular under user-controlled novel poses, is still difficult. 
To address this problem, we utilize a coarse body model as the proxy to unwarp the surrounding 3D space into a canonical pose. A neural radiance field learns pose-dependent geometric deformations and pose- and view-dependent appearance effects in the canonical space from multi-view video input. To synthesize novel views of high fidelity dynamic geometry and appearance, we leverage 2D texture maps defined on the body model as latent variables for predicting residual deformations and the dynamic appearance. Experiments demonstrate that our method achieves better quality than the state-of-the-arts on playback as well as novel pose synthesis, and can even generalize well to new poses that starkly differ from the training poses. Furthermore, our method also supports body shape control of the synthesized results.\n  - [Neural Body: Implicit Neural Representations with Structured Latent Codes for Novel View Synthesis of Dynamic Humans, CVPR2021](https://zju3dv.github.io/neuralbody/) | [***``[code]``***](https://github.com/zju3dv/neuralbody)\n    > This paper addresses the challenge of novel view synthesis for a human performer from a very sparse set of camera views. Some recent works have shown that learning implicit neural representations of 3D scenes achieves remarkable view synthesis quality given dense input views. However, the representation learning will be ill-posed if the views are highly sparse. To solve this ill-posed problem, our key idea is to integrate observations over video frames. To this end, we propose Neural Body, a new human body representation which assumes that the learned neural representations at different frames share the same set of latent codes anchored to a deformable mesh, so that the observations across frames can be naturally integrated. The deformable mesh also provides geometric guidance for the network to learn 3D representations more efficiently. 
To evaluate our approach, we create a multi-view dataset named ZJU-MoCap that captures performers with complex motions. Experiments on ZJU-MoCap show that our approach outperforms prior works by a large margin in terms of novel view synthesis quality. We also demonstrate the capability of our approach to reconstruct a moving person from a monocular video on the People-Snapshot dataset.\n  - [Portrait Neural Radiance Fields from a Single Image](https://portrait-nerf.github.io/) | [code]\n    > We present a method for estimating Neural Radiance Fields (NeRF) from a single headshot portrait. While NeRF has demonstrated high-quality view synthesis, it requires multiple images of static scenes and thus impractical for casual captures and moving subjects. In this work, we propose to pretrain the weights of a multilayer perceptron (MLP), which implicitly models the volumetric density and colors, with a meta-learning framework using a light stage portrait dataset. To improve the generalization to unseen faces, we train the MLP in the canonical coordinate space approximated by 3D face morphable models. We quantitatively evaluate the method using controlled captures and demonstrate the generalization to real portrait images, showing favorable results against state-of-the-arts.\n  - [A-NeRF: Surface-free Human 3D Pose Refinement via Neural Rendering, NeurIPS2021](https://arxiv.org/abs/2102.06199) | [***``[code]``***](https://github.com/LemonATsu/A-NeRF)\n    > While deep learning reshaped the classical motion capture pipeline with feed-forward networks, generative models are required to recover fine alignment via iterative refinement. Unfortunately, the existing models are usually hand-crafted or learned in controlled conditions, only applicable to limited domains. We propose a method to learn a generative neural body model from unlabelled monocular videos by extending Neural Radiance Fields (NeRFs). 
We equip them with a skeleton to apply to time-varying and articulated motion. A key insight is that implicit models require the inverse of the forward kinematics used in explicit surface models. Our reparameterization defines spatial latent variables relative to the pose of body parts and thereby overcomes ill-posed inverse operations with an overparameterization. This enables learning volumetric body shape and appearance from scratch while jointly refining the articulated pose; all without ground truth labels for appearance, pose, or 3D shape on the input videos. When used for novel-view-synthesis and motion capture, our neural model improves accuracy on diverse datasets. Project website: this https URL .\n  - [Learning Compositional Radiance Fields of Dynamic Human Heads, CVPR2021(oral)](https://ziyanw1.github.io/hybrid_nerf/) | [code]\n    > Photorealistic rendering of dynamic humans is an important ability for telepresence systems, virtual shopping, synthetic data generation, and more. Recently, neural rendering methods, which combine techniques from computer graphics and machine learning, have created high-fidelity models of humans and objects. Some of these methods do not produce results with high-enough fidelity for driveable human models (Neural Volumes) whereas others have extremely long rendering times (NeRF). We propose a novel compositional 3D representation that combines the best of previous methods to produce both higher-resolution and faster results. Our representation bridges the gap between discrete and continuous volumetric representations by combining a coarse 3D-structure-aware grid of animation codes with a continuous learned scene function that maps every position and its corresponding local animation code to its view-dependent emitted radiance and local volume density. 
Differentiable volume rendering is employed to compute photo-realistic novel views of the human head and upper body as well as to train our novel representation end-to-end using only 2D supervision. In addition, we show that the learned dynamic radiance field can be used to synthesize novel unseen expressions based on a global animation code. Our approach achieves state-of-the-art results for synthesizing novel views of dynamic human heads and the upper body.\n  - [Editable Free-Viewpoint Video using a Layered Neural Representation, SIGGRAPH2021](https://jiakai-zhang.github.io/st-nerf/) | [***``[code]``***](https://jiakai-zhang.github.io/st-nerf/#code)\n    > Generating free-viewpoint videos is critical for immersive VR/AR experience but recent neural advances still lack the editing ability to manipulate the visual perception for large dynamic scenes. To fill this gap, in this paper we propose the first approach for editable photo-realistic free-viewpoint video generation for large-scale dynamic scenes using only sparse 16 cameras. The core of our approach is a new layered neural representation, where each dynamic entity including the environment itself is formulated into a space-time coherent neural layered radiance representation called ST-NeRF. Such layered representation supports fully perception and realistic manipulation of the dynamic scene whilst still supporting a free viewing experience in a wide range. In our ST-NeRF, the dynamic entity/layer is represented as continuous functions, which achieves the disentanglement of location, deformation as well as the appearance of the dynamic entity in a continuous and self-supervised manner. We propose a scene parsing 4D label map tracking to disentangle the spatial information explicitly, and a continuous deform module to disentangle the temporal motion implicitly. An object-aware volume rendering scheme is further introduced for the re-assembling of all the neural layers. 
We adopt a novel layered loss and motion-aware ray sampling strategy to enable efficient training for a large dynamic scene with multiple performers. Our framework further enables a variety of editing functions, i.e., manipulating the scale and location, duplicating or retiming individual neural layers to create numerous visual effects while preserving high realism. Extensive experiments demonstrate the effectiveness of our approach to achieve high-quality, photo-realistic, and editable free-viewpoint video generation for dynamic scenes.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/lighting.md",
    "content": "\nWeekly Classified Neural Radiance Fields - lighting ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n====================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n  - [Neural Subspaces for Light Fields, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9968104) | [code]\n    > We introduce a framework for compactly representing light field content with the novel concept of neural subspaces. While the recently proposed neural light field representation achieves great compression results by encoding a light field into a single neural network, the unified design is not optimized for the composite structures exhibited in light fields. Moreover, encoding every part of the light field into one network is not ideal for applications that require rapid transmission and decoding. We recognize this problem's connection to subspace learning. We present a method that uses several small neural networks, specializing in learning the neural subspace for a particular light field segment. Moreover, we propose an adaptive weight sharing strategy among those small networks, improving parameter efficiency. In effect, this strategy enables a concerted way to track the similarity among nearby neural subspaces by leveraging the layered structure of neural networks. 
Furthermore, we develop a soft-classification technique to enhance the color prediction accuracy of neural representations. Our experimental results show that our method better reconstructs the light field than previous methods on various light field scenes. We further demonstrate its successful deployment on encoding light fields with irregular viewpoint layout and dynamic scene content.\n  - [Efficient Light Probes for Real-Time Global Illumination, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555452) | [code]\n    > Reproducing physically-based global illumination (GI) effects has been a long-standing demand for many real-time graphical applications. In pursuit of this goal, many recent engines resort to some form of light probes baked in a precomputation stage. Unfortunately, the GI effects stemming from the precomputed probes are rather limited due to the constraints in the probe storage, representation or query. In this paper, we propose a new method for probe-based GI rendering which can generate a wide range of GI effects, including glossy reflection with multiple bounces, in complex scenes. The key contributions behind our work include a gradient-based search algorithm and a neural image reconstruction method. The search algorithm is designed to reproject the probes' contents to any query viewpoint, without introducing parallax errors, and converges fast to the optimal solution. The neural image reconstruction method, based on a dedicated neural network and several G-buffers, tries to recover high-quality images from low-quality inputs due to limited resolution or (potential) low sampling rate of the probes. This neural method makes the generation of light probes efficient. Moreover, a temporal reprojection strategy and a temporal loss are employed to improve temporal stability for animation sequences. 
The whole pipeline runs in realtime (>30 frames per second) even for high-resolution (1920×1080) outputs, thanks to the fast convergence rate of the gradient-based search algorithm and a light-weight design of the neural network. Extensive experiments on multiple complex scenes have been conducted to show the superiority of our method over the state-of-the-arts.\n  - [NeuLighting: Neural Lighting for Free Viewpoint Outdoor Scene Relighting with Unconstrained Photo Collections, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555384) | [code]\n    > We propose NeuLighting, a new framework for free viewpoint outdoor scene relighting from a sparse set of unconstrained in-the-wild photo collections. Our framework represents all the scene components as continuous functions parameterized by MLPs that take a 3D location and the lighting condition as input and output reflectance and necessary outdoor illumination properties. Unlike object-level relighting methods which often leverage training images with controllable and consistent indoor illumination, we concentrate on the more challenging outdoor situation where all the images are captured under arbitrary unknown illumination. The key to our method includes a neural lighting representation that compresses the per-image illumination into a disentangled latent vector, and a new free viewpoint relighting scheme that is robust to arbitrary lighting variations across images. The lighting representation is compressive to explain a wide range of illumination and can be easily fed into the query-based NeuLighting framework, enabling efficient shading effect evaluation under any kind of novel illumination. Furthermore, to produce high-quality cast shadows, we estimate the sun visibility map to indicate the shadow regions according to the scene geometry and the sun direction. 
Thanks to the flexible and explainable neural lighting representation, our system supports outdoor relighting with many different illumination sources, including natural images, environment maps, and time-lapse videos. The high-fidelity renderings under novel views and illumination prove the superiority of our method against state-of-the-art relighting solutions.\n## Nov20 - Nov26, 2022\n  - [Sampling Neural Radiance Fields for Refractive Objects, SIGGRAPH-Asia2022](https://arxiv.org/abs/2211.14799) | [***``[code]``***](https://github.com/alexkeroro86/SampleNeRFRO)\n    > Recently, differentiable volume rendering in neural radiance fields (NeRF) has gained a lot of popularity, and its variants have attained many impressive results. However, existing methods usually assume the scene is a homogeneous volume so that a ray is cast along the straight path. In this work, the scene is instead a heterogeneous volume with a piecewise-constant refractive index, where the path will be curved if it intersects the different refractive indices. For novel view synthesis of refractive objects, our NeRF-based framework aims to optimize the radiance fields of bounded volume and boundary from multi-view posed images with refractive object silhouettes. To tackle this challenging problem, the refractive index of a scene is reconstructed from silhouettes. Given the refractive index, we extend the stratified and hierarchical sampling techniques in NeRF to allow drawing samples along a curved path tracked by the Eikonal equation. 
The results indicate that our framework outperforms the state-of-the-art method both quantitatively and qualitatively, demonstrating better performance on the perceptual similarity metric and an apparent improvement in the rendering quality on several synthetic and real scenes.\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n  - [LB-NERF: Light Bending Neural Radiance Fields for Transparent Medium, ICIP2022](https://ieeexplore.ieee.org/abstract/document/9897642) | [code]\n    > Neural radiance fields (NeRFs) have been proposed as methods of novel view synthesis and have been used to address various problems because of its versatility. NeRF can represent colors and densities in 3D space using neural rendering assuming a straight light path. However, a medium with a different refractive index in the scene, such as a transparent medium, causes light refraction and breaks the assumption of the straight path of light. Therefore, the NeRFs cannot be learned consistently across multi-view images. To solve this problem, this study proposes a method to learn consistent radiance fields across multiple viewpoints by introducing the light refraction effect as an offset from the straight line originating from the camera center. The experimental results quantitatively and qualitatively verified that our method can interpolate viewpoints better than the conventional NeRF method when considering the refraction of transparent objects.\n  - [IBL-NeRF: Image-Based Lighting Formulation of Neural Radiance Fields](https://arxiv.org/abs/2210.08202) | [code]\n    > We propose IBL-NeRF, which decomposes the neural radiance fields (NeRF) of large-scale indoor scenes into intrinsic components. 
Previous approaches for the inverse rendering of NeRF transform the implicit volume to fit the rendering pipeline of explicit geometry, and approximate the views of segmented, isolated objects with environment lighting. In contrast, our inverse rendering extends the original NeRF formulation to capture the spatial variation of lighting within the scene volume, in addition to surface properties. Specifically, the scenes of diverse materials are decomposed into intrinsic components for image-based rendering, namely, albedo, roughness, surface normal, irradiance, and prefiltered radiance. All of the components are inferred as neural images from MLP, which can model large-scale general scenes. By adopting the image-based formulation of NeRF, our approach inherits superior visual quality and multi-view consistency for synthesized images. We demonstrate the performance on scenes with complex object layouts and light configurations, which could not be processed in any of the previous works.\n  - [Estimating Neural Reflectance Field from Radiance Field using Tree Structures](https://arxiv.org/abs/2210.04217) | [code]\n    > We present a new method for estimating the Neural Reflectance Field (NReF) of an object from a set of posed multi-view images under unknown lighting. NReF represents 3D geometry and appearance of objects in a disentangled manner, and are hard to be estimated from images only. Our method solves this problem by exploiting the Neural Radiance Field (NeRF) as a proxy representation, from which we perform further decomposition. A high-quality NeRF decomposition relies on good geometry information extraction as well as good prior terms to properly resolve ambiguities between different components. To extract high-quality geometry information from radiance fields, we re-design a new ray-casting based method for surface point extraction. 
To efficiently compute and apply prior terms, we convert different prior terms into different types of filter operations on the surface extracted from radiance field. We then employ two types of auxiliary data structures, namely Gaussian KD-tree and octree, to support fast querying of surface points and efficient computation of surface filters during training. Based on this, we design a multi-stage decomposition optimization pipeline for estimating neural reflectance field from neural radiance fields. Extensive experiments show our method outperforms other state-of-the-art methods on different data, and enable high-quality free-view relighting as well as material editing tasks.\n## Oct2 - Oct8, 2022\n## Sep25 - Oct1, 2022\n  - [Neural Global Illumination: Interactive Indirect Illumination Prediction under Dynamic Area Lights, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9904431) | [code]\n    > We propose neural global illumination, a novel method for fast rendering full global illumination in static scenes with dynamic viewpoint and area lighting. The key idea of our method is to utilize a deep rendering network to model the complex mapping from each shading point to global illumination. To efficiently learn the mapping, we propose a neural-network-friendly input representation including attributes of each shading point, viewpoint information, and a combinational lighting representation that enables high-quality fitting with a compact neural network. To synthesize high-frequency global illumination effects, we transform the low-dimension input to higher-dimension space by positional encoding and model the rendering network as a deep fully-connected network. Besides, we feed a screen-space neural buffer to our rendering network to share global information between objects in the screen-space to each shading point. 
We have demonstrated our neural global illumination method in rendering a wide variety of scenes exhibiting complex and all-frequency global illumination effects such as multiple-bounce glossy interreflection, color bleeding, and caustics.\n## Sep18 - Sep24, 2022\n## Sep11 - Sep17, 2022\n  - [StructNeRF: Neural Radiance Fields for Indoor Scenes with Structural Hints](https://arxiv.org/abs/2209.05277) | [code]\n    > Neural Radiance Fields (NeRF) achieve photo-realistic view synthesis with densely captured input images. However, the geometry of NeRF is extremely under-constrained given sparse views, resulting in significant degradation of novel view synthesis quality. Inspired by self-supervised depth estimation methods, we propose StructNeRF, a solution to novel view synthesis for indoor scenes with sparse inputs. StructNeRF leverages the structural hints naturally embedded in multi-view inputs to handle the unconstrained geometry issue in NeRF. Specifically, it tackles the texture and non-texture regions respectively: a patch-based multi-view consistent photometric loss is proposed to constrain the geometry of textured regions; for non-textured ones, we explicitly restrict them to be 3D consistent planes. Through the dense self-supervised depth constraints, our method improves both the geometry and the view synthesis performance of NeRF without any additional training on external data. Extensive experiments on several real-world datasets demonstrate that StructNeRF surpasses state-of-the-art methods for indoor scenes with sparse inputs both quantitatively and qualitatively.\n## Sep4 - Sep10, 2022\n## Aug28 - Sep3, 2022\n  - [Cross-Spectral Neural Radiance Fields, 3DV2022](https://arxiv.org/abs/2209.00648) | [code]\n    > We propose X-NeRF, a novel method to learn a Cross-Spectral scene representation given images captured from cameras with different light spectrum sensitivity, based on the Neural Radiance Fields formulation. 
X-NeRF optimizes camera poses across spectra during training and exploits Normalized Cross-Device Coordinates (NXDC) to render images of different modalities from arbitrary viewpoints, which are aligned and at the same resolution. Experiments on 16 forward-facing scenes, featuring color, multi-spectral and infrared images, confirm the effectiveness of X-NeRF at modeling Cross-Spectral scene representations.\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n  - [Casual Indoor HDR Radiance Capture from Omnidirectional Images](https://arxiv.org/abs/2208.07903) | [code]\n    > We present PanoHDR-NeRF, a novel pipeline to casually capture a plausible full HDR radiance field of a large indoor scene without elaborate setups or complex capture protocols. First, a user captures a low dynamic range (LDR) omnidirectional video of the scene by freely waving an off-the-shelf camera around the scene. Then, an LDR2HDR network uplifts the captured LDR frames to HDR, subsequently used to train a tailored NeRF++ model. The resulting PanoHDR-NeRF pipeline can estimate full HDR panoramas from any location of the scene. Through experiments on a novel test dataset of a variety of real scenes with the ground truth HDR radiance captured at locations not seen during training, we show that PanoHDR-NeRF predicts plausible radiance from any scene point. We also show that the HDR images produced by PanoHDR-NeRF can synthesize correct lighting effects, enabling the augmentation of indoor scenes with synthetic objects that are lit correctly.\n  - [HDR-Plenoxels: Self-Calibrating High Dynamic Range Radiance Fields, ECCV2022](https://arxiv.org/abs/2208.06787) | [code]\n    > We propose high dynamic range radiance (HDR) fields, HDR-Plenoxels, that learn a plenoptic function of 3D HDR radiance fields, geometry information, and varying camera settings inherent in 2D low dynamic range (LDR) images. 
Our voxel-based volume rendering pipeline reconstructs HDR radiance fields with only multi-view LDR images taken from varying camera settings in an end-to-end manner and has a fast convergence speed. To deal with various cameras in real-world scenarios, we introduce a tone mapping module that models the digital in-camera imaging pipeline (ISP) and disentangles radiometric settings. Our tone mapping module allows us to render by controlling the radiometric settings of each novel view. Finally, we build a multi-view dataset with varying camera conditions, which fits our problem setting. Our experiments show that HDR-Plenoxels can express detail and high-quality HDR novel views from only LDR images with various cameras.\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n  - [Neural Radiance Transfer Fields for Relightable Novel-view Synthesis with Global Illumination](https://arxiv.org/abs/2207.13607) | [code]\n    > Given a set of images of a scene, the re-rendering of this scene from novel views and lighting conditions is an important and challenging problem in Computer Vision and Graphics. On the one hand, most existing works in Computer Vision usually impose many assumptions regarding the image formation process, e.g. direct illumination and predefined materials, to make scene parameter estimation tractable. On the other hand, mature Computer Graphics tools allow modeling of complex photo-realistic light transport given all the scene parameters. Combining these approaches, we propose a method for scene relighting under novel views by learning a neural precomputed radiance transfer function, which implicitly handles global illumination effects using novel environment maps. Our method can be solely supervised on a set of real images of the scene under a single unknown lighting condition. 
To disambiguate the task during training, we tightly integrate a differentiable path tracer in the training process and propose a combination of a synthesized OLAT and a real image loss. Results show that the recovered disentanglement of scene parameters improves significantly over the current state of the art and, thus, also our re-rendering results are more realistic and accurate.\n## Previous weeks\n  - [NeRF in the Wild: Neural Radiance Fields for Unconstrained Photo Collections, CVPR2021](https://arxiv.org/abs/2008.02268) | [code]\n    > We present a learning-based method for synthesizing novel views of complex scenes using only unstructured collections of in-the-wild photographs. We build on Neural Radiance Fields (NeRF), which uses the weights of a multilayer perceptron to model the density and color of a scene as a function of 3D coordinates. While NeRF works well on images of static subjects captured under controlled settings, it is incapable of modeling many ubiquitous, real-world phenomena in uncontrolled images, such as variable illumination or transient occluders. We introduce a series of extensions to NeRF to address these issues, thereby enabling accurate reconstructions from unstructured image collections taken from the internet. We apply our system, dubbed NeRF-W, to internet photo collections of famous landmarks, and demonstrate temporally consistent novel view renderings that are significantly closer to photorealism than the prior state of the art.\n  - [Ha-NeRF: Hallucinated Neural Radiance Fields in the Wild, CVPR2022](https://rover-xingyu.github.io/Ha-NeRF/) | [***``[code]``***](https://github.com/rover-xingyu/Ha-NeRF)\n    > Neural Radiance Fields (NeRF) has recently gained popularity for its impressive novel view synthesis ability. This paper studies the problem of hallucinated NeRF: i.e., recovering a realistic NeRF at a different time of day from a group of tourism images. 
Existing solutions adopt NeRF with a controllable appearance embedding to render novel views under various conditions, but they cannot render view-consistent images with an unseen appearance. To solve this problem, we present an end-to-end framework for constructing a hallucinated NeRF, dubbed as Ha-NeRF. Specifically, we propose an appearance hallucination module to handle time-varying appearances and transfer them to novel views. Considering the complex occlusions of tourism images, we introduce an anti-occlusion module to decompose the static subjects for visibility accurately. Experimental results on synthetic data and real tourism photo collections demonstrate that our method can hallucinate the desired appearances and render occlusion-free images from different views.\n  - [NeRF in the Dark: High Dynamic Range View Synthesis from Noisy Raw Images, CVPR2022(oral)](https://bmild.github.io/rawnerf/) | [***``[code]``***](https://github.com/google-research/multinerf)\n    > Neural Radiance Fields (NeRF) is a technique for high quality novel view synthesis from a collection of posed input images. Like most view synthesis methods, NeRF uses tonemapped low dynamic range (LDR) as input; these images have been processed by a lossy camera pipeline that smooths detail, clips highlights, and distorts the simple noise distribution of raw sensor data. We modify NeRF to instead train directly on linear raw images, preserving the scene's full dynamic range. By rendering raw output images from the resulting NeRF, we can perform novel high dynamic range (HDR) view synthesis tasks. In addition to changing the camera viewpoint, we can manipulate focus, exposure, and tonemapping after the fact. Although a single raw image appears significantly more noisy than a postprocessed one, we show that NeRF is highly robust to the zero-mean distribution of raw noise. 
When optimized over many noisy raw inputs (25-200), NeRF produces a scene representation so accurate that its rendered novel views outperform dedicated single and multi-image deep raw denoisers run on the same wide baseline input images. As a result, our method, which we call RawNeRF, can reconstruct scenes from extremely noisy images captured in near-darkness.\n  - [NeRV: Neural Reflectance and Visibility Fields for Relighting and View Synthesis, CVPR2021](https://pratulsrinivasan.github.io/nerv/) | [code]\n    > We present a method that takes as input a set of images of a scene illuminated by unconstrained known lighting, and produces as output a 3D representation that can be rendered from novel viewpoints under arbitrary lighting conditions. Our method represents the scene as a continuous volumetric function parameterized as MLPs whose inputs are a 3D location and whose outputs are the following scene properties at that input location: volume density, surface normal, material parameters, distance to the first surface intersection in any direction, and visibility of the external environment in any direction. Together, these allow us to render novel views of the object under arbitrary lighting, including indirect illumination effects. The predicted visibility and surface intersection fields are critical to our model's ability to simulate direct and indirect illumination during training, because the brute-force techniques used by prior work are intractable for lighting conditions outside of controlled setups with a single light. 
Our method outperforms alternative approaches for recovering relightable 3D scene representations, and performs well in complex lighting settings that have posed a significant challenge to prior work.\n  - [NeX: Real-time View Synthesis with Neural Basis Expansion, CVPR2021(oral)](https://nex-mpi.github.io/) | [***``[code]``***](https://github.com/nex-mpi/nex-code/)\n    > We present NeX, a new approach to novel view synthesis based on enhancements of multiplane image (MPI) that can reproduce NeXt-level view-dependent effects---in real time. Unlike traditional MPI that uses a set of simple RGBα planes, our technique models view-dependent effects by instead parameterizing each pixel as a linear combination of basis functions learned from a neural network. Moreover, we propose a hybrid implicit-explicit modeling strategy that improves upon fine detail and produces state-of-the-art results. Our method is evaluated on benchmark forward-facing datasets as well as our newly-introduced dataset designed to test the limit of view-dependent modeling with significantly more challenging effects such as the rainbow reflections on a CD. Our method achieves the best overall scores across all major metrics on these datasets with more than 1000× faster rendering time than the state of the art.\n  - [NeRFactor: Neural Factorization of Shape and Reflectance Under an Unknown Illumination, TOG 2021 (Proc. SIGGRAPH Asia)](https://xiuming.info/projects/nerfactor/) | [code]\n    > We address the problem of recovering the shape and spatially-varying reflectance of an object from multi-view images (and their camera poses) of an object illuminated by one unknown lighting condition. This enables the rendering of novel views of the object under arbitrary environment lighting and editing of the object's material properties. The key to our approach, which we call Neural Radiance Factorization (NeRFactor), is to distill the volumetric geometry of a Neural Radiance Field (NeRF) [Mildenhall et al. 
2020] representation of the object into a surface representation and then jointly refine the geometry while solving for the spatially-varying reflectance and environment lighting. Specifically, NeRFactor recovers 3D neural fields of surface normals, light visibility, albedo, and Bidirectional Reflectance Distribution Functions (BRDFs) without any supervision, using only a re-rendering loss, simple smoothness priors, and a data-driven BRDF prior learned from real-world BRDF measurements. By explicitly modeling light visibility, NeRFactor is able to separate shadows from albedo and synthesize realistic soft or hard shadows under arbitrary lighting conditions. NeRFactor is able to recover convincing 3D models for free-viewpoint relighting in this challenging and underconstrained capture setup for both synthetic and real scenes. Qualitative and quantitative experiments show that NeRFactor outperforms classic and deep learning-based state of the art across various tasks. Our videos, code, and data are available at people.csail.mit.edu/xiuming/projects/nerfactor/.\n  - [FiG-NeRF: Figure Ground Neural Radiance Fields for 3D Object Category Modelling, 3DV2021](https://fig-nerf.github.io/) | [code]\n    > We investigate the use of Neural Radiance Fields (NeRF) to learn high quality 3D object category models from collections of input images. In contrast to previous work, we are able to do this whilst simultaneously separating foreground objects from their varying backgrounds. We achieve this via a 2-component NeRF model, FiG-NeRF, that prefers explanation of the scene as a geometrically constant background and a deformable foreground that represents the object category. We show that this method can learn accurate 3D object category models using only photometric supervision and casually captured images of the objects. Additionally, our 2-part decomposition allows the model to perform accurate and crisp amodal segmentation. 
We quantitatively evaluate our method with view synthesis and image fidelity metrics, using synthetic, lab-captured, and in-the-wild data. Our results demonstrate convincing 3D object category modelling that exceed the performance of existing methods.\n  - [NerfingMVS: Guided Optimization of Neural Radiance Fields for Indoor Multi-view Stereo, ICCV2021(oral)](https://arxiv.org/abs/2109.01129) | [***``[code]``***](https://github.com/weiyithu/NerfingMVS)\n    > In this work, we present a new multi-view depth estimation method that utilizes both conventional SfM reconstruction and learning-based priors over the recently proposed neural radiance fields (NeRF). Unlike existing neural network based optimization method that relies on estimated correspondences, our method directly optimizes over implicit volumes, eliminating the challenging step of matching pixels in indoor scenes. The key to our approach is to utilize the learning-based priors to guide the optimization process of NeRF. Our system firstly adapts a monocular depth network over the target scene by finetuning on its sparse SfM reconstruction. Then, we show that the shape-radiance ambiguity of NeRF still exists in indoor environments and propose to address the issue by employing the adapted depth priors to monitor the sampling process of volume rendering. Finally, a per-pixel confidence map acquired by error computation on the rendered image can be used to further improve the depth quality. Experiments show that our proposed framework significantly outperforms state-of-the-art methods on indoor scenes, with surprising findings presented on the effectiveness of correspondence-based optimization and NeRF-based optimization over the adapted depth priors. In addition, we show that the guided optimization scheme does not sacrifice the original synthesis capability of neural radiance fields, improving the rendering quality on both seen and novel views.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/others.md",
    "content": "\nWeekly Classified Neural Radiance Fields - others ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n==================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n  - [Masked Wavelet Representation for Compact Neural Radiance Fields](https://arxiv.org/abs/2212.09069) | [***``[code]``***](https://github.com/daniel03c1/masked_wavelet_nerf)\n    > Neural radiance fields (NeRF) have demonstrated the potential of coordinate-based neural representation (neural fields or implicit neural representation) in neural rendering. However, using a multi-layer perceptron (MLP) to represent a 3D scene or object requires enormous computational resources and time. There have been recent studies on how to reduce these computational inefficiencies by using additional data structures, such as grids or trees. Despite the promising performance, the explicit data structure necessitates a substantial amount of memory. In this work, we present a method to reduce the size without compromising the advantages of having additional data structures. In detail, we propose using the wavelet transform on grid-based neural fields. Grid-based neural fields are for fast convergence, and the wavelet transform, whose efficiency has been demonstrated in high-performance standard codecs, is to improve the parameter efficiency of grids. 
Furthermore, in order to achieve a higher sparsity of grid coefficients while maintaining reconstruction quality, we present a novel trainable masking approach. Experimental results demonstrate that non-spatial grid coefficients, such as wavelet coefficients, are capable of attaining a higher level of sparsity than spatial grid coefficients, resulting in a more compact representation. With our proposed mask and compression pipeline, we achieved state-of-the-art performance within a memory budget of 2 MB. Our code is available at this https URL.\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n  - [4K-NeRF: High Fidelity Neural Radiance Fields at Ultra High Resolutions](https://arxiv.org/abs/2212.04701) | [***``[code]``***](https://github.com/frozoul/4K-NeRF)\n    > In this paper, we present a novel and effective framework, named 4K-NeRF, to pursue high fidelity view synthesis on the challenging scenarios of ultra high resolutions, building on the methodology of neural radiance fields (NeRF). The rendering procedure of NeRF-based methods typically relies on a pixel wise manner in which rays (or pixels) are treated independently on both training and inference phases, limiting its representational ability on describing subtle details especially when lifting to an extremely high resolution. We address the issue by better exploring ray correlation for enhancing high-frequency details benefiting from the use of geometry-aware local context. Particularly, we use the view-consistent encoder to model geometric information effectively in a lower resolution space and recover fine details through the view-consistent decoder, conditioned on ray features and depths estimated by the encoder. Joint training with patch-based sampling further facilitates our method incorporating the supervision from perception oriented regularization beyond pixel wise loss. 
Quantitative and qualitative comparisons with modern NeRF methods demonstrate that our method can significantly boost rendering quality for retaining high-frequency details, achieving the state-of-the-art visual quality on 4K ultra-high-resolution scenario. Code Available at \\url{this https URL}\n## Nov27 - Dec3, 2022\n  - [3D-TOGO: Towards Text-Guided Cross-Category 3D Object Generation, AAAI2023](https://arxiv.org/abs/2212.01103) | [code]\n    > Text-guided 3D object generation aims to generate 3D objects described by user-defined captions, which paves a flexible way to visualize what we imagined. Although some works have been devoted to solving this challenging task, these works either utilize some explicit 3D representations (e.g., mesh), which lack texture and require post-processing for rendering photo-realistic views; or require individual time-consuming optimization for every single case. Here, we make the first attempt to achieve generic text-guided cross-category 3D object generation via a new 3D-TOGO model, which integrates a text-to-views generation module and a views-to-3D generation module. The text-to-views generation module is designed to generate different views of the target 3D object given an input caption. prior-guidance, caption-guidance and view contrastive learning are proposed for achieving better view-consistency and caption similarity. Meanwhile, a pixelNeRF model is adopted for the views-to-3D generation module to obtain the implicit 3D neural representation from the previously-generated views. Our 3D-TOGO model generates 3D objects in the form of the neural radiance field with good texture and requires no time-cost optimization for every single caption. Besides, 3D-TOGO can control the category, color and shape of generated 3D objects with the input caption. 
Extensive experiments on the largest 3D object dataset (i.e., ABO) are conducted to verify that 3D-TOGO can better generate high-quality 3D objects according to the input captions across 98 different categories, in terms of PSNR, SSIM, LPIPS and CLIP-score, compared with text-NeRF and Dreamfields.\n  - [SinGRAF: Learning a 3D Generative Radiance Field for a Single Scene](https://arxiv.org/abs/2211.17260) | [code]\n    > Generative models have shown great promise in synthesizing photorealistic 3D objects, but they require large amounts of training data. We introduce SinGRAF, a 3D-aware generative model that is trained with a few input images of a single scene. Once trained, SinGRAF generates different realizations of this 3D scene that preserve the appearance of the input while varying scene layout. For this purpose, we build on recent progress in 3D GAN architectures and introduce a novel progressive-scale patch discrimination approach during training. With several experiments, we demonstrate that the results produced by SinGRAF outperform the closest related works in both quality and diversity by a large margin.\n  - [NeAF: Learning Neural Angle Fields for Point Normal Estimation, AAAI2023](https://arxiv.org/abs/2211.16869) | [***``[code]``***](https://github.com/lisj575/NeAF)\n    > Normal estimation for unstructured point clouds is an important task in 3D computer vision. Current methods achieve encouraging results by mapping local patches to normal vectors or learning local surface fitting using neural networks. However, these methods are not generalized well to unseen scenarios and are sensitive to parameter settings. To resolve these issues, we propose an implicit function to learn an angle field around the normal of each point in the spherical coordinate system, which is dubbed as Neural Angle Fields (NeAF). 
Instead of directly predicting the normal of an input point, we predict the angle offset between the ground truth normal and a randomly sampled query normal. This strategy pushes the network to observe more diverse samples, which leads to higher prediction accuracy in a more robust manner. To predict normals from the learned angle fields at inference time, we randomly sample query vectors in a unit spherical space and take the vectors with minimal angle values as the predicted normals. To further leverage the prior learned by NeAF, we propose to refine the predicted normal vectors by minimizing the angle offsets. The experimental results with synthetic data and real scans show significant improvements over the state-of-the-art under widely used benchmarks.\n  - [SNAF: Sparse-view CBCT Reconstruction with Neural Attenuation Fields](https://arxiv.org/abs/2211.17048) | [code]\n    > Cone beam computed tomography (CBCT) has been widely used in clinical practice, especially in dental clinics, while the radiation dose of X-rays when capturing has been a long concern in CBCT imaging. Several research works have been proposed to reconstruct high-quality CBCT images from sparse-view 2D projections, but the current state-of-the-arts suffer from artifacts and the lack of fine details. In this paper, we propose SNAF for sparse-view CBCT reconstruction by learning the neural attenuation fields, where we have invented a novel view augmentation strategy to overcome the challenges introduced by insufficient data from sparse input views. Our approach achieves superior performance in terms of high reconstruction quality (30+ PSNR) with only 20 input views (25 times fewer than clinical collections), which outperforms the state-of-the-arts. 
We have further conducted comprehensive experiments and ablation analysis to validate the effectiveness of our approach.\n  - [Reconstructing Hand-Held Objects from Monocular Video, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555401) | [code]\n    > This paper presents an approach that reconstructs a hand-held object from a monocular video. In contrast to many recent methods that directly predict object geometry by a trained network, the proposed approach does not require any learned prior about the object and is able to recover more accurate and detailed object geometry. The key idea is that the hand motion naturally provides multiple views of the object and the motion can be reliably estimated by a hand pose tracker. Then, the object geometry can be recovered by solving a multi-view reconstruction problem. We devise an implicit neural representation-based method to solve the reconstruction problem and address the issues of imprecise hand pose estimation, relative hand-object motion, and insufficient geometry optimization for small objects. We also provide a newly collected dataset with 3D ground truth to validate the proposed approach. The dataset and code will be released at https://dihuangdh.github.io/hhor.\n  - [A Light Touch Approach to Teaching Transformers Multi-view Geometry](https://arxiv.org/abs/2211.15107) | [code]\n    > Transformers are powerful visual learners, in large part due to their conspicuous lack of manually-specified priors. This flexibility can be problematic in tasks that involve multiple-view geometry, due to the near-infinite possible variations in 3D shapes and viewpoints (requiring flexibility), and the precise nature of projective geometry (obeying rigid laws). To resolve this conundrum, we propose a \"light touch\" approach, guiding visual Transformers to learn multiple-view geometry but allowing them to break free when needed. 
We achieve this by using epipolar lines to guide the Transformer's cross-attention maps, penalizing attention values outside the epipolar lines and encouraging higher attention along these lines since they contain geometrically plausible matches. Unlike previous methods, our proposal does not require any camera pose information at test-time. We focus on pose-invariant object instance retrieval, where standard Transformer networks struggle, due to the large differences in viewpoint between query and retrieved images. Experimentally, our method outperforms state-of-the-art approaches at object retrieval, without needing pose information at test-time.\n  - [Non-uniform Sampling Strategies for NeRF on 360° images , BMVC2022](https://arxiv.org/abs/2212.03635) | [code]\n    > In recent years, the performance of novel view synthesis using perspective images has dramatically improved with the advent of neural radiance fields (NeRF). This study proposes two novel techniques that effectively build NeRF for 360{\\textdegree} omnidirectional images. Due to the characteristics of a 360{\\textdegree} image of ERP format that has spatial distortion in their high latitude regions and a 360{\\textdegree} wide viewing angle, NeRF's general ray sampling strategy is ineffective. Hence, the view synthesis accuracy of NeRF is limited and learning is not efficient. We propose two non-uniform ray sampling schemes for NeRF to suit 360{\\textdegree} images - distortion-aware ray sampling and content-aware ray sampling. We created an evaluation dataset Synth360 using Replica and SceneCity models of indoor and outdoor scenes, respectively. In experiments, we show that our proposal successfully builds 360{\\textdegree} image NeRF in terms of both accuracy and efficiency. The proposal is widely applicable to advanced variants of NeRF. DietNeRF, AugNeRF, and NeRF++ combined with the proposed techniques further improve the performance. 
Moreover, we show that our proposed method enhances the quality of real-world scenes in 360{\\textdegree} images. Synth360: this https URL.\n## Nov20 - Nov26, 2022\n  - [Unsupervised Continual Semantic Adaptation through Neural Rendering](https://arxiv.org/abs/2211.13969) | [code]\n    > An increasing amount of applications rely on data-driven models that are deployed for perception tasks across a sequence of scenes. Due to the mismatch between training and deployment data, adapting the model on the new scenes is often crucial to obtain good performance. In this work, we study continual multi-scene adaptation for the task of semantic segmentation, assuming that no ground-truth labels are available during deployment and that performance on the previous scenes should be maintained. We propose training a Semantic-NeRF network for each scene by fusing the predictions of a segmentation model and then using the view-consistent rendered semantic labels as pseudo-labels to adapt the model. Through joint training with the segmentation model, the Semantic-NeRF model effectively enables 2D-3D knowledge transfer. Furthermore, due to its compact size, it can be stored in a long-term memory and subsequently used to render data from arbitrary viewpoints to reduce forgetting. We evaluate our approach on ScanNet, where we outperform both a voxel-based baseline and a state-of-the-art unsupervised domain adaptation method.\n  - [DiffusionSDF: Conditional Generative Modeling of Signed Distance Functions](https://arxiv.org/abs/2211.13757) | [code]\n    > Probabilistic diffusion models have achieved state-of-the-art results for image synthesis, inpainting, and text-to-image tasks. However, they are still in the early stages of generating complex 3D shapes. This work proposes DiffusionSDF, a generative model for shape completion, single-view reconstruction, and reconstruction of real-scanned point clouds. 
We use neural signed distance functions (SDFs) as our 3D representation to parameterize the geometry of various signals (e.g., point clouds, 2D images) through neural networks. Neural SDFs are implicit functions and diffusing them amounts to learning the reversal of their neural network weights, which we solve using a custom modulation module. Extensive experiments show that our method is capable of both realistic unconditional generation and conditional generation from partial inputs. This work expands the domain of diffusion models from learning 2D, explicit representations, to 3D, implicit representations.\n  - [BAD-NeRF: Bundle Adjusted Deblur Neural Radiance Fields](https://arxiv.org/abs/2211.12853) | [code]\n    > Neural Radiance Fields (NeRF) have received considerable attention recently, due to its impressive capability in photo-realistic 3D reconstruction and novel view synthesis, given a set of posed camera images. Earlier work usually assumes the input images are in good quality. However, image degradation (e.g. image motion blur in low-light conditions) can easily happen in real-world scenarios, which would further affect the rendering quality of NeRF. In this paper, we present a novel bundle adjusted deblur Neural Radiance Fields (BAD-NeRF), which can be robust to severe motion blurred images and inaccurate camera poses. Our approach models the physical image formation process of a motion blurred image, and jointly learns the parameters of NeRF and recovers the camera motion trajectories during exposure time. 
In experiments, we show that by directly modeling the real physical image formation process, BAD-NeRF achieves superior performance over prior works on both synthetic and real datasets.\n  - [OReX: Object Reconstruction from Planar Cross-sections Using Neural Fields](https://arxiv.org/abs/2211.12886) | [code]\n    > Reconstructing 3D shapes from planar cross-sections is a challenge inspired by downstream applications like medical imaging and geographic informatics. The input is an in/out indicator function fully defined on a sparse collection of planes in space, and the output is an interpolation of the indicator function to the entire volume. Previous works addressing this sparse and ill-posed problem either produce low quality results, or rely on additional priors such as target topology, appearance information, or input normal directions. In this paper, we present OReX, a method for 3D shape reconstruction from slices alone, featuring a Neural Field as the interpolation prior. A simple neural network is trained on the input planes to receive a 3D coordinate and return an inside/outside estimate for the query point. This prior is powerful in inducing smoothness and self-similarities. The main challenge for this approach is high-frequency details, as the neural prior is overly smoothing. To alleviate this, we offer an iterative estimation architecture and a hierarchical input sampling scheme that encourage coarse-to-fine training, allowing focusing on high frequencies at later stages. In addition, we identify and analyze a common ripple-like effect stemming from the mesh extraction step. 
We mitigate it by regularizing the spatial gradients of the indicator function around input in/out boundaries, cutting the problem at the root.\n## Nov13 - Nov19, 2022\n  - [Real-Time Omnidirectional Roaming in Large Scale Indoor Scenes, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550340.3564222) | [code]\n    > Neural radiance field (NeRF) has recently achieved impressive results in novel view synthesis. However, previous works on NeRF mainly focus on object-centric scenarios. They would suffer observable performance degradation in outward-facing and large-scale scenes due to limiting positional encoding capacity. To narrow the gap, we explore radiance fields in a geometry-aware fashion. We estimate explicit geometry from the omnidirectional neural radiance field that was learned from multiple 360° images. Relying on the recovered geometry, we use an adaptive divide-and-conquer strategy to slim and fine-tune the radiance fields and further improve render speed and quality. Quantitative and qualitative comparisons among baselines illustrated our predominant performance in large-scale indoor scenes and our system supports real-time VR roaming.\n  - [AligNeRF: High-Fidelity Neural Radiance Fields via Alignment-Aware Training](https://arxiv.org/abs/2211.09682) | [code]\n    > Neural Radiance Fields (NeRFs) are a powerful representation for modeling a 3D scene as a continuous function. Though NeRF is able to render complex 3D scenes with view-dependent effects, few efforts have been devoted to exploring its limits in a high-resolution setting. Specifically, existing NeRF-based methods face several limitations when reconstructing high-resolution real scenes, including a very large number of parameters, misaligned input data, and overly smooth details. 
In this work, we conduct the first pilot study on training NeRF with high-resolution data and propose the corresponding solutions: 1) marrying the multilayer perceptron (MLP) with convolutional layers which can encode more neighborhood information while reducing the total number of parameters; 2) a novel training strategy to address misalignment caused by moving objects or small camera calibration errors; and 3) a high-frequency aware loss. Our approach is nearly free without introducing obvious training/testing costs, while experiments on different datasets demonstrate that it can recover more high-frequency details compared with the current state-of-the-art NeRF models. Project page: \\url{this https URL.}\n  - [3DLatNav: Navigating Generative Latent Spaces for Semantic-Aware 3D Object Manipulation](https://arxiv.org/abs/2211.09770) | [code]\n    > 3D generative models have been recently successful in generating realistic 3D objects in the form of point clouds. However, most models do not offer controllability to manipulate the shape semantics of component object parts without extensive semantic attribute labels or other reference point clouds. Moreover, beyond the ability to perform simple latent vector arithmetic or interpolations, there is a lack of understanding of how part-level semantics of 3D shapes are encoded in their corresponding generative latent spaces. In this paper, we propose 3DLatNav; a novel approach to navigating pretrained generative latent spaces to enable controlled part-level semantic manipulation of 3D objects. First, we propose a part-level weakly-supervised shape semantics identification mechanism using latent representations of 3D shapes. Then, we transfer that knowledge to a pretrained 3D object generative latent space to unravel disentangled embeddings to represent different shape semantics of component parts of an object in the form of linear subspaces, despite the unavailability of part-level labels during the training. 
Finally, we utilize those identified subspaces to show that controllable 3D object part manipulation can be achieved by applying the proposed framework to any pretrained 3D generative model. With two novel quantitative metrics to evaluate the consistency and localization accuracy of part-level manipulations, we show that 3DLatNav outperforms existing unsupervised latent disentanglement methods in identifying latent directions that encode part-level shape semantics of 3D objects. With multiple ablation studies and testing on state-of-the-art generative models, we show that 3DLatNav can implement controlled part-level semantic manipulations on an input point cloud while preserving other features and the realistic nature of the object.\n  - [AsyncNeRF: Learning Large-scale Radiance Fields from Asynchronous RGB-D Sequences with Time-Pose Function](https://arxiv.org/abs/2211.07459) | [code]\n    > Large-scale radiance fields are promising mapping tools for smart transportation applications like autonomous driving or drone delivery. But for large-scale scenes, compact synchronized RGB-D cameras are not applicable due to limited sensing range, and using separate RGB and depth sensors inevitably leads to unsynchronized sequences. Inspired by the recent success of self-calibrating radiance field training methods that do not require known intrinsic or extrinsic parameters, we propose the first solution that self-calibrates the mismatch between RGB and depth frames. We leverage the important domain-specific fact that RGB and depth frames are actually sampled from the same trajectory and develop a novel implicit network called the time-pose function. Combining it with a large-scale radiance field leads to an architecture that cascades two implicit representation networks. To validate its effectiveness, we construct a diverse and photorealistic dataset that covers various RGB-D mismatch scenarios. 
Through a comprehensive benchmarking on this dataset, we demonstrate the flexibility of our method in different scenarios and superior performance over applicable prior counterparts. Codes, data, and models will be made publicly available.\n## Nov6 - Nov12, 2022\n  - [NeXT: Towards High Quality Neural Radiance Fields via Multi-skip Transformer, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19824-3_5) | [***``[code]``***](https://github.com/Crishawy/NeXT)\n    > Neural Radiance Fields (NeRF) methods show impressive performance for novel view synthesis by representing a scene via a neural network. However, most existing NeRF based methods, including its variants, treat each sample point individually as input, while ignoring the inherent relationships between adjacent sample points from the corresponding rays, thus hindering the reconstruction performance. To address this issue, we explore a brand new scheme, namely NeXT, introducing a multi-skip transformer to capture the rich relationships between various sample points in a ray-level query. Specifically, ray tokenization is proposed to represent each ray as a sequence of point embeddings which is taken as input of our proposed NeXT. In this way, relationships between sample points are captured via the built-in self-attention mechanism to promote the reconstruction. Besides, our proposed NeXT can be easily combined with other NeRF based methods to improve their rendering quality. Extensive experiments conducted on three datasets demonstrate that NeXT significantly outperforms all previous state-of-the-art work by a large margin. In particular, the proposed NeXT surpasses the strong NeRF baseline by 2.74 dB of PSNR on Blender dataset. 
The code is available at https://github.com/Crishawy/NeXT.\n  - [QRF: Implicit Neural Representations with Quantum Radiance Fields](https://arxiv.org/abs/2211.03418) | [code]\n    > Photorealistic rendering of real-world scenes is a tremendous challenge with a wide range of applications, including mixed reality (MR), and virtual reality (VR). Neural networks, which have long been investigated in the context of solving differential equations, have previously been introduced as implicit representations for photorealistic rendering. However, realistic rendering using classic computing is challenging because it requires time-consuming optical ray marching, and suffer computational bottlenecks due to the curse of dimensionality. In this paper, we propose Quantum Radiance Fields (QRF), which integrate the quantum circuit, quantum activation function, and quantum volume rendering for implicit scene representation. The results indicate that QRF not only exploits the advantage of quantum computing, such as high speed, fast convergence, and high parallelism, but also ensure high quality of volume rendering.\n## Oct30 - Nov5, 2022\n  - [HyperSound: Generating Implicit Neural Representations of Audio Signals with Hypernetworks](https://arxiv.org/abs/2211.01839) | [code]\n    > Implicit neural representations (INRs) are a rapidly growing research field, which provides alternative ways to represent multimedia signals. Recent applications of INRs include image super-resolution, compression of high-dimensional signals, or 3D rendering. However, these solutions usually focus on visual data, and adapting them to the audio domain is not trivial. Moreover, it requires a separately trained model for every data sample. To address this limitation, we propose HyperSound, a meta-learning method leveraging hypernetworks to produce INRs for audio signals unseen at training time. 
We show that our approach can reconstruct sound waves with quality comparable to other state-of-the-art models.\n  - [Attention-based Neural Cellular Automata, NeurIPS2022](https://arxiv.org/abs/2211.01233) | [code]\n    > Recent extensions of Cellular Automata (CA) have incorporated key ideas from modern deep learning, dramatically extending their capabilities and catalyzing a new family of Neural Cellular Automata (NCA) techniques. Inspired by Transformer-based architectures, our work presents a new class of attention-based NCAs formed using a spatially localized—yet globally organized—self-attention scheme. We introduce an instance of this class named Vision Transformer Cellular Automata (ViTCA). We present quantitative and qualitative results on denoising autoencoding across six benchmark datasets, comparing ViTCA to a U-Net, a U-Net-based CA baseline (UNetCA), and a Vision Transformer (ViT). When comparing across architectures configured to similar parameter complexity, ViTCA architectures yield superior performance across all benchmarks and for nearly every evaluation metric. We present an ablation study on various architectural configurations of ViTCA, an analysis of its effect on cell states, and an investigation on its inductive biases. Finally, we examine its learned representations via linear probes on its converged cell state hidden representations, yielding, on average, superior results when compared to our U-Net, ViT, and UNetCA baselines.\n## Oct23 - Oct29, 2022\n  - [NeX360: Real-time All-around View Synthesis with Neural Basis Expansion, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9931981) | [code]\n    > We present NeX, a new approach to novel view synthesis based on enhancements of multiplane images (MPI) that can reproduce view-dependent effects in real time. 
Unlike traditional MPI, our technique parameterizes each pixel as a linear combination of spherical basis functions learned from a neural network to model view-dependent effects and uses a hybrid implicit-explicit modeling strategy to improve fine detail. Moreover, we also present an extension to NeX, which leverages knowledge distillation to train multiple MPIs for unbounded 360 ∘ scenes. Our method is evaluated on several benchmark datasets: NeRF-Synthetic dataset, Light Field dataset, Real Forward-Facing dataset, Space dataset, as well as Shiny , our new dataset that contains significantly more challenging view-dependent effects, such as the rainbow reflections on the CD. Our method outperforms other real-time rendering approaches on PSNR, SSIM, and LPIPS and can render unbounded 360 ∘ scenes in real time.\n  - [NeRFPlayer: A Streamable Dynamic Scene Representation with Decomposed Neural Radiance Fields](https://arxiv.org/abs/2210.15947) | [code]\n    > Visually exploring in a real-world 4D spatiotemporal space freely in VR has been a long-term quest. The task is especially appealing when only a few or even single RGB cameras are used for capturing the dynamic scene. To this end, we present an efficient framework capable of fast reconstruction, compact modeling, and streamable rendering. First, we propose to decompose the 4D spatiotemporal space according to temporal characteristics. Points in the 4D space are associated with probabilities of belonging to three categories: static, deforming, and new areas. Each area is represented and regularized by a separate neural field. Second, we propose a hybrid representations based feature streaming scheme for efficiently modeling the neural fields. 
Our approach, coined NeRFPlayer, is evaluated on dynamic scenes captured by single hand-held cameras and multi-camera arrays, achieving comparable or superior rendering performance in terms of quality and speed comparable to recent state-of-the-art methods, achieving reconstruction in 10 seconds per frame and real-time rendering.\n  - [Vox-Fusion: Dense Tracking and Mapping with Voxel-based Neural Implicit Representation](https://arxiv.org/abs/2210.15858) | [***``[code]``***](https://github.com/zju3dv/Vox-Fusion)\n    > In this work, we present a dense tracking and mapping system named Vox-Fusion, which seamlessly fuses neural implicit representations with traditional volumetric fusion methods. Our approach is inspired by the recently developed implicit mapping and positioning system and further extends the idea so that it can be freely applied to practical scenarios. Specifically, we leverage a voxel-based neural implicit surface representation to encode and optimize the scene inside each voxel. Furthermore, we adopt an octree-based structure to divide the scene and support dynamic expansion, enabling our system to track and map arbitrary scenes without knowing the environment like in previous works. Moreover, we proposed a high-performance multi-process framework to speed up the method, thus supporting some applications that require real-time performance. The evaluation results show that our methods can achieve better accuracy and completeness than previous methods. We also show that our Vox-Fusion can be used in augmented reality and virtual reality applications. Our source code is publicly available at this https URL.\n## Oct16 - Oct22, 2022\n  - [Compressing multidimensional weather and climate data into neural networks](https://arxiv.org/abs/2210.12538) | [code]\n    > Weather and climate simulations produce petabytes of high-resolution data that are later analyzed by researchers in order to understand climate change or severe weather. 
We propose a new method of compressing this multidimensional weather and climate data: a coordinate-based neural network is trained to overfit the data, and the resulting parameters are taken as a compact representation of the original grid-based data. While compression ratios range from 300x to more than 3,000x, our method outperforms the state-of-the-art compressor SZ3 in terms of weighted RMSE, MAE. It can faithfully preserve important large scale atmosphere structures and does not introduce artifacts. When using the resulting neural network as a 790x compressed dataloader to train the WeatherBench forecasting model, its RMSE increases by less than 2%. The three orders of magnitude compression democratizes access to high-resolution climate data and enables numerous new research directions.\n  - [Neural Sound Field Decomposition with Super-resolution of Sound Direction](https://arxiv.org/abs/2210.12345) | [code]\n    > Sound field decomposition predicts waveforms in arbitrary directions using signals from a limited number of microphones as inputs. Sound field decomposition is fundamental to downstream tasks, including source localization, source separation, and spatial audio reproduction. Conventional sound field decomposition methods such as Ambisonics have limited spatial decomposition resolution. This paper proposes a learning-based Neural Sound field Decomposition (NeSD) framework to allow sound field decomposition with fine spatial direction resolution, using recordings from microphone capsules of a few microphones at arbitrary positions. The inputs of a NeSD system include microphone signals, microphone positions, and queried directions. The outputs of a NeSD include the waveform and the presence probability of a queried position. We model the NeSD systems respectively with different neural networks, including fully connected, time delay, and recurrent neural networks. 
We show that the NeSD systems outperform conventional Ambisonics and DOANet methods in sound field decomposition and source localization on speech, music, and sound events datasets. Demos are available at this https URL.\n## Oct9 - Oct15, 2022\n  - [Continuous conditional video synthesis by neural processes](https://arxiv.org/abs/2210.05810) | [***``[code]``***](https://github.com/NPVS/NPVS)\n    > We propose a unified model for multiple conditional video synthesis tasks, including video prediction and video frame interpolation. We show that conditional video synthesis can be formulated as a neural process, which maps input spatio-temporal coordinates to target pixel values given context spatio-temporal coordinates and pixel values. Specifically, we feed an implicit neural representation of coordinates into a Transformer-based non-autoregressive conditional video synthesis model. Our task-specific models outperform previous work for video interpolation on multiple datasets and reach a competitive performance with the state-of-the-art models for video prediction. Importantly, the model is able to interpolate or predict with an arbitrarily high frame rate, i.e., continuous synthesis. Our source code is available at this https URL.\n  - [Geometric Warping Error Aware CNN for DIBR Oriented View Synthesis, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547946) | [code]\n    > Depth Image based Rendering (DIBR) oriented view synthesis is an important virtual view generation technique. It warps the reference view images to the target viewpoint based on their depth maps, without requiring many available viewpoints. However, in the 3D warping process, pixels are warped to fractional pixel locations and then rounded (or interpolated) to integer pixels, resulting in geometric warping error and reducing the image quality. This resembles, to some extent, the image super-resolution problem, but with unfixed fractional pixel locations. 
To address this problem, we propose a geometric warping error aware CNN (GWEA) framework to enhance the DIBR oriented view synthesis. First, a deformable convolution based geometric warping error aware alignment (GWEA-DCA) module is developed, by taking advantage of the geometric warping error preserved in the DIBR module. The offset learned in the deformable convolution can account for the geometric warping error to facilitate the mapping from the fractional pixels to integer pixels. Moreover, in view that the pixels in the warped images are of different qualities due to the different strengths of warping errors, an attention enhanced view blending (GWEA-AttVB) module is further developed to adaptively fuse the pixels from different warped images. Finally, a partial convolution based hole filling and refinement module fills the remaining holes and improves the quality of the overall image. Experiments show that our model can synthesize higher-quality images than the existing methods, and ablation study is also conducted, validating the effectiveness of each proposed module.\n## Oct2 - Oct8, 2022\n  - [ViewFool: Evaluating the Robustness of Visual Recognition to Adversarial Viewpoints, NeurIPS2022](https://arxiv.org/abs/2210.03895) | [code]\n    > Recent studies have demonstrated that visual recognition models lack robustness to distribution shift. However, current work mainly considers model robustness to 2D image transformations, leaving viewpoint changes in the 3D world less explored. In general, viewpoint changes are prevalent in various real-world applications (e.g., autonomous driving), making it imperative to evaluate viewpoint robustness. In this paper, we propose a novel method called ViewFool to find adversarial viewpoints that mislead visual recognition models. 
By encoding real-world objects as neural radiance fields (NeRF), ViewFool characterizes a distribution of diverse adversarial viewpoints under an entropic regularizer, which helps to handle the fluctuations of the real camera pose and mitigate the reality gap between the real objects and their neural representations. Experiments validate that the common image classifiers are extremely vulnerable to the generated adversarial viewpoints, which also exhibit high cross-model transferability. Based on ViewFool, we introduce ImageNet-V, a new out-of-distribution dataset for benchmarking viewpoint robustness of image classifiers. Evaluation results on 40 classifiers with diverse architectures, objective functions, and data augmentations reveal a significant drop in model performance when tested on ImageNet-V, which provides a possibility to leverage ViewFool as an effective data augmentation strategy to improve viewpoint robustness.\n  - [Novel View Synthesis for Surgical Recording](https://link.springer.com/chapter/10.1007/978-3-031-18576-2_7) | [code]\n    > Recording surgery in operating rooms is one of the essential tasks for education and evaluation of medical treatment. However, recording the fields which depict the surgery is difficult because the targets are heavily occluded during surgery by the heads or hands of doctors or nurses. We use a recording system in which multiple cameras are embedded in the surgical lamp, assuming that at least one camera is recording the target without occlusion. In this paper, we propose Conditional-BARF (C-BARF) to generate occlusion-free images by synthesizing novel view images from the camera, aiming to generate videos with smooth camera pose transitions. To the best of our knowledge, this is the first work to tackle the problem of synthesizing a novel view image from multiple images for the surgery scene. We conduct experiments using an original dataset of three different types of surgeries. 
Our experiments show that we can successfully synthesize novel views from the images recorded by the multiple cameras embedded in the surgical lamp.\n  - [Differentiable Raycasting for Self-supervised Occupancy Forecasting, ECCV2022](https://arxiv.org/abs/2210.01917) | [***``[code]``***](https://github.com/tarashakhurana/emergent-occ-forecasting)\n    > Motion planning for safe autonomous driving requires learning how the environment around an ego-vehicle evolves with time. Ego-centric perception of driveable regions in a scene not only changes with the motion of actors in the environment, but also with the movement of the ego-vehicle itself. Self-supervised representations proposed for large-scale planning, such as ego-centric freespace, confound these two motions, making the representation difficult to use for downstream motion planners. In this paper, we use geometric occupancy as a natural alternative to view-dependent representations such as freespace. Occupancy maps naturally disentangle the motion of the environment from the motion of the ego-vehicle. However, one cannot directly observe the full 3D occupancy of a scene (due to occlusion), making it difficult to use as a signal for learning. Our key insight is to use differentiable raycasting to \"render\" future occupancy predictions into future LiDAR sweep predictions, which can be compared with ground-truth sweeps for self-supervised learning. The use of differentiable raycasting allows occupancy to emerge as an internal representation within the forecasting network. In the absence of groundtruth occupancy, we quantitatively evaluate the forecasting of raycasted LiDAR sweeps and show improvements of up to 15 F1 points. 
For downstream motion planners, where emergent occupancy can be directly used to guide non-driveable regions, this representation relatively reduces the number of collisions with objects by up to 17% as compared to freespace-centric motion planners.\n  - [Self-improving Multiplane-to-layer Images for Novel View Synthesis, WACV2023](https://samsunglabs.github.io/MLI/) | [***``[code]``***](https://github.com/SamsungLabs/MLI)\n    > We present a new method for lightweight novel-view synthesis that generalizes to an arbitrary forward-facing scene. Recent approaches are computationally expensive, require per-scene optimization, or produce a memory-expensive representation. We start by representing the scene with a set of fronto-parallel semitransparent planes and afterward convert them to deformable layers in an end-to-end manner. Additionally, we employ a feed-forward refinement procedure that corrects the estimated representation by aggregating information from input views. Our method does not require fine-tuning when a new scene is processed and can handle an arbitrary number of views without restrictions. Experimental results show that our approach surpasses recent models in terms of common metrics and human evaluation, with the noticeable advantage in inference speed and compactness of the inferred layered geometry, see this https URL\n  - [NARF22: Neural Articulated Radiance Fields for Configuration-Aware Rendering, IROS2022](https://progress.eecs.umich.edu/projects/narf/) | [code]\n    > Articulated objects pose a unique challenge for robotic perception and manipulation. Their increased number of degrees-of-freedom makes tasks such as localization computationally difficult, while also making the process of real-world dataset collection unscalable. 
With the aim of addressing these scalability issues, we propose Neural Articulated Radiance Fields (NARF22), a pipeline which uses a fully-differentiable, configuration-parameterized Neural Radiance Field (NeRF) as a means of providing high quality renderings of articulated objects. NARF22 requires no explicit knowledge of the object structure at inference time. We propose a two-stage parts-based training mechanism which allows the object rendering models to generalize well across the configuration space even if the underlying training data has as few as one configuration represented. We demonstrate the efficacy of NARF22 by training configurable renderers on a real-world articulated tool dataset collected via a Fetch mobile manipulation robot. We show the applicability of the model to gradient-based inference methods through a configuration estimation and 6 degree-of-freedom pose refinement task. The project webpage is available at: this https URL.\n  - [SinGRAV: Learning a Generative Radiance Volume from a Single Natural Scene](https://arxiv.org/abs/2210.01202) | [code]\n    > We present a 3D generative model for general natural scenes. Lacking necessary volumes of 3D data characterizing the target scene, we propose to learn from a single scene. Our key insight is that a natural scene often contains multiple constituents whose geometry, texture, and spatial arrangements follow some clear patterns, but still exhibit rich variations over different regions within the same scene. This suggests localizing the learning of a generative model on substantial local regions. Hence, we exploit a multi-scale convolutional network, which possesses the spatial locality bias in nature, to learn from the statistics of local regions at multiple scales within a single scene. In contrast to existing methods, our learning setup bypasses the need to collect data from many homogeneous 3D scenes for learning common features. 
We coin our method SinGRAV, for learning a Generative RAdiance Volume from a Single natural scene. We demonstrate the ability of SinGRAV in generating plausible and diverse variations from a single scene, the merits of SinGRAV over state-of-the-art generative neural scene methods, as well as the versatility of SinGRAV by its use in a variety of applications, spanning 3D scene editing, composition, and animation. Code and data will be released to facilitate further research.\n  - [IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable Novel View Synthesis](https://arxiv.org/abs/2210.00647) | [***``[code]``***](https://github.com/zju3dv/IntrinsicNeRF)\n    > We present intrinsic neural radiance fields, dubbed IntrinsicNeRF, that introduce intrinsic decomposition into the NeRF-based~\\cite{mildenhall2020nerf} neural rendering method and can perform editable novel view synthesis in room-scale scenes while existing inverse rendering combined with neural rendering methods~\\cite{zhang2021physg, zhang2022modeling} can only work on object-specific scenes. Given that intrinsic decomposition is a fundamentally ambiguous and under-constrained inverse problem, we propose a novel distance-aware point sampling and adaptive reflectance iterative clustering optimization method that enables IntrinsicNeRF with traditional intrinsic decomposition constraints to be trained in an unsupervised manner, resulting in temporally consistent intrinsic decomposition results. To cope with the problem of different adjacent instances of similar reflectance in a scene being incorrectly clustered together, we further propose a hierarchical clustering method with coarse-to-fine optimization to obtain a fast hierarchical indexing representation. It enables compelling real-time augmented reality applications such as scene recoloring, material editing, and illumination variation. 
Extensive experiments on Blender Object and Replica Scene demonstrate that we can obtain high-quality, consistent intrinsic decomposition results and high-fidelity novel view synthesis even for challenging sequences. Code and data are available on the project webpage: this https URL.\n## Sep25 - Oct1, 2022\n  - [SCI: A spectrum concentrated implicit neural compression for biomedical data](https://arxiv.org/abs/2209.15180) | [code]\n    > Massive collection and explosive growth of the huge amount of medical data, demands effective compression for efficient storage, transmission and sharing. Readily available visual data compression techniques have been studied extensively but tailored for nature images/videos, and thus show limited performance on medical data which are of different characteristics. Emerging implicit neural representation (INR) is gaining momentum and demonstrates high promise for fitting diverse visual data in target-data-specific manner, but a general compression scheme covering diverse medical data is so far absent. To address this issue, we firstly derive a mathematical explanation for INR's spectrum concentration property and an analytical insight on the design of compression-oriented INR architecture. Further, we design a funnel shaped neural network capable of covering broad spectrum of complex medical data and achieving high compression ratio. Based on this design, we conduct compression via optimization under given budget and propose an adaptive compression approach SCI, which adaptively partitions the target data into blocks matching the concentrated spectrum envelop of the adopted INR, and allocates parameter with high representation accuracy under given compression ratio. 
The experiments show SCI's superior performance over conventional techniques and wide applicability across diverse medical data.\n  - [Distilling Style from Image Pairs for Global Forward and Inverse Tone Mapping, CVMP2022](https://arxiv.org/abs/2209.15165) | [code]\n    > Many image enhancement or editing operations, such as forward and inverse tone mapping or color grading, do not have a unique solution, but instead a range of solutions, each representing a different style. Despite this, existing learning-based methods attempt to learn a unique mapping, disregarding this style. In this work, we show that information about the style can be distilled from collections of image pairs and encoded into a 2- or 3-dimensional vector. This gives us not only an efficient representation but also an interpretable latent space for editing the image style. We represent the global color mapping between a pair of images as a custom normalizing flow, conditioned on a polynomial basis of the pixel color. We show that such a network is more effective than PCA or VAE at encoding image style in low-dimensional space and lets us obtain an accuracy close to 40 dB, which is about 7-10 dB improvement over the state-of-the-art methods.\n  - [Towards Multi-spatiotemporal-scale Generalized PDE Modeling](https://arxiv.org/abs/2209.15616) | [code]\n    > Partial differential equations (PDEs) are central to describing complex physical system simulations. Their expensive solution techniques have led to an increased interest in deep neural network based surrogates. However, the practical utility of training such surrogates is contingent on their ability to model complex multi-scale spatio-temporal phenomena. 
Various neural network architectures have been proposed to target such phenomena, most notably Fourier Neural Operators (FNOs) which give a natural handle over local \\& global spatial information via parameterization of different Fourier modes, and U-Nets which treat local and global information via downsampling and upsampling paths. However, generalizing across different equation parameters or different time-scales still remains a challenge. In this work, we make a comprehensive comparison between various FNO and U-Net like approaches on fluid mechanics problems in both vorticity-stream and velocity function form. For U-Nets, we transfer recent architectural improvements from computer vision, most notably from object segmentation and generative modeling. We further analyze the design considerations for using FNO layers to improve performance of U-Net architectures without major degradation of computational performance. Finally, we show promising results on generalization to different PDE parameters and time-scales with a single surrogate model.\n  - [Implicit Neural Spatial Representations for Time-dependent PDEs](https://arxiv.org/abs/2210.00124) | [code]\n    > Numerically solving partial differential equations (PDEs) often entails spatial and temporal discretizations. Traditional methods (e.g., finite difference, finite element, smoothed-particle hydrodynamics) frequently adopt explicit spatial discretizations, such as grids, meshes, and point clouds, where each degree-of-freedom corresponds to a location in space. While these explicit spatial correspondences are intuitive to model and understand, these representations are not necessarily optimal for accuracy, memory-usage, or adaptivity. In this work, we explore implicit neural representation as an alternative spatial discretization, where spatial information is implicitly stored in the neural network weights. 
With implicit neural spatial representation, PDE-constrained time-stepping translates into updating neural network weights, which naturally integrates with commonly adopted optimization time integrators. We validate our approach on a variety of classic PDEs with examples involving large elastic deformations, turbulent fluids, and multiscale phenomena. While slower to compute than traditional representations, our approach exhibits higher accuracy, lower memory consumption, and dynamically adaptive allocation of degrees of freedom without complex remeshing.\n  - [Continuous PDE Dynamics Forecasting with Implicit Neural Representations](https://arxiv.org/abs/2209.14855) | [code]\n    > Effective data-driven PDE forecasting methods often rely on fixed spatial and / or temporal discretizations. This raises limitations in real-world applications like weather prediction where flexible extrapolation at arbitrary spatiotemporal locations is required. We address this problem by introducing a new data-driven approach, DINo, that models a PDE's flow with continuous-time dynamics of spatially continuous functions. This is achieved by embedding spatial observations independently of their discretization via Implicit Neural Representations in a small latent space temporally driven by a learned ODE. This separate and flexible treatment of time and space makes DINo the first data-driven model to combine the following advantages. It extrapolates at arbitrary spatial and temporal locations; it can learn from sparse irregular grids or manifolds; at test time, it generalizes to new grids or resolutions. 
DINo outperforms alternative neural PDE forecasters in a variety of challenging generalization scenarios on representative PDE systems.\n  - [Towards General-Purpose Representation Learning of Polygonal Geometries, GeoInformatica](https://arxiv.org/abs/2209.15458) | [code]\n    > Neural network representation learning for spatial data is a common need for geographic artificial intelligence (GeoAI) problems. In recent years, many advancements have been made in representation learning for points, polylines, and networks, whereas little progress has been made for polygons, especially complex polygonal geometries. In this work, we focus on developing a general-purpose polygon encoding model, which can encode a polygonal geometry (with or without holes, single or multipolygons) into an embedding space. The result embeddings can be leveraged directly (or finetuned) for downstream tasks such as shape classification, spatial relation prediction, and so on. To achieve model generalizability guarantees, we identify a few desirable properties: loop origin invariance, trivial vertex invariance, part permutation invariance, and topology awareness. We explore two different designs for the encoder: one derives all representations in the spatial domain; the other leverages spectral domain representations. For the spatial domain approach, we propose ResNet1D, a 1D CNN-based polygon encoder, which uses circular padding to achieve loop origin invariance on simple polygons. For the spectral domain approach, we develop NUFTspec based on Non-Uniform Fourier Transformation (NUFT), which naturally satisfies all the desired properties. We conduct experiments on two tasks: 1) shape classification based on MNIST; 2) spatial relation prediction based on two new datasets - DBSR-46K and DBSR-cplx46K. Our results show that NUFTspec and ResNet1D outperform multiple existing baselines with significant margins. 
While ResNet1D suffers from model performance degradation after shape-invariance geometry modifications, NUFTspec is very robust to these modifications due to the nature of the NUFT.\n  - [Enforcing safety for vision-based controllers via Control Barrier Functions and Neural Radiance Fields](https://arxiv.org/abs/2209.12266) | [code]\n    > To navigate complex environments, robots must increasingly use high-dimensional visual feedback (e.g. images) for control. However, relying on high-dimensional image data to make control decisions raises important questions; particularly, how might we prove the safety of a visual-feedback controller? Control barrier functions (CBFs) are powerful tools for certifying the safety of feedback controllers in the state-feedback setting, but CBFs have traditionally been poorly-suited to visual feedback control due to the need to predict future observations in order to evaluate the barrier function. In this work, we solve this issue by leveraging recent advances in neural radiance fields (NeRFs), which learn implicit representations of 3D scenes and can render images from previously-unseen camera perspectives, to provide single-step visual foresight for a CBF-based controller. This novel combination is able to filter out unsafe actions and intervene to preserve safety. We demonstrate the effect of our controller in real-time simulation experiments where it successfully prevents the robot from taking dangerous actions.\n  - [WaterNeRF: Neural Radiance Fields for Underwater Scenes](https://arxiv.org/abs/2209.13091) | [code]\n    > Underwater imaging is a critical task performed by marine robots for a wide range of applications including aquaculture, marine infrastructure inspection, and environmental monitoring. However, water column effects, such as attenuation and backscattering, drastically change the color and quality of imagery captured underwater. 
Due to varying water conditions and range-dependency of these effects, restoring underwater imagery is a challenging problem. This impacts downstream perception tasks including depth estimation and 3D reconstruction. In this paper, we advance state-of-the-art in neural radiance fields (NeRFs) to enable physics-informed dense depth estimation and color correction. Our proposed method, WaterNeRF, estimates parameters of a physics-based model for underwater image formation, leading to a hybrid data-driven and model-based solution. After determining the scene structure and radiance field, we can produce novel views of degraded as well as corrected underwater images, along with dense depth of the scene. We evaluate the proposed method qualitatively and quantitatively on a real underwater dataset.\n## Sep18 - Sep24, 2022\n  - [How Does It Feel? Self-Supervised Costmap Learning for Off-Road Vehicle Traversability](https://arxiv.org/abs/2209.10788) | [code]\n    > Estimating terrain traversability in off-road environments requires reasoning about complex interaction dynamics between the robot and these terrains. However, it is challenging to build an accurate physics model, or create informative labels to learn a model in a supervised manner, for these interactions. We propose a method that learns to predict traversability costmaps by combining exteroceptive environmental information with proprioceptive terrain interaction feedback in a self-supervised manner. Additionally, we propose a novel way of incorporating robot velocity in the costmap prediction pipeline. We validate our method in multiple short and large-scale navigation tasks on a large, autonomous all-terrain vehicle (ATV) on challenging off-road terrains, and demonstrate ease of integration on a separate large ground robot. 
Our short-scale navigation results show that using our learned costmaps leads to overall smoother navigation, and provides the robot with a more fine-grained understanding of the interactions between the robot and different terrain types, such as grass and gravel. Our large-scale navigation trials show that we can reduce the number of interventions by up to 57% compared to an occupancy-based navigation baseline in challenging off-road courses ranging from 400 m to 3150 m.\n  - [wildNeRF: Complete view synthesis of in-the-wild dynamic scenes captured using sparse monocular data](https://arxiv.org/abs/2209.10399) | [code]\n    > We present a novel neural radiance model that is trainable in a self-supervised manner for novel-view synthesis of dynamic unstructured scenes. Our end-to-end trainable algorithm learns highly complex, real-world static scenes within seconds and dynamic scenes with both rigid and non-rigid motion within minutes. By differentiating between static and motion-centric pixels, we create high-quality representations from a sparse set of images. We perform extensive qualitative and quantitative evaluation on existing benchmarks and set the state-of-the-art on performance measures on the challenging NVIDIA Dynamic Scenes Dataset. Additionally, we evaluate our model performance on challenging real-world datasets such as Cholec80 and SurgicalActions160.\n  - [Density-aware NeRF Ensembles: Quantifying Predictive Uncertainty in Neural Radiance Fields](https://arxiv.org/abs/2209.08718) | [code]\n    > We show that ensembling effectively quantifies model uncertainty in Neural Radiance Fields (NeRFs) if a density-aware epistemic uncertainty term is considered. The naive ensembles investigated in prior work simply average rendered RGB images to quantify the model uncertainty caused by conflicting explanations of the observed scene. 
In contrast, we additionally consider the termination probabilities along individual rays to identify epistemic model uncertainty due to a lack of knowledge about the parts of a scene unobserved during training. We achieve new state-of-the-art performance across established uncertainty quantification benchmarks for NeRFs, outperforming methods that require complex changes to the NeRF architecture and training regime. We furthermore demonstrate that NeRF uncertainty can be utilised for next-best view selection and model refinement.\n  - [LATITUDE: Robotic Global Localization with Truncated Dynamic Low-pass Filter in City-scale NeRF, ICRA2023](https://arxiv.org/abs/2209.08498) | [***``[code]``***](https://github.com/jike5/LATITUDE)\n    > Neural Radiance Fields (NeRFs) have made great success in representing complex 3D scenes with high-resolution details and efficient memory. Nevertheless, current NeRF-based pose estimators have no initial pose prediction and are prone to local optima during optimization. In this paper, we present LATITUDE: Global Localization with Truncated Dynamic Low-pass Filter, which introduces a two-stage localization mechanism in city-scale NeRF. In place recognition stage, we train a regressor through images generated from trained NeRFs, which provides an initial value for global localization. In pose optimization stage, we minimize the residual between the observed image and rendered image by directly optimizing the pose on tangent plane. To avoid convergence to local optimum, we introduce a Truncated Dynamic Low-pass Filter (TDLF) for coarse-to-fine pose registration. We evaluate our method on both synthetic and real-world data and show its potential applications for high-precision navigation in large-scale city scenes. 
Codes and data will be publicly available at this https URL.\n  - [Implicit Neural Representations for Medical Imaging Segmentation, MICCAI2022](https://link.springer.com/chapter/10.1007/978-3-031-16443-9_42) | [code]\n    > 3D signals in medical imaging, such as CT scans, are usually parameterized as a discrete grid of voxels. For instance, existing state-of-the-art organ segmentation methods learn discrete segmentation maps. Unfortunately, the memory requirements of such methods grow cubically with increasing spatial resolution, which makes them unsuitable for processing high resolution scans. To overcome this, we design an Implicit Organ Segmentation Network (IOSNet) that utilizes continuous Implicit Neural Representations and has several useful properties. Firstly, the IOSNet decoder memory is roughly constant and independent of the spatial resolution since it parameterizes the segmentation map as a continuous function. Secondly, IOSNet converges much faster than discrete voxel based methods due to its ability to accurately segment organs irrespective of organ sizes, thereby alleviating size imbalance issues without requiring any auxiliary tricks. Thirdly, IOSNet naturally supports super-resolution (i.e. sampling at arbitrary resolutions during inference) due to its continuous learnt representations. Moreover, despite using a simple lightweight decoder, IOSNet consistently outperforms the discrete specialized segmentation architecture UNet. Hence, our approach demonstrates that Implicit Neural Representations are well-suited for medical imaging applications, especially for processing high-resolution 3D medical scans.\n## Sep11 - Sep17, 2022\n  - [DevNet: Self-supervised Monocular Depth Learning via Density Volume Construction, ECCV2022](https://arxiv.org/abs/2209.06351) | [code]\n    > Self-supervised depth learning from monocular images normally relies on the 2D pixel-wise photometric relation between temporally adjacent image frames. 
However, they neither fully exploit the 3D point-wise geometric correspondences, nor effectively tackle the ambiguities in the photometric warping caused by occlusions or illumination inconsistency. To address these problems, this work proposes Density Volume Construction Network (DevNet), a novel self-supervised monocular depth learning framework, that can consider 3D spatial information, and exploit stronger geometric constraints among adjacent camera frustums. Instead of directly regressing the pixel value from a single image, our DevNet divides the camera frustum into multiple parallel planes and predicts the pointwise occlusion probability density on each plane. The final depth map is generated by integrating the density along corresponding rays. During the training process, novel regularization strategies and loss functions are introduced to mitigate photometric ambiguities and overfitting. Without obviously enlarging model parameters size or running time, DevNet outperforms several representative baselines on both the KITTI-2015 outdoor dataset and NYU-V2 indoor dataset. In particular, the root-mean-square-deviation is reduced by around 4% with DevNet on both KITTI-2015 and NYU-V2 in the task of depth estimation. Code is available at this https URL.\n  - [Learning A Unified 3D Point Cloud for View Synthesis](https://arxiv.org/abs/2209.05013) | [code]\n    > 3D point cloud representation-based view synthesis methods have demonstrated effectiveness. However, existing methods usually synthesize novel views only from a single source view, and it is non-trivial to generalize them to handle multiple source views for pursuing higher reconstruction quality. In this paper, we propose a new deep learning-based view synthesis paradigm, which learns a unified 3D point cloud from different source views. Specifically, we first construct sub-point clouds by projecting source views to 3D space based on their depth maps. 
Then, we learn the unified 3D point cloud by adaptively fusing points at a local neighborhood defined on the union of the sub-point clouds. Besides, we also propose a 3D geometry-guided image restoration module to fill the holes and recover high-frequency details of the rendered novel views. Experimental results on three benchmark datasets demonstrate that our method outperforms state-of-the-art view synthesis methods to a large extent both quantitatively and visually.\n  - [Self-Supervised Coordinate Projection Network for Sparse-View Computed Tomography](https://arxiv.org/abs/2209.05483) | [code]\n    > In the present work, we propose a Self-supervised COordinate Projection nEtwork (SCOPE) to reconstruct the artifacts-free CT image from a single SV sinogram by solving the inverse tomography imaging problem. Compared with recent related works that solve similar problems using implicit neural representation network (INR), our essential contribution is an effective and simple re-projection strategy that pushes the tomography image reconstruction quality over supervised deep learning CT reconstruction works. The proposed strategy is inspired by the simple relationship between linear algebra and inverse problems. To solve the under-determined linear equation system, we first introduce INR to constrain the solution space via image continuity prior and achieve a rough solution. And secondly, we propose to generate a dense view sinogram that improves the rank of the linear equation system and produces a more stable CT image solution space. Our experiment results demonstrate that the re-projection strategy significantly improves the image reconstruction quality (+3 dB for PSNR at least). Besides, we integrate the recent hash encoding into our SCOPE model, which greatly accelerates the model training. Finally, we evaluate SCOPE in parallel and fan X-ray beam SVCT reconstruction tasks. 
Experimental results indicate that the proposed SCOPE model outperforms two latest INR-based methods and two well-popular supervised DL methods quantitatively and qualitatively.\n  - [CU-Net: Efficient Point Cloud Color Upsampling Network](https://arxiv.org/abs/2209.06112) | [code]\n    > Point cloud upsampling is necessary for Augmented Reality, Virtual Reality, and telepresence scenarios. Although the geometry upsampling is well studied to densify point cloud coordinates, the upsampling of colors has been largely overlooked. In this paper, we propose CU-Net, the first deep-learning point cloud color upsampling model. Leveraging a feature extractor based on sparse convolution and a color prediction module based on neural implicit function, CU-Net achieves linear time and space complexity. Therefore, CU-Net is theoretically guaranteed to be more efficient than most existing methods with quadratic complexity. Experimental results demonstrate that CU-Net can colorize a photo-realistic point cloud with nearly a million points in real time, while having better visual quality than baselines. Besides, CU-Net can adapt to an arbitrary upsampling ratio and unseen objects. Our source code will be released to the public soon.\n## Sep4 - Sep10, 2022\n  - [Implicit Full Waveform Inversion with Deep Neural Representation](https://arxiv.org/abs/2209.03525) | [code]\n    > Full waveform inversion (FWI) commonly stands for the state-of-the-art approach for imaging subsurface structures and physical parameters, however, its implementation usually faces great challenges, such as building a good initial model to escape from local minima, and evaluating the uncertainty of inversion results. In this paper, we propose the implicit full waveform inversion (IFWI) algorithm using continuously and implicitly defined deep neural representations. 
Compared to FWI, which is sensitive to the initial model, IFWI benefits from the increased degrees of freedom with deep learning optimization, thus allowing to start from a random initialization, which greatly reduces the risk of non-uniqueness and being trapped in local minima. Both theoretical and experimental analyses indicate that, given a random initial model, IFWI is able to converge to the global minimum and produce a high-resolution image of subsurface with fine structures. In addition, uncertainty analysis of IFWI can be easily performed by approximating Bayesian inference with various deep learning approaches, which is analyzed in this paper by adding dropout neurons. Furthermore, IFWI has a certain degree of robustness and strong generalization ability that are exemplified in the experiments of various 2D geological models. With proper setup, IFWI can also be well suited for multi-scale joint geophysical inversion.\n## Aug28 - Sep3, 2022\n  - [FoV-NeRF: Foveated Neural Radiance Fields for Virtual Reality, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9872532) | [code]\n    > Virtual Reality (VR) is becoming ubiquitous with the rise of consumer displays and commercial VR platforms. Such displays require low latency and high quality rendering of synthetic imagery with reduced compute overheads. Recent advances in neural rendering showed promise of unlocking new possibilities in 3D computer graphics via image-based representations of virtual or physical environments. Specifically, the neural radiance fields (NeRF) demonstrated that photo-realistic quality and continuous view changes of 3D scenes can be achieved without loss of view-dependent effects. While NeRF can significantly benefit rendering for VR applications, it faces unique challenges posed by high field-of-view, high resolution, and stereoscopic/egocentric viewing, typically causing low quality and high latency of the rendered images. 
In VR, this not only harms the interaction experience but may also cause sickness. To tackle these problems toward six-degrees-of-freedom, egocentric, and stereo NeRF in VR, we present the first gaze-contingent 3D neural representation and view synthesis method. We incorporate the human psychophysics of visual- and stereo-acuity into an egocentric neural representation of 3D scenery. We then jointly optimize the latency/performance and visual quality while mutually bridging human perception and neural scene synthesis to achieve perceptually high-quality immersive interaction. We conducted both objective analysis and subjective studies to evaluate the effectiveness of our approach. We find that our method significantly reduces latency (up to 99% time reduction compared with NeRF) without loss of high-fidelity rendering (perceptually identical to full-resolution ground truth). The presented approach may serve as the first step toward future VR/AR systems that capture, teleport, and visualize remote environments in real-time.\n  - [CLONeR: Camera-Lidar Fusion for Occupancy Grid-aided Neural Representations](https://arxiv.org/abs/2209.01194) | [code]\n    > This paper proposes CLONeR, which significantly improves upon NeRF by allowing it to model large outdoor driving scenes that are observed from sparse input sensor views. This is achieved by decoupling occupancy and color learning within the NeRF framework into separate Multi-Layer Perceptrons (MLPs) trained using LiDAR and camera data, respectively. 
In addition, this paper proposes a novel method to build differentiable 3D Occupancy Grid Maps (OGM) alongside the NeRF model, and leverage this occupancy grid for improved sampling of points along a ray for volumetric rendering in metric space.\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n## Aug7 - Aug13, 2022\n  - [HyperTime: Implicit Neural Representation for Time Series](https://arxiv.org/abs/2208.05836) | [code]\n    > Implicit neural representations (INRs) have recently emerged as a powerful tool that provides an accurate and resolution-independent encoding of data. Their robustness as general approximators has been shown in a wide variety of data sources, with applications on image, sound, and 3D scene representation. However, little attention has been given to leveraging these architectures for the representation and analysis of time series data. In this paper, we analyze the representation of time series using INRs, comparing different activation functions in terms of reconstruction accuracy and training convergence speed. We show how these networks can be leveraged for the imputation of time series, with applications on both univariate and multivariate data. Finally, we propose a hypernetwork architecture that leverages INRs to learn a compressed latent representation of an entire time series dataset. We introduce an FFT-based loss to guide training so that all frequencies are preserved in the time series. We show that this network can be used to encode time series as INRs, and their embeddings can be interpolated to generate new time series from existing ones. 
We evaluate our generative method by using it for data augmentation, and show that it is competitive against current state-of-the-art approaches for augmentation of time series.\n  - [NIDN: Neural Inverse Design of Nanostructures](https://arxiv.org/abs/2208.05480) | [code]\n    > In the recent decade, computational tools have become central in material design, allowing rapid development cycles at reduced costs. Machine learning tools are especially on the rise in photonics. However, the inversion of the Maxwell equations needed for the design is particularly challenging from an optimization standpoint, requiring sophisticated software. We present an innovative, open-source software tool called Neural Inverse Design of Nanostructures (NIDN) that allows designing complex, stacked material nanostructures using a physics-based deep learning approach. Instead of a derivative-free or data-driven optimization or learning method, we perform a gradient-based neural network training where we directly optimize the material and its structure based on its spectral characteristics. NIDN supports two different solvers, rigorous coupled-wave analysis and a finite-difference time-domain method. The utility and validity of NIDN are demonstrated on several synthetic examples as well as the design of a 1550 nm filter and anti-reflection coating. Results match experimental baselines, other simulation tools, and the desired spectral characteristics. 
Given its full modularity in regard to network architectures and Maxwell solvers as well as open-source, permissive availability, NIDN will be able to support computational material design processes in a broad range of applications.\n  - [Monte Carlo Denoising Using Implicit Neural Representation](https://oaktrust.library.tamu.edu/handle/1969.1/196567) | [code]\n    > Monte Carlo path tracing is a popular 3D rendering technique in computer graphics, but it often requires a costly tradeoff between the amount of noise in the image and computation time. Therefore, it is useful to attempt to “smooth out” a noisy image, typically by constructing new data between the samples or applying filters to the image. In this work, we investigate the feasibility of training a neural network to implicitly represent the radiance of a fixed-viewpoint scene as a continuous function. We implement the neural network using a multilayer perceptron network and train it on a sparsely sampled image that is generated by an offline Monte Carlo renderer. This training data uses the (x, y) coordinate of each sample on the image plane as inputs and the RGB color of the sample as outputs. Additionally, we provide the network with the surface normal, depth, and albedo of the first ray intersection as extra inputs alongside the pixel coordinates. These extra input dimensions improve the quality of the implicit representation by helping the network account for changes in depth, normal, and diffuse color. Once the network is trained on the sparsely sampled scene, we can densely sample the network many times per pixel to create the final denoised image. 
We find that this network can quickly learn and denoise images in scenes with soft lighting and glossy reflections, and it can easily handle discontinuities in depth, normal, and diffuse color with just a small amount of training.\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n  - [DoF-NeRF: Depth-of-Field Meets Neural Radiance Fields, ACMMM2022](https://arxiv.org/pdf/2208.00945) | [***``[code]``***](https://github.com/zijinwuzijin/DoF-NeRF)\n    > Neural Radiance Field (NeRF) and its variants have exhibited great success on representing 3D scenes and synthesizing photo-realistic novel views. However, they are generally based on the pinhole camera model and assume all-in-focus inputs. This limits their applicability as images captured from the real world often have finite depth-of-field (DoF). To mitigate this issue, we introduce DoF-NeRF, a novel neural rendering approach that can deal with shallow DoF inputs and can simulate DoF effect. In particular, it extends NeRF to simulate the aperture of lens following the principles of geometric optics. Such a physical guarantee allows DoF-NeRF to operate views with different focus configurations. Benefiting from explicit aperture modeling, DoF-NeRF also enables direct manipulation of DoF effect by adjusting virtual aperture and focus parameters. It is plug-and-play and can be inserted into NeRF-based frameworks. Experiments on synthetic and real-world datasets show that, DoF-NeRF not only performs comparably with NeRF in the all-in-focus setting, but also can synthesize all-in-focus novel views conditioned on shallow DoF inputs. An interesting application of DoF-NeRF to DoF rendering is also demonstrated.\n  - [Neural Density-Distance Fields, ECCV2022](https://arxiv.org/abs/2207.14455) | [***``[code]``***](https://ueda0319.github.io/neddf/)\n    > The success of neural fields for 3D vision tasks is now indisputable. 
Following this trend, several methods aiming for visual localization (e.g., SLAM) have been proposed to estimate distance or density fields using neural fields. However, it is difficult to achieve high localization performance by only density fields-based methods such as Neural Radiance Field (NeRF) since they do not provide density gradient in most empty regions. On the other hand, distance field-based methods such as Neural Implicit Surface (NeuS) have limitations in objects' surface shapes. This paper proposes Neural Density-Distance Field (NeDDF), a novel 3D representation that reciprocally constrains the distance and density fields. We extend distance field formulation to shapes with no explicit boundary surface, such as fur or smoke, which enable explicit conversion from distance field to density field. Consistent distance and density fields realized by explicit conversion enable both robustness to initial values and high-quality registration. Furthermore, the consistency between fields allows fast convergence from sparse point clouds. Experiments show that NeDDF can achieve high localization performance while providing comparable results to NeRF on novel view synthesis. The code is available at this https URL.\n  - [End-to-end View Synthesis via NeRF Attention](https://arxiv.org/abs/2207.14741) | [code]\n    > In this paper, we present a simple seq2seq formulation for view synthesis where we take a set of ray points as input and output colors corresponding to the rays. Directly applying a standard transformer on this seq2seq formulation has two limitations. First, the standard attention cannot successfully fit the volumetric rendering procedure, and therefore high-frequency components are missing in the synthesized views. Second, applying global attention to all rays and pixels is extremely inefficient. Inspired by the neural radiance field (NeRF), we propose the NeRF attention (NeRFA) to address the above problems. 
On the one hand, NeRFA considers the volumetric rendering equation as a soft feature modulation procedure. In this way, the feature modulation enhances the transformers with the NeRF-like inductive bias. On the other hand, NeRFA performs multi-stage attention to reduce the computational overhead. Furthermore, the NeRFA model adopts the ray and pixel transformers to learn the interactions between rays and pixels. NeRFA demonstrates superior performance over NeRF and NerFormer on four datasets: DeepVoxels, Blender, LLFF, and CO3D. Besides, NeRFA establishes a new state-of-the-art under two settings: the single-scene view synthesis and the category-centric novel view synthesis. The code will be made publicly available.\n  - [Neural Strands: Learning Hair Geometry and Appearance from Multi-View Images, ECCV2022](https://arxiv.org/pdf/2207.14067) | [***``[code]``***](https://radualexandru.github.io/neural_strands/)\n    > We present Neural Strands, a novel learning framework for modeling accurate hair geometry and appearance from multi-view image inputs. The learned hair model can be rendered in real-time from any viewpoint with high-fidelity view-dependent effects. Our model achieves intuitive shape and style control unlike volumetric counterparts. To enable these properties, we propose a novel hair representation based on a neural scalp texture that encodes the geometry and appearance of individual strands at each texel location. Furthermore, we introduce a novel neural rendering framework based on rasterization of the learned hair strands. Our neural rendering is strand-accurate and anti-aliased, making the rendering view-consistent and photorealistic. Combining appearance with a multi-view geometric prior, we enable, for the first time, the joint learning of appearance and explicit hair geometry from a multi-view setup. 
We demonstrate the efficacy of our approach in terms of fidelity and efficiency for various hairstyles.\n  - [Neural Green’s function for Laplacian systems, Computers & Graphics](https://www.sciencedirect.com/science/article/pii/S0097849322001406) | [code]\n    > Solving linear system of equations stemming from Laplacian operators is at the heart of a wide range of applications. Due to the sparsity of the linear systems, iterative solvers such as Conjugate Gradient and Multigrid are usually employed when the solution has a large number of degrees of freedom. These iterative solvers can be seen as sparse approximations of the Green’s function for the Laplacian operator. In this paper we propose a machine learning approach that regresses a Green’s function from boundary conditions. This is enabled by a Green’s function that can be effectively represented in a multi-scale fashion, drastically reducing the cost associated with a dense matrix representation. Additionally, since the Green’s function is solely dependent on boundary conditions, training the proposed neural network does not require sampling the right-hand side of the linear system. We show results that our method outperforms state of the art Conjugate Gradient and Multigrid methods.\n  - [On the Learnability of Physical Concepts: Can a Neural Network Understand What's Real?](https://arxiv.org/abs/2207.12186) | [code]\n    > We revisit the classic signal-to-symbol barrier in light of the remarkable ability of deep neural networks to generate realistic synthetic data. DeepFakes and spoofing highlight the feebleness of the link between physical reality and its abstract representation, whether learned by a digital computer or a biological agent. Starting from a widely applicable definition of abstract concept, we show that standard feed-forward architectures cannot capture but trivial concepts, regardless of the number of weights and the amount of training data, despite being extremely effective classifiers. 
On the other hand, architectures that incorporate recursion can represent a significantly larger class of concepts, but may still be unable to learn them from a finite dataset. We qualitatively describe the class of concepts that can be \"understood\" by modern architectures trained with variants of stochastic gradient descent, using a (free energy) Lagrangian to measure information complexity. Even if a concept has been understood, however, a network has no means of communicating its understanding to an external agent, except through continuous interaction and validation. We then characterize physical objects as abstract concepts and use the previous analysis to show that physical objects can be encoded by finite architectures. However, to understand physical concepts, sensors must provide persistently exciting observations, for which the ability to control the data acquisition process is essential (active perception). The importance of control depends on the modality, benefiting visual more than acoustic or chemical perception. Finally, we conclude that binding physical entities to digital identities is possible in finite time with finite resources, solving in principle the signal-to-symbol barrier problem, but we highlight the need for continuous validation.\n## Previous weeks\n  - [Plenoxels: Radiance Fields without Neural Networks, CVPR2022(oral)](https://arxiv.org/abs/2112.05131) | [***``[code]``***](https://alexyu.net/plenoxels)\n    > We introduce Plenoxels (plenoptic voxels), a system for photorealistic view synthesis. Plenoxels represent a scene as a sparse 3D grid with spherical harmonics. This representation can be optimized from calibrated images via gradient methods and regularization without any neural components. 
On standard, benchmark tasks, Plenoxels are optimized two orders of magnitude faster than Neural Radiance Fields with no loss in visual quality.\n  - [Urban Radiance Fields, CVPR2022](https://urban-radiance-fields.github.io/) | [code]\n    > The goal of this work is to perform 3D reconstruction and novel view synthesis from data captured by scanning platforms commonly deployed for world mapping in urban outdoor environments (e.g., Street View). Given a sequence of posed RGB images and lidar sweeps acquired by cameras and scanners moving through an outdoor scene, we produce a model from which 3D surfaces can be extracted and novel RGB images can be synthesized. Our approach extends Neural Radiance Fields, which has been demonstrated to synthesize realistic novel images for small scenes in controlled settings, with new methods for leveraging asynchronously captured lidar data, for addressing exposure variation between captured images, and for leveraging predicted image segmentations to supervise densities on rays pointing at the sky. Each of these three extensions provides significant performance improvements in experiments on Street View data. Our system produces state-of-the-art 3D surface reconstructions and synthesizes higher quality novel views in comparison to both traditional methods (e.g.~COLMAP) and recent neural representations (e.g.~Mip-NeRF).\n  - [NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis, ECCV2020](https://arxiv.org/abs/2003.08934) | [***``[code]``***](http://tancik.com/nerf)\n    > We present a method that achieves state-of-the-art results for synthesizing novel views of complex scenes by optimizing an underlying continuous volumetric scene function using a sparse set of input views. 
Our algorithm represents a scene using a fully-connected (non-convolutional) deep network, whose input is a single continuous 5D coordinate (spatial location (x,y,z) and viewing direction (θ,ϕ)) and whose output is the volume density and view-dependent emitted radiance at that spatial location. We synthesize views by querying 5D coordinates along camera rays and use classic volume rendering techniques to project the output colors and densities into an image. Because volume rendering is naturally differentiable, the only input required to optimize our representation is a set of images with known camera poses. We describe how to effectively optimize neural radiance fields to render photorealistic novel views of scenes with complicated geometry and appearance, and demonstrate results that outperform prior work on neural rendering and view synthesis. View synthesis results are best viewed as videos, so we urge readers to view our supplementary video for convincing comparisons.\n  - [NeRF in the Wild: Neural Radiance Fields for Unconstrained Photo Collections, CVPR2021](https://arxiv.org/abs/2008.02268) | [code]\n    > We present a learning-based method for synthesizing novel views of complex scenes using only unstructured collections of in-the-wild photographs. We build on Neural Radiance Fields (NeRF), which uses the weights of a multilayer perceptron to model the density and color of a scene as a function of 3D coordinates. While NeRF works well on images of static subjects captured under controlled settings, it is incapable of modeling many ubiquitous, real-world phenomena in uncontrolled images, such as variable illumination or transient occluders. We introduce a series of extensions to NeRF to address these issues, thereby enabling accurate reconstructions from unstructured image collections taken from the internet. 
We apply our system, dubbed NeRF-W, to internet photo collections of famous landmarks, and demonstrate temporally consistent novel view renderings that are significantly closer to photorealism than the prior state of the art.\n  - [Ha-NeRF: Hallucinated Neural Radiance Fields in the Wild, CVPR2022](https://rover-xingyu.github.io/Ha-NeRF/) | [***``[code]``***](https://github.com/rover-xingyu/Ha-NeRF)\n    > Neural Radiance Fields (NeRF) has recently gained popularity for its impressive novel view synthesis ability. This paper studies the problem of hallucinated NeRF: i.e., recovering a realistic NeRF at a different time of day from a group of tourism images. Existing solutions adopt NeRF with a controllable appearance embedding to render novel views under various conditions, but they cannot render view-consistent images with an unseen appearance. To solve this problem, we present an end-to-end framework for constructing a hallucinated NeRF, dubbed as Ha-NeRF. Specifically, we propose an appearance hallucination module to handle time-varying appearances and transfer them to novel views. Considering the complex occlusions of tourism images, we introduce an anti-occlusion module to decompose the static subjects for visibility accurately. Experimental results on synthetic data and real tourism photo collections demonstrate that our method can hallucinate the desired appearances and render occlusion-free images from different views.\n  - [Nerfies: Deformable Neural Radiance Fields, ICCV2021](https://arxiv.org/abs/2011.12948) | [code]\n    > We present the first method capable of photorealistically reconstructing deformable scenes using photos/videos captured casually from mobile phones. Our approach augments neural radiance fields (NeRF) by optimizing an additional continuous volumetric deformation field that warps each observed point into a canonical 5D NeRF. 
We observe that these NeRF-like deformation fields are prone to local minima, and propose a coarse-to-fine optimization method for coordinate-based models that allows for more robust optimization. By adapting principles from geometry processing and physical simulation to NeRF-like models, we propose an elastic regularization of the deformation field that further improves robustness. We show that our method can turn casually captured selfie photos/videos into deformable NeRF models that allow for photorealistic renderings of the subject from arbitrary viewpoints, which we dub \"nerfies.\" We evaluate our method by collecting time-synchronized data using a rig with two mobile phones, yielding train/validation images of the same pose at different viewpoints. We show that our method faithfully reconstructs non-rigidly deforming scenes and reproduces unseen views with high fidelity.\n  - [D-NeRF: Neural Radiance Fields for Dynamic Scenes, CVPR2021](https://arxiv.org/abs/2011.13961) | [***``[code]``***](https://github.com/albertpumarola/D-NeRF)\n    > Neural rendering techniques combining machine learning with geometric reasoning have arisen as one of the most promising approaches for synthesizing novel views of a scene from a sparse set of images. Among these, stands out the Neural radiance fields (NeRF), which trains a deep network to map 5D input coordinates (representing spatial location and viewing direction) into a volume density and view-dependent emitted radiance. However, despite achieving an unprecedented level of photorealism on the generated images, NeRF is only applicable to static scenes, where the same spatial location can be queried from different images. In this paper we introduce D-NeRF, a method that extends neural radiance fields to a dynamic domain, allowing to reconstruct and render novel images of objects under rigid and non-rigid motions from a \\emph{single} camera moving around the scene. 
For this purpose we consider time as an additional input to the system, and split the learning process in two main stages: one that encodes the scene into a canonical space and another that maps this canonical representation into the deformed scene at a particular time. Both mappings are simultaneously learned using fully-connected networks. Once the networks are trained, D-NeRF can render novel images, controlling both the camera view and the time variable, and thus, the object movement. We demonstrate the effectiveness of our approach on scenes with objects under rigid, articulated and non-rigid motions. Code, model weights and the dynamic scenes dataset will be released.\n  - [Dynamic Neural Radiance Fields for Monocular 4D Facial Avatar Reconstruction, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > We present dynamic neural radiance fields for modeling the appearance and dynamics of a human face. Digitally modeling and reconstructing a talking human is a key building-block for a variety of applications. Especially, for telepresence applications in AR or VR, a faithful reproduction of the appearance including novel viewpoint or head-poses is required. In contrast to state-of-the-art approaches that model the geometry and material properties explicitly, or are purely image-based, we introduce an implicit representation of the head based on scene representation networks. To handle the dynamics of the face, we combine our scene representation network with a low-dimensional morphable model which provides explicit control over pose and expressions. We use volumetric rendering to generate images from this hybrid representation and demonstrate that such a dynamic neural scene representation can be learned from monocular input data only, without the need of a specialized capture setup. 
In our experiments, we show that this learned volumetric representation allows for photo-realistic image generation that surpasses the quality of state-of-the-art video-based reenactment methods.\n  - [PVA: Pixel-aligned Volumetric Avatars, CVPR2021](https://volumetric-avatars.github.io/) | [code]\n    > Acquisition and rendering of photorealistic human heads is a highly challenging research problem of particular importance for virtual telepresence. Currently, the highest quality is achieved by volumetric approaches trained in a person-specific manner on multi-view data. These models better represent fine structure, such as hair, compared to simpler mesh-based models. Volumetric models typically employ a global code to represent facial expressions, such that they can be driven by a small set of animation parameters. While such architectures achieve impressive rendering quality, they can not easily be extended to the multi-identity setting. In this paper, we devise a novel approach for predicting volumetric avatars of the human head given just a small number of inputs. We enable generalization across identities by a novel parameterization that combines neural radiance fields with local, pixel-aligned features extracted directly from the inputs, thus side-stepping the need for very deep or complex networks. Our approach is trained in an end-to-end manner solely based on a photometric rerendering loss without requiring explicit 3D supervision.We demonstrate that our approach outperforms the existing state of the art in terms of quality and is able to generate faithful facial expressions in a multi-identity setting.\n  - [Animatable Neural Radiance Fields for Human Body Modeling, ICCV2021](https://zju3dv.github.io/animatable_nerf/) | [***``[code]``***](https://github.com/zju3dv/animatable_nerf)\n    > This paper addresses the challenge of reconstructing an animatable human model from a multi-view video. 
Some recent works have proposed to decompose a non-rigidly deforming scene into a canonical neural radiance field and a set of deformation fields that map observation-space points to the canonical space, thereby enabling them to learn the dynamic scene from images. However, they represent the deformation field as translational vector field or SE(3) field, which makes the optimization highly under-constrained. Moreover, these representations cannot be explicitly controlled by input motions. Instead, we introduce neural blend weight fields to produce the deformation fields. Based on the skeleton-driven deformation, blend weight fields are used with 3D human skeletons to generate observation-to-canonical and canonical-to-observation correspondences. Since 3D human skeletons are more observable, they can regularize the learning of deformation fields. Moreover, the learned blend weight fields can be combined with input skeletal motions to generate new deformation fields to animate the human model. Experiments show that our approach significantly outperforms recent human synthesis methods. The code will be available at https://zju3dv.github.io/animatable_nerf/.\n  - [NeRF++: Analyzing and Improving Neural Radiance Fields](https://arxiv.org/abs/2010.07492) | [***``[code]``***](https://github.com/Kai-46/nerfplusplus)\n    > Neural Radiance Fields (NeRF) achieve impressive view synthesis results for a variety of capture settings, including 360 capture of bounded scenes and forward-facing capture of bounded and unbounded scenes. NeRF fits multi-layer perceptrons (MLPs) representing view-invariant opacity and view-dependent color volumes to a set of training images, and samples novel views based on volume rendering techniques. In this technical report, we first remark on radiance fields and their potential ambiguities, namely the shape-radiance ambiguity, and analyze NeRF's success in avoiding such ambiguities. 
Second, we address a parametrization issue involved in applying NeRF to 360 captures of objects within large-scale, unbounded 3D scenes. Our method improves view synthesis fidelity in this challenging scenario. Code is available at this https URL.\n  - [Neural Scene Graphs for Dynamic Scenes, CVPR2021(oral)](https://arxiv.org/abs/2011.10379) | [***``[code]``***](https://github.com/princeton-computational-imaging/neural-scene-graphs)\n    > Recent implicit neural rendering methods have demonstrated that it is possible to learn accurate view synthesis for complex scenes by predicting their volumetric density and color supervised solely by a set of RGB images. However, existing methods are restricted to learning efficient representations of static scenes that encode all scene objects into a single neural network, and lack the ability to represent dynamic scenes and decompositions into individual scene objects. In this work, we present the first neural rendering method that decomposes dynamic scenes into scene graphs. We propose a learned scene graph representation, which encodes object transformation and radiance, to efficiently render novel arrangements and views of the scene. To this end, we learn implicitly encoded scenes, combined with a jointly learned latent representation to describe objects with a single implicit function. 
We assess the proposed method on synthetic and real automotive data, validating that our approach learns dynamic scenes -- only by observing a video of this scene -- and allows for rendering novel photo-realistic views of novel scene compositions with unseen sets of objects at unseen poses.\n  - [In-Place Scene Labelling and Understanding with Implicit Scene Representation, ICCV2021(oral)](https://shuaifengzhi.com/Semantic-NeRF/) | [***``[code]``***](https://github.com/Harry-Zhi/semantic_nerf/)\n    > Semantic labelling is highly correlated with geometry and radiance reconstruction, as scene entities with similar shape and appearance are more likely to come from similar classes. Recent implicit neural reconstruction techniques are appealing as they do not require prior training data, but the same fully self-supervised approach is not possible for semantics because labels are human-defined properties.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/pose-slam.md",
    "content": "\nWeekly Classified Neural Radiance Fields - pose-slam ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n=====================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n  - [Fast and Lightweight Scene Regressor for Camera Relocalization](https://arxiv.org/abs/2212.01830) | [***``[code]``***](https://github.com/aislab/feat2map)\n    > Camera relocalization involving a prior 3D reconstruction plays a crucial role in many mixed reality and robotics applications. Estimating the camera pose directly with respect to pre-built 3D models can be prohibitively expensive for several applications with limited storage and/or communication bandwidth. Although recent scene and absolute pose regression methods have become popular for efficient camera localization, most of them are computation-resource intensive and difficult to obtain a real-time inference with high accuracy constraints. This study proposes a simple scene regression method that requires only a multi-layer perceptron network for mapping scene coordinates to achieve accurate camera pose estimations. The proposed approach uses sparse descriptors to regress the scene coordinates, instead of a dense RGB image. The use of sparse features provides several advantages. 
First, the proposed regressor network is substantially smaller than those reported in previous studies. This makes our system highly efficient and scalable. Second, the pre-built 3D models provide the most reliable and robust 2D-3D matches. Therefore, learning from them can lead to an awareness of equivalent features and substantially improve the generalization performance. A detailed analysis of our approach and extensive evaluations using existing datasets are provided to support the proposed method. The implementation detail is available at this https URL\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n  - [RUST: Latent Neural Scene Representations from Unposed Imagery](https://arxiv.org/abs/2211.14306) | [code]\n    > Inferring the structure of 3D scenes from 2D observations is a fundamental challenge in computer vision. Recently popularized approaches based on neural scene representations have achieved tremendous impact and have been applied across a variety of applications. One of the major remaining challenges in this space is training a single model which can provide latent representations which effectively generalize beyond a single scene. Scene Representation Transformer (SRT) has shown promise in this direction, but scaling it to a larger set of diverse scenes is challenging and necessitates accurately posed ground truth data. To address this problem, we propose RUST (Really Unposed Scene representation Transformer), a pose-free approach to novel view synthesis trained on RGB images alone. Our main insight is that one can train a Pose Encoder that peeks at the target image and learns a latent pose embedding which is used by the decoder for view synthesis. We perform an empirical investigation into the learned latent pose structure and show that it allows meaningful test-time camera transformations and accurate explicit pose readouts. 
Perhaps surprisingly, RUST achieves similar quality as methods which have access to perfect camera pose, thereby unlocking the potential for large-scale training of amortized neural scene representations.\n  - [ActiveRMAP: Radiance Field for Active Mapping And Planning](https://arxiv.org/abs/2211.12656) | [code]\n    > A high-quality 3D reconstruction of a scene from a collection of 2D images can be achieved through offline/online mapping methods. In this paper, we explore active mapping from the perspective of implicit representations, which have recently produced compelling results in a variety of applications. One of the most popular implicit representations - Neural Radiance Field (NeRF), first demonstrated photorealistic rendering results using multi-layer perceptrons, with promising offline 3D reconstruction as a by-product of the radiance field. More recently, researchers also applied this implicit representation for online reconstruction and localization (i.e. implicit SLAM systems). However, the study on using implicit representation for active vision tasks is still very limited. In this paper, we are particularly interested in applying the neural radiance field for active mapping and planning problems, which are closely coupled tasks in an active system. We, for the first time, present an RGB-only active vision framework using radiance field representation for active 3D reconstruction and planning in an online manner. Specifically, we formulate this joint task as an iterative dual-stage optimization problem, where we alternatively optimize for the radiance field representation and path planning. 
Experimental results suggest that the proposed method achieves competitive results compared to other offline methods and outperforms active reconstruction methods using NeRFs.\n  - [Local-to-Global Registration for Bundle-Adjusting Neural Radiance Fields](https://arxiv.org/abs/2211.11505) | [***``[code]``***](https://github.com/rover-xingyu/L2G-NeRF)\n    > Neural Radiance Fields (NeRF) have achieved photorealistic novel views synthesis; however, the requirement of accurate camera poses limits its application. Despite analysis-by-synthesis extensions for jointly learning neural 3D representations and registering camera frames exist, they are susceptible to suboptimal solutions if poorly initialized. We propose L2G-NeRF, a Local-to-Global registration method for bundle-adjusting Neural Radiance Fields: first, a pixel-wise flexible alignment, followed by a frame-wise constrained parametric alignment. Pixel-wise local alignment is learned in an unsupervised way via a deep network which optimizes photometric reconstruction errors. Frame-wise global alignment is performed using differentiable parameter estimation solvers on the pixel-wise correspondences to find a global transformation. Experiments on synthetic and real-world data show that our method outperforms the current state-of-the-art in terms of high-fidelity reconstruction and resolving large camera pose misalignment. Our module is an easy-to-use plugin that can be applied to NeRF variants and other neural field applications. The Code and supplementary materials are available at this https URL.\n  - [Neural Puppeteer: Keypoint-Based Neural Rendering of Dynamic Shapes, ACCV2022](https://openaccess.thecvf.com/content/ACCV2022/html/Giebenhain_Neural_Puppeteer_Keypoint-Based_Neural_Rendering_of_Dynamic_Shapes_ACCV_2022_paper.html) | [***``[code]``***](https://github.com/urs-waldmann/NePu/)\n    > We introduce Neural Puppeteer, an efficient neural rendering pipeline for articulated shapes. 
By inverse rendering, we can predict 3D keypoints from multi-view 2D silhouettes alone, without requiring texture information. Furthermore, we can easily predict 3D keypoints of the same class of shapes with one and the same trained model and generalize more easily from training with synthetic data which we demonstrate by successfully applying zero-shot synthetic to real-world experiments. We demonstrate the flexibility of our method by fitting models to synthetic videos of different animals and a human, and achieve quantitative results which outperform our baselines. Our method uses 3D keypoints in conjunction with individual local feature vectors and a global latent code to allow for an efficient representation of time-varying and articulated shapes such as humans and animals. In contrast to previous work, we do not perform reconstruction in the 3D domain, but project the 3D features into 2D cameras and perform reconstruction of 2D RGB-D images from these projected features, which is significantly faster than volumetric rendering. Our synthetic dataset will be publicly available, to further develop the evolving field of animal pose and shape reconstruction.\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n  - [nerf2nerf: Pairwise Registration of Neural Radiance Fields](https://arxiv.org/abs/2211.01600) | [code]\n    > We introduce a technique for pairwise registration of neural fields that extends classical optimization-based local registration (i.e. ICP) to operate on Neural Radiance Fields (NeRF) -- neural 3D scene representations trained from collections of calibrated images. NeRF does not decompose illumination and color, so to make registration invariant to illumination, we introduce the concept of a ''surface field'' -- a field distilled from a pre-trained NeRF model that measures the likelihood of a point being on the surface of an object. 
We then cast nerf2nerf registration as a robust optimization that iteratively seeks a rigid transformation that aligns the surface fields of the two scenes. We evaluate the effectiveness of our technique by introducing a dataset of pre-trained NeRF scenes -- our synthetic scenes enable quantitative evaluations and comparisons to classical registration techniques, while our real scenes demonstrate the validity of our technique in real-world scenarios. Additional results available at: this https URL\n  - [GARF: Gaussian Activated Radiance Fields for High Fidelity Reconstruction and Pose Estimation, ECCV2022](https://arxiv.org/abs/2204.05735) | [code]\n    > Despite Neural Radiance Fields (NeRF) showing compelling results in photorealistic novel views synthesis of real-world scenes, most existing approaches require accurate prior camera poses. Although approaches for jointly recovering the radiance field and camera pose exist (BARF), they rely on a cumbersome coarse-to-fine auxiliary positional embedding to ensure good performance. We present Gaussian Activated neural Radiance Fields (GARF), a new positional embedding-free neural radiance field architecture - employing Gaussian activations - that outperforms the current state-of-the-art in terms of high fidelity reconstruction and pose estimation.\n  - [Digging into Radiance Grid for Real-Time View Synthesis with Detail Preservation, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19784-0_42) | [code]\n    > Neural Radiance Fields (NeRF) [31] series are impressive in representing scenes and synthesizing high-quality novel views. However, most previous works fail to preserve texture details and suffer from slow training speed. A recent method SNeRG [11] demonstrates that baking a trained NeRF as a Sparse Neural Radiance Grid enables real-time view synthesis with slight scarification of rendering quality. 
In this paper, we dig into the Radiance Grid representation and present a set of improvements, which together result in boosted performance in terms of both speed and quality. First, we propose an HieRarchical Sparse Radiance Grid (HrSRG) representation that has higher voxel resolution for informative spaces and fewer voxels for other spaces. HrSRG leverages a hierarchical voxel grid building process inspired by [30, 55], and can describe a scene at high resolution without excessive memory footprint. Furthermore, we show that directly optimizing the voxel grid leads to surprisingly good texture details in rendered images. This direct optimization is memory-friendly and requires multiple orders of magnitude less time than conventional NeRFs as it only involves a tiny MLP. Finally, we find that a critical factor that prevents fine details restoration is the misaligned 2D pixels among images caused by camera pose errors. We propose to use the perceptual loss to add tolerance to misalignments, leading to the improved visual quality of rendered images.\n## Oct23 - Oct29, 2022\n  - [EpipolarNVS: leveraging on Epipolar geometry for single-image Novel View Synthesis, BMVC2022](https://arxiv.org/abs/2210.13077) | [code]\n    > Novel-view synthesis (NVS) can be tackled through different approaches, depending on the general setting: a single source image to a short video sequence, exact or noisy camera pose information, 3D-based information such as point clouds etc. The most challenging scenario, the one where we stand in this work, only considers a unique source image to generate a novel one from another viewpoint. However, in such a tricky situation, the latest learning-based solutions often struggle to integrate the camera viewpoint transformation. Indeed, the extrinsic information is often passed as-is, through a low-dimensional vector. It might even occur that such a camera pose, when parametrized as Euler angles, is quantized through a one-hot representation. 
This vanilla encoding choice prevents the learnt architecture from inferring novel views on a continuous basis (from a camera pose perspective). We claim it exists an elegant way to better encode relative camera pose, by leveraging 3D-related concepts such as the epipolar constraint. We, therefore, introduce an innovative method that encodes the viewpoint transformation as a 2D feature image. Such a camera encoding strategy gives meaningful insights to the network regarding how the camera has moved in space between the two views. By encoding the camera pose information as a finite number of coloured epipolar lines, we demonstrate through our experiments that our strategy outperforms vanilla encoding.\n  - [NeRF-SLAM: Real-Time Dense Monocular SLAM with Neural Radiance Fields](https://arxiv.org/abs/2210.13641) | [code]\n    > We propose a novel geometric and photometric 3D mapping pipeline for accurate and real-time scene reconstruction from monocular images. To achieve this, we leverage recent advances in dense monocular SLAM and real-time hierarchical volumetric neural radiance fields. Our insight is that dense monocular SLAM provides the right information to fit a neural radiance field of the scene in real-time, by providing accurate pose estimates and depth-maps with associated uncertainty. With our proposed uncertainty-based depth loss, we achieve not only good photometric accuracy, but also great geometric accuracy. In fact, our proposed pipeline achieves better geometric and photometric accuracy than competing approaches (up to 179% better PSNR and 86% better L1 depth), while working in real-time and using only monocular images.\n## Oct16 - Oct22, 2022\n  - [Generative Range Imaging for Learning Scene Priors of 3D LiDAR Data, WACV2023](https://arxiv.org/abs/2210.11750) | [code]\n    > 3D LiDAR sensors are indispensable for the robust vision of autonomous mobile robots. 
However, deploying LiDAR-based perception algorithms often fails due to a domain gap from the training environment, such as inconsistent angular resolution and missing properties. Existing studies have tackled the issue by learning inter-domain mapping, while the transferability is constrained by the training configuration and the training is susceptible to peculiar lossy noises called ray-drop. To address the issue, this paper proposes a generative model of LiDAR range images applicable to the data-level domain transfer. Motivated by the fact that LiDAR measurement is based on point-by-point range imaging, we train an implicit image representation-based generative adversarial networks along with a differentiable ray-drop effect. We demonstrate the fidelity and diversity of our model in comparison with the point-based and image-based state-of-the-art generative models. We also showcase upsampling and restoration applications. Furthermore, we introduce a Sim2Real application for LiDAR semantic segmentation. We demonstrate that our method is effective as a realistic ray-drop simulator and outperforms state-of-the-art methods.\n  - [High-Quality RGB-D Reconstruction via Multi-View Uncalibrated Photometric Stereo and Gradient-SDF, WACV2023](https://arxiv.org/abs/2210.12202) | [code]\n    > Fine-detailed reconstructions are in high demand in many applications. However, most of the existing RGB-D reconstruction methods rely on pre-calculated accurate camera poses to recover the detailed surface geometry, where the representation of a surface needs to be adapted when optimizing different quantities. In this paper, we present a novel multi-view RGB-D based reconstruction method that tackles camera pose, lighting, albedo, and surface normal estimation via the utilization of a gradient signed distance field (gradient-SDF). 
The proposed method formulates the image rendering process using specific physically-based model(s) and optimizes the surface's quantities on the actual surface using its volumetric representation, as opposed to other works which estimate surface quantities only near the actual surface. To validate our method, we investigate two physically-based image formation models for natural light and point light source applications. The experimental results on synthetic and real-world datasets demonstrate that the proposed method can recover high-quality geometry of the surface more faithfully than the state-of-the-art and further improves the accuracy of estimated camera poses.\n  - [Neural Fields for Robotic Object Manipulation from a Single Image, ICRA2023](https://arxiv.org/abs/2210.12126) | [code]\n    > We present a unified and compact representation for object rendering, 3D reconstruction, and grasp pose prediction that can be inferred from a single image within a few seconds. We achieve this by leveraging recent advances in the Neural Radiance Field (NeRF) literature that learn category-level priors and fine-tune on novel objects with minimal data and time. Our insight is that we can learn a compact shape representation and extract meaningful additional information from it, such as grasping poses. We believe this to be the first work to retrieve grasping poses directly from a NeRF-based representation using a single viewpoint (RGB-only), rather than going through a secondary network and/or representation. When compared to prior art, our method is two to three orders of magnitude smaller while achieving comparable performance at view reconstruction and grasping. 
Accompanying our method, we also propose a new dataset of rendered shoes for training a sim-2-real NeRF method with grasping poses for different widths of grippers.\n  - [Parallel Inversion of Neural Radiance Fields for Robust Pose Estimation, ICRA2023](https://arxiv.org/abs/2210.10108) | [code]\n    > We present a parallelized optimization method based on fast Neural Radiance Fields (NeRF) for estimating 6-DoF target poses. Given a single observed RGB image of the target, we can predict the translation and rotation of the camera by minimizing the residual between pixels rendered from a fast NeRF model and pixels in the observed image. We integrate a momentum-based camera extrinsic optimization procedure into Instant Neural Graphics Primitives, a recent exceptionally fast NeRF implementation. By introducing parallel Monte Carlo sampling into the pose estimation task, our method overcomes local minima and improves efficiency in a more extensive search space. We also show the importance of adopting a more robust pixel-based loss function to reduce error. Experiments demonstrate that our method can achieve improved generalization and robustness on both synthetic and real-world benchmarks.\n  - [Neural Contact Fields: Tracking Extrinsic Contact with Tactile Sensing](https://arxiv.org/abs/2210.09297) | [code]\n    > We present Neural Contact Fields, a method that brings together neural fields and tactile sensing to address the problem of tracking extrinsic contact between object and environment. Knowing where the external contact occurs is a first step towards methods that can actively control it in facilitating downstream manipulation tasks. Prior work for localizing environmental contacts typically assume a contact type (e.g. point or line), does not capture contact/no-contact transitions, and only works with basic geometric-shaped objects. 
Neural Contact Fields are the first method that can track arbitrary multi-modal extrinsic contacts without making any assumptions about the contact type. Our key insight is to estimate the probability of contact for any 3D point in the latent space of object shapes, given vision-based tactile inputs that sense the local motion resulting from the external contact. In experiments, we find that Neural Contact Fields are able to localize multiple contact patches without making any assumptions about the geometry of the contact, and capture contact/no-contact transitions for known categories of objects with unseen shapes in unseen environment configurations. In addition to Neural Contact Fields, we also release our YCB-Extrinsic-Contact dataset of simulated extrinsic contact interactions to enable further research in this area. Project repository: this https URL\n  - [Differentiable Physics Simulation of Dynamics-Augmented Neural Objects](https://arxiv.org/abs/2210.09420) | [code]\n    > We present a differentiable pipeline for simulating the motion of objects that represent their geometry as a continuous density field parameterized as a deep network. This includes Neural Radiance Fields (NeRFs), and other related models. From the density field, we estimate the dynamical properties of the object, including its mass, center of mass, and inertia matrix. We then introduce a differentiable contact model based on the density field for computing normal and friction forces resulting from collisions. This allows a robot to autonomously build object models that are visually and dynamically accurate from still images and videos of objects in motion. The resulting Dynamics-Augmented Neural Objects (DANOs) are simulated with an existing differentiable simulation engine, Dojo, interacting with other standard simulation objects, such as spheres, planes, and robots specified as URDFs. 
A robot can use this simulation to optimize grasps and manipulation trajectories of neural objects, or to improve the neural object models through gradient-based real-to-simulation transfer. We demonstrate the pipeline to learn the coefficient of friction of a bar of soap from a real video of the soap sliding on a table. We also learn the coefficient of friction and mass of a Stanford bunny through interactions with a Panda robot arm from synthetic data, and we optimize trajectories in simulation for the Panda arm to push the bunny to a goal location.\n## Oct9 - Oct15, 2022\n  - [ExAug: Robot-Conditioned Navigation Policies via Geometric Experience Augmentation](https://arxiv.org/abs/2210.07450) | [code]\n    > Machine learning techniques rely on large and diverse datasets for generalization. Computer vision, natural language processing, and other applications can often reuse public datasets to train many different models. However, due to differences in physical configurations, it is challenging to leverage public datasets for training robotic control policies on new robot platforms or for new tasks. In this work, we propose a novel framework, ExAug to augment the experiences of different robot platforms from multiple datasets in diverse environments. ExAug leverages a simple principle: by extracting 3D information in the form of a point cloud, we can create much more complex and structured augmentations, utilizing both generating synthetic images and geometric-aware penalization that would have been suitable in the same situation for a different robot, with different size, turning radius, and camera placement. 
The trained policy is evaluated on two new robot platforms with three different cameras in indoor and outdoor environments with obstacles.\n  - [NOCaL: Calibration-Free Semi-Supervised Learning of Odometry and Camera Intrinsics](https://arxiv.org/abs/2210.07435) | [code]\n    > There are a multitude of emerging imaging technologies that could benefit robotics. However the need for bespoke models, calibration and low-level processing represents a key barrier to their adoption. In this work we present NOCaL, Neural odometry and Calibration using Light fields, a semi-supervised learning architecture capable of interpreting previously unseen cameras without calibration. NOCaL learns to estimate camera parameters, relative pose, and scene appearance. It employs a scene-rendering hypernetwork pretrained on a large number of existing cameras and scenes, and adapts to previously unseen cameras using a small supervised training set to enforce metric scale. We demonstrate NOCaL on rendered and captured imagery using conventional cameras, demonstrating calibration-free odometry and novel view synthesis. This work represents a key step toward automating the interpretation of general camera geometries and emerging imaging technologies.\n  - [GeoAug: Data Augmentation for Few-Shot NeRF with Geometry Constraints, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19790-1_20) | [code]\n    > Neural Radiance Fields (NeRF) show remarkable ability to render novel views of a certain scene by learning an implicit volumetric representation with only posed RGB images. Despite its impressiveness and simplicity, NeRF usually converges to sub-optimal solutions with incorrect geometries given few training images. We hereby present GeoAug: a data augmentation method for NeRF, which enriches training data based on multi-view geometric constraint. GeoAug provides random artificial (novel pose, RGB image) pairs for training, where the RGB image is from a nearby training view. 
The rendering of a novel pose is warped to the nearby training view with depth map and relative pose to match the RGB image supervision. Our method reduces the risk of over-fitting by introducing more data during training, while also provides additional implicit supervision for depth maps. In experiments, our method significantly boosts the performance of neural radiance fields conditioned on few training views.\n  - [Photo-realistic Neural Domain Randomization, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19806-9_18) | [code]\n    > Synthetic data is a scalable alternative to manual supervision, but it requires overcoming the sim-to-real domain gap. This discrepancy between virtual and real worlds is addressed by two seemingly opposed approaches: improving the realism of simulation or foregoing realism entirely via domain randomization. In this paper, we show that the recent progress in neural rendering enables a new unified approach we call Photo-realistic Neural Domain Randomization (PNDR). We propose to learn a composition of neural networks that acts as a physics-based ray tracer generating high-quality renderings from scene geometry alone. Our approach is modular, composed of different neural networks for materials, lighting, and rendering, thus enabling randomization of different key image generation components in a differentiable pipeline. Once trained, our method can be combined with other methods and used to generate photo-realistic image augmentations online and significantly more efficiently than via traditional ray-tracing. We demonstrate the usefulness of PNDR through two downstream tasks: 6D object detection and monocular depth estimation. 
Our experiments show that training with PNDR enables generalization to novel scenes and significantly outperforms the state of the art in terms of real-world transfer.\n  - [X-NeRF: Explicit Neural Radiance Field for Multi-Scene 360° Insufficient RGB-D Views, WACV2023](https://arxiv.org/abs/2210.05135) | [***``[code]``***](https://github.com/HaoyiZhu/XNeRF)\n    > Neural Radiance Fields (NeRFs), despite their outstanding performance on novel view synthesis, often need dense input views. Many papers train one model for each scene respectively and few of them explore incorporating multi-modal data into this problem. In this paper, we focus on a rarely discussed but important setting: can we train one model that can represent multiple scenes, with 360° insufficient views and RGB-D images? We refer to insufficient views as a few extremely sparse and almost non-overlapping views. To deal with it, X-NeRF, a fully explicit approach which learns a general scene completion process instead of a coordinate-based mapping, is proposed. Given a few insufficient RGB-D input views, X-NeRF first transforms them to a sparse point cloud tensor and then applies a 3D sparse generative Convolutional Neural Network (CNN) to complete it to an explicit radiance field whose volumetric rendering can be conducted fast without running networks during inference. To avoid overfitting, besides common rendering loss, we apply perceptual loss as well as view augmentation through random rotation on point clouds. The proposed methodology significantly out-performs previous implicit methods in our setting, indicating the great potential of proposed problem and approach. Codes and data are available at this https URL.\n  - [Multi-Object Navigation with dynamically learned neural implicit representations](https://arxiv.org/abs/2210.05129) | [code]\n    > Understanding and mapping a new environment are core abilities of any autonomously navigating agent. 
While classical robotics usually estimates maps in a stand-alone manner with SLAM variants, which maintain a topological or metric representation, end-to-end learning of navigation keeps some form of memory in a neural network. Networks are typically imbued with inductive biases, which can range from vectorial representations to birds-eye metric tensors or topological structures. In this work, we propose to structure neural networks with two neural implicit representations, which are learned dynamically during each episode and map the content of the scene: (i) the Semantic Finder predicts the position of a previously seen queried object; (ii) the Occupancy and Exploration Implicit Representation encapsulates information about explored area and obstacles, and is queried with a novel global read mechanism which directly maps from function space to a usable embedding space. Both representations are leveraged by an agent trained with Reinforcement Learning (RL) and learned online during each episode. We evaluate the agent on Multi-Object Navigation and show the high impact of using neural implicit representations as a memory source.\n  - [SiNeRF: Sinusoidal Neural Radiance Fields for Joint Pose Estimation and Scene Reconstruction, BMVC2022](https://arxiv.org/abs/2210.04553) | [***``[code]``***](https://github.com/yitongx/sinerf)\n    > NeRFmm is the Neural Radiance Fields (NeRF) that deal with Joint Optimization tasks, i.e., reconstructing real-world scenes and registering camera parameters simultaneously. Despite NeRFmm producing precise scene synthesis and pose estimations, it still struggles to outperform the full-annotated baseline on challenging scenes. In this work, we identify that there exists a systematic sub-optimality in joint optimization and further identify multiple potential sources for it. 
To diminish the impacts of potential sources, we propose Sinusoidal Neural Radiance Fields (SiNeRF) that leverage sinusoidal activations for radiance mapping and a novel Mixed Region Sampling (MRS) for selecting ray batch efficiently. Quantitative and qualitative results show that compared to NeRFmm, SiNeRF achieves comprehensive significant improvements in image synthesis quality and pose estimation accuracy. Codes are available at this https URL.\n  - [NeRF2Real: Sim2real Transfer of Vision-guided Bipedal Motion Skills using Neural Radiance Fields](https://arxiv.org/abs/2210.04932) | [code]\n    > We present a system for applying sim2real approaches to \"in the wild\" scenes with realistic visuals, and to policies which rely on active perception using RGB cameras. Given a short video of a static scene collected using a generic phone, we learn the scene's contact geometry and a function for novel view synthesis using a Neural Radiance Field (NeRF). We augment the NeRF rendering of the static scene by overlaying the rendering of other dynamic objects (e.g. the robot's own body, a ball). A simulation is then created using the rendering engine in a physics simulator which computes contact dynamics from the static scene geometry (estimated from the NeRF volume density) and the dynamic objects' geometry and physical properties (assumed known). We demonstrate that we can use this simulation to learn vision-based whole body navigation and ball pushing policies for a 20 degrees of freedom humanoid robot with an actuated head-mounted RGB camera, and we successfully transfer these policies to a real robot. 
Project video is available at this https URL\n## Oct2 - Oct8, 2022\n  - [A Real2Sim2Real Method for Robust Object Grasping with Neural Surface Reconstruction](https://arxiv.org/abs/2210.02685) | [code]\n    > Recent 3D-based manipulation methods either directly predict the grasp pose using 3D neural networks, or solve the grasp pose using similar objects retrieved from shape databases. However, the former faces generalizability challenges when testing with new robot arms or unseen objects; and the latter assumes that similar objects exist in the databases. We hypothesize that recent 3D modeling methods provide a path towards building digital replica of the evaluation scene that affords physical simulation and supports robust manipulation algorithm learning. We propose to reconstruct high-quality meshes from real-world point clouds using state-of-the-art neural surface reconstruction method (the Real2Sim step). Because most simulators take meshes for fast simulation, the reconstructed meshes enable grasp pose labels generation without human efforts. The generated labels can train grasp network that performs robustly in the real evaluation scene (the Sim2Real step). In synthetic and real experiments, we show that the Real2Sim2Real pipeline performs better than baseline grasp networks trained with a large dataset and a grasp sampling method with retrieval-based reconstruction. 
The benefit of the Real2Sim2Real pipeline comes from 1) decoupling scene modeling and grasp sampling into sub-problems, and 2) both sub-problems can be solved with sufficiently high quality using recent 3D learning algorithms and mesh-based physical simulation techniques.\n  - [Feature-Realistic Neural Fusion for Real-Time, Open Set Scene Understanding](https://arxiv.org/abs/2210.03043) | [code]\n    > General scene understanding for robotics requires flexible semantic representation, so that novel objects and structures which may not have been known at training time can be identified, segmented and grouped. We present an algorithm which fuses general learned features from a standard pre-trained network into a highly efficient 3D geometric neural field representation during real-time SLAM. The fused 3D feature maps inherit the coherence of the neural field's geometry representation. This means that tiny amounts of human labelling interacting at runtime enable objects or even parts of objects to be robustly and accurately segmented in an open set manner.\n  - [IR-MCL: Implicit Representation-Based Online Global Localization](https://arxiv.org/abs/2210.03113) | [***``[code]``***](https://github.com/PRBonn/ir-mcl)\n    > Determining the state of a mobile robot is an essential building block of robot navigation systems. In this paper, we address the problem of estimating the robot's pose in an indoor environment using 2D LiDAR data and investigate how modern environment models can improve gold standard Monte-Carlo localization (MCL) systems. We propose a neural occupancy field (NOF) to implicitly represent the scene using a neural network. With the pretrained network, we can synthesize 2D LiDAR scans for an arbitrary robot pose through volume rendering. Based on the implicit representation, we can obtain the similarity between a synthesized and actual scan as an observation model and integrate it into an MCL system to perform accurate localization. 
We evaluate our approach on five sequences of a self-recorded dataset and three publicly available datasets. We show that we can accurately and efficiently localize a robot using our approach surpassing the localization performance of state-of-the-art methods. The experiments suggest that the presented implicit representation is able to predict more accurate 2D LiDAR scans leading to an improved observation model for our particle filter-based localization. The code of our approach is released at: this https URL.\n  - [NARF22: Neural Articulated Radiance Fields for Configuration-Aware Rendering, IROS2022](https://progress.eecs.umich.edu/projects/narf/) | [code]\n    > Articulated objects pose a unique challenge for robotic perception and manipulation. Their increased number of degrees-of-freedom makes tasks such as localization computationally difficult, while also making the process of real-world dataset collection unscalable. With the aim of addressing these scalability issues, we propose Neural Articulated Radiance Fields (NARF22), a pipeline which uses a fully-differentiable, configuration-parameterized Neural Radiance Field (NeRF) as a means of providing high quality renderings of articulated objects. NARF22 requires no explicit knowledge of the object structure at inference time. We propose a two-stage parts-based training mechanism which allows the object rendering models to generalize well across the configuration space even if the underlying training data has as few as one configuration represented. We demonstrate the efficacy of NARF22 by training configurable renderers on a real-world articulated tool dataset collected via a Fetch mobile manipulation robot. We show the applicability of the model to gradient-based inference methods through a configuration estimation and 6 degree-of-freedom pose refinement task. 
The project webpage is available at: this https URL.\n  - [Probabilistic Volumetric Fusion for Dense Monocular SLAM](https://arxiv.org/abs/2210.01276) | [code]\n    > We present a novel method to reconstruct 3D scenes from images by leveraging deep dense monocular SLAM and fast uncertainty propagation. The proposed approach is able to 3D reconstruct scenes densely, accurately, and in real-time while being robust to extremely noisy depth estimates coming from dense monocular SLAM. Differently from previous approaches, that either use ad-hoc depth filters, or that estimate the depth uncertainty from RGB-D cameras' sensor models, our probabilistic depth uncertainty derives directly from the information matrix of the underlying bundle adjustment problem in SLAM. We show that the resulting depth uncertainty provides an excellent signal to weight the depth-maps for volumetric fusion. Without our depth uncertainty, the resulting mesh is noisy and with artifacts, while our approach generates an accurate 3D mesh with significantly fewer artifacts. We provide results on the challenging Euroc dataset, and show that our approach achieves 92% better accuracy than directly fusing depths from monocular SLAM, and up to 90% improvements compared to the best competing approach.\n  - [NeRF: Neural Radiance Field in 3D Vision, A Comprehensive Review](https://arxiv.org/abs/2210.00379) | [code]\n    > Neural Radiance Field (NeRF), a new novel view synthesis with implicit scene representation has taken the field of Computer Vision by storm. As a novel view synthesis and 3D reconstruction method, NeRF models find applications in robotics, urban mapping, autonomous navigation, virtual reality/augmented reality, and more. Since the original paper by Mildenhall et al., more than 250 preprints were published, with more than 100 eventually being accepted in tier one Computer Vision Conferences. 
Given NeRF popularity and the current interest in this research area, we believe it necessary to compile a comprehensive survey of NeRF papers from the past two years, which we organized into both architecture, and application based taxonomies. We also provide an introduction to the theory of NeRF based novel view synthesis, and a benchmark comparison of the performance and speed of key NeRF models. By creating this survey, we hope to introduce new researchers to NeRF, provide a helpful reference for influential works in this field, as well as motivate future research directions with our discussion section.\n## Sep25 - Oct1, 2022\n  - [City-scale Incremental Neural Mapping with Three-layer Sampling and Panoptic Representation](https://arxiv.org/abs/2209.14072) | [code]\n    > Neural implicit representations are drawing a lot of attention from the robotics community recently, as they are expressive, continuous and compact. However, city-scale incremental implicit dense mapping based on sparse LiDAR input is still an under-explored challenge. To this end, we successfully build the first city-scale incremental neural mapping system with a panoptic representation that consists of both environment-level and instance-level modelling. Given a stream of sparse LiDAR point cloud, it maintains a dynamic generative model that maps 3D coordinates to signed distance field (SDF) values. To address the difficulty of representing geometric information at different levels in city-scale space, we propose a tailored three-layer sampling strategy to dynamically sample the global, local and near-surface domains. Meanwhile, to realize high fidelity mapping, category-specific prior is introduced to better model the geometric details, leading to a panoptic representation. We evaluate on the public SemanticKITTI dataset and demonstrate the significance of the newly proposed three-layer sampling strategy and panoptic representation, using both quantitative and qualitative results. 
Codes and data will be publicly available.\n  - [Orbeez-SLAM: A Real-time Monocular Visual SLAM with ORB Features and NeRF-realized Mapping](https://arxiv.org/abs/2209.13274) | [code]\n    > A spatial AI that can perform complex tasks through visual signals and cooperate with humans is highly anticipated. To achieve this, we need a visual SLAM that easily adapts to new scenes without pre-training and generates dense maps for downstream tasks in real-time. None of the previous learning-based and non-learning-based visual SLAMs satisfy all needs due to the intrinsic limitations of their components. In this work, we develop a visual SLAM named Orbeez-SLAM, which successfully collaborates with implicit neural representation (NeRF) and visual odometry to achieve our goals. Moreover, Orbeez-SLAM can work with the monocular camera since it only needs RGB inputs, making it widely applicable to the real world. We validate its effectiveness on various challenging benchmarks. Results show that our SLAM is up to 800x faster than the strong baseline with superior rendering outcomes.\n  - [Enforcing safety for vision-based controllers via Control Barrier Functions and Neural Radiance Fields](https://arxiv.org/abs/2209.12266) | [code]\n    > To navigate complex environments, robots must increasingly use high-dimensional visual feedback (e.g. images) for control. However, relying on high-dimensional image data to make control decisions raises important questions; particularly, how might we prove the safety of a visual-feedback controller? Control barrier functions (CBFs) are powerful tools for certifying the safety of feedback controllers in the state-feedback setting, but CBFs have traditionally been poorly-suited to visual feedback control due to the need to predict future observations in order to evaluate the barrier function. 
In this work, we solve this issue by leveraging recent advances in neural radiance fields (NeRFs), which learn implicit representations of 3D scenes and can render images from previously-unseen camera perspectives, to provide single-step visual foresight for a CBF-based controller. This novel combination is able to filter out unsafe actions and intervene to preserve safety. We demonstrate the effect of our controller in real-time simulation experiments where it successfully prevents the robot from taking dangerous actions.\n## Sep18 - Sep24, 2022\n  - [Local_INN: Implicit Map Representation and Localization with Invertible Neural Networks](https://arxiv.org/abs/2209.11925) | [code]\n    > Robot localization is an inverse problem of finding a robot's pose using a map and sensor measurements. In recent years, Invertible Neural Networks (INNs) have successfully solved ambiguous inverse problems in various fields. This paper proposes a framework that solves the localization problem with INN. We design an INN that provides implicit map representation in the forward path and localization in the inverse path. By sampling the latent space in evaluation, Local\\_INN outputs robot poses with covariance, which can be used to estimate the uncertainty. We show that the localization performance of Local\\_INN is on par with current methods with much lower latency. We show detailed 2D and 3D map reconstruction from Local\\_INN using poses exterior to the training set. We also provide a global localization algorithm using Local\\_INN to tackle the kidnapping problem.\n  - [NeRF-Loc: Transformer-Based Object Localization Within Neural Radiance Fields](https://arxiv.org/abs/2209.12068) | [code]\n    > Neural Radiance Fields (NeRFs) have been successfully used for scene representation. Recent works have also developed robotic navigation and manipulation systems using NeRF-based environment representations. 
As object localization is the foundation for many robotic applications, to further unleash the potential of NeRFs in robotic systems, we study object localization within a NeRF scene. We propose a transformer-based framework NeRF-Loc to extract 3D bounding boxes of objects in NeRF scenes. NeRF-Loc takes a pre-trained NeRF model and camera view as input, and produces labeled 3D bounding boxes of objects as output. Concretely, we design a pair of paralleled transformer encoder branches, namely the coarse stream and the fine stream, to encode both the context and details of target objects. The encoded features are then fused together with attention layers to alleviate ambiguities for accurate object localization. We have compared our method with the conventional transformer-based method and our method achieves better performance. In addition, we also present the first NeRF samples-based object localization benchmark NeRFLocBench.\n  - [How Does It Feel? Self-Supervised Costmap Learning for Off-Road Vehicle Traversability](https://arxiv.org/abs/2209.10788) | [code]\n    > Estimating terrain traversability in off-road environments requires reasoning about complex interaction dynamics between the robot and these terrains. However, it is challenging to build an accurate physics model, or create informative labels to learn a model in a supervised manner, for these interactions. We propose a method that learns to predict traversability costmaps by combining exteroceptive environmental information with proprioceptive terrain interaction feedback in a self-supervised manner. Additionally, we propose a novel way of incorporating robot velocity in the costmap prediction pipeline. We validate our method in multiple short and large-scale navigation tasks on a large, autonomous all-terrain vehicle (ATV) on challenging off-road terrains, and demonstrate ease of integration on a separate large ground robot. 
Our short-scale navigation results show that using our learned costmaps leads to overall smoother navigation, and provides the robot with a more fine-grained understanding of the interactions between the robot and different terrain types, such as grass and gravel. Our large-scale navigation trials show that we can reduce the number of interventions by up to 57% compared to an occupancy-based navigation baseline in challenging off-road courses ranging from 400 m to 3150 m.\n  - [Loc-NeRF: Monte Carlo Localization using Neural Radiance Fields](https://arxiv.org/abs/2209.09050) | [***``[code]``***](https://github.com/MIT-SPARK/Loc-NeRF)\n    > We present Loc-NeRF, a real-time vision-based robot localization approach that combines Monte Carlo localization and Neural Radiance Fields (NeRF). Our system uses a pre-trained NeRF model as the map of an environment and can localize itself in real-time using an RGB camera as the only exteroceptive sensor onboard the robot. While neural radiance fields have seen significant applications for visual rendering in computer vision and graphics, they have found limited use in robotics. Existing approaches for NeRF-based localization require both a good initial pose guess and significant computation, making them impractical for real-time robotics applications. By using Monte Carlo localization as a workhorse to estimate poses using a NeRF map model, Loc-NeRF is able to perform localization faster than the state of the art and without relying on an initial pose estimate. In addition to testing on synthetic data, we also run our system using real data collected by a Clearpath Jackal UGV and demonstrate for the first time the ability to perform real-time global localization with neural radiance fields. 
We make our code publicly available at this https URL.\n  - [MeSLAM: Memory Efficient SLAM based on Neural Fields, SMC2022](https://arxiv.org/abs/2209.09357) | [code]\n    > Existing Simultaneous Localization and Mapping (SLAM) approaches are limited in their scalability due to growing map size in long-term robot operation. Moreover, processing such maps for localization and planning tasks leads to the increased computational resources required onboard. To address the problem of memory consumption in long-term operation, we develop a novel real-time SLAM algorithm, MeSLAM, that is based on neural field implicit map representation. It combines the proposed global mapping strategy, including neural networks distribution and region tracking, with an external odometry system. As a result, the algorithm is able to efficiently train multiple networks representing different map regions and track poses accurately in large-scale environments. Experimental results show that the accuracy of the proposed approach is comparable to the state-of-the-art methods (on average, 6.6 cm on TUM RGB-D sequences) and outperforms the baseline, iMAP∗. Moreover, the proposed SLAM approach provides the most compact-sized maps without details distortion (1.9 MB to store 57 m3) among the state-of-the-art SLAM approaches.\n  - [LATITUDE: Robotic Global Localization with Truncated Dynamic Low-pass Filter in City-scale NeRF, ICRA2023](https://arxiv.org/abs/2209.08498) | [***``[code]``***](https://github.com/jike5/LATITUDE)\n    > Neural Radiance Fields (NeRFs) have made great success in representing complex 3D scenes with high-resolution details and efficient memory. Nevertheless, current NeRF-based pose estimators have no initial pose prediction and are prone to local optima during optimization. In this paper, we present LATITUDE: Global Localization with Truncated Dynamic Low-pass Filter, which introduces a two-stage localization mechanism in city-scale NeRF. 
In place recognition stage, we train a regressor through images generated from trained NeRFs, which provides an initial value for global localization. In pose optimization stage, we minimize the residual between the observed image and rendered image by directly optimizing the pose on tangent plane. To avoid convergence to local optimum, we introduce a Truncated Dynamic Low-pass Filter (TDLF) for coarse-to-fine pose registration. We evaluate our method on both synthetic and real-world data and show its potential applications for high-precision navigation in large-scale city scenes. Codes and data will be publicly available at this https URL.\n  - [Uncertainty Guided Policy for Active Robotic 3D Reconstruction using Neural Radiance Fields, RAL2022](https://arxiv.org/abs/2209.08409) | [code]\n    > In this paper, we tackle the problem of active robotic 3D reconstruction of an object. In particular, we study how a mobile robot with an arm-held camera can select a favorable number of views to recover an object's 3D shape efficiently. Contrary to the existing solution to this problem, we leverage the popular neural radiance fields-based object representation, which has recently shown impressive results for various computer vision tasks. However, it is not straightforward to directly reason about an object's explicit 3D geometric details using such a representation, making the next-best-view selection problem for dense 3D reconstruction challenging. This paper introduces a ray-based volumetric uncertainty estimator, which computes the entropy of the weight distribution of the color samples along each ray of the object's implicit neural representation. We show that it is possible to infer the uncertainty of the underlying 3D geometry given a novel view with the proposed estimator. We then present a next-best-view selection policy guided by the ray-based volumetric uncertainty in neural radiance fields-based representations. 
Encouraging experimental results on synthetic and real-world data suggest that the approach presented in this paper can enable a new research direction of using an implicit 3D object representation for the next-best-view problem in robot vision applications, distinguishing our approach from the existing approaches that rely on explicit 3D geometric modeling.\n## Sep11 - Sep17, 2022\n  - [iDF-SLAM: End-to-End RGB-D SLAM with Neural Implicit Mapping and Deep Feature Tracking](https://arxiv.org/abs/2209.07919) | [code]\n    > We propose a novel end-to-end RGB-D SLAM, iDF-SLAM, which adopts a feature-based deep neural tracker as the front-end and a NeRF-style neural implicit mapper as the back-end. The neural implicit mapper is trained on-the-fly, while though the neural tracker is pretrained on the ScanNet dataset, it is also finetuned along with the training of the neural implicit mapper. Under such a design, our iDF-SLAM is capable of learning to use scene-specific features for camera tracking, thus enabling lifelong learning of the SLAM system. Both the training for the tracker and the mapper are self-supervised without introducing ground truth poses. We test the performance of our iDF-SLAM on the Replica and ScanNet datasets and compare the results to the two recent NeRF-based neural SLAM systems. The proposed iDF-SLAM demonstrates state-of-the-art results in terms of scene reconstruction and competitive performance in camera tracking.\n## Sep4 - Sep10, 2022\n  - [PixTrack: Precise 6DoF Object Pose Tracking using NeRF Templates and Feature-metric Alignment](https://arxiv.org/abs/2209.03910) | [code]\n    > We present PixTrack, a vision based object pose tracking framework using novel view synthesis and deep feature-metric alignment. Our evaluations demonstrate that our method produces highly accurate, robust, and jitter-free 6DoF pose estimates of objects in RGB images without the need of any data annotation or trajectory smoothing. 
Our method is also computationally efficient making it easy to have multi-object tracking with no alteration to our method and just using CPU multiprocessing.\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n  - [SCONE: Surface Coverage Optimization in Unknown Environments by Volumetric Integration](https://arxiv.org/abs/2208.10449) | [code]\n    > Next Best View computation (NBV) is a long-standing problem in robotics, and consists in identifying the next most informative sensor position(s) for reconstructing a 3D object or scene efficiently and accurately. Like most current methods, we consider NBV prediction from a depth sensor. Learning-based methods relying on a volumetric representation of the scene are suitable for path planning, but do not scale well with the size of the scene and have lower accuracy than methods using a surface-based representation. However, the latter constrain the camera to a small number of poses. To obtain the advantages of both representations, we show that we can maximize surface metrics by Monte Carlo integration over a volumetric representation. Our method scales to large scenes and handles free camera motion: It takes as input an arbitrarily large point cloud gathered by a depth sensor like Lidar systems as well as camera poses to predict NBV. We demonstrate our approach on a novel dataset made of large and complex 3D scenes.\n## Aug14 - Aug20, 2022\n  - [The 8-Point Algorithm as an Inductive Bias for Relative Pose Prediction by ViTs, 3DV2022](https://arxiv.org/abs/2208.08988) | [***``[code]``***](https://github.com/crockwell/rel_pose)\n    > We present a simple baseline for directly estimating the relative pose (rotation and translation, including scale) between two images. Deep methods have recently shown strong progress but often require complex or multi-stage architectures. We show that a handful of modifications can be applied to a Vision Transformer (ViT) to bring its computations close to the Eight-Point Algorithm. 
This inductive bias enables a simple method to be competitive in multiple settings, often substantially improving over the state of the art with strong performance gains in limited data regimes.\n## Aug7 - Aug13, 2022\n  - [RelPose: Predicting Probabilistic Relative Rotation for Single Objects in the Wild, ECCV2022](https://jasonyzhang.com/relpose/) | [***``[code]``***](https://github.com/jasonyzhang/relpose)\n    > We describe a data-driven method for inferring the camera viewpoints given multiple images of an arbitrary object. This task is a core component of classic geometric pipelines such as SfM and SLAM, and also serves as a vital pre-processing requirement for contemporary neural approaches (e.g. NeRF) to object reconstruction and view synthesis. In contrast to existing correspondence-driven methods that do not perform well given sparse views, we propose a top-down prediction based approach for estimating camera viewpoints. Our key technical insight is the use of an energy-based formulation for representing distributions over relative camera rotations, thus allowing us to explicitly represent multiple camera modes arising from object symmetries or views. Leveraging these relative predictions, we jointly estimate a consistent set of camera rotations from multiple images. We show that our approach outperforms state-of-the-art SfM and SLAM methods given sparse images on both seen and unseen categories. Further, our probabilistic approach significantly outperforms directly regressing relative poses, suggesting that modeling multimodality is important for coherent joint reconstruction. We demonstrate that our system can be a stepping stone toward in-the-wild reconstruction from multi-view datasets. 
The project page with code and videos can be found at this https URL.\n## Jul31 - Aug6, 2022\n  - [PRIF: Primary Ray-based Implicit Function](https://research.google/pubs/pub51556/) | [code]\n    > We introduce a new implicit shape representation called Primary Ray-based Implicit Function (PRIF). In contrast to most existing approaches based on the signed distance function (SDF) which handles spatial locations, our representation operates on oriented rays. Specifically, PRIF is formulated to directly produce the surface hit point of a given input ray, without the expensive sphere-tracing operations, hence enabling efficient shape extraction and differentiable rendering. We demonstrate that neural networks trained to encode PRIF achieve successes in various tasks including single shape representation, category-wise shape generation, shape completion from sparse or noisy observations, inverse rendering for camera pose estimation, and neural rendering with color.\n## Jul24 - Jul30, 2022\n  - [ObjectFusion: Accurate object-level SLAM with neural object priors, Graphical Models, Volume 123, September 2022](https://www.sciencedirect.com/science/article/pii/S1524070322000418) | [code]\n    > Previous object-level Simultaneous Localization and Mapping (SLAM) approaches still fail to create high quality object-oriented 3D map in an efficient way. The main challenges come from how to represent the object shape effectively and how to apply such object representation to accurate online camera tracking efficiently. In this paper, we provide ObjectFusion as a novel object-level SLAM in static scenes which efficiently creates object-oriented 3D map with high-quality object reconstruction, by leveraging neural object priors. We propose a neural object representation with only a single encoder–decoder network to effectively express the object shape across various categories, which benefits high quality reconstruction of object instance. 
More importantly, we propose to convert such neural object representation as precise measurements to jointly optimize the object shape, object pose and camera pose for the final accurate 3D object reconstruction. With extensive evaluations on synthetic and real-world RGB-D datasets, we show that our ObjectFusion outperforms previous approaches, with better object reconstruction quality, using much less memory footprint, and in a more efficient way, especially at the object level.\n  - [Neural Density-Distance Fields, ECCV2022](https://arxiv.org/abs/2207.14455) | [***``[code]``***](https://ueda0319.github.io/neddf/)\n    > The success of neural fields for 3D vision tasks is now indisputable. Following this trend, several methods aiming for visual localization (e.g., SLAM) have been proposed to estimate distance or density fields using neural fields. However, it is difficult to achieve high localization performance by only density fields-based methods such as Neural Radiance Field (NeRF) since they do not provide density gradient in most empty regions. On the other hand, distance field-based methods such as Neural Implicit Surface (NeuS) have limitations in objects' surface shapes. This paper proposes Neural Density-Distance Field (NeDDF), a novel 3D representation that reciprocally constrains the distance and density fields. We extend distance field formulation to shapes with no explicit boundary surface, such as fur or smoke, which enable explicit conversion from distance field to density field. Consistent distance and density fields realized by explicit conversion enable both robustness to initial values and high-quality registration. Furthermore, the consistency between fields allows fast convergence from sparse point clouds. Experiments show that NeDDF can achieve high localization performance while providing comparable results to NeRF on novel view synthesis. 
The code is available at this https URL.\n  - [ShAPO: Implicit Representations for Multi-Object Shape, Appearance, and Pose Optimization, ECCV2022](https://arxiv.org/abs/2207.13691) | [***``[code]``***](https://zubair-irshad.github.io/projects/ShAPO.html)\n    > Our method studies the complex task of object-centric 3D understanding from a single RGB-D observation. As it is an ill-posed problem, existing methods suffer from low performance for both 3D shape and 6D pose and size estimation in complex multi-object scenarios with occlusions. We present ShAPO, a method for joint multi-object detection, 3D textured reconstruction, 6D object pose and size estimation. Key to ShAPO is a single-shot pipeline to regress shape, appearance and pose latent codes along with the masks of each object instance, which is then further refined in a sparse-to-dense fashion. A novel disentangled shape and appearance database of priors is first learned to embed objects in their respective shape and appearance space. We also propose a novel, octree-based differentiable optimization step, allowing us to further improve object shape, pose and appearance simultaneously under the learned latent space, in an analysis-by-synthesis fashion. Our novel joint implicit textured object representation allows us to accurately identify and reconstruct novel unseen objects without having access to their 3D meshes. Through extensive experiments, we show that our method, trained on simulated indoor scenes, accurately regresses the shape, appearance and pose of novel objects in the real-world with minimal fine-tuning. 
Our method significantly out-performs all baselines on the NOCS dataset with an 8% absolute improvement in mAP for 6D pose estimation.\n  - [GAUDI: A Neural Architect for Immersive 3D Scene Generation](https://arxiv.org/abs/2207.13751) | [***``[code]``***](https://github.com/apple/ml-gaudi)\n    > We introduce GAUDI, a generative model capable of capturing the distribution of complex and realistic 3D scenes that can be rendered immersively from a moving camera. We tackle this challenging problem with a scalable yet powerful approach, where we first optimize a latent representation that disentangles radiance fields and camera poses. This latent representation is then used to learn a generative model that enables both unconditional and conditional generation of 3D scenes. Our model generalizes previous works that focus on single objects by removing the assumption that the camera pose distribution can be shared across samples. We show that GAUDI obtains state-of-the-art performance in the unconditional generative setting across multiple datasets and allows for conditional generation of 3D scenes given conditioning variables like sparse image observations or text that describes the scene.\n  - [AlignSDF: Pose-Aligned Signed Distance Fields for Hand-Object Reconstruction, ECCV2022](https://arxiv.org/abs/2207.12909) | [***``[code]``***](https://zerchen.github.io/projects/alignsdf.html)\n    > Recent work achieved impressive progress towards joint reconstruction of hands and manipulated objects from monocular color images. Existing methods focus on two alternative representations in terms of either parametric meshes or signed distance fields (SDFs). On one side, parametric models can benefit from prior knowledge at the cost of limited shape deformations and mesh resolutions. Mesh models, hence, may fail to precisely reconstruct details such as contact surfaces of hands and objects. 
SDF-based methods, on the other side, can represent arbitrary details but are lacking explicit priors. In this work we aim to improve SDF models using priors provided by parametric representations. In particular, we propose a joint learning framework that disentangles the pose and the shape. We obtain hand and object poses from parametric models and use them to align SDFs in 3D space. We show that such aligned SDFs better focus on reconstructing shape details and improve reconstruction accuracy both for hands and objects. We evaluate our method and demonstrate significant improvements over the state of the art on the challenging ObMan and DexYCB benchmarks.\n## Previous weeks\n  - [NeRF in the Wild: Neural Radiance Fields for Unconstrained Photo Collections, CVPR2021](https://arxiv.org/abs/2008.02268) | [code]\n    > We present a learning-based method for synthesizing novel views of complex scenes using only unstructured collections of in-the-wild photographs. We build on Neural Radiance Fields (NeRF), which uses the weights of a multilayer perceptron to model the density and color of a scene as a function of 3D coordinates. While NeRF works well on images of static subjects captured under controlled settings, it is incapable of modeling many ubiquitous, real-world phenomena in uncontrolled images, such as variable illumination or transient occluders. We introduce a series of extensions to NeRF to address these issues, thereby enabling accurate reconstructions from unstructured image collections taken from the internet. 
We apply our system, dubbed NeRF-W, to internet photo collections of famous landmarks, and demonstrate temporally consistent novel view renderings that are significantly closer to photorealism than the prior state of the art.\n  - [Ha-NeRF: Hallucinated Neural Radiance Fields in the Wild, CVPR2022](https://rover-xingyu.github.io/Ha-NeRF/) | [***``[code]``***](https://github.com/rover-xingyu/Ha-NeRF)\n    > Neural Radiance Fields (NeRF) has recently gained popularity for its impressive novel view synthesis ability. This paper studies the problem of hallucinated NeRF: i.e., recovering a realistic NeRF at a different time of day from a group of tourism images. Existing solutions adopt NeRF with a controllable appearance embedding to render novel views under various conditions, but they cannot render view-consistent images with an unseen appearance. To solve this problem, we present an end-to-end framework for constructing a hallucinated NeRF, dubbed as Ha-NeRF. Specifically, we propose an appearance hallucination module to handle time-varying appearances and transfer them to novel views. Considering the complex occlusions of tourism images, we introduce an anti-occlusion module to decompose the static subjects for visibility accurately. Experimental results on synthetic data and real tourism photo collections demonstrate that our method can hallucinate the desired appearances and render occlusion-free images from different views.\n  - [Nerfies: Deformable Neural Radiance Fields, ICCV2021](https://arxiv.org/abs/2011.12948) | [code]\n    > We present the first method capable of photorealistically reconstructing deformable scenes using photos/videos captured casually from mobile phones. Our approach augments neural radiance fields (NeRF) by optimizing an additional continuous volumetric deformation field that warps each observed point into a canonical 5D NeRF. 
We observe that these NeRF-like deformation fields are prone to local minima, and propose a coarse-to-fine optimization method for coordinate-based models that allows for more robust optimization. By adapting principles from geometry processing and physical simulation to NeRF-like models, we propose an elastic regularization of the deformation field that further improves robustness. We show that our method can turn casually captured selfie photos/videos into deformable NeRF models that allow for photorealistic renderings of the subject from arbitrary viewpoints, which we dub \"nerfies.\" We evaluate our method by collecting time-synchronized data using a rig with two mobile phones, yielding train/validation images of the same pose at different viewpoints. We show that our method faithfully reconstructs non-rigidly deforming scenes and reproduces unseen views with high fidelity.\n  - [Dynamic Neural Radiance Fields for Monocular 4D Facial Avatar Reconstruction, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > We present dynamic neural radiance fields for modeling the appearance and dynamics of a human face. Digitally modeling and reconstructing a talking human is a key building-block for a variety of applications. Especially, for telepresence applications in AR or VR, a faithful reproduction of the appearance including novel viewpoint or head-poses is required. In contrast to state-of-the-art approaches that model the geometry and material properties explicitly, or are purely image-based, we introduce an implicit representation of the head based on scene representation networks. To handle the dynamics of the face, we combine our scene representation network with a low-dimensional morphable model which provides explicit control over pose and expressions. 
We use volumetric rendering to generate images from this hybrid representation and demonstrate that such a dynamic neural scene representation can be learned from monocular input data only, without the need of a specialized capture setup. In our experiments, we show that this learned volumetric representation allows for photo-realistic image generation that surpasses the quality of state-of-the-art video-based reenactment methods.\n  - [Neural Articulated Radiance Field, ICCV2021](https://arxiv.org/abs/2104.03110) | [***``[code]``***](https://github.com/nogu-atsu/NARF#code)\n    > We present Neural Articulated Radiance Field (NARF), a novel deformable 3D representation for articulated objects learned from images. While recent advances in 3D implicit representation have made it possible to learn models of complex objects, learning pose-controllable representations of articulated objects remains a challenge, as current methods require 3D shape supervision and are unable to render appearance. In formulating an implicit representation of 3D articulated objects, our method considers only the rigid transformation of the most relevant object part in solving for the radiance field at each 3D location. In this way, the proposed method represents pose-dependent changes without significantly increasing the computational complexity. NARF is fully differentiable and can be trained from images with pose annotations. Moreover, through the use of an autoencoder, it can learn appearance variations over multiple instances of an object class. 
Experiments show that the proposed method is efficient and can generalize well to novel poses.\n  - [Neural Actor: Neural Free-view Synthesis of Human Actors with Pose Control, SIGGRAPH Asia 2021](https://vcai.mpi-inf.mpg.de/projects/NeuralActor/) | [***``[code]``***](https://people.mpi-inf.mpg.de/~lliu/projects/NeuralActor/)\n    > We propose Neural Actor (NA), a new method for high-quality synthesis of humans from arbitrary viewpoints and under arbitrary controllable poses. Our method is built upon recent neural scene representation and rendering works which learn representations of geometry and appearance from only 2D images. While existing works demonstrated compelling rendering of static scenes and playback of dynamic scenes, photo-realistic reconstruction and rendering of humans with neural implicit methods, in particular under user-controlled novel poses, is still difficult. To address this problem, we utilize a coarse body model as the proxy to unwarp the surrounding 3D space into a canonical pose. A neural radiance field learns pose-dependent geometric deformations and pose- and view-dependent appearance effects in the canonical space from multi-view video input. To synthesize novel views of high fidelity dynamic geometry and appearance, we leverage 2D texture maps defined on the body model as latent variables for predicting residual deformations and the dynamic appearance. Experiments demonstrate that our method achieves better quality than the state-of-the-arts on playback as well as novel pose synthesis, and can even generalize well to new poses that starkly differ from the training poses. 
Furthermore, our method also supports body shape control of the synthesized results.\n  - [iNeRF: Inverting Neural Radiance Fields for Pose Estimation, IROS2021](http://yenchenlin.me/inerf/) | [***``[code]``***](https://github.com/yenchenlin/iNeRF-public)\n    > We present iNeRF, a framework that performs pose estimation by “inverting” a trained Neural Radiance Field (NeRF). NeRFs have been shown to be remarkably effective for the task of view synthesis — synthesizing photorealistic novel views of real-world scenes or objects. In this work, we investigate whether we can apply analysis-by-synthesis with NeRF for 6DoF pose estimation – given an image, find the translation and rotation of a camera relative to a 3D model. Starting from an initial pose estimate, we use gradient descent to minimize the residual between pixels rendered from an already-trained NeRF and pixels in an observed image. In our experiments, we first study 1) how to sample rays during pose refinement for iNeRF to collect informative gradients and 2) how different batch sizes of rays affect iNeRF on a synthetic dataset. We then show that for complex real-world scenes from the LLFF dataset, iNeRF can improve NeRF by estimating the camera poses of novel images and using these images as additional training data for NeRF. Finally, we show iNeRF can be combined with feature-based pose initialization. The approach outperforms all other RGB-based methods relying on synthetic data on LineMOD.\n  - [A-NeRF: Surface-free Human 3D Pose Refinement via Neural Rendering, NeurIPS2021](https://arxiv.org/abs/2102.06199) | [***``[code]``***](https://github.com/LemonATsu/A-NeRF)\n    > While deep learning reshaped the classical motion capture pipeline with feed-forward networks, generative models are required to recover fine alignment via iterative refinement. Unfortunately, the existing models are usually hand-crafted or learned in controlled conditions, only applicable to limited domains. 
We propose a method to learn a generative neural body model from unlabelled monocular videos by extending Neural Radiance Fields (NeRFs). We equip them with a skeleton to apply to time-varying and articulated motion. A key insight is that implicit models require the inverse of the forward kinematics used in explicit surface models. Our reparameterization defines spatial latent variables relative to the pose of body parts and thereby overcomes ill-posed inverse operations with an overparameterization. This enables learning volumetric body shape and appearance from scratch while jointly refining the articulated pose; all without ground truth labels for appearance, pose, or 3D shape on the input videos. When used for novel-view-synthesis and motion capture, our neural model improves accuracy on diverse datasets. Project website: this https URL .\n  - [NeRF--: Neural Radiance Fields Without Known Camera Parameters](https://nerfmm.active.vision/) | [***``[code]``***](https://github.com/ActiveVisionLab/nerfmm)\n    > Considering the problem of novel view synthesis (NVS) from only a set of 2D images, we simplify the training process of Neural Radiance Field (NeRF) on forward-facing scenes by removing the requirement of known or pre-computed camera parameters, including both intrinsics and 6DoF poses. 
To this end, we propose NeRF−−, with three contributions: First, we show that the camera parameters can be jointly optimised as learnable parameters with NeRF training, through a photometric reconstruction; Second, to benchmark the camera parameter estimation and the quality of novel view renderings, we introduce a new dataset of path-traced synthetic scenes, termed as Blender Forward-Facing Dataset (BLEFF); Third, we conduct extensive analyses to understand the training behaviours under various camera motions, and show that in most scenarios, the joint optimisation pipeline can recover accurate camera parameters and achieve comparable novel view synthesis quality as those trained with COLMAP pre-computed camera parameters.\n  - [iMAP: Implicit Mapping and Positioning in Real-Time, ICCV2021](https://arxiv.org/abs/2103.12352) | [code]\n    > We show for the first time that a multilayer perceptron (MLP) can serve as the only scene representation in a real-time SLAM system for a handheld RGB-D camera. Our network is trained in live operation without prior data, building a dense, scene-specific implicit 3D model of occupancy and colour which is also immediately used for tracking.\n  - [NICE-SLAM: Neural Implicit Scalable Encoding for SLAM, CVPR2022](https://arxiv.org/abs/2112.12130) | [***``[code]``***](https://github.com/cvg/nice-slam)\n    > Neural implicit representations have recently shown encouraging results in various domains, including promising progress in simultaneous localization and mapping (SLAM). Nevertheless, existing methods produce over-smoothed scene reconstructions and have difficulty scaling up to large scenes. These limitations are mainly due to their simple fully-connected network architecture that does not incorporate local information in the observations. In this paper, we present NICE-SLAM, a dense SLAM system that incorporates multi-level local information by introducing a hierarchical scene representation. 
Optimizing this representation with pre-trained geometric priors enables detailed reconstruction on large indoor scenes. Compared to recent neural implicit SLAM systems, our approach is more scalable, efficient, and robust. Experiments on five challenging datasets demonstrate competitive results of NICE-SLAM in both mapping and tracking quality.\n  - [GNeRF: GAN-based Neural Radiance Field without Posed Camera, ICCV2021(oral)](https://arxiv.org/abs/2103.15606) | [code]\n    > We introduce GNeRF, a framework to marry Generative Adversarial Networks (GAN) with Neural Radiance Field (NeRF) reconstruction for the complex scenarios with unknown and even randomly initialized camera poses. Recent NeRF-based advances have gained popularity for remarkable realistic novel view synthesis. However, most of them heavily rely on accurate camera poses estimation, while few recent methods can only optimize the unknown camera poses in roughly forward-facing scenes with relatively short camera trajectories and require rough camera poses initialization. Differently, our GNeRF only utilizes randomly initialized poses for complex outside-in scenarios. We propose a novel two-phases end-to-end framework. The first phase takes the use of GANs into the new realm for optimizing coarse camera poses and radiance fields jointly, while the second phase refines them with additional photometric loss. We overcome local minima using a hybrid and iterative optimization scheme. Extensive experiments on a variety of synthetic and natural scenes demonstrate the effectiveness of GNeRF. 
More impressively, our approach outperforms the baselines favorably in those scenes with repeated patterns or even low textures that are regarded as extremely challenging before.\n  - [BARF: Bundle-Adjusting Neural Radiance Fields, ICCV2021(oral)](https://chenhsuanlin.bitbucket.io/bundle-adjusting-NeRF/) | [***``[code]``***](https://github.com/chenhsuanlin/bundle-adjusting-NeRF)\n    > Neural Radiance Fields (NeRF) have recently gained a surge of interest within the computer vision community for its power to synthesize photorealistic novel views of real-world scenes. One limitation of NeRF, however, is its requirement of accurate camera poses to learn the scene representations. In this paper, we propose Bundle-Adjusting Neural Radiance Fields (BARF) for training NeRF from imperfect (or even unknown) camera poses — the joint problem of learning neural 3D representations and registering camera frames. We establish a theoretical connection to classical image alignment and show that coarse-to-fine registration is also applicable to NeRF. Furthermore, we show that naively applying positional encoding in NeRF has a negative impact on registration with a synthesis-based objective. Experiments on synthetic and real-world data show that BARF can effectively optimize the neural scene representations and resolve large camera pose misalignment at the same time. This enables view synthesis and localization of video sequences from unknown camera poses, opening up new avenues for visual localization systems (e.g. SLAM) and potential applications for dense 3D mapping and reconstruction.\n  - [Self-Calibrating Neural Radiance Fields, ICCV2021](https://postech-cvlab.github.io/SCNeRF/) | [***``[code]``***](https://github.com/POSTECH-CVLab/SCNeRF)\n    > In this work, we propose a camera self-calibration algorithm for generic cameras with arbitrary non-linear distortions. We jointly learn the geometry of the scene and the accurate camera parameters without any calibration objects. 
Our camera model consists of a pinhole model, radial distortion, and a generic noise model that can learn arbitrary non-linear camera distortions. While traditional self-calibration algorithms mostly rely on geometric constraints, we additionally incorporate photometric consistency. This requires learning the geometry of the scene and we use Neural Radiance Fields (NeRF). We also propose a new geometric loss function, viz., projected ray distance loss, to incorporate geometric consistency for complex non-linear camera models. We validate our approach on standard real image datasets and demonstrate our model can learn the camera intrinsics and extrinsics (pose) from scratch without COLMAP initialization. Also, we show that learning accurate camera models in a differentiable manner allows us to improve PSNR over NeRF. We experimentally demonstrate that our proposed method is applicable to variants of NeRF. In addition, we use a set of images captured with a fish-eye lens to demonstrate that learning the camera model jointly improves the performance significantly over the COLMAP initialization.\n  - [Neural Scene Graphs for Dynamic Scenes, CVPR2021(oral)](https://arxiv.org/abs/2011.10379) | [***``[code]``***](https://github.com/princeton-computational-imaging/neural-scene-graphs)\n    > Recent implicit neural rendering methods have demonstrated that it is possible to learn accurate view synthesis for complex scenes by predicting their volumetric density and color supervised solely by a set of RGB images. However, existing methods are restricted to learning efficient representations of static scenes that encode all scene objects into a single neural network, and lack the ability to represent dynamic scenes and decompositions into individual scene objects. In this work, we present the first neural rendering method that decomposes dynamic scenes into scene graphs. 
We propose a learned scene graph representation, which encodes object transformation and radiance, to efficiently render novel arrangements and views of the scene. To this end, we learn implicitly encoded scenes, combined with a jointly learned latent representation to describe objects with a single implicit function. We assess the proposed method on synthetic and real automotive data, validating that our approach learns dynamic scenes -- only by observing a video of this scene -- and allows for rendering novel photo-realistic views of novel scene compositions with unseen sets of objects at unseen poses.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/reconstruction.md",
    "content": "\nWeekly Classified Neural Radiance Fields - reconstruction ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n==========================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n  - [ResNeRF: Geometry-Guided Residual Neural Radiance Field for Indoor Scene Novel View Synthesis](https://arxiv.org/abs/2211.16211) | [code]\n    > We represent the ResNeRF, a novel geometry-guided two-stage framework for indoor scene novel view synthesis. Be aware of that a good geometry would greatly boost the performance of novel view synthesis, and to avoid the geometry ambiguity issue, we propose to characterize the density distribution of the scene based on a base density estimated from scene geometry and a residual density parameterized by the geometry. In the first stage, we focus on geometry reconstruction based on SDF representation, which would lead to a good geometry surface of the scene and also a sharp density. In the second stage, the residual density is learned based on the SDF learned in the first stage for encoding more details about the appearance. In this way, our method can better learn the density distribution with the geometry prior for high-fidelity novel view synthesis while preserving the 3D structures. 
Experiments on large-scale indoor scenes with many less-observed and textureless areas show that with the good 3D surface, our method achieves state-of-the-art performance for novel view synthesis.\n  - [Recovering Fine Details for Neural Implicit Surface Reconstruction](https://arxiv.org/abs/2211.11320) | [code]\n    > Recent works on implicit neural representations have made significant strides. Learning implicit neural surfaces using volume rendering has gained popularity in multi-view reconstruction without 3D supervision. However, accurately recovering fine details is still challenging, due to the underlying ambiguity of geometry and appearance representation. In this paper, we present D-NeuS, a volume rendering-based neural implicit surface reconstruction method capable of recovering fine geometry details, which extends NeuS by two additional loss functions targeting enhanced reconstruction quality. First, we encourage the rendered surface points from alpha compositing to have zero signed distance values, alleviating the geometry bias arising from transforming SDF to density for volume rendering. Second, we impose multi-view feature consistency on the surface points, derived by interpolating SDF zero-crossings from sampled points along rays. Extensive quantitative and qualitative results demonstrate that our method reconstructs high-accuracy surfaces with details, and outperforms the state of the art.\n## Nov13 - Nov19, 2022\n  - [Magic3D: High-Resolution Text-to-3D Content Creation](https://arxiv.org/abs/2211.10440) | [code]\n    > DreamFusion has recently demonstrated the utility of a pre-trained text-to-image diffusion model to optimize Neural Radiance Fields (NeRF), achieving remarkable text-to-3D synthesis results. However, the method has two inherent limitations: (a) extremely slow optimization of NeRF and (b) low-resolution image space supervision on NeRF, leading to low-quality 3D models with a long processing time. 
In this paper, we address these limitations by utilizing a two-stage optimization framework. First, we obtain a coarse model using a low-resolution diffusion prior and accelerate with a sparse 3D hash grid structure. Using the coarse representation as the initialization, we further optimize a textured 3D mesh model with an efficient differentiable renderer interacting with a high-resolution latent diffusion model. Our method, dubbed Magic3D, can create high quality 3D mesh models in 40 minutes, which is 2x faster than DreamFusion (reportedly taking 1.5 hours on average), while also achieving higher resolution. User studies show 61.7% raters to prefer our approach over DreamFusion. Together with the image-conditioned generation capabilities, we provide users with new ways to control 3D synthesis, opening up new avenues to various creative applications.\n  - [Latent-NeRF for Shape-Guided Generation of 3D Shapes and Textures](https://arxiv.org/abs/2211.07600) | [code]\n    > Text-guided image generation has progressed rapidly in recent years, inspiring major breakthroughs in text-guided shape generation. Recently, it has been shown that using score distillation, one can successfully text-guide a NeRF model to generate a 3D object. We adapt the score distillation to the publicly available, and computationally efficient, Latent Diffusion Models, which apply the entire diffusion process in a compact latent space of a pretrained autoencoder. As NeRFs operate in image space, a naive solution for guiding them with latent score distillation would require encoding to the latent space at each guidance step. Instead, we propose to bring the NeRF to the latent space, resulting in a Latent-NeRF. Analyzing our Latent-NeRF, we show that while Text-to-3D models can generate impressive results, they are inherently unconstrained and may lack the ability to guide or enforce a specific 3D structure. 
To assist and direct the 3D generation, we propose to guide our Latent-NeRF using a Sketch-Shape: an abstract geometry that defines the coarse structure of the desired object. Then, we present means to integrate such a constraint directly into a Latent-NeRF. This unique combination of text and shape guidance allows for increased control over the generation process. We also show that latent score distillation can be successfully applied directly on 3D meshes. This allows for generating high-quality textures on a given geometry. Our experiments validate the power of our different forms of guidance and the efficiency of using latent rendering. Implementation is available at this https URL\n## Nov6 - Nov12, 2022\n  - [Directed Ray Distance Functions for 3D Scene Reconstruction, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-20086-1_12) | [code]\n    > We present an approach for full 3D scene reconstruction from a single unseen image. We trained on a dataset of realistic non-watertight scans of scenes. Our approach uses a predicted distance function, since these have shown promise in handling complex topologies and large spaces. We identify and analyze two key challenges for predicting such image conditioned distance functions that have prevented their success on real 3D scene data. First, we show that predicting a conventional scene distance from an image requires reasoning over a large receptive field. Second, we analytically show that the optimal output of the network trained to predict these distance functions does not obey all the distance function properties. We propose an alternate distance function, the Directed Ray Distance Function (DRDF), that tackles both challenges. We show that a deep network trained to predict DRDFs outperforms all other methods quantitatively and qualitatively on 3D reconstruction from a single image on Matterport3D, 3DFront, and ScanNet. 
(Project Page: https://nileshkulkarni.github.io/scene_drdf)\n  - [Common Pets in 3D: Dynamic New-View Synthesis of Real-Life Deformable Categories](https://arxiv.org/abs/2211.03889) | [code]\n    > Obtaining photorealistic reconstructions of objects from sparse views is inherently ambiguous and can only be achieved by learning suitable reconstruction priors. Earlier works on sparse rigid object reconstruction successfully learned such priors from large datasets such as CO3D. In this paper, we extend this approach to dynamic objects. We use cats and dogs as a representative example and introduce Common Pets in 3D (CoP3D), a collection of crowd-sourced videos showing around 4,200 distinct pets. CoP3D is one of the first large-scale datasets for benchmarking non-rigid 3D reconstruction \"in the wild\". We also propose Tracker-NeRF, a method for learning 4D reconstruction from our dataset. At test time, given a small number of video frames of an unseen object, Tracker-NeRF predicts the trajectories of its 3D points and generates new views, interpolating viewpoint and time. Results on CoP3D reveal significantly better non-rigid new-view synthesis performance than existing baselines.\n## Oct30 - Nov5, 2022\n  - [HyperSound: Generating Implicit Neural Representations of Audio Signals with Hypernetworks](https://arxiv.org/abs/2211.01839) | [code]\n    > Implicit neural representations (INRs) are a rapidly growing research field, which provides alternative ways to represent multimedia signals. Recent applications of INRs include image super-resolution, compression of high-dimensional signals, or 3D rendering. However, these solutions usually focus on visual data, and adapting them to the audio domain is not trivial. Moreover, it requires a separately trained model for every data sample. To address this limitation, we propose HyperSound, a meta-learning method leveraging hypernetworks to produce INRs for audio signals unseen at training time. 
We show that our approach can reconstruct sound waves with quality comparable to other state-of-the-art models.\n  - [Learning Neural Implicit Representations with Surface Signal Parameterizations](https://arxiv.org/abs/2211.00519) | [code]\n    > Neural implicit surface representations have recently emerged as popular alternative to explicit 3D object encodings, such as polygonal meshes, tabulated points, or voxels. While significant work has improved the geometric fidelity of these representations, much less attention is given to their final appearance. Traditional explicit object representations commonly couple the 3D shape data with auxiliary surface-mapped image data, such as diffuse color textures and fine-scale geometric details in normal maps that typically require a mapping of the 3D surface onto a plane, i.e., a surface parameterization; implicit representations, on the other hand, cannot be easily textured due to lack of configurable surface parameterization. Inspired by this digital content authoring methodology, we design a neural network architecture that implicitly encodes the underlying surface parameterization suitable for appearance data. As such, our model remains compatible with existing mesh-based digital content with appearance data. Motivated by recent work that overfits compact networks to individual 3D objects, we present a new weight-encoded neural implicit representation that extends the capability of neural implicit surfaces to enable various common and important applications of texture mapping. Our method outperforms reasonable baselines and state-of-the-art alternatives.\n  - [gCoRF: Generative Compositional Radiance Fields, 3DV2022](https://vcai.mpi-inf.mpg.de/projects/gCoRF/) | [code]\n    > 3D generative models of objects enable photorealistic image synthesis with 3D control. Existing methods model the scene as a global scene representation, ignoring the compositional aspect of the scene. 
Compositional reasoning can enable a wide variety of editing applications, in addition to enabling generalizable 3D reasoning. In this paper, we present a compositional generative model, where each semantic part of the object is represented as an independent 3D representation learnt from only in-the-wild 2D data. We start with a global generative model (GAN) and learn to decompose it into different semantic parts using supervision from 2D segmentation masks. We then learn to composite independently sampled parts in order to create coherent global scenes. Different parts can be independently sampled, while keeping rest of the object fixed. We evaluate our method on a wide variety of objects and parts, and demonstrate editing applications.\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n  - [Neural Contact Fields: Tracking Extrinsic Contact with Tactile Sensing](https://arxiv.org/abs/2210.09297) | [code]\n    > We present Neural Contact Fields, a method that brings together neural fields and tactile sensing to address the problem of tracking extrinsic contact between object and environment. Knowing where the external contact occurs is a first step towards methods that can actively control it in facilitating downstream manipulation tasks. Prior work for localizing environmental contacts typically assume a contact type (e.g. point or line), does not capture contact/no-contact transitions, and only works with basic geometric-shaped objects. Neural Contact Fields are the first method that can track arbitrary multi-modal extrinsic contacts without making any assumptions about the contact type. Our key insight is to estimate the probability of contact for any 3D point in the latent space of object shapes, given vision-based tactile inputs that sense the local motion resulting from the external contact. 
In experiments, we find that Neural Contact Fields are able to localize multiple contact patches without making any assumptions about the geometry of the contact, and capture contact/no-contact transitions for known categories of objects with unseen shapes in unseen environment configurations. In addition to Neural Contact Fields, we also release our YCB-Extrinsic-Contact dataset of simulated extrinsic contact interactions to enable further research in this area. Project repository: this https URL\n  - [S3-NeRF: Neural Reflectance Field from Shading and Shadow under a Single Viewpoint, NeurIPS2022](https://arxiv.org/abs/2210.08936) | [***``[code]``***](https://github.com/ywq/s3nerf)\n    > In this paper, we address the \"dual problem\" of multi-view scene reconstruction in which we utilize single-view images captured under different point lights to learn a neural scene representation. Different from existing single-view methods which can only recover a 2.5D scene representation (i.e., a normal / depth map for the visible surface), our method learns a neural reflectance field to represent the 3D geometry and BRDFs of a scene. Instead of relying on multi-view photo-consistency, our method exploits two information-rich monocular cues, namely shading and shadow, to infer scene geometry. Experiments on multiple challenging datasets show that our method is capable of recovering 3D geometry, including both visible and invisible parts, of a scene from single-view images. Thanks to the neural reflectance field representation, our method is robust to depth discontinuities. It supports applications like novel-view synthesis and relighting. Our code and model can be found at this https URL.\n## Oct9 - Oct15, 2022\n  - [Multi-View Photometric Stereo Revisited, WACV2023](https://arxiv.org/abs/2210.07670) | [code]\n    > Multi-view photometric stereo (MVPS) is a preferred method for detailed and precise 3D acquisition of an object from images. 
Although popular methods for MVPS can provide outstanding results, they are often complex to execute and limited to isotropic material objects. To address such limitations, we present a simple, practical approach to MVPS, which works well for isotropic as well as other object material types such as anisotropic and glossy. The proposed approach in this paper exploits the benefit of uncertainty modeling in a deep neural network for a reliable fusion of photometric stereo (PS) and multi-view stereo (MVS) network predictions. Yet, contrary to the recently proposed state-of-the-art, we introduce neural volume rendering methodology for a trustworthy fusion of MVS and PS measurements. The advantage of introducing neural volume rendering is that it helps in the reliable modeling of objects with diverse material types, where existing MVS methods, PS methods, or both may fail. Furthermore, it allows us to work on neural 3D shape representation, which has recently shown outstanding results for many geometric processing tasks. Our suggested new loss function aims to fit the zero level set of the implicit neural function using the most certain MVS and PS network predictions coupled with weighted neural volume rendering cost. The proposed approach shows state-of-the-art results when tested extensively on several benchmark datasets.\n  - [MVSPlenOctree: Fast and Generic Reconstruction of Radiance Fields in PlenOctree from Multi-view Stereo, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547795) | [code]\n    > We present MVSPlenOctree, a novel approach that can efficiently reconstruct radiance fields for view synthesis. Unlike previous scene-specific radiance fields reconstruction methods, we present a generic pipeline that can efficiently reconstruct 360-degree-renderable radiance fields via multi-view stereo (MVS) inference from tens of sparse-spread out images. 
Our approach leverages variance-based statistic features for MVS inference, and combines this with image based rendering and volume rendering for radiance field reconstruction. We first train a MVS Machine for reasoning scene's density and appearance. Then, based on the spatial hierarchy of the PlenOctree and coarse-to-fine dense sampling mechanism, we design a robust and efficient sampling strategy for PlenOctree reconstruction, which handles occlusion robustly. A 360-degree-renderable radiance field can be reconstructed in PlenOctree from MVS Machine in an efficient single forward pass. We trained our method on real-world DTU, LLFF datasets, and synthetic datasets. We validate its generalizability by evaluating on the test set of DTU dataset which is unseen in training. In summary, our radiance field reconstruction method is both efficient and generic, a coarse 360-degree-renderable radiance field can be reconstructed in seconds and a dense one within minutes. Please visit the project page for more details: https://derry-xing.github.io/projects/MVSPlenOctree.\n  - [ParseMVS: Learning Primitive-aware Surface Representations for Sparse Multi-view Stereopsis, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547920) | [code]\n    > Multi-view stereopsis (MVS) recovers 3D surfaces by finding dense photo-consistent correspondences from densely sampled images. In this paper, we tackle the challenging MVS task from sparsely sampled views (up to an order of magnitude fewer images), which is more practical and cost-efficient in applications. The major challenge comes from the significant correspondence ambiguity introduced by the severe occlusions and the highly skewed patches. On the other hand, such ambiguity can be resolved by incorporating geometric cues from the global structure. In light of this, we propose ParseMVS, boosting sparse MVS by learning the Primitive-AwaRe Surface rEpresentation. 
In particular, on top of being aware of global structure, our novel representation further allows for the preservation of fine details including geometry, texture, and visibility. More specifically, the whole scene is parsed into multiple geometric primitives. On each of them, the geometry is defined as the displacement along the primitives' normal directions, together with the texture and visibility along each view direction. An unsupervised neural network is trained to learn these factors by progressively increasing the photo-consistency and render-consistency among all input images. Since the surface properties are changed locally in the 2D space of each primitive, ParseMVS can preserve global primitive structures while optimizing local details, handling the 'incompleteness' and the 'inaccuracy' problems. We experimentally demonstrate that ParseMVS constantly outperforms the state-of-the-art surface reconstruction method in both completeness and the overall score under varying sampling sparsity, especially under the extreme sparse-MVS settings. Beyond that, ParseMVS also shows great potential in compression, robustness, and efficiency.\n  - [Self-Supervised Multi-view Stereo via Adjacent Geometry Guided Volume Completion, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547926) | [code]\n    > Existing self-supervised multi-view stereo (MVS) approaches largely rely on photometric consistency for geometry inference, and hence suffer from low-texture or non-Lambertian appearances. In this paper, we observe that adjacent geometry shares certain commonality that can help to infer the correct geometry of the challenging or low-confident regions. Yet exploiting such property in a non-supervised MVS approach remains challenging for the lacking of training data and necessity of ensuring consistency between views. 
To address the issues, we propose a novel geometry inference training scheme by selectively masking regions with rich textures, where geometry can be well recovered and used for supervisory signal, and then lead a deliberately designed cost volume completion network to learn how to recover geometry of the masked regions. During inference, we then mask the low-confident regions instead and use the cost volume completion network for geometry correction. To deal with the different depth hypotheses of the cost volume pyramid, we design a three-branch volume inference structure for the completion network. Further, by considering plane as a special geometry, we first identify planar regions from pseudo labels and then correct the low-confident pixels by high-confident labels through plane normal consistency. Extensive experiments on DTU and Tanks & Temples demonstrate the effectiveness of the proposed framework and the state-of-the-art performance.\n  - [Uncertainty-Aware Semi-Supervised Learning of 3D Face Rigging from Single Image, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3548285) | [code]\n    > We present a method to rig 3D faces via Action Units (AUs), viewpoint and light direction, from single input image. Existing 3D methods for face synthesis and animation rely heavily on 3D morphable model (3DMM), which was built on 3D data and cannot provide intuitive expression parameters, while AU-driven 2D methods cannot handle head pose and lighting effect. We bridge the gap by integrating a recent 3D reconstruction method with 2D AU-driven method in a semi-supervised fashion. Built upon the auto-encoding 3D face reconstruction model that decouples depth, albedo, viewpoint and light without any supervision, we further decouple expression from identity for depth and albedo with a novel conditional feature translation module and pretrained critics for AU intensity estimation and image classification. 
Novel objective functions are designed using unlabeled in-the-wild images and in-door images with AU labels. We also leverage uncertainty losses to model the probably changing AU region of images as input noise for synthesis, and model the noisy AU intensity labels for intensity estimation of the AU critic. Experiments with face editing and animation on four datasets show that, compared with six state-of-the-art methods, our proposed method is superior and effective on expression consistency, identity similarity and pose similarity.\n  - [Robustifying the Multi-Scale Representation of Neural Radiance Fields, BMVC2022](https://arxiv.org/abs/2210.04233) | [code]\n    > Neural Radiance Fields (NeRF) recently emerged as a new paradigm for object representation from multi-view (MV) images. Yet, it cannot handle multi-scale (MS) images and camera pose estimation errors, which generally is the case with multi-view images captured from a day-to-day commodity camera. Although recently proposed Mip-NeRF could handle multi-scale imaging problems with NeRF, it cannot handle camera pose estimation error. On the other hand, the newly proposed BARF can solve the camera pose problem with NeRF but fails if the images are multi-scale in nature. This paper presents a robust multi-scale neural radiance fields representation approach to simultaneously overcome both real-world imaging issues. Our method handles multi-scale imaging effects and camera-pose estimation problems with NeRF-inspired approaches by leveraging the fundamentals of scene rigidity. To reduce unpleasant aliasing artifacts due to multi-scale images in the ray space, we leverage Mip-NeRF multi-scale representation. For joint estimation of robust camera pose, we propose graph-neural network-based multiple motion averaging in the neural volume rendering framework. 
We demonstrate, with examples, that for an accurate neural representation of an object from day-to-day acquired multi-view images, it is crucial to have precise camera-pose estimates. Without considering robustness measures in the camera pose estimation, modeling for multi-scale aliasing artifacts via conical frustum can be counterproductive. We present extensive experiments on the benchmark datasets to demonstrate that our approach provides better results than the recent NeRF-inspired approaches for such realistic settings.\n## Oct2 - Oct8, 2022\n  - [XDGAN: Multi-Modal 3D Shape Generation in 2D Space](https://arxiv.org/abs/2210.03007) | [code]\n    > Generative models for 2D images have recently seen tremendous progress in quality, resolution and speed as a result of the efficiency of 2D convolutional architectures. However it is difficult to extend this progress into the 3D domain since most current 3D representations rely on custom network components. This paper addresses a central question: Is it possible to directly leverage 2D image generative models to generate 3D shapes instead? To answer this, we propose XDGAN, an effective and fast method for applying 2D image GAN architectures to the generation of 3D object geometry combined with additional surface attributes, like color textures and normals. Specifically, we propose a novel method to convert 3D shapes into compact 1-channel geometry images and leverage StyleGAN3 and image-to-image translation networks to generate 3D objects in 2D space. The generated geometry images are quick to convert to 3D meshes, enabling real-time 3D object synthesis, visualization and interactive editing. Moreover, the use of standard 2D architectures can help bring more 2D advances into the 3D realm. 
We show both quantitatively and qualitatively that our method is highly effective at various tasks such as 3D shape generation, single view reconstruction and shape manipulation, while being significantly faster and more flexible compared to recent 3D generative models.\n  - [Unsupervised Multi-View Object Segmentation Using Radiance Field Propagation, NeurIPS2022](https://arxiv.org/abs/2210.00489) | [code]\n    > We present radiance field propagation (RFP), a novel approach to segmenting objects in 3D during reconstruction given only unlabeled multi-view images of a scene. RFP is derived from emerging neural radiance field-based techniques, which jointly encodes semantics with appearance and geometry. The core of our method is a novel propagation strategy for individual objects' radiance fields with a bidirectional photometric loss, enabling an unsupervised partitioning of a scene into salient or meaningful regions corresponding to different object instances. To better handle complex scenes with multiple objects and occlusions, we further propose an iterative expectation-maximization algorithm to refine object masks. To the best of our knowledge, RFP is the first unsupervised approach for tackling 3D scene object segmentation for neural radiance field (NeRF) without any supervision, annotations, or other cues such as 3D bounding boxes and prior knowledge of object class. Experiments demonstrate that RFP achieves feasible segmentation results that are more accurate than previous unsupervised image/scene segmentation approaches, and are comparable to existing supervised NeRF-based methods. The segmented object representations enable individual 3D object editing operations.\n## Sep25 - Oct1, 2022\n  - [Sphere-Guided Training of Neural Implicit Surfaces](https://arxiv.org/abs/2209.15511) | [code]\n    > In recent years, surface modeling via neural implicit functions has become one of the main techniques for multi-view 3D reconstruction. 
However, the state-of-the-art methods rely on the implicit functions to model an entire volume of the scene, leading to reduced reconstruction fidelity in the areas with thin objects or high-frequency details. To address that, we present a method for jointly training neural implicit surfaces alongside an auxiliary explicit shape representation, which acts as surface guide. In our approach, this representation encapsulates the surface region of the scene and enables us to boost the efficiency of the implicit function training by only modeling the volume in that region. We propose using a set of learnable spherical primitives as a learnable surface guidance since they can be efficiently trained alongside the neural surface function using its gradients. Our training pipeline consists of iterative updates of the spheres' centers using the gradients of the implicit function and then fine-tuning the latter to the updated surface region of the scene. We show that such modification to the training procedure can be plugged into several popular implicit reconstruction methods, improving the quality of the results over multiple 3D reconstruction benchmarks.\n  - [360FusionNeRF: Panoramic Neural Radiance Fields with Joint Guidance](https://arxiv.org/abs/2209.14265) | [code]\n    > We present a method to synthesize novel views from a single 360∘ panorama image based on the neural radiance field (NeRF). Prior studies in a similar setting rely on the neighborhood interpolation capability of multi-layer perceptions to complete missing regions caused by occlusion, which leads to artifacts in their predictions. We propose 360FusionNeRF, a semi-supervised learning framework where we introduce geometric supervision and semantic consistency to guide the progressive training process. Firstly, the input image is re-projected to 360∘ images, and auxiliary depth maps are extracted at other camera positions. 
The depth supervision, in addition to the NeRF color guidance, improves the geometry of the synthesized views. Additionally, we introduce a semantic consistency loss that encourages realistic renderings of novel views. We extract these semantic features using a pre-trained visual encoder such as CLIP, a Vision Transformer trained on hundreds of millions of diverse 2D photographs mined from the web with natural language supervision. Experiments indicate that our proposed method can produce plausible completions of unobserved regions while preserving the features of the scene. When trained across various scenes, 360FusionNeRF consistently achieves the state-of-the-art performance when transferring to synthetic Structured3D dataset (PSNR~5%, SSIM~3% LPIPS~13%), real-world Matterport3D dataset (PSNR~3%, SSIM~3% LPIPS~9%) and Replica360 dataset (PSNR~8%, SSIM~2% LPIPS~18%).\n  - [Efficient View Path Planning for Autonomous Implicit Reconstruction](https://arxiv.org/abs/2209.13159) | [code]\n    > Implicit neural representations have shown promising potential for the 3D scene reconstruction. Recent work applies it to autonomous 3D reconstruction by learning information gain for view path planning. Effective as it is, the computation of the information gain is expensive, and compared with that using volumetric representations, collision checking using the implicit representation for a 3D point is much slower. In the paper, we propose to 1) leverage a neural network as an implicit function approximator for the information gain field and 2) combine the implicit fine-grained representation with coarse volumetric representations to improve efficiency. Further with the improved efficiency, we propose a novel informative path planning based on a graph-based planner. Our method demonstrates significant improvements in the reconstruction quality and planning efficiency compared with autonomous reconstructions with implicit and explicit representations. 
We deploy the method on a real UAV and the results show that our method can plan informative views and reconstruct a scene with high quality.\n## Sep18 - Sep24, 2022\n  - [SG-SRNs: Superpixel-Guided Scene Representation Networks, SignalProcessingLetters](https://ieeexplore.ieee.org/abstract/document/9900405) | [code]\n    > Recently, Scene Representation Networks (SRNs) have attracted increasing attention in computer vision, due to their continuous and light-weight scene representation ability. However, SRNs generally perform poorly on low-texture image regions. Addressing this problem, we propose superpixel-guided scene representation networks in this paper, called SG-SRNs, consisting of a backbone module (SRNs), a superpixel segmentation module, and a superpixel regularization module. In the proposed method, except for the novel view synthesis task, the task of representation-aware superpixel segmentation mask generation is realized by the proposed superpixel segmentation module. Then, the superpixel regularization module utilizes the superpixel segmentation mask to guide the backbone to be learned in a locally smooth way, and optimizes the scene representations of the local regions to indirectly alleviate the structure distortion of low-texture regions in a self-supervised manner. Extensive experimental results on both our constructed datasets and the public Synthetic-NeRF dataset demonstrated that the proposed SG-SRNs achieved a significantly better 3D structure representing performance.\n  - [Edge-oriented Implicit Neural Representation with Channel Tuning](https://arxiv.org/abs/2209.11697) | [code]\n    > Implicit neural representation, which expresses an image as a continuous function rather than a discrete grid form, is widely used for image processing. Despite its outperforming results, there are still remaining limitations on restoring clear shapes of a given signal such as the edges of an image. 
In this paper, we propose Gradient Magnitude Adjustment algorithm which calculates the gradient of an image for training the implicit representation. In addition, we propose Edge-oriented Representation Network (EoREN) that can reconstruct the image with clear edges by fitting gradient information (Edge-oriented module). Furthermore, we add Channel-tuning module to adjust the distribution of given signals so that it solves a chronic problem of fitting gradients. By separating backpropagation paths of the two modules, EoREN can learn true color of the image without hindering the role for gradients. We qualitatively show that our model can reconstruct complex signals and demonstrate general reconstruction ability of our model with quantitative results.\n  - [Neural Implicit Surface Reconstruction using Imaging Sonar](https://arxiv.org/abs/2209.08221) | [code]\n    > We present a technique for dense 3D reconstruction of objects using an imaging sonar, also known as forward-looking sonar (FLS). Compared to previous methods that model the scene geometry as point clouds or volumetric grids, we represent the geometry as a neural implicit function. Additionally, given such a representation, we use a differentiable volumetric renderer that models the propagation of acoustic waves to synthesize imaging sonar measurements. We perform experiments on real and synthetic datasets and show that our algorithm reconstructs high-fidelity surface geometry from multi-view FLS images at much higher quality than was possible with previous techniques and without suffering from their associated memory overhead.\n  - [Uncertainty Guided Policy for Active Robotic 3D Reconstruction using Neural Radiance Fields, RAL2022](https://arxiv.org/abs/2209.08409) | [code]\n    > In this paper, we tackle the problem of active robotic 3D reconstruction of an object. 
In particular, we study how a mobile robot with an arm-held camera can select a favorable number of views to recover an object's 3D shape efficiently. Contrary to the existing solution to this problem, we leverage the popular neural radiance fields-based object representation, which has recently shown impressive results for various computer vision tasks. However, it is not straightforward to directly reason about an object's explicit 3D geometric details using such a representation, making the next-best-view selection problem for dense 3D reconstruction challenging. This paper introduces a ray-based volumetric uncertainty estimator, which computes the entropy of the weight distribution of the color samples along each ray of the object's implicit neural representation. We show that it is possible to infer the uncertainty of the underlying 3D geometry given a novel view with the proposed estimator. We then present a next-best-view selection policy guided by the ray-based volumetric uncertainty in neural radiance fields-based representations. Encouraging experimental results on synthetic and real-world data suggest that the approach presented in this paper can enable a new research direction of using an implicit 3D object representation for the next-best-view problem in robot vision applications, distinguishing our approach from the existing approaches that rely on explicit 3D geometric modeling.\n## Sep11 - Sep17, 2022\n  - [DevNet: Self-supervised Monocular Depth Learning via Density Volume Construction, ECCV2022](https://arxiv.org/abs/2209.06351) | [code]\n    > Self-supervised depth learning from monocular images normally relies on the 2D pixel-wise photometric relation between temporally adjacent image frames. However, they neither fully exploit the 3D point-wise geometric correspondences, nor effectively tackle the ambiguities in the photometric warping caused by occlusions or illumination inconsistency. 
To address these problems, this work proposes Density Volume Construction Network (DevNet), a novel self-supervised monocular depth learning framework, that can consider 3D spatial information, and exploit stronger geometric constraints among adjacent camera frustums. Instead of directly regressing the pixel value from a single image, our DevNet divides the camera frustum into multiple parallel planes and predicts the pointwise occlusion probability density on each plane. The final depth map is generated by integrating the density along corresponding rays. During the training process, novel regularization strategies and loss functions are introduced to mitigate photometric ambiguities and overfitting. Without obviously enlarging model parameters size or running time, DevNet outperforms several representative baselines on both the KITTI-2015 outdoor dataset and NYU-V2 indoor dataset. In particular, the root-mean-square-deviation is reduced by around 4% with DevNet on both KITTI-2015 and NYU-V2 in the task of depth estimation. Code is available at this https URL.\n## Sep4 - Sep10, 2022\n  - [3D Textured Shape Recovery with Learned Geometric Priors](https://arxiv.org/abs/2209.03254) | [code]\n    > 3D textured shape recovery from partial scans is crucial for many real-world applications. Existing approaches have demonstrated the efficacy of implicit function representation, but they suffer from partial inputs with severe occlusions and varying object types, which greatly hinders their application value in the real world. This technical report presents our approach to address these limitations by incorporating learned geometric priors. To this end, we generate a SMPL model from learned pose prediction and fuse it into the partial input to add prior knowledge of human bodies. 
We also propose a novel completeness-aware bounding box adaptation for handling different levels of scales and partialness of partial scans.\n  - [SIRA: Relightable Avatars from a Single Image](https://arxiv.org/abs/2209.03027) | [code]\n    > Recovering the geometry of a human head from a single image, while factorizing the materials and illumination is a severely ill-posed problem that requires prior information to be solved. Methods based on 3D Morphable Models (3DMM), and their combination with differentiable renderers, have shown promising results. However, the expressiveness of 3DMMs is limited, and they typically yield over-smoothed and identity-agnostic 3D shapes limited to the face region. Highly accurate full head reconstructions have recently been obtained with neural fields that parameterize the geometry using multilayer perceptrons. The versatility of these representations has also proved effective for disentangling geometry, materials and lighting. However, these methods require several tens of input images. In this paper, we introduce SIRA, a method which, from a single image, reconstructs human head avatars with high fidelity geometry and factorized lights and surface materials. Our key ingredients are two data-driven statistical models based on neural fields that resolve the ambiguities of single-view 3D surface reconstruction and appearance factorization. Experiments show that SIRA obtains state of the art results in 3D head reconstruction while at the same time it successfully disentangles the global illumination, and the diffuse and specular albedos. Furthermore, our reconstructions are amenable to physically-based appearance editing and head model relighting.\n## Aug28 - Sep3, 2022\n  - [Multi-View Reconstruction using Signed Ray Distance Functions (SRDF)](https://arxiv.org/abs/2209.00082) | [code]\n    > In this paper, we address the problem of multi-view 3D shape reconstruction. 
While recent differentiable rendering approaches associated to implicit shape representations have provided breakthrough performance, they are still computationally heavy and often lack precision on the estimated geometries. To overcome these limitations we investigate a new computational approach that builds on a novel shape representation that is volumetric, as in recent differentiable rendering approaches, but parameterized with depth maps to better materialize the shape surface. The shape energy associated to this representation evaluates 3D geometry given color images and does not need appearance prediction but still benefits from volumetric integration when optimized. In practice we propose an implicit shape representation, the SRDF, based on signed distances which we parameterize by depths along camera rays. The associated shape energy considers the agreement between depth prediction consistency and photometric consistency, this at 3D locations within the volumetric representation. Various photo-consistency priors can be accounted for such as a median based baseline, or a more elaborated criterion as with a learned function. The approach retains pixel-accuracy with depth maps and is parallelizable. Our experiments over standard datasets shows that it provides state-of-the-art results with respect to recent approaches with implicit shape representations as well as with respect to traditional multi-view stereo methods.\n  - [Dual-Space NeRF: Learning Animatable Avatars and Scene Lighting in Separate Spaces, 3DV2022](https://arxiv.org/abs/2208.14851) | [code]\n    > Modeling the human body in a canonical space is a common practice for capturing and animation. But when involving the neural radiance field (NeRF), learning a static NeRF in the canonical space is not enough because the lighting of the body changes when the person moves even though the scene lighting is constant. 
Previous methods alleviate the inconsistency of lighting by learning a per-frame embedding, but this operation does not generalize to unseen poses. Given that the lighting condition is static in the world space while the human body is consistent in the canonical space, we propose a dual-space NeRF that models the scene lighting and the human body with two MLPs in two separate spaces. To bridge these two spaces, previous methods mostly rely on the linear blend skinning (LBS) algorithm. However, the blending weights for LBS of a dynamic neural field are intractable and thus are usually memorized with another MLP, which does not generalize to novel poses. Although it is possible to borrow the blending weights of a parametric mesh such as SMPL, the interpolation operation introduces more artifacts. In this paper, we propose to use the barycentric mapping, which can directly generalize to unseen poses and surprisingly achieves superior results than LBS with neural blending weights. Quantitative and qualitative results on the Human3.6M and the ZJU-MoCap datasets show the effectiveness of our method.\n  - [NerfCap: Human Performance Capture With Dynamic Neural Radiance Fields, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9870173) | [code]\n    > This paper addresses the challenge of human performance capture from sparse multi-view or monocular videos. Given a template mesh of the performer, previous methods capture the human motion by non-rigidly registering the template mesh to images with 2D silhouettes or dense photometric alignment. However, the detailed surface deformation cannot be recovered from the silhouettes, while the photometric alignment suffers from instability caused by appearance variation in the videos. To solve these problems, we propose NerfCap, a novel performance capture method based on the dynamic neural radiance field (NeRF) representation of the performer. 
Specifically, a canonical NeRF is initialized from the template geometry and registered to the video frames by optimizing the deformation field and the appearance model of the canonical NeRF. To capture both large body motion and detailed surface deformation, NerfCap combines linear blend skinning with embedded graph deformation. In contrast to the mesh-based methods that suffer from fixed topology and texture, NerfCap is able to flexibly capture complex geometry and appearance variation across the videos, and synthesize more photo-realistic images. In addition, NerfCap can be pre-trained end to end in a self-supervised manner by matching the synthesized videos with the input videos. Experimental results on various datasets show that NerfCap outperforms prior works in terms of both surface reconstruction accuracy and novel-view synthesis quality.\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n  - [Vox-Surf: Voxel-based Implicit Surface Representation](https://arxiv.org/abs/2208.10925) | [code]\n    > Virtual content creation and interaction play an important role in modern 3D applications such as AR and VR. Recovering detailed 3D models from real scenes can significantly expand the scope of its applications and has been studied for decades in the computer vision and computer graphics community. We propose Vox-Surf, a voxel-based implicit surface representation. Our Vox-Surf divides the space into finite bounded voxels. Each voxel stores geometry and appearance information in its corner vertices. Vox-Surf is suitable for almost any scenario thanks to sparsity inherited from voxel representation and can be easily trained from multiple view images. 
We leverage the progressive training procedure to extract important voxels gradually for further optimization so that only valid voxels are preserved, which greatly reduces the number of sampling points and increases rendering speed. The fine voxels can also be considered as the bounding volume for collision detection. The experiments show that Vox-Surf representation can learn delicate surface details and accurate color with less memory and faster rendering speed than other methods. We also show that Vox-Surf can be more practical in scene editing and AR applications.\n  - [Neural Capture of Animatable 3D Human from Monocular Video, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > We present a novel paradigm of building an animatable 3D human representation from a monocular video input, such that it can be rendered in any unseen poses and views. Our method is based on a dynamic Neural Radiance Field (NeRF) rigged by a mesh-based parametric 3D human model serving as a geometry proxy. Previous methods usually rely on multi-view videos or accurate 3D geometry information as additional inputs; besides, most methods suffer from degraded quality when generalized to unseen poses. We identify that the key to generalization is a good input embedding for querying dynamic NeRF: A good input embedding should define an injective mapping in the full volumetric space, guided by surface mesh deformation under pose variation. Based on this observation, we propose to embed the input query with its relationship to local surface regions spanned by a set of geodesic nearest neighbors on mesh vertices. By including both position and relative distance information, our embedding defines a distance-preserved deformation mapping and generalizes well to unseen poses. To reduce the dependency on additional inputs, we first initialize per-frame 3D meshes using off-the-shelf tools and then propose a pipeline to jointly optimize NeRF and refine the initial mesh. 
Extensive experiments show our method can synthesize plausible human rendering results under unseen poses and views.\n## Aug7 - Aug13, 2022\n  - [OmniVoxel: A Fast and Precise Reconstruction Method of Omnidirectional Neural Radiance Field, GCCE 2022](https://arxiv.org/abs/2208.06335) | [code]\n    > This paper proposes a method to reconstruct the neural radiance field with equirectangular omnidirectional images. Implicit neural scene representation with a radiance field can reconstruct the 3D shape of a scene continuously within a limited spatial area. However, training a fully implicit representation on commercial PC hardware requires a lot of time and computing resources (15 ∼ 20 hours per scene). Therefore, we propose a method to accelerate this process significantly (20 ∼ 40 minutes per scene). Instead of using a fully implicit representation of rays for radiance field reconstruction, we adopt feature voxels that contain density and color features in tensors. Considering omnidirectional equirectangular input and the camera layout, we use spherical voxelization for representation instead of cubic representation. Our voxelization method could balance the reconstruction quality of the inner scene and outer scene. In addition, we adopt the axis-aligned positional encoding method on the color features to increase the total image quality. Our method achieves satisfying empirical performance on synthetic datasets with random camera poses. Moreover, we test our method with real scenes which contain complex geometries and also achieve state-of-the-art performance. Our code and complete dataset will be released at the same time as the paper publication.\n  - [Fast Gradient Descent for Surface Capture Via Differentiable Rendering, 3DV2022](https://hal.inria.fr/hal-03748662/) | [code]\n    > Differential rendering has recently emerged as a powerful tool for image-based rendering or geometric reconstruction from multiple views, with very high quality. 
Up to now, such methods have been benchmarked on generic object databases and promisingly applied to some real data, but have yet to be applied to specific applications that may benefit. In this paper, we investigate how a differential rendering system can be crafted for raw multi-camera performance capture. We address several key issues in the way of practical usability and reproducibility, such as processing speed, explainability of the model, and general output model quality. This leads us to several contributions to the differential rendering framework. In particular we show that a unified view of differential rendering and classic optimization is possible, leading to a formulation and implementation where complete non-stochastic gradient steps can be analytically computed and the full perframe data stored in video memory, yielding a straightforward and efficient implementation. We also use a sparse storage and coarse-to-fine scheme to achieve extremely high resolution with contained memory and computation time. We show experimentally that results rivaling in quality with state of the art multi-view human surface capture methods are achievable in a fraction of the time, typically around a minute per frame.\n  - [PlaneFormers: From Sparse View Planes to 3D Reconstruction, ECCV2022](https://arxiv.org/abs/2208.04307) | [code]\n    > We present an approach for the planar surface reconstruction of a scene from images with limited overlap. This reconstruction task is challenging since it requires jointly reasoning about single image 3D reconstruction, correspondence between images, and the relative camera pose between images. Past work has proposed optimization-based approaches. We introduce a simpler approach, the PlaneFormer, that uses a transformer applied to 3D-aware plane tokens to perform 3D reasoning. 
Our experiments show that our approach is substantially more effective than prior work, and that several 3D-specific design decisions are crucial for its success.\n  - [PS-NeRV: Patch-wise Stylized Neural Representations for Videos](https://arxiv.org/abs/2208.03742) | [code]\n    > We study how to represent a video with implicit neural representations (INRs). Classical INRs methods generally utilize MLPs to map input coordinates to output pixels. While some recent works have tried to directly reconstruct the whole image with CNNs. However, we argue that both the above pixel-wise and image-wise strategies are not favorable to video data. Instead, we propose a patch-wise solution, PS-NeRV, which represents videos as a function of patches and the corresponding patch coordinate. It naturally inherits the advantages of image-wise methods, and achieves excellent reconstruction performance with fast decoding speed. The whole method includes conventional modules, like positional embedding, MLPs and CNNs, while also introduces AdaIN to enhance intermediate features. These simple yet essential changes could help the network easily fit high-frequency details. Extensive experiments have demonstrated its effectiveness in several video-related tasks, such as video compression and video inpainting.\n## Jul31 - Aug6, 2022\n  - [PRIF: Primary Ray-based Implicit Function](https://research.google/pubs/pub51556/) | [code]\n    > We introduce a new implicit shape representation called Primary Ray-based Implicit Function (PRIF). In contrast to most existing approaches based on the signed distance function (SDF) which handles spatial locations, our representation operates on oriented rays. Specifically, PRIF is formulated to directly produce the surface hit point of a given input ray, without the expensive sphere-tracing operations, hence enabling efficient shape extraction and differentiable rendering. 
We demonstrate that neural networks trained to encode PRIF achieve successes in various tasks including single shape representation, category-wise shape generation, shape completion from sparse or noisy observations, inverse rendering for camera pose estimation, and neural rendering with color.\n## Jul24 - Jul30, 2022\n  - [Going Off-Grid: Continuous Implicit Neural Representations for 3D Vascular Modeling, MICCAI STACOM 2022](https://arxiv.org/abs/2207.14663) | [code]\n    > Personalised 3D vascular models are valuable for diagnosis, prognosis and treatment planning in patients with cardiovascular disease. Traditionally, such models have been constructed with explicit representations such as meshes and voxel masks, or implicit representations such as radial basis functions or atomic (tubular) shapes. Here, we propose to represent surfaces by the zero level set of their signed distance function (SDF) in a differentiable implicit neural representation (INR). This allows us to model complex vascular structures with a representation that is implicit, continuous, light-weight, and easy to integrate with deep learning algorithms. We here demonstrate the potential of this approach with three practical examples. First, we obtain an accurate and watertight surface for an abdominal aortic aneurysm (AAA) from CT images and show robust fitting from as little as 200 points on the surface. Second, we simultaneously fit nested vessel walls in a single INR without intersections. Third, we show how 3D models of individual arteries can be smoothly blended into a single watertight surface. 
Our results show that INRs are a flexible representation with potential for minimally interactive annotation and manipulation of complex vascular structures.\n  - [GAUDI: A Neural Architect for Immersive 3D Scene Generation](https://arxiv.org/abs/2207.13751) | [***``[code]``***](https://github.com/apple/ml-gaudi)\n    > We introduce GAUDI, a generative model capable of capturing the distribution of complex and realistic 3D scenes that can be rendered immersively from a moving camera. We tackle this challenging problem with a scalable yet powerful approach, where we first optimize a latent representation that disentangles radiance fields and camera poses. This latent representation is then used to learn a generative model that enables both unconditional and conditional generation of 3D scenes. Our model generalizes previous works that focus on single objects by removing the assumption that the camera pose distribution can be shared across samples. We show that GAUDI obtains state-of-the-art performance in the unconditional generative setting across multiple datasets and allows for conditional generation of 3D scenes given conditioning variables like sparse image observations or text that describes the scene.\n  - [AlignSDF: Pose-Aligned Signed Distance Fields for Hand-Object Reconstruction, ECCV2022](https://arxiv.org/abs/2207.12909) | [***``[code]``***](https://zerchen.github.io/projects/alignsdf.html)\n    > Recent work achieved impressive progress towards joint reconstruction of hands and manipulated objects from monocular color images. Existing methods focus on two alternative representations in terms of either parametric meshes or signed distance fields (SDFs). On one side, parametric models can benefit from prior knowledge at the cost of limited shape deformations and mesh resolutions. Mesh models, hence, may fail to precisely reconstruct details such as contact surfaces of hands and objects. 
SDF-based methods, on the other side, can represent arbitrary details but are lacking explicit priors. In this work we aim to improve SDF models using priors provided by parametric representations. In particular, we propose a joint learning framework that disentangles the pose and the shape. We obtain hand and object poses from parametric models and use them to align SDFs in 3D space. We show that such aligned SDFs better focus on reconstructing shape details and improve reconstruction accuracy both for hands and objects. We evaluate our method and demonstrate significant improvements over the state of the art on the challenging ObMan and DexYCB benchmarks.\n  - [NeuMesh: Learning Disentangled Neural Mesh-based Implicit Field for Geometry and Texture Editing, ECCV2022(oral)](https://arxiv.org/abs/2207.11911) | [code]\n    > Very recently neural implicit rendering techniques have been rapidly evolved and shown great advantages in novel view synthesis and 3D scene reconstruction. However, existing neural rendering methods for editing purposes offer limited functionality, e.g., rigid transformation, or not applicable for fine-grained editing for general objects from daily lives. In this paper, we present a novel mesh-based representation by encoding the neural implicit field with disentangled geometry and texture codes on mesh vertices, which facilitates a set of editing functionalities, including mesh-guided geometry editing, designated texture editing with texture swapping, filling and painting operations. To this end, we develop several techniques including learnable sign indicators to magnify spatial distinguishability of mesh-based representation, distillation and fine-tuning mechanism to make a steady convergence, and the spatial-aware optimization strategy to realize precise texture editing. Extensive experiments and editing examples on both real and synthetic data demonstrate the superiority of our method on representation quality and editing ability. 
Code is available on the project webpage: this https URL.\n## Previous weeks\n  - [Non-Rigid Neural Radiance Fields: Reconstruction and Novel View Synthesis of a Deforming Scene from Monocular Video, ICCV2021](https://vcai.mpi-inf.mpg.de/projects/nonrigid_nerf/) | [***``[code]``***](https://github.com/facebookresearch/nonrigid_nerf)\n    > We present Non-Rigid Neural Radiance Fields (NR-NeRF), a reconstruction and novel view synthesis approach for general non-rigid dynamic scenes. Our approach takes RGB images of a dynamic scene as input (e.g., from a monocular video recording), and creates a high-quality space-time geometry and appearance representation. We show that a single handheld consumer-grade camera is sufficient to synthesize sophisticated renderings of a dynamic scene from novel virtual camera views, e.g. a `bullet-time' video effect. NR-NeRF disentangles the dynamic scene into a canonical volume and its deformation. Scene deformation is implemented as ray bending, where straight rays are deformed non-rigidly. We also propose a novel rigidity network to better constrain rigid regions of the scene, leading to more stable results. The ray bending and rigidity network are trained without explicit supervision. Our formulation enables dense correspondence estimation across views and time, and compelling video editing applications such as motion exaggeration. Our code will be open sourced.\n  - [Neural Articulated Radiance Field, ICCV2021](https://arxiv.org/abs/2104.03110) | [***``[code]``***](https://github.com/nogu-atsu/NARF#code)\n    > We present Neural Articulated Radiance Field (NARF), a novel deformable 3D representation for articulated objects learned from images. While recent advances in 3D implicit representation have made it possible to learn models of complex objects, learning pose-controllable representations of articulated objects remains a challenge, as current methods require 3D shape supervision and are unable to render appearance. 
In formulating an implicit representation of 3D articulated objects, our method considers only the rigid transformation of the most relevant object part in solving for the radiance field at each 3D location. In this way, the proposed method represents pose-dependent changes without significantly increasing the computational complexity. NARF is fully differentiable and can be trained from images with pose annotations. Moreover, through the use of an autoencoder, it can learn appearance variations over multiple instances of an object class. Experiments show that the proposed method is efficient and can generalize well to novel poses.\n  - [GRF: Learning a General Radiance Field for 3D Scene Representation and Rendering, ICCV2021(oral)](https://arxiv.org/abs/2010.04595) | [***``[code]``***](https://github.com/alextrevithick/GRF)\n    > We present a simple yet powerful neural network that implicitly represents and renders 3D objects and scenes only from 2D observations. The network models 3D geometries as a general radiance field, which takes a set of 2D images with camera poses and intrinsics as input, constructs an internal representation for each point of the 3D space, and then renders the corresponding appearance and geometry of that point viewed from an arbitrary position. The key to our approach is to learn local features for each pixel in 2D images and to then project these features to 3D points, thus yielding general and rich point representations. We additionally integrate an attention mechanism to aggregate pixel features from multiple 2D views, such that visual occlusions are implicitly taken into account. 
Extensive experiments demonstrate that our method can generate high-quality and realistic novel views for novel objects, unseen categories and challenging real-world scenes.\n  - [MVSNeRF: Fast Generalizable Radiance Field Reconstruction from Multi-View Stereo, ICCV2021](https://apchenstu.github.io/mvsnerf/) | [***``[code]``***](https://github.com/apchenstu/mvsnerf)\n    > We present MVSNeRF, a novel neural rendering approach that can efficiently reconstruct neural radiance fields for view synthesis. Unlike prior works on neural radiance fields that consider per-scene optimization on densely captured images, we propose a generic deep neural network that can reconstruct radiance fields from only three nearby input views via fast network inference. Our approach leverages plane-swept cost volumes (widely used in multi-view stereo) for geometry-aware scene reasoning, and combines this with physically based volume rendering for neural radiance field reconstruction. We train our network on real objects in the DTU dataset, and test it on three different datasets to evaluate its effectiveness and generalizability. Our approach can generalize across scenes (even indoor scenes, completely different from our training scenes of objects) and generate realistic view synthesis results using only three input images, significantly outperforming concurrent works on generalizable radiance field reconstruction. Moreover, if dense images are captured, our estimated radiance field representation can be easily fine-tuned; this leads to fast per-scene reconstruction with higher rendering quality and substantially less optimization time than NeRF.\n  - [Towards Continuous Depth MPI with NeRF for Novel View Synthesis, ICCV2021](https://arxiv.org/abs/2103.14910) | [***``[code]``***](https://github.com/vincentfung13/MINE)\n    > In this paper, we propose MINE to perform novel view synthesis and depth estimation via dense 3D reconstruction from a single image. 
Our approach is a continuous depth generalization of the Multiplane Images (MPI) by introducing the NEural radiance fields (NeRF). Given a single image as input, MINE predicts a 4-channel image (RGB and volume density) at arbitrary depth values to jointly reconstruct the camera frustum and fill in occluded contents. The reconstructed and inpainted frustum can then be easily rendered into novel RGB or depth views using differentiable rendering. Extensive experiments on RealEstate10K, KITTI and Flowers Light Fields show that our MINE outperforms state-of-the-art by a large margin in novel view synthesis. We also achieve competitive results in depth estimation on iBims-1 and NYU-v2 without annotated depth supervision. Our source code is available at this https URL\n  - [UNISURF: Unifying Neural Implicit Surfaces and Radiance Fields for Multi-View Reconstruction, ICCV2021(oral)](https://arxiv.org/abs/2104.10078) | [***``[code]``***](https://github.com/autonomousvision/unisurf)\n    > Neural implicit 3D representations have emerged as a powerful paradigm for reconstructing surfaces from multi-view images and synthesizing novel views. Unfortunately, existing methods such as DVR or IDR require accurate per-pixel object masks as supervision. At the same time, neural radiance fields have revolutionized novel view synthesis. However, NeRF's estimated volume density does not admit accurate surface reconstruction. Our key insight is that implicit surface models and radiance fields can be formulated in a unified way, enabling both surface and volume rendering using the same model. This unified perspective enables novel, more efficient sampling procedures and the ability to reconstruct accurate surfaces without input masks. We compare our method on the DTU, BlendedMVS, and a synthetic indoor dataset. 
Our experiments demonstrate that we outperform NeRF in terms of reconstruction quality while performing on par with IDR without requiring masks.\n  - [NeuS: Learning Neural Implicit Surfaces by Volume Rendering for Multi-view Reconstruction, NeurIPS2021](https://arxiv.org/abs/2106.10689) | [***``[code]``***](https://github.com/Totoro97/NeuS)\n    > We present a novel neural surface reconstruction method, called NeuS, for reconstructing objects and scenes with high fidelity from 2D image inputs. Existing neural surface reconstruction approaches, such as DVR and IDR, require foreground mask as supervision, easily get trapped in local minima, and therefore struggle with the reconstruction of objects with severe self-occlusion or thin structures. Meanwhile, recent neural methods for novel view synthesis, such as NeRF and its variants, use volume rendering to produce a neural scene representation with robustness of optimization, even for highly complex objects. However, extracting high-quality surfaces from this learned implicit representation is difficult because there are not sufficient surface constraints in the representation. In NeuS, we propose to represent a surface as the zero-level set of a signed distance function (SDF) and develop a new volume rendering method to train a neural SDF representation. We observe that the conventional volume rendering method causes inherent geometric errors (i.e. bias) for surface reconstruction, and therefore propose a new formulation that is free of bias in the first order of approximation, thus leading to more accurate surface reconstruction even without the mask supervision. 
Experiments on the DTU dataset and the BlendedMVS dataset show that NeuS outperforms the state-of-the-arts in high-quality surface reconstruction, especially for objects and scenes with complex structures and self-occlusion.\n  - [Volume Rendering of Neural Implicit Surfaces, NeurIPS2021](https://arxiv.org/abs/2106.12052) | [code]\n    > Neural volume rendering became increasingly popular recently due to its success in synthesizing novel views of a scene from a sparse set of input images. So far, the geometry learned by neural volume rendering techniques was modeled using a generic density function. Furthermore, the geometry itself was extracted using an arbitrary level set of the density function leading to a noisy, often low fidelity reconstruction. The goal of this paper is to improve geometry representation and reconstruction in neural volume rendering. We achieve that by modeling the volume density as a function of the geometry. This is in contrast to previous work modeling the geometry as a function of the volume density. In more detail, we define the volume density function as Laplace's cumulative distribution function (CDF) applied to a signed distance function (SDF) representation. This simple density representation has three benefits: (i) it provides a useful inductive bias to the geometry learned in the neural volume rendering process; (ii) it facilitates a bound on the opacity approximation error, leading to an accurate sampling of the viewing ray. Accurate sampling is important to provide a precise coupling of geometry and radiance; and (iii) it allows efficient unsupervised disentanglement of shape and appearance in volume rendering. Applying this new density representation to challenging scene multiview datasets produced high quality geometry reconstructions, outperforming relevant baselines. Furthermore, switching shape and appearance between scenes is possible due to the disentanglement of the two.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/semantic.md",
    "content": "\nWeekly Classified Neural Radiance Fields - semantic ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n====================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n  - [iLabel: Revealing Objects in Neural Fields, RAL2022](https://ieeexplore.ieee.org/abstract/document/9996585) | [code]\n    > A neural field trained with self-supervision to efficiently represent the geometry and colour of a 3D scene tends to automatically decompose it into coherent and accurate object-like regions, which can be revealed with sparse labelling interactions to produce a 3D semantic scene segmentation. Our real-time iLabel system takes input from a hand-held RGB-D camera, requires zero prior training data, and works in an ‘open set’ manner, with semantic classes defined on the fly by the user. iLabel's underlying model is a simple multilayer perceptron (MLP), trained from scratch to learn a neural representation of a single 3D scene. The model is updated continually and visualised in real-time, allowing the user to focus interactions to achieve extremely efficient semantic segmentation. A room-scale scene can be accurately labelled into 10+ semantic categories with around 100 clicks, taking less than 5 minutes. Quantitative labelling accuracy scales powerfully with the number of clicks, and rapidly surpasses standard pre-trained semantic segmentation methods. 
We also demonstrate a hierarchical labelling variant of iLabel and a ‘hands-free’ mode where the user only needs to supply label names for automatically-generated locations.\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n  - [NeRF-RPN: A general framework for object detection in NeRFs](https://arxiv.org/abs/2211.11646) | [code]\n    > This paper presents the first significant object detection framework, NeRF-RPN, which directly operates on NeRF. Given a pre-trained NeRF model, NeRF-RPN aims to detect all bounding boxes of objects in a scene. By exploiting a novel voxel representation that incorporates multi-scale 3D neural volumetric features, we demonstrate it is possible to regress the 3D bounding boxes of objects in NeRF directly without rendering the NeRF at any viewpoint. NeRF-RPN is a general framework and can be applied to detect objects without class labels. We experimented the NeRF-RPN with various backbone architectures, RPN head designs and loss functions. All of them can be trained in an end-to-end manner to estimate high quality 3D bounding boxes. To facilitate future research in object detection for NeRF, we built a new benchmark dataset which consists of both synthetic and real-world data with careful labeling and clean up. Please click this https URL for visualizing the 3D region proposals by our NeRF-RPN. Code and dataset will be made available.\n  - [SegNeRF: 3D Part Segmentation with Neural Radiance Fields](https://arxiv.org/abs/2211.11215) | [code]\n    > Recent advances in Neural Radiance Fields (NeRF) boast impressive performances for generative tasks such as novel view synthesis and 3D reconstruction. Methods based on neural radiance fields are able to represent the 3D world implicitly by relying exclusively on posed images. Yet, they have seldom been explored in the realm of discriminative tasks such as 3D part segmentation. 
In this work, we attempt to bridge that gap by proposing SegNeRF: a neural field representation that integrates a semantic field along with the usual radiance field. SegNeRF inherits from previous works the ability to perform novel view synthesis and 3D reconstruction, and enables 3D part segmentation from a few images. Our extensive experiments on PartNet show that SegNeRF is capable of simultaneously predicting geometry, appearance, and semantic information from posed images, even for unseen objects. The predicted semantic fields allow SegNeRF to achieve an average mIoU of 30.30% for 2D novel view segmentation, and 37.46% for 3D part segmentation, boasting competitive performance against point-based methods by using only a few posed images. Additionally, SegNeRF is able to generate an explicit 3D model from a single image of an object taken in the wild, with its corresponding part segmentation.\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n  - [CLIP-Fields: Weakly Supervised Semantic Fields for Robotic Memory](https://mahis.life/clip-fields/) | [code]\n    > We propose CLIP-Fields, an implicit scene model that can be trained with no direct human supervision. This model learns a mapping from spatial locations to semantic embedding vectors. The mapping can then be used for a variety of tasks, such as segmentation, instance identification, semantic search over space, and view localization. Most importantly, the mapping can be trained with supervision coming only from web-image and web-text trained models such as CLIP, Detic, and Sentence-BERT. When compared to baselines like Mask-RCNN, our method outperforms on few-shot instance identification or semantic segmentation on the HM3D dataset with only a fraction of the examples. Finally, we show that using CLIP-Fields as a scene memory, robots can perform semantic navigation in real-world environments. 
Our code and demonstrations are available here: https://mahis.life/clip-fields/\n## Oct2 - Oct8, 2022\n  - [ViewFool: Evaluating the Robustness of Visual Recognition to Adversarial Viewpoints, NeurIPS2022](https://arxiv.org/abs/2210.03895) | [code]\n    > Recent studies have demonstrated that visual recognition models lack robustness to distribution shift. However, current work mainly considers model robustness to 2D image transformations, leaving viewpoint changes in the 3D world less explored. In general, viewpoint changes are prevalent in various real-world applications (e.g., autonomous driving), making it imperative to evaluate viewpoint robustness. In this paper, we propose a novel method called ViewFool to find adversarial viewpoints that mislead visual recognition models. By encoding real-world objects as neural radiance fields (NeRF), ViewFool characterizes a distribution of diverse adversarial viewpoints under an entropic regularizer, which helps to handle the fluctuations of the real camera pose and mitigate the reality gap between the real objects and their neural representations. Experiments validate that the common image classifiers are extremely vulnerable to the generated adversarial viewpoints, which also exhibit high cross-model transferability. Based on ViewFool, we introduce ImageNet-V, a new out-of-distribution dataset for benchmarking viewpoint robustness of image classifiers. Evaluation results on 40 classifiers with diverse architectures, objective functions, and data augmentations reveal a significant drop in model performance when tested on ImageNet-V, which provides a possibility to leverage ViewFool as an effective data augmentation strategy to improve viewpoint robustness.\n  - [A Simple Plugin for Transforming Images to Arbitrary Scales](https://arxiv.org/abs/2210.03417) | [code]\n    > Existing models on super-resolution often specialized for one scale, fundamentally limiting their use in practical scenarios. 
In this paper, we aim to develop a general plugin that can be inserted into existing super-resolution models, conveniently augmenting their ability towards Arbitrary Resolution Image Scaling, thus termed ARIS. We make the following contributions: (i) we propose a transformer-based plugin module, which uses spatial coordinates as query, iteratively attend the low-resolution image feature through cross-attention, and output visual feature for the queried spatial location, resembling an implicit representation for images; (ii) we introduce a novel self-supervised training scheme, that exploits consistency constraints to effectively augment the model's ability for upsampling images towards unseen scales, i.e. ground-truth high-resolution images are not available; (iii) without loss of generality, we inject the proposed ARIS plugin module into several existing models, namely, IPT, SwinIR, and HAT, showing that the resulting models can not only maintain their original performance on fixed scale factor but also extrapolate to unseen scales, substantially outperforming existing any-scale super-resolution models on standard benchmarks, e.g. Urban100, DIV2K, etc.\n  - [Feature-Realistic Neural Fusion for Real-Time, Open Set Scene Understanding](https://arxiv.org/abs/2210.03043) | [code]\n    > General scene understanding for robotics requires flexible semantic representation, so that novel objects and structures which may not have been known at training time can be identified, segmented and grouped. We present an algorithm which fuses general learned features from a standard pre-trained network into a highly efficient 3D geometric neural field representation during real-time SLAM. The fused 3D feature maps inherit the coherence of the neural field's geometry representation. 
This means that tiny amounts of human labelling interacting at runtime enable objects or even parts of objects to be robustly and accurately segmented in an open set manner.\n  - [Neural Matching Fields: Implicit Representation of Matching Fields for Visual Correspondence, NeurIPS2022](https://arxiv.org/abs/2210.02689) | [***``[code]``***](https://ku-cvlab.github.io/NeMF/)\n    > Existing pipelines of semantic correspondence commonly include extracting high-level semantic features for the invariance against intra-class variations and background clutters. This architecture, however, inevitably results in a low-resolution matching field that additionally requires an ad-hoc interpolation process as a post-processing for converting it into a high-resolution one, certainly limiting the overall performance of matching results. To overcome this, inspired by recent success of implicit neural representation, we present a novel method for semantic correspondence, called Neural Matching Field (NeMF). However, complicacy and high-dimensionality of a 4D matching field are the major hindrances, which we propose a cost embedding network to process a coarse cost volume to use as a guidance for establishing high-precision matching field through the following fully-connected network. Nevertheless, learning a high-dimensional matching field remains challenging mainly due to computational complexity, since a naive exhaustive inference would require querying from all pixels in the 4D space to infer pixel-wise correspondences. To overcome this, we propose adequate training and inference procedures, which in the training phase, we randomly sample matching candidates and in the inference phase, we iteratively performs PatchMatch-based inference and coordinate optimization at test time. With these combined, competitive results are attained on several standard benchmarks for semantic correspondence. 
Code and pre-trained weights are available at this https URL.\n## Sep25 - Oct1, 2022\n  - [Understanding Pure CLIP Guidance for Voxel Grid NeRF Models](https://arxiv.org/abs/2209.15172) | [code]\n    > We explore the task of text to 3D object generation using CLIP. Specifically, we use CLIP for guidance without access to any datasets, a setting we refer to as pure CLIP guidance. While prior work has adopted this setting, there is no systematic study of mechanics for preventing adversarial generations within CLIP. We illustrate how different image-based augmentations prevent the adversarial generation problem, and how the generated results are impacted. We test different CLIP model architectures and show that ensembling different models for guidance can prevent adversarial generations within bigger models and generate sharper results. Furthermore, we implement an implicit voxel grid model to show how neural networks provide an additional layer of regularization, resulting in better geometrical structure and coherency of generated objects. Compared to prior work, we achieve more coherent results with higher memory efficiency and faster training speeds.\n  - [City-scale Incremental Neural Mapping with Three-layer Sampling and Panoptic Representation](https://arxiv.org/abs/2209.14072) | [code]\n    > Neural implicit representations are drawing a lot of attention from the robotics community recently, as they are expressive, continuous and compact. However, city-scale incremental implicit dense mapping based on sparse LiDAR input is still an under-explored challenge. To this end, we successfully build the first city-scale incremental neural mapping system with a panoptic representation that consists of both environment-level and instance-level modelling. Given a stream of sparse LiDAR point cloud, it maintains a dynamic generative model that maps 3D coordinates to signed distance field (SDF) values. 
To address the difficulty of representing geometric information at different levels in city-scale space, we propose a tailored three-layer sampling strategy to dynamically sample the global, local and near-surface domains. Meanwhile, to realize high fidelity mapping, category-specific prior is introduced to better model the geometric details, leading to a panoptic representation. We evaluate on the public SemanticKITTI dataset and demonstrate the significance of the newly proposed three-layer sampling strategy and panoptic representation, using both quantitative and qualitative results. Codes and data will be publicly available.\n  - [360FusionNeRF: Panoramic Neural Radiance Fields with Joint Guidance](https://arxiv.org/abs/2209.14265) | [code]\n    > We present a method to synthesize novel views from a single 360∘ panorama image based on the neural radiance field (NeRF). Prior studies in a similar setting rely on the neighborhood interpolation capability of multi-layer perceptions to complete missing regions caused by occlusion, which leads to artifacts in their predictions. We propose 360FusionNeRF, a semi-supervised learning framework where we introduce geometric supervision and semantic consistency to guide the progressive training process. Firstly, the input image is re-projected to 360∘ images, and auxiliary depth maps are extracted at other camera positions. The depth supervision, in addition to the NeRF color guidance, improves the geometry of the synthesized views. Additionally, we introduce a semantic consistency loss that encourages realistic renderings of novel views. We extract these semantic features using a pre-trained visual encoder such as CLIP, a Vision Transformer trained on hundreds of millions of diverse 2D photographs mined from the web with natural language supervision. Experiments indicate that our proposed method can produce plausible completions of unobserved regions while preserving the features of the scene. 
When trained across various scenes, 360FusionNeRF consistently achieves the state-of-the-art performance when transferring to synthetic Structured3D dataset (PSNR~5%, SSIM~3% LPIPS~13%), real-world Matterport3D dataset (PSNR~3%, SSIM~3% LPIPS~9%) and Replica360 dataset (PSNR~8%, SSIM~2% LPIPS~18%).\n  - [Baking in the Feature: Accelerating Volumetric Segmentation by Rendering Feature Maps](https://arxiv.org/abs/2209.12744) | [code]\n    > Methods have recently been proposed that densely segment 3D volumes into classes using only color images and expert supervision in the form of sparse semantically annotated pixels. While impressive, these methods still require a relatively large amount of supervision and segmenting an object can take several minutes in practice. Such systems typically only optimize their representation on the particular scene they are fitting, without leveraging any prior information from previously seen images. In this paper, we propose to use features extracted with models trained on large existing datasets to improve segmentation performance. We bake this feature representation into a Neural Radiance Field (NeRF) by volumetrically rendering feature maps and supervising on features extracted from each input image. We show that by baking this representation into the NeRF, we make the subsequent classification task much easier. Our experiments show that our method achieves higher segmentation accuracy with fewer semantic annotations than existing methods over a wide range of scenes.\n## Sep18 - Sep24, 2022\n  - [NeRF-SOS: Any-View Self-supervised Object Segmentation on Complex Scenes](https://zhiwenfan.github.io/NeRF-SOS/) | [***``[code]``***](https://github.com/VITA-Group/NeRF-SOS)\n    > Neural volumetric representations have shown the potential that Multi-layer Perceptrons (MLPs) can be optimized with multi-view calibrated images to represent scene geometry and appearance, without explicit 3D supervision. 
Object segmentation can enrich many downstream applications based on the learned radiance field. However, introducing hand-crafted segmentation to define regions of interest in a complex real-world scene is non-trivial and expensive as it acquires per view annotation. This paper carries out the exploration of self-supervised learning for object segmentation using NeRF for complex real-world scenes. Our framework, called NeRF with Self-supervised Object Segmentation NeRF-SOS, couples object segmentation and neural radiance field to segment objects in any view within a scene. By proposing a novel collaborative contrastive loss in both appearance and geometry levels, NeRF-SOS encourages NeRF models to distill compact geometry-aware segmentation clusters from their density fields and the self-supervised pre-trained 2D visual features. The self-supervised object segmentation framework can be applied to various NeRF models that both lead to photo-realistic rendering results and convincing segmentation maps for both indoor and outdoor scenarios. Extensive results on the LLFF, Tank & Temple, and BlendedMVS datasets validate the effectiveness of NeRF-SOS. It consistently surpasses other 2D-based self-supervised baselines and predicts finer semantics masks than existing supervised counterparts. Please refer to the video on our project page for more details: this https URL.\n  - [Implicit Neural Representations for Medical Imaging Segmentation, MICCAI2022](https://link.springer.com/chapter/10.1007/978-3-031-16443-9_42) | [code]\n    > 3D signals in medical imaging, such as CT scans, are usually parameterized as a discrete grid of voxels. For instance, existing state-of-the-art organ segmentation methods learn discrete segmentation maps. Unfortunately, the memory requirements of such methods grow cubically with increasing spatial resolution, which makes them unsuitable for processing high resolution scans. 
To overcome this, we design an Implicit Organ Segmentation Network (IOSNet) that utilizes continuous Implicit Neural Representations and has several useful properties. Firstly, the IOSNet decoder memory is roughly constant and independent of the spatial resolution since it parameterizes the segmentation map as a continuous function. Secondly, IOSNet converges much faster than discrete voxel based methods due to its ability to accurately segment organs irrespective of organ sizes, thereby alleviating size imbalance issues without requiring any auxiliary tricks. Thirdly, IOSNet naturally supports super-resolution (i.e. sampling at arbitrary resolutions during inference) due to its continuous learnt representations. Moreover, despite using a simple lightweight decoder, IOSNet consistently outperforms the discrete specialized segmentation architecture UNet. Hence, our approach demonstrates that Implicit Neural Representations are well-suited for medical imaging applications, especially for processing high-resolution 3D medical scans.\n## Sep11 - Sep17, 2022\n## Sep4 - Sep10, 2022\n  - [Neural Feature Fusion Fields: 3D Distillation of Self-Supervised 2D Image Representations, 3DV2022(oral)](https://arxiv.org/abs/2209.03494) | [***``[code]``***](https://github.com/dichotomies/N3F)\n    > We present Neural Feature Fusion Fields (N3F), a method that improves dense 2D image feature extractors when the latter are applied to the analysis of multiple images reconstructible as a 3D scene. Given an image feature extractor, for example pre-trained using self-supervision, N3F uses it as a teacher to learn a student network defined in 3D space. The 3D student network is similar to a neural radiance field that distills said features and can be trained with the usual differentiable rendering machinery. As a consequence, N3F is readily applicable to most neural rendering formulations, including vanilla NeRF and its extensions to complex dynamic scenes. 
We show that our method not only enables semantic understanding in the context of scene-specific neural fields without the use of manual labels, but also consistently improves over the self-supervised 2D baselines. This is demonstrated by considering various tasks, such as 2D object retrieval, 3D segmentation, and scene editing, in diverse sequences, including long egocentric videos in the EPIC-KITCHENS benchmark.\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n  - [DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation](https://dreambooth.github.io/) | [code]\n    > Large text-to-image models achieved a remarkable leap in the evolution of AI, enabling high-quality and diverse synthesis of images from a given text prompt. However, these models lack the ability to mimic the appearance of subjects in a given reference set and synthesize novel renditions of them in different contexts. In this work, we present a new approach for \"personalization\" of text-to-image diffusion models (specializing them to users' needs). Given as input just a few images of a subject, we fine-tune a pretrained text-to-image model (Imagen, although our method is not limited to a specific model) such that it learns to bind a unique identifier with that specific subject. Once the subject is embedded in the output domain of the model, the unique identifier can then be used to synthesize fully-novel photorealistic images of the subject contextualized in different scenes. By leveraging the semantic prior embedded in the model with a new autogenous class-specific prior preservation loss, our technique enables synthesizing the subject in diverse scenes, poses, views, and lighting conditions that do not appear in the reference images. We apply our technique to several previously-unassailable tasks, including subject recontextualization, text-guided view synthesis, appearance modification, and artistic rendering (all while preserving the subject's key features). 
Project page: this https URL\n## Aug14 - Aug20, 2022\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n  - [NeSF: Neural Semantic Fields for Generalizable Semantic Segmentation of 3D Scenes](https://research.google/pubs/pub51563/) | [code]\n    > We present NeSF, a method for producing 3D semantic fields from pre-trained density fields and sparse 2D semantic supervision. Our method side-steps traditional scene representations by leveraging neural representations where 3D information is stored within neural fields. In spite of being supervised by 2D signals alone, our method is able to generate 3D-consistent semantic maps from novel camera poses and can be queried at arbitrary 3D points. Notably, NeSF is compatible with any method producing a density field, and its accuracy improves as the quality of the pre-trained density fields improve. Our empirical analysis demonstrates comparable quality to competitive 2D and 3D semantic segmentation baselines on convincing synthetic scenes while also offering features unavailable to existing methods.\n## Jul24 - Jul30, 2022\n## Previous weeks\n  - [Putting NeRF on a Diet: Semantically Consistent Few-Shot View Synthesis, ICCV2021](https://www.ajayj.com/dietnerf) | [***``[code]``***](https://github.com/ajayjain/DietNeRF)\n    > We present DietNeRF, a 3D neural scene representation estimated from a few images. Neural Radiance Fields (NeRF) learn a continuous volumetric representation of a scene through multi-view consistency, and can be rendered from novel viewpoints by ray casting. While NeRF has an impressive ability to reconstruct geometry and fine details given many images, up to 100 for challenging 360° scenes, it often finds a degenerate solution to its image reconstruction objective when only a few input views are available. To improve few-shot quality, we propose DietNeRF. We introduce an auxiliary semantic consistency loss that encourages realistic renderings at novel poses. 
DietNeRF is trained on individual scenes to (1) correctly render given input views from the same pose, and (2) match high-level semantic attributes across different, random poses. Our semantic loss allows us to supervise DietNeRF from arbitrary poses. We extract these semantics using a pre-trained visual encoder such as CLIP, a Vision Transformer trained on hundreds of millions of diverse single-view, 2D photographs mined from the web with natural language supervision. In experiments, DietNeRF improves the perceptual quality of few-shot view synthesis when learned from scratch, can render novel views with as few as one observed image when pre-trained on a multi-view dataset, and produces plausible completions of completely unobserved regions.\n  - [Unsupervised Discovery of Object Radiance Fields, ICLR2022](https://arxiv.org/abs/2107.07905) | [code]\n    > We study the problem of inferring an object-centric scene representation from a single image, aiming to derive a representation that explains the image formation process, captures the scene's 3D nature, and is learned without supervision. Most existing methods on scene decomposition lack one or more of these characteristics, due to the fundamental challenge in integrating the complex 3D-to-2D image formation process into powerful inference schemes like deep networks. In this paper, we propose unsupervised discovery of Object Radiance Fields (uORF), integrating recent progresses in neural 3D scene representations and rendering with deep inference networks for unsupervised 3D scene decomposition. Trained on multi-view RGB images without annotations, uORF learns to decompose complex scenes with diverse, textured background from a single image. 
We show that uORF performs well on unsupervised 3D scene segmentation, novel view synthesis, and scene editing on three datasets.\n  - [In-Place Scene Labelling and Understanding with Implicit Scene Representation, ICCV2021(oral)](https://shuaifengzhi.com/Semantic-NeRF/) | [***``[code]``***](https://github.com/Harry-Zhi/semantic_nerf/)\n    > Semantic labelling is highly correlated with geometry and radiance reconstruction, as scene entities with similar shape and appearance are more likely to come from similar classes. Recent implicit neural reconstruction techniques are appealing as they do not require prior training data, but the same fully self-supervised approach is not possible for semantics because labels are human-defined properties.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/texture.md",
    "content": "\nWeekly Classified Neural Radiance Fields - texture ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n===================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n  - [Deep Appearance Prefiltering, ToG2022](https://dl.acm.org/doi/abs/10.1145/3570327) | [code]\n    > Physically based rendering of complex scenes can be prohibitively costly with a potentially unbounded and uneven distribution of complexity across the rendered image. The goal of an ideal level of detail (LoD) method is to make rendering costs independent of the 3D scene complexity, while preserving the appearance of the scene. However, current prefiltering LoD methods are limited in the appearances they can support due to their reliance of approximate models and other heuristics. We propose the first comprehensive multi-scale LoD framework for prefiltering 3D environments with complex geometry and materials (e.g., the Disney BRDF), while maintaining the appearance with respect to the ray-traced reference. Using a multi-scale hierarchy of the scene, we perform a data-driven prefiltering step to obtain an appearance phase function and directional coverage mask at each scale. 
At the heart of our approach is a novel neural representation that encodes this information into a compact latent form that is easy to decode inside a physically based renderer. Once a scene is baked out, our method requires no original geometry, materials, or textures at render time. We demonstrate that our approach compares favorably to state-of-the-art prefiltering methods and achieves considerable savings in memory for complex scenes.\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n  - [IBL-NeRF: Image-Based Lighting Formulation of Neural Radiance Fields](https://arxiv.org/abs/2210.08202) | [code]\n    > We propose IBL-NeRF, which decomposes the neural radiance fields (NeRF) of large-scale indoor scenes into intrinsic components. Previous approaches for the inverse rendering of NeRF transform the implicit volume to fit the rendering pipeline of explicit geometry, and approximate the views of segmented, isolated objects with environment lighting. In contrast, our inverse rendering extends the original NeRF formulation to capture the spatial variation of lighting within the scene volume, in addition to surface properties. Specifically, the scenes of diverse materials are decomposed into intrinsic components for image-based rendering, namely, albedo, roughness, surface normal, irradiance, and prefiltered radiance. All of the components are inferred as neural images from MLP, which can model large-scale general scenes. By adopting the image-based formulation of NeRF, our approach inherits superior visual quality and multi-view consistency for synthesized images. 
We demonstrate the performance on scenes with complex object layouts and light configurations, which could not be processed in any of the previous works.\n  - [NeuralRoom: Geometry-Constrained Neural Implicit Surfaces for Indoor Scene Reconstruction](https://arxiv.org/abs/2210.06853) | [code]\n    > We present a novel neural surface reconstruction method called NeuralRoom for reconstructing room-sized indoor scenes directly from a set of 2D images. Recently, implicit neural representations have become a promising way to reconstruct surfaces from multiview images due to their high-quality results and simplicity. However, implicit neural representations usually cannot reconstruct indoor scenes well because they suffer severe shape-radiance ambiguity. We assume that the indoor scene consists of texture-rich and flat texture-less regions. In texture-rich regions, the multiview stereo can obtain accurate results. In the flat area, normal estimation networks usually obtain a good normal estimation. Based on the above observations, we reduce the possible spatial variation range of implicit neural surfaces by reliable geometric priors to alleviate shape-radiance ambiguity. Specifically, we use multiview stereo results to limit the NeuralRoom optimization space and then use reliable geometric priors to guide NeuralRoom training. Then the NeuralRoom would produce a neural scene representation that can render an image consistent with the input training images. In addition, we propose a smoothing method called perturbation-residual restrictions to improve the accuracy and completeness of the flat region, which assumes that the sampling points in a local surface should have the same normal and similar distance to the observation center. Experiments on the ScanNet dataset show that our method can reconstruct the texture-less area of indoor scenes while maintaining the accuracy of detail. 
We also apply NeuralRoom to more advanced multiview reconstruction algorithms and significantly improve their reconstruction quality.\n## Oct2 - Oct8, 2022\n## Sep25 - Oct1, 2022\n## Sep18 - Sep24, 2022\n  - [SG-SRNs: Superpixel-Guided Scene Representation Networks, SignalProcessingLetters](https://ieeexplore.ieee.org/abstract/document/9900405) | [code]\n    > Recently, Scene Representation Networks (SRNs) have attracted increasing attention in computer vision, due to their continuous and light-weight scene representation ability. However, SRNs generally perform poorly on low-texture image regions. Addressing this problem, we propose superpixel-guided scene representation networks in this paper, called SG-SRNs, consisting of a backbone module (SRNs), a superpixel segmentation module, and a superpixel regularization module. In the proposed method, except for the novel view synthesis task, the task of representation-aware superpixel segmentation mask generation is realized by the proposed superpixel segmentation module. Then, the superpixel regularization module utilizes the superpixel segmentation mask to guide the backbone to be learned in a locally smooth way, and optimizes the scene representations of the local regions to indirectly alleviate the structure distortion of low-texture regions in a self-supervised manner. Extensive experimental results on both our constructed datasets and the public Synthetic-NeRF dataset demonstrated that the proposed SG-SRNs achieved a significantly better 3D structure representing performance.\n  - [Human Performance Modeling and Rendering via Neural Animated Mesh](https://arxiv.org/abs/2209.08468) | [code]\n    > We have recently seen tremendous progress in the neural advances for photo-real human modeling and rendering. However, it's still challenging to integrate them into an existing mesh-based pipeline for downstream applications. 
In this paper, we present a comprehensive neural approach for high-quality reconstruction, compression, and rendering of human performances from dense multi-view videos. Our core intuition is to bridge the traditional animated mesh workflow with a new class of highly efficient neural techniques. We first introduce a neural surface reconstructor for high-quality surface generation in minutes. It marries the implicit volumetric rendering of the truncated signed distance field (TSDF) with multi-resolution hash encoding. We further propose a hybrid neural tracker to generate animated meshes, which combines explicit non-rigid tracking with implicit dynamic deformation in a self-supervised framework. The former provides the coarse warping back into the canonical space, while the latter implicit one further predicts the displacements using the 4D hash encoding as in our reconstructor. Then, we discuss the rendering schemes using the obtained animated meshes, ranging from dynamic texturing to lumigraph rendering under various bandwidth settings. To strike an intricate balance between quality and bandwidth, we propose a hierarchical solution by first rendering 6 virtual views covering the performer and then conducting occlusion-aware neural texture blending. We demonstrate the efficacy of our approach in a variety of mesh-based applications and photo-realistic free-view experiences on various platforms, i.e., inserting virtual human performances into real environments through mobile AR or immersively watching talent shows with VR headsets.\n## Sep11 - Sep17, 2022\n  - [StructNeRF: Neural Radiance Fields for Indoor Scenes with Structural Hints](https://arxiv.org/abs/2209.05277) | [code]\n    > Neural Radiance Fields (NeRF) achieve photo-realistic view synthesis with densely captured input images. However, the geometry of NeRF is extremely under-constrained given sparse views, resulting in significant degradation of novel view synthesis quality. 
Inspired by self-supervised depth estimation methods, we propose StructNeRF, a solution to novel view synthesis for indoor scenes with sparse inputs. StructNeRF leverages the structural hints naturally embedded in multi-view inputs to handle the unconstrained geometry issue in NeRF. Specifically, it tackles the texture and non-texture regions respectively: a patch-based multi-view consistent photometric loss is proposed to constrain the geometry of textured regions; for non-textured ones, we explicitly restrict them to be 3D consistent planes. Through the dense self-supervised depth constraints, our method improves both the geometry and the view synthesis performance of NeRF without any additional training on external data. Extensive experiments on several real-world datasets demonstrate that StructNeRF surpasses state-of-the-art methods for indoor scenes with sparse inputs both quantitatively and qualitatively.\n## Sep4 - Sep10, 2022\n  - [3D Textured Shape Recovery with Learned Geometric Priors](https://arxiv.org/abs/2209.03254) | [code]\n    > 3D textured shape recovery from partial scans is crucial for many real-world applications. Existing approaches have demonstrated the efficacy of implicit function representation, but they suffer from partial inputs with severe occlusions and varying object types, which greatly hinders their application value in the real world. This technical report presents our approach to address these limitations by incorporating learned geometric priors. To this end, we generate a SMPL model from learned pose prediction and fuse it into the partial input to add prior knowledge of human bodies. 
We also propose a novel completeness-aware bounding box adaptation for handling different levels of scales and partialness of partial scans.\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n  - [ShAPO: Implicit Representations for Multi-Object Shape, Appearance, and Pose Optimization, ECCV2022](https://arxiv.org/abs/2207.13691) | [***``[code]``***](https://zubair-irshad.github.io/projects/ShAPO.html)\n    > Our method studies the complex task of object-centric 3D understanding from a single RGB-D observation. As it is an ill-posed problem, existing methods suffer from low performance for both 3D shape and 6D pose and size estimation in complex multi-object scenarios with occlusions. We present ShAPO, a method for joint multi-object detection, 3D textured reconstruction, 6D object pose and size estimation. Key to ShAPO is a single-shot pipeline to regress shape, appearance and pose latent codes along with the masks of each object instance, which is then further refined in a sparse-to-dense fashion. A novel disentangled shape and appearance database of priors is first learned to embed objects in their respective shape and appearance space. We also propose a novel, octree-based differentiable optimization step, allowing us to further improve object shape, pose and appearance simultaneously under the learned latent space, in an analysis-by-synthesis fashion. Our novel joint implicit textured object representation allows us to accurately identify and reconstruct novel unseen objects without having access to their 3D meshes. Through extensive experiments, we show that our method, trained on simulated indoor scenes, accurately regresses the shape, appearance and pose of novel objects in the real-world with minimal fine-tuning. 
Our method significantly out-performs all baselines on the NOCS dataset with an 8% absolute improvement in mAP for 6D pose estimation.\n  - [NeuMesh: Learning Disentangled Neural Mesh-based Implicit Field for Geometry and Texture Editing, ECCV2022(oral)](https://arxiv.org/abs/2207.11911) | [code]\n    > Very recently neural implicit rendering techniques have been rapidly evolved and shown great advantages in novel view synthesis and 3D scene reconstruction. However, existing neural rendering methods for editing purposes offer limited functionality, e.g., rigid transformation, or not applicable for fine-grained editing for general objects from daily lives. In this paper, we present a novel mesh-based representation by encoding the neural implicit field with disentangled geometry and texture codes on mesh vertices, which facilitates a set of editing functionalities, including mesh-guided geometry editing, designated texture editing with texture swapping, filling and painting operations. To this end, we develop several techniques including learnable sign indicators to magnify spatial distinguishability of mesh-based representation, distillation and fine-tuning mechanism to make a steady convergence, and the spatial-aware optimization strategy to realize precise texture editing. Extensive experiments and editing examples on both real and synthetic data demonstrate the superiority of our method on representation quality and editing ability. 
Code is available on the project webpage: this https URL.\n## Previous weeks\n  - [CodeNeRF: Disentangled Neural Radiance Fields for Object Categories, ICCV2021(oral)](https://www.google.com/url?q=https%3A%2F%2Farxiv.org%2Fpdf%2F2109.01750.pdf&sa=D&sntz=1&usg=AOvVaw1Fnir0e4aRa22Nt0HoXDWh) | [***``[code]``***](https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fwbjang%2Fcode-nerf&sa=D&sntz=1&usg=AOvVaw2eD5ZoRbk2aWFuwUSHlh5_)\n    > CodeNeRF is an implicit 3D neural representation that learns the variation of object shapes and textures across a category and can be trained, from a set of posed images, to synthesize novel views of unseen objects. Unlike the original NeRF, which is scene specific, CodeNeRF learns to disentangle shape and texture by learning separate embeddings. At test time, given a single unposed image of an unseen object, CodeNeRF jointly estimates camera viewpoint, and shape and appearance codes via optimization. Unseen objects can be reconstructed from a single image, and then rendered from new viewpoints or their shape and texture edited by varying the latent codes. We conduct experiments on the SRN benchmark, which show that CodeNeRF generalises well to unseen objects and achieves on-par performance with methods that require known camera pose at test time. Our results on real-world images demonstrate that CodeNeRF can bridge the sim-to-real gap. \n  - [Unsupervised Discovery of Object Radiance Fields, ICLR2022](https://arxiv.org/abs/2107.07905) | [code]\n    > We study the problem of inferring an object-centric scene representation from a single image, aiming to derive a representation that explains the image formation process, captures the scene's 3D nature, and is learned without supervision. Most existing methods on scene decomposition lack one or more of these characteristics, due to the fundamental challenge in integrating the complex 3D-to-2D image formation process into powerful inference schemes like deep networks. 
In this paper, we propose unsupervised discovery of Object Radiance Fields (uORF), integrating recent progresses in neural 3D scene representations and rendering with deep inference networks for unsupervised 3D scene decomposition. Trained on multi-view RGB images without annotations, uORF learns to decompose complex scenes with diverse, textured background from a single image. We show that uORF performs well on unsupervised 3D scene segmentation, novel view synthesis, and scene editing on three datasets.\n  - [NeRF-Tex: Neural Reflectance Field Textures, EGSR2021](https://developer.nvidia.com/blog/nvidia-research-nerf-tex-neural-reflectance-field-textures/) | [***``[code]``***](https://github.com/hbaatz/nerf-tex)\n    > We investigate the use of neural fields for modeling diverse mesoscale structures, such as fur, fabric, and grass. Instead of using classical graphics primitives to model the structure, we propose to employ a versatile volumetric primitive represented by a neural reflectance field (NeRF-Tex), which jointly models the geometry of the material and its response to lighting. The NeRF-Tex primitive can be instantiated over a base mesh to “texture” it with the desired meso and microscale appearance. We condition the reflectance field on user-defined parameters that control the appearance. A single NeRF texture thus captures an entire space of reflectance fields rather than one specific structure. This increases the gamut of appearances that can be modeled and provides a solution for combating repetitive texturing artifacts. We also demonstrate that NeRF textures naturally facilitate continuous level-of-detail rendering. Our approach unites the versatility and modeling power of neural networks with the artistic control needed for precise modeling of virtual scenes. While all our training data is currently synthetic, our work provides a recipe that can be further extended to extract complex, hard-to-model appearances from real images.\n"
  },
  {
    "path": "docs/classified_weekly_nerf/video.md",
    "content": "\nWeekly Classified Neural Radiance Fields - video ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n=================================================================================================================================================================\n## Filter by classes: \n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | [lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \n## Dec27 - Jan3, 2023\n  - [Boosting UAV Tracking with Voxel-Based Trajectory-Aware Pre-Training, RAL2022](https://ieeexplore.ieee.org/abstract/document/10015867) | [code]\n    > Siamese network-based object tracking has remarkably promoted the automatic capability for highly-maneuvered unmanned aerial vehicles (UAVs). However, the leading-edge tracking framework often depends on template matching, making it trapped when facing multiple views of object in consecutive frames. Moreover, the general image-level pretrained backbone can overfit to holistic representations, causing the misalignment to learn object-level properties in UAV tracking. To tackle these issues, this work presents TRTrack , a comprehensive framework to fully exploit the stereoscopic representation for UAV tracking. Specifically, a novel pre-training paradigm method is proposed. Through trajectory-aware reconstruction training (TRT), the capability of the backbone to extract stereoscopic structure feature is strengthened without any parameter increment. Accordingly, an innovative hierarchical self-attention Transformer is proposed to capture the local detail information and global structure knowledge. 
For optimizing the correlation map, we propose a novel spatial correlation refinement (SCR) module, which promotes the capability of modeling the long-range spatial dependencies. Comprehensive experiments on three UAV challenging benchmarks demonstrate that the proposed TRTrack achieves superior UAV tracking performance in both precision and efficiency. Quantitative tests in real-world settings fully prove the effectiveness of our work.\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n  - [QuadStream: A Quad-Based Scene Streaming Architecture for Novel Viewpoint Reconstruction, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555524) | [code]\n    > Streaming rendered 3D content over a network to a thin client device, such as a phone or a VR/AR headset, brings high-fidelity graphics to platforms where it would not normally be possible due to thermal, power, or cost constraints. Streamed 3D content must be transmitted with a representation that is both robust to latency and potential network dropouts. Transmitting a video stream and reprojecting to correct for changing viewpoints fails in the presence of disocclusion events; streaming scene geometry and performing high-quality rendering on the client is not possible on limited-power mobile GPUs. To balance the competing goals of disocclusion robustness and minimal client workload, we introduce QuadStream, a new streaming content representation that reduces motion-to-photon latency by allowing clients to efficiently render novel views without artifacts caused by disocclusion events. Motivated by traditional macroblock approaches to video codec design, we decompose the scene seen from positions in a view cell into a series of quad proxies, or view-aligned quads from multiple views. 
By operating on a rasterized G-Buffer, our approach is independent of the representation used for the scene itself; the resulting QuadStream is an approximate geometric representation of the scene that can be reconstructed by a thin client to render both the current view and nearby adjacent views. Our technical contributions are an efficient parallel quad generation, merging, and packing strategy for proxy views covering potential client movement in a scene; a packing and encoding strategy that allows masked quads with depth information to be transmitted as a frame-coherent stream; and an efficient rendering approach for rendering our QuadStream representation into entirely novel views on thin clients. We show that our approach achieves superior quality compared both to video data streaming methods, and to geometry-based streaming.\n## Nov20 - Nov26, 2022\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n## Oct2 - Oct8, 2022\n## Sep25 - Oct1, 2022\n  - [MonoNeuralFusion: Online Monocular Neural 3D Reconstruction with Geometric Priors](https://arxiv.org/abs/2209.15153) | [code]\n    > High-fidelity 3D scene reconstruction from monocular videos continues to be challenging, especially for complete and fine-grained geometry reconstruction. The previous 3D reconstruction approaches with neural implicit representations have shown a promising ability for complete scene reconstruction, while their results are often over-smooth and lack enough geometric details. This paper introduces a novel neural implicit scene representation with volume rendering for high-fidelity online 3D scene reconstruction from monocular videos. For fine-grained reconstruction, our key insight is to incorporate geometric priors into both the neural implicit scene representation and neural volume rendering, thus leading to an effective geometry learning mechanism based on volume rendering optimization. 
Benefiting from this, we present MonoNeuralFusion to perform the online neural 3D reconstruction from monocular videos, by which the 3D scene geometry is efficiently generated and optimized during the on-the-fly 3D monocular scanning. The extensive comparisons with state-of-the-art approaches show that our MonoNeuralFusion consistently generates much better complete and fine-grained reconstruction results, both quantitatively and qualitatively.\n## Sep18 - Sep24, 2022\n## Sep11 - Sep17, 2022\n## Sep4 - Sep10, 2022\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n  - [Temporal View Synthesis of Dynamic Scenes through 3D Object Motion Estimation with Multi-Plane Images, ISMAR2022](https://arxiv.org/abs/2208.09463) | [***``[code]``***](https://github.com/NagabhushanSN95/DeCOMPnet)\n    > The challenge of graphically rendering high frame-rate videos on low compute devices can be addressed through periodic prediction of future frames to enhance the user experience in virtual reality applications. This is studied through the problem of temporal view synthesis (TVS), where the goal is to predict the next frames of a video given the previous frames and the head poses of the previous and the next frames. In this work, we consider the TVS of dynamic scenes in which both the user and objects are moving. We design a framework that decouples the motion into user and object motion to effectively use the available user motion while predicting the next frames. We predict the motion of objects by isolating and estimating the 3D object motion in the past frames and then extrapolating it. We employ multi-plane images (MPI) as a 3D representation of the scenes and model the object motion as the 3D displacement between the corresponding points in the MPI representation. In order to handle the sparsity in MPIs while estimating the motion, we incorporate partial convolutions and masked correlation layers to estimate corresponding points. 
The predicted object motion is then integrated with the given user or camera motion to generate the next frame. Using a disocclusion infilling module, we synthesize the regions uncovered due to the camera and object motion. We develop a new synthetic dataset for TVS of dynamic scenes consisting of 800 videos at full HD resolution. We show through experiments on our dataset and the MPI Sintel dataset that our model outperforms all the competing methods in the literature.\n  - [Neural Capture of Animatable 3D Human from Monocular Video, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > We present a novel paradigm of building an animatable 3D human representation from a monocular video input, such that it can be rendered in any unseen poses and views. Our method is based on a dynamic Neural Radiance Field (NeRF) rigged by a mesh-based parametric 3D human model serving as a geometry proxy. Previous methods usually rely on multi-view videos or accurate 3D geometry information as additional inputs; besides, most methods suffer from degraded quality when generalized to unseen poses. We identify that the key to generalization is a good input embedding for querying dynamic NeRF: A good input embedding should define an injective mapping in the full volumetric space, guided by surface mesh deformation under pose variation. Based on this observation, we propose to embed the input query with its relationship to local surface regions spanned by a set of geodesic nearest neighbors on mesh vertices. By including both position and relative distance information, our embedding defines a distance-preserved deformation mapping and generalizes well to unseen poses. To reduce the dependency on additional inputs, we first initialize per-frame 3D meshes using off-the-shelf tools and then propose a pipeline to jointly optimize NeRF and refine the initial mesh. 
Extensive experiments show our method can synthesize plausible human rendering results under unseen poses and views.\n  - [Casual Indoor HDR Radiance Capture from Omnidirectional Images](https://arxiv.org/abs/2208.07903) | [code]\n    > We present PanoHDR-NeRF, a novel pipeline to casually capture a plausible full HDR radiance field of a large indoor scene without elaborate setups or complex capture protocols. First, a user captures a low dynamic range (LDR) omnidirectional video of the scene by freely waving an off-the-shelf camera around the scene. Then, an LDR2HDR network uplifts the captured LDR frames to HDR, subsequently used to train a tailored NeRF++ model. The resulting PanoHDR-NeRF pipeline can estimate full HDR panoramas from any location of the scene. Through experiments on a novel test dataset of a variety of real scenes with the ground truth HDR radiance captured at locations not seen during training, we show that PanoHDR-NeRF predicts plausible radiance from any scene point. We also show that the HDR images produced by PanoHDR-NeRF can synthesize correct lighting effects, enabling the augmentation of indoor scenes with synthetic objects that are lit correctly.\n## Aug7 - Aug13, 2022\n  - [PS-NeRV: Patch-wise Stylized Neural Representations for Videos](https://arxiv.org/abs/2208.03742) | [code]\n    > We study how to represent a video with implicit neural representations (INRs). Classical INRs methods generally utilize MLPs to map input coordinates to output pixels. While some recent works have tried to directly reconstruct the whole image with CNNs. However, we argue that both the above pixel-wise and image-wise strategies are not favorable to video data. Instead, we propose a patch-wise solution, PS-NeRV, which represents videos as a function of patches and the corresponding patch coordinate. It naturally inherits the advantages of image-wise methods, and achieves excellent reconstruction performance with fast decoding speed. 
The whole method includes conventional modules, like positional embedding, MLPs and CNNs, while also introducing AdaIN to enhance intermediate features. These simple yet essential changes could help the network easily fit high-frequency details. Extensive experiments have demonstrated its effectiveness in several video-related tasks, such as video compression and video inpainting.\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n## Previous weeks\n  - [Plenoxels: Radiance Fields without Neural Networks, CVPR2022(oral)](https://arxiv.org/abs/2112.05131) | [***``[code]``***](https://alexyu.net/plenoxels)\n    > We introduce Plenoxels (plenoptic voxels), a system for photorealistic view synthesis. Plenoxels represent a scene as a sparse 3D grid with spherical harmonics. This representation can be optimized from calibrated images via gradient methods and regularization without any neural components. On standard, benchmark tasks, Plenoxels are optimized two orders of magnitude faster than Neural Radiance Fields with no loss in visual quality.\n  - [Neural Scene Flow Fields for Space-Time View Synthesis of Dynamic Scenes, CVPR2021](http://www.cs.cornell.edu/~zl548/NSFF/) | [***``[code]``***](https://github.com/zhengqili/Neural-Scene-Flow-Fields)\n    > We present a method to perform novel view and time synthesis of dynamic scenes, requiring only a monocular video with known camera poses as input. To do this, we introduce Neural Scene Flow Fields, a new representation that models the dynamic scene as a time-variant continuous function of appearance, geometry, and 3D scene motion. Our representation is optimized through a neural network to fit the observed input views. We show that our representation can be used for complex dynamic scenes, including thin structures, view-dependent effects, and natural degrees of motion. 
We conduct a number of experiments that demonstrate our approach significantly outperforms recent monocular view synthesis methods, and show qualitative results of space-time view synthesis on a variety of real-world videos.\n  - [Neural 3D Video Synthesis from Multi-view Video, CVPR2022(oral)](https://neural-3d-video.github.io/) | [code]\n    > We propose a novel approach for 3D video synthesis that is able to represent multi-view video recordings of a dynamic real-world scene in a compact, yet expressive representation that enables high-quality view synthesis and motion interpolation. Our approach takes the high quality and compactness of static neural radiance fields in a new direction: to a model-free, dynamic setting. At the core of our approach is a novel time-conditioned neural radiance fields that represents scene dynamics using a set of compact latent codes. To exploit the fact that changes between adjacent frames of a video are typically small and locally consistent, we propose two novel strategies for efficient training of our neural network: 1) An efficient hierarchical training scheme, and 2) an importance sampling strategy that selects the next rays for training based on the temporal variation of the input videos. In combination, these two strategies significantly boost the training speed, lead to fast convergence of the training process, and enable high quality results. Our learned representation is highly compact and able to represent a 10 second 30 FPS multi-view video recording by 18 cameras with a model size of just 28MB. We demonstrate that our method can render high-fidelity wide-angle novel views at over 1K resolution, even for highly complex and dynamic scenes. We perform an extensive qualitative and quantitative evaluation that shows that our approach outperforms the current state of the art. 
Project website: https://neural-3d-video.github.io.\n  - [Dynamic View Synthesis from Dynamic Monocular Video, ICCV2021](https://free-view-video.github.io/) | [***``[code]``***](https://github.com/gaochen315/DynamicNeRF)\n    > We present an algorithm for generating novel views at arbitrary viewpoints and any input time step given a monocular video of a dynamic scene. Our work builds upon recent advances in neural implicit representation and uses continuous and differentiable functions for modeling the time-varying structure and the appearance of the scene. We jointly train a time-invariant static NeRF and a time-varying dynamic NeRF, and learn how to blend the results in an unsupervised manner. However, learning this implicit function from a single video is highly ill-posed (with infinitely many solutions that match the input video). To resolve the ambiguity, we introduce regularization losses to encourage a more physically plausible solution. We show extensive quantitative and qualitative results of dynamic view synthesis from casually captured videos.\n  - [Editable Free-Viewpoint Video using a Layered Neural Representation, SIGGRAPH2021](https://jiakai-zhang.github.io/st-nerf/) | [***``[code]``***](https://jiakai-zhang.github.io/st-nerf/#code)\n    > Generating free-viewpoint videos is critical for immersive VR/AR experience but recent neural advances still lack the editing ability to manipulate the visual perception for large dynamic scenes. To fill this gap, in this paper we propose the first approach for editable photo-realistic free-viewpoint video generation for large-scale dynamic scenes using only sparse 16 cameras. The core of our approach is a new layered neural representation, where each dynamic entity including the environment itself is formulated into a space-time coherent neural layered radiance representation called ST-NeRF. 
Such layered representation supports full perception and realistic manipulation of the dynamic scene whilst still supporting a free viewing experience in a wide range. In our ST-NeRF, the dynamic entity/layer is represented as continuous functions, which achieves the disentanglement of location, deformation as well as the appearance of the dynamic entity in a continuous and self-supervised manner. We propose a scene parsing 4D label map tracking to disentangle the spatial information explicitly, and a continuous deform module to disentangle the temporal motion implicitly. An object-aware volume rendering scheme is further introduced for the re-assembling of all the neural layers. We adopt a novel layered loss and motion-aware ray sampling strategy to enable efficient training for a large dynamic scene with multiple performers. Our framework further enables a variety of editing functions, i.e., manipulating the scale and location, duplicating or retiming individual neural layers to create numerous visual effects while preserving high realism. Extensive experiments demonstrate the effectiveness of our approach to achieve high-quality, photo-realistic, and editable free-viewpoint video generation for dynamic scenes.\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/dynamic.md",
    "content": "\n每周分类神经辐射场 - dynamic ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n====================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n  - [Tensor4D：用于高保真动态重建和渲染的高效神经 4D 分解](https://arxiv.org/abs/2211.11610) | [code]\n    > 我们介绍了 Tensor4D，这是一种高效而有效的动态场景建模方法。我们解决方案的关键是一种高效的 4D 张量分解方法，使动态场景可以直接表示为 4D 时空张量。为了解决伴随的内存问题，我们首先将 4D 张量投影到三个时间感知体积，然后是九个紧凑的特征平面，从而分层分解 4D 张量。通过这种方式，可以以紧凑且高效的方式同时捕获随时间变化的空间信息。当应用 Tensor4D 进行动态场景重建和渲染时，我们进一步将 4D 场分解为不同的尺度，以便从粗到细学习结构运动和动态细节变化。我们的方法的有效性在合成场景和真实场景中都得到了验证。大量实验表明，我们的方法能够从稀疏视图摄像机装置甚至单目摄像机实现高质量的动态重建和渲染。代码和数据集将在此 https URL 上发布。\n  - [DynIBaR：基于神经动态图像的渲染, -](https://arxiv.org/abs/2211.11082) | [code]\n    > 我们解决了从描述复杂动态场景的单目视频中合成新视图的问题。基于随时间变化的神经辐射场（又名动态 NeRF）的最先进方法已在该任务上显示出令人印象深刻的结果。然而，对于具有复杂物体运动和不受控制的摄像机轨迹的长视频，这些方法可能会产生模糊或不准确的渲染，从而阻碍它们在现实世界中的应用。我们提出了一种新方法来解决这些限制，而不是在 MLP 的权重内对整个动态场景进行编码，方法是采用基于体积图像的渲染框架，该框架通过以场景运动感知方式聚合附近视图的特征来合成新视点.我们的系统保留了先前方法在建模复杂场景和视图相关效果方面的优势，而且还能够从具有复杂场景动态和不受约束的相机轨迹的长视频中合成照片般逼真的新颖视图。我们展示了对动态场景数据集的最先进方法的显着改进，并将我们的方法应用于具有挑战性相机和物体运动的野外视频，在这些视频中，先前的方法无法产生高质量的渲染。我们的项目网页位于此 http URL。\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n  - [ParticleNeRF：动态场景中在线神经辐射场的基于粒子的编码](https://arxiv.org/abs/2211.04041) | [code]\n    > 神经辐射场 (NeRFs) 从图像中学习隐式表示（通常是静态的）环境。我们的论文扩展了 NeRFs 以在线方式处理动态场景。我们建议 ParticleNeRF 适应环境几何形状的变化，每 350 毫秒学习一个新的最新表示。与其他 NeRF 框架相比，ParticleNeRF 可以以更高的保真度表示动态环境的当前状态。为实现这一目标，我们引入了一种新的基于粒子的参数编码，它允许中间 
NeRF 特征——现在耦合到空间中的粒子——随动态几何移动。这可以通过将光度重建损失反向传播到粒子的位置来实现。位置梯度被解释为粒子速度，并使用基于位置的动力学 (PBS) 物理系统集成到位置中。将 PBS 引入 NeRF 公式使我们能够为粒子运动添加碰撞约束，并创造未来机会将其他运动先验添加到系统中，例如刚体和可变形体\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n  - [神经辐射场场景重建探索：合成、真实世界和动态场景](https://arxiv.org/abs/2210.12268) | [code]\n    > 该项目介绍了使用神经辐射场 (NeRF) 方法对合成和真实世界场景进行 3D 场景重建的探索。我们主要利用神经图形基元多分辨率哈希编码的训练和渲染时间的减少，来重建静态视频游戏场景和现实世界场景，比较和观察重建细节和局限性。此外，我们使用动态场景的神经辐射场 (D-NeRF) 探索动态场景重建。最后，我们扩展了 D-NeRF 的实现，最初仅限于处理合成场景，也可以处理真实世界的动态场景。\n## Oct9 - Oct15, 2022\n  - [通过学习一致性场实现高效的神经场景图, BMVC2022](https://arxiv.org/abs/2210.04127) | [***``[code]``***](https://github.com/ldynx/CF-NSG)\n    > 神经辐射场 (NeRF) 从新颖的视图实现照片般逼真的图像渲染，神经场景图 (NSG) \\cite{ost2021neural} 将其扩展到具有多个对象的动态场景（视频）。然而，为每个图像帧计算繁重的光线行进成为一个巨大的负担。在本文中，利用视频中相邻帧之间的显着冗余，我们提出了一个特征重用框架。然而，从天真地重用 NSG 特征的第一次尝试中，我们了解到，将跨帧一致的对象内在属性与瞬态属性分开是至关重要的。我们提出的方法，\\textit{基于一致性场的 NSG (CF-NSG)}，重新定义了神经辐射场以额外考虑 \\textit{一致性场}。通过解开表示，CF-NSG 充分利用了特征重用方案，并以更可控的方式执行扩展程度的场景操作。我们凭经验验证，CF-NSG 通过使用比 NSG 少 85% 的查询大大提高了推理效率，而渲染质量没有显着下降。代码将在以下位置提供：此 https 网址\n## Oct2 - Oct8, 2022\n## Sep25 - Oct1, 2022\n## Sep18 - Sep24, 2022\n  - [PREF：可预测性正则化神经运动场, ECCV2022(oral)](https://arxiv.org/abs/2209.10691) | [code]\n    > 了解动态场景中的 3D 运动对于许多视觉应用至关重要。最近的进展主要集中在估计一些特定元素的活动，如人类。在本文中，我们利用神经运动场来估计多视图设置中所有点的运动。由于颜色相似的点和颜色随时间变化的点的模糊性，使用多视图数据对动态场景的运动进行建模具有挑战性。我们建议将估计的运动规范化为可预测的。如果先前帧的运动是已知的，那么不久的将来的运动应该是可预测的。因此，我们通过首先调节潜在嵌入的估计运动，然后通过采用预测器网络来强制嵌入的可预测性来引入可预测性正则化。与最先进的基于神经运动场的动态场景表示方法相比，所提出的框架 PREF（Predictability REgularized Fields）实现了同等或更好的结果，同时不需要场景的先验知识。\n## Sep11 - Sep17, 2022\n## Sep4 - Sep10, 2022\n  - [神经特征融合领域：自监督 2D 图像表示的 3D 蒸馏, 3DV2022(oral)](https://arxiv.org/abs/2209.03494) | [***``[code]``***](https://github.com/dichotomies/N3F)\n    > 我们提出了神经特征融合场 (N3F)，这是一种在将密集 2D 图像特征提取器应用于可重构为 3D 场景的多张图像分析时改进密集 2D 图像特征提取器的方法。给定一个图像特征提取器，例如使用自我监督进行预训练，N3F 使用它作为教师来学习在 3D 空间中定义的学生网络。 3D 学生网络类似于提取所述特征的神经辐射场，并且可以使用通常的可微渲染机器进行训练。因此，N3F 很容易适用于大多数神经渲染公式，包括 vanilla NeRF 
及其对复杂动态场景的扩展。我们表明，我们的方法不仅能够在不使用手动标签的情况下在特定场景的神经领域的上下文中实现语义理解，而且在自我监督的 2D 基线上持续改进。这通过考虑不同序列中的各种任务（例如 2D 对象检索、3D 分割和场景编辑）来证明，包括 EPIC-KITCHENS 基准测试中的以自我为中心的长视频。\n  - [MotionDiffuse：使用扩散模型的文本驱动人体运动生成](https://arxiv.org/abs/2208.15001) | [***``[code]``***](https://github.com/mingyuan-zhang/MotionDiffuse)\n    > 人体运动建模对于许多现代图形应用程序很重要，这些应用程序通常需要专业技能。为了消除外行的技能障碍，最近的动作生成方法可以直接生成以自然语言为条件的人体动作。然而，通过各种文本输入实现多样化和细粒度的运动生成仍然具有挑战性。为了解决这个问题，我们提出了 MotionDiffuse，这是第一个基于扩散模型的文本驱动的运动生成框架，它展示了现有方法的几个所需属性。 1）概率映射。 MotionDiffuse 不是确定性的语言-运动映射，而是通过一系列注入变化的去噪步骤生成运动。 2）现实综合。 MotionDiffuse 擅长对复杂的数据分布进行建模并生成生动的运动序列。 3) 多级操作。 MotionDiffuse 响应身体部位的细粒度指令，以及带有时变文本提示的任意长度运动合成。我们的实验表明，MotionDiffuse 在文本驱动的运动生成和动作条件的运动生成方面具有令人信服的优势，从而优于现有的 SoTA 方法。定性分析进一步证明了 MotionDiffuse 对综合运动生成的可控性。主页：此 https 网址\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n  - [E-NeRF：来自移动事件相机的神经辐射场](https://arxiv.org/abs/2208.11300) | [code]\n    > 从理想图像估计神经辐射场 (NeRFs) 已在计算机视觉领域得到广泛研究。大多数方法假设最佳照明和缓慢的相机运动。这些假设在机器人应用中经常被违反，其中图像包含运动模糊并且场景可能没有合适的照明。这可能会导致下游任务（例如场景的导航、检查或可视化）出现重大问题。为了缓解这些问题，我们提出了 E-NeRF，这是第一种从快速移动的事件摄像机中以 NeRF 形式估计体积场景表示的方法。我们的方法可以在非常快速的运动和高动态范围条件下恢复 NeRF，在这种情况下，基于帧的方法会失败。我们展示了仅通过提供事件流作为输入来渲染高质量帧是可能的。此外，通过结合事件和帧，我们可以估计在严重运动模糊下比最先进的方法质量更高的 NeRF。我们还表明，在只有很少的输入视图可用的情况下，结合事件和帧可以克服 NeRF 估计的失败情况，而无需额外的正则化。\n## Aug14 - Aug20, 2022\n  - [从单目视频中对动画 3D 人体进行神经捕获, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > 我们提出了一种从单目视频输入构建可动画 3D 人体表示的新颖范例，这样它就可以以任何看不见的姿势和视图进行渲染。我们的方法基于动态神经辐射场 (NeRF)，该动态神经辐射场 (NeRF) 由作为几何代理的基于网格的参数化 3D 人体模型装配。以前的方法通常依赖多视图视频或准确的 3D 几何信息作为附加输入；此外，大多数方法在推广到看不见的姿势时质量会下降。我们认为，泛化的关键是用于查询动态 NeRF 的良好输入嵌入：良好的输入嵌入应该定义全体积空间中的单射映射，由姿态变化下的表面网格变形引导。基于这一观察，我们建议嵌入输入查询及其与网格顶点上一组测地最近邻所跨越的局部表面区域的关系。通过包含位置和相对距离信息，我们的嵌入定义了距离保留的变形映射，并很好地推广到看不见的姿势。为了减少对额外输入的依赖，我们首先使用现成的工具初始化每帧 3D 网格，然后提出一个管道来联合优化 NeRF 并细化初始网格。大量实验表明，我们的方法可以在看不见的姿势和视图下合成合理的人类渲染结果。\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n  - [NFOMP：具有非完整约束的差动驱动机器人最优运动规划器的神经场, IEEE Robotics and Automation 
Letters](https://ieeexplore.ieee.org/abstract/document/9851532/) | [code]\n    > 摘要：最优运动规划是移动机器人中最关键的问题之一。一方面，经典的基于采样的方法为这个问题提出了渐近最优的解决方案。然而，这些规划器无法在合理的计算时间内实现平滑和短的轨迹。另一方面，基于优化的方法能够在各种场景中生成平滑而平坦的轨迹，包括密集的人群。然而，现代基于优化的方法使用预先计算的有符号距离函数进行碰撞损失估计，它限制了这些方法在一般配置空间中的应用，包括具有非完整约束的差分驱动非圆形机器人。此外，基于优化的方法缺乏准确处理 U 形或薄障碍物的能力。我们建议从两个方面改进优化方法。首先，我们开发了一个障碍物神经场模型来估计碰撞损失；将此模型与轨迹优化一起训练可以持续改善碰撞损失，同时实现更可行和更平滑的轨迹。其次，我们通过将拉格朗日乘数添加到轨迹损失函数中来强制轨迹考虑非完整约束。我们应用我们的方法解决了具有非完整约束的差动驱动机器人的最优运动规划问题，对我们的解决方案进行了基准测试，并证明了新的规划器生成了非常适合机器人跟随的平滑、短而平坦的轨迹，并且优于最先进的方法在归一化曲率上提高了 25%，在 MovingAI 环境中的尖点数量上提高了 75%。\n  - [基于神经辐射场和运动图的可控自由视点视频重建, IEEE Transactions on Visualization and Computer Graphics](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > 在本文中，我们提出了一种基于运动图和神经辐射场（NeRF）的可控高质量自由视点视频生成方法。与现有的姿势驱动 NeRF 或时间/结构条件的 NeRF 工作不同，我们建议首先构建捕获序列的有向运动图。这种序列-运动-参数化策略不仅能够灵活地控制自由视点视频渲染的姿态，而且避免了相似姿态的冗余计算，从而提高了整体重建效率。此外，为了支持身体形状控制而不损失逼真的自由视点渲染性能，我们通过结合显式表面变形和隐式神经场景表示来改进 vanilla NeRF。具体来说，我们为运动图上的每个有效帧训练一个局部表面引导的 NeRF，并且体积渲染仅在真实表面周围的局部空间中执行，从而实现了合理的形状控制能力。据我们所知，我们的方法是第一个同时支持逼真的自由视点视频重建和基于运动图的用户引导运动遍历的方法。结果和比较进一步证明了所提出方法的有效性。\n  - [基于神经描述符字段的鲁棒变化检测, IROS2022](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > 推理环境变化的能力对于长时间运行的机器人至关重要。代理应在操作期间捕获更改，以便可以遵循操作以确保工作会话的顺利进行。然而，不同的视角和累积的定位误差使得机器人很容易由于低观察重叠和漂移的对象关联而错误地检测到周围世界的变化。在本文中，基于最近提出的类别级神经描述符字段 (NDF)，我们开发了一种对象级在线变化检测方法，该方法对部分重叠的观察和嘈杂的定位结果具有鲁棒性。利用 NDF 的形状补全能力和 SE(3) 等效性，我们表示具有紧凑形状代码的对象，该代码编码来自部分观察的完整对象形状。然后基于从 NDF 恢复的对象中心将对象组织在空间树结构中，以便快速查询对象邻域。通过形状代码相似性关联对象并比较局部对象-邻居空间布局，我们提出的方法证明了对低观测重叠和定位噪声的鲁棒性。我们对合成序列和真实世界序列进行了实验，与多种基线方法相比，实现了改进的变化检测结果。\n## Jul24 - Jul30, 2022\n  - [用笼子变形辐射场, ECCV2022](https://arxiv.org/abs/2207.12298) | [code]\n    > 辐射场的最新进展可以实现静态或动态 3D 场景的逼真渲染，但仍不支持用于场景操作或动画的显式变形。在本文中，我们提出了一种新的辐射场变形方法：自由形式的辐射场变形。我们使用一个三角形网格来包围称为笼子的前景对象作为界面，通过操纵笼子顶点，我们的方法可以实现辐射场的自由变形。我们方法的核心是网格变形中常用的基于笼的变形。我们提出了一种将其扩展到辐射场的新公式，该公式将采样点的位置和视图方向从变形空间映射到规范空间，从而实现变形场景的渲染。合成数据集和真实世界数据集的变形结果证明了我们方法的有效性。\n## Previous weeks\n  - [D-NeRF：动态场景的神经辐射场, 
CVPR2021](https://arxiv.org/abs/2011.13961) | [***``[code]``***](https://github.com/albertpumarola/D-NeRF)\n    > 将机器学习与几何推理相结合的神经渲染技术已成为从一组稀疏图像中合成场景新视图的最有前途的方法之一。其中，神经辐射场 (NeRF) 尤为突出，它训练深度网络将 5D 输入坐标（表示空间位置和观察方向）映射为体积密度和与视图相关的发射辐射。然而，尽管在生成的图像上实现了前所未有的真实感水平，但 NeRF 仅适用于静态场景，其中可以从不同的图像中查询相同的空间位置。在本文中，我们介绍了 D-NeRF，这是一种将神经辐射场扩展到动态域的方法，允许在场景中移动的 \\emph{single} 相机的刚性和非刚性运动下重建和渲染物体的新图像。为此，我们将时间视为系统的附加输入，并将学习过程分为两个主要阶段：一个将场景编码为规范空间，另一个将这个规范表示映射到特定时间的变形场景。两种映射都是使用全连接网络同时学习的。一旦网络经过训练，D-NeRF 就可以渲染新颖的图像，同时控制相机视图和时间变量，从而控制对象的移动。我们展示了我们的方法在物体处​​于刚性、关节和非刚性运动的场景中的有效性。代码、模型权重和动态场景数据集将发布。\n  - [用于单目 4D 面部头像重建的动态神经辐射场, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > 我们提出了用于模拟人脸外观和动态的动态神经辐射场。对说话的人进行数字建模和重建是各种应用程序的关键组成部分。特别是对于 AR 或 VR 中的远程呈现应用，需要忠实再现外观，包括新颖的视点或头部姿势。与显式建模几何和材料属性或纯粹基于图像的最先进方法相比，我们引入了基于场景表示网络的头部隐式表示。为了处理面部的动态，我们将场景表示网络与低维可变形模型相结合，该模型提供对姿势和表情的显式控制。我们使用体积渲染从这种混合表示中生成图像，并证明这种动态神经场景表示只能从单目输入数据中学习，而不需要专门的捕获设置。在我们的实验中，我们表明这种学习的体积表示允许生成照片般逼真的图像，其质量超过了基于视频的最先进的重演方法的质量。\n  - [非刚性神经辐射场：单目视频变形场景的重建和新视图合成，, ICCV2021](https://vcai.mpi-inf.mpg.de/projects/nonrigid_nerf/) | [***``[code]``***](https://github.com/facebookresearch/nonrigid_nerf)\n    > 我们提出了非刚性神经辐射场 (NR-NeRF)，这是一种用于一般非刚性动态场景的重建和新颖的视图合成方法。我们的方法将动态场景的 RGB 图像作为输入（例如，来自单目视频记录），并创建高质量的时空几何和外观表示。我们表明，单个手持消费级相机足以从新颖的虚拟相机视图合成动态场景的复杂渲染，例如一个“子弹时间”的视频效果。 NR-NeRF 将动态场景分解为规范体积及其变形。场景变形被实现为光线弯曲，其中直线光线被非刚性变形。我们还提出了一种新的刚性网络来更好地约束场景的刚性区域，从而获得更稳定的结果。射线弯曲和刚性网络在没有明确监督的情况下进行训练。我们的公式可以实现跨视图和时间的密集对应估计，以及引人注目的视频编辑应用程序，例如运动夸张。我们的代码将是开源的。\n  - [神经体：具有结构化潜在代码的隐式神经表示，用于动态人类的新视图合成, CVPR2021](https://zju3dv.github.io/neuralbody/) | [***``[code]``***](https://github.com/zju3dv/neuralbody)\n    > 本文解决了人类表演者从一组非常稀疏的摄像机视图中合成新颖视图的挑战。最近的一些工作表明，在给定密集输入视图的情况下，学习 3D 场景的隐式神经表示可以实现显着的视图合成质量。但是，如果视图高度稀疏，则表示学习将是不适定的。为了解决这个不适定问题，我们的关键思想是整合对视频帧的观察。为此，我们提出了神经体，这是一种新的人体表示，它假设在不同帧上学习到的神经表示共享同一组锚定到可变形网格的潜在代码，以便可以自然地整合跨帧的观察结果。可变形网格还为网络提供几何指导，以更有效地学习 3D 表示。为了评估我们的方法，我们创建了一个名为 ZJU-MoCap 
的多视图数据集，用于捕捉具有复杂动作的表演者。 ZJU-MoCap 的实验表明，我们的方法在新颖的视图合成质量方面大大优于先前的工作。我们还展示了我们的方法从 People-Snapshot 数据集上的单目视频重建移动人物的能力。\n  - [动态单目视频的动态视图合成, ICCV2021](https://free-view-video.github.io/) | [***``[code]``***](https://github.com/gaochen315/DynamicNeRF)\n    > 我们提出了一种算法，用于在给定动态场景的单目视频的任意视点和任何输入时间步长处生成新视图。我们的工作建立在神经隐式表示的最新进展的基础上，并使用连续和可微的函数来建模时变结构和场景的外观。我们联合训练一个时不变的静态 NeRF 和一个时变的动态 NeRF，并学习如何以无监督的方式混合结果。然而，从单个视频中学习这个隐式函数是非常不适定的（与输入视频匹配的解决方案有无限多）。为了解决歧义，我们引入了正则化损失以鼓励更合理的解决方案。我们展示了从随意捕获的视频中进行动态视图合成的广泛定量和定性结果。\n  - [TöRF：动态场景视图合成的飞行时间辐射场, NeurIPS2021](https://imaging.cs.cmu.edu/torf/) | [***``[code]``***](https://github.com/breuckelen/torf)\n    > 神经网络可以表示和准确重建静态 3D 场景（例如 NeRF）的辐射场。一些作品将这些扩展到用单目视频捕获的动态场景，并具有可观的性能。然而，众所周知，单眼设置是一个约束不足的问题，因此方法依赖于数据驱动的先验来重建动态内容。我们用飞行时间 (ToF) 相机的测量值替换这些先验，并引入基于连续波 ToF 相机图像形成模型的神经表示。我们不使用处理过的深度图，而是对原始 ToF 传感器测量进行建模，以提高重建质量并避免低反射率区域、多路径干扰和传感器有限的明确深度范围等问题。我们展示了这种方法提高了动态场景重建对错误校准和大运动的鲁棒性，并讨论了集成现代智能手机上现在可用的 RGB+ToF 传感器的好处和局限性。\n  - [以对象为中心的神经场景渲染](https://shellguo.com/osf/) | [***``[code]``***](https://shellguo.com/osf/)\n    > 我们提出了一种从捕获的对象图像中合成逼真场景的方法。我们的工作建立在神经辐射场 (NeRFs) 之上，它隐含地模拟了场景的体积密度和定向发射的辐射。虽然 NeRF 可以合成逼真的图片，但它们只对静态场景进行建模，并且与特定的成像条件密切相关。这个属性使得 NeRFs 难以泛化到新场景，包括新的光照或对象的新排列。我们建议学习以对象为中心的神经散射函数 (OSF)，而不是像 NeRF 那样学习场景辐射场，这是一种使用与光照和视图相关的神经网络隐式模拟每个对象的光传输的表示。即使物体或灯光移动，这也可以渲染场景，而无需重新训练。结合体积路径跟踪程序，我们的框架能够渲染对象内和对象间的光传输效果，包括遮挡、镜面反射、阴影和间接照明。我们评估了我们的场景合成方法，并表明它可以推广到新的照明条件，产生逼真的、物理上精确的多对象场景渲染。\n  - [学习动态人头的组成辐射场, CVPR2021(oral)](https://ziyanw1.github.io/hybrid_nerf/) | [code]\n    > 动态人体的逼真渲染是远程呈现系统、虚拟购物、合成数据生成等的重要能力。最近，结合计算机图形学和机器学习技术的神经渲染方法已经创建了人类和物体的高保真模型。其中一些方法不会为可驱动的人体模型（神经体积）产生足够高保真度的结果，而其他方法则具有极长的渲染时间（NeRF）。我们提出了一种新颖的组合 3D 表示，它结合了以前最好的方法来产生更高分辨率和更快的结果。我们的表示通过将粗略的 3D 结构感知动画代码网格与连续学习的场景函数相结合，弥合了离散和连续体积表示之间的差距，该函数将每个位置及其相应的局部动画代码映射到其与视图相关的发射辐射和局部体积密度。可微分体渲染用于计算人头和上身的照片般逼真的新颖视图，并仅使用 2D 监督来端到端训练我们的新颖表示。此外，我们表明，学习到的动态辐射场可用于基于全局动画代码合成新的看不见的表情。我们的方法在合成动态人头和上半身的新视图方面取得了最先进的结果。\n  - [动态场景的神经场景图, CVPR2021(oral)](https://arxiv.org/abs/2011.10379) | 
[***``[code]``***](https://github.com/princeton-computational-imaging/neural-scene-graphs)\n    > 最近的隐式神经渲染方法表明，可以通过仅由一组 RGB 图像监督的预测其体积密度和颜色来学习复杂场景的准确视图合成。然而，现有方法仅限于学习将所有场景对象编码为单个神经网络的静态场景的有效表示，并且缺乏将动态场景表示和分解为单个场景对象的能力。在这项工作中，我们提出了第一个将动态场景分解为场景图的神经渲染方法。我们提出了一种学习的场景图表示，它对对象变换和辐射进行编码，以有效地渲染场景的新颖排列和视图。为此，我们学习隐式编码的场景，并结合联合学习的潜在表示来描述具有单个隐式函数的对象。我们在合成和真实汽车数据上评估所提出的方法，验证我们的方法学习动态场景 - 仅通过观察该场景的视频 - 并允许渲染具有看不见的对象集的新颖场景组合的新颖照片般逼真的视图看不见的姿势。\n  - [用于视觉运动控制的 3D 神经场景表示, CoRL2021(oral)](https://3d-representation-learning.github.io/nerf-dy/) | [code]\n    > 人类对我们周围的 3D 环境有着强烈的直觉理解。我们大脑中的物理心智模型适用于不同材料的物体，使我们能够执行远远超出当前机器人范围的广泛操作任务。在这项工作中，我们希望纯粹从 2D 视觉观察中学习动态 3D 场景的模型。我们的模型结合了神经弧度\n  - [神经辐射世界中的仅视觉机器人导航](https://arxiv.org/abs/2110.00168) | [code]\n    > 神经辐射场 (NeRFs) 最近已成为表示自然、复杂 3D 场景的强大范例。 NeRF 表示神经网络中的连续体积密度和 RGB 值，并通过光线追踪从看不见的相机视点生成照片般逼真的图像。我们提出了一种算法，用于在表示为 NeRF 的 3D 环境中导航机器人，仅使用板载 RGB 相机进行定位。我们假设场景的 NeRF 已经离线预训练，机器人的目标是在 NeRF 中的未占用空间中导航以达到目标姿势。我们引入了一种轨迹优化算法，该算法基于离散时间版本的差分平坦度避免与 NeRF 中的高密度区域发生碰撞，该版本可以约束机器人的完整姿势和控制输入。我们还引入了一种基于优化的过滤方法来估计 NeRF 中机器人的 6DoF 姿势和速度，仅给定一个板载 RGB 相机。我们将轨迹规划器与位姿过滤器结合在一个在线重新规划循环中，以提供基于视觉的机器人导航管道。我们展示了一个四旋翼机器人仅使用 RGB 相机在丛林健身房环境、教堂内部和巨石阵中导航的模拟结果。我们还演示了一个在教堂中导航的全向地面机器人，要求它重新定向以适应狭窄的缝隙。可以在此 https 网址上找到这项工作的视频。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/editing.md",
    "content": "\n每周分类神经辐射场 - editing ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n====================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n  - [从神经辐射场中移除对象](https://arxiv.org/abs/2212.11966) | [code]\n    > 神经辐射场 (NeRFs) 正在成为一种无处不在的场景表示，可实现新颖的视图合成。 NeRF 将越来越多地与其他人共享。 不过，在共享 NeRF 之前，可能需要删除个人信息或难看的物体。 使用当前的 NeRF 编辑框架不容易实现这种删除。 我们提出了一个框架，用于从 RGB-D 序列创建的 NeRF 表示中删除对象。 我们的 NeRF 修复方法利用了最近在 2D 图像修复方面的工作，并以用户提供的掩码为指导。 我们的算法以基于置信度的视图选择程序为基础。 它选择在创建 NeRF 时使用哪些单独的 2D 修复图像，以便生成的修复 NeRF 是 3D 一致的。 我们表明我们的 NeRF 编辑方法对于以多视图连贯方式合成合理的修复是有效的。 我们使用一个新的且仍然具有挑战性的数据集来验证我们的方法来完成 NeRF 修复任务。\n## Dec11 - Dec17, 2022\n  - [NeRF-Art：文本驱动的神经辐射场程式化](https://arxiv.org/abs/2212.08070) | [***``[code]``***](https://cassiepython.github.io/nerfart/)\n    > 作为 3D 场景的强大表示，神经辐射场 (NeRF) 可以从多视图图像中合成高质量的新视图。 然而，对 NeRF 进行样式化仍然具有挑战性，尤其是在模拟外观和几何形状同时发生变化的文本引导样式时。 在本文中，我们介绍了 NeRF-Art，这是一种文本引导的 NeRF 风格化方法，它通过简单的文本提示来操纵预训练的 NeRF 模型的风格。 与以前缺乏足够的几何变形和纹理细节或需要网格来指导风格化的方法不同，我们的方法可以将 3D 场景转换为以所需几何形状和外观变化为特征的目标样式，而无需任何网格引导。 这是通过引入一种新颖的全局-局部对比学习策略，结合方向约束来同时控制目标风格的轨迹和强度来实现的。 此外，我们采用权重正则化方法来有效抑制在几何样式化过程中转换密度场时容易出现的混浊伪影和几何噪声。 通过对各种风格的广泛实验，我们证明了我们的方法在单视图风格化质量和跨视图一致性方面是有效且稳健的。 代码和更多结果可以在我们的项目页面中找到：这个 https URL。\n## Dec4 - Dec10, 2022\n  - [Ref-NPR：基于参考的非真实感辐射场](https://arxiv.org/abs/2212.02766) | [code]\n    > 现有的 3D 场景风格化方法采用任意风格参考来将纹理和颜色作为风格进行传输，而无需建立有意义的语义对应关系。 我们提出了基于参考的非真实感辐射场，即 Ref-NPR。 它是一种可控的场景风格化方法，利用辐射场对 3D 场景进行风格化，并以单个风格化的 2D 视图作为参考。 
为了获得不错的结果，我们提出了一种基于程式化参考视图的光线配准过程，以在新颖的视图中获得伪光线监督，并利用内容图像中的语义对应来填充具有感知相似风格的遮挡区域。 结合这些操作，Ref-NPR 使用单个参考生成非真实感和连续的新颖视图序列，同时在遮挡区域获得合理的程式化。 实验表明，Ref-NPR 在视觉质量和语义对应方面明显优于其他场景和视频风格化方法。 代码和数据将公开。\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n  - [基于学习的复杂室内场景逆渲染与可微蒙特卡洛光线追踪, SIGGRAPH-Asia2022](https://jingsenzhu.github.io/invrend/) | [code]\n    > 我们提出了一种基于学习的方法，用于使用可区分的蒙特卡洛光线追踪对复杂的室内场景进行逆向渲染。我们的方法将单个室内场景 RGB 图像作为输入，并自动推断其底层表面反射率、几何形状和空间变化的照明。这使我们能够对场景进行逼真的编辑，例如插入多个复杂的虚拟对象并使用全局照明忠实地编辑表面材质。\n## Oct30 - Nov5, 2022\n  - [gCoRF：生成合成辐射场, 3DV2022](https://vcai.mpi-inf.mpg.de/projects/gCoRF/) | [code]\n    > 对象的 3D 生成模型可通过 3D 控制实现逼真的图像合成。现有方法将场景建模为全局场景表示，忽略了场景的组成方面。除了支持可概括的 3D 推理之外，组合推理还可以支持各种编辑应用程序。在本文中，我们提出了一个组合生成模型，其中对象的每个语义部分都表示为仅从野外 2D 数据中学习的独立 3D 表示。我们从全局生成模型 (GAN) 开始，学习使用 2D 分割掩码的监督将其分解为不同的语义部分。然后，我们学习合成独立采样的部分，以创建连贯的全局场景。不同的部分可以独立采样，同时保持物体的其余部分固定。我们在各种对象和部件上评估我们的方法，并演示编辑应用程序。\n## Oct23 - Oct29, 2022\n  - [通过辐射贴图提升点云渲染](https://arxiv.org/abs/2210.15107) | [code]\n    > 近年来，由于其高质量，我们见证了基于 NeRF 的图像渲染的快速发展。然而，点云渲染在某种程度上较少被探索。与遭受密集空间采样的基于 NeRF 的渲染相比，点云渲染自然计算密集度较低，这使其能够部署在移动计算设备中。在这项工作中，我们专注于通过紧凑的模型设计提高点云渲染的图像质量。我们首先分析体绘制公式在点云上的适应性。基于分析，我们将 NeRF 表示简化为空间映射函数，每个像素只需要一次评估。此外，受光线行进的启发，我们将嘈杂的原始点云校正为光线与表面之间的估计交点作为查询坐标，这可以避免空间频率崩溃和邻点干扰。由光栅化、空间映射和细化阶段组成，我们的方法在点云渲染上实现了最先进的性能，以显着的优势优于之前的工作，模型尺寸更小。我们在 NeRF-Synthetic 上获得了 31.74 的 PSNR，在 ScanNet 上获得了 25.88，在 DTU 上获得了 30.81。代码和数据将很快发布。\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n  - [LB-NERF：用于透明介质的光弯曲神经辐射场, ICIP2022](https://ieeexplore.ieee.org/abstract/document/9897642) | [code]\n    > 神经辐射场 (NeRFs) 已被提出作为新颖的视图合成方法，并且由于其多功能性已被用于解决各种问题。 NeRF 可以使用假设直线光路的神经渲染来表示 3D 空间中的颜色和密度。但是，场景中具有不同折射率的介质，例如透明介质，会引起光的折射，打破了光路直线的假设。因此，不能在多视图图像中一致地学习 NeRF。为了解决这个问题，本研究提出了一种方法，通过引入光折射效应作为与源自相机中心的直线的偏移量来学习跨多个视点的一致辐射场。实验结果定量和定性地验证了在考虑透明物体的折射时，我们的方法可以比传统的 NeRF 方法更好地插入视点。\n  - [通过隐式神经表示的测试时间训练实现可控风格迁移](https://arxiv.org/abs/2210.07762) | [code]\n    > 
我们提出了一个基于隐式神经表示的可控风格迁移框架，该框架通过测试时训练以像素方式控制风格化输出。与传统的图像优化方法经常遇到不稳定的收敛和需要密集训练且泛化能力有限的基于学习的方法不同，我们提出了一个模型优化框架，该框架在测试时通过显式损失函数来优化神经网络以进行风格迁移。在经过一次测试时间训练后，由于基于 INR 的模型的灵活性，我们的框架可以以像素方式精确控制风格化图像，并自由调整图像分辨率，无需进一步优化或训练。我们演示了几个应用程序。\n  - [神经形状变形先验, NeurIPS2022](https://arxiv.org/abs/2210.05616) | [code]\n    > 我们提出了神经形状变形先验，这是一种新的形状操作方法，可以根据用户提供的手柄运动来预测非刚性物体的网格变形。最先进的方法将此问题视为优化任务，其中输入源网格被迭代变形以根据手工制作的正则化器（如 ARAP）最小化目标函数。在这项工作中，我们基于形状的基本几何特性来学习变形行为，同时利用包含各种非刚性变形的大规模数据集。具体来说，给定源网格和描述部分表面变形的手柄的所需目标位置，我们预测在 3D 空间中定义的连续变形场以描述空间变形。为此，我们引入了基于变压器的变形网络，将形状变形表示为局部表面变形的组合。它学习一组锚定在 3D 空间中的局部潜在代码，从中我们可以学习一组局部表面的连续变形函数。我们的方法可以应用于具有挑战性的变形，并且可以很好地推广到看不见的变形。我们使用 DeformingThing4D 数据集在实验中验证了我们的方法，并与经典的基于优化的方法和最近的基于神经网络的方法进行了比较。\n## Oct2 - Oct8, 2022\n  - [使用辐射场传播的无监督多视图对象分割, NeurIPS2022](https://arxiv.org/abs/2210.00489) | [code]\n    > 我们提出了辐射场传播 (RFP)，这是一种在重建过程中分割 3D 对象的新方法，仅给出场景的未标记多视图图像。 RFP 源自新兴的基于神经辐射场的技术，该技术将语义与外观和几何形状联合编码。我们方法的核心是一种新颖的传播策略，用于具有双向光度损失的单个对象的辐射场，能够将场景无监督地划分为对应于不同对象实例的显着或有意义的区域。为了更好地处理具有多个对象和遮挡的复杂场景，我们进一步提出了一种迭代期望最大化算法来细化对象掩码。据我们所知，RFP 是第一个在没有任何监督、注释或其他线索（如 3D 边界框和对象类别的先验知识）的情况下处理神经辐射场 (NeRF) 的 3D 场景对象分割的无监督方法。实验表明，RFP 实现了可行的分割结果，比以前的无监督图像/场景分割方法更准确，并且可与现有的基于 NeRF 监督的方法相媲美。分段对象表示支持单独的 3D 对象编辑操作。\n## Sep25 - Oct1, 2022\n## Sep18 - Sep24, 2022\n## Sep11 - Sep17, 2022\n  - [3DMM-RF：用于 3D 人脸建模的卷积辐射场](https://arxiv.org/abs/2209.07366) | [code]\n    > 面部 3D 可变形模型是具有无数应用的主要计算机视觉主题，并且在过去二十年中得到了高度优化。深度生成网络的巨大改进为改进此类模型创造了各种可能性，并引起了广泛的兴趣。此外，神经辐射领域的最新进展正在彻底改变已知场景的新视图合成。在这项工作中，我们提出了一个面部 3D 可变形模型，它利用了上述两者，并且可以准确地建模对象的身份、姿势和表情，并在任意光照下渲染它。这是通过利用强大的基于深度样式的生成器来克服神经辐射场的两个主要弱点，即它们的刚性和渲染速度来实现的。我们引入了一种基于样式的生成网络，它一次性合成所有且仅合成神经辐射场所需的渲染样本。我们创建了一个巨大的面部渲染标记合成数据集，并在这些数据上训练网络，以便它可以准确地建模和概括面部身份、姿势和外观。最后，我们证明该模型可以准确地拟合任意姿势和光照的“in-the-wild”人脸图像，提取人脸特征，并用于在可控条件下重新渲染人脸。\n## Sep4 - Sep10, 2022\n  - [SIRA：来自单个图像的可重新点亮的头像](https://arxiv.org/abs/2209.03027) | [code]\n    > 从单个图像中恢复人头的几何形状，同时分解材料和照明是一个严重不适定的问题，需要解决先验信息。基于 3D 可变形模型 (3DMM) 的方法，以及它们与可微渲染器的组合，已显示出可喜的结果。然而，3DMM 的表现力是有限的，它们通常会产生过度平滑且与身份无关的 3D 
形状，仅限于面部区域。最近已经通过使用多层感知器参数化几何形状的神经场获得了高度准确的全头重建。这些表示的多功能性也被证明对于解开几何、材料和照明是有效的。然而，这些方法需要几十个输入图像。在本文中，我们介绍了 SIRA，这是一种从单个图像重建具有高保真几何形状和分解光和表面材料的人头头像的方法。我们的关键成分是两个基于神经场的数据驱动统计模型，可解决单视图 3D 表面重建和外观分解的模糊性。实验表明，SIRA 在 3D 头部重建中获得了最先进的结果，同时它成功地解开了全局照明、漫反射和镜面反射率。此外，我们的重建适用于基于物理的外观编辑和头部模型重新照明。\n## Aug28 - Sep3, 2022\n  - [NerfCap：使用动态神经辐射场捕获人类表现, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9870173) | [code]\n    > 本文解决了从稀疏的多视图或单目视频中捕捉人类表演的挑战。给定表演者的模板网格，以前的方法通过将模板网格非刚性地注册到具有 2D 轮廓或密集光度对齐的图像来捕获人体运动。然而，详细的表面变形无法从轮廓中恢复，而光度对齐则受到视频外观变化引起的不稳定性的影响。为了解决这些问题，我们提出了 NerfCap，这是一种基于表演者动态神经辐射场 (NeRF) 表示的新型表演捕捉方法。具体来说，通过优化变形场和规范 NeRF 的外观模型，从模板几何初始化规范 NeRF 并注册到视频帧。为了捕捉大型身体运动和详细的表面变形，NerfCap 将线性混合蒙皮与嵌入式图形变形相结合。与受限于固定拓扑和纹理的基于网格的方法相比，NerfCap 能够灵活地捕捉视频中复杂的几何形状和外观变化，并合成更逼真的图像。此外，NerfCap 可以通过将合成视频与输入视频进行匹配，以自我监督的方式进行端到端的预训练。各种数据集的实验结果表明，NerfCap 在表面重建精度和新视图合成质量方面都优于先前的工作。\n## Aug21 - Aug27, 2022\n  - [训练和调整生成神经辐射场以进行属性条件 3D 感知人脸生成](https://arxiv.org/abs/2208.12550) | [***``[code]``***](https://github.com/zhangqianhui/TT-GNeRF)\n    > 基于生成神经辐射场 (GNeRF) 的 3D 感知 GAN 已经实现了令人印象深刻的高质量图像生成，同时保持了强大的 3D 一致性。最显着的成就是在人脸生成领域。然而，这些模型中的大多数都专注于提高视图一致性而忽略了解耦方面，因此这些模型无法提供对生成的高质量语义/属性控制。为此，我们引入了一个使用特定属性标签作为输入的条件 GNeRF 模型，以提高 3D 感知生成模型的可控性和解开能力。我们利用预训练的 3D 感知模型作为基础，并集成了一个双分支属性编辑模块 (DAEM)，该模块利用属性标签来提供对生成的控制。此外，我们提出了一种 TRIOT (TRAining as Init, and Optimizing for Tuning) 方法来优化潜在向量，以进一步提高属性编辑的精度。在广泛使用的 FFHQ 上进行的大量实验表明，我们的模型在保留非目标区域的同时，可以产生具有更好视图一致性的高质量编辑。该代码可在此 https 网址上找到。\n  - [DreamBooth：为主题驱动生成微调文本到图像的扩散模型](https://dreambooth.github.io/) | [code]\n    > 大型文本到图像模型在人工智能的演进中实现了显着的飞跃，能够从给定的文本提示中对图像进行高质量和多样化的合成。然而，这些模型缺乏模仿给定参考集中对象的外观并在不同上下文中合成它们的新颖再现的能力。在这项工作中，我们提出了一种“个性化”文本到图像扩散模型的新方法（专门针对用户的需求）。给定主题的几张图像作为输入，我们微调预训练的文本到图像模型（Imagen，尽管我们的方法不限于特定模型），以便它学会将唯一标识符与该特定主题绑定.一旦对象被嵌入模型的输出域中，唯一标识符就可以用于合成在不同场景中情境化的对象的完全新颖的真实感图像。通过利用嵌入在模型中的语义先验和新的自生类特定先验保存损失，我们的技术能够在参考图像中没有出现的不同场景、姿势、视图和照明条件下合成主体。我们将我们的技术应用于几个以前无懈可击的任务，包括主题重新上下文化、文本引导视图合成、外观修改和艺术渲染（同时保留主题的关键特征）。项目页面：此 https 网址\n  - [FurryGAN：高质量的前景感知图像合成, 
ECCV2022](https://jeongminb.github.io/FurryGAN/) | [***``[code]``***](https://jeongminb.github.io/FurryGAN/)\n    > 前景感知图像合成旨在生成图像及其前景蒙版。一种常见的方法是将图像公式化为前景图像和背景图像的蒙版混合。这是一个具有挑战性的问题，因为它很容易达到一个简单的解决方案，即任一图像压倒另一个图像，即蒙版完全满或空，前景和背景没有有意义地分离。我们展示了 FurryGAN 的三个关键组件：1）将前景图像和合成图像都强加为逼真，2）将掩码设计为粗略和精细掩码的组合，以及 3）通过辅助掩码预测器引导生成器鉴别器。我们的方法使用非常详细的 alpha 蒙版生成逼真的图像，这些蒙版以完全无人监督的方式覆盖头发、毛皮和胡须。\n## Aug14 - Aug20, 2022\n  - [Vox-Surf：基于体素的隐式表面表示](https://arxiv.org/abs/2208.10925) | [code]\n    > 虚拟内容创建和交互在 AR 和 VR 等现代 3D 应用中发挥着重要作用。从真实场景中恢复详细的 3D 模型可以显着扩展其应用范围，并且已经在计算机视觉和计算机图形学界进行了数十年的研究。我们提出了 Vox-Surf，一种基于体素的隐式表面表示。我们的 Vox-Surf 将空间划分为有限的有界体素。每个体素在其角顶点中存储几何和外观信息。由于从体素表示继承而来的稀疏性，Vox-Surf 几乎适用于任何场景，并且可以从多个视图图像中轻松训练。我们利用渐进式训练过程逐步提取重要体素进行进一步优化，从而只保留有效体素，这大大减少了采样点的数量并提高了渲染速度。精细体素也可以视为碰撞检测的边界体积。实验表明，与其他方法相比，Vox-Surf 表示可以以更少的内存和更快的渲染速度学习精细的表面细节和准确的颜色。我们还表明，Vox-Surf 在场景编辑和 AR 应用中可以更实用。\n  - [DM-NeRF：2D 图像的 3D 场景几何分解和操作](https://arxiv.org/abs/2208.07227) | [***``[code]``***](https://github.com/vLAR-group/DM-NeRF)\n    > 在本文中，我们从 2D 视图研究 3D 场景几何分解和操纵问题。通过利用最近的隐式神经表示技术，特别是吸引人的神经辐射场，我们引入了一个对象场组件，仅从 2D 监督中学习 3D 空间中所有单个对象的唯一代码。该组件的关键是一系列精心设计的损失函数，以使每个 3D 点，尤其是在非占用空间中，即使没有 3D 标签也能得到有效优化。此外，我们引入了一种逆查询算法，可以在学习的场景表示中自由操作任何指定的 3D 对象形状。值得注意的是，我们的操作算法可以明确地解决关键问题，例如对象碰撞和视觉遮挡。我们的方法称为 DM-NeRF，是最早在单个管道中同时重建、分解、操作和渲染复杂 3D 场景的方法之一。在三个数据集上的大量实验清楚地表明，我们的方法可以准确地从 2D 视图中分解所有 3D 对象，允许在 3D 空间中自由操作任何感兴趣的对象，例如平移、旋转、大小调整和变形。\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n  - [VolTeMorph：体积表示的实时、可控和可泛化动画](https://arxiv.org/pdf/2208.00949) | [code]\n    > 最近，用于场景重建和新颖视图合成的体积表示越来越受欢迎，这使人们重新关注在高可见度下对体积内容进行动画处理质量和实时性。虽然基于学习函数的隐式变形方法可以产生令人印象深刻的结果，但它们对于艺术家和内容创作者来说是“黑匣子”，它们需要大量的训练数据才能进行有意义的概括，而且它们不会在训练数据之外产生现实的外推。在这项工作中，我们通过引入一种实时、易于使用现成软件进行编辑并且可以令人信服地推断的体积变形方法来解决这些问题。为了展示我们方法的多功能性，我们将其应用于两个场景：基于物理的对象变形和远程呈现，其中化身使用混合形状进行控制。我们还进行了彻底的实验，表明我们的方法优于结合隐式变形的体积方法和基于网格变形的方法。\n  - [基于神经辐射场和运动图的可控自由视点视频重建, IEEE Transactions on Visualization and Computer Graphics](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > 
在本文中，我们提出了一种基于运动图和神经辐射场（NeRF）的可控高质量自由视点视频生成方法。与现有的姿势驱动 NeRF 或时间/结构条件的 NeRF 工作不同，我们建议首先构建捕获序列的有向运动图。这种序列-运动-参数化策略不仅能够灵活地控制自由视点视频渲染的姿态，而且避免了相似姿态的冗余计算，从而提高了整体重建效率。此外，为了支持身体形状控制而不损失逼真的自由视点渲染性能，我们通过结合显式表面变形和隐式神经场景表示来改进 vanilla NeRF。具体来说，我们为运动图上的每个有效帧训练一个局部表面引导的 NeRF，并且体积渲染仅在真实表面周围的局部空间中执行，从而实现了合理的形状控制能力。据我们所知，我们的方法是第一个同时支持逼真的自由视点视频重建和基于运动图的用户引导运动遍历的方法。结果和比较进一步证明了所提出方法的有效性。\n  - [基于神经描述符字段的鲁棒变化检测, IROS2022](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > 推理环境变化的能力对于长时间运行的机器人至关重要。代理应在操作期间捕获更改，以便可以遵循操作以确保工作会话的顺利进行。然而，不同的视角和累积的定位误差使得机器人很容易由于低观察重叠和漂移的对象关联而错误地检测到周围世界的变化。在本文中，基于最近提出的类别级神经描述符字段 (NDF)，我们开发了一种对象级在线变化检测方法，该方法对部分重叠的观察和嘈杂的定位结果具有鲁棒性。利用 NDF 的形状补全能力和 SE(3) 等效性，我们表示具有紧凑形状代码的对象，该代码编码来自部分观察的完整对象形状。然后基于从 NDF 恢复的对象中心将对象组织在空间树结构中，以便快速查询对象邻域。通过形状代码相似性关联对象并比较局部对象-邻居空间布局，我们提出的方法证明了对低观测重叠和定位噪声的鲁棒性。我们对合成序列和真实世界序列进行了实验，与多种基线方法相比，实现了改进的变化检测结果。\n## Jul24 - Jul30, 2022\n  - [MobileNeRF：利用多边形光栅化管道在移动架构上进行高效的神经场渲染](https://arxiv.org/abs/2208.00277) | [***``[code]``***](https://github.com/google-research/jax3d/tree/main/jax3d/projects/mobilenerf)\n    > 神经辐射场 (NeRFs) 展示了从新颖视图合成 3D 场景图像的惊人能力。但是，它们依赖于基于光线行进的专用体积渲染算法，这些算法与广泛部署的 g 的功能不匹配图形硬件。本文介绍了一种基于纹理多边形的新 NeRF 表示，它可以使用标准渲染管道有效地合成新图像。 NeRF 表示为一组多边形，其纹理表示二进制不透明度和特征向量。使用 z 缓冲区对多边形进行传统渲染会生成每个像素都有特征的图像，这些图像由在片段着色器中运行的小型、依赖于视图的 MLP 进行解释，以产生最终的像素颜色。这种方法使 NeRF 能够使用传统的多边形光栅化管道进行渲染，该管道提供大规模的像素级并行性，在包括手机在内的各种计算平台上实现交互式帧速率。\n  - [用笼子变形辐射场, ECCV2022](https://arxiv.org/abs/2207.12298) | [code]\n    > 辐射场的最新进展可以实现静态或动态 3D 场景的逼真渲染，但仍不支持用于场景操作或动画的显式变形。在本文中，我们提出了一种新的辐射场变形方法：自由形式的辐射场变形。我们使用一个三角形网格来包围称为笼子的前景对象作为界面，通过操纵笼子顶点，我们的方法可以实现辐射场的自由变形。我们方法的核心是网格变形中常用的基于笼的变形。我们提出了一种将其扩展到辐射场的新公式，该公式将采样点的位置和视图方向从变形空间映射到规范空间，从而实现变形场景的渲染。合成数据集和真实世界数据集的变形结果证明了我们方法的有效性。\n  - [NeuMesh：学习基于解缠结神经网格的隐式场，用于几何和纹理编辑, ECCV2022(oral)](https://arxiv.org/abs/2207.11911) | [code]\n    > 最近，神经隐式渲染技术得到了迅速发展，并在新颖的视图合成和 3D 
场景重建中显示出巨大的优势。然而，现有的用于编辑目的的神经渲染方法提供的功能有限，例如，刚性变换，或者不适用于日常生活中一般对象的细粒度编辑。在本文中，我们提出了一种新颖的基于网格的表示，通过在网格顶点上使用解开几何和纹理代码对神经隐场进行编码，这促进了一组编辑功能，包括网格引导的几何编辑、带有纹理交换的指定纹理编辑、填充和绘画操作。为此，我们开发了几种技术包括可学习的符号指标以放大基于网格的表示的空间可区分性，蒸馏和微调机制以实现稳定收敛，以及空间感知优化策略以实现精确的纹理编辑。对真实数据和合成数据的大量实验和编辑示例证明了我们的方法在表示质量和编辑能力方面的优越性。代码可在项目网页上找到：此 https URL。\n## Previous weeks\n  - [神经稀疏体素场, NeurIPS2020](https://lingjie0206.github.io/papers/NSVF/) | [***``[code]``***](https://github.com/facebookresearch/NSVF)\n    > 我们介绍了神经稀疏体素场 (NSVF)，这是一种用于快速和高质量自由视点渲染的新神经场景表示。 NSVF 定义了一组以稀疏体素八叉树组织的体素有界隐式字段，以对每个单元中的局部属性进行建模。 我们仅从一组姿势的 RGB 图像中通过可区分的光线行进操作逐步学习底层体素结构。 使用稀疏体素八叉树结构，可以通过跳过不包含相关场景内容的体素来加速渲染新颖的视图。 我们的方法在推理时比最先进的方法（即 NeRF (Mildenhall et al., 2020)）快 10 倍以上，同时获得更高质量的结果。 此外，通过利用显式稀疏体素表示，我们的方法可以很容易地应用于场景编辑和场景合成。 我们还展示了几个具有挑战性的任务，包括多场景学习、移动人体的自由视点渲染和大规模场景渲染。\n  - [CAMPARI：相机感知分解生成神经辐射场](https://arxiv.org/pdf/2103.17269.pdf) | [code]\n    > 深度生成模型的巨大进步导致了逼真的图像合成。在取得令人信服的结果的同时，大多数方法都在二维图像域中运行，而忽略了我们世界的三维性质。因此，最近的几项工作提出了具有 3D 感知能力的生成模型，即场景以 3D 建模，然后可微分地渲染到图像平面。这导致了令人印象深刻的 3D 一致性，但纳入这种偏差是有代价的：相机也需要建模。当前的方法假定固定的内在函数和预先定义的相机姿势范围。因此，实际数据通常需要参数调整，如果数据分布不匹配，结果会下降。我们的关键假设是，与图像生成器一起学习相机生成器会导致更原则性的 3D 感知图像合成方法。此外，我们建议将场景分解为背景和前景模型，从而实现更有效和更清晰的场景表示。在从原始的、未定型的图像集合中进行训练时，我们学习了一个 3D 和相机感知的生成模型，它不仅忠实地恢复了图像，而且还忠实地恢复了相机数据分布。在测试时，我们的模型生成的图像可以显式控制相机以及场景的形状和外观。\n  - [NeRFactor：未知光照下形状和反射率的神经分解, TOG 2021 (Proc. 
SIGGRAPH Asia)](https://xiuming.info/projects/nerfactor/) | [code]\n    > 我们解决了从由一种未知光照条件照射的物体的多视图图像（及其相机姿势）中恢复物体的形状和空间变化反射率的问题。这使得能够在任意环境照明下渲染对象的新颖视图并编辑对象的材质属性。我们方法的关键，我们称之为神经辐射分解（NeRFactor），是提取神经辐射场（NeRF）的体积几何[Mildenhall et al。 2020] 将对象表示为表面表示，然后在解决空间变化的反射率和环境照明的同时联合细化几何。具体来说，NeRFactor 在没有任何监督的情况下恢复表面法线、光能见度、反照率和双向反射分布函数 (BRDF) 的 3D 神经场，仅使用重新渲染损失、简单的平滑先验和从真实数据中学习的数据驱动的 BRDF 先验-世界BRDF测量。通过显式建模光可见性，NeRFactor 能够从反照率中分离出阴影，并在任意光照条件下合成逼真的软阴影或硬阴影。 NeRFactor 能够恢复令人信服的 3D 模型，用于在合成场景和真实场景的这种具有挑战性且约束不足的捕获设置中进行自由视点重新照明。定性和定量实验表明，NeRFactor 在各种任务中都优于经典和基于深度学习的最新技术。我们的视频、代码和数据可在 people.csail.mit.edu/xiuming/projects/nerfactor/ 上找到。\n  - [以对象为中心的神经场景渲染](https://shellguo.com/osf/) | [***``[code]``***](https://shellguo.com/osf/)\n    > 我们提出了一种从捕获的对象图像中合成逼真场景的方法。我们的工作建立在神经辐射场 (NeRFs) 之上，它隐含地模拟了场景的体积密度和定向发射的辐射。虽然 NeRF 可以合成逼真的图片，但它们只对静态场景进行建模，并且与特定的成像条件密切相关。这个属性使得 NeRFs 难以泛化到新场景，包括新的光照或对象的新排列。我们建议学习以对象为中心的神经散射函数 (OSF)，而不是像 NeRF 那样学习场景辐射场，这是一种使用与光照和视图相关的神经网络隐式模拟每个对象的光传输的表示。即使物体或灯光移动，这也可以渲染场景，而无需重新训练。结合体积路径跟踪程序，我们的框架能够渲染对象内和对象间的光传输效果，包括遮挡、镜面反射、阴影和间接照明。我们评估了我们的场景合成方法，并表明它可以推广到新的照明条件，产生逼真的、物理上精确的多对象场景渲染。\n  - [物体辐射场的无监督发现, ICLR2022](https://arxiv.org/abs/2107.07905) | [code]\n    > 我们研究从单个图像推断以对象为中心的场景表示的问题，旨在推导出解释图像形成过程的表示，捕捉场景的 3D 性质，并且在没有监督的情况下学习。由于将复杂的 3D 到 2D 图像形成过程集成到强大的推理方案（如深度网络）中存在根本性挑战，大多数现有的场景分解方法都缺乏这些特征中的一个或多个。在本文中，我们提出了对象辐射场 (uORF) 的无监督发现，将神经 3D 场景表示和渲染的最新进展与深度推理网络相结合，用于无监督 3D 场景分解。在没有注释的多视图 RGB 图像上进行训练，uORF 学习从单个图像分解具有不同纹理背景的复杂场景。我们展示了 uORF 在无监督 3D 场景分割、新视图合成和三个数据集上的场景编辑方面表现良好。\n  - [学习用于可编辑场景渲染的对象组合神经辐射场, ICCV2021](https://zju3dv.github.io/object_nerf/) | [***``[code]``***](https://github.com/zju3dv/object_nerf)\n    > 隐式神经渲染技术已经显示出用于新视图合成的有希望的结果。然而，现有方法通常将整个场景编码为一个整体，这通常不知道对象身份，并且限制了移动或添加家具等高级编辑任务的能力。在本文中，我们提出了一种新颖的神经场景渲染系统，该系统学习对象组成的神经辐射场，并为集群和真实世界场景生成具有编辑能力的逼真渲染。具体来说，我们设计了一种新颖的双路径架构，其中场景分支对场景几何和外观进行编码，对象分支根据可学习的对象激活码对每个独立对象进行编码。为了在严重混乱的场景中进行训练，我们提出了一种场景引导的训练策略来解决遮挡区域中的 3D 空间模糊性并学习每个对象的清晰边界。大量实验表明，我们的系统不仅在静态场景新视图合成方面取得了有竞争力的性能，而且为对象级编辑产生了逼真的渲染。\n  - [编辑条件辐射场, 
ICCV2021](http://editnerf.csail.mit.edu/) | [***``[code]``***](https://github.com/stevliu/editnerf)\n    > 神经辐射场 (NeRF) 是支持高质量视图合成的场景模型，针对每个场景进行了优化。在本文中，我们探索启用用户编辑类别级 NeRF - 也称为条件辐射场 - 在形状类别上训练。具体来说，我们介绍了一种将粗略的 2D 用户涂鸦传播到 3D 空间的方法，以修改局部区域的颜色或形状。首先，我们提出了一个条件辐射场，它结合了新的模块化网络组件，包括一个跨对象实例共享的形状分支。观察同一类别的多个实例，我们的模型在没有任何监督的情况下学习底层部分语义，从而允许将粗略的 2D 用户涂鸦传播到整个 3D 区域（例如，椅子座位）。接下来，我们提出了一种针对特定网络组件的混合网络更新策略，该策略平衡了效率和准确性。在用户交互过程中，我们制定了一个既满足用户约束又保留原始对象结构的优化问题。我们在三个形状数据集上展示了我们在各种编辑任务上的方法，并表明它优于以前的神经编辑方法。最后，我们编辑真实照片的外观和形状，并显示编辑传播到外推的新视图。\n  - [使用分层神经表示的可编辑自由视点视频, SIGGRAPH2021](https://jiakai-zhang.github.io/st-nerf/) | [***``[code]``***](https://jiakai-zhang.github.io/st-nerf/#code)\n    > 生成自由视点视频对于沉浸式 VR/AR 体验至关重要，但最近的神经学进展仍然缺乏编辑能力来操纵大型动态场景的视觉感知。为了填补这一空白，在本文中，我们提出了第一种仅使用稀疏的 16 个摄像头为大规模动态场景生成可编辑照片般逼真的自由视点视频的方法。我们方法的核心是一种新的分层神经表示，其中包括环境本身的每个动态实体都被制定为称为 ST-NeRF 的时空相干神经分层辐射表示。这种分层表示支持对动态场景的完全感知和真实操作，同时仍支持大范围的自由观看体验。在我们的 ST-NeRF 中，动态实体/层被表示为连续函数，以连续和自监督的方式实现动态实体的位置、变形以及外观的解耦。我们提出了一个场景解析 4D 标签映射跟踪来显式地解开空间信息，以及一个连续变形模块来隐式地解开时间运动。进一步引入了一种对象感知体绘制方案，用于重新组装所有神经层。我们采用了一种新颖的分层损失和运动感知光线采样策略，以实现对具有多个表演者的大型动态场景的有效训练，我们的框架进一步实现了各种编辑功能，即操纵规模和位置，复制或重新定时单个神经层在保持高度真实感的同时创造众多视觉效果。大量实验证明了我们的方法在为动态场景生成高质量、照片般逼真和可编辑的自由视点视频方面的有效性。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/fast.md",
    "content": "\n每周分类神经辐射场 - fast ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n=================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n  - [GARF：几何感知广义神经辐射场](https://arxiv.org/abs/2212.02280) | [code]\n    > 神经辐射场 (NeRF) 彻底改变了自由视点渲染任务，并取得了令人瞩目的成果。 然而，效率和准确性问题阻碍了其广泛应用。 为了解决这些问题，我们提出了几何感知广义神经辐射场 (GARF) 和几何感知动态采样 (GADS) 策略，以在不进行逐场景优化的情况下对未见场景执行实时新颖视图渲染和无监督深度估计。 与大多数现有的广义 NeRF 不同，我们的框架仅使用少量输入图像就可以在像素尺度和几何尺度上推断出看不见的场景。 更具体地说，我们的方法通过编码器-解码器结构和有助于避免遮挡的点级可学习多视图特征融合模块来学习新视图合成的共同属性。 为了在广义模型中保留场景特征，我们引入了一个无监督深度估计模块来推导粗几何，将光线采样间隔缩小到估计表面的邻近空间，并在期望最大位置采样，构成几何感知动态采样策略（ GADS）。 此外，我们引入了多级语义一致性损失 (MSC) 来帮助提供更多信息的表示学习。 对室内和室外数据集的大量实验表明，与最先进的广义 NeRF 方法相比，GARF 将样本减少了 25% 以上，同时提高了渲染质量和 3D 几何估计。\n## Nov27 - Dec3, 2022\n  - [QFF：神经场表示的量化傅立叶特征](https://arxiv.org/abs/2212.00914) | [code]\n    > 多层感知器 (MLP) 学习高频的速度很慢。 最近的方法对空间箱中的特征进行编码以提高学习细节的速度，但是以更大的模型尺寸和连续性损失为代价。 相反，我们建议在通常用于位置编码的傅里叶特征的容器中对特征进行编码。 我们称这些为量化傅立叶特征 (QFF)。 作为一种自然的多分辨率和周期性表示，我们的实验表明，使用 QFF 可以为多种应用带来更小的模型尺寸、更快的训练和更高质量的输出，包括神经图像表示 (NIR)、神经辐射场 (NeRF) 和符号距离函数 (SDF) 建模。 QFF 易于编码，计算速度快，并且可以作为许多神经场表示之外的简单补充。\n  - [用于快速多视图视频合成的混合神经体素](https://arxiv.org/abs/2212.00190) | [code]\n    > 由于现实世界环境的复杂性和高度动态的运动，从现实世界的多视图输入合成高保真视频具有挑战性。 以前基于神经辐射场的作品已经展示了动态场景的高质量重建。 但是，在真实场景中训练此类模型非常耗时，通常需要数天或数周。 在本文中，我们提出了一种名为 MixVoxels 的新方法，以更好地表示具有快速训练速度和有竞争力的渲染质量的动态场景。 拟议的 MixVoxels 将 4D 动态场景表示为静态和动态体素的混合，并使用不同的网络对其进行处理。 这样，静态体素所需模态的计算可以由轻量级模型处理，这从本质上减少了计算量，特别是对于许多以静态背景为主的日常动态场景。 
为了分离这两种体素，我们提出了一个新的变化场来估计每个体素的时间方差。 对于动态体素，我们设计了一种内积时间查询方法来有效地查询多个时间步长，这对于恢复高动态运动至关重要。 因此，通过对输入 300 帧视频的动态场景进行 15 分钟的训练，MixVoxels 实现了比以前的方法更好的 PSNR。 此 https 网址提供代码和训练模型\n  - [使用 RGBXY 导数和最佳传输的可微分渲染, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555479) | [code]\n    > 传统的可微分渲染方法通常很难在逆渲染优化中收敛，尤其是当初始对象和目标对象位置不太接近时。 受拉格朗日流体模拟的启发，我们提出了一种新颖的可微分渲染方法来解决这个问题。 我们将每个屏幕空间像素与像素中心覆盖的可见 3D 几何点相关联，并计算几何点而不是像素的导数。 我们将关联的几何点称为像素的点代理。 对于每个点代理，我们计算其 5D RGBXY 导数，测量其 3D RGB 颜色和 2D 投影屏幕空间位置如何相对于场景参数发生变化。 此外，为了捕获全局和远程对象运动，我们利用基于最佳传输的像素匹配来设计更复杂的损失函数。 我们已经进行了实验来评估我们提出的方法在各种逆向渲染应用程序中的有效性，并证明了与最先进的基线相比更优越的收敛行为。\n  - [QuadStream：一种用于新视点重建的基于 Quad 的场景流架构, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555524) | [code]\n    > 通过网络将渲染的 3D 内容流式传输到手机或 VR/AR 耳机等瘦客户端设备，将高保真图形带到通常由于热量、功率或成本限制而无法实现的平台。 流式 3D 内容必须以对延迟和潜在网络丢失都具有鲁棒性的表示形式进行传输。 在存在遮挡事件的情况下，传输视频流并重新投影以纠正不断变化的视点失败； 在功率有限的移动 GPU 上无法在客户端流式传输场景几何体和执行高质量渲染。 为了平衡消除遮挡稳健性和最小客户端工作量这两个相互竞争的目标，我们引入了 QuadStream，这是一种新的流媒体内容表示，它通过允许客户端有效地渲染新颖的视图而没有由消除遮挡事件引起的伪影来减少运动到光子的延迟。 受视频编解码器设计的传统宏块方法的启发，我们将从视图单元中的位置看到的场景分解为一系列四边形代理，或来自多个视图的视图对齐四边形。 通过在光栅化 G-Buffer 上操作，我们的方法独立于场景本身的表示； 生成的 QuadStream 是场景的近似几何表示，可以由瘦客户端重建以呈现当前视图和附近的相邻视图。 我们的技术贡献是一种有效的并行四边形生成、合并和打包策略，用于覆盖场景中潜在客户移动的代理视图； 一种打包和编码策略，允许将具有深度信息的掩码四边形作为帧相干流传输； 以及一种高效的渲染方法，用于将我们的 QuadStream 表示渲染为瘦客户端上的全新视图。 我们表明，与视频数据流方法和基于几何的流媒体相比，我们的方法实现了卓越的质量。\n  - [用于全频着色的轻量级神经基函数, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555386) | [code]\n    > 基函数既提供了紧凑表示的能力，又提供了高效计算的特性。 因此，它们普遍用于渲染以执行全频着色。 然而，包括球谐函数 (SH)、小波和球面高斯函数 (SG) 在内的常用基函数都有其自身的局限性，例如 SH 的低频、小波的旋转不变性以及 SG 不支持多乘积等。 在本文中，我们提出了神经基函数，这是一组隐式和数据驱动的基函数，它规避了所有所需属性的限制。 我们首先引入了一个表示神经网络，它将任何一般的 2D 球面函数（例如环境光照、BRDF 和可见性）作为输入并将其投影到潜在空间上作为我们的神经基函数的系数。 然后，我们设计了几个执行不同类型计算的轻量级神经网络，为我们的基函数提供了不同的计算属性，例如双/三乘积积分和旋转。 我们通过将神经基函数集成到全频着色应用程序中来展示我们的神经基函数的实用性，表明我们的方法不仅在同等质量下实现了比小波高 10 × -40 × 的压缩率，而且还渲染了全频 实时照明效果，没有上述经典基础功能的限制。\n## Nov20 - Nov26, 2022\n  - [ScanNeRF：神经辐射场的可扩展基准, WACV2023](https://arxiv.org/abs/2211.13762) | [code]\n    > 
在本文中，我们提出了第一个用于评估神经辐射场 (NeRF) 和一般情况下的神经渲染 (NR) 框架的真实基准思想。我们设计并实施了一个有效的管道，可以毫不费力地扫描大量真实物体。我们的扫描站的硬件预算不到 500 美元，仅需 5 分钟即可收集大约 4000 张扫描对象的图像。这样的平台用于构建 ScanNeRF，这是一个以多个训练/验证/测试拆分为特征的数据集，旨在对现代 NeRF 方法在不同条件下的性能进行基准测试。因此，我们评估了三个尖端的 NeRF 变体，以突出它们的优点和缺点。该数据集可在我们的项目页面上找到，还有一个在线基准，以促进开发越来越好的 NeRF。\n  - [沉浸式神经图形基元](https://arxiv.org/abs/2211.13494) | [code]\n    > 神经辐射场 (NeRF)，特别是它通过即时神经图形基元的扩展，是一种用于视图合成的新型渲染方法，它使用真实世界的图像来构建照片般逼真的沉浸式虚拟场景。尽管有潜力，但关于 NeRF 和虚拟现实 (VR) 结合的研究仍然很少。目前，还没有集成到可用的典型 VR 系统中，并且尚未评估 NeRF 实现的 VR 性能和适用性，例如，针对不同的场景复杂性或屏幕分辨率。在本文中，我们提出并评估了一个基于 NeRF 的框架，该框架能够在沉浸式 VR 中渲染场景，允许用户自由移动头部来探索复杂的现实世界场景。我们通过对三个不同的 NeRF 场景进行基准测试来评估我们的框架，这些场景涉及它们在不同场景复杂性和分辨率下的渲染性能。利用超分辨率，我们的方法可以产生每秒 30 帧的帧速率，每只眼睛的分辨率为 1280x720 像素。我们讨论了我们框架的潜在应用，并在线提供了一个开源实现。\n  - [通过 Bootstrapped Radiance Field Inversion 从单个图像中获取形状、姿势和外观](https://arxiv.org/abs/2211.11674) | [code]\n    > 神经辐射场 (NeRF) 与 GAN 相结合代表了从单一视图进行 3D 重建领域的一个有前途的方向，因为它们能够有效地对任意拓扑进行建模。然而，该领域最近的工作主要集中在已知确切地面真实姿势的合成数据集上，而忽略了姿势估计，这对于某些下游应用程序（例如增强现实 (AR) 和机器人技术）很重要。我们为自然图像引入了一个有原则的端到端重建框架，其中没有准确的地面真实姿势。我们的方法从对象的单个图像中恢复 SDF 参数化的 3D 形状、姿势和外观，而无需在训练期间利用多个视图。更具体地说，我们利用无条件 3D 感知生成器，我们对其应用混合反演方案，在该方案中，模型会产生对解决方案的初步猜测，然后通过优化对其进行细化。我们的框架可以在短短 10 步内对图像进行反渲染，使其能够在实际场景中使用。我们在各种真实和综合基准测试中展示了最先进的结果。\n## Nov13 - Nov19, 2022\n  - [DINER：无序不变的隐式神经表征](https://arxiv.org/abs/2211.07871) | [code]\n    > 隐式神经表示 (INR) 将信号的属性表征为相应坐标的函数，它成为解决逆问题的利器。然而，INR 的容量受到网络训练中频谱偏差的限制。在本文中，我们发现通过重新排列输入信号的坐标可以在很大程度上解决这种与频率相关的问题，为此我们提出了无序不变的隐式神经表示 (DINER)，方法是将哈希表扩充为传统的 INR 骨架。鉴于离散信号共享相同的属性直方图和不同的排列顺序，哈希表可以将坐标投影到相同的分布中，映射信号可以使用后续的 INR 网络更好地建模，从而显着减轻频谱偏差。实验不仅揭示了 DINER 对不同 INR 主干（MLP 与 SIREN）和各种任务（图像/视频表示、相位检索和折射率恢复）的泛化，而且还显示了优于最先进技术的优势算法的质量和速度。\n## Nov6 - Nov12, 2022\n  - [基于时间相干性的大规模场景分布式光线追踪, ToG2022](https://ieeexplore.ieee.org/abstract/document/9940545) | [code]\n    > 
分布式光线追踪算法在渲染海量场景时被广泛使用，其中数据利用率和负载均衡是提高性能的关键。一项基本观察是射线在时间上是相干的，这表明时间信息可用于提高计算效率。在本文中，我们使用时间相干性来优化分布式光线追踪的性能。首先，我们提出了一种基于时间一致性的调度算法来指导任务/数据分配和调度。然后，我们提出了一个虚拟门户结构来预测基于前一帧的光线辐射率，并将辐射率低的光线发送到预先计算的简化模型进行进一步追踪，这可以大大降低遍历复杂度和网络数据传输的开销.该方法在大小高达 355 GB 的场景中得到验证。与以前的算法相比，我们的算法可以实现高达 81% 的加速，并且均方误差非常小。\n  - [QRF：具有量子辐射场的隐式神经表示](https://arxiv.org/abs/2211.03418) | [code]\n    > 现实世界场景的逼真渲染对于包括混合现实 (MR) 和虚拟现实 (VR) 在内的广泛应用来说是一项巨大的挑战。神经网络长期以来一直在求解微分方程的背景下进行研究，之前已被引入作为照片级渲染的隐式表示。然而，使用经典计算的逼真渲染具有挑战性，因为它需要耗时的光线行进，并且由于维数灾难而遭受计算瓶颈。在本文中，我们提出了量子辐射场 (QRF)，它集成了量子电路、量子激活函数和量子体积渲染，用于隐式场景表示。结果表明，QRF不仅发挥了量子计算速度快、收敛快、并行度高等优势，而且保证了体绘制的高质量。\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n  - [NeX360：基于神经基础扩展的实时全方位视图合成, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9931981) | [code]\n    > 我们介绍了 NeX，这是一种基于多平面图像 (MPI) 增强的新颖视图合成的新方法，可以实时再现视图相关的效果。与传统的 MPI 不同，我们的技术将每个像素参数化为从神经网络学习的球形基函数的线性组合，以对视图相关的效果进行建模，并使用混合隐式-显式建模策略来改进精细细节。此外，我们还展示了 NeX 的扩展，它利用知识蒸馏来为无限 360 ∘ 场景训练多个 MPI。我们的方法在几个基准数据集上进行了评估：NeRF-Synthetic 数据集、Light Field 数据集、Real Forward-Facing 数据集、Space 数据集以及 Shiny，我们的新数据集包含更具挑战性的视图相关效果，例如彩虹反射在 CD 上。我们的方法在 PSNR、SSIM 和 LPIPS 上优于其他实时渲染方法，可以实时渲染无界 360 ∘ 场景。\n  - [NeRFPlayer：具有分解神经辐射场的可流式动态场景表示](https://arxiv.org/abs/2210.15947) | [code]\n    > 在 VR 中自由地在真实世界的 4D 时空空间中进行视觉探索一直是一项长期的追求。当仅使用几个甚至单个 RGB 相机来捕捉动态场景时，这项任务特别有吸引力。为此，我们提出了一个能够快速重建、紧凑建模和流式渲染的高效框架。首先，我们建议根据时间特征分解 4D 时空空间。 4D 空间中的点与属于三个类别的概率相关联：静态区域、变形区域和新区域。每个区域都由一个单独的神经场表示和规范化。其次，我们提出了一种基于混合表示的特征流方案，用于有效地对神经场进行建模。我们的方法，创造了 NeRFPlayer，在单手持相机和多相机阵列捕获的动态场景上进行评估，在质量和速度方面实现与最近最先进的方法相当或更优的渲染性能，实现重建每帧 10 秒，实时渲染。\n  - [用于 3D 视频合成的流式辐射场, NeurIPS2022](https://arxiv.org/abs/2210.14831) | [code]\n    > 我们提出了一种基于显式网格的方法，用于有效地重建流辐射场，用于真实世界动态场景的新视图合成。我们不是训练一个结合所有帧的单一模型，而是用增量学习范式来制定动态建模问题，其中训练每帧模型差异以补充当前帧上基础模型的适应性。通过利用简单而有效的窄带调整策略，所提出的方法实现了一个可行的框架，用于处理高训练效率的动态视频序列。通过使用基于模型差异的压缩，可以显着减少使用显式网格表示引起的存储开销。我们还引入了一种有效的策略来进一步加速每一帧的模型优化。对具有挑战性的视频序列的实验表明，我们的方法能够以具有竞争力的渲染质量实现每帧 15 秒的训练速度，比最先进的隐式方法实现 1000 倍的加速。此 https 网址提供了代码。\n## Oct16 - Oct22, 2022\n## Oct9 - 
Oct15, 2022\n  - [基于显着性感知动态路由策略的遥感图像轻量级无级超分辨率](https://arxiv.org/abs/2210.07598) | [***``[code]``***](https://github.com/hanlinwu/SalDRN)\n    > 基于深度学习的算法极大地提高了遥感图像（RSI）超分辨率（SR）的性能。然而，增加网络深度和参数会导致计算和存储的巨大负担。直接减少现有模型的深度或宽度会导致性能大幅下降。我们观察到，一个 RSI 中不同区域的 SR 难度差异很大，现有方法使用相同的深度网络处理图像中的所有区域，造成计算资源的浪费。此外，现有的 SR 方法通常预先定义整数尺度因子，不能进行无级 SR，即单个模型可以处理任何潜在的尺度因子。在每个比例因子上重新训练模型会浪费大量的计算资源和模型存储空间。为了解决上述问题，我们提出了一种显着性感知动态路由网络（SalDRN），用于 RSI 的轻量级和无级 SR。首先，我们引入视觉显着性作为区域级 SR 难度的指标，并将轻量级显着性检测器集成到 SalDRN 中以捕获像素级视觉特征。然后，我们设计了一种显着性感知动态路由策略，该策略采用路径选择开关根据子图像块的 SR 难度自适应地选择适当深度的特征提取路径。最后，我们提出了一种新颖的轻量级无级上采样模块，其核心是隐式特征函数，用于实现从低分辨率特征空间到高分辨率特征空间的映射。综合实验验证，SalDRN 可以在性能和复杂性之间取得良好的折衷。代码位于 \\url{this https URL}。\n  - [具有可学习位置特征的可扩展神经视频表示, NeurIPS2022](https://arxiv.org/abs/2210.06823) | [***``[code]``***](https://github.com/subin-kim-cv/NVP)\n    > 使用基于坐标的神经表示 (CNR) 的复杂信号的简洁表示已经取得了很大进展，最近的几项工作集中在扩展它们以处理视频。在这里，主要挑战是如何（a）减轻训练 CNR 时的计算效率低下，以（b）实现高质量的视频编码，同时（c）保持参数效率。为了同时满足 (a)、(b) 和 (c) 的所有要求，我们提出了具有可学习位置特征 (NVP) 的神经视频表示，这是一种新颖的 CNR，通过引入“可学习位置特征”可以有效地将视频摊销为潜在代码。具体来说，我们首先提出了一种基于设计 2D 潜在关键帧的 CNR 架构，以学习每个时空轴上的常见视频内容，这极大地改善了所有这三个要求。然后，我们建议利用现有强大的图像和视频编解码器作为潜在代码的计算/内存高效压缩过程。我们展示了 NVP 在流行的 UVG 基准上的优越性；与现有技术相比，NVP 不仅训练速度快 2 倍（不到 5 分钟），而且编码质量也超过了 34.07→34.57（用 PSNR 指标衡量），即使使用的参数减少了 8 倍以上。我们还展示了 NVP 的有趣属性，例如视频修复、视频帧插值等。\n  - [CUF：连续上采样滤波器](https://arxiv.org/abs/2210.06965) | [code]\n    > 神经领域已迅速被用于表示 3D 信号，但它们在更经典的 2D 图像处理中的应用相对有限。在本文中，我们考虑了图像处理中最重要的操作之一：上采样。在深度学习中，可学习的上采样层已广泛用于单图像超分辨率。我们建议将上采样内核参数化为神经域。这种参数化导致了一个紧凑的架构，与竞争的任意尺度超分辨率架构相比，参数数量减少了 40 倍。当对大小为 256x256 的图像进行上采样时，我们表明我们的架构比竞争的任意尺度超分辨率架构效率高 2x-10 倍，并且在实例化为单尺度模型时比亚像素卷积更有效。在一般情况下，这些增益随目标规模的平方呈多项式增长。我们在标准基准上验证了我们的方法，表明可以在不牺牲超分辨率性能的情况下实现这种效率提升。\n  - [NerfAcc：一个通用的 NeRF 加速工具箱](https://arxiv.org/abs/2210.04847) | [***``[code]``***](https://github.com/KAIR-BAIR/nerfacc)\n    > 我们提出了 NerfAcc，一个用于高效体积渲染辐射场的工具箱。我们以 Instant-NGP 中提出的技术为基础，并将这些技术扩展为不仅支持有界静态场景，还支持动态场景和无界场景。 NerfAcc 带有一个用户友好的 Python API，并为大多数 NeRF 的即插即用加速做好了准备。提供了各种示例来展示如何使用此工具箱。可在此处找到代码：此 https 网址。\n## 
Oct2 - Oct8, 2022\n  - [在杂乱的环境中学习感知感知敏捷飞行](https://arxiv.org/abs/2210.01841) | [code]\n    > 最近，神经控制策略的性能优于现有的基于模型的规划和控制方法，可在最短的时间内通过杂乱的环境自主导航四旋翼飞行器。然而，它们没有感知意识，这是基于视觉的导航的关键要求，因为相机的视野有限和四旋翼的驱动不足。我们提出了一种学习神经网络策略的方法，该策略可在杂乱的环境中实现感知感知、最短时间飞行。我们的方法通过利用特权学习作弊框架结合了模仿学习和强化学习 (RL)。使用 RL，我们首先训练具有全状态信息的感知感知教师策略，以便在最短时间内通过杂乱的环境。然后，我们使用模仿学习将其知识提炼成基于视觉的学生策略，该策略仅通过相机感知环境。我们的方法将感知和控制紧密结合，在计算速度（快 10 倍）和成功率方面显示出显着优势。我们使用物理四旋翼和硬件在环仿真以高达 50 公里/小时的速度展示了闭环控制性能。\n## Sep25 - Oct1, 2022\n  - [了解体素网格 NeRF 模型的纯 CLIP 指导](https://arxiv.org/abs/2209.15172) | [code]\n    > 我们使用 CLIP 探索文本到 3D 对象生成的任务。具体来说，我们在不访问任何数据集的情况下使用 CLIP 进行指导，我们将这种设置称为纯 CLIP 指导。虽然之前的工作采用了这种设置，但没有系统研究防止 CLIP 中产生对抗性生成的机制。我们说明了不同的基于图像的增强如何防止对抗性生成问题，以及生成的结果如何受到影响。我们测试了不同的 CLIP 模型架构，并表明集成不同的模型进行指导可以防止更大模型中的对抗性生成并产生更清晰的结果。此外，我们实现了一个隐式体素网格模型，以展示神经网络如何提供额外的正则化层，从而产生更好的几何结构和生成对象的连贯性。与之前的工作相比，我们以更高的记忆效率和更快的训练速度获得了更连贯的结果。\n## Sep18 - Sep24, 2022\n  - [来自单个压缩光场测量的快速视差估计](https://arxiv.org/abs/2209.11342) | [code]\n    > 来自光场的丰富空间和角度信息允许开发多种视差估计方法。然而，光场的获取需要较高的存储和处理成本，限制了该技术在实际应用中的使用。为了克服这些缺点，压缩传感 (CS) 理论允许开发光学架构来获取单个编码光场测量。该测量使用需要高计算成本的优化算法或深度神经网络进行解码。从压缩光场进行视差估计的传统方法需要首先恢复整个光场，然后进行后处理步骤，因此需要很长时间。相比之下，这项工作通过省略传统方法中所需的恢复步骤，从单个压缩测量中提出了一种快速的视差估计。具体来说，我们建议联合优化用于获取单个编码光场快照的光学架构和用于估计视差图的卷积神经网络 (CNN)。在实验上，所提出的方法估计的视差图与使用深度学习方法重建的光场获得的视差图相当。此外，所提出的方法在训练和推理方面比从重建光场估计视差的最佳方法快 20 倍。\n  - [wildNeRF：使用稀疏单目数据捕获的野外动态场景的完整视图合成](https://arxiv.org/abs/2209.10399) | [code]\n    > 我们提出了一种新的神经辐射模型，该模型可以以自我监督的方式进行训练，用于动态非结构化场景的新视图合成。我们的端到端可训练算法可在几秒钟内学习高度复杂的真实静态场景，并在几分钟内学习具有刚性和非刚性运动的动态场景。通过区分静态像素和以运动为中心的像素，我们从一组稀疏的图像中创建高质量的表示。我们对现有基准进行了广泛的定性和定量评估，并在具有挑战性的 NVIDIA 动态场景数据集上设置了最先进的性能指标。此外，我们在具有挑战性的现实世界数据集（例如 Cholec80 和 SurgicalActions160）上评估我们的模型性能。\n## Sep11 - Sep17, 2022\n## Sep4 - Sep10, 2022\n## Aug28 - Sep3, 2022\n  - [FoV-NeRF：虚拟现实的中心凹神经辐射场, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9872532) | [code]\n    > 随着消费者显示器和商业 VR 平台的兴起，虚拟现实 (VR) 正变得无处不在。这种显示需要低延迟和高质量的合成图像渲染，同时减少计算开销。神经渲染的最新进展表明，有望通过基于图像的虚拟或物理环境表示来解锁 3D 计算机图形的新可能性。具体来说，神经辐射场 (NeRF) 
表明，可以在不损失与视图相关的效果的情况下实现 3D 场景的照片般逼真的质量和连续视图变化。虽然 NeRF 可以显着受益于 VR 应用的渲染，但它面临着由高视场、高分辨率和立体/以自我为中心的观看带来的独特挑战，通常会导致渲染图像的低质量和高延迟。在 VR 中，这不仅会损害交互体验，还可能导致疾病。为了解决 VR 中的六自由度、以自我为中心和立体 NeRF 的这些问题，我们提出了第一个注视条件 3D 神经表示和视图合成方法。我们将视觉和立体敏锐度的人类心理物理学纳入 3D 风景的以自我为中心的神经表示中。然后，我们共同优化延迟/性能和视觉质量，同时相互桥接人类感知和神经场景合成，以实现感知上高质量的沉浸式交互。我们进行了客观分析和主观研究，以评估我们方法的有效性。我们发现我们的方法显着减少了延迟（与 NeRF 相比减少了高达 99% 的时间），而不会损失高保真渲染（在感知上与全分辨率地面实况相同）。所提出的方法可能是迈向未来实时捕捉、传送和可视化远程环境的 VR/AR 系统的第一步。\n  - [克隆：用于占用网格辅助神经表示的相机-激光雷达融合](https://arxiv.org/abs/2209.01194) | [code]\n    > 本文提出了 CLONeR，它通过允许对从稀疏输入传感器视图观察到的大型户外驾驶场景进行建模，显着改进了 NeRF。这是通过将 NeRF 框架内的占用和颜色学习解耦为分别使用 LiDAR 和相机数据训练的单独的多层感知器 (MLP) 来实现的。此外，本文提出了一种在 NeRF 模型旁边构建可微分 3D 占用网格图 (OGM) 的新方法，并利用此占用网格改进沿射线的点采样，以在度量空间中进行体积渲染。\n## Aug21 - Aug27, 2022\n  - [Voxurf：基于体素的高效准确的神经表面重建](https://arxiv.org/abs/2208.12697) | [code]\n    > 神经表面重建旨在基于多视图图像重建准确的 3D 表面。以前基于神经体绘制的方法大多训练完全隐式模型，并且它们需要对单个场景进行数小时的训练。最近的努力探索了显式体积表示，它通过在可学习的体素网格中记忆重要信息来大大加速优化过程。然而，这些基于体素的方法通常难以重建细粒度几何。通过实证研究，我们发现高质量的表面重建取决于两个关键因素：构建连贯形状的能力和颜色几何依赖性的精确建模。特别是后者是精细细节准确重建的关键。受这些发现的启发，我们开发了 Voxurf，这是一种基于体素的高效和准确的神经表面重建方法，它包括两个阶段：1）利用可学习的特征网格来构建色场并获得连贯的粗略形状，以及 2）使用捕获精确的颜色几何依赖性的双色网络优化详细的几何图形。我们进一步引入了分层几何特征，以实现跨体素的信息共享。我们的实验表明，Voxurf 同时实现了高效率和高质量。在 DTU 基准上，与最先进的方法相比，Voxurf 实现了更高的重建质量，训练速度提高了 20 倍。\n  - [E-NeRF：来自移动事件相机的神经辐射场](https://arxiv.org/abs/2208.11300) | [code]\n    > 从理想图像估计神经辐射场 (NeRFs) 已在计算机视觉领域得到广泛研究。大多数方法假设最佳照明和缓慢的相机运动。这些假设在机器人应用中经常被违反，其中图像包含运动模糊并且场景可能没有合适的照明。这可能会导致下游任务（例如场景的导航、检查或可视化）出现重大问题。为了缓解这些问题，我们提出了 E-NeRF，这是第一种从快速移动的事件摄像机中以 NeRF 形式估计体积场景表示的方法。我们的方法可以在非常快速的运动和高动态范围条件下恢复 NeRF，在这种情况下，基于帧的方法会失败。我们展示了仅通过提供事件流作为输入来渲染高质量帧是可能的。此外，通过结合事件和帧，我们可以估计在严重运动模糊下比最先进的方法质量更高的 NeRF。我们还表明，在只有很少的输入视图可用的情况下，结合事件和帧可以克服 NeRF 估计的失败情况，而无需额外的正则化。\n## Aug14 - Aug20, 2022\n  - [PDRF：渐进式去模糊辐射场，用于从模糊图像中快速、稳健地重建场景](https://arxiv.org/abs/2208.08049) | [code]\n    > 我们提出了渐进式去模糊辐射场 (PDRF)，这是一种从模糊图像中有效重建高质量辐射场的新方法。虽然当前最先进的 (SoTA) 场景重建方法从干净的源视图实现照片般逼真的渲染结果，但当源视图受到模糊影响时，它们的性能会受到影响，这在野外图像中很常见。以前的去模糊方法要么不考虑 3D 几何，要么计算量很大。为了解决这些问题，PDRF 
是辐射场建模中的一种渐进式去模糊方案，它通过结合 3D 场景上下文准确地模拟模糊。 PDRF 进一步使用有效的重要性采样方案，从而实现快速的场景优化。具体来说，PDRF 提出了一种 Coarse Ray Renderer 来快速估计体素密度和特征；然后使用 Fine Voxel Renderer 来实现高质量的光线追踪。我们进行了广泛的实验，结果表明 PDRF 比以前的 SoTA 快 15 倍，同时在合成场景和真实场景上都取得了更好的性能。\n  - [HDR-Plenoxels：自校准高动态范围辐射场, ECCV2022](https://arxiv.org/abs/2208.06787) | [code]\n    > 我们提出了高动态范围辐射 (HDR) 场 HDR-Plenoxels，它学习 3D HDR 辐射场、几何信息和 2D 低动态范围 (LDR) 图像中固有的不同相机设置的全光函数。我们基于体素的体素渲染管道仅使用从不同相机设置中以端到端方式拍摄的多视图 LDR 图像来重建 HDR 辐射场，并且具有快速的收敛速度。为了处理现实世界场景中的各种相机，我们引入了一个色调映射模块，该模块对相机内的数字成像管道 (ISP) 进行建模并解开辐射设置。我们的色调映射模块允许我们通过控制每个新视图的辐射设置来进行渲染。最后，我们构建了一个具有不同相机条件的多视图数据集，这符合我们的问题设置。我们的实验表明，HDR-Plenoxels 可以仅从带有各种相机的 LDR 图像中表达细节和高质量的 HDR 新颖视图。\n## Aug7 - Aug13, 2022\n  - [OmniVoxel：一种快速精确的全向神经辐射场重建方法, GCCE 2022](https://arxiv.org/abs/2208.06335) | [code]\n    > 本文提出了一种利用等矩形全向图像重建神经辐射场的方法。具有辐射场的隐式神经场景表示可以在有限的空间区域内连续重建场景的 3D 形状。然而，在商用 PC 硬件上训练完全隐式表示需要大量时间和计算资源（每个场景 15 ~ 20 小时）。因此，我们提出了一种显着加速这一过程的方法（每个场景 20 ∼ 40 分钟）。我们没有使用完全隐式的光线表示来重建辐射场，而是采用包含张量中的密度和颜色特征的特征体素。考虑到全向 equirectangular 输入和相机布局，我们使用球面体素化来表示，而不是三次表示。我们的体素化方法可以平衡内景和外景的重建质量。此外，我们对颜色特征采用轴对齐位置编码方法来提高整体图像质量。我们的方法在具有随机相机姿势的合成数据集上实现了令人满意的经验性能。此外，我们在包含复杂几何形状的真实场景中测试了我们的方法，并实现了最先进的性能。我们的代码和完整的数据集将与论文发表的同时发布。\n  - [HRF-Net：来自稀疏输入的整体辐射场](https://arxiv.org/abs/2208.04717) | [code]\n    > 我们提出了 HRF-Net，这是一种基于整体辐射场的新型视图合成方法，它使用一组稀疏输入来渲染新颖的视图。最近的泛化视图合成方法也利用了辐射场，但渲染速度不是实时的。现有的方法可以有效地训练和渲染新颖的视图，但它们不能推广到看不见的场景。我们的方法解决了用于泛化视图合成的实时渲染问题，包括两个主要阶段：整体辐射场预测器和基于卷积的神经渲染器。这种架构不仅可以基于隐式神经场推断出一致的场景几何，还可以使用单个 GPU 有效地渲染新视图。我们首先在 DTU 数据集的多个 3D 场景上训练 HRF-Net，并且该网络可以仅使用光度损失对看不见的真实和合成数据产生似是而非的新颖视图。此外，我们的方法可以利用单个场景的一组更密集的参考图像来生成准确的新颖视图，而无需依赖额外的显式表示，并且仍然保持预训练模型的高速渲染。实验结果表明，HRF-Net 在各种合成和真实数据集上优于最先进的可泛化神经渲染方法。\n## Jul31 - Aug6, 2022\n  - [全息显示3D相位全息图的端到端学习](https://www.nature.com/articles/s41377-022-00894-6) | [code]\n    > 计算机生成的全息术 (CGH) 提供相干波前的体积控制，是体积 3D 显示器、光刻、神经光刺激和光/声捕获等应用的基础。最近，基于深度学习的方法作为 CGH 合成的有前途的计算范式出现，克服了传统基于模拟/优化的方法中的质量-运行时权衡。然而，预测全息图的质量本质上受数据集质量的限制。在这里，我们介绍了一个新的全息图数据集 MIT-CGH-4K-V2，它使用分层深度图像作为数据高效的体积 3D 输入和用于直接合成高质量 
3D 纯相位全息图的两阶段监督+无监督训练协议。所提出的系统还可以校正视觉像差，从而允许为最终用户定制。我们通过实验展示了逼真的 3D 全息投影并讨论了相关的空间光调制器校准程序。我们的方法在消费级 GPU 上实时运行，在 iPhone 13 Pro 上以 5 FPS 运行，有望显着提高上述应用程序的性能。\n## Jul24 - Jul30, 2022\n  - [MobileNeRF：利用多边形光栅化管道在移动架构上进行高效的神经场渲染](https://arxiv.org/abs/2208.00277) | [***``[code]``***](https://github.com/google-research/jax3d/tree/main/jax3d/projects/mobilenerf)\n    > 神经辐射场 (NeRFs) 展示了从新颖视图合成 3D 场景图像的惊人能力。但是，它们依赖于基于光线行进的专用体积渲染算法，这些算法与广泛部署的图形硬件的功能不匹配。本文介绍了一种基于纹理多边形的新 NeRF 表示，它可以使用标准渲染管道有效地合成新图像。 NeRF 表示为一组多边形，其纹理表示二进制不透明度和特征向量。使用 z 缓冲区对多边形进行传统渲染会生成每个像素都有特征的图像，这些图像由在片段着色器中运行的小型、依赖于视图的 MLP 进行解释，以产生最终的像素颜色。这种方法使 NeRF 能够使用传统的多边形光栅化管道进行渲染，该管道提供大规模的像素级并行性，在包括手机在内的各种计算平台上实现交互式帧速率。\n  - [通过 NeRF Attention 进行端到端视图合成](https://arxiv.org/abs/2207.14741) | [code]\n    > 在本文中，我们提出了一个用于视图合成的简单 seq2seq 公式，其中我们将一组光线点作为输入和输出与光线相对应的颜色。在这个 seq2seq 公式上直接应用标准转换器有两个限制。首先，标准注意力不能成功地适应体积渲染过程，因此合成视图中缺少高频分量。其次，将全局注意力应用于所有光线和像素是非常低效的。受神经辐射场 (NeRF) 的启发，我们提出了 NeRF 注意力 (NeRFA) 来解决上述问题。一方面，NeRFA 将体积渲染方程视为软特征调制过程。通过这种方式，特征调制增强了具有类似 NeRF 电感偏置的变压器。另一方面，NeRFA 执行多阶段注意力以减少计算开销。此外，NeRFA 模型采用光线和像素转换器来学习光线和像素之间的相互作用。 NeRFA 在四个数据集上展示了优于 NeRF 和 NerFormer 的性能：DeepVoxels、Blender、LLFF 和 CO3D。此外，NeRFA 在两种设置下建立了新的 state-of-the-art：单场景视图合成和以类别为中心的新颖视图合成。该代码将公开发布。\n  - [脱离网格：用于 3D 血管建模的连续隐式神经表示, MICCAI STACOM 2022](https://arxiv.org/abs/2207.14663) | [code]\n    > 个性化 3D 血管模型对于心血管疾病患者的诊断、预后和治疗计划非常有价值。传统上，此类模型是用网格和体素掩码等显式表示或径向基函数或原子（管状）形状等隐式表示构建的。在这里，我们建议在可微的隐式神经表示 (INR) 中通过其有符号距离函数 (SDF) 的零水平集来表示表面。这使我们能够用隐式、连续、轻量级且易于与深度学习算法集成的表示来对复杂的血管结构进行建模。我们在这里通过三个实际示例展示了这种方法的潜力。首先，我们从 CT 图像中获得了腹主动脉瘤 (AAA) 的准确且防水的表面，并从表面上的 200 个点显示出稳健的拟合。其次，我们同时将嵌套的血管壁安装在单个 INR 中，没有交叉点。第三，我们展示了如何将单个动脉的 3D 模型平滑地融合到单个防水表面中。我们的结果表明，INR 是一种灵活的表示形式，具有以最小交互注释对复杂血管结构进行研究和操作的潜力。\n## Previous weeks\n  - [Plenoxels：没有神经网络的辐射场, CVPR2022(oral)](https://arxiv.org/abs/2112.05131) | [***``[code]``***](https://alexyu.net/plenoxels)\n    > 我们介绍了 Plenoxels（全光体素），一种用于照片级真实视图合成的系统。 Plenoxels 将场景表示为具有球谐函数的稀疏 3D 网格。这种表示可以通过梯度方法和正则化从校准图像中优化，而无需任何神经组件。在标准的基准任务中，Plenoxels 
的优化速度比神经辐射场快两个数量级，而视觉质量没有损失。\n  - [神经稀疏体素场, NeurIPS2020](https://lingjie0206.github.io/papers/NSVF/) | [***``[code]``***](https://github.com/facebookresearch/NSVF)\n    > 我们介绍了神经稀疏体素场 (NSVF)，这是一种用于快速和高质量自由视点渲染的新神经场景表示。 NSVF 定义了一组以稀疏体素八叉树组织的体素有界隐式字段，以对每个单元中的局部属性进行建模。 我们仅从一组姿势的 RGB 图像中通过可区分的光线行进操作逐步学习底层体素结构。 使用稀疏体素八叉树结构，可以通过跳过不包含相关场景内容的体素来加速渲染新颖的视图。 我们的方法在推理时比最先进的方法（即 NeRF (Mildenhall et al., 2020)）快 10 倍以上，同时获得更高质量的结果。 此外，通过利用显式稀疏体素表示，我们的方法可以很容易地应用于场景编辑和场景合成。 我们还展示了几个具有挑战性的任务，包括多场景学习、移动人体的自由视点渲染和大规模场景渲染。\n  - [AutoInt：快速神经体积渲染的自动集成, CVPR2021](http://www.computationalimaging.org/publications/automatic-integration/) | [***``[code]``***](https://github.com/computational-imaging/automatic-integration)\n    > 数值积分是科学计算的基础技术，是许多计算机视觉应用的核心。在这些应用中，隐式神经体绘制最近被提出作为视图合成的新范式，实现逼真的图像质量。然而，使这些方法实用的一个基本障碍是在训练和推理期间沿渲染光线所需的体积积分导致的极端计算和内存要求。需要数百万条光线，每条光线都需要数百次通过神经网络的前向传播，才能通过蒙特卡罗采样来近似这些集成。在这里，我们提出了自动积分，这是一种使用隐式神经表示网络来学习有效的、封闭形式的积分解决方案的新框架。对于训练，我们实例化对应于隐式神经表示的导数的计算图。该图适合要积分的信号。优化后，我们重新组装图以获得代表反导数的网络。根据微积分的基本定理，这可以在网络的两次评估中计算任何定积分。使用这种方法，我们展示了超过 10 倍的计算要求改进，从而实现了快速的神经体绘制。\n  - [DeRF：分解的辐射场](https://arxiv.org/abs/2011.12490) | [code]\n    > 随着神经辐射场 (NeRF) 的出现，神经网络现在可以渲染 3D 场景的新颖视图，其质量足以愚弄人眼。然而，生成这些图像的计算量非常大，限制了它们在实际场景中的适用性。在本文中，我们提出了一种基于空间分解的技术，能够缓解这个问题。我们的主要观察结果是，使用更大（更深和/或更宽）的网络会带来收益递减。因此，我们建议对场景进行空间分解，并为每个分解部分分配更小的网络。当一起工作时，这些网络可以渲染整个场景。这使我们无论分解部分的数量如何，都能获得近乎恒定的推理时间。此外，我们表明，Voronoi 空间分解更适合此目的，因为它可证明与 Painter 算法兼容，可实现高效且 GPU 友好的渲染。我们的实验表明，对于现实世界的场景，我们的方法提供的推理效率比 NeRF 高出 3 倍（具有相同的渲染质量），或者 PSNR 提高了 1.0~dB（对于相同的推理成本）。\n  - [DONeRF：使用 Depth Oracle Networks 实现紧凑神经辐射场的实时渲染, CGF2021](https://depthoraclenerf.github.io/) | [***``[code]``***](https://github.com/facebookresearch/DONERF)\n    > 最近围绕神经辐射场 (NeRFs) 的研究爆炸表明，在神经网络中隐式存储场景和照明信息具有巨大的潜力，例如，用于生成新的视图。然而，阻止 NeRF 广泛使用的一个主要限制是沿每个视图射线进行过多网络评估的计算成本过高，当针对当前设备上的实时渲染时需要数十 petaFLOPS。我们表明，当将局部样本放置在场景中的表面周围时，可以显着减少每个视图光线所需的样本数量。为此，我们提出了一个深度预言网络，它通过单个网络评估来预测每个视图光线的光线样本位置。我们表明，使用围绕对数离散和球面扭曲深度值的分类网络对于编码表面位置而不是直接估计深度至关重要。这些技术的结合产生了 
DONeRF，这是一种双网络设计，第一步是深度预言网络，以及用于光线累积的局部采样着色网络。通过我们的设计，与 NeRF 相比，我们将推理成本降低了 48 倍。使用现成的推理 API 与简单的计算内核相结合，我们率先在单个 GPU 上以交互式帧速率（每秒 15 帧，800x800）渲染基于光线追踪的神经表示。同时，由于我们专注于表面周围场景的重要部分，与 NeRF 相比，我们获得了相同或更好的质量。\n  - [FastNeRF：200FPS 的高保真神经渲染, ICCV2021](https://arxiv.org/abs/2103.10380) | [code]\n    > 最近关于神经辐射场 (NeRF) 的工作展示了如何使用神经网络对复杂的 3D 环境进行编码，这些环境可以从新颖的视角进行逼真的渲染。渲染这些图像对计算的要求非常高，最近的改进距离实现交互速率还有很长的路要走，即使在高端硬件上也是如此。受移动和混合现实设备场景的启发，我们提出了 FastNeRF，这是第一个基于 NeRF 的系统，能够在高端消费 GPU 上以 200Hz 渲染高保真逼真图像。我们方法的核心是受图形启发的分解，它允许 (i) 在空间中的每个位置紧凑地缓存深度辐射图，(ii) 使用光线方向有效地查询该图以估计渲染图像中的像素值。大量实验表明，所提出的方法比原始的 NeRF 算法快 3000 倍，并且比现有的加速 NeRF 的工作至少快一个数量级，同时保持视觉质量和可扩展性。\n  - [KiloNeRF：使用数千个微型 MLP 加速神经辐射场, ICCV2021](https://arxiv.org/abs/2103.13744) | [***``[code]``***](https://github.com/creiser/kilonerf/)\n    > NeRF 通过将神经辐射场拟合到 RGB 图像，以前所未有的质量合成场景的新视图。然而，NeRF 需要数百万次查询深度多层感知器 (MLP)，导致渲染时间变慢，即使在现代 GPU 上也是如此。在本文中，我们证明了通过使用数千个微型 MLP 而不是一个大型 MLP，实时渲染是可能的。在我们的设置中，每个单独的 MLP 只需要表示场景的一部分，因此可以使用更小、更快评估的 MLP。通过将这种分而治之的策略与进一步的优化相结合，与原始 NeRF 模型相比，渲染速度提高了三个数量级，而不会产生高昂的存储成本。此外，使用师生蒸馏进行培训，我们表明可以在不牺牲视觉质量的情况下实现这种加速。\n  - [用于实时渲染神经辐射场的 PlenOctrees, ICCV2021(oral)](https://alexyu.net/plenoctrees/) | [***``[code]``***](https://github.com/sxyu/volrend)\n    > 实时性能是通过将 NeRF 预先制成基于八叉树的辐射场（我们称为 PlenOctrees）来实现的。为了保留与视图相关的效果，例如镜面反射，我们建议通过封闭形式的球面基函数对外观进行编码。具体来说，我们表明可以训练 NeRFs 来预测辐射的球谐表示，将观察方向作为神经网络的输入。此外，我们表明我们的 PlenOctrees 可以直接优化以进一步最小化重建损失，这导致与竞争方法相同或更好的质量。我们进一步表明，这个八叉树优化步骤可用于加快训练时间，因为我们不再需要等待 NeRF 训练完全收敛。我们的实时神经渲染方法可能会支持新的应用，例如 6 自由度工业和产品可视化，以及下一代 AR/VR 系统。\n  - [用于高效神经渲染的体积基元混合, SIGGRAPH2021](https://arxiv.org/abs/2103.01954) | [code]\n    > 人类的实时渲染和动画是游戏、电影和远程呈现应用中的核心功能。现有方法有许多我们的工作旨在解决的缺点。三角形网格难以建模像头发这样的细结构，像神经体积这样的体积表示在合理的内存预算下分辨率太低，而像神经辐射场这样的高分辨率隐式表示在实时应用中使用太慢。我们提出了体积基元混合（MVP），一种用于渲染动态 3D 内容的表示，它结合了体积表示的完整性和基于基元的渲染的效率，例如，基于点或基于网格的方法。我们的方法通过利用具有反卷积架构的空间共享计算以及通过使用可以移动以仅覆盖被占用区域的体积基元来最小化空间空白区域中的计算来实现这一点。我们的参数化支持对应和跟踪约束的集成，同时对经典跟踪失败的区域具有鲁棒性，例如薄或半透明结构周围以及具有大拓扑可变性的区域。 MVP 
是一种混合体，它概括了基于体积和基元的表示。通过一系列广泛的实验，我们证明它继承了每种方法的优点，同时避免了它们的许多局限性。我们还将我们的方法与几种最先进的方法进行比较，并证明 MVP 在质量和运行时性能方面产生了卓越的结果。\n  - [光场网络：具有单次评估渲染的神经场景表示, NeurIPS2021(spotlight)](https://www.vincentsitzmann.com/lfns/) | [***``[code]``***](https://github.com/vsitzmann/light-field-networks)\n    > 从 2D 观察推断 3D 场景的表示是计算机图形学、计算机视觉和人工智能的基本问题。新兴的 3D 结构神经场景表示是一种有前途的 3D 场景理解方法。在这项工作中，我们提出了一种新的神经场景表示，光场网络或 LFN，它通过神经隐式表示在 360 度、四维光场中表示底层 3D 场景的几何形状和外观。渲染来自 LFN 的光线只需要*单个*网络评估，而 3D 结构化神经场景表示中的光线行进或基于体积的渲染器每条光线需要数百次评估。在简单场景的设置中，我们利用元学习来学习 LFN 的先验，从而能够从单个图像观察中进行多视图一致的光场重建。这导致时间和内存复杂性的显着降低，并实现了实时渲染。通过 LFN 存储 360 度光场的成本比 Lumigraph 等传统方法低两个数量级。利用神经隐式表示的分析可微性和光空间的新参数化，我们进一步证明了从 LFN 中提取稀疏深度图。\n  - [深度监督的 NeRF：更少的视图和更快的免费训练, CVPR2022](https://arxiv.org/abs/2107.02791) | [***``[code]``***](https://github.com/dunbar12138/DSNeRF)\n    > 当输入视图数量不足时，通常观察到的神经辐射场 (NeRF) 故障模式会拟合不正确的几何形状。一个潜在的原因是标准体积渲染不会强制执行大多数场景几何体由空白空间和不透明表面组成的约束。我们通过 DS-NeRF（深度监督神经辐射场）将上述假设形式化，这是一种利用现成的深度监督学习辐射场的损失。我们利用当前的 NeRF 管道需要具有已知相机姿势的图像这一事实，这些图像通常通过运行从运动结构 (SFM) 来估计。至关重要的是，SFM 还产生稀疏 3D 点，可在训练期间用作“免费”深度监督：我们添加损失以鼓励光线的终止深度分布匹配给定的 3D 关键点，并结合深度不确定性。 DS-NeRF 可以在训练视图更少的情况下渲染更好的图像，同时训练速度提高 2-3 倍。此外，我们表明我们的损失与最近提出的其他 NeRF 方法兼容，证明深度是一种廉价且易于消化的监督信号。最后，我们发现 DS-NeRF 可以支持其他类型的深度监督，例如扫描深度传感器和 RGB-D 重建输出。\n  - [直接体素网格优化：辐射场重建的超快速收敛, CVPR2022(oral)](https://arxiv.org/abs/2111.11215) | [***``[code]``***](https://github.com/sunset1995/DirectVoxGO)\n    > 我们提出了一种超快速收敛方法，用于从一组捕获具有已知姿势的场景的图像中重建每个场景的辐射场。这项任务通常应用于新颖的视图合成，最近因其最先进的质量和灵活性而被神经辐射场 (NeRF) 彻底改变。然而，对于单个场景，NeRF 及其变体需要很长的训练时间，从数小时到数天不等。相比之下，我们的方法实现了与 NeRF 相当的质量，并在不到 15 分钟的时间内使用单个 GPU 从头开始​​快速收敛。我们采用由用于场景几何的密度体素网格和具有浅层网络的特征体素网格组成的表示，用于复杂的依赖于视图的外观。使用显式和离散化的体积表示进行建模并不新鲜，但我们提出了两种简单但非平凡的技术，有助于快速收敛和高质量输出。首先，我们介绍了体素密度的激活后插值，它能够以较低的网格分辨率产生锐利的表面。其次，直接体素密度优化容易出现次优几何解决方案，因此我们通过强加几个先验来加强优化过程。最后，对五个内向基准的评估表明，我们的方法与 NeRF 的质量相匹配，甚至超过，但从头开始训练新场景只需要大约 15 分钟。\n  - [实时隐式映射和定位, ICCV2021](https://arxiv.org/abs/2103.12352) | [code]\n    > 我们首次展示了多层感知器 (MLP) 可以作为手持 RGB-D 相机的实时 SLAM 
系统中唯一的场景表示。我们的网络在没有先验数据的情况下进行实时操作训练，构建了一个密集的、特定于场景的隐式 3D 占用率和颜色模型，该模型也可立即用于跟踪。\n  - [Mip-NeRF：抗锯齿神经辐射场的多尺度表示, ICCV2021(oral)](https://jonbarron.info/mipnerf/) | [***``[code]``***](https://github.com/google/mipnerf)\n    > 神经辐射场 (NeRF) 使用的渲染过程对每个像素单条射线进行采样，因此在训练或测试图像以不同分辨率观察场景内容时，可能会产生过度模糊或混叠的渲染。对于 NeRF 来说，通过每个像素渲染多条光线来进行超级采样的直接解决方案是不切实际的，因为渲染每条光线需要查询多层感知器数百次。我们的解决方案，我们称之为“mip-NeRF”（à la“mipmap”），扩展了 NeRF 以在连续值的尺度上表示场景。通过有效地渲染抗锯齿圆锥截头体而不是射线，mip-NeRF 减少了令人反感的锯齿伪影并显着提高了 NeRF 表示精细细节的能力，同时速度比 NeRF 快 7%，大小仅为其一半。与 NeRF 相比，mip-NeRF 在使用 NeRF 呈现的数据集上将平均错误率降低了 17%，在我们呈现的该数据集的具有挑战性的多尺度变体上降低了 60%。 mip-NeRF 还能够在我们的多尺度数据集上匹配蛮力超采样 NeRF 的准确性，同时速度提高 22 倍。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/generalization.md",
    "content": "\n每周分类神经辐射场 - generalization ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n===========================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n  - [用于高质量视图合成的稀疏 RGB-D 图像的神经辐射场, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9999509) | [code]\n    > 最近提出的神经辐射场 (NeRF) 使用作为多层感知器 (MLP) 制定的连续函数来模拟 3D 场景的外观和几何形状。 这使得新视图的逼真合成成为可能，即使对于具有视图依赖外观的场景也是如此。 此后，许多后续工作以不同方式扩展了 NeRF。 然而，该方法的一个基本限制仍然是它需要从密集放置的视点捕获大量图像以进行高质量合成，并且当捕获的视图数量不足时，结果的质量会迅速下降。 为了解决这个问题，我们提出了一种新的基于 NeRF 的框架，该框架能够仅使用一组稀疏的 RGB-D 图像进行高质量的视图合成，这些图像可以在当前的消费设备上使用相机和 LiDAR 传感器轻松捕获。 首先，从捕获的 RGB-D 图像重建场景的几何代理。 然后可以使用重建场景的渲染以及精确的相机参数来预训练网络。 最后，使用少量真实捕获的图像对网络进行微调。 我们进一步引入了一个补丁鉴别器，以在微调期间在新颖的视图下监督网络，并在提高合成质量之前引入 3D 颜色。 我们证明了我们的方法可以从少至 6 个 RGB-D 图像生成 3D 场景的任意新颖视图。 大量实验表明，与现有的基于 NeRF 的方法相比，我们的方法有所改进，包括旨在减少输入图像数量的方法。\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n  - [图像生成器的扩散引导域自适应](https://arxiv.org/abs/2212.04473) | [code]\n    > 能否将文本到图像扩散模型用作训练目标，让 GAN 生成器适应另一个领域？ 在本文中，我们展示了无分类器指导可以用作评论家，并使生成器能够从大规模文本到图像扩散模型中提取知识。 生成器可以有效地转移到文本提示指示的新域中，而无需访问目标域中的真实样本。 我们通过大量实验证明了我们方法的有效性和可控性。 尽管没有经过训练来最小化 CLIP 损失，但我们的模型在短提示上获得了同样高的 CLIP 分数和显着降低的 FID，并且在长而复杂的提示上在定性和定量上都优于基线。 据我们所知，所提出的方法是首次尝试将大规模预训练扩散模型和蒸馏采样结合起来用于文本驱动的图像生成器域自适应，并提供了以前无法实现的质量。 此外，我们将我们的工作扩展到基于 3D 风格的生成器和 DreamBooth 指南。\n  - [NeRDi：以语言引导扩散作为一般图像先验的单视图 NeRF 合成](https://arxiv.org/abs/2212.03267) | [code]\n    > 2D 到 3D 重建是一个病态问题，但由于人类多年来积累的 3D 世界先验知识，因此擅长解决这个问题。 受此观察的驱动，我们提出了 NeRDi，这是一种单视图 NeRF 合成框架，具有来自 2D 扩散模型的一般图像先验。 将单视图重建制定为图像条件 3D 
生成问题，我们通过在输入视图约束下使用预训练图像扩散模型最小化其任意视图渲染上的扩散损失来优化 NeRF 表示。 我们利用现成的视觉语言模型，并引入两部分语言指导作为扩散模型的条件输入。 这本质上有助于提高多视图内容的一致性，因为它缩小了以单视图输入图像的语义和视觉特征为条件的一般图像先验范围。 此外，我们引入了基于估计深度图的几何损失，以正则化 NeRF 的底层 3D 几何。 DTU MVS 数据集上的实验结果表明，与在此数据集上训练的现有方法相比，我们的方法可以合成更高质量的新视图。 我们还展示了我们在野外图像的零样本 NeRF 合成中的普遍性。\n## Nov27 - Dec3, 2022\n  - [StegaNeRF：在神经辐射场中嵌入不可见信息](https://arxiv.org/abs/2212.01602) | [***``[code]``***](https://github.com/XGGNet/StegaNeRF)\n    > 神经渲染的最新进展意味着通过共享 NeRF 模型权重广泛分布视觉数据的未来。 然而，虽然常见的视觉数据（图像和视频）具有明确或巧妙地嵌入所有权或版权信息的标准方法，但对于新兴的 NeRF 格式，该问题仍未得到探索。 我们介绍了 StegaNeRF，这是一种在 NeRF 渲染中嵌入隐写信息的方法。 我们设计了一个优化框架，允许从 NeRF 渲染的图像中准确提取隐藏信息，同时保留其原始视觉质量。 我们在几个潜在的部署场景下对我们的方法进行了实验评估，并进一步讨论了通过我们的分析发现的见解。 StegaNeRF 标志着对将可定制、不可察觉和可恢复的信息灌输到 NeRF 渲染的新问题的初步探索，同时对渲染图像的影响最小。 项目页面：此 https 网址。\n  - [LatentSwap3D：3D 图像 GAN 的语义编辑](https://arxiv.org/abs/2212.01381) | [***``[code]``***](https://github.com/enisimsar/latentswap3d)\n    > 最近的 3D 感知 GAN 依靠体积渲染技术来解开物体的姿势和外观，事实上生成整个 3D 体积而不是从潜在代码生成单视图 2D 图像。 复杂的图像编辑任务可以在基于标准 2D 的 GAN（例如，StyleGAN 模型）中作为对潜在维度的操作来执行。 然而，据我们所知，对于 3D 感知 GAN 模型，仅部分探索了类似的属性。 这项工作旨在通过展示现有方法的局限性并提出 LatentSwap3D 来填补这一空白，LatentSwap3D 是一种与模型无关的方法，旨在在预训练的 3D 感知 GAN 的潜在空间中启用属性编辑。 我们首先根据随机森林分类器的特征重要性排名，确定控制目标属性的模型的潜在空间中最相关的维度。 然后，为了应用转换，我们将正在编辑的图像的前 K 个最相关的潜在维度与显示所需属性的图像交换。 尽管它很简单，但 LatentSwap3D 以一种分离的方式提供了卓越的语义编辑，并且在质量和数量上都优于其他方法。 我们在各种 3D 感知生成模型（如 pi-GAN、GIRAFFE、StyleSDF、MVCGAN、EG3D 和 VolumeGAN）以及各种数据集（如 FFHQ、AFHQ、Cats、MetFaces 和 CompCars）上展示了我们的语义编辑方法。 可以找到项目页面：\\url{this https URL}。\n  - [DiffRF：渲染引导的 3D 辐射场扩散](https://arxiv.org/abs/2212.01206) | [code]\n    > 我们介绍了 DiffRF，这是一种基于去噪扩散概率模型的 3D 辐射场合成新方法。 虽然现有的基于扩散的方法对图像、潜在代码或点云数据进行操作，但我们是第一个直接生成体积辐射场的方法。 为此，我们提出了一种直接在显式体素网格表示上运行的 3D 去噪模型。 然而，由于从一组姿势图像生成的辐射场可能不明确且包含伪影，因此获取地面真实辐射场样本并非易事。 我们通过将去噪公式与渲染损失配对来解决这一挑战，使我们的模型能够学习有利于良好图像质量的偏差先验，而不是试图复制像浮动伪影这样的拟合错误。 与 2D 扩散模型相比，我们的模型学习多视图一致先验，支持自由视图合成和准确的形状生成。 与 3D GAN 相比，我们基于扩散的方法自然可以在推理时启用条件生成，例如掩蔽完成或单视图 3D 合成。\n  - [SparseFusion：蒸馏 View-conditioned Diffusion 用于 3D 重建](https://arxiv.org/abs/2212.00792) | 
[code]\n    > 我们提出了 SparseFusion，这是一种稀疏视图 3D 重建方法，它统一了神经渲染和概率图像生成方面的最新进展。 现有方法通常建立在具有重新投影特征的神经渲染上，但无法生成看不见的区域或处理大视点变化下的不确定性。 替代方法将其视为（概率）2D 合成任务，虽然它们可以生成似是而非的 2D 图像，但它们无法推断出一致的底层 3D。 然而，我们发现 3D 一致性和概率图像生成之间的这种权衡并不需要存在。 事实上，我们表明几何一致性和生成推理可以在模式搜索行为中互补。 通过从视图条件潜在扩散模型中提取 3D 一致场景表示，我们能够恢复一个合理的 3D 表示，其渲染既准确又逼真。 我们评估了 CO3D 数据集中 51 个类别的方法，并表明它在失真和感知指标方面优于现有方法，用于稀疏视图新视图合成。\n  - [Score Jacobian Chaining：为 3D 生成提升预训练的 2D 扩散模型](https://arxiv.org/abs/2212.00774) | [code]\n    > 扩散模型学习预测梯度矢量场。 我们建议对学习到的梯度应用链式法则，并通过可微分渲染器的雅可比矩阵反向传播扩散模型的分数，我们将其实例化为体素辐射场。 此设置将多个摄像机视点的 2D 分数聚合为 3D 分数，并将预训练的 2D 模型重新用于 3D 数据生成。 我们确定了此应用程序中出现的分布不匹配的技术挑战，并提出了一种新颖的估计机制来解决它。 我们在几个现成的扩散图像生成模型上运行我们的算法，包括最近发布的在大规模 LAION 数据集上训练的稳定扩散。\n  - [3D-LDM：使用潜在扩散模型生成神经隐式 3D 形状](https://arxiv.org/abs/2212.00842) | [code]\n    > 扩散模型在图像生成方面显示出巨大的潜力，在生成多样性方面击败了 GAN，具有可比的图像质量。 然而，它们在 3D 形状上的应用仅限于点或体素表示，这些表示在实践中不能准确地表示 3D 表面。 我们提出了一种用于在自动解码器的潜在空间中运行的 3D 形状的神经隐式表示的扩散模型。 这使我们能够生成多样化和高质量的 3D 表面。 我们还表明，我们可以根据图像或文本调节我们的模型，以使用 CLIP 嵌入实现图像到 3D 生成和文本到 3D 生成。 此外，将噪声添加到现有形状的潜在代码中可以让我们探索形状变化。\n  - [用于交互式自由视点视频的高效神经辐射场, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555376) | [code]\n    > 本文旨在解决高效制作交互式自由视点视频的挑战。 最近的一些工作为神经辐射场配备了图像编码器，使它们能够跨场景进行泛化。 在处理动态场景时，他们可以简单地将每个视频帧视为一个单独的场景，并进行新颖的视图合成以生成自由视点视频。 但是，它们的渲染过程很慢，不能支持交互式应用程序。 一个主要因素是他们在推断辐射场时在空白空间中采样大量点。 我们提出了一种称为 ENeRF 的新颖场景表示，用于快速创建交互式自由视点视频。 具体来说，给定一帧的多视图图像，我们首先构建级联成本量来预测场景的粗略几何形状。 粗糙的几何体允许我们在场景表面附近采样几个点，从而显着提高渲染速度。 这个过程是完全可微的，使我们能够从 RGB 图像中共同学习深度预测和辐射场网络。 对多个基准的实验表明，我们的方法表现出有竞争力的性能，同时比以前的可推广辐射场方法至少快 60 倍。\n  - [一种轻松教授变形金刚多视图几何的方法](https://arxiv.org/abs/2211.15107) | [code]\n    > 变形金刚是强大的视觉学习者，这在很大程度上是因为它们明显缺乏手动指定的先验。 由于 3D 形状和视点的近乎无限可能的变化（需要灵活性）以及射影几何的精确性质（遵守刚性法则），这种灵活性在涉及多视图几何的任务中可能会出现问题。 为了解决这个难题，我们提出了一种“轻触”方法，引导视觉变形金刚学习多视图几何，但允许它们在需要时摆脱束缚。 我们通过使用极线来引导 Transformer 的交叉注意力图来实现这一点，惩罚极线外的注意力值并鼓励沿着这些线的更高注意力，因为它们包含几何上合理的匹配。 与以前的方法不同，我们的建议在测试时不需要任何相机姿势信息。 我们专注于姿势不变的对象实例检索，由于查询和检索图像之间的视点存在巨大差异，因此标准 Transformer 网络在这方面存在困难。 在实验上，我们的方法在对象检索方面优于最先进的方法，而且在测试时不需要姿势信息。\n  - 
[通过伪多视图优化的高保真 3D GAN 反演](https://arxiv.org/abs/2211.15662) | [***``[code]``***](https://github.com/jiaxinxie97/HFGI3D)\n    > 我们提出了一个高保真 3D 生成对抗网络 (GAN) 反演框架，可以在保留输入图像的特定细节的同时合成逼真的新视图。 由于高保真 3D 反演中的几何纹理权衡，高保真 3D GAN 反演本质上具有挑战性，其中对单个视图输入图像的过度拟合通常会在潜在优化期间损坏估计的几何形状。 为了解决这一挑战，我们提出了一种新的管道，它建立在具有可见性分析的伪多视图估计之上。 我们保留可见部分的原始纹理，并对被遮挡的部分使用生成先验。 广泛的实验表明，我们的方法比最先进的方法实现了有利的重建和新颖的视图合成质量，即使对于具有分布外纹理的图像也是如此。 拟议的管道还支持使用反向潜代码和 3D 感知纹理修改进行图像属性编辑。 我们的方法可以从单个图像进行高保真 3D 渲染，这有望用于 AI 生成的 3D 内容的各种应用。\n## Nov20 - Nov26, 2022\n  - [通过神经渲染的无监督连续语义适应](https://arxiv.org/abs/2211.13969) | [code]\n    > 越来越多的应用程序依赖于数据驱动模型，这些模型被部署用于跨一系列场景的感知任务。由于训练和部署数据之间的不匹配，在新场景上调整模型对于获得良好性能通常至关重要。在这项工作中，我们研究了语义分割任务的持续多场景适应，假设在部署期间没有可用的地面实况标签，并且应该保持先前场景的性能。我们建议通过融合分割模型的预测，然后使用视图一致的渲染语义标签作为伪标签来调整模型，为每个场景训练一个语义 NeRF 网络。通过与分割模型的联合训练，Semantic-NeRF 模型有效地实现了 2D-3D 知识迁移。此外，由于其紧凑的尺寸，它可以存储在长期记忆中，随后用于从任意角度渲染数据以减少遗忘。我们在 ScanNet 上评估了我们的方法，我们的方法优于基于体素的基线和最先进的无监督域适应方法。\n  - [ShadowNeuS：Shadow Ray 监督的神经 SDF 重建](https://arxiv.org/abs/2211.14086) | [code]\n    > 通过监督场景和多视图图像平面之间的相机光线，NeRF 为新视图合成任务重建神经场景表示。另一方面，光源和场景之间的阴影光线还有待考虑。因此，我们提出了一种新颖的阴影射线监督方案，可以优化沿射线的样本和射线位置。通过监督阴影光线，我们在多种光照条件下成功地从单视图纯阴影或 RGB 图像重建场景的神经 SDF。给定单视图二进制阴影，我们训练神经网络重建不受相机视线限制的完整场景。通过进一步模拟图像颜色和阴影光线之间的相关性，我们的技术还可以有效地扩展到 RGB 输入。我们将我们的方法与之前关于从单视图二值阴影或 RGB 图像重建形状的挑战性任务的工作进行比较，并观察到显着的改进。代码和数据将被发布。\n  - [Peekaboo：文本到图像扩散模型是零样本分割器](https://arxiv.org/abs/2211.13224) | [code]\n    > 最近基于扩散的生成模型与视觉语言模型相结合，能够根据自然语言提示创建逼真的图像。虽然这些模型是在大型互联网规模的数据集上训练的，但这种预训练模型并没有直接引入任何语义定位或基础。大多数当前的定位或接地方法都依赖于边界框或分割掩码形式的人工注释定位信息。例外是一些无监督方法，它们利用面向本地化的体系结构或损失函数，但它们需要单独训练。在这项工作中，我们探索了现成的扩散模型，在没有接触此类定位信息的情况下进行训练，如何能够在没有特定于分段的重新训练的情况下建立各种语义短语。引入了推理时间优化过程，能够生成以自然语言为条件的分割掩码。我们评估了我们在 Pascal VOC 数据集上进行无监督语义分割的提案 Peekaboo。此外，我们评估了 RefCOCO 数据集上的引用分割。总之，我们提出了第一个零样本、开放词汇、无监督（无定位信息）、语义基础技术，利用基于扩散的生成模型，无需重新训练。我们的代码将公开发布。\n  - [PANeRF：基于少样本输入的改进神经辐射场的伪视图增强](https://arxiv.org/abs/2211.12758) | [code]\n    > 近年来开发了神经辐射场 (NeRF) 方法，该技术在合成复杂场景的新视图方面具有广阔的应用前景。然而，NeRF 需要密集的输入视图，通常有数百个，以生成高质量图像。随着输入视图数量的减少，NeRF 
对未见视点的渲染质量趋于急剧下降。为了克服这一挑战，我们提出了 NeRF 的伪视图增强，该方案通过考虑少镜头输入的几何形状来扩展足够数量的数据。我们首先通过利用扩展的伪视图来初始化 NeRF 网络，这可以有效地减少渲染看不见的视图时的不确定性。随后，我们通过使用包含精确几何和颜色信息的稀疏视图输入来微调网络。通过各种设置下的实验，我们验证了我们的模型忠实地合成了高质量的新视图图像，并且优于现有的多视图数据集方法。\n  - [零 NeRF：零重叠注册](https://arxiv.org/abs/2211.12544) | [code]\n    > 我们提出了零 NeRF，这是一种投影表面配准方法，据我们所知，它提供了第一个能够在具有最小或零视觉对应的场景表示之间对齐的通用解决方案。为此，我们加强了部分和完整重建的可见表面之间的一致性，这使我们能够约束被遮挡的几何体。我们使用 NeRF 作为我们的表面表示和 NeRF 渲染管道来执行此对齐。为了证明我们方法的有效性，我们从对面的现实世界场景中注册了无法使用现有方法准确注册的无限小重叠，并将这些结果与广泛使用的注册方法进行了比较。\n  - [SPARF：来自稀疏和嘈杂姿势的神经辐射场](https://arxiv.org/abs/2211.11738) | [code]\n    > 神经辐射场 (NeRF) 最近已成为合成逼真新颖视图的有力代表。虽然表现出令人印象深刻的性能，但它依赖于具有高精度相机姿势的密集输入视图的可用性，从而限制了其在现实场景中的应用。在这项工作中，我们引入了稀疏姿态调整辐射场 (SPARF)，以应对仅在少量宽基线输入图像（低至 3 张）且相机姿态嘈杂的情况下进行新视图合成的挑战。我们的方法利用多视图几何约束来共同学习 NeRF 并改进相机姿势。通过依赖于输入视图之间提取的像素匹配，我们的多视图对应目标强制优化场景和相机姿势以收敛到全局和几何精确的解决方案。我们的深度一致性损失进一步鼓励重建的场景从任何角度来看都是一致的。我们的方法在多个具有挑战性的数据集的稀疏视图机制中设置了一个新的技术状态。\n## Nov13 - Nov19, 2022\n  - [Magic3D：高分辨率文本到 3D 内容创建](https://arxiv.org/abs/2211.10440) | [code]\n    > DreamFusion 最近展示了预训练的文本到图像扩散模型在优化神经辐射场 (NeRF) 方面的实用性，实现了卓越的文本到 3D 合成结果。然而，该方法有两个固有的局限性：(a) NeRF 的优化极其缓慢和 (b) NeRF 上的低分辨率图像空间监督，导致处理时间长的低质量 3D 模型。在本文中，我们通过使用两阶段优化框架来解决这些限制。首先，我们使用低分辨率扩散先验获得粗糙模型，并使用稀疏 3D 哈希网格结构进行加速。使用粗略表示作为初始化，我们进一步优化了带纹理的 3D 网格模型，该模型具有与高分辨率潜在扩散模型交互的高效可微分渲染器。我们的方法被称为 Magic3D，可以在 40 分钟内创建高质量的 3D 网格模型，比 DreamFusion 快 2 倍（据报道平均需要 1.5 小时），同时还实现了更高的分辨率。用户研究表明 61.7% 的评分者更喜欢我们的方法而不是 DreamFusion。连同图像调节生成功能，我们为用户提供了控制 3D 合成的新方法，为各种创意应用开辟了新途径。\n  - [RenderDiffusion：用于 3D 重建、修复和生成的图像扩散](https://arxiv.org/abs/2211.09869) | [code]\n    > 扩散模型目前在条件和无条件图像生成方面都达到了最先进的性能。然而，到目前为止，图像扩散模型不支持 3D 理解所需的任务，例如视图一致的 3D 生成或单视图对象重建。在本文中，我们将 RenderDiffusion 作为第一个用于 3D 生成和推理的扩散模型，可以仅使用单眼 2D 监督进行训练。我们方法的核心是一种新颖的图像去噪架构，它在每个去噪步骤中生成并渲染场景的中间三维表示。这在扩散过程中强制实施了一个强大的归纳结构，为我们提供了一个 3D 一致的表示，同时只需要 2D 监督。可以从任何视点渲染生成的 3D 表示。我们在 ShapeNet 和 Clevr 数据集上评估 RenderDiffusion，并展示了在生成 3D 场景和从 2D 图像推断 3D 场景方面的竞争性能。此外，我们基于扩散的方法允许我们使用 2D 修复来编辑 3D 场景。我们相信，我们的工作有望在对大量图像集进行训练时实现大规模的完整 3D 生成，从而避免对大型 3D 模型集进行监督的需要。\n## Nov6 
- Nov12, 2022\n  - [3D常见宠物：现实生活中可变形类别的动态新视角合成](https://arxiv.org/abs/2211.03889) | [code]\n    > 从稀疏视图中获得对象的逼真重建本质上是模棱两可的，只能通过学习合适的重建先验来实现。早期关于稀疏刚性对象重建的工作成功地从大型数据集（如 CO3D）中学习了这样的先验。在本文中，我们将这种方法扩展到动态对象。我们以猫和狗作为代表性示例，并介绍 Common Pets in 3D (CoP3D)，这是一组众包视频，展示了大约 4,200 种不同的宠物。 CoP3D 是首批用于“野外”非刚性 3D 重建基准测试的大型数据集之一。我们还提出了 Tracker-NeRF，这是一种从我们的数据集中学习 4D 重建的方法。在测试时，给定一个看不见的物体的少量视频帧，Tracker-NeRF 预测其 3D 点的轨迹并生成新视图、插值视点和时间。 CoP3D 的结果揭示了比现有基线更好的非刚性新视图合成性能。\n## Oct30 - Nov5, 2022\n  - [用于机器人操纵的神经抓取距离场](https://arxiv.org/abs/2211.02647) | [code]\n    > 我们将抓取学习制定为一个神经场，并提出神经抓取距离场 (NGDF)。这里，输入是机器人末端执行器的 6D 姿态，输出是到物体有效抓握的连续流形的距离。与预测一组离散候选抓握的当前方法相比，基于距离的 NGDF 表示很容易被解释为成本，并且最小化该成本会产生成功的抓握姿势。这种抓取距离成本可以直接合并到轨迹优化器中，与其他成本（如轨迹平滑度和碰撞避免）进行联合优化。在优化过程中，随着各种成本的平衡和最小化，抓取目标可以平滑变化，因为学习到的抓取域是连续的。在使用 Franka 手臂的模拟基准测试中，我们发现使用 NGDF 的联合抓取和规划比基线执行成功率高出 63%，同时泛化到看不见的查询姿势和看不见的物体形状。项目页面：此 https 网址。\n## Oct23 - Oct29, 2022\n  - [Compressing Explicit Voxel Grid Representations：快速的 NeRFs 也变小了](https://arxiv.org/abs/2210.12782) | [code]\n    > 由于其固有的紧凑性，NeRF 彻底改变了逐场景辐射场重建的世界。 NeRF 的主要限制之一是它们在训练和推理时的渲染速度都很慢。最近的研究重点是优化表示场景的显式体素网格 (EVG)，它可以与神经网络配对以学习辐射场。这种方法显着提高了训练和推理时间的速度，但代价是占用大量内存。在这项工作中，我们提出了 Re:NeRF，这是一种专门针对 EVG-NeRF 可压缩性的方法，旨在减少 NeRF 模型的内存存储，同时保持相当的性能。我们在四种流行的基准测试中使用三种不同的 EVG-NeRF 架构对我们的方法进行了基准测试，展示了 Re:NeRF 广泛的可用性和有效性。\n## Oct16 - Oct22, 2022\n  - [TANGO：通过光照分解实现文本驱动的真实感和强大的 3D 风格化, NeurIPS2022](https://arxiv.org/abs/2210.11277) | [***``[code]``***](https://cyw-3d.github.io/tango/)\n    > 通过程式化创建 3D 内容是计算机视觉和图形研究中一个有前途但具有挑战性的问题。在这项工作中，我们专注于对任意拓扑的给定表面网格的逼真外观渲染进行风格化。受最近对比语言-图像预训练 (CLIP) 模型的跨模态监督激增的启发，我们提出了 TANGO，它根据文本提示以逼真的方式转移给定 3D 形状的外观风格。从技术上讲，我们建议将外观风格分解为空间变化的双向反射率分布函数、局部几何变化和照明条件，通过基于球形高斯的可微分渲染器通过监督 CLIP 损失来共同优化它们。因此，TANGO 通过自动预测​​反射效果来实现逼真的 3D 风格转换，即使是对于裸露的、低质量的网格，也无需对特定任务的数据集进行培训。大量实验表明，TANGO 在逼真的质量、3D 几何的一致性和对低质量网格进行样式化时的鲁棒性方面优于现有的文本驱动 3D 样式转换方法。我们的代码和结果可在我们的项目网页 https URL 上找到。\n  - [坐标并不孤单——码本先验有助于隐式神经 3D 表示, NeurIPS2022](https://arxiv.org/abs/2210.11170) | [code]\n    > 隐式神经 3D 
表示在表面或场景重建和新颖的视图合成中取得了令人印象深刻的结果，这通常使用基于坐标的多层感知器 (MLP) 来学习连续的场景表示。然而，现有的方法，例如神经辐射场 (NeRF) 及其变体，通常需要密集的输入视图（即 50-150）才能获得不错的结果。为了重温对大量校准图像的过度依赖并丰富基于坐标的特征表示，我们探索将先验信息注入基于坐标的网络，并引入一种新颖的基于坐标的模型 CoCo-INR，用于隐式神经 3D 表示。我们方法的核心是两个注意力模块：码本注意力和坐标注意力。前者从先验码本中提取包含丰富几何和外观信息的有用原型，后者将这些先验信息传播到每个坐标中，并丰富其对场景或物体表面的特征表示。在先验信息的帮助下，与使用较少可用校准图像的当前方法相比，我们的方法可以渲染具有更逼真外观和几何形状的 3D 视图。在包括 DTU 和 BlendedMVS 在内的各种场景重建数据集以及完整的 3D 头部重建数据集 H3DS 上的实验证明了我们提出的方法在较少输入视图下的鲁棒性和精细的细节保留能力。\n## Oct9 - Oct15, 2022\n  - [AniFaceGAN：用于视频头像的动画 3D 感知人脸图像生成, NeurIPS2022](https://arxiv.org/abs/2210.06465) | [***``[code]``***](https://yuewuhkust.github.io/AniFaceGAN/files/github_icon.jpeg)\n    > 尽管 2D 生成模型在人脸图像生成和动画方面取得了长足进步，但它们在从不同相机视点渲染图像时经常会遇到不希望的伪影，例如 3D 不一致。这可以防止他们合成与真实动画无法区分的视频动画。最近，3D 感知 GAN 扩展了 2D GAN，通过利用 3D 场景表示来明确解开相机姿势。这些方法可以很好地保持生成图像在不同视图中的 3D 一致性，但它们无法实现对其他属性的细粒度控制，其中面部表情控制可以说是面部动画最有用和最理想的方法。在本文中，我们提出了一种可动画的 3D 感知 GAN，用于多视图一致的人脸动画生成。关键思想是将 3D-aware GAN 的 3D 表示分解为模板字段和变形字段，其中前者用规范表达式表示不同的身份，后者表征每个身份的表达变化。为了通过变形实现对面部表情的有意义的控制，我们在 3D 感知 GAN 的对抗训练期间提出了生成器和参数 3D 面部模型之间的 3D 级模仿学习方案。这有助于我们的方法实现具有强烈视觉 3D 一致性的高质量动画人脸图像生成，即使仅使用非结构化 2D 图像进行训练。广泛的实验证明了我们优于以前的工作的性能。项目页面：此 https 网址\n  - [LION：用于 3D 形状生成的潜在点扩散模型, NeurIPS2022](https://arxiv.org/abs/2210.06978) | [***``[code]``***](https://nv-tlabs.github.io/LION)\n    > 去噪扩散模型 (DDM) 在 3D 点云合成中显示出可喜的结果。为了推进 3D DDM 并使它们对数字艺术家有用，我们需要 (i) 高生成质量，(ii) 操作和应用的灵活性，例如条件合成和形状插值，以及 (iii) 输出平滑表面或网格的能力。为此，我们介绍了用于 3D 形状生成的分层潜在点扩散模型 (LION)。 LION 被设置为具有分层潜在空间的变分自动编码器 (VAE)，该分层潜在空间将全局形状潜在表示与点结构潜在空间相结合。对于生成，我们在这些潜在空间中训练两个分层 DDM。与直接在点云上运行的 DDM 相比，分层 VAE 方法提高了性能，而点结构的潜在模型仍然非常适合基于 DDM 的建模。在实验上，LION 在多个 ShapeNet 基准上实现了最先进的生成性能。此外，我们的 VAE 框架使我们能够轻松地将 LION 用于不同的相关任务：LION 在多模态形状去噪和体素条件合成方面表现出色，并且可以适应文本和图像驱动的 3D 生成。我们还演示了形状自动编码和潜在形状插值，并使用现代表面重建技术增强了 LION 以生成平滑的 3D 网格。我们希望 LION 凭借其高质量的生成、灵活性和表面重建功能，为处理 3D 形状的艺术家提供强大的工具。项目页面和代码：此 https 网址。\n  - [CLIP-Fields：机器人记忆的弱监督语义场](https://mahis.life/clip-fields/) | [code]\n    > 我们提出了 
CLIP-Fields，这是一种隐式场景模型，可以在没有直接人工监督的情况下进行训练。该模型学习从空间位置到语义嵌入向量的映射。然后，该映射可用于各种任务，例如分割、实例识别、空间语义搜索和视图定位。最重要的是，映射可以通过仅来自网络图像和网络文本训练模型（如 CLIP、Detic 和 Sentence-BERT）的监督进行训练。与 Mask-RCNN 之类的基线相比，我们的方法在 HM3D 数据集上的少量实例识别或语义分割方面表现优于仅一小部分示例。最后，我们展示了使用 CLIP-Fields 作为场景记忆，机器人可以在现实环境中执行语义导航。我们的代码和演示可在此处获得：https://mahis.life/clip-fields/\n## Oct2 - Oct8, 2022\n  - [用于新视图合成的自我改进多平面到层图像, WACV2023](https://samsunglabs.github.io/MLI/) | [***``[code]``***](https://github.com/SamsungLabs/MLI)\n    > 我们提出了一种用于轻量级小说视图合成的新方法，该方法可以推广到任意前向场景。最近的方法在计算上很昂贵，需要逐场景优化，或者产生内存昂贵的表示。我们首先用一组正面平行的半透明平面来表示场景，然后以端到端的方式将它们转换为可变形层。此外，我们采用前馈细化程序，通过聚合来自输入视图的信息来纠正估计的表示。我们的方法在处理新场景时不需要微调，并且可以不受限制地处理任意数量的视图。实验结果表明，我们的方法在常用指标和人工评估方面超过了最近的模型，在推理速度和推断分层几何的紧凑性方面具有显着优势，请参阅此 https URL\n  - [用于隐式场景重建的不确定性驱动的主动视觉](https://arxiv.org/abs/2210.00978) | [code]\n    > 多视图隐式场景重建方法由于能够表示复杂的场景细节而变得越来越流行。最近的努力致力于改进输入信息的表示并减少获得高质量重建所需的视图数量。然而，也许令人惊讶的是，关于选择哪些视图以最大限度地提高场景理解的研究在很大程度上仍未得到探索。我们提出了一种用于隐式场景重建的不确定性驱动的主动视觉方法，该方法利用体积渲染在场景中累积的占用不确定性来选择下一个要获取的视图。为此，我们开发了一种基于占用的重建方法，该方法使用 2D 或 3D 监督准确地表示场景。我们在 ABC 数据集和野外 CO3D 数据集上评估了我们提出的方法，并表明：（1）我们能够获得高质量的最先进的占用重建； (2) 我们的视角条件不确定性定义有效地推动了下一个最佳视图选择的改进，并且优于强大的基线方法； (3) 我们可以通过对视图选择候选执行基于梯度的搜索来进一步提高形状理解。总体而言，我们的结果突出了视图选择对于隐式场景重建的重要性，使其成为进一步探索的有希望的途径。\n  - [SinGRAV：从单个自然场景中学习生成辐射量](https://arxiv.org/abs/2210.01202) | [code]\n    > 我们提出了一个用于一般自然场景的 3D 生成模型。由于缺乏表征目标场景的必要 3D 数据量，我们建议从单个场景中学习。我们的关键见解是，一个自然场景通常包含多个组成部分，其几何、纹理和空间排列遵循一些清晰的模式，但在同一场景中的不同区域仍然表现出丰富的变化。这表明将生成模型的学习本地化在大量局部区域上。因此，我们利用具有空间局部性偏差的多尺度卷积网络来学习单个场景中多个尺度的局部区域的统计信息。与现有方法相比，我们的学习设置绕过了从许多同质 3D 场景中收集数据以学习共同特征的需要。我们创造了我们的方法 SinGRAV，用于从单个自然场景中学习生成辐射体积。我们展示了 SinGRAV 从单个场景生成合理多样的变化的能力，SingGRAV 相对于最先进的生成神经场景方法的优点，以及 SinGRAV 在各种应用中的多功能性，涵盖 3D 场景编辑、合成和动画。代码和数据将被发布以促进进一步的研究。\n  - [IntrinsicNeRF：学习用于可编辑新视图合成的内在神经辐射场](https://arxiv.org/abs/2210.00647) | [***``[code]``***](https://github.com/zju3dv/IntrinsicNeRF)\n    > 我们提出了被称为 IntrinsicNeRF 的内在神经辐射场，它将内在分解引入到基于 NeRF 的~\\cite{mildenhall2020nerf} 神经渲染方法中，并且可以在现有的逆向渲染结合神经渲染方法的同时在房间规模的场景中执行可编辑的新视图合成~ 
\\cite{zhang2021physg, zhang2022modeling} 只能用于特定对象的场景。鉴于内在分解本质上是一个模棱两可且约束不足的逆问题，我们提出了一种新颖的距离感知点采样和自适应反射率迭代聚类优化方法，该方法使具有传统内在分解约束的 IntrinsicNeRF 能够以无监督的方式进行训练，从而在时间上一致的内在分解结果。为了解决场景中相似反射率的不同相邻实例被错误地聚集在一起的问题，我们进一步提出了一种从粗到细优化的层次聚类方法，以获得快速的层次索引表示。它支持引人注目的实时增强现实应用，例如场景重新着色、材质编辑和照明变化。 Blender 对象和副本场景的大量实验表明，即使对于具有挑战性的序列，我们也可以获得高质量、一致的内在分解结果和高保真新视图合成。项目网页上提供了代码和数据：此 https 网址。\n## Sep25 - Oct1, 2022\n  - [通过对极约束不带姿势相机的结构感知 NeRF](https://arxiv.org/abs/2210.00183) | [***``[code]``***](https://github.com/XTU-PR-LAB/SaNerf)\n    > 用于逼真的新视图合成的神经辐射场 (NeRF) 需要通过运动结构 (SfM) 方法预先获取相机姿势。这种两阶段策略使用不方便并且会降低性能，因为姿势提取中的错误会传播到视图合成。我们将姿势提取和视图合成集成到一个端到端的过程中，这样它们就可以相互受益。为了训练 NeRF 模型，只给出了 RGB 图像，没有预先知道的相机姿势。相机位姿是通过极线约束获得的，其中不同视图中的相同特征具有根据提取的位姿从本地相机坐标转换而来的相同世界坐标。对极约束与像素颜色约束联合优化。姿势由基于 CNN 的深度网络表示，其输入是相关帧。这种联合优化使 NeRF 能够感知场景的结构，从而提高泛化性能。在各种场景上进行的大量实验证明了所提出方法的有效性。此 https 网址提供了代码。\n  - [使用几何感知鉴别器改进 3D 感知图像合成, NeurIPS2022](https://arxiv.org/abs/2209.15637) | [***``[code]``***](https://github.com/vivianszf/geod)\n    > 3D 感知图像合成旨在学习一个生成模型，该模型可以渲染逼真的 2D 图像，同时捕捉体面的底层 3D 形状。一种流行的解决方案是采用生成对抗网络 (GAN)，并用 3D 渲染器替换生成器，其中通常使用带有神经辐射场 (NeRF) 的体积渲染。尽管合成质量有所提高，但现有方法无法获得适度的 3D 形状。我们认为，考虑到 GAN 公式中的两人游戏，仅使生成器具有 3D 感知能力是不够的。换句话说，取代生成机制只能提供生成 3D 感知图像的能力，但不能保证，因为生成器的监督主要来自鉴别器。为了解决这个问题，我们提出 GeoD 通过学习几何感知鉴别器来改进 3D 感知 GAN。具体来说，除了从 2D 图像空间中区分真假样本外，还要求鉴别器从输入中获取几何信息，然后将其用作生成器的指导。这种简单而有效的设计有助于学习更准确的 3D 形状。对各种生成器架构和训练数据集的广泛实验验证了 GeoD 优于最先进的替代方案。此外，我们的方法被注册为一个通用框架，这样一个更有能力的鉴别器（即，除了域分类和几何提取之外，还有第三个新的视图合成任务）可以进一步帮助生成器获得更好的多视图一致性。\n  - [MonoNeuralFusion：具有几何先验的在线单目神经 3D 重建](https://arxiv.org/abs/2209.15153) | [code]\n    > 从单目视频重建高保真 3D 场景仍然具有挑战性，特别是对于完整和细粒度的几何重建。先前具有神经隐式表示的 3D 重建方法已显示出完整场景重建的有希望的能力，但它们的结果通常过于平滑且缺乏足够的几何细节。本文介绍了一种新颖的神经隐式场景表示法，用于从单目视频中进行高保真在线 3D 场景重建的体积渲染。对于细粒度重建，我们的关键见解是将几何先验纳入神经隐式场景表示和神经体绘制，从而产生基于体绘制优化的有效几何学习机制。受益于此，我们提出了 MonoNeuralFusion 来从单目视频执行在线神经 3D 重建，从而在动态 3D 单目扫描期间有效地生成和优化 3D 场景几何图形。与最先进方法的广泛比较表明，我们的 MonoNeuralFusion 在数量和质量上始终生成更好的完整和细粒度的重建结果。\n  - [SymmNeRF：学习探索单视图视图合成的对称先验, 
ACCV2022](https://arxiv.org/abs/2209.14819) | [***``[code]``***](https://github.com/xingyi-li/SymmNeRF)\n    > 我们研究了从单个图像中对对象进行新视图合成的问题。现有方法已经证明了单视图视图合成的潜力。但是，它们仍然无法恢复精细的外观细节，尤其是在自闭区域。这是因为单个视图仅提供有限的信息。我们观察到人造物体通常表现出对称的外观，这会引入额外的先验知识。受此启发，我们研究了将对称性显式嵌入场景表示的潜在性能增益。在本文中，我们提出了 SymmNeRF，这是一种基于神经辐射场 (NeRF) 的框架，在引入对称先验的情况下结合了局部和全局条件。特别是，SymmNeRF 将像素对齐的图像特征和相应的对称特征作为 NeRF 的额外输入，其参数由超网络生成。由于参数以图像编码的潜在代码为条件，因此 SymmNeRF 与场景无关，可以推广到新场景。对合成数据集和真实世界数据集的实验表明，SymmNeRF 可以合成具有更多细节的新颖视图，而不管姿势变换如何，并且在应用于看不见的对象时表现出良好的泛化性。代码位于：此 https URL。\n  - [360FusionNeRF：具有联合引导的全景神经辐射场](https://arxiv.org/abs/2209.14265) | [code]\n    > 我们提出了一种基于神经辐射场 (NeRF) 从单个 360 度全景图像合成新视图的方法。类似设置中的先前研究依赖于多层感知的邻域插值能力来完成由遮挡引起的缺失区域，这导致其预测中的伪影。我们提出了 360FusionNeRF，这是一个半监督学习框架，我们在其中引入几何监督和语义一致性来指导渐进式训练过程。首先，将输入图像重新投影到 360 度图像，并在其他相机位置提取辅助深度图。除了 NeRF 颜色指导之外，深度监督还改进了合成视图的几何形状。此外，我们引入了语义一致性损失，鼓励对新视图进行逼真的渲染。我们使用预训练的视觉编码器（例如 CLIP）提取这些语义特征，CLIP 是一种视觉转换器，通过自然语言监督从网络挖掘出的数亿张不同的 2D 照片进行训练。实验表明，我们提出的方法可以在保留场景特征的同时产生未观察到的区域的合理完成。在跨各种场景进行训练时，360FusionNeRF 在转移到合成 Structured3D 数据集（PSNR~5%，SSIM~3% LPIPS~13%）、真实世界的 Matterport3D 数据集（PSNR~3%）时始终保持最先进的性能, SSIM~3% LPIPS~9%) 和 Replica360 数据集 (PSNR~8%, SSIM~2% LPIPS~18%)。\n## Sep18 - Sep24, 2022\n  - [PNeRF：用于不确定 3D 视觉映射的概率神经场景表示, ICRA2023](https://arxiv.org/abs/2209.11677) | [code]\n    > 最近，神经场景表示在视觉上表示 3D 场景提供了非常令人印象深刻的结果，但是，它们的研究和进展主要局限于计算机图形中虚拟模型的可视化或计算机视觉中的场景重建，而没有明确考虑传感器和姿势的不确定性。然而，在机器人应用中使用这种新颖的场景表示需要考虑神经图中的这种不确定性。因此，本文的目的是提出一种用不确定的训练数据训练 {\\em 概率神经场景表示} 的新方法，该方法可以将这些表示包含在机器人应用程序中。使用相机或深度传感器获取图像包含固有的不确定性，此外，用于学习 3D 模型的相机姿势也不完善。如果将这些测量值用于训练而不考虑其不确定性，则生成的模型不是最优的，并且生成的场景表示可能包含诸如模糊和几何不均匀等伪影。在这项工作中，通过关注以概率方式使用不确定信息进行训练，研究了将不确定性整合到学习过程中的问题。所提出的方法涉及使用不确定性项显式增加训练似然性，使得网络的学习概率分布相对于训练不确定性最小化。将会显示，除了更精确和一致的几何形状之外，这会导致更准确的图像渲染质量。已经对合成数据集和真实数据集进行了验证，表明所提出的方法优于最先进的方法。结果表明，即使在训练数据有限的情况下，所提出的方法也能够呈现新颖的高质量视图。\n  - [ActiveNeRF：通过不确定性估计学习在哪里看](https://arxiv.org/abs/2209.08546) | [***``[code]``***](https://github.com/LeapLabTHU/ActiveNeRF)\n    > 最近，神经辐射场 (NeRF) 在重建 3D 场景和从一组稀疏的 2D 
图像合成新视图方面显示出令人鼓舞的性能。尽管有效，但 NeRF 的性能很大程度上受训练样本质量的影响。由于场景中的姿势图像有限，NeRF 无法很好地泛化到新颖的视图，并且可能会在未观察到的区域中崩溃为琐碎的解决方案。这使得 NeRF 在资源受限的情况下变得不切实际。在本文中，我们提出了一种新颖的学习框架 ActiveNeRF，旨在对输入预算受限的 3D 场景进行建模。具体来说，我们首先将不确定性估计纳入 NeRF 模型，以确保在少量观察下的稳健性，并提供对 NeRF 如何理解场景的解释。在此基础上，我们建议使用基于主动学习方案的新捕获样本来补充现有的训练集。通过评估给定新输入的不确定性减少情况，我们选择带来最多信息增益的样本。通过这种方式，可以用最少的额外资源提高新视图合成的质量。大量实验验证了我们的模型在真实场景和合成场景上的性能，尤其是在训练数据较少的情况下。代码将在 \\url{this https URL} 发布。\n## Sep11 - Sep17, 2022\n  - [学习用于视图合成的统一 3D 点云](https://arxiv.org/abs/2209.05013) | [code]\n    > 基于 3D 点云表示的视图合成方法已证明是有效的。然而，现有方法通常仅从单个源视图合成新视图，并且将它们泛化以处理多个源视图以追求更高的重建质量并非易事。在本文中，我们提出了一种新的基于深度学习的视图合成范式，它从不同的源视图中学习统一的 3D 点云。具体来说，我们首先通过根据深度图将源视图投影到 3D 空间来构建子点云。然后，我们通过自适应融合子点云联合上定义的局部邻域中的点来学习统一的 3D 点云。此外，我们还提出了一个 3D 几何引导图像恢复模块来填充孔洞并恢复渲染新视图的高频细节。三个基准数据集的实验结果表明，我们的方法在数量上和视觉上都在很大程度上优于最先进的视图合成方法。\n## Sep4 - Sep10, 2022\n## Aug28 - Sep3, 2022\n  - [Dual-Space NeRF：在不同空间中学习动画化身和场景照明, 3DV2022](https://arxiv.org/abs/2208.14851) | [code]\n    > 在规范空间中对人体进行建模是捕捉和动画的常见做法。但是当涉及到神经辐射场 (NeRF) 时，仅仅在标准空间中学习一个静态的 NeRF 是不够的，因为即使场景照明是恒定的，当人移动时身体的照明也会发生变化。以前的方法通过学习每帧嵌入来缓解光照的不一致性，但这种操作并不能推广到看不见的姿势。鉴于光照条件在世界空间中是静态的，而人体在规范空间中是一致的，我们提出了一种双空间 NeRF，它在两个独立的空间中使用两个 MLP 对场景光照和人体进行建模。为了弥合这两个空间，以前的方法主要依赖于线性混合蒙皮 (LBS) 算法。然而，动态神经领域的 LBS 的混合权重是难以处理的，因此通常用另一个 MLP 来记忆，这不能推广到新的姿势。尽管可以借用 SMPL 等参数网格的混合权重，但插值操作会引入更多伪影。在本文中，我们建议使用重心映射，它可以直接泛化到看不见的姿势，并且出人意料地取得了比具有神经混合权重的 LBS 更好的结果。 Human3.6M 和 ZJU-MoCap 数据集的定量和定性结果显示了我们方法的有效性。\n## Aug21 - Aug27, 2022\n  - [DreamBooth：为主题驱动生成微调文本到图像的扩散模型](https://dreambooth.github.io/) | [code]\n    > 大型文本到图像模型在人工智能的演进中实现了显着的飞跃，能够从给定的文本提示中对图像进行高质量和多样化的合成。然而，这些模型缺乏模仿给定参考集中对象的外观并在不同上下文中合成它们的新颖再现的能力。在这项工作中，我们提出了一种“个性化”文本到图像扩散模型的新方法（专门针对用户的需求）。给定主题的几张图像作为输入，我们微调预训练的文本到图像模型（Imagen，尽管我们的方法不限于特定模型），以便它学会将唯一标识符与该特定主题绑定.一旦对象被嵌入模型的输出域中，唯一标识符就可以用于合成在不同场景中情境化的对象的完全新颖的真实感图像。通过利用嵌入在模型中的语义先验和新的自生类特定先验保存损失，我们的技术能够在参考图像中没有出现的不同场景、姿势、视图和照明条件下合成主体。我们将我们的技术应用于几个以前无懈可击的任务，包括主题重新上下文化、文本引导视图合成、外观修改和艺术渲染（同时保留主题的关键特征）。项目页面：此 https 网址\n  - 
[E-NeRF：来自移动事件相机的神经辐射场](https://arxiv.org/abs/2208.11300) | [code]\n    > 从理想图像估计神经辐射场 (NeRFs) 已在计算机视觉领域得到广泛研究。大多数方法假设最佳照明和缓慢的相机运动。这些假设在机器人应用中经常被违反，其中图像包含运动模糊并且场景可能没有合适的照明。这可能会导致下游任务（例如场景的导航、检查或可视化）出现重大问题。为了缓解这些问题，我们提出了 E-NeRF，这是第一种从快速移动的事件摄像机中以 NeRF 形式估计体积场景表示的方法。我们的方法可以在非常快速的运动和高动态范围条件下恢复 NeRF，在这种情况下，基于帧的方法会失败。我们展示了仅通过提供事件流作为输入来渲染高质量帧是可能的。此外，通过结合事件和帧，我们可以估计在严重运动模糊下比最先进的方法质量更高的 NeRF。我们还表明，在只有很少的输入视图可用的情况下，结合事件和帧可以克服 NeRF 估计的失败情况，而无需额外的正则化。\n  - [FurryGAN：高质量的前景感知图像合成, ECCV2022](https://jeongminb.github.io/FurryGAN/) | [***``[code]``***](https://jeongminb.github.io/FurryGAN/)\n    > 前景感知图像合成旨在生成图像及其前景蒙版。一种常见的方法是将图像公式化为前景图像和背景图像的蒙版混合。这是一个具有挑战性的问题，因为它很容易达到一个简单的解决方案，即任一图像压倒另一个图像，即蒙版完全满或空，前景和背景没有有意义地分离。我们展示了 FurryGAN 的三个关键组件：1）将前景图像和合成图像都强加为逼真，2）将掩码设计为粗略和精细掩码的组合，以及 3）通过辅助掩码预测器引导生成器鉴别器。我们的方法使用非常详细的 alpha 蒙版生成逼真的图像，这些蒙版以完全无人监督的方式覆盖头发、毛皮和胡须。\n## Aug14 - Aug20, 2022\n  - [通过隐式视觉引导和超网络生成文本到图像](https://arxiv.org/abs/2208.08493) | [code]\n    > 我们开发了一种文本到图像生成的方法，该方法包含额外的检索图像，由隐式视觉引导损失和生成目标的组合驱动。与大多数现有的仅以文本为输入的文本到图像生成方法不同，我们的方法将跨模态搜索结果动态地馈送到统一的训练阶段，从而提高了生成结果的质量、可控性和多样性。我们提出了一种新的超网络调制的视觉文本编码方案来预测编码层的权重更新，从而实现从视觉信息（例如布局、内容）到相应的潜在域的有效传输。实验结果表明，我们的模型以额外的检索视觉数据为指导，优于现有的基于 GAN 的模型。在 COCO 数据集上，与最先进的方法相比，我们实现了更好的 FID 为 9.13，生成器参数减少了 3.5 倍。\n  - [UPST-NeRF：用于 3D 场景的神经辐射场的通用逼真风格转移](https://arxiv.org/abs/2208.07059) | [***``[code]``***](https://github.com/semchan/UPST-NeRF)\n    > 3D 场景逼真风格化旨在根据给定的风格图像从任意新颖的视图生成逼真的图像，同时确保从不同视点渲染时的一致性。现有的一些具有神经辐射场的风格化方法可以通过将风格图像的特征与多视图图像相结合来训练3D场景，从而有效地预测风格化场景。然而，这些方法会生成包含令人反感的伪影的新颖视图图像。此外，它们无法为 3D 场景实现通用的逼真风格化。因此，造型图像必须重新训练基于神经辐射场的 3D 场景表示网络。我们提出了一种新颖的 3D 场景逼真风格迁移框架来解决这些问题。它可以用 2D 风格的图像实现逼真的 3D 场景风格转换。我们首先预训练了一个 2D 真实感风格迁移网络，可以满足任何给定内容图像和风格图像之间的真实感风格迁移。然后，我们使用体素特征来优化 3D 场景并获得场景的几何表示。最后，我们共同优化了一个超网络，以实现任意风格图像的场景逼真风格迁移。在迁移阶段，我们使用预训练的 2D 真实感网络来约束 3D 场景中不同视图和不同风格图像的真实感风格。实验结果表明，我们的方法不仅实现了任意风格图像的 3D 逼真风格转换，而且在视觉质量和一致性方面优于现有方法。项目页面：此 https URL。\n## Aug7 - Aug13, 2022\n  - [HRF-Net：来自稀疏输入的整体辐射场](https://arxiv.org/abs/2208.04717) | [code]\n    > 我们提出了 
HRF-Net，这是一种基于整体辐射场的新型视图合成方法，它使用一组稀疏输入来渲染新颖的视图。最近的泛化视图合成方法也利用了辐射场，但渲染速度不是实时的。现有的方法可以有效地训练和渲染新颖的视图，但它们不能推广到看不见的场景。我们的方法解决了用于泛化视图合成的实时渲染问题，包括两个主要阶段：整体辐射场预测器和基于卷积的神经渲染器。这种架构不仅可以基于隐式神经场推断出一致的场景几何，还可以使用单个 GPU 有效地渲染新视图。我们首先在 DTU 数据集的多个 3D 场景上训练 HRF-Net，并且该网络可以仅使用光度损失对看不见的真实和合成数据产生似是而非的新颖视图。此外，我们的方法可以利用单个场景的一组更密集的参考图像来生成准确的新颖视图，而无需依赖额外的显式表示，并且仍然保持预训练模型的高速渲染。实验结果表明，HRF-Net 在各种合成和真实数据集上优于最先进的可泛化神经渲染方法。\n## Jul31 - Aug6, 2022\n  - [NeSF: 用于 3D 场景的可概括语义分割的神经语义场](https://research.google/pubs/pub51563/) | [code]\n    > 我们提出了 NeSF，一种从预训练的密度场和稀疏的 2D 语义监督产生 3D 语义场的方法。我们的方法通过利用将 3D 信息存储在神经域中的神经表示来避开传统的场景表示。尽管仅由 2D 信号监督，我们的方法能够从新颖的相机姿势生成 3D 一致的语义图，并且可以在任意 3D 点进行查询。值得注意的是，NeSF 与任何产生密度场的方法兼容，并且随着预训练密度场质量的提高，其准确性也会提高。我们的实证分析证明了在令人信服的合成场景上与竞争性 2D 和 3D 语义分割基线相当的质量，同时还提供了现有方法无法提供的功能。\n  - [Transformers as Meta-Learners for Implicit Neural Representations, ECCV2022](https://arxiv.org/abs/2208.02801) | [***``[code]``***](https://yinboc.github.io/trans-inr/)\n    > 近年来，隐式神经表示 (INR) 已经出现并显示出其优于离散表示的优势。然而，将 INR 拟合到给定的观测值通常需要从头开始使用梯度下降进行优化，这是低效的，并且不能很好地泛化稀疏的观测值。为了解决这个问题，大多数先前的工作都训练了一个超网络，该超网络生成单个向量来调制 INR 权重，其中单个向量成为限制输出 INR 重建精度的信息瓶颈。最近的工作表明，通过基于梯度的元学习，可以在没有单向量瓶颈的情况下精确推断 INR 中的整个权重集。受基于梯度的元学习的广义公式的启发，我们提出了一个公式，该公式使用 Transformer 作为 INR 的超网络，它可以使用专门作为集合到集合映射的 Transformer 直接构建整个 INR 权重集。我们展示了我们的方法在不同任务和领域中构建 INR 的有效性，包括 2D 图像回归和 3D 对象的视图合成。我们的工作在 Transformer 超网络和基于梯度的元学习算法之间建立了联系，我们为理解生成的 INR 提供了进一步的分析。\n  - [VolTeMorph：体积表示的实时、可控和可泛化动画](https://arxiv.org/pdf/2208.00949) | [code]\n    > 最近，用于场景重建和新颖视图合成的体积表示越来越受欢迎，这使人们重新关注在高可见度下对体积内容进行动画处理质量和实时性。虽然基于学习函数的隐式变形方法可以产生令人印象深刻的结果，但它们对于艺术家和内容创作者来说是“黑匣子”，它们需要大量的训练数据才能进行有意义的概括，而且它们不会在训练数据之外产生现实的外推。在这项工作中，我们通过引入一种实时、易于使用现成软件进行编辑并且可以令人信服地推断的体积变形方法来解决这些问题。为了展示我们方法的多功能性，我们将其应用于两个场景：基于物理的对象变形和远程呈现，其中化身使用混合形状进行控制。我们还进行了彻底的实验，表明我们的方法优于结合隐式变形的体积方法和基于网格变形的方法。\n## Jul24 - Jul30, 2022\n  - [ZEPI-Net：通过内部跨尺度对极平面图像零样本学习的光场超分辨率, Neural Processing Letters (2022)](https://link.springer.com/article/10.1007/s11063-022-10955-x) | [code]\n    > 光场 (LF) 
成像的许多应用都受到空间角分辨率问题的限制，因此需要高效的超分辨率技术。最近，基于学习的解决方案比传统的超分辨率（SR）技术取得了显着更好的性能。不幸的是，学习或训练过程在很大程度上依赖于训练数据集，这对于大多数 LF 成像应用程序来说可能是有限的。在本文中，我们提出了一种基于零样本学习的新型 LF 空间角 SR 算法。我们建议在核平面图像 (EPI) 空间中学习跨尺度可重用特征，并避免显式建模场景先验或从大量 LF 中隐式学习。最重要的是，在不使用任何外部 LF 的情况下，所提出的算法可以同时在空间域和角域中超分辨 LF。此外，所提出的解决方案没有深度或视差估计，这通常由现有的 LF 空间和角度 SR 采用。通过使用一个简单的 8 层全卷积网络，我们表明所提出的算法可以产生与最先进的空间 SR 相当的结果。我们的算法在多组公共 LF 数据集上的角度 SR 方面优于现有方法。实验结果表明，跨尺度特征可以很好地学习并在 EPI 空间中用于 LF SR。\n  - [ObjectFusion：具有神经对象先验的准确对象级 SLAM, Graphical Models, Volume 123, September 2022](https://www.sciencedirect.com/science/article/pii/S1524070322000418) | [code]\n    > 以前的对象级同步定位和映射 (SLAM) 方法仍然无法以有效的方式创建高质量的面向对象的 3D 地图。主要挑战来自如何有效地表示对象形状以及如何将这种对象表示有效地应用于准确的在线相机跟踪。在本文中，我们提供 ObjectFusion 作为静态场景中的一种新颖的对象级 SLAM，它通过利用神经对象先验，有效地创建具有高质量对象重建的面向对象的 3D 地图。我们提出了一种仅具有单个编码器-解码器网络的神经对象表示，以有效地表达各种类别的对象形状，这有利于对象实例的高质量重建。更重要的是，我们建议将这种神经对象表示转换为精确测量，以共同优化对象形状、对象姿态和相机姿态，以实现最终准确的 3D 对象重建。通过对合成和真实世界 RGB-D 数据集的广泛评估，我们表明我们的 ObjectFusion 优于以前的方法，具有更好的对象重建质量，使用更少的内存占用，并且以更有效的方式，尤其是在对象级别。\n  - [通过 NeRF Attention 进行端到端视图合成](https://arxiv.org/abs/2207.14741) | [code]\n    > 在本文中，我们提出了一个用于视图合成的简单 seq2seq 公式，其中我们将一组光线点作为输入和输出与光线相对应的颜色。在这个 seq2seq 公式上直接应用标准转换器有两个限制。首先，标准注意力不能成功地适应体积渲染过程，因此合成视图中缺少高频分量。其次，将全局注意力应用于所有光线和像素是非常低效的。受神经辐射场 (NeRF) 的启发，我们提出了 NeRF 注意力 (NeRFA) 来解决上述问题。一方面，NeRFA 将体积渲染方程视为软特征调制过程。通过这种方式，特征调制增强了具有类似 NeRF 电感偏置的变压器。另一方面，NeRFA 执行多阶段注意力以减少计算开销。此外，NeRFA 模型采用光线和像素转换器来学习光线和像素之间的相互作用。 NeRFA 在四个数据集上展示了优于 NeRF 和 NerFormer 的性能：DeepVoxels、Blender、LLFF 和 CO3D。此外，NeRFA 在两种设置下建立了新的 state-of-the-art：单场景视图合成和以类别为中心的新颖视图合成。该代码将公开发布。\n## Previous weeks\n  - [CLA-NeRF：类别级关节神经辐射场, ICRA2022](https://arxiv.org/abs/2202.00181) | [code]\n    > 我们提出了 CLA-NeRF——一种类别级的关节神经辐射场，可以执行视图合成、部分分割和关节姿态估计。 CLA-NeRF 在对象类别级别进行训练，不使用 CAD 模型和深度，而是使用一组具有地面实况相机姿势和部分片段的 RGB 图像。在推理过程中，只需对已知类别中未见过的 3D 对象实例进行少量 RGB 视图（即少镜头）即可推断对象部分分割和神经辐射场。给定一个关节姿态作为输入，CLA-NeRF 可以执行关节感知体积渲染，以在任何相机姿态下生成相应的 RGB 
图像。此外，可以通过逆向渲染来估计对象的关节姿势。在我们的实验中，我们对合成数据和真实数据的五个类别的框架进行了评估。在所有情况下，我们的方法都显示了真实的变形结果和准确的关节姿态估计。我们相信，少量的关节对象渲染和关节姿势估计都为机器人感知和与看不见的关节对象交互打开了大门。\n  - [GRAF：用于 3D 感知图像合成的生成辐射场, NeurIPS2020](https://avg.is.mpg.de/publications/schwarz2020NeurIPS) | [***``[code]``***](https://github.com/autonomousvision/graf)\n    > 虽然 2D 生成对抗网络已经实现了高分辨率图像合成，但它们在很大程度上缺乏对 3D 世界和图像形成过程的理解。因此，它们不提供对相机视点或物体姿势的精确控制。为了解决这个问题，最近的几种方法将基于中间体素的表示与可微渲染相结合。然而，现有方法要么产生低图像分辨率，要么在解开相机和场景属性方面存在不足，例如，对象身份可能随视点而变化。在本文中，我们提出了一种辐射场的生成模型，该模型最近被证明在单个场景的新颖视图合成方面是成功的。与基于体素的表示相比，辐射场并不局限于 3D 空间的粗略离散化，还允许解开相机和场景属性，同时在存在重建模糊性的情况下优雅地退化。通过引入基于多尺度补丁的鉴别器，我们展示了高分辨率图像的合成，同时仅从未定位的 2D 图像训练我们的模型。我们系统地分析了我们在几个具有挑战性的合成和现实世界数据集上的方法。我们的实验表明，辐射场是生成图像合成的强大表示，可生成以高保真度渲染的 3D 一致模型。\n  - [GRF：学习用于 3D 场景表示和渲染的一般辐射场, ICCV2021(oral)](https://arxiv.org/abs/2010.04595) | [***``[code]``***](https://github.com/alextrevithick/GRF)\n    > 我们提出了一个简单而强大的神经网络，它仅从 2D 观察中隐式表示和渲染 3D 对象和场景。该网络将 3D 几何建模为一般辐射场，它以一组具有相机位姿和内在函数的 2D 图像作为输入，为 3D 空间的每个点构建内部表示，然后渲染该点的相应外观和几何观察从任意位置。我们方法的关键是学习 2D 图像中每个像素的局部特征，然后将这些特征投影到 3D 点，从而产生一般和丰富的点表示。我们还集成了一种注意力机制来聚合来自多个 2D 视图的像素特征，从而隐式考虑视觉遮挡。大量实验表明，我们的方法可以为新物体、看不见的类别和具有挑战性的现实世界场景生成高质量和逼真的新视图。\n  - [pixelNeRF：来自一个或几个图像的神经辐射场, CVPR2021](https://arxiv.org/abs/2012.02190) | [***``[code]``***](https://github.com/sxyu/pixel-nerf)\n    > 我们提出了 pixelNeRF，这是一种学习框架，可以预测以一个或几个输入图像为条件的连续神经场景表示。构建神经辐射场的现有方法涉及独立优化每个场景的表示，需要许多校准视图和大量计算时间。我们通过引入一种以完全卷积方式在图像输入上调节 NeRF 的架构，朝着解决这些缺点迈出了一步。这允许网络在多个场景中进行训练，以先学习一个场景，使其能够从一组稀疏的视图（少至一个）以前馈方式执行新颖的视图合成。利用 NeRF 的体积渲染方法，我们的模型可以直接从图像中训练，无需明确的 3D 监督。我们在 ShapeNet 基准上进行了广泛的实验，用于具有保留对象以及整个未见类别的单图像新颖视图合成任务。我们通过在多对象 ShapeNet 场景和来自 DTU 数据集的真实场景上展示 pixelNeRF 的灵活性，进一步展示了它的灵活性。在所有情况下，对于新颖的视图合成和单图像 3D 重建，pixelNeRF 都优于当前最先进的基线。有关视频和代码，请访问项目网站：此 https 网址\n  - [用于优化基于坐标的神经表示的学习初始化, CVPR2021](https://www.matthewtancik.com/learnit) | [***``[code]``***](https://github.com/tancik/learnit)\n    > 
基于坐标的神经表示已显示出作为复杂低维信号的离散、基于数组的表示的替代方案的重要前景。然而，从每个新信号的随机初始化权重优化基于坐标的网络是低效的。我们建议应用标准的元学习算法来学习这些全连接网络的初始权重参数，这些参数基于所表示的底层信号类别（例如，面部图像或椅子的 3D 模型）。尽管只需要在实现中进行微小的更改，但使用这些学习到的初始权重可以在优化过程中实现更快的收敛，并且可以作为所建模信号类的强先验，从而在只有给定信号的部分观察可用时产生更好的泛化。我们在各种任务中探索这些好处，包括表示 2D 图像、重建 CT 扫描以及从 2D 图像观察中恢复 3D 形状和场景。\n  - [pi-GAN：用于 3D 感知图像合成的周期性隐式生成对抗网络, CVPR2021(oral)](https://marcoamonteiro.github.io/pi-GAN-website/) | [***``[code]``***](https://github.com/marcoamonteiro/pi-GAN)\n    > 我们见证了 3D 感知图像合成的快速进展，利用了生成视觉模型和神经渲染的最新进展。然而，现有方法在两个方面存在不足：首先，它们可能缺乏底层 3D 表示或依赖于视图不一致的渲染，因此合成的图像不是多视图一致的；其次，它们通常依赖于表达能力不足的表示网络架构，因此它们的结果缺乏图像质量。我们提出了一种新颖的生成模型，称为周期性隐式生成对抗网络（π-GAN 或 pi-GAN），用于高质量的 3D 感知图像合成。 π-GAN 利用具有周期性激活函数和体积渲染的神经表示将场景表示为具有精细细节的视图一致的 3D 表示。所提出的方法获得了具有多个真实和合成数据集的 3D 感知图像合成的最新结果。\n  - [单张图像的人像神经辐射场](https://portrait-nerf.github.io/) | [code]\n    > 我们提出了一种从单张头部特写肖像估计神经辐射场 (NeRF) 的方法。虽然 NeRF 已经展示了高质量的视图合成，但它需要静态场景的多个图像，因此对于随意捕捉和移动主体是不切实际的。在这项工作中，我们建议使用灯光舞台肖像数据集的元学习框架来预训练多层感知器 (MLP) 的权重，该多层感知器隐含地对体积密度和颜色进行建模。为了提高对看不见的人脸的泛化能力，我们在由 3D 人脸可变形模型近似的规范坐标空间中训练 MLP。我们使用受控捕获对方法进行定量评估，并展示了对真实肖像图像的泛化性，显示出对最先进技术的有利结果。\n  - [CAMPARI：相机感知分解生成神经辐射场](https://arxiv.org/pdf/2103.17269.pdf) | [code]\n    > 深度生成模型的巨大进步导致了逼真的图像合成。在取得令人信服的结果的同时，大多数方法都在二维图像域中运行，而忽略了我们世界的三维性质。因此，最近的几项工作提出了具有 3D 感知能力的生成模型，即场景以 3D 建模，然后可微分地渲染到图像平面。这导致了令人印象深刻的 3D 一致性，但纳入这种偏差是有代价的：相机也需要建模。当前的方法假定固定的内在函数和预先定义的相机姿势范围。因此，实际数据通常需要参数调整，如果数据分布不匹配，结果会下降。我们的关键假设是，与图像生成器一起学习相机生成器会导致更原则性的 3D 感知图像合成方法。此外，我们建议将场景分解为背景和前景模型，从而实现更有效和更清晰的场景表示。在从原始的、未定型的图像集合中进行训练时，我们学习了一个 3D 和相机感知的生成模型，它不仅忠实地恢复了图像，而且还忠实地恢复了相机数据分布。在测试时，我们的模型生成的图像可以显式控制相机以及场景的形状和外观。\n  - [NeRF-VAE：几何感知 3D 场景生成模型](https://arxiv.org/abs/2104.00587) | [code]\n    > 我们提出了 NeRF-VAE，这是一种 3D 场景生成模型，它通过 NeRF 和可微体渲染结合了几何结构。与 NeRF 相比，我们的模型考虑了跨场景的共享结构，并且能够使用摊销推理推断新场景的结构——无需重新训练。 NeRF-VAE 的显式 3D 渲染过程进一步将先前的生成模型与缺乏几何结构的基于卷积的渲染进行对比。我们的模型是一个 VAE，它通过在潜在场景表示上调节辐射场来学习辐射场的分布。我们表明，经过训练，NeRF-VAE 能够使用很少的输入图像从以前看不见的 3D 环境中推断和渲染几何一致的场景。我们进一步证明了 NeRF-VAE 可以很好地推广到分布式相机，而卷积模型则不能。最后，我们介绍并研究了 NeRF-VAE 
解码器的一种基于注意力的调节机制，该机制提高了模型性能。\n  - [具有局部条件辐射场的无约束场景生成, ICCV2021](https://apple.github.io/ml-gsn/) | [***``[code]``***](https://github.com/apple/ml-gsn)\n    > 我们遵循对抗性学习框架，其中生成器通过其辐射场对场景进行建模，鉴别器尝试区分从这些辐射场渲染的图像和真实场景的图像。从概念上讲，我们的模型将场景的辐射场分解为许多小的局部辐射场，这些辐射场是由二维潜在代码 W 网格上的条件产生的。W 可以解释为表示场景的潜在平面图。\n  - [MVSNeRF：从多视图立体快速概括辐射场重建, ICCV2021](https://apchenstu.github.io/mvsnerf/) | [***``[code]``***](https://github.com/apchenstu/mvsnerf)\n    > 我们提出了 MVSNeRF，一种新颖的神经渲染方法，可以有效地重建神经辐射场以进行视图合成。与先前的神经辐射场工作考虑对密集捕获的图像进行逐场景优化不同，我们提出了一个通用的深度神经网络，它可以通过快速网络推理仅从三个附近的输入视图重建辐射场。我们的方法利用平面扫描成本体积（广泛用于多视图立体）进行几何感知场景推理，并将其与基于物理的体积渲染相结合用于神经辐射场重建。我们在 DTU 数据集中的真实对象上训练我们的网络，并在三个不同的数据集上对其进行测试，以评估其有效性和普遍性。我们的方法可以跨场景（甚至是室内场景，与我们的对象训练场景完全不同）进行泛化，并仅使用三个输入图像生成逼真的视图合成结果，显着优于可泛化辐射场重建的并行工作。此外，如果捕捉到密集的图像，我们估计的辐射场表示可以很容易地进行微调；与 NeRF 相比，这导致具有更高渲染质量和更短优化时间的快速每场景重建。\n  - [立体辐射场 (SRF)：从新场景的稀疏视图中学习视图合成, CVPR2021](https://arxiv.org/abs/2104.06935) | [***``[code]``***](https://virtualhumans.mpi-inf.mpg.de/srf/)\n    > 最近的神经视图合成方法取得了令人印象深刻的质量和真实性，超越了依赖多视图重建的经典管道。最先进的方法，例如 NeRF，旨在使用神经网络学习单个场景，并且需要密集的多视图输入。在新场景上进行测试需要从头开始重新训练，这需要 2-3 天。在这项工作中，我们介绍了立体辐射场 (SRF)，这是一种端到端训练的神经视图合成方法，可以推广到新场景，并且在测试时只需要稀疏视图。核心思想是一种受经典多视图立体方法启发的神经架构，它通过在立体图像中找到相似的图像区域来估计表面点。在 SRF 中，我们预测每个 3D 点的颜色和密度，给定输入图像中立体对应的编码。编码是通过成对相似性的集合隐式学习的——模拟经典立体声。实验表明，SRF 在场景上学习结构而不是过度拟合。我们在 DTU 数据集的多个场景上进行训练，并在不重新训练的情况下推广到新场景，只需要 10 个稀疏和展开的视图作为输入。我们展示了 10-15 分钟的微调进一步改善了结果，与特定场景的模型相比，获得了更清晰、更详细的结果。代码、模型和视频可在此 https 网址上找到。\n  - [用于遮挡感知的基于图像的渲染的神经射线, CVPR2022](https://liuyuan-pal.github.io/NeuRay/) | [***``[code]``***](https://github.com/liuyuan-pal/NeuRay)\n    > 我们提出了一种新的神经表示，称为神经射线 (NeuRay)，用于新的视图合成任务。最近的工作从输入视图的图像特征构建辐射场以渲染新颖的视图图像，从而能够泛化到新场景。但是，由于遮挡，3D 点可能对某些输入视图不可见。在这样的 3D 点上，这些泛化方法将包括来自不可见视图的不一致图像特征，这会干扰辐射场的构建。为了解决这个问题，我们在 NeuRay 表示中预测 3D 点对输入视图的可见性。这种可见性使辐射场构建能够专注于可见图像特征，从而显着提高其渲染质量。同时，提出了一种新颖的一致性损失，以在对特定场景进行微调时改进 NeuRay 中的可见性。实验表明，我们的方法在推广到看不见的场景时在新颖的视图合成任务上实现了最先进的性能，并且在微调后优于每个场景的优化方法。\n  - [节食 NeRF：语义一致的 Few-Shot 视图合成, 
ICCV2021](https://www.ajayj.com/dietnerf) | [***``[code]``***](https://github.com/ajayjain/DietNeRF)\n    > 我们提出了 DietNeRF，一种从几张图像估计的 3D 神经场景表示。神经辐射场 (NeRF) 通过多视图一致性学习场景的连续体积表示，并且可以通过光线投射从新颖的视点进行渲染。虽然 NeRF 在给定许多图像的情况下具有令人印象深刻的重建几何和精细细节的能力，对于具有挑战性的 360° 场景最多可重建 100 个，但当只有少数输入视图可用时，它通常会为其图像重建目标找到退化的解决方案。为了提高few-shot质量，我们提出了DietNeRF。我们引入了一种辅助语义一致性损失，它鼓励以新颖的姿势进行逼真的渲染。 DietNeRF 在单个场景上进行训练，以 (1) 从相同的姿势正确渲染给定的输入视图，以及 (2) 在不同的随机姿势中匹配高级语义属性。我们的语义损失使我们能够从任意姿势监督 DietNeRF。我们使用预训练的视觉编码器提取这些语义，例如 CLIP，这是一种视觉转换器，通过自然语言监督从网络挖掘出的数亿张不同的单视图 2D 照片进行训练。在实验中，DietNeRF 在从头开始学习时提高了少镜头视图合成的感知质量，在多视图数据集上进行预训练时，可以用少至一张观察到的图像渲染新视图，并生成完全未观察到的区域的合理完成。\n  - [CodeNeRF：对象类别的解开神经辐射场, ICCV2021(oral)](https://www.google.com/url?q=https%3A%2F%2Farxiv.org%2Fpdf%2F2109.01750.pdf&sa=D&sntz=1&usg=AOvVaw1Fnir0e4aRa22Nt0HoXDWh) | [***``[code]``***](https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fwbjang%2Fcode-nerf&sa=D&sntz=1&usg=AOvVaw2eD5ZoRbk2aWFuwUSHlh5_)\n    > CodeNeRF 是一种隐式 3D 神经表示，它学习对象形状和纹理在一个类别中的变化，并且可以从一组姿势图像中进行训练，以合成看不见的对象的新视图。与特定场景的原始 NeRF 不同，CodeNeRF 通过学习单独的嵌入来学习解开形状和纹理。在测试时，给定一个看不见的物体的单个未定位图像，CodeNeRF 通过优化联合估计相机视点、形状和外观代码。看不见的物体可以从单个图像中重建，然后从新的视点渲染，或者通过改变潜在代码编辑它们的形状和纹理。我们在 SRN 基准上进行了实验，结果表明 CodeNeRF 可以很好地泛化到看不见的对象，并且在测试时需要已知相机姿态的方法达到同等性能。我们在真实世界图像上的结果表明，CodeNeRF 可以弥合模拟到真实的差距。\n  - [StyleNeRF：用于高分辨率图像合成的基于样式的 3D 感知生成器, ICLR2022](https://jiataogu.me/style_nerf/) | [***``[code]``***](https://github.com/facebookresearch/StyleNeRF)\n    > 我们提出了 StyleNeRF，这是一种 3D 感知生成模型，用于具有高多视图一致性的逼真的高分辨率图像合成，可以在非结构化 2D 图像上进行训练。现有方法要么无法合成具有精细细节的高分辨率图像，要么产生明显的 3D 不一致伪影。此外，他们中的许多人缺乏对风格属性和明确的 3D 相机姿势的控制。 StyleNeRF 将神经辐射场 (NeRF) 集成到基于样式的生成器中，以应对上述挑战，即提高渲染效率和 3D 一致性以生成高分辨率图像。我们执行体积渲染只是为了生成一个低分辨率的特征图，并在 2D 中逐步应用上采样来解决第一个问题。为了减轻 2D 上采样引起的不一致性，我们提出了多种设计，包括更好的上采样器和新的正则化损失。通过这些设计，StyleNeRF 可以以交互速率合成高分辨率图像，同时保持高质量的 3D 一致性。 StyleNeRF 还可以控制相机姿势和不同级别的样式，可以推广到看不见的视图。它还支持具有挑战性的任务，包括放大和缩小、样式混合、反转和语义编辑。\n  - [GNeRF：基于 GAN 的无姿势相机的神经辐射场, ICCV2021(oral)](https://arxiv.org/abs/2103.15606) | [code]\n    > 我们介绍了 GNeRF，这是一个将生成对抗网络 
(GAN) 与神经辐射场 (NeRF) 重建相结合的框架，用于具有未知甚至随机初始化相机姿势的复杂场景。最近基于 NeRF 的进展因显着的逼真的新视图合成而受到欢迎。然而，它们中的大多数严重依赖于准确的相机位姿估计，而最近的一些方法只能在相机轨迹相对较短的大致前向场景中优化未知相机位姿，并且需要粗略的相机位姿初始化。不同的是，我们的 GNeRF 仅将随机初始化的姿势用于复杂的由外而内的场景。我们提出了一种新颖的两阶段端到端框架。第一阶段将 GAN 的使用带入新领域，以联合优化粗略的相机姿势和辐射场，而第二阶段通过额外的光度损失对它们进行细化。我们使用混合迭代优化方案克服了局部最小值。对各种合成和自然场景的广泛实验证明了 GNeRF 的有效性。更令人印象深刻的是，我们的方法在那些以前被认为极具挑战性的重复模式甚至低纹理的场景中优于基线。\n  - [NeRD：来自图像集合的神经反射分解, ICCV2021](https://markboss.me/publication/2021-nerd/#:~:text=NeRD%20is%20a%20novel%20method,can%20turn%20around%20the%20object.) | [***``[code]``***](https://github.com/cgtuebingen/NeRD-Neural-Reflectance-Decomposition)\n    > 将场景分解为其形状、反射率和照明度是计算机视觉和图形学中一个具有挑战性但重要的问题。当照明不是实验室条件下的单一光源而是不受约束的环境照明时，这个问题本质上更具挑战性。尽管最近的工作表明可以使用隐式表示来模拟物体的辐射场，但这些技术中的大多数只能实现视图合成而不是重新照明。此外，评估这些辐射场是资源和时间密集型的。我们提出了一种神经反射分解 (NeRD) 技术，该技术使用基于物理的渲染将场景分解为空间变化的 BRDF 材料属性。与现有技术相比，我们的输入图像可以在不同的照明条件下捕获。此外，我们还提出了将学习到的反射体积转换为可重新照明的纹理网格的技术，从而能够使用新颖的照明进行快速实时渲染。我们通过在合成数据集和真实数据集上的实验证明了所提出方法的潜力，我们能够从图像集合中获得高质量的可重新点亮的 3D 资产。\n  - [NeRF++：分析和改进神经辐射场](https://arxiv.org/abs/2010.07492) | [***``[code]``***](https://github.com/Kai-46/nerfplusplus;)\n    > 神经辐射场 (NeRF) 为各种捕捉设置实现了令人印象深刻的视图合成结果，包括有界场景的 360 度捕捉以及有界和无界场景的前向捕捉。 NeRF 将表示视图不变不透明度和视图相关颜色体积的多层感知器 (MLP) 拟合到一组训练图像，并基于体积渲染技术对新视图进行采样。在这份技术报告中，我们首先评论了辐射场及其潜在的模糊性，即形状-辐射模糊度，并分析了 NeRF 在避免这种模糊性方面的成功。其次，我们解决了将 NeRF 应用于大规模、无界 3D 场景中对象的 360 度捕获所涉及的参数化问题。我们的方法在这种具有挑战性的场景中提高了视图合成保真度。此 https 网址提供了代码。\n  - [GIRAFFE：将场景表示为合成生成神经特征场, CVPR2021(oral)](https://arxiv.org/abs/2011.12100) | [***``[code]``***](https://github.com/autonomousvision/giraffe)\n    > 深度生成模型允许以高分辨率进行逼真的图像合成。但对于许多应用程序来说，这还不够：内容创建还需要可控。虽然最近的几项工作研究了如何解开数据变化的潜在因素，但它们中的大多数都在 2D 中运行，因此忽略了我们的世界是 3D 的。此外，只有少数作品考虑场景的构图性质。我们的关键假设是，将合成 3D 场景表示合并到生成模型中会导致更可控的图像合成。将场景表示为合成生成神经特征场使我们能够从背景中解开一个或多个对象以及单个对象的形状和外观，同时从非结构化和未定型的图像集合中学习，而无需任何额外的监督。将这种场景表示与神经渲染管道相结合，可以生成快速且逼真的图像合成模型。正如我们的实验所证明的那样，我们的模型能够解开单个对象，并允许在场景中平移和旋转它们以及改变相机姿势。\n  - [Fig-NeRF：用于 3D 对象类别建模的图地面神经辐射场, 3DV2021](https://fig-nerf.github.io/) | [code]\n    > 
我们研究使用神经辐射场 (NeRF) 从输入图像的集合中学习高质量的 3D 对象类别模型。与以前的工作相比，我们能够做到这一点，同时将前景对象与不同的背景分开。我们通过 2 分量 NeRF 模型 FiG-NeRF 实现了这一点，该模型更喜欢将场景解释为几何恒定的背景和代表对象类别的可变形前景。我们表明，这种方法可以仅使用光度监督和随意捕获的对象图像来学习准确的 3D 对象类别模型。此外，我们的两部分分解允许模型执行准确和清晰的模态分割。我们使用合成的、实验室捕获的和野外数据，通过视图合成和图像保真度指标对我们的方法进行定量评估。我们的结果证明了令人信服的 3D 对象类别建模，其性能超过了现有方法的性能。\n  - [NerfingMVS：室内多视角立体神经辐射场的引导优化, ICCV2021(oral)](https://arxiv.org/abs/2109.01129) | [***``[code]``***](https://github.com/weiyithu/NerfingMVS)\n    > 在这项工作中，我们提出了一种新的多视图深度估计方法，该方法在最近提出的神经辐射场 (NeRF) 上利用了传统的 SfM 重建和基于学习的先验。与现有的依赖于估计对应的基于神经网络的优化方法不同，我们的方法直接优化隐式体积，消除了在室内场景中匹配像素的挑战性步骤。我们方法的关键是利用基于学习的先验来指导 NeRF 的优化过程。我们的系统首先通过微调其稀疏 SfM 重建来适应目标场景上的单目深度网络。然后，我们证明了 NeRF 的形状-辐射模糊性仍然存在于室内环境中，并建议通过采用适应的深度先验来监控体绘制的采样过程来解决这个问题。最后，通过对渲染图像进行误差计算获得的每像素置信度图可用于进一步提高深度质量。实验表明，我们提出的框架在室内场景中显着优于最先进的方法，在基于对应的优化和基于 NeRF 的优化对适应深度先验的有效性方面提出了令人惊讶的发现。此外，我们表明引导优化方案不会牺牲神经辐射场的原始合成能力，提高了可见视图和新视图的渲染质量。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/human.md",
    "content": "\n每周分类神经辐射场 - human ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n==================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n  - [NeuWigs：用于体积头发捕捉和动画的神经动态模型](https://arxiv.org/abs/2212.00613) | [code]\n    > 人发的捕捉和动画是为虚拟现实创建逼真化身的两个主要挑战。 这两个问题都非常具有挑战性，因为头发具有复杂的几何形状和外观，并且表现出具有挑战性的运动。 在本文中，我们提出了一种两阶段方法，该方法独立于头部对头发进行建模，以数据驱动的方式应对这些挑战。 第一阶段，状态压缩，通过一种新颖的自动编码器作为跟踪器策略，学习包含运动和外观的 3D 头发状态的低维潜在空间。 为了在外观学习中更好地分离头发和头部，我们结合使用多视图头发分割蒙版和可区分的体积渲染器。 第二阶段学习一种新颖的毛发动力学模型，该模型根据发现的潜在代码执行时间毛发转移。 为了在驱动我们的动力学模型时加强稳定性，我们在压缩阶段使用 3D 点云自动编码器来对头发状态进行去噪。 我们的模型在新颖的视图合成方面优于现有技术，并且能够创建新颖的头发动画，而无需依赖头发观察作为驱动信号。 项目页面在此 https URL。\n  - [NeRFInvertor：用于单次真实图像动画的高保真 NeRF-GAN 反演](https://arxiv.org/abs/2211.17235) | [code]\n    > 基于 Nerf 的生成模型在生成具有一致 3D 几何形状的高质量图像方面表现出了令人印象深刻的能力。 尽管成功合成了从潜在空间随机采样的假身份图像，但由于所谓的反转问题，采用这些模型生成真实主体的面部图像仍然是一项具有挑战性的任务。 在本文中，我们提出了一种通用方法来对这些 NeRF-GAN 模型进行微调，以便仅通过单个图像实现真实对象的高保真动画。 给定域外真实图像的优化潜代码，我们在渲染图像上使用 2D 损失函数来减少身份差距。 此外，我们的方法利用显式和隐式 3D 正则化，使用优化潜在代码周围的域内邻域样本来消除几何和视觉伪影。 我们的实验证实了我们的方法在跨不同数据集的多个 NeRF-GAN 模型上真实、高保真和 3D 一致的真实面孔动画的有效性。\n  - [LaplacianFusion：详细的 3D 衣服人体重建, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555511) | [code]\n    > 我们提出了 LaplacianFusion，这是一种从输入深度或 3D 点云序列重建详细且可控的 3D 穿衣人体形状的新颖方法。 我们方法的关键思想是使用拉普拉斯坐标，即已用于网格编辑的众所周知的微分坐标，来表示输入扫描中包含的局部结构，而不是之前使用的隐式 3D 函数或顶点位移。 我们的方法使用 SMPL 重建一个可控的基础网格，并学习一个表面函数来预测表示基础网格表面细节的拉普拉斯坐标。 对于给定的姿势，我们首先构建并细分一个基础网格，这是一个变形的 SMPL 
模板，然后使用表面函数估计网格顶点的拉普拉斯坐标。 姿势的最终重建是通过将估计的拉普拉斯坐标作为一个整体进行整合而获得的。 实验结果表明，我们基于拉普拉斯坐标的方法比以前的方法成功地重建了视觉上更令人愉悦的形状细节。 该方法还支持各种表面细节操作，例如细节传输和增强。\n  - [DINER：基于深度感知图像的神经辐射场](https://arxiv.org/abs/2211.16630) | [code]\n    > 我们提出了基于深度感知图像的神经辐射场 (DINER)。 给定一组稀疏的 RGB 输入视图，我们预测深度和特征图以指导重建体积场景表示，使我们能够在新视图下渲染 3D 对象。 具体来说，我们提出了将深度信息纳入特征融合和高效场景采样的新技术。 与之前的最先进技术相比，DINER 实现了更高的合成质量，并且可以处理具有更大视差的输入视图。 这使我们能够在不改变捕获硬件要求的情况下更完整地捕获场景，并最终在新视图合成过程中实现更大的视点变化。 我们通过合成人头和一般物体的新视图来评估我们的方法，并观察到与以前的现有技术相比，定性结果有了显着改善，感知指标也有所增加。 该代码将公开用于研究目的。\n  - [从单目视频重建手持物体, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555401) | [code]\n    > 本文提出了一种从单目视频中重建手持物体的方法。 与许多最近通过训练有素的网络直接预测对象几何形状的方法相比，所提出的方法不需要任何关于对象的先验知识，并且能够恢复更准确和详细的对象几何形状。 关键思想是手部运动自然地提供了对象的多个视图，并且可以通过手部姿势跟踪器可靠地估计该运动。 然后，可以通过解决多视图重建问题来恢复对象几何形状。 我们设计了一种基于隐式神经表示的方法来解决重建问题，并解决手部姿势估计不精确、手部相对运动和小物体的几何优化不足等问题。 我们还提供了一个新收集的具有 3D ground truth 的数据集来验证所提出的方法。 数据集和代码将发布在 https://dihuangdh.github.io/hhor。\n  - [Dr.3D：将 3D GAN 应用于艺术绘画, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555422) | [code]\n    > 虽然 3D GAN 最近展示了多视图一致图像和 3D 形状的高质量合成，但它们主要限于照片般逼真的人像。 本文旨在将 3D GAN 扩展到一种不同但有意义的视觉形式：艺术肖像画。 然而，由于绘图中存在不可避免的几何歧义，将现有的 3D GAN 扩展到绘图具有挑战性。 为了解决这个问题，我们提出了 Dr.3D，这是一种新颖的适应方法，可以将现有的 3D GAN 适应艺术绘画。 Dr.3D 配备了三个新组件来处理几何模糊：变形感知 3D 合成网络、姿势估计和图像合成的交替适应以及几何先验。 实验表明，我们的方法可以成功地将 3D GAN 应用于绘图，并实现多视图一致的绘图语义编辑。\n  - [Fast-SNARF：一种用于关节神经场的快速变形器](https://arxiv.org/abs/2211.15601) | [code]\n    > 神经场彻底改变了刚性场景的 3D 重建和新颖视图合成领域。 使这种方法适用于关节物体（例如人体）的一个关键挑战是对静止姿势（规范空间）和变形空间之间的 3D 位置的变形进行建模。 我们提出了一种新的神经场连接模块 Fast-SNARF，它通过迭代求根找到规范空间和姿势空间之间的准确对应关系。 Fast-SNARF 是我们之前工作 SNARF 功能的直接替代品，同时显着提高了其计算效率。 我们对 SNARF 进行了多项算法和实现改进，产生了 150 倍的加速。 这些改进包括基于体素的对应搜索、预计算线性混合蒙皮函数以及使用 CUDA 内核的高效软件实现。 Fast-SNARF 可以在没有对应的变形观察（例如 3D 网格）的情况下，高效地同时优化形状和蒙皮权重。 由于变形图的学习是许多 3D 人体化身方法中的重要组成部分，并且由于 Fast-SNARF 提供了一种计算高效的解决方案，我们相信这项工作代表了向实际创建 3D 虚拟人迈出的重要一步。\n## Nov20 - Nov26, 2022\n  - [动态神经肖像, WACV2023](https://arxiv.org/abs/2211.13994) | [code]\n    > 
我们提出了动态神经肖像，这是一种解决全头重现问题的新方法。我们的方法通过明确控制头部姿势、面部表情和眼睛注视来生成逼真的视频肖像。我们提出的架构不同于现有方法，后者依赖基于 GAN 的图像到图像转换网络将 3D 人脸渲染转换为逼真的图像。相反，我们在具有可控动力学的基于 2D 坐标的 MLP 上构建我们的系统。我们采用基于 2D 的表示而不是最近的 3D 类 NeRF 系统的直觉源于这样一个事实，即视频肖像是由单目固定摄像机拍摄的，因此，只有一个场景的视点可用。首先，我们将我们的生成模型设置为表达式混合形状，尽管如此，我们表明我们的系统也可以成功地由音频功能驱动。我们的实验表明，所提出的方法比最近基于 NeRF 的重演方法快 270 倍，我们的网络在分辨率高达 1024 x 1024 时达到 24 fps 的速度，同时在视觉质量方面优于之前的工作。\n  - [FLNeRF：神经辐射场中的 3D 面部地标估计](https://arxiv.org/abs/2211.11202) | [code]\n    > 本文介绍了在不使用 2D 图像、深度图或点云等中间表示的情况下直接预测神经辐射场 (NeRF) 上的 3D 面部地标的第一项重要工作。我们的 3D 从粗到细的人脸地标 NeRF (FLNeRF) 模型有效地从整个面部的 NeRF 中采样，并具有个人面部特征以获得准确的地标。为了缓解可用数据中面部表情的有限数量，局部和非线性 NeRF 扭曲被应用于精细的面部特征以模拟大范围的情绪，包括夸张的面部表情（例如，吹脸颊、张大嘴巴、眨眼） ), 用于训练 FLNeRF。通过这种表达增强，我们的模型可以预测 3D 地标，而不仅限于数据中给出的 20 个离散表达。强大的 3D NeRF 面部标志有助于许多下游任务。例如，我们修改 MoFaNeRF 以在 NeRF 上使用面部特征点启用高质量的面部编辑和交换，从而允许更直接的控制和更广泛的复杂表情。实验表明，使用地标的改进模型取得了相当好的结果。\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n  - [NeARportation：远程实时神经渲染框架, VRST22](https://arxiv.org/abs/2210.12398) | [code]\n    > 虽然逼真外观的呈现在沉浸在增强虚拟环境中起着重要作用，但显示真实物体的逼真外观仍然是一个具有挑战性的问题。摄影测量学的最新发展促进了将真实物体纳入虚拟空间。然而，照片般逼真的摄影测量需要专用的测量环境，并且需要在测量成本和质量之间进行权衡。此外，即使使用逼真的外观测量，渲染质量和帧速率之间也存在权衡。没有任何框架可以解决这些权衡问题并轻松地实时提供照片般逼真的外观。我们的 NeARportation 框架结合了服务器-客户端双向通信和神经渲染来解决这些权衡问题。服务器上的神经渲染接收客户端的头部姿势并生成具有逼真外观再现的新视图图像，并将其流式传输到客户端的显示器上。通过将我们的框架应用于立体显示器，我们确认它可以根据用户的头部运动以每秒 35-40 帧 (fps) 的速度在全高清立体视频上显示高保真外观。\n  - [HDHumans：高保真数字人类的混合方法](https://arxiv.org/abs/2210.12003) | [code]\n    > 逼真的数字人类头像在图形中非常重要，因为它们可以在全球范围内实现沉浸式通信，改善游戏和娱乐体验，并且对 AR 和 VR 设置特别有益。然而，当前的头像生成方法要么在高保真新视图合成、对新动作的泛化、宽松衣服的再现方面存在不足，要么无法以现代显示器提供的高分辨率渲染角色。为此，我们提出了 HDHumans，这是第一个用于 HD 人物角色合成的方法，它共同产生准确且时间连贯的 3D 变形表面和任意新颖视图和训练时未看到的运动的高度逼真的图像。在技​​术核心，我们的方法将经典的变形字符模板与神经辐射场 (NeRF) 紧密集成。我们的方法经过精心设计，以实现经典表面变形和 NeRF 之间的协同作用。首先，模板引导 NeRF，它允许合成高度动态和清晰的角色的新视图，甚至可以合成新的动作。其次，我们还利用 NeRF 产生的密集点云通过 3D 到 3D 监督进一步改善变形表面。在合成质量和分辨率以及 3D 表面重建的质量方面，我们在数量和质量上都优于最先进的技术。\n## Oct9 - Oct15, 2022\n  - [AniFaceGAN：用于视频头像的动画 3D 感知人脸图像生成, 
NeurIPS2022](https://arxiv.org/abs/2210.06465) | [***``[code]``***](https://yuewuhkust.github.io/AniFaceGAN/files/github_icon.jpeg)\n    > 尽管 2D 生成模型在人脸图像生成和动画方面取得了长足进步，但它们在从不同相机视点渲染图像时经常会遇到不希望的伪影，例如 3D 不一致。这可以防止他们合成与真实动画无法区分的视频动画。最近，3D 感知 GAN 扩展了 2D GAN，通过利用 3D 场景表示来明确解开相机姿势。这些方法可以很好地保持生成图像在不同视图中的 3D 一致性，但它们无法实现对其他属性的细粒度控制，其中面部表情控制可以说是面部动画最有用和最理想的方法。在本文中，我们提出了一种可动画的 3D 感知 GAN，用于多视图一致的人脸动画生成。关键思想是将 3D-aware GAN 的 3D 表示分解为模板字段和变形字段，其中前者用规范表达式表示不同的身份，后者表征每个身份的表达变化。为了通过变形实现对面部表情的有意义的控制，我们在 3D 感知 GAN 的对抗训练期间提出了生成器和参数 3D 面部模型之间的 3D 级模仿学习方案。这有助于我们的方法实现具有强烈视觉 3D 一致性的高质量动画人脸图像生成，即使仅使用非结构化 2D 图像进行训练。广泛的实验证明了我们优于以前的工作的性能。项目页面：此 https 网址\n  - [从单目视频中重建个性化语义面部 NeRF 模型, SIGGRAPH-Asia2022](https://arxiv.org/abs/2210.06108) | [***``[code]``***](https://github.com/USTC3DV/NeRFBlendShape-code)\n    > 我们提出了一种用神经辐射场定义的人头语义模型。 3D 一致的头部模型由一组解耦和可解释的基础组成，并且可以由低维表达系数驱动。由于神经辐射场强大的表示能力，所构建的模型可以表示复杂的面部属性，包括头发、着装等，这些属性是传统网格混合形状无法表示的。为了构建个性化的语义面部模型，我们建议将基础定义为几个多级体素字段。以短的单目 RGB 视频作为输入，我们的方法可以在 10 到 20 分钟内构建主体的语义面部 NeRF 模型，并且可以在给定的表情系数和视图方向下在数十毫秒内渲染出照片般逼真的人头图像。通过这种新颖的表示，我们将其应用于面部重定向和表情编辑等许多任务。实验结果证明了其强大的表示能力和训练/推理速度。我们的项目页面中提供了演示视频和发布的代码：此 https 网址\n  - [动态人脸合成的可控辐射场, 3DV2022](https://arxiv.org/abs/2210.05825) | [code]\n    > 最近关于 3D 感知图像合成的工作利用神经渲染的进步取得了令人瞩目的成果。然而，面部动态的 3D 感知合成并没有受到太多关注。在这里，我们研究如何明确控制表现出非刚性运动（例如，面部表情变化）的面部动力学的生成模型合成，同时确保 3D 感知。为此，我们提出了一种可控辐射场（CoRF）：1）通过在基于样式的生成器的分层潜在运动空间中嵌入运动特征来实现运动控制； 2）为了确保背景、运动特征和特定主题属性（如光照、纹理、形状、反照率和身份）的一致性，结合了人脸解析网络、头部回归器和身份编码器。在头部图像/视频数据上，我们表明 CoRF 具有 3D 感知能力，同时能够编辑身份、查看方向和运动。\n  - [通过神经渲染在静态视频中进行自我监督的 3D 人体姿态估计](https://arxiv.org/abs/2210.04514) | [code]\n    > 从 2D 图像推断 3D 人体姿势是计算机视觉领域中一个具有挑战性且长期存在的问题，具有许多应用，包括运动和医学的运动捕捉、虚拟现实、监视或步态分析。我们提供了一种从包含单个人和静态背景的 2D 视频中估计 3D 姿势的方法的初步结果，而无需任何手动地标注释。我们通过制定一个简单而有效的自我监督任务来实现这一点：我们的模型需要重建视频的随机帧，给定来自另一个时间点的帧和变换后的人体形状模板的渲染图像。对于优化至关重要，我们基于光线投射的渲染管道是完全可区分的，能够仅基于重建任务进行端到端训练。\n  - [ReFu：细化和融合未观察到的视图以保留细节的单图像 3D 人体重建](https://dl.acm.org/doi/abs/10.1145/3503161.3547971) | [code]\n    > 单图像 3D 人体重建旨在在给定单个图像的情况下重建人体的 3D 
纹理表面。虽然基于隐式函数的方法最近实现了合理的重建性能，但它们仍然存在局限性，从未观察的角度显示表面几何形状和纹理质量下降。作为回应，为了生成逼真的纹理表面，我们提出了 ReFu，这是一种从粗到细的方法，可以细化投影的背面视图图像并融合细化的图像以预测最终的人体。为了抑制在投影图像和重建网格中引起噪声的扩散占用，我们建议通过同时利用 2D 和 3D 监督和基于占用的体渲染来训练占用概率。我们还引入了一种细化架构，该架构可以生成具有前后扭曲的保留细节的背面视图图像。大量实验表明，我们的方法从单个图像中实现了 3D 人体重建的最先进性能，从未观察到的视图中显示出增强的几何和纹理质量。\n## Oct2 - Oct8, 2022\n  - [一种基于关键点的音频驱动自由视角说话头合成增强方法](https://arxiv.org/abs/2210.03335) | [code]\n    > 音频驱动的说话头合成是一项具有挑战性的任务，近年来越来越受到关注。虽然现有的基于 2D 标志或 3D 人脸模型的方法可以为任意身份合成准确的嘴唇同步和有节奏的头部姿势，但它们仍然存在局限性，例如嘴部映射中的切割感和缺乏皮肤高光。与周围的人脸相比，变形区域是模糊的。提出了一种基于关键点的增强（KPBE）方法用于音频驱动的自由视图说话头合成，以提高生成视频的自然度。首先，使用现有方法作为后端来合成中间结果。然后我们使用关键点分解从后端输出和源图像中提取视频合成控制参数。之后，将控制参数合成为源关键点和驱动关键点。使用基于运动场的方法从关键点表示生成最终图像。通过关键点表示，我们克服了嘴巴映射中的切割感和缺乏皮肤高光的问题。实验表明，我们提出的增强方法在平均意见得分方面提高了谈话头视频的质量。\n  - [SelfNeRF：来自单目自旋转视频的人类快速训练 NeRF](https://arxiv.org/abs/2210.01651) | [code]\n    > 在本文中，我们提出了 SelfNeRF，一种有效的基于神经辐射场的新型视图合成方法，用于人类表现。给定人类表演者的单目自旋转视频，SelfNeRF 可以从头开始训练并在大约 20 分钟内获得高保真结果。最近的一些工作利用神经辐射场进行动态人体重建。然而，这些方法中的大多数都需要多视图输入并且需要数小时的训练，因此仍然难以实际使用。为了解决这个具有挑战性的问题，我们引入了一种基于多分辨率哈希编码的表面相对表示，可以大大提高训练速度并聚合帧间信息。在几个不同数据集上的广泛实验结果证明了 SelfNeRF 对具有挑战性的单目视频的有效性和效率。\n  - [从单目视频中捕捉和动画身体和服装](https://arxiv.org/abs/2210.01868) | [code]\n    > 虽然最近的工作已经显示出从单个图像、视频或一组 3D 扫描中提取穿衣服的 3D 人体化身的进展，但仍然存在一些限制。大多数方法使用整体表示来对身体和服装进行联合建模，这意味着对于虚拟试穿等应用，服装和身体不能分开。其他方法分别对身体和衣服进行建模，但它们需要从从 3D/4D 扫描仪或物理模拟获得的大量 3D 衣服人体网格中进行训练。我们的洞察是身体和服装有不同的造型要求。虽然基于网格的参数 3D 模型可以很好地表示身体，但隐式表示和神经辐射场更适合捕捉服装中存在的各种形状和外观。基于这一见解，我们提出了 SCARF（分段穿衣化身辐射场），这是一种将基于网格的身体与神经辐射场相结合的混合模型。将网格与可微分光栅器相结合将网格集成到体积渲染中，使我们能够直接从单目视频优化 SCARF，而无需任何 3D 监督。混合建模使 SCARF 能够（i）通过改变身体姿势（包括手部关节和面部表情）为穿着衣服的身体化身制作动画，（ii）合成化身的新视图，以及（iii）在虚拟试穿中在化身之间转移衣服应用程序。我们证明了 SCARF 重建的服装比现有方法具有更高的视觉质量，服装随着身体姿势和体形的变化而变形，并且服装可以在不同主体的化身之间成功转移。代码和模型可在此 https 网址获得。\n  - [MonoNHR：单眼神经人类渲染器](https://arxiv.org/abs/2210.00627) | [code]\n    > 由于不可见区域中缺乏信息以及可见区域中像素的深度模糊性，现有的神经人类渲染方法难以处理单个图像输入。在这方面，我们提出了单目神经人类渲染器 (MonoNHR)，这是一种新颖的方法，可以仅在给定单个图像的情况下渲染任意人的鲁棒自由视点图像。 MonoNHR 是第一个（i）在单目设置中呈现在训练期间从未见过的人类受试者，以及（ii）在没有几何监督的情况下以弱监督方式训练的方法。首先，我们建议解开 
3D 几何和纹理特征，并根据 3D 几何特征调整纹理推断。其次，我们引入了一个 Mesh Inpainter 模块，该模块利用人类结构先验（例如对称性）来修复被遮挡的部分。在 ZJU-MoCap、AIST 和 HUMBI 数据集上的实验表明，我们的方法明显优于最近适应单目情况的方法。\n## Sep25 - Oct1, 2022\n## Sep18 - Sep24, 2022\n  - [FNeVR：面部动画的神经体积渲染](https://arxiv.org/abs/2209.10340) | [code]\n    > 人脸动画是计算机视觉中最热门的话题之一，在生成模型的帮助下取得了可喜的成绩。然而，由于复杂的运动变形和复杂的面部细节建模，生成身份保持和照片般逼真的图像仍然是一个关键挑战。为了解决这些问题，我们提出了一个人脸神经体绘制 (FNeVR) 网络，以在一个统一的框架中充分挖掘 2D 运动扭曲和 3D 体绘制的潜力。在 FNeVR 中，我们设计了一个 3D 面部体积渲染 (FVR) 模块来增强图像渲染的面部细节。具体来说，我们首先使用精心设计的架构提取 3D 信息，然后引入正交自适应光线采样模块以实现高效渲染。我们还设计了一个轻量级的姿势编辑器，使 FNeVR 能够以简单而有效的方式编辑面部姿势。大量实验表明，我们的 FNeVR 在广泛使用的 Talking Head 基准测试中获得了最佳的整体质量和性能。\n  - [通过神经动画网格进行人体性能建模和渲染](https://arxiv.org/abs/2209.08468) | [code]\n    > 我们最近看到了照片真实人体建模和渲染的神经进步的巨大进步。但是，将它们集成到现有的基于网格的管道中以用于下游应用程序仍然具有挑战性。在本文中，我们提出了一种综合神经方法，用于从密集的多视图视频中对人类表演进行高质量的重建、压缩和渲染。我们的核心直觉是将传统的动画网格工作流程与新型高效神经技术联系起来。我们首先介绍了一种用于在几分钟内生成高质量表面的神经表面重建器。它将截断有符号距离场 (TSDF) 的隐式体积渲染与多分辨率哈希编码结合在一起。我们进一步提出了一种混合神经跟踪器来生成动画网格，它将显式非刚性跟踪与自监督框架中的隐式动态变形相结合。前者将粗略的变形提供回规范空间，而后者隐含的进一步使用我们的重构器中的 4D 哈希编码来预测位移。然后，我们讨论使用获得的动画网格的渲染方案，范围从动态纹理到各种带宽设置下的流明图渲染。为了在质量和带宽之间取得复杂的平衡，我们提出了一种分层解决方案，首先渲染覆盖表演者的 6 个虚拟视图，然后进行遮挡感知神经纹理混合。我们展示了我们的方法在各种基于网格的应用程序和各种平台上逼真的自由视图体验中的有效性，即通过移动 AR 将虚拟人类表演插入真实环境或使用 VR 耳机沉浸式观看才艺表演。\n## Sep11 - Sep17, 2022\n  - [3DMM-RF：用于 3D 人脸建模的卷积辐射场](https://arxiv.org/abs/2209.07366) | [code]\n    > 面部 3D 可变形模型是具有无数应用的主要计算机视觉主题，并且在过去二十年中得到了高度优化。深度生成网络的巨大改进为改进此类模型创造了各种可能性，并引起了广泛的兴趣。此外，神经辐射领域的最新进展正在彻底改变已知场景的新视图合成。在这项工作中，我们提出了一个面部 3D 可变形模型，它利用了上述两者，并且可以准确地建模对象的身份、姿势和表情，并在任意光照下渲染它。这是通过利用强大的基于深度样式的生成器来克服神经辐射场的两个主要弱点，即它们的刚性和渲染速度来实现的。我们引入了一种基于样式的生成网络，它一次性合成所有且仅合成神经辐射场所需的渲染样本。我们创建了一个巨大的面部渲染标记合成数据集，并在这些数据上训练网络，以便它可以准确地建模和概括面部身份、姿势和外观。最后，我们证明该模型可以准确地拟合任意姿势和光照的“in-the-wild”人脸图像，提取人脸特征，并用于在可控条件下重新渲染人脸。\n  - [明确可控的 3D 感知肖像生成](https://arxiv.org/abs/2209.05434) | [code]\n    > 与成本高昂的传统头像创建流程相比，当代生成方法直接从照片中学习数据分布。虽然大量工作扩展了无条件生成模型并实现了一定程度的可控性，但确保多视图一致性仍然具有挑战性，尤其是在大姿势中。在这项工作中，我们提出了一个生成 3D 感知肖像的网络，同时可以根据有关姿势、身份、表情和照明的语义参数进行控制。我们的网络使用神经场景表示来建模 3D 
感知肖像，其生成由支持显式控制的参数化面部模型引导。虽然通过对比具有部分不同属性的图像可以进一步增强潜在的解缠结，但在为表情制作动画时，非面部区域（例如头发和背景）仍然存在明显的不一致。我们通过提出一种体积混合策略来解决这个问题，在该策略中，我们通过混合动态和静态区域来形成复合输出，其中两部分从联合学习的语义场中分割出来。我们的方法在广泛的实验中优于现有技术，当从自由视角观看时，可以在自然光下生成逼真的肖像，并具有生动的表达。它还展示了对真实图像和域外数据的泛化能力，在实际应用中显示出巨大的前景。\n## Sep4 - Sep10, 2022\n  - [SIRA：来自单个图像的可重新点亮的头像](https://arxiv.org/abs/2209.03027) | [code]\n    > 从单个图像中恢复人头的几何形状，同时分解材料和照明是一个严重不适定的问题，需要解决先验信息。基于 3D 可变形模型 (3DMM) 的方法，以及它们与可微渲染器的组合，已显示出可喜的结果。然而，3DMM 的表现力是有限的，它们通常会产生过度平滑且与身份无关的 3D 形状，仅限于面部区域。最近已经通过使用多层感知器参数化几何形状的神经场获得了高度准确的全头重建。这些表示的多功能性也被证明对于解开几何、材料和照明是有效的。然而，这些方法需要几十个输入图像。在本文中，我们介绍了 SIRA，这是一种从单个图像重建具有高保真几何形状和分解光和表面材料的人头头像的方法。我们的关键成分是两个基于神经场的数据驱动统计模型，可解决单视图 3D 表面重建和外观分解的模糊性。实验表明，SIRA 在 3D 头部重建中获得了最先进的结果，同时它成功地解开了全局照明、漫反射和镜面反射率。此外，我们的重建适用于基于物理的外观编辑和头部模型重新照明。\n  - [MotionDiffuse：使用扩散模型的文本驱动人体运动生成](https://arxiv.org/abs/2208.15001) | [***``[code]``***](https://github.com/mingyuan-zhang/MotionDiffuse)\n    > 人体运动建模对于许多现代图形应用程序很重要，这些应用程序通常需要专业技能。为了消除外行的技能障碍，最近的动作生成方法可以直接生成以自然语言为条件的人体动作。然而，通过各种文本输入实现多样化和细粒度的运动生成仍然具有挑战性。为了解决这个问题，我们提出了 MotionDiffuse，这是第一个基于扩散模型的文本驱动的运动生成框架，它展示了现有方法的几个所需属性。 1）概率映射。 MotionDiffuse 不是确定性的语言-运动映射，而是通过一系列注入变化的去噪步骤生成运动。 2）现实综合。 MotionDiffuse 擅长对复杂的数据分布进行建模并生成生动的运动序列。 3) 多级操作。 MotionDiffuse 响应身体部位的细粒度指令，以及带有时变文本提示的任意长度运动合成。我们的实验表明，MotionDiffuse 在文本驱动的运动生成和动作条件的运动生成方面具有令人信服的优势，从而优于现有的 SoTA 方法。定性分析进一步证明了 MotionDiffuse 对综合运动生成的可控性。主页：此 https 网址\n## Aug28 - Sep3, 2022\n  - [Dual-Space NeRF：在不同空间中学习动画化身和场景照明, 3DV2022](https://arxiv.org/abs/2208.14851) | [code]\n    > 在规范空间中对人体进行建模是捕捉和动画的常见做法。但是当涉及到神经辐射场 (NeRF) 时，仅仅在标准空间中学习一个静态的 NeRF 是不够的，因为即使场景照明是恒定的，当人移动时身体的照明也会发生变化。以前的方法通过学习每帧嵌入来缓解光照的不一致性，但这种操作并不能推广到看不见的姿势。鉴于光照条件在世界空间中是静态的，而人体在规范空间中是一致的，我们提出了一种双空间 NeRF，它在两个独立的空间中使用两个 MLP 对场景光照和人体进行建模。为了弥合这两个空间，以前的方法主要依赖于线性混合蒙皮 (LBS) 算法。然而，动态神经领域的 LBS 的混合权重是难以处理的，因此通常用另一个 MLP 来记忆，这不能推广到新的姿势。尽管可以借用 SMPL 等参数网格的混合权重，但插值操作会引入更多伪影。在本文中，我们建议使用重心映射，它可以直接泛化到看不见的姿势，并且出人意料地取得了比具有神经混合权重的 LBS 更好的结果。 Human3.6M 和 ZJU-MoCap 数据集的定量和定性结果显示了我们方法的有效性。\n  - [NerfCap：使用动态神经辐射场捕获人类表现, 
TVCG2022](https://ieeexplore.ieee.org/abstract/document/9870173) | [code]\n    > 本文解决了从稀疏的多视图或单目视频中捕捉人类表演的挑战。给定表演者的模板网格，以前的方法通过将模板网格非刚性地注册到具有 2D 轮廓或密集光度对齐的图像来捕获人体运动。然而，详细的表面变形无法从轮廓中恢复，而光度对齐则受到视频外观变化引起的不稳定性的影响。为了解决这些问题，我们提出了 NerfCap，这是一种基于表演者动态神经辐射场 (NeRF) 表示的新型表演捕捉方法。具体来说，通过优化变形场和规范 NeRF 的外观模型，从模板几何初始化规范 NeRF 并注册到视频帧。为了捕捉大型身体运动和详细的表面变形，NerfCap 将线性混合蒙皮与嵌入式图形变形相结合。与受限于固定拓扑和纹理的基于网格的方法相比，NerfCap 能够灵活地捕捉视频中复杂的几何形状和外观变化，并合成更逼真的图像。此外，NerfCap 可以通过将合成视频与输入视频进行匹配，以自我监督的方式进行端到端的预训练。各种数据集的实验结果表明，NerfCap 在表面重建精度和新视图合成质量方面都优于先前的工作。\n## Aug21 - Aug27, 2022\n  - [神经小说演员：学习人类演员的广义动画神经表示](https://arxiv.org/abs/2208.11905) | [code]\n    > 我们提出了一种新方法，用于从一组稀疏的多人多视图图像中学习广义的可动画神经人类表示。学习到的表示可用于从一组稀疏的相机中合成任意人的新颖视图图像，并使用用户的姿势控制进一步对它们进行动画处理。虽然现有方法可以推广到新人或使用用户控制合成动画，但它们都不能同时实现这两者。我们将这一成就归功于为共享的多人人体模型使用 3D 代理，并进一步将不同姿势的空间扭曲到共享的规范姿势空间，在该空间中，我们学习了一个神经领域并预测了人和与姿势相关的变形，以及从输入图像中提取的特征的外观。为了应对身体形状、姿势和服装变形的巨大变化的复杂性，我们设计了具有解开几何和外观的神经人体模型。此外，我们利用 3D 代理的空间点和表面点的图像特征来预测与人和姿势相关的属性。实验表明，我们的方法在这两项任务上都显着优于现有技术。视频和代码可在此 https 网址上找到。\n## Aug14 - Aug20, 2022\n  - [通过多平面图像的 3D 对象运动估计动态场景的时间视图合成, ISMAR2022](https://arxiv.org/abs/2208.09463) | [***``[code]``***](https://github.com/NagabhushanSN95/DeCOMPnet)\n    > 在低计算设备上以图形方式渲染高帧率视频的挑战可以通过对未来帧的定期预测来解决，以增强虚拟现实应用程序中的用户体验。这是通过时间视图合成 (TVS) 的问题来研究的，其目标是在给定前一帧以及前一帧和下一帧的头部姿势的情况下预测视频的下一帧。在这项工作中，我们考虑了用户和对象都在移动的动态场景的 TVS。我们设计了一个框架，将运动解耦为用户和对象运动，以在预测下一帧的同时有效地使用可用的用户运动。我们通过隔离和估计过去帧中的 3D 对象运动然后外推来预测对象的运动。我们使用多平面图像 (MPI) 作为场景的 3D 表示，并将对象运动建模为 MPI 表示中对应点之间的 3D 位移。为了在估计运动时处理 MPI 中的稀疏性，我们结合了部分卷积和掩蔽相关层来估计对应点。然后将预测的对象运动与给定的用户或相机运动集成以生成下一帧。使用遮蔽填充模块，我们合成由于相机和物体运动而未覆盖的区域。我们为包含 800 个全高清分辨率视频的动态场景 TVS 开发了一个新的合成数据集。我们通过对我们的数据集和 MPI Sintel 数据集的实验表明，我们的模型优于文献中的所有竞争方法。\n  - [LoRD：用于高保真动态人体建模的局部 4D 隐式表示, ECCV2022](https://arxiv.org/abs/2208.08622) | [code]\n    > 4D 隐式表示的最新进展集中在使用低维潜在向量全局控制形状和运动，这容易丢失表面细节和累积跟踪误差。尽管许多深度局部表示已显示出可用于 3D 形状建模的有希望的结果，但它们的 4D 对应物尚不存在。在本文中，我们提出了一种新颖的用于动态服装人体的局部 4D 隐式表示，名为 LoRD，以填补这一空白，它兼具 4D 
人体建模和局部表示的优点，并能够通过详细的表面变形进行高保真重建，例如衣服褶皱。特别是，我们的关键见解是鼓励网络学习局部部分级表示的潜在代码，能够解释局部几何和时间变形。为了在测试时进行推断，我们首先在每个时间步估计体内骨骼运动以跟踪局部部位，然后根据不同类型的观察数据通过自动解码优化每个部位的潜在代码。大量实验表明，该方法具有很强的表示 4D 人体的能力，并且在实际应用中优于最先进的方法，包括从稀疏点进行 4D 重建、非刚性深度融合，无论是定性还是定量。\n  - [从单目视频中对动画 3D 人体进行神经捕获, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > 我们提出了一种从单目视频输入构建可动画 3D 人体表示的新颖范例，这样它就可以以任何看不见的姿势和视图进行渲染。我们的方法基于动态神经辐射场 (NeRF)，该动态神经辐射场 (NeRF) 由作为几何代理的基于网格的参数化 3D 人体模型装配。以前的方法通常依赖多视图视频或准确的 3D 几何信息作为附加输入；此外，大多数方法在推广到看不见的姿势时质量会下降。我们认为，泛化的关键是用于查询动态 NeRF 的良好输入嵌入：良好的输入嵌入应该定义全体积空间中的单射映射，由姿态变化下的表面网格变形引导。基于这一观察，我们建议嵌入输入查询及其与网格顶点上一组测地最近邻所跨越的局部表面区域的关系。通过包含位置和相对距离信息，我们的嵌入定义了距离保留的变形映射，并很好地推广到看不见的姿势。为了减少对额外输入的依赖，我们首先使用现成的工具初始化每帧 3D 网格，然后提出一个管道来联合优化 NeRF 并细化初始网格。大量实验表明，我们的方法可以在看不见的姿势和视图下合成合理的人类渲染结果。\n## Aug7 - Aug13, 2022\n  - [渐进式多尺度光场网络, 3DV2022](https://arxiv.org/abs/2208.06710) | [code]\n    > 与图像集表示相比，神经表示在表示辐射和光场的能力方面显示出了巨大的希望，同时非常紧凑。然而，当前的表示不太适合流式传输，因为解码只能在单个细节级别上完成，并且需要下载整个神经网络模型。此外，高分辨率光场网络可能会出现闪烁和混叠，因为在没有适当过滤的情况下对神经网络进行采样。为了解决这些问题，我们提出了一个渐进式多尺度光场网络，它对具有多层次细节的光场进行编码。使用较少的神经网络权重对较低级别的细节进行编码，从而实现渐进式流传输并减少渲染时间。我们的渐进式多尺度光场网络通过在较低细节级别编码较小的抗锯齿表示来解决锯齿问题。此外，每个像素级别的细节使我们的表示能够支持抖动过渡和中心点渲染。\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n  - [神经链：从多视图图像中学习头发的几何形状和外观, ECCV2022](https://arxiv.org/pdf/2207.14067) | [***``[code]``***](https://radualexandru.github.io/neural_strands/)\n    > 我们提出了 Neural Strands，这是一种新颖的学习框架，用于从多视图图像输入中对精确的头发几何形状和外观进行建模。学习的头发模型可以从具有高保真视图相关效果的任何视点实时渲染。与体积模型不同，我们的模型实现了直观的形状和样式控制。为了实现这些特性，我们提出了一种基于神经头皮纹理的新型头发表示，该神经头皮纹理对每个纹素位置的单个股线的几何形状和外观进行编码。此外，我们引入了一种基于学习发束光栅化的新型神经渲染框架。我们的神经渲染是精确的和抗锯齿的，使渲染视图一致且逼真。将外观与多视图几何先验相结合，我们首次实现了从多视图设置中联合学习外观和显式头发几何形状。我们展示了我们的方法在各种发型的保真度和效率方面的有效性。\n## Previous weeks\n  - [用于单目 4D 面部头像重建的动态神经辐射场, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > 我们提出了用于模拟人脸外观和动态的动态神经辐射场。对说话的人进行数字建模和重建是各种应用程序的关键组成部分。特别是对于 AR 或 VR 
中的远程呈现应用，需要忠实再现外观，包括新颖的视点或头部姿势。与显式建模几何和材料属性或纯粹基于图像的最先进方法相比，我们引入了基于场景表示网络的头部隐式表示。为了处理面部的动态，我们将场景表示网络与低维可变形模型相结合，该模型提供对姿势和表情的显式控制。我们使用体积渲染从这种混合表示中生成图像，并证明这种动态神经场景表示只能从单目输入数据中学习，而不需要专门的捕获设置。在我们的实验中，我们表明这种学习的体积表示允许生成照片般逼真的图像，其质量超过了基于视频的最先进的重演方法的质量。\n  - [PVA：像素对齐的体积化身, CVPR2021](https://volumetric-avatars.github.io/) | [code]\n    > 逼真的人头的采集和渲染是一个极具挑战性的研究问题，对于虚拟远程呈现特别重要。目前，最高质量是通过在多视图数据上以个人特定方式训练的体积方法实现的。与更简单的基于网格的模型相比，这些模型更好地表示精细结构，例如头发。体积模型通常使用全局代码来表示面部表情，以便它们可以由一小组动画参数驱动。虽然这样的架构实现了令人印象深刻的渲染质量，但它们不能轻易地扩展到多身份设置。在本文中，我们设计了一种新颖的方法，用于在仅给定少量输入的情况下预测人头的体积化身。我们通过一种新颖的参数化实现跨身份的泛化，该参数化将神经辐射场与直接从输入中提取的局部像素对齐特征相结合，从而避免了对非常深或复杂网络的需求。我们的方法仅基于光度重新渲染损失以端到端的方式进行训练，无需明确的 3D 监督。我们证明我们的方法在质量方面优于现有的现有技术，并且能够生成忠实的面部表情多身份设置。\n  - [用于人体建模的动画神经辐射场, ICCV2021](https://zju3dv.github.io/animatable_nerf/) | [***``[code]``***](https://github.com/zju3dv/animatable_nerf)\n    > 本文解决了从多视图视频中重建可动画人体模型的挑战。最近的一些工作提出将非刚性变形场景分解为规范神经辐射场和一组将观察空间点映射到规范空间的变形场，从而使他们能够从图像中学习动态场景。然而，它们将变形场表示为平移矢量场或 SE(3) 场，这使得优化受到高度约束。此外，这些表示不能由输入运动明确控制。相反，我们引入了神经混合权重场来产生变形场。基于骨架驱动的变形，混合权重场与 3D 人体骨骼一起使用，以生成观察到规范和规范到观察的对应关系。由于 3D 人体骨骼更易观察，它们可以规范变形场的学习。此外，学习到的混合权重场可以与输入的骨骼运动相结合，以生成新的变形场来为人体模型设置动画。实验表明，我们的方法明显优于最近的人类合成方法。该代码将在 https://zju3dv.github.io/animatable_nerf/ 上提供。\n  - [神经演员：具有姿势控制的人类演员的神经自由视图合成, SIGGRAPH Asia 2021](https://vcai.mpi-inf.mpg.de/projects/NeuralActor/) | [***``[code]``***](https://people.mpi-inf.mpg.de/~lliu/projects/NeuralActor/)\n    > 我们提出了神经演员 (NA)，这是一种从任意视角和任意可控姿势下高质量合成人类的新方法。我们的方法建立在最近的神经场景表示和渲染工作之上，这些工作仅从 2D 图像中学习几何和外观的表示。虽然现有作品展示了令人信服的静态场景渲染和动态场景回放，但使用神经隐式方法对人类进行逼真的重建和渲染，特别是在用户控制的新姿势下，仍然很困难。为了解决这个问题，我们利用粗体模型作为代理将周围的 3D 空间展开为规范姿势。神经辐射场从多视图视频输入中学习规范空间中与姿势相关的几何变形以及与姿势和视图相关的外观效果。为了合成高保真动态几何和外观的新视图，我们利用在身体模型上定义的 2D 纹理图作为潜在变量来预测残余变形和动态外观。实验表明，我们的方法在回放和新颖的姿势合成方面取得了比现有技术更好的质量，甚至可以很好地推广到与训练姿势截然不同的新姿势。此外，我们的方法还支持合成结果的体形控制。\n  - [神经体：具有结构化潜在代码的隐式神经表示，用于动态人类的新视图合成, CVPR2021](https://zju3dv.github.io/neuralbody/) | [***``[code]``***](https://github.com/zju3dv/neuralbody)\n    > 
本文解决了人类表演者从一组非常稀疏的摄像机视图中合成新颖视图的挑战。最近的一些工作表明，在给定密集输入视图的情况下，学习 3D 场景的隐式神经表示可以实现显着的视图合成质量。但是，如果视图高度稀疏，则表示学习将是不适定的。为了解决这个不适定问题，我们的关键思想是整合对视频帧的观察。为此，我们提出了神经体，这是一种新的人体表示，它假设在不同帧上学习到的神经表示共享同一组锚定到可变形网格的潜在代码，以便可以自然地整合跨帧的观察结果。可变形网格还为网络提供几何指导，以更有效地学习 3D 表示。为了评估我们的方法，我们创建了一个名为 ZJU-MoCap 的多视图数据集，用于捕捉具有复杂动作的表演者。 ZJU-MoCap 的实验表明，我们的方法在新颖的视图合成质量方面大大优于先前的工作。我们还展示了我们的方法从 People-Snapshot 数据集上的单目视频重建移动人物的能力。\n  - [单张图像的人像神经辐射场](https://portrait-nerf.github.io/) | [code]\n    > 我们提出了一种从单个爆头肖像估计神经辐射场 (NeRF) 的方法。虽然 NeRF 已经展示了高质量的视图合成，但它需要静态场景的多个图像，因此对于随意捕捉和移动主体是不切实际的。在这项工作中，我们建议使用使用灯光舞台肖像数据集的元学习框架来预训练多层感知器 (MLP) 的权重，该多层感知器隐含地对体积密度和颜色进行建模。为了提高对看不见的人脸的泛化能力，我们在由 3D 人脸可变形模型近似的规范坐标空间中训练 MLP。我们使用受控捕获对方法进行定量评估，并展示了对真实肖像图像的泛化性，显示出对最先进技术的有利结果。\n  - [A-NeRF：通过神经渲染进行无表面人体 3D 姿势细化, NeurIPS2021](https://arxiv.org/abs/2102.06199) | [***``[code]``***](https://github.com/LemonATsu/A-NeRF)\n    > 虽然深度学习使用前馈网络重塑了经典的运动捕捉管道，但需要生成模型通过迭代细化来恢复精细对齐。不幸的是，现有模型通常是在受控条件下手工制作或学习的，仅适用于有限的领域。我们提出了一种通过扩展神经辐射场 (NeRFs) 从未标记的单目视频中学习生成神经体模型的方法。我们为它们配备了骨架，以适用于时变和关节运动。一个关键的见解是，隐式模型需要与显式曲面模型中使用的正向运动学相反。我们的重新参数化定义了相对于身体部位姿势的空间潜在变量，从而克服了过度参数化的不适定逆运算。这使得从头开始学习体积身体形状和外观，同时共同改进关节姿势；输入视频上的所有外观、姿势或 3D 形状都没有地面实况标签。当用于新视图合成和动作捕捉时，我们的神经模型提高了不同数据集的准确性。项目网站：此 https 网址。\n  - [学习动态人头的组成辐射场, CVPR2021(oral)](https://ziyanw1.github.io/hybrid_nerf/) | [code]\n    > 动态人体的逼真渲染是远程呈现系统、虚拟购物、合成数据生成等的重要能力。最近，结合计算机图形学和机器学习技术的神经渲染方法已经创建了人类和物体的高保真模型。其中一些方法不会为可驱动的人体模型（神经体积）产生足够高保真度的结果，而其他方法则具有极长的渲染时间（NeRF）。我们提出了一种新颖的组合 3D 表示，它结合了以前最好的方法来产生更高分辨率和更快的结果。我们的表示通过将粗略的 3D 结构感知动画代码网格与连续学习的场景函数相结合，弥合了离散和连续体积表示之间的差距，该函数将每个位置及其相应的局部动画代码映射到其与视图相关的发射辐射和局部体积密度。可微分体渲染用于计算人头和上身的照片般逼真的新颖视图，并仅使用 2D 监督来端到端训练我们的新颖表示。此外，我们表明，学习到的动态辐射场可用于基于全局动画代码合成新的看不见的表情。我们的方法在合成动态人头和上半身的新视图方面取得了最先进的结果。\n  - [使用分层神经表示的可编辑自由视点视频, SIGGRAPH2021](https://jiakai-zhang.github.io/st-nerf/) | [***``[code]``***](https://jiakai-zhang.github.io/st-nerf/#code)\n    > 生成自由视点视频对于沉浸式 VR/AR 体验至关重要，但最近的神经学进展仍然缺乏编辑能力来操纵大型动态场景的视觉感知。为了填补这一空白，在本文中，我们提出了第一种仅使用稀疏的 16 
个摄像头为大规模动态场景生成可编辑照片般逼真的自由视点视频的方法。我们方法的核心是一种新的分层神经表示，其中包括环境本身的每个动态实体都被制定为称为 ST-NeRF 的时空相干神经分层辐射表示。这种分层表示支持对动态场景的完全感知和真实操作，同时仍支持大范围的自由观看体验。在我们的 ST-NeRF 中，动态实体/层被表示为连续函数，以连续和自监督的方式实现动态实体的位置、变形以及外观的解耦。我们提出了一个场景解析 4D 标签映射跟踪来显式地解开空间信息，以及一个连续变形模块来隐式地解开时间运动。进一步引入了一种对象感知体绘制方案，用于重新组装所有神经层。我们采用了一种新颖的分层损失和运动感知光线采样策略，以实现对具有多个表演者的大型动态场景的有效训练，我们的框架进一步实现了各种编辑功能，即操纵规模和位置，复制或重新定时单个神经层在保持高度真实感的同时创造众多视觉效果。大量实验证明了我们的方法在为动态场景生成高质量、照片般逼真和可编辑的自由视点视频方面的有效性。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/lighting.md",
    "content": "\n每周分类神经辐射场 - lighting ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n=====================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n  - [光场的神经子空间, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9968104) | [code]\n    > 我们引入了一个框架，用于用神经子空间的新概念来紧凑地表示光场内容。 虽然最近提出的神经光场表示通过将光场编码到单个神经网络中实现了很好的压缩结果，但统一设计并未针对光场中展示的复合结构进行优化。 此外，将光场的每一部分编码到一个网络中对于需要快速传输和解码的应用来说并不理想。 我们认识到这个问题与子空间学习的联系。 我们提出了一种使用几个小型神经网络的方法，专门研究特定光场段的神经子空间。 此外，我们在这些小型网络中提出了一种自适应权重共享策略，提高了参数效率。 实际上，该策略通过利用神经网络的分层结构，能够以协调一致的方式跟踪附近神经子空间之间的相似性。 此外，我们开发了一种软分类技术来提高神经表征的颜色预测准确性。 我们的实验结果表明，我们的方法在各种光场场景上比以前的方法更好地重建了光场。 我们进一步展示了其在具有不规则视点布局和动态场景内容的编码光场上的成功部署。\n  - [用于实时全局照明的高效光探测器, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555452) | [code]\n    > 再现基于物理的全局照明 (GI) 效果一直是许多实时图形应用程序的长期需求。 为了实现这一目标，许多最近的引擎采用了在预计算阶段烘焙的某种形式的光探测器。 不幸的是，由于探针存储、表示或查询的限制，预计算探针产生的 GI 效果相当有限。 在本文中，我们提出了一种基于探针的 GI 渲染的新方法，该方法可以在复杂场景中生成广泛的 GI 效果，包括具有多次反弹的光泽反射。 我们工作背后的关键贡献包括基于梯度的搜索算法和神经图像重建方法。 搜索算法旨在将探针的内容重新投影到任何查询视点，而不会引入视差误差，并快速收敛到最优解。 基于专用神经网络和多个 G 缓冲区的神经图像重建方法试图从由于分辨率有限或（潜在的）探头采样率低而导致的低质量输入中恢复高质量图像。 这种神经方法使光探针的生成变得高效。 此外，采用时间重投影策略和时间损失来提高动画序列的时间稳定性。 由于基于梯度的搜索算法的快速收敛速度和神经网络的轻量级设计，即使对于高分辨率 (1920×1080) 输出，整个流水线也实时运行（>30 帧/秒）。 已经对多个复杂场景进行了广泛的实验，以证明我们的方法优于最先进的方法。\n  - [NeuLighting：使用不受约束的照片集重新照明的自由视点户外场景的神经照明, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555384) | [code]\n    > 我们提出了 
NeuLighting，这是一个新的框架，用于从一组稀疏的、不受约束的野外照片集中重新照明自由视点户外场景。 我们的框架将所有场景组件表示为由 MLP 参数化的连续函数，这些函数将 3D 位置和照明条件作为输入和输出反射率以及必要的室外照明属性。 与通常利用具有可控且一致的室内照明的训练图像的对象级重新照明方法不同，我们专注于更具挑战性的室外情况，其中所有图像都是在任意未知照明下捕获的。 我们方法的关键包括将每幅图像的光照压缩为解缠结的潜在向量的神经光照表示，以及一种新的自由视点重新光照方案，该方案对图像间的任意光照变化具有鲁棒性。 光照表示具有压缩性，可以解释各种光照，并且可以很容易地输入到基于查询的 NeuLighting 框架中，从而能够在任何一种新型光照下进行高效的阴影效果评估。 此外，为了产生高质量的投射阴影，我们根据场景几何形状和太阳方向估计太阳能见度图以指示阴影区域。 由于灵活且可解释的神经照明表示，我们的系统支持使用许多不同的照明源进行户外重新照明，包括自然图像、环境地图和延时视频。 新视角和照明下的高保真渲染证明了我们的方法相对于最先进的重新照明解决方案的优越性。\n## Nov20 - Nov26, 2022\n  - [折射物体的神经辐射场采样, SIGGRAPH-Asia2022](https://arxiv.org/abs/2211.14799) | [***``[code]``***](https://github.com/alexkeroro86/SampleNeRFRO)\n    > 最近，神经辐射场 (NeRF) 中的可微分体绘制得到了广泛的关注，其变体取得了许多令人印象深刻的结果。然而，现有的方法通常假设场景是一个均匀的体积，因此光线沿着直线路径投射。在这项工作中，场景是一个具有分段恒定折射率的异质体积，如果它与不同的折射率相交，路径将弯曲。对于折射物体的新视图合成，我们基于 NeRF 的框架旨在从具有折射物体轮廓的多视图姿势图像中优化有界体积和边界的辐射场。为了解决这个具有挑战性的问题，场景的折射率是从轮廓中重建的。给定折射率，我们扩展了 NeRF 中的分层和分层采样技术，以允许沿着由 Eikonal 方程跟踪的弯曲路径绘制样本。结果表明，我们的框架在数量和质量上都优于最先进的方法，在感知相似性度量上表现出更好的性能，并且在几个合成和真实场景的渲染质量上有明显改善。\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n  - [LB-NERF：用于透明介质的光弯曲神经辐射场, ICIP2022](https://ieeexplore.ieee.org/abstract/document/9897642) | [code]\n    > 神经辐射场 (NeRFs) 已被提出作为新颖的视图合成方法，并且由于其多功能性已被用于解决各种问题。 NeRF 可以使用假设直线光路的神经渲染来表示 3D 空间中的颜色和密度。但是，场景中具有不同折射率的介质，例如透明介质，会引起光的折射，打破了光路直线的假设。因此，不能在多视图图像中一致地学习 NeRF。为了解决这个问题，本研究提出了一种方法，通过引入光折射效应作为与源自相机中心的直线的偏移量来学习跨多个视点的一致辐射场。实验结果定量和定性地验证了在考虑透明物体的折射时，我们的方法可以比传统的 NeRF 方法更好地插入视点。\n  - [IBL-NeRF：基于图像的神经辐射场照明公式](https://arxiv.org/abs/2210.08202) | [code]\n    > 我们提出了 IBL-NeRF，它将大规模室内场景的神经辐射场 (NeRF) 分解为内在成分。以前的 NeRF 逆向渲染方法转换隐式体积以适应显式几何的渲染管道，并使用环境照明近似分割、孤立对象的视图。相比之下，我们的逆渲染扩展了原始的 NeRF 公式，以捕捉场景体积内照明的空间变化，以及表面属性。具体来说，将不同材质的场景分解为基于图像的渲染的内在组件，即反照率、粗糙度、表面法线、辐照度和预过滤辐射度。所有组件都被推断为来自 MLP 的神经图像，可以对大规模的一般场景进行建模。通过采用基于图像的 NeRF 公式，我们的方法继承了合成图像的卓越视觉质量和多视图一致性。我们展示了在具有复杂对象布局和灯光配置的场景上的性能，这些在以前的任何作品中都无法处理。\n  - 
[使用树结构从辐射场估计神经反射场](https://arxiv.org/abs/2210.04217) | [code]\n    > 我们提出了一种新方法，用于在未知光照下从一组姿势多视图图像中估计对象的神经反射场 (NReF)。 NReF 以分离的方式表示对象的 3D 几何和外观，并且很难仅从图像中估计。我们的方法通过利用神经辐射场（NeRF）作为代理表示来解决这个问题，我们从中进行进一步的分解。高质量的 NeRF 分解依赖于良好的几何信息提取以及良好的先验项来正确解决不同组件之间的歧义。为了从辐射场中提取高质量的几何信息，我们重新设计了一种新的基于射线投射的表面点提取方法。为了有效地计算和应用先验项，我们将不同的先验项转换为从辐射场提取的表面上的不同类型的滤波操作。然后，我们采用两种类型的辅助数据结构，即高斯 KD-tree 和八叉树，以支持在训练期间快速查询表面点和高效计算表面过滤器。基于此，我们设计了一个多级分解优化流程，用于从神经辐射场估计神经反射场。大量实验表明，我们的方法在不同数据上优于其他最先进的方法，并且能够实现高质量的自由视图重新照明以及材料编辑任务。\n## Oct2 - Oct8, 2022\n## Sep25 - Oct1, 2022\n  - [神经全局照明：动态区域光下的交互式间接照明预测, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9904431) | [code]\n    > 我们提出了神经全局照明，这是一种在具有动态视点和区域照明的静态场景中快速渲染全全局照明的新方法。我们方法的关键思想是利用深度渲染网络来模拟从每个着色点到全局照明的复杂映射。为了有效地学习映射，我们提出了一种对神经网络友好的输入表示，包括每个着色点的属性、视点信息和组合照明表示，该表示能够与紧凑的神经网络进行高质量的拟合。为了合成高频全局光照效果，我们通过位置编码将低维输入转换为高维空间，并将渲染网络建模为深度全连接网络。此外，我们将屏幕空间神经缓冲区提供给我们的渲染网络，以将屏幕空间中的对象之间的全局信息共享到每个着色点。我们已经证明了我们的神经全局照明方法可以渲染各种场景，这些场景表现出复杂的全频全局照明效果，例如多次反射光泽互反射、渗色和焦散。\n## Sep18 - Sep24, 2022\n## Sep11 - Sep17, 2022\n  - [StructNeRF：具有结构提示的室内场景的神经辐射场](https://arxiv.org/abs/2209.05277) | [code]\n    > 神经辐射场 (NeRF) 使用密集捕获的输入图像实现照片般逼真的视图合成。然而，在给定稀疏视图的情况下，NeRF 的几何形状受到极大限制，导致新视图合成质量显着下降。受自监督深度估计方法的启发，我们提出了 StructNeRF，这是一种针对具有稀疏输入的室内场景的新颖视图合成的解决方案。 StructNeRF 利用自然嵌入在多视图输入中的结构提示来处理 NeRF 中的无约束几何问题。具体来说，它分别处理纹理和非纹理区域：提出了一种基于块的多视图一致光度损失来约束纹理区域的几何形状；对于非纹理平面，我们明确将它们限制为 3D 一致平面。通过密集的自监督深度约束，我们的方法提高了 NeRF 的几何和视图合成性能，而无需对外部数据进行任何额外的训练。对几个真实世界数据集的广泛实验表明，StructNeRF 在数量和质量上都超过了用于室内场景的最先进的方法。\n## Sep4 - Sep10, 2022\n## Aug28 - Sep3, 2022\n  - [跨光谱神经辐射场, 3DV2022](https://arxiv.org/abs/2209.00648) | [code]\n    > 我们提出了 X-NeRF，这是一种基于神经辐射场公式的学习交叉光谱场景表示的新方法，该方法给定从具有不同光谱灵敏度的相机捕获的图像。 X-NeRF 在训练期间优化跨光谱的相机姿势，并利用归一化跨设备坐标 (NXDC) 从任意视点呈现不同模态的图像，这些图像对齐并具有相同的分辨率。对 16 个具有彩色、多光谱和红外图像的前向场景进行的实验证实了 X-NeRF 在建模交叉光谱场景表示方面的有效性。\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n  - [从全向图像中捕捉休闲室内 HDR 辐射](https://arxiv.org/abs/2208.07903) | [code]\n    > 我们提出了 PanoHDR-NeRF，这是一种新颖的管道，可以随意捕获大型室内场景的合理全 HDR 
辐射场，而无需精心设置或复杂的捕获协议。首先，用户通过在场景周围自由挥动现成的相机来捕捉场景的低动态范围 (LDR) 全向视频。 然后，LDR2HDR 网络将捕获的 LDR 帧提升为 HDR，随后用于训练定制的 NeRF++ 模型。 由此产生的 PanoHDR-NeRF 管道可以从场景的任何位置估计完整的 HDR 全景图。 通过对各种真实场景的新测试数据集进行实验，在训练期间未看到的位置捕获地面实况 HDR 辐射，我们表明 PanoHDR-NeRF 可以预测来自任何场景点的合理辐射。我们还表明，由 PanoHDR-NeRF 生成的 HDR 图像可以合成正确的照明效果，从而能够使用正确照明的合成对象来增强室内场景。\n  - [HDR-Plenoxels：自校准高动态范围辐射场, ECCV2022](https://arxiv.org/abs/2208.06787) | [code]\n    > 我们提出了高动态范围辐射 (HDR) 场 HDR-Plenoxels，它学习 3D HDR 辐射场、几何信息和 2D 低动态范围 (LDR) 图像中固有的不同相机设置的全光函数。我们基于体素的体素渲染管道仅使用从不同相机设置中以端到端方式拍摄的多视图 LDR 图像来重建 HDR 辐射场，并且具有快速的收敛速度。为了处理现实世界场景中的各种相机，我们引入了一个色调映射模块，该模块对相机内的数字成像管道 (ISP) 进行建模并解开辐射设置。我们的色调映射模块允许我们通过控制每个新视图的辐射设置来进行渲染。最后，我们构建了一个具有不同相机条件的多视图数据集，这符合我们的问题设置。我们的实验表明，HDR-Plenoxels 可以仅从带有各种相机的 LDR 图像中表达细节和高质量的 HDR 新颖视图。\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n  - [具有全局照明的可重新照明的新视图合成的神经辐射转移场](https://arxiv.org/abs/2207.13607) | [code]\n    > 给定场景的一组图像，从新颖的视图和光照条件重新渲染该场景是计算机视觉和图形学中一个重要且具有挑战性的问题。一方面，计算机视觉中的大多数现有作品通常对图像形成过程施加许多假设，例如直接照明和预定义的材料，使场景参数估计易于处理。另一方面，成熟的计算机图形学工具允许在给定所有场景参数的情况下对复杂的照片般逼真的光传输进行建模。结合这些方法，我们提出了一种通过学习神经预计算辐射传递函数来在新视图下重新点亮场景的方法，该函数使用新的环境图隐式处理全局光照效果。我们的方法可以在单一未知照明条件下对一组场景的真实图像进行单独监督。为了在训练期间消除任务的歧义，我们在训练过程中紧密集成了一个可微的路径跟踪器，并提出了合成 OLAT 和真实图像损失的组合。结果表明，与当前技术水平相比，场景参数的恢复解缠结得到了显着改善，因此，我们的重新渲染结果也更加真实和准确。\n## Previous weeks\n  - [野外的 NeRF：无约束照片集的神经辐射场, CVPR2021](https://arxiv.org/abs/2008.02268) | [code]\n    > 我们提出了一种基于学习的方法，用于仅使用野外照片的非结构化集合来合成复杂场景的新视图。我们建立在神经辐射场 (NeRF) 的基础上，它使用多层感知器的权重将场景的密度和颜色建模为 3D 坐标的函数。虽然 NeRF 在受控设置下捕获的静态对象的图像上效果很好，但它无法在不受控的图像中模拟许多普遍存在的真实世界现象，例如可变照明或瞬态遮挡物。我们为 NeRF 引入了一系列扩展来解决这些问题，从而能够从互联网上获取的非结构化图像集合中进行准确的重建。我们将我们的系统（称为 NeRF-W）应用于著名地标的互联网照片集，并展示时间一致的新颖视图渲染，这些渲染比现有技术更接近真实感。\n  - [Ha-NeRF：野外的幻觉神经辐射场, CVPR2022](https://rover-xingyu.github.io/Ha-NeRF/) | [***``[code]``***](https://github.com/rover-xingyu/Ha-NeRF)\n    > 神经辐射场 (NeRF) 最近因其令人印象深刻的新颖视图合成能力而广受欢迎。本文研究了幻觉 NeRF 的问题：即在一天中的不同时间从一组旅游图像中恢复一个真实的 NeRF。现有的解决方案采用具有可控外观嵌入的 NeRF 
在各种条件下渲染新颖的视图，但它们无法渲染具有看不见的外观的视图一致图像。为了解决这个问题，我们提出了一个用于构建幻觉 NeRF 的端到端框架，称为 Ha-NeRF。具体来说，我们提出了一个外观幻觉模块来处理随时间变化的外观并将它们转移到新的视图中。考虑到旅游图像的复杂遮挡，我们引入了一个反遮挡模块来准确地分解静态主体以获得可见性。合成数据和真实旅游照片集的实验结果表明，我们的方法可以产生幻觉，并从不同的视图呈现无遮挡的图像。\n  - [黑暗中的 NeRF：来自嘈杂原始图像的高动态范围视图合成, CVPR2022(oral)](https://bmild.github.io/rawnerf/) | [***``[code]``***](https://github.com/google-research/multinerf)\n    > 神经辐射场 (NeRF) 是一种从姿势输入图像的集合中合成高质量新颖视图的技术。与大多数视图合成方法一样，NeRF 使用色调映射低动态范围（LDR）作为输入；这些图像已由有损相机管道处理，该管道可以平滑细节、剪辑高光并扭曲原始传感器数据的简单噪声分布。我们将 NeRF 修改为直接在线性原始图像上进行训练，保留场景的完整动态范围。通过从生成的 NeRF 渲染原始输出图像，我们可以执行新颖的高动态范围 (HDR) 视图合成任务。除了改变相机视角之外，我们还可以在事后操纵焦点、曝光和色调映射。尽管单个原始图像看起来比后处理的图像噪声大得多，但我们表明 NeRF 对原始噪声的零均值分布具有高度鲁棒性。当针对许多嘈杂的原始输入 (25-200) 进行优化时，NeRF 生成的场景表示非常准确，以至于其渲染的新颖视图优于在相同宽基线输入图像上运行的专用单图像和多图像深度原始降噪器。因此，我们的方法（我们称为 RawNeRF）可以从在近黑暗中捕获的极其嘈杂的图像中重建场景。\n  - [NeRV：用于重新照明和视图合成的神经反射率和可见性场, CVPR2021](https://pratulsrinivasan.github.io/nerv/) | [code]\n    > 我们提出了一种方法，该方法将由不受约束的已知照明照明的场景的一组图像作为输入，并生成可以在任意照明条件下从新视点渲染的 3D 表示作为输出。我们的方法将场景表示为参数化为 MLP 的连续体积函数，其输入是 3D 位置，其输出是该输入位置的以下场景属性：体积密度、表面法线、材料参数、到任何方向上第一个表面交点的距离，以及任何方向的外部环境的可见性。总之，这些允许我们在任意照明下渲染物体的新视图，包括间接照明效果。预测的能见度和表面相交场对于我们的模型在训练期间模拟直接和间接照明的能力至关重要，因为先前工作使用的蛮力技术对于具有单灯的受控设置之外的照明条件是难以处理的。我们的方法在恢复可重新照明的 3D 场景表示方面优于替代方法，并且在对先前工作构成重大挑战的复杂照明设置中表现良好。\n  - [NeX：具有神经基础扩展的实时视图合成, CVPR2021(oral)](https://nex-mpi.github.io/) | [***``[code]``***](https://github.com/nex-mpi/nex-code/)\n    > 我们提出了 NeX，这是一种基于多平面图像 (MPI) 增强的新型视图合成的新方法，可以实时再现 NeXt 级别的视图相关效果。与使用一组简单 RGBα 平面的传统 MPI 不同，我们的技术通过将每个像素参数化为从神经网络学习的基函数的线性组合来模拟视图相关的效果。此外，我们提出了一种混合隐式-显式建模策略，该策略改进了精细细节并产生了最先进的结果。我们的方法在基准前向数据集以及我们新引入的数据集上进行了评估，该数据集旨在测试与视图相关的建模的极限，具有明显更具挑战性的效果，例如 CD 上的彩虹反射。我们的方法在这些数据集的所有主要指标上都取得了最好的总体得分，渲染时间比现有技术快 1000 倍以上。\n  - [NeRFactor：未知光照下形状和反射率的神经分解, TOG 2021 (Proc. 
SIGGRAPH Asia)](https://xiuming.info/projects/nerfactor/) | [code]\n    > 我们解决了从由一种未知光照条件照射的物体的多视图图像（及其相机姿势）中恢复物体的形状和空间变化反射率的问题。这使得能够在任意环境照明下渲染对象的新颖视图并编辑对象的材质属性。我们方法的关键，我们称之为神经辐射分解（NeRFactor），是提取神经辐射场（NeRF）的体积几何[Mildenhall et al。 2020] 将对象表示为表面表示，然后在解决空间变化的反射率和环境照明的同时联合细化几何。具体来说，NeRFactor 在没有任何监督的情况下恢复表面法线、光能见度、反照率和双向反射分布函数 (BRDF) 的 3D 神经场，仅使用重新渲染损失、简单的平滑先验和从真实数据中学习的数据驱动的 BRDF 先验-世界BRDF测量。通过显式建模光可见性，NeRFactor 能够从反照率中分离出阴影，并在任意光照条件下合成逼真的软阴影或硬阴影。 NeRFactor 能够恢复令人信服的 3D 模型，用于在合成场景和真实场景的这种具有挑战性且约束不足的捕获设置中进行自由视点重新照明。定性和定量实验表明，NeRFactor 在各种任务中都优于经典和基于深度学习的最新技术。我们的视频、代码和数据可在 people.csail.mit.edu/xiuming/projects/nerfactor/ 上找到。\n  - [Fig-NeRF：用于 3D 对象类别建模的图地面神经辐射场, 3DV2021](https://fig-nerf.github.io/) | [code]\n    > 我们研究使用神经辐射场 (NeRF) 从输入图像的集合中学习高质量的 3D 对象类别模型。与以前的工作相比，我们能够做到这一点，同时将前景对象与不同的背景分开。我们通过 2 分量 NeRF 模型 FiG-NeRF 实现了这一点，该模型更喜欢将场景解释为几何恒定的背景和代表对象类别的可变形前景。我们表明，这种方法可以仅使用光度监督和随意捕获的对象图像来学习准确的 3D 对象类别模型。此外，我们的两部分分解允许模型执行准确和清晰的模态分割。我们使用合成的、实验室捕获的和野外数据，通过视图合成和图像保真度指标对我们的方法进行定量评估。我们的结果证明了令人信服的 3D 对象类别建模，其性能超过了现有方法的性能。\n  - [NerfingMVS：室内多视角立体神经辐射场的引导优化, ICCV2021(oral)](https://arxiv.org/abs/2109.01129) | [***``[code]``***](https://github.com/weiyithu/NerfingMVS)\n    > 在这项工作中，我们提出了一种新的多视图深度估计方法，该方法在最近提出的神经辐射场 (NeRF) 上利用了传统的 SfM 重建和基于学习的先验。与现有的依赖于估计对应的基于神经网络的优化方法不同，我们的方法直接优化隐式体积，消除了在室内场景中匹配像素的挑战性步骤。我们方法的关键是利用基于学习的先验来指导 NeRF 的优化过程。我们的系统首先通过微调其稀疏 SfM 重建来适应目标场景上的单目深度网络。然后，我们证明了 NeRF 的形状-辐射模糊性仍然存在于室内环境中，并建议通过采用适应的深度先验来监控体绘制的采样过程来解决这个问题。最后，通过对渲染图像进行误差计算获得的每像素置信度图可用于进一步提高深度质量。实验表明，我们提出的框架在室内场景中显着优于最先进的方法，在基于对应的优化和基于 NeRF 的优化对适应深度先验的有效性方面提出了令人惊讶的发现。此外，我们表明引导优化方案不会牺牲神经辐射场的原始合成能力，提高了可见视图和新视图的渲染质量。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/others.md",
    "content": "\n每周分类神经辐射场 - others ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n===================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n  - [紧凑型神经辐射场的掩蔽小波表示](https://arxiv.org/abs/2212.09069) | [***``[code]``***](https://github.com/daniel03c1/masked_wavelet_nerf)\n    > 神经辐射场 (NeRF) 已经证明了神经渲染中基于坐标的神经表示（神经场或隐式神经表示）的潜力。 然而，使用多层感知器 (MLP) 来表示 3D 场景或对象需要大量的计算资源和时间。 最近有关于如何通过使用额外的数据结构（例如网格或树）来减少这些计算效率低下的研究。 尽管性能很有前途，但显式数据结构需要大量内存。 在这项工作中，我们提出了一种在不损害具有附加数据结构的优势的情况下减小大小的方法。 详细地说，我们建议在基于网格的神经场上使用小波变换。 基于网格的神经场是为了快速收敛，而其效率已经在高性能标准编解码器中得到证明的小波变换是为了提高网格的参数效率。 此外，为了在保持重建质量的同时实现更高的网格系数稀疏性，我们提出了一种新颖的可训练掩蔽方法。 实验结果表明，非空间网格系数，例如小波系数，能够获得比空间网格系数更高的稀疏度，从而产生更紧凑的表示。 通过我们提出的掩码和压缩管道，我们在 2 MB 的内存预算内实现了最先进的性能。 我们的代码可通过此 https 网址获得。\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n  - [4K-NeRF：超高分辨率下的高保真神经辐射场](https://arxiv.org/abs/2212.04701) | [***``[code]``***](https://github.com/frozoul/4K-NeRF)\n    > 在本文中，我们提出了一个新颖而有效的框架，名为 4K-NeRF，以神经辐射场 (NeRF) 的方法为基础，在超高分辨率的具有挑战性的场景中追求高保真视图合成。 基于 NeRF 的方法的渲染过程通常依赖于像素方式，在这种方式中，射线（或像素）在训练和推理阶段都被独立处理，限制了其描述细微细节的表现能力，尤其是在提升到极高的分辨率时。 我们通过更好地探索光线相关性来解决这个问题，以增强受益于使用几何感知局部上下文的高频细节。 特别是，我们使用视图一致编码器在较低分辨率空间中有效地建模几何信息，并通过视图一致解码器恢复精细细节，条件是编码器估计的光线特征和深度。 联合训练与基于补丁的采样进一步促进了我们的方法，将来自面向感知的正则化的监督纳入像素明智的损失之外。 与现代 NeRF 方法的定量和定性比较表明，我们的方法可以显着提高渲染质量以保留高频细节，在 4K 超高分辨率场景下实现最先进的视觉质量。 代码可在 \\url{this https URL}\n## Nov27 - Dec3, 2022\n  - [3D-TOGO：走向文本引导的跨类别 3D 对象生成, AAAI2023](https://arxiv.org/abs/2212.01103) | [code]\n    > 文本引导的 3D 对象生成旨在生成由用户定义的标题描述的 3D 
对象，这为可视化我们想象的内容铺平了道路。 尽管一些工作致力于解决这一具有挑战性的任务，但这些工作要么使用一些明确的 3D 表示（例如，网格），这些表示缺乏纹理并且需要后期处理来渲染照片般逼真的视图； 或者需要对每个案例进行单独耗时的优化。 在这里，我们首次尝试通过新的 3D-TOGO 模型实现通用文本引导的跨类别 3D 对象生成，该模型集成了文本到视图生成模块和视图到 3D 生成模块。 文本到视图生成模块旨在生成给定输入字幕的目标 3D 对象的不同视图。 提出了先验指导、标题指导和视图对比学习，以实现更好的视图一致性和标题相似性。 同时，views-to-3D 生成模块采用 pixelNeRF 模型，以从先前生成的视图中获取隐式 3D 神经表示。 我们的 3D-TOGO 模型以具有良好纹理的神经辐射场形式生成 3D 对象，并且不需要对每个单独的字幕进行时间成本优化。 此外，3D-TOGO可以通过输入的字幕控制生成的3D对象的类别、颜色和形状。 在最大的 3D 对象数据集（即 ABO）上进行了大量实验，以验证 3D-TOGO 可以根据 PSNR、SSIM、LPIPS 和 CLIP 等 98 个不同类别的输入字幕更好地生成高质量的 3D 对象。 得分，与文本 NeRF 和 Dreamfields 相比。\n  - [SinGRAF：学习单个场景的 3D 生成辐射场](https://arxiv.org/abs/2211.17260) | [code]\n    > 生成模型在合成逼真的 3D 对象方面显示出巨大的潜力，但它们需要大量的训练数据。 我们介绍了 SinGRAF，这是一种 3D 感知生成模型，使用单个场景的一些输入图像进行训练。 经过训练后，SinGRAF 会生成此 3D 场景的不同实现，在改变场景布局的同时保留输入的外观。 为此，我们以 3D GAN 架构的最新进展为基础，并在训练期间引入了一种新颖的渐进式补丁辨别方法。 通过几个实验，我们证明了 SinGRAF 产生的结果在质量和多样性方面都大大优于最接近的相关作品。\n  - [NeAF：学习用于点法线估计的神经角度场, AAAI2023](https://arxiv.org/abs/2211.16869) | [***``[code]``***](https://github.com/lisj575/NeAF)\n    > 非结构化点云的法线估计是 3D 计算机视觉中的一项重要任务。 当前的方法通过将局部补丁映射到法向量或使用神经网络学习局部表面拟合来取得令人鼓舞的结果。 然而，这些方法不能很好地推广到看不见的场景，并且对参数设置很敏感。 为了解决这些问题，我们提出了一个隐式函数来学习球坐标系中每个点法线周围的角度场，称为神经角度场（NeAF）。 我们不是直接预测输入点的法线，而是预测地面实况法线和随机采样的查询法线之间的角度偏移。 这种策略推动网络观察更多不同的样本，从而以更稳健的方式获得更高的预测精度。 为了在推理时从学习的角度场预测法线，我们在单位球形空间中随机采样查询向量，并将具有最小角度值的向量作为预测法线。 为了进一步利用 NeAF 学到的先验知识，我们建议通过最小化角度偏移来细化预测的法向量。 合成数据和真实扫描的实验结果显示，在广泛使用的基准下，与最先进的技术相比有了显着改进。\n  - [SNAF：具有神经衰减场的稀疏视图 CBCT 重建](https://arxiv.org/abs/2211.17048) | [code]\n    > 锥形束计算机断层扫描（CBCT）已广泛应用于临床实践，尤其是牙科诊所，而捕获时X射线的辐射剂量一直是CBCT成像中长期关注的问题。 已经提出了几项研究工作来从稀疏视图 2D 投影重建高质量的 CBCT 图像，但目前最先进的技术存在伪影和缺乏精细细节的问题。 在本文中，我们提出了通过学习神经衰减场来进行稀疏视图 CBCT 重建的 SNAF，我们发明了一种新颖的视图增强策略来克服稀疏输入视图数据不足带来的挑战。 我们的方法在高重建质量（30+ PSNR）方面实现了卓越的性能，只有 20 个输入视图（比临床收集少 25 倍），优于最先进的技术。 我们进一步进行了综合实验和消融分析，以验证我们方法的有效性。\n  - [从单目视频重建手持物体, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555401) | [code]\n    > 本文提出了一种从单目视频中重建手持物体的方法。 与许多最近通过训练有素的网络直接预测对象几何形状的方法相比，所提出的方法不需要任何关于对象的先验知识，并且能够恢复更准确和详细的对象几何形状。 
关键思想是手部运动自然地提供了对象的多个视图，并且可以通过手部姿势跟踪器可靠地估计该运动。 然后，可以通过解决多视图重建问题来恢复对象几何形状。 我们设计了一种基于隐式神经表示的方法来解决重建问题，并解决手部姿势估计不精确、手部相对运动和小物体的几何优化不足等问题。 我们还提供了一个新收集的具有 3D ground truth 的数据集来验证所提出的方法。 数据集和代码将发布在 https://dihuangdh.github.io/hhor。\n  - [一种轻松教授变形金刚多视图几何的方法](https://arxiv.org/abs/2211.15107) | [code]\n    > 变形金刚是强大的视觉学习者，这在很大程度上是因为它们明显缺乏手动指定的先验。 由于 3D 形状和视点的近乎无限可能的变化（需要灵活性）以及射影几何的精确性质（遵守刚性法则），这种灵活性在涉及多视图几何的任务中可能会出现问题。 为了解决这个难题，我们提出了一种“轻触”方法，引导视觉变形金刚学习多视图几何，但允许它们在需要时摆脱束缚。 我们通过使用极线来引导 Transformer 的交叉注意力图来实现这一点，惩罚极线外的注意力值并鼓励沿着这些线的更高注意力，因为它们包含几何上合理的匹配。 与以前的方法不同，我们的建议在测试时不需要任何相机姿势信息。 我们专注于姿势不变的对象实例检索，由于查询和检索图像之间的视点存在巨大差异，因此标准 Transformer 网络在这方面存在困难。 在实验上，我们的方法在对象检索方面优于最先进的方法，而且在测试时不需要姿势信息。\n  - [NeRF 在 360° 图像上的非均匀采样策略, BMVC2022](https://arxiv.org/abs/2212.03635) | [code]\n    > 近年来，随着神经辐射场 (NeRF) 的出现，使用透视图像进行新视图合成的性能得到了显着提高。 本研究提出了两种有效构建 360{\\textdegree} 全向图像 NeRF 的新技术。 由于ERP格式的360{\\textdegree}图像在高纬度地区存在空间畸变和360{\\textdegree}广视角的特点，NeRF的一般光线采样策略是无效的。 因此，NeRF 的视图合成精度有限，学习效率不高。 我们为 NeRF 提出了两种非均匀光线采样方案以适应 360{\\textdegree} 图像——失真感知光线采样和内容感知光线采样。 我们分别使用室内和室外场景的 Replica 和 SceneCity 模型创建了评估数据集 Synth360。 在实验中，我们表明我们的提议在准确性和效率方面都成功地构建了 360{\\textdegree} 图像 NeRF。 该提案广泛适用于 NeRF 的高级变体。 DietNeRF、AugNeRF 和 NeRF++ 结合所提出的技术进一步提高了性能。 此外，我们展示了我们提出的方法提高了 360{\\textdegree} 图像中真实世界场景的质量。 Synth360：这个 https 网址。\n## Nov20 - Nov26, 2022\n  - [通过神经渲染的无监督连续语义适应](https://arxiv.org/abs/2211.13969) | [code]\n    > 越来越多的应用程序依赖于数据驱动模型，这些模型被部署用于跨一系列场景的感知任务。由于训练和部署数据之间的不匹配，在新场景上调整模型对于获得良好性能通常至关重要。在这项工作中，我们研究了语义分割任务的持续多场景适应，假设在部署期间没有可用的地面实况标签，并且应该保持先前场景的性能。我们建议通过融合分割模型的预测，然后使用视图一致的渲染语义标签作为伪标签来调整模型，为每个场景训练一个语义 NeRF 网络。通过与分割模型的联合训练，Semantic-NeRF 模型有效地实现了 2D-3D 知识迁移。此外，由于其紧凑的尺寸，它可以存储在长期记忆中，随后用于从任意角度渲染数据以减少遗忘。我们在 ScanNet 上评估了我们的方法，我们的方法优于基于体素的基线和最先进的无监督域适应方法。\n  - [DiffusionSDF：有符号距离函数的条件生成模型](https://arxiv.org/abs/2211.13757) | [code]\n    > 概率扩散模型在图像合成、修复和文本到图像任务方面取得了最先进的结果。然而，它们仍处于生成复杂 3D 形状的早期阶段。这项工作提出了 DiffusionSDF，一种用于形状补全、单视图重建和真实扫描点云重建的生成模型。我们使用神经符号距离函数 (SDF) 作为我们的 3D 表示，通过神经网络参数化各种信号（例如，点云、2D 
图像）的几何形状。神经 SDF 是隐式函数，扩散它们相当于学习它们的神经网络权重的反转，我们使用自定义调制模块解决了这个问题。广泛的实验表明，我们的方法能够从部分输入进行现实的无条件生成和条件生成。这项工作将扩散模型的领域从学习 2D 显式表示扩展到 3D 隐式表示。\n  - [BAD-NeRF：束调整的去模糊神经辐射场](https://arxiv.org/abs/2211.12853) | [code]\n    > 神经辐射场 (NeRF) 最近受到了相当大的关注，因为它在给定一组姿势相机图像的情况下，在逼真的 3D 重建和新颖的视图合成方面具有令人印象深刻的能力。早期的工作通常假设输入图像质量很好。然而，图像退化（例如低光条件下的图像运动模糊）在现实场景中很容易发生，这将进一步影响 NeRF 的渲染质量。在本文中，我们提出了一种新颖的束调整去模糊神经辐射场 (BAD-NeRF)，它可以对严重的运动模糊图像和不准确的相机姿势具有鲁棒性。我们的方法对运动模糊图像的物理图像形成过程进行建模，并联合学习 NeRF 的参数并恢复曝光时间内的相机运动轨迹。在实验中，我们表明，通过直接对真实物理图像形成过程进行建模，BAD-NeRF 在合成数据集和真实数据集上都实现了优于先前工作的性能。\n  - [OReX：使用神经场从 Planner 横截面重建对象](https://arxiv.org/abs/2211.12886) | [code]\n    > 从平面横截面重建 3D 形状是一项受到医学成像和地理信息学等下游应用启发的挑战。输入是在空间平面的稀疏集合上完全定义的输入/输出指示函数，输出是指示函数对整个体积的插值。以前解决这个稀疏和病态问题的工作要么产生低质量的结果，要么依赖于额外的先验，例如目标拓扑、外观信息或输入法线方向。在本文中，我们介绍了 OReX，一种仅从切片重建 3D 形状的方法，以神经场作为插值先验。在输入平面上训练一个简单的神经网络以接收 3D 坐标并返回查询点的内部/外部估计。这个先验在诱导平滑性和自相似性方面很有用。这种方法的主要挑战是高频细节，因为神经先验过度平滑。为了缓解这种情况，我们提供了一种迭代估计架构和一种分层输入采样方案，鼓励从粗到精的训练，允许在后期阶段关注高频。此外，我们识别并分析了源自网格提取步骤的常见波纹状效果。我们通过调整输入输入/输出边界周围指示函数的空间梯度来缓解它，从根本上解决问题。\n## Nov13 - Nov19, 2022\n  - [大尺度室内场景实时全向漫游, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550340.3564222) | [code]\n    > 神经辐射场 (NeRF) 最近在新视图合成方面取得了令人瞩目的成果。然而，之前关于 NeRF 的工作主要集中在以对象为中心的场景。由于位置编码容量有限，它们在面向外的和大规模场景中会遭受明显的性能下降。为了缩小差距，我们以几何感知的方式探索辐射场。我们从从多个 360° 图像中学习的全向神经辐射场估计显式几何。依靠恢复的几何形状，我们使用自适应分而治之的策略来缩小和微调辐射场，进一步提高渲染速度和质量。基线之间的定量和定性比较说明了我们在大型室内场景中的主要性能，并且我们的系统支持实时 VR 漫游。\n  - [AligNeRF：通过对齐感知训练的高保真神经辐射场](https://arxiv.org/abs/2211.09682) | [code]\n    > 神经辐射场 (NeRF) 是将 3D 场景建模为连续函数的强大表示。尽管 NeRF 能够渲染具有视图相关效果的复杂 3D 场景，但很少有人致力于探索其在高分辨率设置中的局限性。具体来说，现有的基于 NeRF 的方法在重建高分辨率真实场景时面临着一些限制，包括大量的参数、未对齐的输入数据和过度平滑的细节。在这项工作中，我们对使用高分辨率数据训练 NeRF 进行了首次试点研究，并提出了相应的解决方案：1）将多层感知器（MLP）与卷积层结合，可以编码更多的邻域信息，同时减少参数总数； 2) 一种新的训练策略来解决由移动物体或小相机校准误差引起的未对准问题； 3）高频感知损失。我们的方法几乎是免费的，没有引入明显的训练/测试成本，而在不同数据集上的实验表明，与当前最先进的 NeRF 模型相比，它可以恢复更多的高频细节。项目页面：\\url{此 https URL。}\n  - [3DLatNav：导航用于语义感知 3D 对象操作的生成潜在空间](https://arxiv.org/abs/2211.09770) | [code]\n    > 3D 
生成模型最近成功地以点云的形式生成逼真的 3D 对象。然而，大多数模型在没有广泛的语义属性标签或其他参考点云的情况下不提供操纵组件对象部分的形状语义的可控性。此外，除了执行简单的潜在向量运算或插值的能力之外，还缺乏对 3D 形状的部分级语义如何在其相应的生成潜在空间中进行编码的理解。在本文中，我们提出了 3DLatNav；一种导航预训练生成潜在空间以实现 3D 对象的受控部分级语义操作的新方法。首先，我们提出了一种使用 3D 形状的潜在表示的部分级弱监督形状语义识别机制。然后，我们将该知识转移到预训练的 3D 对象生成潜在空间，以解开纠缠的嵌入，以线性子空间的形式表示对象组成部分的不同形状语义，尽管在训练期间部分级标签不可用。最后，我们利用那些已识别的子空间来表明，通过将所提出的框架应用于任何预训练的 3D 生成模型，可以实现可控的 3D 对象部分操作。通过两个新的定量指标来评估部分级操作的一致性和定位准确性，我们表明 3DLatNav 在识别编码 3D 对象的部分级形状语义的潜在方向方面优于现有的无监督潜在解缠结方法。通过对最先进的生成模型进行多项消融研究和测试，我们表明 3DLatNav 可以在输入点云上实现受控的部分级语义操作，同时保留对象的其他特征和真实性。\n  - [AsyncNeRF：从具有时间姿态函数的异步 RGB-D 序列中学习大规模辐射场](https://arxiv.org/abs/2211.07459) | [code]\n    > 大规模辐射场是用于智能交通应用（如自动驾驶或无人机送货）的有前途的测绘工具。但对于大型场景，由于感测范围有限，紧凑型同步 RGB-D 相机并不适用，使用单独的 RGB 和深度传感器不可避免地导致序列不同步。受最近不需要已知内在或外在参数的自校准辐射场训练方法的成功启发，我们提出了第一个自校准 RGB 和深度帧之间的不匹配的解决方案。我们利用重要的特定领域事实，即 RGB 和深度帧实际上是从同一轨迹采样的，并开发了一种称为时间-姿势函数的新型隐式网络。将它与大规模辐射场相结合会产生一种级联两个隐式表示网络的架构。为了验证其有效性，我们构建了一个多样化且逼真的数据集，涵盖各种 RGB-D 不匹配场景。通过对该数据集进行全面的基准测试，我们展示了我们的方法在不同场景中的灵活性以及优于适用的先前对应方法的卓越性能。代码、数据和模型将公开提供。\n## Nov6 - Nov12, 2022\n  - [NeXT：通过 Multi-skip Transformer 实现高质量的神经辐射场, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19824-3_5) | [***``[code]``***](https://github.com/Crishawy/NeXT)\n    > 神经辐射场 (NeRF) 方法通过神经网络表示场景，在新颖的视图合成方面表现出令人印象深刻的性能。然而，大多数现有的基于 NeRF 的方法（包括其变体）将每个样本点单独视为输入，同时忽略了来自相应射线的相邻样本点之间的内在关系，从而阻碍了重建性能。为了解决这个问题，我们探索了一种全新的方案，即 NeXT，引入了一个多跳跃变换器来捕获射线级查询中各个样本点之间的丰富关系。具体来说，提出了射线标记化以将每条射线表示为一系列点嵌入，并将其作为我们提出的 NeXT 的输入。这样，通过内置的自注意力机制捕获样本点之间的关系，以促进重建。此外，我们提出的 NeXT 可以很容易地与其他基于 NeRF 的方法结合，以提高它们的渲染质量。在三个数据集上进行的大量实验表明，NeXT 大大优于所有以前的最先进的工作。特别是，拟议的 NeXT 在 Blender 数据集上的 PSNR 超过了强大的 NeRF 基线 2.74 dB。该代码可在 https://github.com/Crishawy/NeXT 获得。\n  - [QRF：具有量子辐射场的隐式神经表示](https://arxiv.org/abs/2211.03418) | [code]\n    > 现实世界场景的逼真渲染对于包括混合现实 (MR) 和虚拟现实 (VR) 在内的广泛应用来说是一项巨大的挑战。神经网络长期以来一直在求解微分方程的背景下进行研究，之前已被引入作为照片级渲染的隐式表示。然而，使用经典计算的逼真渲染具有挑战性，因为它需要耗时的光线行进，并且由于维数灾难而遭受计算瓶颈。在本文中，我们提出了量子辐射场 
(QRF)，它集成了量子电路、量子激活函数和量子体积渲染，用于隐式场景表示。结果表明，QRF不仅发挥了量子计算速度快、收敛快、并行度高等优势，而且保证了体绘制的高质量。\n## Oct30 - Nov5, 2022\n  - [HyperSound：使用超网络生成音频信号的隐式神经表示](https://arxiv.org/abs/2211.01839) | [code]\n    > 隐式神经表征 (INR) 是一个快速发展的研究领域，它提供了表示多媒体信号的替代方法。 INR 最近的应用包括图像超分辨率、高维信号压缩或 3D 渲染。然而，这些解决方案通常侧重于视觉数据，将它们适应音频领域并非易事。此外，它需要为每个数据样本单独训练模型。为了解决这个限制，我们提出了 HyperSound，这是一种利用超网络为训练时看不见的音频信号生成 INR 的元学习方法。我们表明，我们的方法可以重建声波，其质量可与其他最先进的模型相媲美。\n  - [基于注意力的神经元胞自动机, NeurIPS2022](https://arxiv.org/abs/2211.01233) | [code]\n    > 元胞自动机 (CA) 最近的扩展结合了现代深度学习的关键思想，极大地扩展了它们的能力并催生了一个新的神经元元自动机 (NCA) 技术家族。受基于 Transformer 的架构的启发，我们的工作提出了一类新的基于注意力的 NCA，使用空间局部化但全局组织的自注意力方案形成。我们介绍了此类的一个实例，名为 Vision Transformer Cellular Automata (ViTCA)。我们展示了跨六个基准数据集的去噪自动编码的定量和定性结果，将 ViTCA 与 U-Net、基于 U-Net 的 CA 基线 (UNetCA) 和 Vision Transformer (ViT) 进行了比较。在比较配置为类似参数复杂性的架构时，ViTCA 架构在所有基准测试和几乎每个评估指标上都产生了卓越的性能。我们对 ViTCA 的各种架构配置进行了消融研究，分析了它对细胞状态的影响，并调查了它的归纳偏差。最后，我们通过线性探针在其聚合细胞状态隐藏表示上检查其学习表示，与我们的 U-Net、ViT 和 UNetCA 基线相比，平均产生更好的结果。\n## Oct23 - Oct29, 2022\n  - [NeX360：基于神经基础扩展的实时全方位视图合成, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9931981) | [code]\n    > 我们介绍了 NeX，这是一种基于多平面图像 (MPI) 增强的新颖视图合成的新方法，可以实时再现视图相关的效果。与传统的 MPI 不同，我们的技术将每个像素参数化为从神经网络学习的球形基函数的线性组合，以对视图相关的效果进行建模，并使用混合隐式-显式建模策略来改进精细细节。此外，我们还展示了 NeX 的扩展，它利用知识蒸馏来为无限 360 ∘ 场景训练多个 MPI。我们的方法在几个基准数据集上进行了评估：NeRF-Synthetic 数据集、Light Field 数据集、Real Forward-Facing 数据集、Space 数据集以及 Shiny，我们的新数据集包含更具挑战性的视图相关效果，例如彩虹反射在 CD 上。我们的方法在 PSNR、SSIM 和 LPIPS 上优于其他实时渲染方法，可以实时渲染无界 360 ∘ 场景。\n  - [NeRFPlayer：具有分解神经辐射场的可流式动态场景表示](https://arxiv.org/abs/2210.15947) | [code]\n    > 在 VR 中自由地在真实世界的 4D 时空空间中进行视觉探索一直是一项长期的追求。当仅使用几个甚至单个 RGB 相机来捕捉动态场景时，这项任务特别有吸引力。为此，我们提出了一个能够快速重建、紧凑建模和流式渲染的高效框架。首先，我们建议根据时间特征分解 4D 时空空间。 4D 空间中的点与属于三个类别的概率相关联：静态区域、变形区域和新区域。每个区域都由一个单独的神经场表示和规范化。其次，我们提出了一种基于混合表示的特征流方案，用于有效地对神经场进行建模。我们的方法，创造了 NeRFPlayer，在单手持相机和多相机阵列捕获的动态场景上进行评估，在质量和速度方面实现与最近最先进的方法相当或更优的渲染性能，实现重建每帧 10 秒，实时渲染。\n  - [Vox-Fusion：基于体素的神经隐式表示的密集跟踪和映射](https://arxiv.org/abs/2210.15858) | 
[***``[code]``***](https://github.com/zju3dv/Vox-Fusion)\n    > 在这项工作中，我们提出了一个名为 Vox-Fusion 的密集跟踪和映射系统，它将神经隐式表示与传统的体积融合方法无缝融合。我们的方法受到最近开发的隐式映射和定位系统的启发，并进一步扩展了这一思想，使其可以自由应用于实际场景。具体来说，我们利用基于体素的神经隐式表面表示来编码和优化每个体素内的场景。此外，我们采用基于八叉树的结构来划分场景并支持动态扩展，使我们的系统能够像以前的作品一样在不知道环境的情况下跟踪和映射任意场景。此外，我们提出了一个高性能的多进程框架来加速该方法，从而支持一些需要实时性能的应用程序。评估结果表明，我们的方法可以实现比以前的方法更好的准确性和完整性。我们还展示了我们的 Vox-Fusion 可用于增强现实和虚拟现实应用程序。我们的源代码可通过此 https 网址公开获得。\n## Oct16 - Oct22, 2022\n  - [将多维天气和气候数据压缩到神经网络中](https://arxiv.org/abs/2210.12538) | [code]\n    > 天气和气候模拟会产生数 PB 的高分辨率数据，研究人员随后会对这些数据进行分析，以了解气候变化或恶劣天气。我们提出了一种压缩这种多维天气和气候数据的新方法：训练基于坐标的神经网络以过度拟合数据，并将生成的参数作为原始基于网格的数据的紧凑表示。虽然压缩比范围从 300 倍到超过 3,000 倍，但我们的方法在加权 RMSE、MAE 方面优于最先进的压缩器 SZ3。它可以忠实地保存重要的大型大气结构，并且不引入人工制品。当使用生成的神经网络作为 790 倍压缩数据加载器来训练 WeatherBench 预测模型时，其 RMSE 增加不到 2%。三个数量级的压缩使高分辨率气候数据的访问民主化，并实现了许多新的研究方向。\n  - [具有超分辨声向的神经声场分解](https://arxiv.org/abs/2210.12345) | [code]\n    > 声场分解使用来自有限数量麦克风的信号作为输入来预测任意方向的波形。声场分解是下游任务的基础，包括源定位、源分离和空间音频再现。传统的声场分解方法（例如 Ambisonics）具有有限的空间分解分辨率。本文提出了一种基于学习的神经声场分解 (NeSD) 框架，允许使用来自任意位置的几个麦克风的麦克风胶囊的录音进行具有精细空间方向分辨率的声场分解。 NeSD 系统的输入包括麦克风信号、麦克风位置和查询的方向。 NeSD 的输出包括波形和查询位置的存在概率。我们分别用不同的神经网络对 NeSD 系统进行建模，包括全连接、时间延迟和循环神经网络。我们表明，NeSD 系统在语音、音乐和声音事件数据集的声场分解和源定位方面优于传统的 Ambisonics 和 DOANet 方法。此 https URL 提供了演示。\n## Oct9 - Oct15, 2022\n  - [神经过程的连续条件视频合成](https://arxiv.org/abs/2210.05810) | [***``[code]``***](https://github.com/NPVS/NPVS)\n    > 我们为多个条件视频合成任务提出了一个统一模型，包括视频预测和视频帧插值。我们表明，条件视频合成可以表述为一个神经过程，它将输入时空坐标映射到给定上下文时空坐标和像素值的目标像素值。具体来说，我们将坐标的隐式神经表示馈送到基于 Transformer 的非自回归条件视频合成模型中。我们的任务特定模型优于以前在多个数据集上进行视频插值的工作，并与最先进的视频预测模型具有竞争力的性能。重要的是，该模型能够以任意高帧速率进行插值或预测，即连续合成。我们的源代码可在此 https 网址上找到。\n  - [面向 DIBR 的视图合成的几何翘曲误差感知 CNN, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547946) | [code]\n    > 基于深度图像渲染（DIBR）的面向视图合成是一种重要的虚拟视图生成技术。它根据深度图将参考视图图像扭曲到目标视点，而不需要许多可用的视点。然而，在 3D 翘曲过程中，像素被翘曲到分数像素位置，然后四舍五入（或插值）到整数像素，导致几何翘曲错误并降低图像质量。这在某种程度上类似于图像超分辨率问题，但具有不固定的小数像素位置。为了解决这个问题，我们提出了一个几何翘曲误差感知 CNN (GWEA) 框架来增强面向 DIBR 的视图合成。首先，利用 DIBR 
模块中保留的几何翘曲误差，开发了一种基于可变形卷积的几何翘曲误差感知对齐 (GWEA-DCA) 模块。在可变形卷积中学习的偏移量可以解释几何翘曲误差，以促进从小数像素到整数像素的映射。此外，鉴于翘曲图像中的像素由于翘曲误差的强度不同而具有不同的质量，进一步开发了注意力增强视图混合（GWEA-AttVB）模块，以自适应地融合来自不同翘曲图像的像素。最后，基于部分卷积的空洞填充和细化模块填充剩余的空洞并提高整体图像的质量。实验表明，我们的模型可以合成比现有方法更高质量的图像，并且还进行了消融研究，验证了每个提出的模块的有效性。\n## Oct2 - Oct8, 2022\n  - [ViewFool：评估视觉识别对对抗性观点的鲁棒性, NeurIPS2022](https://arxiv.org/abs/2210.03895) | [code]\n    > 最近的研究表明，视觉识别模型对分布变化缺乏鲁棒性。然而，目前的工作主要考虑模型对 2D 图像转换的鲁棒性，而较少探索 3D 世界中的视点变化。一般来说，视点变化在各种实际应用（例如自动驾驶）中很普遍，因此评估视点鲁棒性势在必行。在本文中，我们提出了一种称为 ViewFool 的新方法来寻找误导视觉识别模型的对抗性视点。通过将现实世界中的物体编码为神经辐射场 (NeRF)，ViewFool 在熵正则化器下表征了不同对抗视点的分布，这有助于处理真实相机姿态的波动并减轻真实物体与其神经之间的现实差距申述。实验验证了常见的图像分类器极易受到生成的对抗性视点的影响，这也表现出很高的跨模型可迁移性。基于 ViewFool，我们引入了 ImageNet-V，这是一种新的分布外数据集，用于对图像分类器的视点鲁棒性进行基准测试。对具有不同架构、目标函数和数据增强的 40 个分类器的评估结果显示，在 ImageNet-V 上进行测试时模型性能显着下降，这为利用 ViewFool 作为一种有效的数据增强策略来提高视点鲁棒性提供了可能性。\n  - [用于手术记录的新视图合成](https://link.springer.com/chapter/10.1007/978-3-031-18576-2_7) | [code]\n    > 在手术室记录手术是医疗教育和评估的基本任务之一。然而，由于目标在手术过程中被医生或护士的头部或手严重遮挡，因此难以记录描绘手术的区域。我们使用了一个记录系统，该系统在手术灯中嵌入了多个摄像头，假设至少有一个摄像头正在无遮挡地记录目标。在本文中，我们提出 Conditional-BARF (C-BARF) 通过合成来自相机的新颖视图图像来生成无遮挡图像，旨在生成具有平滑相机姿态转换的视频。据我们所知，这是第一个解决从手术场景的多个图像合成新颖视图图像的问题的工作。我们使用三种不同类型手术的原始数据集进行实验。我们的实验表明，我们可以成功地从嵌入在手术灯中的多个摄像头记录的图像中合成新的视图。\n  - [用于自监督入住预测的可区分光线投射, ECCV2022](https://arxiv.org/abs/2210.01917) | [***``[code]``***](https://github.com/tarashakhurana/emergent-occ-forecasting)\n    > 安全自动驾驶的运动规划需要了解自我车辆周围的环境如何随时间演变。场景中可驱动区域的以自我为中心的感知不仅随着环境中演员的运动而变化，而且随着自我车辆本身的运动而变化。为大规模规划（例如以自我为中心的自由空间）提出的自我监督表示混淆了这两种运动，使得该表示难以用于下游运动规划器。在本文中，我们使用几何占用作为自由空间等依赖于视图的表示的自然替代方案。占用图自然地将环境的运动与自我车辆的运动分开。然而，人们无法直接观察场景的完整 3D 占用情况（由于遮挡），因此难以用作学习信号。我们的主要见解是使用可微分光线投射将未来占用预测“渲染”到未来的 LiDAR 扫描预测中，这可以与自监督学习的地面实况扫描进行比较。可微光线投射的使用允许占用率作为预测网络中的内部表示出现。在没有地面实况占用的情况下，我们定量评估了光线投射 LiDAR 扫描的预测，并显示了多达 15 个 F1 点的改进。对于下游运动规划器，紧急占用可以直接用于引导不可驱动区域，与以自由空间为中心的运动规划器相比，这种表示相对减少了高达 17% 的物体碰撞次数。\n  - [用于新视图合成的自我改进多平面到层图像, WACV2023](https://samsunglabs.github.io/MLI/) | [***``[code]``***](https://github.com/SamsungLabs/MLI)\n 
   > 我们提出了一种用于轻量级小说视图合成的新方法，该方法可以推广到任意前向场景。最近的方法在计算上很昂贵，需要逐场景优化，或者产生内存昂贵的表示。我们首先用一组正面平行的半透明平面来表示场景，然后以端到端的方式将它们转换为可变形层。此外，我们采用前馈细化程序，通过聚合来自输入视图的信息来纠正估计的表示。我们的方法在处理新场景时不需要微调，并且可以不受限制地处理任意数量的视图。实验结果表明，我们的方法在常用指标和人工评估方面超过了最近的模型，在推理速度和推断分层几何的紧凑性方面具有显着优势，请参阅此 https URL\n  - [NARF22：用于配置感知渲染的神经铰接辐射场, IROS2022](https://progress.eecs.umich.edu/projects/narf/) | [code]\n    > 铰接物体对机器人的感知和操作提出了独特的挑战。它们增加的自由度数量使得定位等任务在计算上变得困难，同时也使得现实世界数据集收集的过程无法扩展。为了解决这些可扩展性问题，我们提出了神经铰接辐射场 (NARF22)，这是一个使用完全可微分、配置参数化神经辐射场 (NeRF) 作为提供铰接对象高质量渲染的方法的管道。 NARF22 在推理时不需要明确了解对象结构。我们提出了一种两阶段的基于部件的训练机制，即使底层训练数据只有一个配置表示，它也允许对象渲染模型在配置空间中很好地泛化。我们通过在通过 Fetch 移动操作机器人收集的真实关节工具数据集上训练可配置渲染器来展示 NARF22 的功效。我们通过配置估计和 6 自由度姿态细化任务展示了该模型对基于梯度的推理方法的适用性。项目网页位于：此 https URL。\n  - [SinGRAV：从单个自然场景中学习生成辐射量](https://arxiv.org/abs/2210.01202) | [code]\n    > 我们提出了一个用于一般自然场景的 3D 生成模型。由于缺乏表征目标场景的必要 3D 数据量，我们建议从单个场景中学习。我们的关键见解是，一个自然场景通常包含多个组成部分，其几何、纹理和空间排列遵循一些清晰的模式，但在同一场景中的不同区域仍然表现出丰富的变化。这表明将生成模型的学习本地化在大量局部区域上。因此，我们利用具有空间局部性偏差的多尺度卷积网络来学习单个场景中多个尺度的局部区域的统计信息。与现有方法相比，我们的学习设置绕过了从许多同质 3D 场景中收集数据以学习共同特征的需要。我们创造了我们的方法 SinGRAV，用于从单个自然场景中学习生成辐射体积。我们展示了 SinGRAV 从单个场景生成合理多样的变化的能力，SingGRAV 相对于最先进的生成神经场景方法的优点，以及 SinGRAV 在各种应用中的多功能性，涵盖 3D 场景编辑、合成和动画。代码和数据将被发布以促进进一步的研究。\n  - [IntrinsicNeRF：学习用于可编辑新视图合成的内在神经辐射场](https://arxiv.org/abs/2210.00647) | [***``[code]``***](https://github.com/zju3dv/IntrinsicNeRF)\n    > 我们提出了被称为 IntrinsicNeRF 的内在神经辐射场，它将内在分解引入到基于 NeRF 的~\\cite{mildenhall2020nerf} 神经渲染方法中，并且可以在现有的逆向渲染结合神经渲染方法的同时在房间规模的场景中执行可编辑的新视图合成~ \\cite{zhang2021physg, zhang2022modeling} 只能用于特定对象的场景。鉴于内在分解本质上是一个模棱两可且约束不足的逆问题，我们提出了一种新颖的距离感知点采样和自适应反射率迭代聚类优化方法，该方法使具有传统内在分解约束的 IntrinsicNeRF 能够以无监督的方式进行训练，从而在时间上一致的内在分解结果。为了解决场景中相似反射率的不同相邻实例被错误地聚集在一起的问题，我们进一步提出了一种从粗到细优化的层次聚类方法，以获得快速的层次索引表示。它支持引人注目的实时增强现实应用，例如场景重新着色、材质编辑和照明变化。 Blender 对象和副本场景的大量实验表明，即使对于具有挑战性的序列，我们也可以获得高质量、一致的内在分解结果和高保真新视图合成。项目网页上提供了代码和数据：此 https 网址。\n## Sep25 - Oct1, 2022\n  - [SCI：用于生物医学数据的频谱集中隐式神经压缩](https://arxiv.org/abs/2209.15180) | [code]\n    > 
海量医疗数据的海量采集和爆炸式增长，需要有效压缩以实现高效存储、传输和共享。现成的视觉数据压缩技术已被广泛研究，但针对自然图像/视频量身定制，因此在具有不同特征的医学数据上表现出有限的性能。新兴的隐式神经表示 (INR) 正在获得动力，并展示了以特定于目标数据的方式拟合各种视觉数据的高前景，但迄今为止还没有涵盖各种医疗数据的通用压缩方案。为了解决这个问题，我们首先对 INR 的频谱集中特性进行了数学解释，并对面向压缩的 INR 架构的设计进行了分析洞察。此外，我们设计了一个漏斗形神经网络，能够覆盖广泛的复杂医疗数据并实现高压缩比。在此设计的基础上，我们在给定预算下通过优化进行压缩，并提出了一种自适应压缩方法SCI，该方法将目标数据自适应地划分为与所采用的INR的集中频谱包络匹配的块，并在给定压缩比下分配具有高表示精度的参数.实验表明 SCI 优于传统技术的性能以及在各种医学数据中的广泛适用性。\n  - [从图像对中提取样式以进行全局正向和反向色调映射, CVMP2022](https://arxiv.org/abs/2209.15165) | [code]\n    > 许多图像增强或编辑操作，例如正向和反向色调映射或颜色分级，没有唯一的解决方案，而是有一系列解决方案，每个解决方案代表不同的风格。尽管如此，现有的基于学习的方法试图学习一个独特的映射，而忽略了这种风格。在这项工作中，我们展示了有关风格的信息可以从图像对的集合中提取并编码为 2 维或 3 维向量。这不仅为我们提供了有效的表示，而且为编辑图像样式提供了可解释的潜在空间。我们将一对图像之间的全局颜色映射表示为自定义归一化流，以像素颜色的多项式为条件。我们表明，这样的网络在低维空间中编码图像风格方面比 PCA 或 VAE 更有效，并且让我们获得接近 40 dB 的准确度，这比现有技术提高了大约 7-10 dB方法。\n  - [迈向多时空尺度广义 PDE 建模](https://arxiv.org/abs/2209.15616) | [code]\n    > 偏微分方程 (PDE) 是描述复杂物理系统模拟的核心。他们昂贵的解决方案技术引起了人们对基于深度神经网络的代理的兴趣增加。然而，训练这些代理人的实际效用取决于他们模拟复杂的多尺度时空现象的能力。已经提出了各种神经网络架构来针对此类现象，最着名的是傅里叶神经算子（FNO），它通过不同傅里叶模式的参数化对局部\\和全局空间信息进行自然处理，以及通过以下方式处理局部和全局信息的 U-Nets下采样和上采样路径。然而，跨不同方程参数或不同时间尺度的泛化仍然是一个挑战。在这项工作中，我们对涡流和速度函数形式的流体力学问题的各种 FNO 和 U-Net 方法进行了全面比较。对于 U-Net，我们从计算机视觉中转移了最近的架构改进，最显着的是来自对象分割和生成建模。我们进一步分析了使用 FNO 层来提高 U-Net 架构的性能而不显着降低计算性能的设计考虑因素。最后，我们展示了使用单个代理模型泛化到不同 PDE 参数和时间尺度的有希望的结果。\n  - [时间相关 PDE 的隐式神经空间表示](https://arxiv.org/abs/2210.00124) | [code]\n    > 数值求解偏微分方程 (PDE) 通常需要空间和时间离散化。传统方法（例如，有限差分、有限元、平滑粒子流体动力学）经常采用显式空间离散化，例如网格、网格和点云，其中每个自由度对应于空间中的一个位置。虽然这些明确的空间对应对于建模和理解来说是直观的，但这些表示对于准确性、内存使用或适应性而言不一定是最佳的。在这项工作中，我们探索隐式神经表示作为替代空间离散化，其中空间信息隐式存储在神经网络权重中。通过隐式神经空间表示，受 PDE 约束的时间步长转化为更新神经网络权重，它自然地与常用的优化时间积分器集成。我们通过涉及大弹性变形、湍流流体和多尺度现象的示例验证了我们在各种经典 PDE 上的方法。虽然计算速度比传统表示慢，但我们的方法表现出更高的准确性、更低的内存消耗和动态自适应分配的自由度，而无需复杂的重新划分网格。\n  - [具有隐式神经表示的连续 PDE 动态预测](https://arxiv.org/abs/2209.14855) | [code]\n    > 有效的数据驱动 PDE 预测方法通常依赖于固定的空间和/或时间离散化。这增加了现实世界应用的限制，例如需要在任意时空位置进行灵活外推的天气预报。我们通过引入一种新的数据驱动方法 DINo 来解决这个问题，该方法使用空间连续函数的连续时间动态对 PDE 的流进行建模。这是通过在由学习的 ODE 
时间驱动的小潜在空间中通过隐式神经表示独立于其离散化嵌入空间观察来实现的。这种对时间和空间的分离和灵活处理使 DINo 成为第一个结合以下优点的数据驱动模型。它在任意空间和时间位置外推；它可以从稀疏的不规则网格或流形中学习；在测试时，它会推广到新的网格或分辨率。在代表性 PDE 系统的各种具有挑战性的泛化场景中，DINo 的表现优于替代神经 PDE 预测器。\n  - [面向多边形几何的通用表示学习, GeoInformatica](https://arxiv.org/abs/2209.15458) | [code]\n    > 空间数据的神经网络表示学习是地理人工智能 (GeoAI) 问题的普遍需求。近年来，在点、折线和网络的表示学习方面取得了许多进展，而在多边形，尤其是复杂的多边形几何形状方面进展甚微。在这项工作中，我们专注于开发一种通用的多边形编码模型，该模型可以将多边形几何体（有或没有孔，单面或多面体）编码到嵌入空间中。结果嵌入可以直接用于（或微调）下游任务，例如形状分类、空间关系预测等。为了实现模型的泛化性保证，我们确定了一些理想的属性：循环原点不变性、平凡顶点不变性、部分置换不变性和拓扑感知。我们探索了两种不同的编码器设计：一种是在空间域中派生所有表示；另一个利用谱域表示。对于空间域方法，我们提出了 ResNet1D，这是一种基于 CNN 的 1D 多边形编码器，它使用圆形填充来实现简单多边形上的循环原点不变性。对于谱域方法，我们开发了基于非均匀傅里叶变换 (NUFT) 的 NUFTspec，它自然地满足了所有所需的属性。我们对两个任务进行了实验：1）基于MNIST的形状分类； 2）基于两个新数据集——DBSR-46K和DBSR-cplx46K的空间关系预测。我们的结果表明，NUFTspec 和 ResNet1D 的性能优于多个现有的基线，具有显着的优势。虽然 ResNet1D 在形状不变几何修改后模型性能下降，但由于 NUFT 的性质，NUFTspec 对这些修改非常稳健。\n  - [通过控制屏障功能和神经辐射场增强基于视觉的控制器的安全性](https://arxiv.org/abs/2209.12266) | [code]\n    > 为了在复杂的环境中导航，机器人必须越来越多地使用高维视觉反馈（例如图像）进行控制。然而，依靠高维图像数据做出控制决策会引发重要问题；特别是，我们如何证明视觉反馈控制器的安全性？控制障碍函数 (CBF) 是在状态反馈设置中验证反馈控制器安全性的强大工具，但由于需要预测未来的观察结果以评估障碍函数，CBF 传统上不太适合视觉反馈控制.在这项工作中，我们利用神经辐射场 (NeRFs) 的最新进展来解决这个问题，神经辐射场 (NeRFs) 学习 3D 场景的隐式表示并可以从以前看不见的相机视角渲染图像，为基于 CBF 的单步视觉预测提供控制器。这种新颖的组合能够过滤掉不安全的行为并进行干预以保护安全。我们在实时模拟实验中展示了我们的控制器的效果，它成功地防止了机器人采取危险行动。\n  - [WaterNeRF：水下场景的神经辐射场](https://arxiv.org/abs/2209.13091) | [code]\n    > 水下成像是海洋机器人执行的一项关键任务，其应用范围广泛，包括水产养殖、海洋基础设施检查和环境监测。然而，水柱效应，例如衰减和反向散射，会极大地改变水下捕获图像的颜色和质量。由于不同的水条件和这些影响的范围依赖性，恢复水下图像是一个具有挑战性的问题。这会影响下游感知任务，包括深度估计和 3D 重建。在本文中，我们推进了神经辐射场 (NeRF) 的最新技术，以实现基于物理的密集深度估计和颜色校正。我们提出的方法 WaterNeRF 估计了基于物理的水下图像形成模型的参数，从而产生了混合数据驱动和基于模型的解决方案。在确定场景结构和辐射场后，我们可以生成退化和校正的水下图像的新视图，以及场景的密集深度。我们在真实的水下数据集上定性和定量地评估所提出的方法。\n## Sep18 - Sep24, 2022\n  - [感觉怎么样？ 用于越野车辆可穿越性的自我监督成本图学习](https://arxiv.org/abs/2209.10788) | [code]\n    > 估计越野环境中的地形可穿越性需要推理机器人与这些地形之间的复杂交互动力学。然而，对于这些交互，构建准确的物理模型或创建信息标签以有监督的方式学习模型具有挑战性。我们提出了一种方法，该方法通过以自我监督的方式将外部感知环境信息与本体感知地形交互反馈相结合来学习预测可遍历性成本图。此外，我们提出了一种将机器人速度纳入成本图预测管道的新方法。我们在具有挑战性的越野地形的大型自主全地形车 
(ATV) 上的多个短距离和大规模导航任务中验证了我们的方法，并证明了在单独的大型地面机器人上易于集成。我们的短尺度导航结果表明，使用我们学习的成本图可以使导航整体更顺畅，并为机器人提供对机器人与不同地形类型（如草地和砾石）之间相互作用的更细粒度的理解。我们的大规模导航试验表明，在 400 米到 3150 米的具有挑战性的越野路线中，与基于占用的导航基线相比，我们可以将干预次数减少多达 57%。\n  - [wildNeRF：使用稀疏单目数据捕获的野外动态场景的完整视图合成](https://arxiv.org/abs/2209.10399) | [code]\n    > 我们提出了一种新的神经辐射模型，该模型可以以自我监督的方式进行训练，用于动态非结构化场景的新视图合成。我们的端到端可训练算法可在几秒钟内学习高度复杂的真实静态场景，并在几分钟内学习具有刚性和非刚性运动的动态场景。通过区分静态像素和以运动为中心的像素，我们从一组稀疏的图像中创建高质量的表示。我们对现有基准进行了广泛的定性和定量评估，并在具有挑战性的 NVIDIA 动态场景数据集上设置了最先进的性能指标。此外，我们在具有挑战性的现实世界数据集（例如 Cholec80 和 SurgicalActions160）上评估我们的模型性能。\n  - [密度感知 NeRF 集成：量化神经辐射场中的预测不确定性](https://arxiv.org/abs/2209.08718) | [code]\n    > 我们表明，如果考虑到密度感知认知不确定性项，则集成有效地量化了神经辐射场 (NeRFs) 中的模型不确定性。在先前的工作中研究的朴素集成只是简单地平均渲染的 RGB 图像，以量化由观察到的场景的相互矛盾的解释引起的模型不确定性。相比之下，由于缺乏关于训练期间未观察到的场景部分的知识，我们还考虑了沿单个射线的终止概率来识别认知模型的不确定性。我们在已建立的 NeRF 不确定性量化基准中实现了新的最先进的性能，优于需要对 NeRF 架构和训练机制进行复杂更改的方法。我们进一步证明了 NeRF 不确定性可用于次佳视图选择和模型细化。\n  - [LATITUDE：在城市规模的 NeRF 中使用截断动态低通滤波器进行机器人全局定位, ICRA2023](https://arxiv.org/abs/2209.08498) | [***``[code]``***](https://github.com/jike5/LATITUDE)\n    > 神经辐射场 (NeRFs) 在表示具有高分辨率细节和高效内存的复杂 3D 场景方面取得了巨大成功。然而，当前基于 NeRF 的姿态估计器没有初始姿态预测，并且在优化过程中容易出现局部最优。在本文中，我们提出了 LATITUDE：使用截断动态低通滤波器进行全局定位，它在城市规模的 NeRF 中引入了两阶段定位机制。在位置识别阶段，我们通过训练后的 NeRF 生成的图像训练回归器，为全局定位提供初始值。在姿态优化阶段，我们通过直接优化切平面上的姿态来最小化观察图像和渲染图像之间的残差。为了避免收敛到局部最优，我们引入了截断动态低通滤波器 (TDLF) 用于从粗到细的姿态配准。我们在合成数据和真实世界数据上评估我们的方法，并展示其在大规模城市场景中高精度导航的潜在应用。代码和数据将在此 https 网址上公开提供。\n  - [医学影像分割的隐式神经表示, MICCAI2022](https://link.springer.com/chapter/10.1007/978-3-031-16443-9_42) | [code]\n    > 医学成像中的 3D 信号（例如 CT 扫描）通常被参数化为体素的离散网格。例如，现有的最先进的器官分割方法学习离散的分割图。不幸的是，这些方法的内存需求随着空间分辨率的增加而呈立方增长，这使得它们不适合处理高分辨率扫描。为了克服这个问题，我们设计了一个隐式器官分割网络 (IOSNet)，它利用连续的隐式神经表示并具有几个有用的属性。首先，IOSNet 解码器内存大致恒定且独立于空间分辨率，因为它将分割图参数化为连续函数。其次，IOSNet 的收敛速度比基于离散体素的方法快得多，因为它能够准确地分割器官而不受器官大小的影响，从而在不需要任何辅助技巧的情况下缓解大小不平衡问题。第三，由于其连续学习表示，IOSNet 自然支持超分辨率（即在推理过程中以任意分辨率采样）。此外，尽管使用了一个简单的轻量级解码器，IOSNet 始终优于离散专业分割架构 UNet。因此，我们的方法表明隐式神经表示非常适合医学成像应用，尤其是处理高分辨率 3D 医学扫描。\n## Sep11 - Sep17, 2022\n  - 
[DevNet：通过密度体积构建的自监督单目深度学习, ECCV2022](https://arxiv.org/abs/2209.06351) | [code]\n    > 单目图像的自监督深度学习通常依赖于时间相邻图像帧之间的 2D 像素级光度关系。然而，它们既没有充分利用 3D 逐点几何对应，也没有有效地解决由遮挡或照明不一致引起的光度翘曲的模糊性。为了解决这些问题，这项工作提出了密度体积构建网络 (DevNet)，这是一种新颖的自我监督单目深度学习框架，可以考虑 3D 空间信息，并利用相邻相机平截头体之间更强的几何约束。我们的 DevNet 不是直接从单个图像中回归像素值，而是将相机平截头体划分为多个平行平面，并预测每个平面上的逐点遮挡概率密度。最终的深度图是通过沿相应光线对密度进行积分来生成的。在训练过程中，引入了新的正则化策略和损失函数来减轻光度模糊和过拟合。在没有明显扩大模型参数大小或运行时间的情况下，DevNet 在 KITTI-2015 室外数据集和 NYU-V2 室内数据集上都优于几个具有代表性的基线。特别是，在深度估计任务中，KITTI-2015 和 NYU-V2 上的 DevNet 的均方根偏差降低了约 4%。此 https 网址提供了代码。\n  - [学习用于视图合成的统一 3D 点云](https://arxiv.org/abs/2209.05013) | [code]\n    > 基于 3D 点云表示的视图合成方法已证明是有效的。然而，现有方法通常仅从单个源视图合成新视图，并且将它们泛化以处理多个源视图以追求更高的重建质量并非易事。在本文中，我们提出了一种新的基于深度学习的视图合成范式，它从不同的源视图中学习统一的 3D 点云。具体来说，我们首先通过根据深度图将源视图投影到 3D 空间来构建子点云。然后，我们通过自适应融合子点云联合上定义的局部邻域中的点来学习统一的 3D 点云。此外，我们还提出了一个 3D 几何引导图像恢复模块来填充孔洞并恢复渲染新视图的高频细节。三个基准数据集的实验结果表明，我们的方法在数量上和视觉上都在很大程度上优于最先进的视图合成方法。\n  - [用于稀疏视图计算机断层扫描的自监督坐标投影网络](https://arxiv.org/abs/2209.05483) | [code]\n    > 在目前的工作中，我们提出了一种自监督坐标投影网络（SCOPE），通过解决逆断层扫描成像问题，从单个 SV 正弦图重建无伪影的 CT 图像。与最近使用隐式神经表示网络 (INR) 解决类似问题的相关工作相比，我们的重要贡献是一种有效且简单的重投影策略，该策略将断层扫描图像重建质量提高到有监督的深度学习 CT 重建工作之上。所提出的策略受到线性代数和逆问题之间简单关系的启发。为了求解欠定线性方程组，我们首先引入INR，通过图像连续性先验来约束解空间并获得粗解。其次，我们建议生成密集视图正弦图，提高线性方程组的秩并产生更稳定的 CT 图像解空间。我们的实验结果表明，重投影策略显着提高了图像重建质量（PSNR 至少 +3 dB）。此外，我们将最近的哈希编码集成到我们的 SCOPE 模型中，这极大地加速了模型训练。最后，我们在并行和扇形 X 射线束 SVCT 重建任务中评估 SCOPE。实验结果表明，所提出的 SCOPE 模型在数量和质量上都优于两种最新的基于 INR 的方法和两种流行的监督 DL 方法。\n  - [CU-Net：高效的点云颜色上采样网络](https://arxiv.org/abs/2209.06112) | [code]\n    > 增强现实、虚拟现实和远程呈现场景需要点云上采样。尽管几何上采样被很好地研究以致密点云坐标，但颜色的上采样在很大程度上被忽略了。在本文中，我们提出了第一个深度学习点云颜色上采样模型 CU-Net。利用基于稀疏卷积的特征提取器和基于神经隐函数的颜色预测模块，CU-Net 实现了线性时间和空间复杂度。因此，理论上保证 CU-Net 比大多数具有二次复杂度的现有方法更有效。实验结果表明，CU-Net 可以实时为具有近百万个点的照片般逼真的点云着色，同时具有比基线更好的视觉质量。此外，CU-Net 可以适应任意的上采样率和看不见的对象。我们的源代码将很快向公众发布。\n## Sep4 - Sep10, 2022\n  - [具有深度神经表示的隐式全波形反演](https://arxiv.org/abs/2209.03525) | [code]\n    > 
全波形反演（FWI）通常代表最先进的地下结构和物理参数成像方法，然而，其实施通常面临巨大挑战，例如建立一个良好的初始模型以摆脱局部最小值，以及评估反演结果的不确定性。在本文中，我们提出了使用连续和隐式定义的深度神经表示的隐式全波形反演（IFWI）算法。与对初始模型敏感的 FWI 相比，IFWI 受益于深度学习优化增加的自由度，从而允许从随机初始化开始，这大大降低了非唯一性和陷入局部最小值的风险。理论和实验分析均表明，在给定随机初始模型的情况下，IFWI 能够收敛到全局最小值，并生成具有精细结构的地下高分辨率图像。此外，IFWI 的不确定性分析可以很容易地通过使用各种深度学习方法近似贝叶斯推理来执行，本文通过添加 dropout 神经元对其进行分析。此外，IFWI具有一定的鲁棒性和较强的泛化能力，在各种二维地质模型的实验中得到了体现。通过适当的设置，IFWI也可以很好地适用于多尺度联合地球物理反演。\n## Aug28 - Sep3, 2022\n  - [FoV-NeRF：虚拟现实的中心凹神经辐射场, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9872532) | [code]\n    > 随着消费者显示器和商业 VR 平台的兴起，虚拟现实 (VR) 正变得无处不在。这种显示需要低延迟和高质量的合成图像渲染，同时减少计算开销。神经渲染的最新进展表明，有望通过基于图像的虚拟或物理环境表示来解锁 3D 计算机图形的新可能性。具体来说，神经辐射场 (NeRF) 表明，可以在不损失与视图相关的效果的情况下实现 3D 场景的照片般逼真的质量和连续视图变化。虽然 NeRF 可以显着受益于 VR 应用的渲染，但它面临着由高视场、高分辨率和立体/以自我为中心的观看带来的独特挑战，通常会导致渲染图像的低质量和高延迟。在 VR 中，这不仅会损害交互体验，还可能导致疾病。为了解决 VR 中的六自由度、以自我为中心和立体 NeRF 的这些问题，我们提出了第一个注视条件 3D 神经表示和视图合成方法。我们将视觉和立体敏锐度的人类心理物理学纳入 3D 风景的以自我为中心的神经表示中。然后，我们共同优化延迟/性能和视觉质量，同时相互桥接人类感知和神经场景合成，以实现感知上高质量的沉浸式交互。我们进行了客观分析和主观研究，以评估我们方法的有效性。我们发现我们的方法显着减少了延迟（与 NeRF 相比减少了高达 99% 的时间），而不会损失高保真渲染（在感知上与全分辨率地面实况相同）。所提出的方法可能是迈向未来实时捕捉、传送和可视化远程环境的 VR/AR 系统的第一步。\n  - [克隆：用于占用网格辅助神经表示的相机-激光雷达融合](https://arxiv.org/abs/2209.01194) | [code]\n    > 本文提出了 CLONeR，它通过允许对从稀疏输入传感器视图观察到的大型户外驾驶场景进行建模，显着改进了 NeRF。这是通过将 NeRF 框架内的占用和颜色学习解耦为分别使用 LiDAR 和相机数据训练的单独的多层感知器 (MLP) 来实现的。此外，本文提出了一种在 NeRF 模型旁边构建可微分 3D 占用网格图 (OGM) 的新方法，并利用此占用网格改进沿射线的点采样，以在度量空间中进行体积渲染。\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n## Aug7 - Aug13, 2022\n  - [HyperTime：时间序列的隐式神经表示](https://arxiv.org/abs/2208.05836) | [code]\n    > 隐式神经表示 (INR) 最近已成为一种强大的工具，可提供准确且与分辨率无关的数据编码。它们作为通用逼近器的鲁棒性已在各种数据源中得到证明，并应用于图像、声音和 3D 场景表示。然而，很少有人关注利用这些架构来表示和分析时间序列数据。在本文中，我们使用 INR 分析时间序列的表示，比较不同的激活函数在重建精度和训练收敛速度方面。我们展示了如何利用这些网络对时间序列进行插补，以及在单变量和多变量数据上的应用。最后，我们提出了一种利用 INR 来学习整个时间序列数据集的压缩潜在表示的超网络架构。我们引入了基于 FFT 的损失来指导训练，以便在时间序列中保留所有频率。我们展示了该网络可用于将时间序列编码为 INR，并且可以对它们的嵌入进行插值以从现有的时间序列中生成新的时间序列。我们通过将其用于数据增强来评估我们的生成方法，并表明它与当前最先进的时间序列增强方法具有竞争力。\n  - 
[NIDN：纳米结构的神经逆向设计](https://arxiv.org/abs/2208.05480) | [code]\n    > 近十年来，计算工具已成为材料设计的核心，以降低成本实现快速开发周期。机器学习工具在光子学领域尤其兴起。然而，从优化的角度来看，设计所需的麦克斯韦方程的反演特别具有挑战性，需要复杂的软件。我们提出了一种创新的开源软件工具，称为纳米结构的神经逆向设计 (NIDN)，它允许使用基于物理的深度学习方法设计复杂的堆叠材料纳米结构。我们执行基于梯度的神经网络训练，而不是无导数或数据驱动的优化或学习方法，在这种训练中，我们根据其光谱特性直接优化材料及其结构。 NIDN 支持两种不同的求解器，严格的耦合波分析和有限差分时域方法。 NIDN 的实用性和有效性在几个合成示例以及 1550 nm 滤光片和抗反射涂层的设计中得到了证明。结果与实验基线、其他模拟工具和所需的光谱特性相匹配。鉴于其在网络架构和 Maxwell 求解器方面的完全模块化以及开源、许可的可用性，NIDN 将能够支持广泛应用中的计算材料设计过程。\n  - [使用隐式神经表示的蒙特卡罗去噪](https://oaktrust.library.tamu.edu/handle/1969.1/196567) | [code]\n    > Monte Carlo 路径追踪是计算机图形学中流行的 3D 渲染技术，但它通常需要在图像中的噪声量和计算时间之间进行代价高昂的权衡。因此，尝试“平滑”噪声图像是有用的，通常通过在样本之间构建新数据或对图像应用过滤器。在这项工作中，我们研究了训练神经网络以将固定视点场景的亮度隐式表示为连续函数的可行性。我们使用多层感知器网络实现神经网络，并在由离线 Monte Carlo 渲染器生成的稀疏采样图像上对其进行训练。该训练数据使用图像平面上每个样本的 (x, y) 坐标作为输入，并将样本的 RGB 颜色作为输出。此外，我们为网络提供第一条光线交点的表面法线、深度和反照率，作为像素坐标旁边的额外输入。这些额外的输入维度通过帮助网络考虑深度、法线和漫反射颜色的变化来提高隐式表示的质量。一旦网络在稀疏采样的场景上得到训练，我们就可以对每个像素的网络进行多次密集采样，以创建最终的去噪图像。我们发现该网络可以在具有柔和照明和光泽反射的场景中快速学习和去噪图像，并且只需少量训练即可轻松处理深度、正常和漫反射颜色的不连续性。\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n  - [DoF-NeRF：景深与神经辐射场相遇, ACMMM2022](https://arxiv.org/pdf/2208.00945) | [***``[code]``***](https://github.com/zijinwuzijin/DoF-NeRF)\n    > 神经辐射场 (NeRF) 及其变体在表示 3D 场景和合成逼真的新颖视图方面取得了巨大成功。但是，它们通常基于针孔相机模型并假设全焦点输入。这限制了它们的适用性，因为从现实世界捕获的图像通常具有有限的景深 (DoF)。为了缓解这个问题，我们引入了 DoF-NeRF，一种新颖的神经渲染方法，可以处理浅自由度输入并可以模拟自由度效果。特别是，它根据几何光学原理扩展了 NeRF 以模拟镜头的孔径。这样的物理保证允许 DoF-NeRF 操作具有不同焦点配置的视图。得益于显式光圈建模，DoF-NeRF 还可以通过调整虚拟光圈和焦点参数来直接操纵 DoF 效果。它是即插即用的，可以插入到基于 NeRF 的框架中。在合成数据集和真实世界数据集上的实验表明，DoF-NeRF 不仅在全焦点设置中的性能与 NeRF 相当，而且还可以合成以浅自由度输入为条件的全焦点新视图。还演示了 DoF-NeRF 在 DoF 渲染中的一个有趣应用。\n  - [神经密度-距离场, ECCV2022](https://arxiv.org/abs/2207.14455) | [***``[code]``***](https://ueda0319.github.io/neddf/)\n    > 神经领域在 3D 视觉任务中的成功现在是无可争辩的。遵循这一趋势，已经提出了几种针对视觉定位的方法（例如，SLAM）来使用神经场估计距离或密度场。然而，仅通过基于密度场的方法（例如神经辐射场 (NeRF)）很难实现高定位性能，因为它们在大多数空白区域中不提供密度梯度。另一方面，基于距离场的方法，例如神经隐式表面 (NeuS)，在对象的表面形状方面存在局限性。本文提出了神经密度-距离场 (NeDDF)，这是一种新的 3D 
表示，它相互约束距离和密度场。我们将距离场公式扩展到没有明确边界表面的形状，例如毛皮或烟雾，这使得从距离场到密度场的显式转换成为可能。通过显式转换实现的一致距离和密度场既能保证初始值的鲁棒性，又能实现高质量的配准。此外，场之间的一致性允许从稀疏点云快速收敛。实验表明，NeDDF 可以实现高定位性能，同时在新颖的视图合成上提供与 NeRF 相当的结果。该代码可在此 https URL 获得。\n  - [通过 NeRF Attention 进行端到端视图合成](https://arxiv.org/abs/2207.14741) | [code]\n    > 在本文中，我们提出了一个用于视图合成的简单 seq2seq 公式，其中我们将一组光线点作为输入和输出与光线相对应的颜色。在这个 seq2seq 公式上直接应用标准转换器有两个限制。首先，标准注意力不能成功地适应体积渲染过程，因此合成视图中缺少高频分量。其次，将全局注意力应用于所有光线和像素是非常低效的。受神经辐射场 (NeRF) 的启发，我们提出了 NeRF 注意力 (NeRFA) 来解决上述问题。一方面，NeRFA 将体积渲染方程视为软特征调制过程。通过这种方式，特征调制增强了具有类似 NeRF 电感偏置的变压器。另一方面，NeRFA 执行多阶段注意力以减少计算开销。此外，NeRFA 模型采用光线和像素转换器来学习光线和像素之间的相互作用。 NeRFA 在四个数据集上展示了优于 NeRF 和 NerFormer 的性能：DeepVoxels、Blender、LLFF 和 CO3D。此外，NeRFA 在两种设置下建立了新的 state-of-the-art：单场景视图合成和以类别为中心的新颖视图合成。该代码将公开发布。\n  - [神经链：从多视图图像中学习头发的几何形状和外观, ECCV2022](https://arxiv.org/pdf/2207.14067) | [***``[code]``***](https://radualexandru.github.io/neural_strands/)\n    > 我们提出了 Neural Strands，这是一种新颖的学习框架，用于从多视图图像输入中对精确的头发几何形状和外观进行建模。学习的头发模型可以从具有高保真视图相关效果的任何视点实时渲染。与体积模型不同，我们的模型实现了直观的形状和样式控制。为了实现这些特性，我们提出了一种基于神经头皮纹理的新型头发表示，该神经头皮纹理对每个纹素位置的单个股线的几何形状和外观进行编码。此外，我们引入了一种基于学习发束光栅化的新型神经渲染框架。我们的神经渲染是精确的和抗锯齿的，使渲染视图一致且逼真。将外观与多视图几何先验相结合，我们首次实现了从多视图设置中联合学习外观和显式头发几何形状。我们展示了我们的方法在各种发型的保真度和效率方面的有效性。\n  - [拉普拉斯系统的神经格林函数, Computer & Graphics](https://www.sciencedirect.com/science/article/pii/S0097849322001406) | [code]\n    > 求解源自拉普拉斯算子的线性方程组是广泛应用的核心。由于线性系统的稀疏性，当解具有大量自由度时，通常采用迭代求解器，例如共轭梯度和多重网格。这些迭代求解器可以看作是拉普拉斯算子格林函数的稀疏近似。在本文中，我们提出了一种机器学习方法，该方法从边界条件中回归格林函数。这是通过格林函数实现的，该函数可以以多尺度方式有效地表示，从而大大降低了与密集矩阵表示相关的成本。此外，由于格林函数完全依赖于边界条件，因此训练所提出的神经网络不需要对线性系统的右侧进行采样。结果表明，我们的方法优于最先进的共轭梯度和多重网格方法。\n  - [关于物理概念的可学习性：神经网络能理解什么是真](https://arxiv.org/abs/2207.12186) | [code]\n    > 鉴于深度神经网络生成逼真的合成数据的卓越能力，我们重新审视了经典的信号到符号障碍。 DeepFakes 
和欺骗突出了物理现实与其抽象表示之间联系的脆弱性，无论是由数字计算机还是生物代理学习。从一个广泛适用的抽象概念定义开始，我们表明标准的前馈架构只能捕获微不足道的概念，无论权重的数量和训练数据的数量如何，尽管它们是非常有效的分类器。另一方面，包含递归的架构可以代表更大的概念类别，但可能仍然无法从有限的数据集中学习它们。我们定性地描述了可以被用随机梯度下降变体训练的现代架构“理解”的概念类别，使用（自由能）拉格朗日来测量信息复杂性。然而，即使一个概念已经被理解，网络也无法将其理解传达给外部代理，除非通过持续的交互和验证。然后，我们将物理对象表征为抽象概念，并使用前面的分析来表明物理对象可以由有限架构编码。然而，为了理解物理概念，传感器必须提供持续令人兴奋的观察，而控制数据采集过程的能力是必不可少的（主动感知）。控制的重要性取决于形式，比听觉或化学感知更有益于视觉。最后，我们得出结论，可以在有限的时间内用有限的资源将物理实体绑定到数字身份，原则上解决了信号到符号的障碍问题，但我们强调了持续验证的必要性。\n## Previous weeks\n  - [﻿Plenoxels：没有神经网络的辐射场, CVPR2022(oral)](https://arxiv.org/abs/2112.05131) | [***``[code]``***](https://alexyu.net/plenoxels)\n    > 我们介绍了 Plenoxels（全光体素），一种用于照片级真实视图合成的系统。 Plenoxels 将场景表示为具有球谐函数的稀疏 3D 网格。这种表示可以通过梯度方法和正则化从校准图像中优化，而无需任何神经组件。在标准的基准任务中，Plenoxels 的优化速度比神经辐射场快两个数量级，而视觉质量没有损失。\n  - [城市辐射场, CVPR2022](https://urban-radiance-fields.github.io/) | [code]\n    > 这项工作的目标是从扫描平台捕获的数据中执行 3D 重建和新颖的视图合成，这些平台通常用于城市户外环境（例如街景）中的世界地图绘制。给定一系列由相机和扫描仪在户外场景中移动获得的 RGB 图像序列和激光雷达扫描，我们生成了一个模型，可以从中提取 3D 表面并合成新的 RGB 图像。我们的方法扩展了神经辐射场，该方法已被证明可以在受控环境中为小场景合成逼真的新颖图像，以及利用异步捕获的激光雷达数据、解决捕获图像之间的曝光变化以及利用预测的图像分割来监督密度的新方法在指向天空的光线上。这三个扩展中的每一个都在街景数据的实验中提供了显着的性能改进。与传统方法（例如~COLMAP）和最近的神经表示（例如~Mip-NeRF）相比，我们的系统产生最先进的 3D 表面重建并合成更高质量的新视图。\n  - [NeRF：将场景表示为用于视图合成的神经辐射场, ECCV2020](https://arxiv.org/abs/2003.08934) | [***``[code]``***](http://tancik.com/nerf)\n    > 我们提出了一种方法，该方法通过使用稀疏输入视图集优化底层连续体积场景函数，实现了合成复杂场景的新视图的最新结果。我们的算法使用全连接（非卷积）深度网络表示场景，其输入是单个连续 5D 坐标（空间位置（x,y,z）和观察方向（θ,φ）），其输出是该空间位置的体积密度和与视图相关的发射辐射。我们通过沿相机光线查询 5D 坐标来合成视图，并使用经典的体渲染技术将输出颜色和密度投影到图像中。因为体积渲染是自然可微的，所以优化我们的表示所需的唯一输入是一组具有已知相机姿势的图像。我们描述了如何有效地优化神经辐射场以渲染具有复杂几何形状和外观的场景的逼真的新颖视图，并展示了优于先前在神经渲染和视图合成方面的工作的结果。查看合成结果最好以视频形式观看，因此我们敦促读者观看我们的补充视频以进行令人信服的比较。\n  - [野外的 NeRF：无约束照片集的神经辐射场, CVPR2021](https://arxiv.org/abs/2008.02268) | [code]\n    > 我们提出了一种基于学习的方法，用于仅使用野外照片的非结构化集合来合成复杂场景的新视图。我们建立在神经辐射场 (NeRF) 的基础上，它使用多层感知器的权重将场景的密度和颜色建模为 3D 坐标的函数。虽然 NeRF 在受控设置下捕获的静态对象的图像上效果很好，但它无法在不受控的图像中模拟许多普遍存在的真实世界现象，例如可变照明或瞬态遮挡物。我们为 NeRF 引入了一系列扩展来解决这些问题，从而能够从互联网上获取的非结构化图像集合中进行准确的重建。我们将我们的系统（称为 
NeRF-W）应用于著名地标的互联网照片集，并展示时间一致的新颖视图渲染，这些渲染比现有技术更接近真实感。\n  - [Ha-NeRF：野外的幻觉神经辐射场, CVPR2022](https://rover-xingyu.github.io/Ha-NeRF/) | [***``[code]``***](https://github.com/rover-xingyu/Ha-NeRF)\n    > 神经辐射场 (NeRF) 最近因其令人印象深刻的新颖视图合成能力而广受欢迎。本文研究了幻觉 NeRF 的问题：即在一天中的不同时间从一组旅游图像中恢复一个真实的 NeRF。现有的解决方案采用具有可控外观嵌入的 NeRF 在各种条件下渲染新颖的视图，但它们无法渲染具有看不见的外观的视图一致图像。为了解决这个问题，我们提出了一个用于构建幻觉 NeRF 的端到端框架，称为 Ha-NeRF。具体来说，我们提出了一个外观幻觉模块来处理随时间变化的外观并将它们转移到新的视图中。考虑到旅游图像的复杂遮挡，我们引入了一个反遮挡模块来准确地分解静态主体以获得可见性。合成数据和真实旅游照片集的实验结果表明，我们的方法可以产生幻觉，并从不同的视图呈现无遮挡的图像。\n  - [Nerfies：可变形的神经辐射场, ICCV2021](https://arxiv.org/abs/2011.12948) | [code]\n    > 我们提出了第一种能够使用从手机随便捕获的照片/视频来逼真地重建可变形场景的方法。我们的方法通过优化一个额外的连续体积变形场来增强神经辐射场 (NeRF)，该场将每个观察点扭曲成一个规范的 5D NeRF。我们观察到这些类似 NeRF 的变形场容易出现局部最小值，并为基于坐标的模型提出了一种从粗到细的优化方法，可以实现更稳健的优化。通过将几何处理和物理模拟的原理应用于类似 NeRF 的模型，我们提出了变形场的弹性正则化，进一步提高了鲁棒性。我们表明，我们的方法可以将随意捕获的自拍照片/视频转换为可变形的 NeRF 模型，允许从任意视角对主体进行逼真的渲染，我们称之为“nerfies”。我们通过使用带有两部手机的装备收集时间同步数据来评估我们的方法，从而在不同视点产生相同姿势的训练/验证图像。我们表明，我们的方法忠实地重建了非刚性变形的场景，并以高保真度再现了看不见的视图。\n  - [D-NeRF：动态场景的神经辐射场, CVPR2021](https://arxiv.org/abs/2011.13961) | [***``[code]``***](https://github.com/albertpumarola/D-NeRF)\n    > 将机器学习与几何推理相结合的神经渲染技术已成为从一组稀疏图像中合成场景新视图的最有前途的方法之一。其中，神经辐射场 (NeRF) 尤为突出，它训练深度网络将 5D 输入坐标（表示空间位置和观察方向）映射为体积密度和与视图相关的发射辐射。然而，尽管在生成的图像上实现了前所未有的真实感水平，但 NeRF 仅适用于静态场景，其中可以从不同的图像中查询相同的空间位置。在本文中，我们介绍了 D-NeRF，这是一种将神经辐射场扩展到动态域的方法，允许在场景中移动的 \\emph{single} 相机的刚性和非刚性运动下重建和渲染物体的新图像。为此，我们将时间视为系统的附加输入，并将学习过程分为两个主要阶段：一个将场景编码为规范空间，另一个将这个规范表示映射到特定时间的变形场景。两种映射都是使用全连接网络同时学习的。一旦网络经过训练，D-NeRF 就可以渲染新颖的图像，同时控制相机视图和时间变量，从而控制对象的移动。我们展示了我们的方法在物体处​​于刚性、关节和非刚性运动的场景中的有效性。代码、模型权重和动态场景数据集将发布。\n  - [用于单目 4D 面部头像重建的动态神经辐射场, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > 我们提出了用于模拟人脸外观和动态的动态神经辐射场。对说话的人进行数字建模和重建是各种应用程序的关键组成部分。特别是对于 AR 或 VR 
中的远程呈现应用，需要忠实再现外观，包括新颖的视点或头部姿势。与显式建模几何和材料属性或纯粹基于图像的最先进方法相比，我们引入了基于场景表示网络的头部隐式表示。为了处理面部的动态，我们将场景表示网络与低维可变形模型相结合，该模型提供对姿势和表情的显式控制。我们使用体积渲染从这种混合表示中生成图像，并证明这种动态神经场景表示只能从单目输入数据中学习，而不需要专门的捕获设置。在我们的实验中，我们表明这种学习的体积表示允许生成照片般逼真的图像，其质量超过了基于视频的最先进的重演方法的质量。\n  - [PVA：像素对齐的体积化身, CVPR2021](https://volumetric-avatars.github.io/) | [code]\n    > 逼真的人头的采集和渲染是一个极具挑战性的研究问题，对于虚拟远程呈现特别重要。目前，最高质量是通过在多视图数据上以个人特定方式训练的体积方法实现的。与更简单的基于网格的模型相比，这些模型更好地表示精细结构，例如头发。体积模型通常使用全局代码来表示面部表情，以便它们可以由一小组动画参数驱动。虽然这样的架构实现了令人印象深刻的渲染质量，但它们不能轻易地扩展到多身份设置。在本文中，我们设计了一种新颖的方法，用于在仅给定少量输入的情况下预测人头的体积化身。我们通过一种新颖的参数化实现跨身份的泛化，该参数化将神经辐射场与直接从输入中提取的局部像素对齐特征相结合，从而避免了对非常深或复杂网络的需求。我们的方法仅基于光度重新渲染损失以端到端的方式进行训练，无需明确的 3D 监督。我们证明我们的方法在质量方面优于现有的现有技术，并且能够生成忠实的面部表情多身份设置。\n  - [用于人体建模的动画神经辐射场, ICCV2021](https://zju3dv.github.io/animatable_nerf/) | [***``[code]``***](https://github.com/zju3dv/animatable_nerf)\n    > 本文解决了从多视图视频中重建可动画人体模型的挑战。最近的一些工作提出将非刚性变形场景分解为规范神经辐射场和一组将观察空间点映射到规范空间的变形场，从而使他们能够从图像中学习动态场景。然而，它们将变形场表示为平移矢量场或 SE(3) 场，这使得优化受到高度约束。此外，这些表示不能由输入运动明确控制。相反，我们引入了神经混合权重场来产生变形场。基于骨架驱动的变形，混合权重场与 3D 人体骨骼一起使用，以生成观察到规范和规范到观察的对应关系。由于 3D 人体骨骼更易观察，它们可以规范变形场的学习。此外，学习到的混合权重场可以与输入的骨骼运动相结合，以生成新的变形场来为人体模型设置动画。实验表明，我们的方法明显优于最近的人类合成方法。该代码将在 https://zju3dv.github.io/animatable_nerf/ 上提供。\n  - [NeRF++：分析和改进神经辐射场](https://arxiv.org/abs/2010.07492) | [***``[code]``***](https://github.com/Kai-46/nerfplusplus;)\n    > 神经辐射场 (NeRF) 为各种捕捉设置实现了令人印象深刻的视图合成结果，包括有界场景的 360 度捕捉以及有界和无界场景的前向捕捉。 NeRF 将表示视图不变不透明度和视图相关颜色体积的多层感知器 (MLP) 拟合到一组训练图像，并基于体积渲染技术对新视图进行采样。在这份技术报告中，我们首先评论了辐射场及其潜在的模糊性，即形状-辐射模糊度，并分析了 NeRF 在避免这种模糊性方面的成功。其次，我们解决了将 NeRF 应用于大规模、无界 3D 场景中对象的 360 度捕获所涉及的参数化问题。我们的方法在这种具有挑战性的场景中提高了视图合成保真度。此 https 网址提供了代码。\n  - [动态场景的神经场景图, CVPR2021(oral)](https://arxiv.org/abs/2011.10379) | [***``[code]``***](https://github.com/princeton-computational-imaging/neural-scene-graphs)\n    > 最近的隐式神经渲染方法表明，可以通过仅由一组 RGB 
图像监督的预测其体积密度和颜色来学习复杂场景的准确视图合成。然而，现有方法仅限于学习将所有场景对象编码为单个神经网络的静态场景的有效表示，并且缺乏将动态场景表示和分解为单个场景对象的能力。在这项工作中，我们提出了第一个将动态场景分解为场景图的神经渲染方法。我们提出了一种学习的场景图表示，它对对象变换和辐射进行编码，以有效地渲染场景的新颖排列和视图。为此，我们学习隐式编码的场景，并结合联合学习的潜在表示来描述具有单个隐式函数的对象。我们在合成和真实汽车数据上评估所提出的方法，验证我们的方法学习动态场景 - 仅通过观察该场景的视频 - 并允许渲染具有看不见的对象集的新颖场景组合的新颖照片般逼真的视图看不见的姿势。\n  - [使用隐式场景表示进行就地场景标记和理解, ICCV2021(oral)](https://shuaifengzhi.com/Semantic-NeRF/) | [***``[code]``***](https://github.com/Harry-Zhi/semantic_nerf/)\n    > 语义标签与几何和辐射重建高度相关，因为具有相似形状和外观的场景实体更有可能来自相似的类别。最近的隐式神经重建技术很有吸引力，因为它们不需要事先的训练数据，但同样的完全自我监督的方法对于语义来说是不可能的，因为标签是人类定义的属性。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/pose-slam.md",
    "content": "\n每周分类神经辐射场 - pose-slam ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n======================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n  - [用于相机重定位的快速轻量级场景回归器](https://arxiv.org/abs/2212.01830) | [***``[code]``***](https://github.com/aislab/feat2map)\n    > 涉及先前 3D 重建的相机重定位在许多混合现实和机器人应用中起着至关重要的作用。 对于一些存储和/或通信带宽有限的应用程序，直接根据预建 3D 模型估计相机姿势可能非常昂贵。 尽管最近的场景和绝对姿态回归方法在有效的相机定位方面变得流行，但它们中的大多数都是计算资源密集型的，并且难以获得具有高精度约束的实时推理。 本研究提出了一种简单的场景回归方法，只需要一个多层感知器网络来映射场景坐标，即可实现准确的相机姿态估计。 所提出的方法使用稀疏描述符来回归场景坐标，而不是密集的 RGB 图像。 使用稀疏特征有几个优点。 首先，拟议的回归网络比以前的研究报告的要小得多。 这使我们的系统高效且可扩展。 其次，预建的 3D 模型提供了最可靠和稳健的 2D-3D 匹配。 因此，向它们学习可以导致对等效特征的认识并显着提高泛化性能。 提供了对我们的方法的详细分析和使用现有数据集的广泛评估，以支持所提出的方法。 可在此 https URL 获取实施细节\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n  - [RUST：来自未定图像的潜在神经场景表示](https://arxiv.org/abs/2211.14306) | [code]\n    > 从 2D 观察中推断 3D 场景的结构是计算机视觉中的一项基本挑战。最近流行的基于神经场景表示的方法已经取得了巨大的影响，并已应用于各种应用程序。这个领域剩下的主要挑战之一是训练一个单一的模型，它可以提供潜在的表示，有效地泛化到单个场景之外。 Scene Representation Transformer (SRT) 在这个方向上显示出希望，但将其扩展到更大的不同场景集是具有挑战性的，并且需要准确定位的地面实况数据。为了解决这个问题，我们提出了 RUST（Really Unposed Scene representation Transformer），这是一种仅在 RGB 图像上训练的新颖视图合成的无姿势方法。我们的主要见解是，可以训练一个姿势编码器，它可以窥视目标图像并学习潜在姿势嵌入，解码器将其用于视图合成。我们对学习到的潜在姿势结构进行了实证研究，并表明它允许有意义的测试时间相机转换和准确的显式姿势读出。或许令人惊讶的是，RUST 实现了与获得完美相机姿势的方法相似的质量，从而释放了大规模训练摊销神经场景表示的潜力。\n  - [ActiveRMAP：用于主动映射和规划的辐射场](https://arxiv.org/abs/2211.12656) | [code]\n    > 通过离线/在线映射方法，可以从一组 2D 图像中对场景进行高质量的 3D 
重建。在本文中，我们从隐式表示的角度探索主动映射，最近在各种应用中产生了令人信服的结果。最流行的隐式表示之一——神经辐射场 (NeRF)，首先展示了使用多层感知器的照片级真实感渲染结果，并将有前途的离线 3D 重建作为辐射场的副产品。最近，研究人员还将这种隐式表示应用于在线重建和定位（即隐式 SLAM 系统）。然而，将隐式表示用于主动视觉任务的研究仍然非常有限。在本文中，我们对将神经辐射场应用于主动映射和规划问题特别感兴趣，这些问题是主动系统中紧密耦合的任务。我们首次提出了一个仅使用 RGB 的主动视觉框架，该框架使用辐射场表示以在线方式进行主动 3D 重建和规划。具体来说，我们将此联合任务制定为迭代双阶段优化问题，我们交替优化辐射场表示和路径规划。实验结果表明，与其他离线方法相比，所提出的方法取得了有竞争力的结果，并且优于使用 NeRF 的主动重建方法。\n  - [束调整神经辐射场的局部到全局配准](https://arxiv.org/abs/2211.11505) | [***``[code]``***](https://github.com/rover-xingyu/L2G-NeRF)\n    > Neural Radiance Fields (NeRF) 实现了逼真的新视图合成；然而，精确相机位姿的要求限制了它的应用。尽管存在用于联合学习神经 3D 表示和注册相机帧的分析综合扩展，但如果初始化不当，它们很容易受到次优解决方案的影响。我们提出了 L2G-NeRF，这是一种用于束调整神经辐射场的局部到全局配准方法：首先，逐像素灵活对齐，然后逐帧约束参数对齐。通过优化光度重建误差的深度网络以无监督的方式学习逐像素局部对齐。使用可微分参数估计求解器对逐像素对应执行逐帧全局对齐以找到全局变换。对合成数据和真实世界数据的实验表明，我们的方法在高保真重建和解决大型相机姿态失调方面优于当前最先进的方法。我们的模块是一个易于使用的插件，可以应用于 NeRF 变体和其他神经领域应用程序。此 https URL 提供了代码和补充材料。\n  - [Neural Puppeteer：基于关键点的动态形状神经渲染, ACCV2022](https://openaccess.thecvf.com/content/ACCV2022/html/Giebenhain_Neural_Puppeteer_Keypoint-Based_Neural_Rendering_of_Dynamic_Shapes_ACCV_2022_paper.html) | [***``[code]``***](https://github.com/urs-waldmann/NePu/)\n    > 我们介绍了 Neural Puppeteer，这是一种用于铰接形状的高效神经渲染管道。通过逆向渲染，我们可以单独从多视图 2D 轮廓预测 3D 关键点，而不需要纹理信息。此外，我们可以使用一个相同的训练模型轻松预测同一类形状的 3D 关键点，并更容易地从合成数据的训练中进行概括，我们通过成功地将零样本合成应用于现实世界的实验来证明这一点。我们通过将模型拟合到不同动物和人类的合成视频来展示我们方法的灵活性，并获得优于我们基线的定量结果。我们的方法将 3D 关键点与各个局部特征向量和全局潜在代码结合使用，以有效表示时变和铰接的形状，例如人类和动物。与之前的工作相比，我们不在 3D 域中进行重建，而是将 3D 特征投影到 2D 相机中，并根据这些投影特征对 2D RGB-D 图像进行重建，这比体积渲染要快得多。我们的合成数据集将公开可用，以进一步发展不断发展的动物姿势和形状重建领域。\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n  - [nerf2nerf：神经辐射场的成对配准](https://arxiv.org/abs/2211.01600) | [code]\n    > 我们引入了一种神经场成对配准技术，该技术扩展了经典的基于优化的局部配准（即 ICP）以在神经辐射场 (NeRF) 上运行——从校准图像集合训练的神经 3D 场景表示。 NeRF 不分解照明和颜色，因此为了使配准不受照明影响，我们引入了“表面场”的概念——从预训练的 NeRF 模型中提取的场，该模型测量点在表面上的可能性物体的表面。然后，我们将 nerf2nerf 注册作为一种稳健的优化，迭代地寻求对齐两个场景的表面场的刚性转换。我们通过引入预训练的 NeRF 
场景数据集来评估我们的技术的有效性——我们的合成场景可以对经典配准技术进行定量评估和比较，而我们的真实场景则证明了我们的技术在现实场景中的有效性。其他结果位于：此 https 网址\n  - [GARF：用于高保真重建和姿态估计的高斯激活辐射场, ECCV2022](https://arxiv.org/abs/2204.05735) | [code]\n    > 尽管神经辐射场 (NeRF) 在现实世界场景的逼真新颖视图合成中显示出令人信服的结果，但大多数现有方法都需要准确的先验相机姿势。尽管存在联合恢复辐射场和相机姿态的方法 (BARF)，但它们依赖于繁琐的从粗到细的辅助位置嵌入来确保良好的性能。我们提出了高斯激活神经辐射场 (GARF)，这是一种新的无位置嵌入神经辐射场架构 - 采用高斯激活 - 在高保真重建和姿态估计方面优于当前最先进的技术。\n  - [深入研究 Radiance Grid 以进行实时视图合成并保留细节, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19784-0_42) | [code]\n    > 神经辐射场 (NeRF) [31] 系列在表示场景和合成高质量新颖视图方面令人印象深刻。然而，大多数以前的作品都无法保留纹理细节并且训练速度慢。最近的一种方法 SNeRG [11] 表明，将经过训练的 NeRF 烘焙为稀疏神经辐射网格可以实现实时视图合成，同时略微降低渲染质量。在本文中，我们深入研究了 Radiance Grid 表示并提出了一系列改进，这些改进共同提高了速度和质量方面的性能。首先，我们提出了一种分层稀疏辐射网格 (HrSRG) 表示，它对信息空间具有更高的体素分辨率，对其他空间具有更少的体素。 HrSRG 利用受 [30, 55] 启发的分层体素网格构建过程，并且可以在不占用过多内存的情况下以高分辨率描述场景。此外，我们表明直接优化体素网格会在渲染图像中产生出奇的好纹理细节。这种直接优化是内存友好的，并且需要比传统 NeRF 少多个数量级的时间，因为它只涉及一个微型 MLP。最后，我们发现阻止精细细节恢复的一个关键因素是由相机姿势错误引起的图像中未对齐的 2D 像素。我们建议使用感知损失来增加对错位的容忍度，从而提高渲染图像的视觉质量。\n## Oct23 - Oct29, 2022\n  - [EpipolarNVS：利用对极几何进行单图像新视图合成, BMVC2022](https://arxiv.org/abs/2210.13077) | [code]\n    > 新视图合成 (NVS) 可以通过不同的方法来解决，具体取决于一般设置：单个源图像到短视频序列、精确或嘈杂的相机姿势信息、基于 3D 的信息（如点云等）。最具挑战性的场景，我们在这项工作中所处的场景，只考虑一个独特的源图像来从另一个角度生成一个新颖的图像。然而，在这种棘手的情况下，最新的基于学习的解决方案往往难以集成相机视点转换。事实上，外部信息通常通过低维向量按原样传递。甚至可能会发生这样的情况，当参数化为欧拉角时，这样的相机姿势会通过单热表示进行量化。这种普通的编码选择阻止了学习的架构在连续的基础上（从相机姿势的角度）推断新的视图。我们声称它存在一种优雅的方式来通过利用 3D 相关概念（例如对极约束）更好地编码相对相机姿势。因此，我们引入了一种创新方法，将视点变换编码为 2D 特征图像。这种相机编码策略为网络提供了关于相机如何在两个视图之间的空间中移动的有意义的见解。通过将相机姿势信息编码为有限数量的彩色对极线，我们通过实验证明我们的策略优于普通编码。\n  - [NeRF-SLAM：具有神经辐射场的实时密集单目 SLAM](https://arxiv.org/abs/2210.13641) | [code]\n    > 我们提出了一种新颖的几何和光度学 3D 映射管道，用于从单目图像进行准确和实时的场景重建。为实现这一目标，我们利用了密集单眼 SLAM 和实时分层体积神经辐射场的最新进展。我们的见解是，密集的单目 SLAM 通过提供准确的姿态估计和具有相关不确定性的深度图，提供正确的信息以实时拟合场景的神经辐射场。通过我们提出的基于不确定性的深度损失，我们不仅实现了良好的光度精度，而且还实现了很高的几何精度。事实上，我们提出的管道实现了比竞争方法更好的几何和光度精度（PSNR 提高了 179%，L1 深度提高了 86%），同时实时工作并且仅使用单目图像。\n## Oct16 - Oct22, 2022\n  - [用于学习 3D LiDAR 数据场景先验的生成范围成像, 
WACV2023](https://arxiv.org/abs/2210.11750) | [code]\n    > 3D LiDAR 传感器对于自主移动机器人的强大视觉是必不可少的。然而，部署基于 LiDAR 的感知算法通常会由于与训练环境的域差距而失败，例如角度分辨率不一致和属性缺失。现有的研究通过学习域间映射解决了这个问题，而可迁移性受到训练配置的限制，并且训练容易受到称为光线下降的特殊有损噪声的影响。为了解决这个问题，本文提出了一种适用于数据级域迁移的 LiDAR 距离图像生成模型。受 LiDAR 测量基于逐点距离成像这一事实的启发，我们训练了一个基于隐式图像表示的生成对抗网络以及可微的射线下落效应。与基于点和基于图像的最先进的生成模型相比，我们展示了我们模型的保真度和多样性。我们还展示了上采样和恢复应用程序。此外，我们介绍了用于 LiDAR 语义分割的 Sim2Real 应用程序。我们证明了我们的方法作为一个逼真的光线滴模拟器是有效的，并且优于最先进的方法。\n  - [通过多视图未校准光度立体和渐变 SDF 进行高质量 RGB-D 重建, WACV2023](https://arxiv.org/abs/2210.12202) | [code]\n    > 在许多应用中，对精细重建的需求很高。然而，大多数现有的 RGB-D 重建方法依赖于预先计算的准确相机位姿来恢复详细的表面几何形状，其中在优化不同数量时需要调整表面的表示。在本文中，我们提出了一种新颖的基于多视图 RGB-D 的重建方法，该方法通过利用梯度符号距离场 (gradient-SDF) 来处理相机位姿、光照、反照率和表面法线估计。所提出的方法使用特定的基于物理的模型来制定图像渲染过程，并使用其体积表示来优化实际表面上的表面数量，而不是仅估计实际表面附近的表面数量的其他工作。为了验证我们的方法，我们研究了两个用于自然光和点光源应用的基于物理的图像形成模型。在合成数据集和真实世界数据集上的实验结果表明，所提出的方法可以比现有技术更忠实地恢复表面的高质量几何形状，并进一步提高估计相机位姿的准确性。\n  - [从单个图像进行机器人对象操作的神经场, ICRA2023](https://arxiv.org/abs/2210.12126) | [code]\n    > 我们为对象渲染、3D 重建和抓取姿势预测提供了一个统一且紧凑的表示，可以在几秒钟内从单个图像中推断出来。我们通过利用神经辐射场 (NeRF) 文献的最新进展来实现这一点，这些文献学习类别级先验并以最少的数据和时间对新对象进行微调。我们的见解是，我们可以学习紧凑的形状表示并从中提取有意义的附加信息，例如抓取姿势。我们相信这是第一个使用单个视点（仅 RGB）直接从基于 NeRF 的表示中检索抓取姿势的工作，而不是通过辅助网络和/或表示。与现有技术相比，我们的方法小两到三个数量级，同时在视图重建和抓取方面实现了相当的性能。伴随我们的方法，我们还提出了一个新的渲染鞋数据集，用于训练 sim-2-real NeRF 方法，该方法具有不同宽度的抓手的抓取姿势。\n  - [用于鲁棒姿态估计的神经辐射场的并行反演, ICRA2023](https://arxiv.org/abs/2210.10108) | [code]\n    > 我们提出了一种基于快速神经辐射场 (NeRF) 的并行优化方法，用于估计 6-DoF 目标姿势。给定单个观察到的目标 RGB 图像，我们可以通过最小化从快速 NeRF 模型渲染的像素与观察图像中的像素之间的残差来预测相机的平移和旋转。我们将基于动量的相机外部优化程序集成到 Instant Neural Graphics Primitives 中，这是最近异常快速的 NeRF 实现。通过在姿态估计任务中引入并行蒙特卡罗采样，我们的方法克服了局部最小值并在更广泛的搜索空间中提高了效率。我们还展示了采用更强大的基于像素的损失函数来减少错误的重要性。实验表明，我们的方法可以在合成和真实世界的基准测试中实现改进的泛化性和鲁棒性。\n  - [神经接触场：使用触觉感应跟踪外部接触](https://arxiv.org/abs/2210.09297) | [code]\n    > 我们提出了神经接触场，一种将神经场和触觉传感结合在一起的方法，以解决跟踪对象与环境之间的外部接触的问题。了解外部接触发生在哪里是迈向可以主动控制它以促进下游操作任务的方法的第一步。用于定位环境接触的先前工作通常假定接触类型（例如点或线），不捕获接触/非接触过渡，并且仅适用于基本几何形状的对象。神经接触场是第一种无需对接触类型做出任何假设即可跟踪任意多模态外部接触的方法。我们的主要见解是估计物体形状潜在空间中任何 
3D 点的接触概率，给定基于视觉的触觉输入，该输入感知外部接触引起的局部运动。在实验中，我们发现神经接触场能够定位多个接触块，而无需对接触的几何形状做出任何假设，并在看不见的环境配置中捕获具有看不见的形状的已知类别对象的接触/非接触转换。除了神经接触场之外，我们还发布了模拟外部接触交互的 YCB-Extrinsic-Contact 数据集，以便在该领域进行进一步研究。项目存储库：此 https 网址\n  - [动力学增强神经对象的微分物理模拟](https://arxiv.org/abs/2210.09420) | [code]\n    > 我们提出了一种可微分管道，用于模拟将其几何形状表示为参数化为深度网络的连续密度场的对象的运动。这包括神经辐射场 (NeRFs) 和其他相关模型。从密度场，我们估计物体的动力学特性，包括它的质量、质心和惯性矩阵。然后，我们引入了一种基于密度场的可微接触模型，用于计算碰撞产生的法向力和摩擦力。这允许机器人从运动物体的静止图像和视频中自主构建视觉和动态准确的物体模型。生成的动态增强神经对象 (DANO) 使用现有的可微分模拟引擎 Dojo 进行模拟，并与其他标准模拟对象（例如指定为 URDF 的球体、平面和机器人）交互。机器人可以使用这种模拟来优化神经物体的抓取和操纵轨迹，或者通过基于梯度的真实到模拟传输来改进神经物体模型。我们演示了从肥皂在桌子上滑动的真实视频中学习一块肥皂的摩擦系数的管道。我们还通过从合成数据中与熊猫机器人手臂的交互来了解斯坦福兔子的摩擦系数和质量，并在模拟中优化熊猫手臂的轨迹，以将兔子推到目标位置。\n## Oct9 - Oct15, 2022\n  - [ExAug：通过几何经验增强的机器人条件导航策略](https://arxiv.org/abs/2210.07450) | [code]\n    > 机器学习技术依赖于庞大而多样的数据集进行泛化。计算机视觉、自然语言处理和其他应用程序通常可以重用公共数据集来训练许多不同的模型。然而，由于物理配置的差异，利用公共数据集在新机器人平台上训练机器人控制策略或执行新任务具有挑战性。在这项工作中，我们提出了一个新颖的框架 ExAug，以从不同环境中的多个数据集中增强不同机器人平台的体验。 ExAug 利用了一个简单的原理：通过以点云的形式提取 3D 信息，我们可以创建更复杂和结构化的增强，利用生成合成图像和几何感知惩罚，这在相同情况下适用于不同的机器人，具有不同的尺寸、转弯半径和摄像头位置。在有障碍物的室内和室外环境中，在两个带有三个不同摄像头的新机器人平台上评估训练后的策略。\n  - [NOCaL：里程计和相机内在学的免校准半监督学习](https://arxiv.org/abs/2210.07435) | [code]\n    > 有许多新兴的成像技术可以使机器人技术受益。然而，对定制模型、校准和低级处理的需求是它们采用的主要障碍。在这项工作中，我们展示了 NOCaL、神经里程计和使用光场的校准，这是一种半监督学习架构，能够在没有校准的情况下解释以前看不见的相机。 NOCaL 学习估计相机参数、相对姿势和场景外观。它采用在大量现有摄像机和场景上预训练的场景渲染超网络，并使用小型监督训练集来适应以前看不见的摄像机来强制度量尺度。我们使用传统相机在渲染和捕获的图像上演示 NOCaL，演示免校准里程计和新颖的视图合成。这项工作是朝着自动解释一般相机几何形状和新兴成像技术迈出的关键一步。\n  - [GeoAug：具有几何约束的 Few-Shot NeRF 的数据增强, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19790-1_20) | [code]\n    > 神经辐射场 (NeRF) 通过学习仅具有姿势 RGB 图像的隐式体积表示，显示出渲染特定场景新视图的非凡能力。尽管 NeRF 令人印象深刻且简单，但在训练图像很少的情况下，它通常会收敛到几何不正确的次优解决方案。我们在此提出 GeoAug：一种用于 NeRF 的数据增强方法，它丰富了基于多视图几何约束的训练数据。 GeoAug 提供用于训练的随机人工（新姿势、RGB 图像）对，其中 RGB 图像来自附近的训练视图。新姿势的渲染被扭曲到具有深度图和相对姿势的附近训练视图，以匹配 RGB 图像监督。我们的方法通过在训练期间引入更多数据来降低过度拟合的风险，同时还为深度图提供了额外的隐式监督。在实验中，我们的方法显着提高了以少量训练视图为条件的神经辐射场的性能。\n  - [逼真的神经域随机化, 
ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19806-9_18) | [code]\n    > 合成数据是人工监督的可扩展替代方案，但它需要克服模拟到真实领域的差距。虚拟世界和现实世界之间的这种差异可以通过两种看似相反的方法来解决：提高模拟的真实性或完全通过域随机化来超越真实性。在本文中，我们展示了神经渲染方面的最新进展实现了一种新的统一方法，我们称之为逼真的神经域随机化 (PNDR)。我们建议学习神经网络的组合，它充当基于物理的光线追踪器，仅从场景几何中生成高质量的渲染。我们的方法是模块化的，由用于材料、照明和渲染的不同神经网络组成，因此可以在可微的管道中随机化不同的关键图像生成组件。一旦经过训练，我们的方法可以与其他方法相结合，用于在线生成照片般逼真的图像增强，并且比通过传统的光线追踪更有效。我们通过两个下游任务证明了 PNDR 的有用性：6D 对象检测和单目深度估计。我们的实验表明，使用 PNDR 进行训练可以泛化到新场景，并且在现实世界传输方面明显优于现有技术。\n  - [X-NeRF：多场景 360 的显式神经辐射场∘ RGB-D 视图不足, WACV2023](https://arxiv.org/abs/2210.05135) | [***``[code]``***](https://github.com/HaoyiZhu/XNeRF)\n    > 神经辐射场 (NeRFs) 尽管在新颖的视图合成方面表现出色，但通常需要密集的输入视图。许多论文分别为每个场景训练一个模型，很少有人探索将多模态数据纳入这个问题。在本文中，我们关注一个很少讨论但很重要的设置：我们能否训练一个模型来表示多个场景、360∘ 视图和 RGB-D 图像不足？我们将不足的视图称为少数极其稀疏且几乎不重叠的视图。为了解决这个问题，提出了一种完全显式的方法 X-NeRF，它学习一般的场景完成过程而不是基于坐标的映射。给定一些不足的 RGB-D 输入视图，X-NeRF 首先将它们转换为稀疏点云张量，然后应用 3D 稀疏生成卷积神经网络 (CNN) 将其完成到可以快速进行体积渲染的显式辐射场在推理期间不运行网络。为了避免过度拟合，除了常见的渲染损失之外，我们还应用了感知损失以及通过点云上的随机旋转来增强视图。在我们的环境中，所提出的方法显着优于以前的隐式方法，表明所提出的问题和方法的巨大潜力。此 https 网址提供了代码和数据。\n  - [具有动态学习神经隐式表示的多对象导航](https://arxiv.org/abs/2210.05129) | [code]\n    > 理解和映射新环境是任何自主导航代理的核心能力。虽然经典机器人通常使用 SLAM 变体以独立的方式估计地图，这些变体保持拓扑或度量表示，但导航的端到端学习在神经网络中保留了某种形式的记忆。网络通常充满归纳偏差，其范围从矢量表示到鸟瞰度量张量或拓扑结构。在这项工作中，我们建议构建具有两个神经隐式表示的神经网络，它们在每一集期间动态学习并映射场景的内容：（i）语义查找器预测先前看到的查询对象的位置； (ii) Occupancy and Exploration Implicit Representation 封装了有关探索区域和障碍物的信息，并使用一种新颖的全局读取机制进行查询，该机制直接从函数空间映射到可用的嵌入空间。这两种表示都由经过强化学习 (RL) 训练的代理利用，并在每一集期间在线学习。我们评估了多对象导航上的代理，并展示了使用神经隐式表示作为记忆源的巨大影响。\n  - [SiNeRF：用于联合姿势估计和场景重建的正弦神经辐射场, BMVC2022](https://arxiv.org/abs/2210.04553) | [***``[code]``***](https://github.com/yitongx/sinerf)\n    > NeRFmm 是处理联合优化任务的神经辐射场 (NeRF)，即同时重建真实场景和注册相机参数。尽管 NeRFmm 产生了精确的场景合成和姿势估计，但它仍然难以在具有挑战性的场景中超越全注释基线。在这项工作中，我们发现联合优化中存在系统的次优性，并进一步确定了它的多个潜在来源。为了减少潜在源的影响，我们提出了利用正弦激活进行辐射映射的正弦神经辐射场 (SiNeRF) 和用于有效选择射线批次的新型混合区域采样 (MRS)。定量和定性的结果表明，与NeRFmm相比，SiNeRF在图像合成质量和姿态估计精度方面实现了全面的显着提升。此 https 网址提供了代码。\n  - [NeRF2Real：使用神经辐射场的视觉引导双足运动技能的 Sim2real 
转移](https://arxiv.org/abs/2210.04932) | [code]\n    > 我们提出了一个系统，用于将 sim2real 方法应用于具有逼真视觉效果的“野外”场景，以及依赖于使用 RGB 相机的主动感知的策略。给定一个使用通用电话收集的静态场景的短视频，我们学习场景的接触几何和使用神经辐射场 (NeRF) 进行新视图合成的功能。我们通过叠加其他动态对象（例如机器人自己的身体、球）的渲染来增强静态场景的 NeRF 渲染。然后使用物理模拟器中的渲染引擎创建模拟，该模拟从静态场景几何（根据 NeRF 体积密度估计）和动态对象的几何和物理属性（假设已知）计算接触动力学。我们证明我们可以使用这个模拟来学习基于视觉的全身导航和推球策略，用于具有驱动头戴式 RGB 摄像头的 20 自由度类人机器人，并且我们成功地将这些策略转移到真实机器人。此 https 网址提供项目视频\n## Oct2 - Oct8, 2022\n  - [一种基于神经表面重建的鲁棒对象抓取的 Real2Sim2Real 方法](https://arxiv.org/abs/2210.02685) | [code]\n    > 最近基于 3D 的操作方法要么使用 3D 神经网络直接预测抓取姿势，要么使用从形状数据库中检索到的类似对象来解决抓取姿势。然而，前者在使用新的机器人手臂或看不见的物体进行测试时面临着普遍性挑战；后者假设数据库中存在类似的对象。我们假设最近的 3D 建模方法为构建评估场景的数字副本提供了途径，该评估场景提供物理模拟并支持稳健的操作算法学习。我们建议使用最先进的神经表面重建方法（Real2Sim 步骤）从现实世界的点云中重建高质量的网格。由于大多数模拟器采用网格进行快速模拟，因此重建的网格无需人工即可生成抓取姿势标签。生成的标签可以训练在真实评估场景中表现稳健的抓取网络（Sim2Real 步骤）。在合成和真实实验中，我们表明 Real2Sim2Real 管道的性能优于使用大型数据集训练的基线抓取网络和基于检索的重建的抓取采样方法。 Real2Sim2Real 管道的好处来自 1) 将场景建模和抓取采样解耦为子问题，以及 2) 可以使用最新的 3D 学习算法和基于网格的物理模拟技术以足够高的质量解决这两个子问题。\n  - [用于实时、开放集场景理解的特征真实神经融合](https://arxiv.org/abs/2210.03043) | [code]\n    > 机器人的一般场景理解需要灵活的语义表示，以便可以识别、分割和分组训练时可能不知道的新物体和结构。我们提出了一种算法，该算法在实时 SLAM 期间将来自标准预训练网络的一般学习特征融合到高效的 3D 几何神经场表示中。融合的 3D 特征图继承了神经域几何表示的连贯性。这意味着在运行时交互的少量人类标签使对象甚至对象的一部分能够以开放集的方式稳健而准确地分割。\n  - [IR-MCL：基于隐式表示的在线全球本地化](https://arxiv.org/abs/2210.03113) | [***``[code]``***](https://github.com/PRBonn/ir-mcl)\n    > 确定移动机器人的状态是机器人导航系统的重要组成部分。在本文中，我们解决了使用 2D LiDAR 数据估计机器人在室内环境中的姿势的问题，并研究了现代环境模型如何改进黄金标准 Monte-Carlo 定位 (MCL) 系统。我们提出了一个神经占用场（NOF）来使用神经网络隐式表示场景。借助预训练网络，我们可以通过体绘制合成 2D LiDAR 扫描以获取任意机器人姿势。基于隐式表示，我们可以获得合成扫描与实际扫描之间的相似度作为观察模型，并将其集成到 MCL 系统中以执行准确的定位。我们在五个自记录数据集和三个公开可用数据集的序列上评估我们的方法。我们表明，我们可以使用我们的方法准确有效地定位机器人，超过最先进方法的定位性能。实验表明，所呈现的隐式表示能够预测更准确的 2D LiDAR 扫描，从而为我们的基于粒子滤波器的定位提供改进的观察模型。我们方法的代码发布在：this https URL。\n  - [NARF22：用于配置感知渲染的神经铰接辐射场, IROS2022](https://progress.eecs.umich.edu/projects/narf/) | [code]\n    > 铰接物体对机器人的感知和操作提出了独特的挑战。它们增加的自由度数量使得定位等任务在计算上变得困难，同时也使得现实世界数据集收集的过程无法扩展。为了解决这些可扩展性问题，我们提出了神经铰接辐射场 (NARF22)，这是一个使用完全可微分、配置参数化神经辐射场 
(NeRF) 作为提供铰接对象高质量渲染的方法的管道。 NARF22 在推理时不需要明确了解对象结构。我们提出了一种两阶段的基于部件的训练机制，即使底层训练数据只有一个配置表示，它也允许对象渲染模型在配置空间中很好地泛化。我们通过在通过 Fetch 移动操作机器人收集的真实关节工具数据集上训练可配置渲染器来展示 NARF22 的功效。我们通过配置估计和 6 自由度姿态细化任务展示了该模型对基于梯度的推理方法的适用性。项目网页位于：此 https URL。\n  - [密集单目 SLAM 的概率体积融合](https://arxiv.org/abs/2210.01276) | [code]\n    > 我们提出了一种利用深度密集单目 SLAM 和快速不确定性传播从图像中重建 3D 场景的新方法。所提出的方法能够密集、准确、实时地对场景进行 3D 重建，同时对来自密集单目 SLAM 的极其嘈杂的深度估计具有鲁棒性。与以前的方法不同，要么使用 ad-hoc 深度滤波器，要么从 RGB-D 相机的传感器模型估计深度不确定性，我们的概率深度不确定性直接来自 SLAM 中底层束调整问题的信息矩阵。我们表明，由此产生的深度不确定性提供了一个很好的信号来加权深度图以进行体积融合。如果没有我们的深度不确定性，生成的网格会很嘈杂并带有伪影，而我们的方法会生成准确的 3D 网格，并且伪影要少得多。我们提供了具有挑战性的 Euroc 数据集的结果，并表明我们的方法比直接融合来自单目 SLAM 的深度的准确度提高了 92%，与最佳竞争方法相比提高了 90%。\n  - [NeRF：3D 视觉中的神经辐射场，综合评论](https://arxiv.org/abs/2210.00379) | [code]\n    > 神经辐射场 (NeRF) 是一种具有隐式场景表示的新型视图合成，已经席卷了计算机视觉领域。作为一种新颖的视图合成和 3D 重建方法，NeRF 模型在机器人技术、城市测绘、自主导航、虚拟现实/增强现实等领域都有应用。自 Mildenhall 等人的原始论文以来，已发表了 250 多份预印本，其中 100 多份最终被一级计算机​​视觉会议接受。鉴于 NeRF 的受欢迎程度和当前对该研究领域的兴趣，我们认为有必要对过去两年的 NeRF 论文进行全面调查，我们将其组织成基于架构和基于应用程序的分类法。我们还介绍了基于 NeRF 的新颖视图合成理论，以及关键 NeRF 模型的性能和速度的基准比较。通过创建这项调查，我们希望向 NeRF 介绍新的研究人员，为该领域有影响力的工作提供有益的参考，并通过我们的讨论部分激发未来的研究方向。\n## Sep25 - Oct1, 2022\n  - [具有三层采样和全景表示的城市级增量神经映射](https://arxiv.org/abs/2209.14072) | [code]\n    > 神经隐式表示最近引起了机器人界的广泛关注，因为它们具有表现力、连续性和紧凑性。然而，基于稀疏 LiDAR 输入的城市规模增量隐式密集映射仍然是一个未充分探索的挑战。为此，我们成功构建了第一个具有全景表示的城市规模增量神经映射系统，该系统由环境级和实例级建模组成。给定一个稀疏的 LiDAR 点云流，它维护一个动态生成模型，将 3D 坐标映射到有符号距离场 (SDF) 值。为了解决在城市尺度空间中表示不同层次几何信息的困难，我们提出了一种定制的三层采样策略来动态采样全局、局部和近地表域。同时，为了实现高保真映射，引入了特定类别的先验以更好地对几何细节进行建模，从而实现全景表示。我们评估了公共 SemanticKITTI 数据集，并使用定量和定性结果证明了新提出的三层采样策略和全景表示的重要性。代码和数据将公开。\n  - [Orbeez-SLAM：具有 ORB 特征和 NeRF 实现映射的实时单目视觉 SLAM](https://arxiv.org/abs/2209.13274) | [code]\n    > 一种可以通过视觉信号执行复杂任务并与人类合作的空间人工智能备受期待。为了实现这一点，我们需要一个无需预训练即可轻松适应新场景并实时为下游任务生成密集地图的视觉 SLAM。由于其组件的内在限制，以前的基于学习和非基于学习的视觉 SLAM 都不能满足所有需求。在这项工作中，我们开发了一个名为 Orbeez-SLAM 的视觉 SLAM，它成功地与隐式神经表示 (NeRF) 和视觉里程计合作来实现我们的目标。此外，Orbeez-SLAM 可以与单目相机配合使用，因为它只需要 RGB 输入，使其广泛适用于现实世界。我们在各种具有挑战性的基准上验证了它的有效性。结果表明，我们的 SLAM 比强基线快 800 
倍，并具有出色的渲染结果。\n  - [通过控制屏障功能和神经辐射场增强基于视觉的控制器的安全性](https://arxiv.org/abs/2209.12266) | [code]\n    > 为了在复杂的环境中导航，机器人必须越来越多地使用高维视觉反馈（例如图像）进行控制。然而，依靠高维图像数据做出控制决策会引发重要问题；特别是，我们如何证明视觉反馈控制器的安全性？控制障碍函数 (CBF) 是在状态反馈设置中验证反馈控制器安全性的强大工具，但由于需要预测未来的观察结果以评估障碍函数，CBF 传统上不太适合视觉反馈控制.在这项工作中，我们利用神经辐射场 (NeRFs) 的最新进展来解决这个问题，神经辐射场 (NeRFs) 学习 3D 场景的隐式表示并可以从以前看不见的相机视角渲染图像，为基于 CBF 的单步视觉预测提供控制器。这种新颖的组合能够过滤掉不安全的行为并进行干预以保护安全。我们在实时模拟实验中展示了我们的控制器的效果，它成功地防止了机器人采取危险行动。\n## Sep18 - Sep24, 2022\n  - [Local_INN：使用可逆神经网络的隐式地图表示和定位](https://arxiv.org/abs/2209.11925) | [code]\n    > 机器人定位是使用地图和传感器测量找到机器人姿势的逆问题。近年来，可逆神经网络（INNs）成功地解决了各个领域的模糊逆问题。本文提出了一个用 INN 解决本地化问题的框架。我们设计了一个 INN，它在正向路径中提供隐式地图表示并在反向路径中提供定位。通过在评估中对潜在空间进行采样，Local\\_INN 输出具有协方差的机器人位姿，可用于估计不确定性。我们表明 Local\\_INN 的本地化性能与当前的方法相当，但延迟要低得多。我们使用训练集外部的姿势从 Local\\_INN 显示详细的 2D 和 3D 地图重建。我们还提供了一个使用 Local\\_INN 的全局定位算法来解决绑架问题。\n  - [NeRF-Loc：神经辐射场内基于变换器的对象定位](https://arxiv.org/abs/2209.12068) | [code]\n    > 神经辐射场 (NeRFs) 已成功用于场景表示。最近的工作还开发了使用基于 NeRF 的环境表示的机器人导航和操纵系统。由于对象定位是许多机器人应用的基础，为了进一步释放 NeRF 在机器人系统中的潜力，我们研究了 NeRF 场景中的对象定位。我们提出了一个基于转换器的框架 NeRF-Loc 来提取 NeRF 场景中对象的 3D 边界框。 NeRF-Loc 将预先训练的 NeRF 模型和相机视图作为输入，并生成标记的 3D 对象边界框作为输出。具体来说，我们设计了一对并行的转换器编码器分支，即粗流和细流，对目标对象的上下文和细节进行编码。然后将编码特征与注意力层融合在一起，以减轻模糊性，从而实现准确的对象定位。我们将我们的方法与传统的基于变压器的方法进行了比较，我们的方法取得了更好的性能。此外，我们还展示了第一个基于 NeRF 样本的对象定位基准 NeRFLocBench。\n  - [感觉怎么样？ 用于越野车辆可穿越性的自我监督成本图学习](https://arxiv.org/abs/2209.10788) | [code]\n    > 估计越野环境中的地形可穿越性需要推理机器人与这些地形之间的复杂交互动力学。然而，对于这些交互，构建准确的物理模型或创建信息标签以有监督的方式学习模型具有挑战性。我们提出了一种方法，该方法通过以自我监督的方式将外部感知环境信息与本体感知地形交互反馈相结合来学习预测可遍历性成本图。此外，我们提出了一种将机器人速度纳入成本图预测管道的新方法。我们在具有挑战性的越野地形的大型自主全地形车 (ATV) 上的多个短距离和大规模导航任务中验证了我们的方法，并证明了在单独的大型地面机器人上易于集成。我们的短尺度导航结果表明，使用我们学习的成本图可以使导航整体更顺畅，并为机器人提供对机器人与不同地形类型（如草地和砾石）之间相互作用的更细粒度的理解。我们的大规模导航试验表明，在 400 米到 3150 米的具有挑战性的越野路线中，与基于占用的导航基线相比，我们可以将干预次数减少多达 57%。\n  - [Loc-NeRF：使用神经辐射场进行蒙特卡罗定位](https://arxiv.org/abs/2209.09050) | [***``[code]``***](https://github.com/MIT-SPARK/Loc-NeRF)\n    > 我们提出了 Loc-NeRF，这是一种基于实时视觉的机器人定位方法，它结合了蒙特卡洛定位和神经辐射场 
(NeRF)。我们的系统使用预训练的 NeRF 模型作为环境地图，并且可以使用 RGB 相机作为机器人上唯一的外部感受器实时定位自身。虽然神经辐射场已经在计算机视觉和图形中看到了视觉渲染的重要应用，但它们在机器人技术中的用途有限。现有的基于 NeRF 的定位方法需要良好的初始姿势猜测和大量计算，这使得它们对于实时机器人应用不切实际。通过使用 Monte Carlo 定位作为使用 NeRF 地图模型估计姿态的主力，Loc-NeRF 能够比现有技术更快地执行定位，并且不依赖于初始姿态估计。除了对合成数据进行测试外，我们还使用 Clearpath Jackal UGV 收集的真实数据运行我们的系统，并首次展示了使用神经辐射场执行实时全局定位的能力。我们通过此 https 网址公开我们的代码。\n  - [MeSLAM：基于神经域的内存高效 SLAM, SMC2022](https://arxiv.org/abs/2209.09357) | [code]\n    > 由于长期机器人操作中地图大小的增加，现有的同时定位和映射 (SLAM) 方法的可扩展性有限。此外，为定位和规划任务处理此类地图会导致车载所需的计算资源增加。为了解决长期操作中的内存消耗问题，我们开发了一种新颖的实时 SLAM 算法 MeSLAM，它基于神经场隐式地图表示。它将提议的全局映射策略（包括神经网络分布和区域跟踪）与外部里程计系统相结合。因此，该算法能够有效地训练代表不同地图区域的多个网络，并在大规模环境中准确地跟踪姿势。实验结果表明，所提出的方法的准确性与最先进的方法相当（在 TUM RGB-D 序列上平均为 6.6 cm），并且优于基线 iMAP*。此外，所提出的 SLAM 方法在最先进的 SLAM 方法中提供了最紧凑的地图，没有细节失真（1.9 MB 可存储 57 m3）。\n  - [LATITUDE：在城市规模的 NeRF 中使用截断动态低通滤波器进行机器人全局定位, ICRA2023](https://arxiv.org/abs/2209.08498) | [***``[code]``***](https://github.com/jike5/LATITUDE)\n    > 神经辐射场 (NeRFs) 在表示具有高分辨率细节和高效内存的复杂 3D 场景方面取得了巨大成功。然而，当前基于 NeRF 的姿态估计器没有初始姿态预测，并且在优化过程中容易出现局部最优。在本文中，我们提出了 LATITUDE：使用截断动态低通滤波器进行全局定位，它在城市规模的 NeRF 中引入了两阶段定位机制。在位置识别阶段，我们通过训练后的 NeRF 生成的图像训练回归器，为全局定位提供初始值。在姿态优化阶段，我们通过直接优化切平面上的姿态来最小化观察图像和渲染图像之间的残差。为了避免收敛到局部最优，我们引入了截断动态低通滤波器 (TDLF) 用于从粗到细的姿态配准。我们在合成数据和真实世界数据上评估我们的方法，并展示其在大规模城市场景中高精度导航的潜在应用。代码和数据将在此 https 网址上公开提供。\n  - [使用神经辐射场进行主动机器人 3D 重建的不确定性引导策略, RAL2022](https://arxiv.org/abs/2209.08409) | [code]\n    > 在本文中，我们解决了物体的主动机器人 3D 重建问题。特别是，我们研究了带有手持摄像头的移动机器人如何选择有利数量的视图来有效地恢复对象的 3D 形状。与该问题的现有解决方案相反，我们利用流行的基于神经辐射场的对象表示，最近在各种计算机视觉任务中显示出令人印象深刻的结果。然而，使用这种表示直接推断对象的显式 3D 几何细节并不简单，这使得密集 3D 重建的次佳视图选择问题具有挑战性。本文介绍了一种基于光线的体积不确定性估计器，它计算颜色样本沿物体隐式神经表示的每条光线的权重分布的熵。我们表明，使用所提出的估计器给出一个新颖的视图，可以推断出底层 3D 几何的不确定性。然后，我们提出了一个下一个最佳视图选择策略，该策略由基于神经辐射场的表示中基于射线的体积不确定性指导。令人鼓舞的合成数据和真实世界数据的实验结果表明，本文提出的方法可以启用一个新的研究方向，即使用隐式 3D 对象表示来解决机器人视觉应用中的下一个最佳视图问题，将我们的方法与现有的方法区分开来依赖于显式 3D 几何建模的方法。\n## Sep11 - Sep17, 2022\n  - [iDF-SLAM：具有神经隐式映射和深度特征跟踪的端到端 RGB-D SLAM](https://arxiv.org/abs/2209.07919) | [code]\n    > 我们提出了一种新颖的端到端 RGB-D SLAM 
iDF-SLAM，它采用基于特征的深度神经跟踪器作为前端，采用 NeRF 风格的神经隐式映射器作为后端。神经隐式映射器是即时训练的，虽然神经跟踪器是在 ScanNet 数据集上进行预训练的，但它也会随着神经隐式映射器的训练进行微调。在这样的设计下，我们的 iDF-SLAM 能够学习使用特定场景的特征进行相机跟踪，从而实现 SLAM 系统的终身学习。跟踪器和映射器的训练都是自我监督的，没有引入地面真实姿势。我们在 Replica 和 ScanNet 数据集上测试了 iDF-SLAM 的性能，并将结果与​​最近的两个基于 NeRF 的神经 SLAM 系统进行了比较。所提出的 iDF-SLAM 在场景重建和相机跟踪的竞争性能方面展示了最先进的结果。\n## Sep4 - Sep10, 2022\n  - [PixTrack：使用 NeRF 模板和特征度量对齐的精确 6DoF 对象姿势跟踪](https://arxiv.org/abs/2209.03910) | [code]\n    > 我们提出了 PixTrack，这是一个基于视觉的对象姿态跟踪框架，使用新颖的视图合成和深度特征度量对齐。我们的评估表明，我们的方法可以对 RGB 图像中的对象进行高度准确、稳健且无抖动的 6DoF 姿态估计，而无需任何数据注释或轨迹平滑。我们的方法在计算上也很高效，可以轻松进行多对象跟踪，而无需更改我们的方法，并且只使用 CPU 多处理。\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n  - [SCONE：通过体积积分优化未知环境中的表面覆盖率](https://arxiv.org/abs/2208.10449) | [code]\n    > 下一个最佳视图计算 (NBV) 是机器人技术中长期存在的问题，包括识别下一个信息量最大的传感器位置，以有效且准确地重建 3D 对象或场景。像大多数当前方法一样，我们考虑来自深度传感器的 NBV 预测。依赖于场景体积表示的基于学习的方法适用于路径规划，但不能很好地适应场景的大小，并且精度低于使用基于表面的表示的方法。然而，后者将相机限制在少数姿势。为了获得这两种表示的优点，我们表明我们可以通过蒙特卡罗积分在体积表示上最大化表面度量。我们的方法可扩展到大型场景并处理自由相机运动：它将由深度传感器（如激光雷达系统）收集的任意大点云以及相机姿势作为输入来预测 NBV。我们在由大型复杂 3D 场景组成的新数据集上展示了我们的方法。\n## Aug14 - Aug20, 2022\n  - [8 点算法作为 ViTs 相对姿势预测的归纳偏差, 3DV2022](https://arxiv.org/abs/2208.08988) | [***``[code]``***](https://github.com/crockwell/rel_pose)\n    > 我们提出了一个简单的基线，用于直接估计两个图像之间的相对姿势（旋转和平移，包括比例）。深度方法最近显示出强劲的进展，但通常需要复杂或多阶段的架构。我们展示了一些修改可以应用于视觉转换器 (ViT)，以使其计算接近八点算法。这种归纳偏差使一种简单的方法在多种环境中具有竞争力，通常在有限的数据机制中显着提高现有技术水平，并具有强大的性能提升。\n## Aug7 - Aug13, 2022\n  - [RelPose：预测野外单个物体的概率相对旋转, ECCV2022](https://jasonyzhang.com/relpose/) | [***``[code]``***](https://github.com/jasonyzhang/relpose)\n    > 我们描述了一种数据驱动的方法，用于在给定任意对象的多个图像的情况下推断相机视点。该任务是经典几何流水线（如 SfM 和 SLAM）的核心组成部分，也是当代神经方法（例如 NeRF）对对象重建和视图合成的重要预处理要求。与现有的在稀疏视图中表现不佳的对应驱动方法相比，我们提出了一种基于自上而下预测的方法来估计相机视点。我们的关键技术见解是使用基于能量的公式来表示相对相机旋转的分布，从而使我们能够明确表示由对象对称性或视图产生的多个相机模式。利用这些相对预测，我们从多张图像中共同估计一组一致的相机旋转。我们表明，在给定可见和不可见类别的稀疏图像的情况下，我们的方法优于最先进的 SfM 和 SLAM 方法。此外，我们的概率方法明显优于直接回归相对姿势，这表明建模多模态对于连贯的关节重建很重要。我们证明我们的系统可以成为从多视图数据集进行野外重建的垫脚石。包含代码和视频的项目页面可以在这个 https URL 找到。\n## Jul31 - Aug6, 
2022\n  - [PRIF: Primary Ray-based Implicit Function](https://research.google/pubs/pub51556/) | [code]\n    > 我们引入了一种新的隐式形状表示，称为基于初级光线的隐式函数 (PRIF)。与大多数基于符号距离函数 (SDF) 处理空间位置的现有方法相比，我们的表示在定向射线上运行。具体来说，PRIF 被制定为直接生成给定输入射线的表面命中点，而无需昂贵的球体跟踪操作，从而实现高效的形状提取和可微渲染。我们证明了经过训练以编码 PRIF 的神经网络在各种任务中取得了成功，包括单一形状表示、类别形状生成、稀疏或嘈杂观察的形状补全、相机姿态估计的逆渲染以及颜色的神经渲染。\n## Jul24 - Jul30, 2022\n  - [ObjectFusion：具有神经对象先验的准确对象级 SLAM, Graphical Models, Volume 123, September 2022](https://www.sciencedirect.com/science/article/pii/S1524070322000418) | [code]\n    > 以前的对象级同步定位和映射 (SLAM) 方法仍然无法以有效的方式创建高质量的面向对象的 3D 地图。主要挑战来自如何有效地表示对象形状以及如何将这种对象表示有效地应用于准确的在线相机跟踪。在本文中，我们提供 ObjectFusion 作为静态场景中的一种新颖的对象级 SLAM，它通过利用神经对象先验，有效地创建具有高质量对象重建的面向对象的 3D 地图。我们提出了一种仅具有单个编码器-解码器网络的神经对象表示，以有效地表达各种类别的对象形状，这有利于对象实例的高质量重建。更重要的是，我们建议将这种神经对象表示转换为精确测量，以共同优化对象形状、对象姿态和相机姿态，以实现最终准确的 3D 对象重建。通过对合成和真实世界 RGB-D 数据集的广泛评估，我们表明我们的 ObjectFusion 优于以前的方法，具有更好的对象重建质量，使用更少的内存占用，并且以更有效的方式，尤其是在对象级别。\n  - [神经密度-距离场, ECCV2022](https://arxiv.org/abs/2207.14455) | [***``[code]``***](https://ueda0319.github.io/neddf/)\n    > 神经领域在 3D 视觉任务中的成功现在是无可争辩的。遵循这一趋势，已经提出了几种针对视觉定位的方法（例如，SLAM）来使用神经场估计距离或密度场。然而，仅通过基于密度场的方法（例如神经辐射场 (NeRF)）很难实现高定位性能，因为它们在大多数空白区域中不提供密度梯度。另一方面，基于距离场的方法，例如神经隐式表面 (NeuS)，在对象的表面形状方面存在局限性。本文提出了神经密度-距离场 (NeDDF)，这是一种新的 3D 表示，它相互约束距离和密度场。我们将距离场公式扩展到没有明确边界表面的形状，例如毛皮或烟雾，这使得从距离场到密度场的显式转换成为可能。通过显式转换实现的一致距离和密度场既能保证初始值的鲁棒性，又能实现高质量的配准。此外，场之间的一致性允许从稀疏点云快速收敛。实验表明，NeDDF 可以实现高定位性能，同时在新颖的视图合成上提供与 NeRF 相当的结果。该代码可在此 https URL 获得。\n  - [ShAPO：多对象形状、外观和姿势优化的隐式表示, ECCV2022](https://arxiv.org/abs/2207.13691) | [***``[code]``***](https://zubair-irshad.github.io/projects/ShAPO.html)\n    > 我们的方法从单个 RGB-D 观察中研究以对象为中心的 3D 理解的复杂任务。由于这是一个不适定问题，现有方法在具有遮挡的复杂多对象场景中的 3D 形状和 6D 姿势和尺寸估计性能低下。我们提出了 ShAPO，一种用于联合多对象检测、3D 纹理重建、6D 对象姿态和大小估计的方法。 ShAPO 的关键是一个单次管道，用于回归形状、外观和姿势潜在代码以及每个对象实例的掩码，然后以稀疏到密集的方式进一步细化。首先学习了一种新的解开的先验形状和外观数据库，以将对象嵌入到它们各自的形状和外观空间中。我们还提出了一种新颖的、基于八叉树的可微优化步骤，使我们能够以综合分析的方式在学习的潜在空间下同时进一步改进对象形状、姿势和外观。我们新颖的联合隐式纹理对象表示使我们能够准确地识别和重建新的看不见的对象，而无需访问它们的 3D
网格。通过广泛的实验，我们证明了我们的方法在模拟室内场景上进行训练，能够以最少的微调准确地回归现实世界中新物体的形状、外观和姿势。我们的方法显着优于 NOCS 数据集上的所有基线，6D 姿态估计的 mAP 绝对提高了 8%。\n  - [GAUDI：沉浸式 3D 场景生成的神经架构师](https://arxiv.org/abs/2207.13751) | [***``[code]``***](https://github.com/apple/ml-gaudi)\n    > 我们介绍了 GAUDI，这是一种生成模型，能够捕捉复杂而逼真的 3D 场景的分布，可以从移动的相机中沉浸式地渲染。我们用一种可扩展但功能强大的方法来解决这个具有挑战性的问题，我们首先优化一个潜在的表示，以解开辐射场和相机姿势。然后使用这种潜在表示来学习生成模型，该模型可以无条件和有条件地生成 3D 场景.我们的模型通过消除相机姿态分布可以跨样本共享的假设来概括以前专注于单个对象的工作。我们展示了 GAUDI 在跨多个数据集的无条件生成设置中获得了最先进的性能，并允许在给定条件变量（如稀疏图像观察或描述场景的文本）的情况下有条件地生成 3D 场景。\n  - [AlignSDF：用于手对象重建的姿势对齐有符号距离场, ECCV2022](https://arxiv.org/abs/2207.12909) | [***``[code]``***](https://zerchen.github.io/projects/alignsdf.html)\n    > 最近的工作在从单目彩色图像联合重建手和操纵对象方面取得了令人瞩目的进展。现有方法侧重于参数网格或符号距离场 (SDF) 方面的两种替代表示。一方面，参数模型可以从先验知识中受益，但代价是有限的形状变形和网格分辨率。因此，网格模型可能无法精确重建细节，例如手和物体的接触面。另一方面，基于 SDF 的方法可以表示任意细节，但缺乏明确的先验。在这项工作中，我们的目标是使用参数表示提供的先验改进 SDF 模型。特别是，我们提出了一个联合学习框架，可以解开姿势和形状。我们从参数模型中获取手和物体的姿势，并使用它们在 3D 空间中对齐 SDF。我们表明，这种对齐的 SDF 更好地专注于重建形状细节并提高手和物体的重建精度。我们评估了我们的方法，并在具有挑战性的 ObMan 和 DexYCB 基准上展示了对现有技术的显着改进。\n## Previous weeks\n  - [野外的 NeRF：无约束照片集的神经辐射场, CVPR2021](https://arxiv.org/abs/2008.02268) | [code]\n    > 我们提出了一种基于学习的方法，用于仅使用野外照片的非结构化集合来合成复杂场景的新视图。我们建立在神经辐射场 (NeRF) 的基础上，它使用多层感知器的权重将场景的密度和颜色建模为 3D 坐标的函数。虽然 NeRF 在受控设置下捕获的静态对象的图像上效果很好，但它无法在不受控的图像中模拟许多普遍存在的真实世界现象，例如可变照明或瞬态遮挡物。我们为 NeRF 引入了一系列扩展来解决这些问题，从而能够从互联网上获取的非结构化图像集合中进行准确的重建。我们将我们的系统（称为 NeRF-W）应用于著名地标的互联网照片集，并展示时间一致的新颖视图渲染，这些渲染比现有技术更接近真实感。\n  - [Ha-NeRF：野外的幻觉神经辐射场, CVPR2022](https://rover-xingyu.github.io/Ha-NeRF/) | [***``[code]``***](https://github.com/rover-xingyu/Ha-NeRF)\n    > 神经辐射场 (NeRF) 最近因其令人印象深刻的新颖视图合成能力而广受欢迎。本文研究了幻觉 NeRF 的问题：即在一天中的不同时间从一组旅游图像中恢复一个真实的 NeRF。现有的解决方案采用具有可控外观嵌入的 NeRF 在各种条件下渲染新颖的视图，但它们无法渲染具有看不见的外观的视图一致图像。为了解决这个问题，我们提出了一个用于构建幻觉 NeRF 的端到端框架，称为 Ha-NeRF。具体来说，我们提出了一个外观幻觉模块来处理随时间变化的外观并将它们转移到新的视图中。考虑到旅游图像的复杂遮挡，我们引入了一个反遮挡模块来准确地分解静态主体以获得可见性。合成数据和真实旅游照片集的实验结果表明，我们的方法可以产生幻觉，并从不同的视图呈现无遮挡的图像。\n  - [Nerfies：可变形的神经辐射场, ICCV2021](https://arxiv.org/abs/2011.12948) | [code]\n    > 
我们提出了第一种能够使用从手机随便捕获的照片/视频来逼真地重建可变形场景的方法。我们的方法通过优化一个额外的连续体积变形场来增强神经辐射场 (NeRF)，该场将每个观察点扭曲成一个规范的 5D NeRF。我们观察到这些类似 NeRF 的变形场容易出现局部最小值，并为基于坐标的模型提出了一种从粗到细的优化方法，可以实现更稳健的优化。通过将几何处理和物理模拟的原理应用于类似 NeRF 的模型，我们提出了变形场的弹性正则化，进一步提高了鲁棒性。我们表明，我们的方法可以将随意捕获的自拍照片/视频转换为可变形的 NeRF 模型，允许从任意视角对主体进行逼真的渲染，我们称之为“nerfies”。我们通过使用带有两部手机的装备收集时间同步数据来评估我们的方法，从而在不同视点产生相同姿势的训练/验证图像。我们表明，我们的方法忠实地重建了非刚性变形的场景，并以高保真度再现了看不见的视图。\n  - [用于单目 4D 面部头像重建的动态神经辐射场, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > 我们提出了用于模拟人脸外观和动态的动态神经辐射场。对说话的人进行数字建模和重建是各种应用程序的关键组成部分。特别是对于 AR 或 VR 中的远程呈现应用，需要忠实再现外观，包括新颖的视点或头部姿势。与显式建模几何和材料属性或纯粹基于图像的最先进方法相比，我们引入了基于场景表示网络的头部隐式表示。为了处理面部的动态，我们将场景表示网络与低维可变形模型相结合，该模型提供对姿势和表情的显式控制。我们使用体积渲染从这种混合表示中生成图像，并证明这种动态神经场景表示只能从单目输入数据中学习，而不需要专门的捕获设置。在我们的实验中，我们表明这种学习的体积表示允许生成照片般逼真的图像，其质量超过了基于视频的最先进的重演方法的质量。\n  - [神经关节辐射场, ICCV2021](https://arxiv.org/abs/2104.03110) | [***``[code]``***](https://github.com/nogu-atsu/NARF#code)\n    > 我们提出了神经关节辐射场 (NARF)，这是一种新颖的可变形 3D 表示，用于从图像中学习到的关节对象。虽然 3D 隐式表示的最新进展使得学习复杂对象的模型成为可能，但学习关节对象的姿势可控表示仍然是一个挑战，因为当前的方法需要 3D 形状监督并且无法呈现外观。在制定 3D 关节对象的隐式表示时，我们的方法在求解每个 3D 位置的辐射场时仅考虑最相关对象部分的刚性变换。通过这种方式，所提出的方法可以表示与姿势相关的变化，而不会显着增加计算复杂度。 NARF 是完全可微的，可以从带有姿势注释的图像中训练出来。此外，通过使用自动编码器，它可以学习对象类的多个实例的外观变化。实验表明，所提出的方法是有效的，并且可以很好地推广到新的姿势。\n  - [神经演员：具有姿势控制的人类演员的神经自由视图合成, SIGSIGGRAPH Asia 2021](https://vcai.mpi-inf.mpg.de/projects/NeuralActor/) | [***``[code]``***](https://people.mpi-inf.mpg.de/~lliu/projects/NeuralActor/)\n    > 我们提出了神经演员 (NA)，这是一种从任意视角和任意可控姿势下高质量合成人类的新方法。我们的方法建立在最近的神经场景表示和渲染工作之上，这些工作仅从 2D 图像中学习几何和外观的表示。虽然现有作品展示了令人信服的静态场景渲染和动态场景回放，但使用神经隐式方法对人类进行逼真的重建和渲染，特别是在用户控制的新姿势下，仍然很困难。为了解决这个问题，我们利用粗体模型作为代理将周围的 3D 空间展开为规范姿势。神经辐射场从多视图视频输入中学习规范空间中与姿势相关的几何变形以及与姿势和视图相关的外观效果。为了合成高保真动态几何和外观的新视图，我们利用在身体模型上定义的 2D 纹理图作为潜在变量来预测残余变形和动态外观。实验表明，我们的方法在回放和新颖的姿势合成方面取得了比现有技术更好的质量，甚至可以很好地推广到与训练姿势截然不同的新姿势。此外，我们的方法还支持合成结果的体形控制。\n  - [iNeRF：用于姿势估计的反转神经辐射场, IROS2021](http://yenchenlin.me/inerf/) | 
[***``[code]``***](https://github.com/yenchenlin/iNeRF-public)\n    > 我们提出了 iNeRF，这是一个通过“反转”经过训练的神经辐射场 (NeRF) 来执行姿态估计的框架。 NeRF 已被证明对视图合成任务非常有效——合成真实世界场景或对象的逼真的新视图。在这项工作中，我们研究是否可以使用 NeRF 进行综合分析来进行 6DoF 姿势估计——给定图像，找到相机相对于 3D 模型的平移和旋转。从初始姿态估计开始，我们使用梯度下降来最小化从已经训练的 NeRF 渲染的像素和观察图像中的像素之间的残差。在我们的实验中，我们首先研究 1）如何在 iNeRF 的姿势细化过程中对光线进行采样以收集信息梯度，以及 2）不同批次大小的光线如何影响合成数据集上的 iNeRF。然后，我们展示了对于来自 LLFF 数据集的复杂现实世界场景，iNeRF 可以通过估计新图像的相机位姿并将这些图像用作 NeRF 的额外训练数据来改进 NeRF。最后，我们展示了 iNeRF 可以与基于特征的姿势初始化相结合。该方法优于所有其他依赖 LineMOD 上的合成数据的基于 RGB 的方法。\n  - [A-NeRF：通过神经渲染进行无表面人体 3D 姿势细化, NeurIPS2021](https://arxiv.org/abs/2102.06199) | [***``[code]``***](https://github.com/LemonATsu/A-NeRF)\n    > 虽然深度学习使用前馈网络重塑了经典的运动捕捉管道，但需要生成模型通过迭代细化来恢复精细对齐。不幸的是，现有模型通常是在受控条件下手工制作或学习的，仅适用于有限的领域。我们提出了一种通过扩展神经辐射场 (NeRFs) 从未标记的单目视频中学习生成神经体模型的方法。我们为它们配备了骨架，以适用于时变和关节运动。一个关键的见解是，隐式模型需要与显式曲面模型中使用的正向运动学相反。我们的重新参数化定义了相对于身体部位姿势的空间潜在变量，从而克服了过度参数化的不适定逆运算。这使得从头开始学习体积身体形状和外观，同时共同改进关节姿势；输入视频上的所有外观、姿势或 3D 形状都没有地面实况标签。当用于新视图合成和动作捕捉时，我们的神经模型提高了不同数据集的准确性。项目网站：此 https 网址。\n  - [NeRF--：没有已知相机参数的神经辐射场](https://nerfmm.active.vision/) | [***``[code]``***](https://github.com/ActiveVisionLab/nerfmm)\n    > 考虑到仅来自一组 2D 图像的新视图合成 (NVS) 问题，我们通过消除已知或预先计算的相机参数的要求，简化了前向场景中神经辐射场 (NeRF) 的训练过程，包括内在函数和 6DoF 姿势。为此，我们提出了 NeRF−−，具有三个贡献：首先，我们表明相机参数可以通过光度重建作为可学习参数与 NeRF 训练联合优化；其次，为了对相机参数估计和新颖视图渲染的质量进行基准测试，我们引入了一个新的路径跟踪合成场景数据集，称为 Blender Forward-Facing Dataset (BLEFF)；第三，我们进行了广泛的分析以了解各种相机运动下的训练行为，并表明在大多数情况下，联合优化管道可以恢复准确的相机参数并实现与使用 COLMAP 预计算相机参数训练的方法相当的新视图合成质量。\n  - [实时隐式映射和定位, ICCV2021](https://arxiv.org/abs/2103.12352) | [code]\n    > 我们首次展示了多层感知器 (MLP) 可以作为手持 RGB-D 相机的实时 SLAM 系统中唯一的场景表示。我们的网络在没有先验数据的情况下进行实时操作训练，构建了一个密集的、特定于场景的隐式 3D 占用率和颜色模型，该模型也可立即用于跟踪。\n  - [用于 SLAM 的 NICE-SLAM 神经隐​​式可扩展编码, CVPR2022](https://arxiv.org/abs/2112.12130) | [***``[code]``***](https://github.com/cvg/nice-slam)\n    > 神经隐式表示最近在各个领域都显示出令人鼓舞的结果，包括在同时定位和映射 (SLAM) 方面取得的可喜进展。然而，现有方法会产生过度平滑的场景重建，并且难以扩展到大场景。这些限制主要是由于它们简单的全连接网络架构没有在观察中包含本地信息。在本文中，我们提出了 NICE-SLAM，这是一种密集的 SLAM 
系统，它通过引入分层场景表示来结合多级局部信息。使用预先训练的几何先验优化这种表示可以在大型室内场景中进行详细的重建。与最近的神经隐式 SLAM 系统相比，我们的方法更具可扩展性、高效性和鲁棒性。在五个具有挑战性的数据集上的实验证明了 NICE-SLAM 在映射和跟踪质量方面的竞争结果。\n  - [GNeRF：基于 GAN 的无姿势相机的神经辐射场, ICCV2021(oral)](https://arxiv.org/abs/2103.15606) | [code]\n    > 我们介绍了 GNeRF，这是一个将生成对抗网络 (GAN) 与神经辐射场 (NeRF) 重建相结合的框架，用于具有未知甚至随机初始化相机姿势的复杂场景。最近基于 NeRF 的进展因显着的逼真的新视图合成而受到欢迎。然而，它们中的大多数严重依赖于准确的相机位姿估计，而最近的一些方法只能在相机轨迹相对较短的大致前向场景中优化未知相机位姿，并且需要粗略的相机位姿初始化。不同的是，我们的 GNeRF 仅将随机初始化的姿势用于复杂的由外而内的场景。我们提出了一种新颖的两阶段端到端框架。第一阶段将 GAN 的使用带入新领域，以联合优化粗略的相机姿势和辐射场，而第二阶段通过额外的光度损失对它们进行细化。我们使用混合迭代优化方案克服了局部最小值。对各种合成和自然场景的广泛实验证明了 GNeRF 的有效性。更令人印象深刻的是，我们的方法在那些以前被认为极具挑战性的重复模式甚至低纹理的场景中优于基线。\n  - [BARF：捆绑调整神经辐射场, ICCV2021(oral)](https://chenhsuanlin.bitbucket.io/bundle-adjusting-NeRF/) | [***``[code]``***](https://github.com/chenhsuanlin/bundle-adjusting-NeRF)\n    > 神经辐射场 (NeRF) 最近在计算机视觉界引起了极大的兴趣，因为它具有合成真实世界场景的逼真的新颖视图的能力。然而，NeRF 的一个限制是它需要准确的相机姿势来学习场景表示。在本文中，我们提出了 Bundle-Adjusting Neural Radiance Fields (BARF)，用于从不完美（甚至未知）的相机姿势训练 NeRF——学习神经 3D 表示和注册相机帧的联合问题。我们建立了与经典图像对齐的理论联系，并表明从粗到细的配准也适用于 NeRF。此外，我们表明，在 NeRF 中天真地应用位置编码会对基于合成的目标的注册产生负面影响。合成数据和真实世界数据的实验表明，BARF 可以有效地优化神经场景表示并同时解决大的相机位姿错位问题。这使得来自未知相机位姿的视频序列的视图合成和定位成为可能，为视觉定位系统（例如 SLAM）和密集 3D 映射和重建的潜在应用开辟了新途径。\n  - [自校准神经辐射场, ICCV2021](https://postech-cvlab.github.io/SCNeRF/) | [***``[code]``***](https://github.com/POSTECH-CVLab/SCNeRF)\n    > 在这项工作中，我们提出了一种用于具有任意非线性失真的通用相机的相机自校准算法。我们共同学习场景的几何形状和准确的相机参数，无需任何校准对象。我们的相机模型包括针孔模型、径向失真和可以学习任意非线性相机失真的通用噪声模型。虽然传统的自校准算法主要依赖于几何约束，但我们还结合了光度一致性。这需要学习场景的几何形状，我们使用神经辐射场 (NeRF)。我们还提出了一种新的几何损失函数，即投影射线距离损失，以结合复杂非线性相机模型的几何一致性。我们在标准真实图像数据集上验证了我们的方法，并证明我们的模型可以从头开始学习相机的内在和外在（姿势），而无需 COLMAP 初始化。此外，我们表明，以可微分的方式学习准确的相机模型可以让我们在 NeRF 上提高 PSNR。我们通过实验证明我们提出的方法适用于 NeRF 的变体。此外，我们使用一组用鱼眼镜头拍摄的图像来证明学习相机模型与 COLMAP 初始化相比，共同提高了性能。\n  - [动态场景的神经场景图, CVPR2021(oral)](https://arxiv.org/abs/2011.10379) | [***``[code]``***](https://github.com/princeton-computational-imaging/neural-scene-graphs)\n    > 最近的隐式神经渲染方法表明，可以通过仅由一组 RGB 
图像监督的预测其体积密度和颜色来学习复杂场景的准确视图合成。然而，现有方法仅限于学习将所有场景对象编码为单个神经网络的静态场景的有效表示，并且缺乏将动态场景表示和分解为单个场景对象的能力。在这项工作中，我们提出了第一个将动态场景分解为场景图的神经渲染方法。我们提出了一种学习的场景图表示，它对对象变换和辐射进行编码，以有效地渲染场景的新颖排列和视图。为此，我们学习隐式编码的场景，并结合联合学习的潜在表示来描述具有单个隐式函数的对象。我们在合成和真实汽车数据上评估所提出的方法，验证我们的方法学习动态场景 - 仅通过观察该场景的视频 - 并允许渲染具有看不见的对象集的新颖场景组合的新颖照片般逼真的视图看不见的姿势。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/reconstruction.md",
    "content": "\n每周分类神经辐射场 - reconstruction ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n===========================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n  - [ResNeRF：用于室内场景新视图合成的几何引导残余神经辐射场](https://arxiv.org/abs/2211.16211) | [code]\n    > 我们代表 ResNeRF，这是一种用于室内场景新颖视图合成的新颖几何引导两阶段框架。请注意，良好的几何形状将极大地提高新视图合成的性能，并且为了避免几何模糊问题，我们建议基于从场景几何形状估计的基本密度和参数化的残差密度来表征场景的密度分布几何。在第一阶段，我们专注于基于 SDF 表示的几何重建，这将导致场景的良好几何表面和清晰的密度。在第二阶段，残差密度是基于第一阶段学习的SDF来学习的，用于编码更多关于外观的细节。通过这种方式，我们的方法可以更好地学习具有几何先验的密度分布，用于高保真新视图合成，同时保留 3D 结构。在具有许多观察较少和无纹理区域的大型室内场景上进行的实验表明，凭借良好的 3D 表面，我们的方法实现了新视图合成的最先进性能。\n  - [恢复神经隐式表面重建的精细细节](https://arxiv.org/abs/2211.11320) | [code]\n    > 最近关于隐式神经表征的工作取得了重大进展。使用体绘制学习隐式神经表面在没有 3D 监督的多视图重建中得到了普及。然而，由于几何和外观表示的潜在模糊性，准确地恢复精细细节仍然具有挑战性。在本文中，我们提出了 D-NeuS，一种能够恢复精细几何细节的基于体积渲染的神经隐式表面重建方法，它通过两个额外的损失函数扩展了 NeuS，旨在提高重建质量。首先，我们鼓励来自 alpha 合成的渲染表面点具有零符号距离值，从而减轻将 SDF 转换为体积渲染密度所产生的几何偏差。其次，我们在表面点上施加多视图特征一致性，这是通过沿射线从采样点插值 SDF 零交叉得出的。广泛的定量和定性结果表明，我们的方法重建了具有细节的高精度表面，并且优于现有技术。\n## Nov13 - Nov19, 2022\n  - [Magic3D：高分辨率文本到 3D 内容创建](https://arxiv.org/abs/2211.10440) | [code]\n    > DreamFusion 最近展示了预训练的文本到图像扩散模型在优化神经辐射场 (NeRF) 方面的实用性，实现了卓越的文本到 3D 合成结果。然而，该方法有两个固有的局限性：(a) NeRF 的优化极其缓慢和 (b) NeRF 上的低分辨率图像空间监督，导致处理时间长的低质量 3D 模型。在本文中，我们通过使用两阶段优化框架来解决这些限制。首先，我们使用低分辨率扩散先验获得粗糙模型，并使用稀疏 3D 哈希网格结构进行加速。使用粗略表示作为初始化，我们进一步优化了带纹理的 3D 网格模型，该模型具有与高分辨率潜在扩散模型交互的高效可微分渲染器。我们的方法被称为 Magic3D，可以在 40 分钟内创建高质量的 3D 
网格模型，比 DreamFusion 快 2 倍（据报道平均需要 1.5 小时），同时还实现了更高的分辨率。用户研究表明 61.7% 的评分者更喜欢我们的方法而不是 DreamFusion。连同图像调节生成功能，我们为用户提供了控制 3D 合成的新方法，为各种创意应用开辟了新途径。\n  - [用于形状引导生成 3D 形状和纹理的 Latent-NeRF](https://arxiv.org/abs/2211.07600) | [code]\n    > 近年来，文本引导图像生成发展迅速，激发了文本引导形状生成方面的重大突破。最近，已经表明，使用分数蒸馏，可以成功地通过文本引导 NeRF 模型生成 3D 对象。我们将分数蒸馏调整为公开可用且计算效率高的潜在扩散模型，该模型将整个扩散过程应用于预训练自动编码器的紧凑潜在空间中。由于 NeRF 在图像空间中运行，因此通过潜在分数蒸馏来引导它们的简单解决方案需要在每个引导步骤中编码到潜在空间。相反，我们建议将 NeRF 带入潜在空间，从而产生 Latent-NeRF。分析我们的 Latent-NeRF，我们表明虽然文本到 3D 模型可以产生令人印象深刻的结果，但它们本质上是不受约束的，并且可能缺乏引导或执行特定 3D 结构的能力。为了协助和指导 3D 生成，我们建议使用 Sketch-Shape 来指导我们的 Latent-NeRF：一种定义所需对象的粗略结构的抽象几何体。然后，我们提出了将这种约束直接集成到 Latent-NeRF 中的方法。这种文本和形状指导的独特组合可以增强对生成过程的控制。我们还表明，潜在分数蒸馏可以成功地直接应用于 3D 网格。这允许在给定的几何体上生成高质量的纹理。我们的实验验证了我们不同形式的指导的力量和使用潜在渲染的效率。可通过此 https 网址实现\n## Nov6 - Nov12, 2022\n  - [用于 3D 场景重建的定向射线距离函数, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-20086-1_12) | [code]\n    > 我们提出了一种从单个看不见的图像重建全 3D 场景的方法。我们训练了真实的非水密场景扫描数据集。我们的方法使用预测距离函数，因为这些函数在处理复杂拓扑和大空间方面显示出前景。我们确定并分析了预测此类图像条件距离函数的两个关键挑战，这些距离函数阻碍了它们在真实 3D 场景数据上的成功。首先，我们展示了从图像预测传统场景距离需要对大感受野进行推理。其次，我们分析表明，经过训练以预测这些距离函数的网络的最佳输出不符合所有距离函数属性。我们提出了一种替代距离函数，即定向射线距离函数 (DRDF)，它可以解决这两个挑战。我们表明，在 Matterport3D、3DFront 和 ScanNet 上从单个图像进行 3D 重建时，经过训练以预测 DRDF 的深度网络在数量和质量上优于所有其他方法。 （项目页面：https://nileshkulkarni.github.io/scene_drdf）\n  - [3D常见宠物：现实生活中可变形类别的动态新视角合成](https://arxiv.org/abs/2211.03889) | [code]\n    > 从稀疏视图中获得对象的逼真重建本质上是模棱两可的，只能通过学习合适的重建先验来实现。早期关于稀疏刚性对象重建的工作成功地从大型数据集（如 CO3D）中学习了这样的先验。在本文中，我们将这种方法扩展到动态对象。我们以猫和狗作为代表性示例，并介绍 Common Pets in 3D (CoP3D)，这是一组众包视频，展示了大约 4,200 种不同的宠物。 CoP3D 是首批用于“野外”非刚性 3D 重建基准测试的大型数据集之一。我们还提出了 Tracker-NeRF，这是一种从我们的数据集中学习 4D 重建的方法。在测试时，给定一个看不见的物体的少量视频帧，Tracker-NeRF 预测其 3D 点的轨迹并生成新视图、插值视点和时间。 CoP3D 的结果揭示了比现有基线更好的非刚性新视图合成性能。\n## Oct30 - Nov5, 2022\n  - [HyperSound：使用超网络生成音频信号的隐式神经表示](https://arxiv.org/abs/2211.01839) | [code]\n    > 隐式神经表征 (INR) 是一个快速发展的研究领域，它提供了表示多媒体信号的替代方法。 INR 最近的应用包括图像超分辨率、高维信号压缩或 3D 
渲染。然而，这些解决方案通常侧重于视觉数据，将它们适应音频领域并非易事。此外，它需要为每个数据样本单独训练模型。为了解决这个限制，我们提出了 HyperSound，这是一种利用超网络为训练时看不见的音频信号生成 INR 的元学习方法。我们表明，我们的方法可以重建声波，其质量可与其他最先进的模型相媲美。\n  - [使用表面信号参数化学习神经隐式表示](https://arxiv.org/abs/2211.00519) | [code]\n    > 神经隐式表面表示最近已成为显式 3D 对象编码的流行替代方法，例如多边形网格、列表点或体素。虽然重要的工作已经提高了这些表示的几何保真度，但很少有人关注它们的最终外观。传统的显式对象表示通常将 3D 形状数据与辅助表面映射图像数据耦合，例如漫反射颜色纹理和法线贴图中的精细几何细节，通常需要将 3D 表面映射到平面上，即表面参数化;另一方面，由于缺乏可配置的表面参数化，隐式表示不能轻易地进行纹理化。受这种数字内容创作方法的启发，我们设计了一种神经网络架构，该架构隐式编码适合外观数据的底层表面参数化。因此，我们的模型与现有的具有外观数据的基于网格的数字内容保持兼容。受到最近将紧凑网络过度拟合到单个 3D 对象的工作的启发，我们提出了一种新的权重编码神经隐式表示，它扩展了神经隐式表面的能力，以实现纹理映射的各种常见和重要应用。我们的方法优于合理的基线和最先进的替代方法。\n  - [gCoRF：生成合成辐射场, 3DV2022](https://vcai.mpi-inf.mpg.de/projects/gCoRF/) | [code]\n    > 对象的 3D 生成模型可通过 3D 控制实现逼真的图像合成。现有方法将场景建模为全局场景表示，忽略了场景的组成方面。除了支持可概括的 3D 推理之外，组合推理还可以支持各种编辑应用程序。在本文中，我们提出了一个组合生成模型，其中对象的每个语义部分都表示为仅从野外 2D 数据中学习的独立 3D 表示。我们从全局生成模型 (GAN) 开始，学习使用 2D 分割掩码的监督将其分解为不同的语义部分。然后，我们学习合成独立采样的部分，以创建连贯的全局场景。不同的部分可以独立采样，同时保持物体的其余部分固定。我们在各种对象和部件上评估我们的方法，并演示编辑应用程序。\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n  - [神经接触场：使用触觉感应跟踪外部接触](https://arxiv.org/abs/2210.09297) | [code]\n    > 我们提出了神经接触场，一种将神经场和触觉传感结合在一起的方法，以解决跟踪对象与环境之间的外部接触的问题。了解外部接触发生在哪里是迈向可以主动控制它以促进下游操作任务的方法的第一步。用于定位环境接触的先前工作通常假定接触类型（例如点或线），不捕获接触/非接触过渡，并且仅适用于基本几何形状的对象。神经接触场是第一种无需对接触类型做出任何假设即可跟踪任意多模态外部接触的方法。我们的主要见解是估计物体形状潜在空间中任何 3D 点的接触概率，给定基于视觉的触觉输入，该输入感知外部接触引起的局部运动。在实验中，我们发现神经接触场能够定位多个接触块，而无需对接触的几何形状做出任何假设，并在看不见的环境配置中捕获具有看不见的形状的已知类别对象的接触/非接触转换。除了神经接触场之外，我们还发布了模拟外部接触交互的 YCB-Extrinsic-Contact 数据集，以便在该领域进行进一步研究。项目存储库：此 https 网址\n  - [S3-NeRF：单一视点下阴影和阴影的神经反射场, NeurIPS2022](https://arxiv.org/abs/2210.08936) | [***``[code]``***](https://github.com/ywq/s3nerf)\n    > 在本文中，我们解决了多视图场景重建的“双重问题”，其中我们利用在不同点光源下捕获的单视图图像来学习神经场景表示。与只能恢复 2.5D 场景表示（即可见表面的法线/深度图）的现有单视图方法不同，我们的方法学习神经反射场来表示场景的 3D 几何和 BRDF。我们的方法不依赖于多视图照片一致性，而是利用两个信息丰富的单目线索，即阴影和阴影来推断场景几何。对多个具有挑战性的数据集的实验表明，我们的方法能够从单视图图像中恢复场景的 3D 几何图形，包括可见和不可见部分。由于神经反射场表示，我们的方法对深度不连续性具有鲁棒性。它支持新视图合成和重新照明等应用程序。我们的代码和模型可以在这个 https URL 上找到。\n## Oct9 - Oct15, 2022\n  - 
[重新审视多视图光度立体, WACV2023](https://arxiv.org/abs/2210.07670) | [code]\n    > 多视图光度立体 (MVPS) 是从图像中详细和精确地 3D 采集对象的首选方法。尽管 MVPS 的流行方法可以提供出色的结果，但它们通常执行起来很复杂，并且仅限于各向同性的材料对象。为了解决这些限制，我们提出了一种简单实用的 MVPS 方法，该方法适用于各向同性以及其他对象材料类型，例如各向异性和光泽。本文提出的方法利用深度神经网络中不确定性建模的优势，实现光度立体 (PS) 和多视图立体 (MVS) 网络预测的可靠融合。然而，与最近提出的最先进技术相反，我们引入了神经体积渲染方法，用于可靠地融合 MVS 和 PS 测量。引入神经体绘制的优势在于它有助于对具有不同材料类型的对象进行可靠建模，而现有的 MVS 方法、PS 方法或两者都可能失败。此外，它允许我们处理神经 3D 形状表示，最近在许多几何处理任务中显示出出色的结果。我们建议的新损失函数旨在使用最确定的 MVS 和 PS 网络预测以及加权神经体积渲染成本来拟合隐式神经函数的零水平集。当在几个基准数据集上进行广泛测试时，所提出的方法显示了最先进的结果。\n  - [MVSPlenOctree：从多视图立体中快速和通用地重建 PlenOctree 中的辐射场, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547795) | [code]\n    > 我们提出了 MVSPlenOctree，这是一种新方法，可以有效地重建辐射场以进行视图合成。与以前特定场景的辐射场重建方法不同，我们提出了一个通用管道，可以通过从数十个稀疏展开的图像中进行多视图立体 (MVS) 推断来有效地重建 360 度可渲染的辐射场。我们的方法利用基于方差的统计特征进行 MVS 推理，并将其与基于图像的渲染和体积渲染相结合以进行辐射场重建。我们首先训练一个 MVS 机器来推理场景的密度和外观。然后，基于 PlenOctree 的空间层次结构和从粗到细的密集采样机制，我们设计了一种鲁棒高效的 PlenOctree 重建采样策略，可以鲁棒地处理遮挡。一个 360 度可渲染的辐射场可以在 MVS Machine 的 PlenOctree 中以有效的单次前向传递进行重建。我们在真实世界的 DTU、LLFF 数据集和合成数据集上训练了我们的方法。我们通过评估在训练中看不到的 DTU 数据集的测试集来验证其普遍性。总之，我们的辐射场重建方法既高效又通用，可以在几秒钟内重建一个粗略的 360 度可渲染辐射场，在几分钟内重建一个密集的辐射场。更多详情请访问项目页面：https://derry-xing.github.io/projects/MVSPlenOctree。\n  - [ParseMVS：学习用于稀疏多视图立体视觉的原始感知表面表示, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547920) | [code]\n    > 多视图立体视觉 (MVS) 通过从密集采样的图像中找到密集的照片一致对应关系来恢复 3D 表面。在本文中，我们从稀疏采样的视图（最多减少一个数量级的图像）解决具有挑战性的 MVS 任务，这在应用程序中更实用且更具成本效益。主要挑战来自严重遮挡和高度倾斜的补丁引入的显着对应模糊性。另一方面，这种模糊性可以通过结合来自全局结构的几何线索来解决。有鉴于此，我们提出 ParseMVS，通过学习 Primitive-A waR e S urface rE 表示来提升稀疏 MVS。特别是，除了了解全局结构之外，我们的新颖表示还允许保留精细细节，包括几何、纹理和可见性。更具体地说，整个场景被解析为多个几何图元。在它们中的每一个上，几何定义为沿基元法线方向的位移，以及沿每个视图方向的纹理和可见性。一个无监督的神经网络被训练来通过逐渐增加所有输入图像之间的照片一致性和渲染一致性来学习这些因素。由于表面属性在每个图元的 2D 空间中局部更改，ParseMVS 可以在优化局部细节的同时保留全局图元结构，处理“不完整”和“不准确”问题。我们通过实验证明，在不同的采样稀疏度下，尤其是在极端稀疏的 MVS 设置下，ParseMVS 在完整性和总体得分方面始终优于最先进的表面重建方法。除此之外，ParseMVS 在压缩、鲁棒性和效率方面也显示出巨大的潜力。\n  - [通过相邻几何引导体积完成的自监督多视图立体, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547926) | 
[code]\n    > 现有的自我监督多视图立体（MVS）方法在很大程度上依赖于几何推断的光度一致性，因此受到低纹理或非朗伯外观的影响。在本文中，我们观察到相邻几何具有某些共性，可以帮助推断具有挑战性或低置信度区域的正确几何。然而，由于缺乏训练数据和确保视图之间一致性的必要性，在非监督 MVS 方法中利用此类属性仍然具有挑战性。为了解决这些问题，我们提出了一种新颖的几何推理训练方案，通过选择性地掩盖具有丰富纹理的区域，其中几何可以很好地恢复并用于监督信号，然后引导一个精心设计的成本体积完成网络来学习如何恢复几何被屏蔽的区域。在推理过程中，我们然后屏蔽低置信区域，并使用成本体积完成网络进行几何校正。为了处理成本体积金字塔的不同深度假设，我们为完成网络设计了一个三分支体积推理结构。此外，通过将平面视为一种特殊的几何形状，我们首先从伪标签中识别平面区域，然后通过平面法线一致性通过高置信度标签校正低置信度像素。在 DTU 和 Tanks & Temples 上进行的大量实验证明了所提出框架的有效性和最先进的性能。\n  - [从单幅图像中进行 3D 人脸绑定的不确定性感知半监督学习, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3548285) | [code]\n    > 我们提出了一种通过动作单元 (AU)、视点和光线方向从单个输入图像中装配 3D 面的方法。现有的人脸合成和动画 3D 方法严重依赖 3D 可变形模型（3DMM），该模型建立在 3D 数据之上，无法提供直观的表情参数，而 AU 驱动的 2D 方法无法处理头部姿势和光照效果。我们通过以半监督方式将最近的 3D 重建方法与 2D AU 驱动方法相结合来弥补差距。建立在自动编码 3D 人脸重建模型的基础上，该模型在没有任何监督的情况下将深度、反照率、视点和光线解耦，我们进一步将表达式与深度和反照率的身份解耦，并使用新的条件特征转换模块和预训练的批评家进行 AU 强度估计和图像分类.新颖的目标函数是使用未标记的野外图像和带有 AU 标签的室内图像设计的。我们还利用不确定性损失将可能变化的图像 AU 区域建模为合成的输入噪声，并对有噪声的 AU 强度标签进行建模以估计 AU 评论家的强度。在四个数据集上进行的人脸编辑和动画实验表明，与六种最先进的方法相比，我们提出的方法在表情一致性、身份相似性和姿势相似性方面具有优越性和有效性。\n  - [强化神经辐射场的多尺度表示, BMVC2022](https://arxiv.org/abs/2210.04233) | [code]\n    > 神经辐射场 (NeRF) 最近成为从多视图 (MV) 图像中表示对象的新范例。然而，它无法处理多尺度 (MS) 图像和相机姿态估计错误，这通常是从日常商品相机捕获的多视图图像的情况。虽然最近提出的 Mip-NeRF 可以处理 NeRF 的多尺度成像问题，但它不能处理相机姿态估计误差。另一方面，新提出的 BARF 可以解决 NeRF 的相机位姿问题，但如果图像本质上是多尺度的，则会失败。本文提出了一种强大的多尺度神经辐射场表示方法，以同时克服两个现实世界的成像问题。我们的方法通过利用场景刚性的基本原理，使用受 NeRF 启发的方法来处理多尺度成像效果和相机姿态估计问题。为了减少由于光线空间中的多尺度图像造成的令人不快的混叠伪影，我们利用了 Mip-NeRF 多尺度表示。对于鲁棒相机位姿的联合估计，我们在神经体绘制框架中提出了基于图神经网络的多重运动平均。我们通过示例证明，为了从日常获取的多视图图像中准确地表示对象，拥有精确的相机姿态估计是至关重要的。如果不考虑相机姿态估计中的鲁棒性度量，通过圆锥截头体对多尺度混叠伪影进行建模可能会适得其反。我们在基准数据集上进行了广泛的实验，以证明我们的方法比最近的 NeRF 启发的方法在这种现实设置中提供了更好的结果。\n## Oct2 - Oct8, 2022\n  - [XDGAN：2D 空间中的多模态 3D 形状生成](https://arxiv.org/abs/2210.03007) | [code]\n    > 由于二维卷积架构的效率，二维图像的生成模型最近在质量、分辨率和速度方面取得了巨大进步。然而，由于大多数当前的 3D 表示依赖于自定义网络组件，因此很难将此进展扩展到 3D 领域。本文解决了一个核心问题：是否可以直接利用 2D 图像生成模型来生成 3D 形状？为了回答这个问题，我们提出了 XDGAN，这是一种有效且快速的方法，用于将 2D 图像 GAN 架构应用于 3D 对象几何图形的生成，并结合附加的表面属性，如颜色纹理和法线。具体来说，我们提出了一种将 3D 
形状转换为紧凑的 1 通道几何图像并利用 StyleGAN3 和图像到图像转换网络在 2D 空间中生成 3D 对象的新方法。生成的几何图像可以快速转换为 3D 网格，实现实时 3D 对象合成、可视化和交互式编辑。此外，使用标准 2D 架构有助于将更多 2D 进步带入 3D 领域。我们定量和定性地表明，我们的方法在各种任务中非常有效，例如 3D 形状生成、单视图重建和形状操作，同时与最近的 3D 生成模型相比明显更快、更灵活。\n  - [使用辐射场传播的无监督多视图对象分割, NeurIPS2022](https://arxiv.org/abs/2210.00489) | [code]\n    > 我们提出了辐射场传播 (RFP)，这是一种在重建过程中分割 3D 对象的新方法，仅给出场景的未标记多视图图像。 RFP 源自新兴的基于神经辐射场的技术，该技术将语义与外观和几何形状联合编码。我们方法的核心是一种新颖的传播策略，用于具有双向光度损失的单个对象的辐射场，能够将场景无监督地划分为对应于不同对象实例的显着或有意义的区域。为了更好地处理具有多个对象和遮挡的复杂场景，我们进一步提出了一种迭代期望最大化算法来细化对象掩码。据我们所知，RFP 是第一个在没有任何监督、注释或其他线索（如 3D 边界框和对象类别的先验知识）的情况下处理神经辐射场 (NeRF) 的 3D 场景对象分割的无监督方法。实验表明，RFP 实现了可行的分割结果，比以前的无监督图像/场景分割方法更准确，并且可与现有的基于 NeRF 监督的方法相媲美。分段对象表示支持单独的 3D 对象编辑操作。\n## Sep25 - Oct1, 2022\n  - [神经隐式曲面的球面引导训练](https://arxiv.org/abs/2209.15511) | [code]\n    > 近年来，通过神经隐函数进行表面建模已成为多视图 3D 重建的主要技术之一。然而，最先进的方法依赖于隐式函数来模拟整个场景体积，导致在具有薄物体或高频细节的区域中降低重建保真度。为了解决这个问题，我们提出了一种与辅助显式形状表示一起联合训练神经隐式表面的方法，该辅助显式形状表示充当表面引导。在我们的方法中，这种表示封装了场景的表面区域，使我们能够通过仅对该区域的体积进行建模来提高隐式函数训练的效率。我们建议使用一组可学习的球形基元作为可学习的表面指导，因为它们可以使用其梯度与神经表面函数一起有效地训练。我们的训练管道包括使用隐函数的梯度对球体中心的迭代更新，然后将后者微调到场景的更新表面区域。我们表明，对训练过程的这种修改可以插入到几种流行的隐式重建方法中，从而提高多个 3D 重建基准的结果质量。\n  - [360FusionNeRF：具有联合引导的全景神经辐射场](https://arxiv.org/abs/2209.14265) | [code]\n    > 我们提出了一种基于神经辐射场 (NeRF) 从单个 360 度全景图像合成新视图的方法。类似设置中的先前研究依赖于多层感知的邻域插值能力来完成由遮挡引起的缺失区域，这导致其预测中的伪影。我们提出了 360FusionNeRF，这是一个半监督学习框架，我们在其中引入几何监督和语义一致性来指导渐进式训练过程。首先，将输入图像重新投影到 360 度图像，并在其他相机位置提取辅助深度图。除了 NeRF 颜色指导之外，深度监督还改进了合成视图的几何形状。此外，我们引入了语义一致性损失，鼓励对新视图进行逼真的渲染。我们使用预训练的视觉编码器（例如 CLIP）提取这些语义特征，CLIP 是一种视觉转换器，通过自然语言监督从网络挖掘出的数亿张不同的 2D 照片进行训练。实验表明，我们提出的方法可以在保留场景特征的同时产生未观察到的区域的合理完成。在跨各种场景进行训练时，360FusionNeRF 在转移到合成 Structured3D 数据集（PSNR~5%，SSIM~3% LPIPS~13%）、真实世界的 Matterport3D 数据集（PSNR~3%）时始终保持最先进的性能, SSIM~3% LPIPS~9%) 和 Replica360 数据集 (PSNR~8%, SSIM~2% LPIPS~18%)。\n  - [自主隐式重建的高效视图路径规划](https://arxiv.org/abs/2209.13159) | [code]\n    > 隐式神经表示已显示出用于 3D 场景重建的巨大潜力。最近的工作通过学习用于视图路径规划的信息增益，将其应用于自主 3D 重建。虽然有效，但信息增益的计算成本很高，并且与使用体积表示的计算相比，使用 3D 点的隐式表示的碰撞检查要慢得多。在本文中，我们建议 
1）利用神经网络作为信息增益场的隐式函数逼近器，以及 2）将隐式细粒度表示与粗略的体积表示相结合以提高效率。随着效率的进一步提高，我们提出了一种基于基于图的规划器的新颖的信息路径规划。与具有隐式和显式表示的自主重建相比，我们的方法证明了重建质量和规划效率的显着提高。我们将该方法部署在真实的无人机上，结果表明我们的方法可以规划信息丰富的视图并重建高质量的场景。\n## Sep18 - Sep24, 2022\n  - [SG-SRNs：超像素引导的场景表示网络, SignalProcessingLetters](https://ieeexplore.ieee.org/abstract/document/9900405) | [code]\n    > 最近，场景表示网络（SRNs）由于其连续且轻量级的场景表示能力，在计算机视觉领域引起了越来越多的关注。然而，SRN 通常在低纹理图像区域上表现不佳。为了解决这个问题，我们在本文中提出了超像素引导的场景表示网络，称为 SG-SRN，由主干模块 (SRN)、超像素分割模块和超像素正则化模块组成。在所提出的方法中，除了新颖的视图合成任务外，表示感知的超像素分割掩码生成任务由所提出的超像素分割模块实现。然后，超像素正则化模块利用超像素分割掩码以局部平滑的方式引导要学习的主干，并优化局部区域的场景表示，以自监督的方式间接缓解低纹理区域的结构失真.在我们构建的数据集和公共 Synthetic-NeRF 数据集上的广泛实验结果表明，所提出的 SG-SRN 实现了显着更好的 3D 结构表示性能。\n  - [具有通道调谐的面向边缘的隐式神经表示](https://arxiv.org/abs/2209.11697) | [code]\n    > 隐式神经表示，将图像表示为连续函数而不是离散网格形式，广泛用于图像处理。尽管其表现出色，但在恢复给定信号的清晰形状（例如图像边缘）方面仍然存在限制。在本文中，我们提出了梯度幅度调整算法，该算法计算图像的梯度以训练隐式表示。此外，我们提出了面向边缘的表示网络（EoREN），它可以通过拟合梯度信息（面向边缘的模块）来重建具有清晰边缘的图像。此外，我们添加了 Channel-tuning 模块来调整给定信号的分布，从而解决了拟合梯度的长期问题。通过分离两个模块的反向传播路径，EoREN 可以在不妨碍梯度作用的情况下学习图像的真实颜色。我们定性地证明了我们的模型可以重建复杂的信号，并通过定量结果证明了我们模型的一般重建能力。\n  - [使用成像声纳的神经隐式表面重建](https://arxiv.org/abs/2209.08221) | [code]\n    > 我们提出了一种使用成像声纳（也称为前视声纳（FLS））对物体进行密集 3D 重建的技术。与以前将场景几何建模为点云或体积网格的方法相比，我们将几何表示为神经隐函数。此外，给定这样的表示，我们使用可微分体积渲染器来模拟声波的传播以合成成像声纳测量。我们在真实和合成数据集上进行实验，并表明我们的算法从多视图 FLS 图像中重建高保真表面几何图形的质量比以前的技术高得多，并且不会受到相关的内存开销的影响。\n  - [使用神经辐射场进行主动机器人 3D 重建的不确定性引导策略, RAL2022](https://arxiv.org/abs/2209.08409) | [code]\n    > 在本文中，我们解决了物体的主动机器人 3D 重建问题。特别是，我们研究了带有手持摄像头的移动机器人如何选择有利数量的视图来有效地恢复对象的 3D 形状。与该问题的现有解决方案相反，我们利用流行的基于神经辐射场的对象表示，最近在各种计算机视觉任务中显示出令人印象深刻的结果。然而，使用这种表示直接推断对象的显式 3D 几何细节并不简单，这使得密集 3D 重建的次佳视图选择问题具有挑战性。本文介绍了一种基于光线的体积不确定性估计器，它计算颜色样本沿物体隐式神经表示的每条光线的权重分布的熵。我们表明，使用所提出的估计器给出一个新颖的视图，可以推断出底层 3D 几何的不确定性。然后，我们提出了一个下一个最佳视图选择策略，该策略由基于神经辐射场的表示中基于射线的体积不确定性指导。令人鼓舞的合成数据和真实世界数据的实验结果表明，本文提出的方法可以启用一个新的研究方向，即使用隐式 3D 对象表示来解决机器人视觉应用中的下一个最佳视图问题，将我们的方法与现有的方法区分开来依赖于显式 3D 几何建模的方法。\n## Sep11 - Sep17, 2022\n  - [DevNet：通过密度体积构建的自监督单目深度学习, ECCV2022](https://arxiv.org/abs/2209.06351) | [code]\n    
> 单目图像的自监督深度学习通常依赖于时间相邻图像帧之间的 2D 像素级光度关系。然而，它们既没有充分利用 3D 逐点几何对应，也没有有效地解决由遮挡或照明不一致引起的光度翘曲的模糊性。为了解决这些问题，这项工作提出了密度体积构建网络 (DevNet)，这是一种新颖的自我监督单目深度学习框架，可以考虑 3D 空间信息，并利用相邻相机平截头体之间更强的几何约束。我们的 DevNet 不是直接从单个图像中回归像素值，而是将相机平截头体划分为多个平行平面，并预测每个平面上的逐点遮挡概率密度。最终的深度图是通过沿相应光线对密度进行积分来生成的。在训练过程中，引入了新的正则化策略和损失函数来减轻光度模糊和过拟合。在没有明显扩大模型参数大小或运行时间的情况下，DevNet 在 KITTI-2015 室外数据集和 NYU-V2 室内数据集上都优于几个具有代表性的基线。特别是，在深度估计任务中，KITTI-2015 和 NYU-V2 上的 DevNet 的均方根偏差降低了约 4%。此 https 网址提供了代码。\n## Sep4 - Sep10, 2022\n  - [具有学习几何先验的 3D 纹理形状恢复](https://arxiv.org/abs/2209.03254) | [code]\n    > 从部分扫描中恢复 3D 纹理形状对于许多实际应用至关重要。现有方法已经证明了隐式函数表示的有效性，但它们存在严重遮挡和不同对象类型的部分输入，这极大地阻碍了它们在现实世界中的应用价值。本技术报告介绍了我们通过结合学习几何先验来解决这些限制的方法。为此，我们从学习的姿势预测中生成一个 SMPL 模型，并将其融合到部分输入中，以添加人体的先验知识。我们还提出了一种新颖的完整性感知边界框自适应，用于处理不同级别的尺度和部分扫描的局部性。\n  - [SIRA：来自单个图像的可重新点亮的头像](https://arxiv.org/abs/2209.03027) | [code]\n    > 从单个图像中恢复人头的几何形状，同时分解材料和照明是一个严重不适定的问题，需要解决先验信息。基于 3D 可变形模型 (3DMM) 的方法，以及它们与可微渲染器的组合，已显示出可喜的结果。然而，3DMM 的表现力是有限的，它们通常会产生过度平滑且与身份无关的 3D 形状，仅限于面部区域。最近已经通过使用多层感知器参数化几何形状的神经场获得了高度准确的全头重建。这些表示的多功能性也被证明对于解开几何、材料和照明是有效的。然而，这些方法需要几十个输入图像。在本文中，我们介绍了 SIRA，这是一种从单个图像重建具有高保真几何形状和分解光和表面材料的人头头像的方法。我们的关键成分是两个基于神经场的数据驱动统计模型，可解决单视图 3D 表面重建和外观分解的模糊性。实验表明，SIRA 在 3D 头部重建中获得了最先进的结果，同时它成功地解开了全局照明、漫反射和镜面反射率。此外，我们的重建适用于基于物理的外观编辑和头部模型重新照明。\n## Aug28 - Sep3, 2022\n  - [使用有符号射线距离函数 (SRDF) 的多视图重建](https://arxiv.org/abs/2209.00082) | [code]\n    > 在本文中，我们解决了多视图 3D 形状重建的问题。尽管最近与隐式形状表示相关的可微渲染方法提供了突破性的性能，但它们的计算量仍然很大，并且通常在估计的几何形状上缺乏精度。为了克服这些限制，我们研究了一种新的计算方法，它建立在一种新的体积形状表示上，就像最近的可微渲染方法一样，但用深度图参数化以更好地实现形状表面。与此表示相关的形状能量评估给定彩色图像的 3D 几何形状，不需要外观预测，但在优化时仍然受益于体积积分。在实践中，我们提出了一种隐式形状表示，SRDF，它基于我们通过沿相机光线的深度参数化的有符号距离。相关的形状能量考虑了深度预测一致性和光度一致性之间的一致性，这在体积表示中的 3D 位置。可以考虑各种照片一致性先验，例如基于中值的基线，或更详细的标准，如学习函数。该方法保留了深度图的像素精度，并且是可并行化的。我们在标准数据集上的实验表明，它提供了关于最近使用隐式形状表示的方法以及传统的多视图立体方法的最先进的结果。\n  - [Dual-Space NeRF：在不同空间中学习动画化身和场景照明, 3DV2022](https://arxiv.org/abs/2208.14851) | [code]\n    > 在规范空间中对人体进行建模是捕捉和动画的常见做法。但是当涉及到神经辐射场 (NeRF) 时，仅仅在标准空间中学习一个静态的 NeRF 
是不够的，因为即使场景照明是恒定的，当人移动时身体的照明也会发生变化。以前的方法通过学习每帧嵌入来缓解光照的不一致性，但这种操作并不能推广到看不见的姿势。鉴于光照条件在世界空间中是静态的，而人体在规范空间中是一致的，我们提出了一种双空间 NeRF，它在两个独立的空间中使用两个 MLP 对场景光照和人体进行建模。为了弥合这两个空间，以前的方法主要依赖于线性混合蒙皮 (LBS) 算法。然而，动态神经领域的 LBS 的混合权重是难以处理的，因此通常用另一个 MLP 来记忆，这不能推广到新的姿势。尽管可以借用 SMPL 等参数网格的混合权重，但插值操作会引入更多伪影。在本文中，我们建议使用重心映射，它可以直接泛化到看不见的姿势，并且出人意料地取得了比具有神经混合权重的 LBS 更好的结果。 Human3.6M 和 ZJU-MoCap 数据集的定量和定性结果显示了我们方法的有效性。\n  - [NerfCap：使用动态神经辐射场捕获人类表现, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9870173) | [code]\n    > 本文解决了从稀疏的多视图或单目视频中捕捉人类表演的挑战。给定表演者的模板网格，以前的方法通过将模板网格非刚性地注册到具有 2D 轮廓或密集光度对齐的图像来捕获人体运动。然而，详细的表面变形无法从轮廓中恢复，而光度对齐则受到视频外观变化引起的不稳定性的影响。为了解决这些问题，我们提出了 NerfCap，这是一种基于表演者动态神经辐射场 (NeRF) 表示的新型表演捕捉方法。具体来说，通过优化变形场和规范 NeRF 的外观模型，从模板几何初始化规范 NeRF 并注册到视频帧。为了捕捉大型身体运动和详细的表面变形，NerfCap 将线性混合蒙皮与嵌入式图形变形相结合。与受限于固定拓扑和纹理的基于网格的方法相比，NerfCap 能够灵活地捕捉视频中复杂的几何形状和外观变化，并合成更逼真的图像。此外，NerfCap 可以通过将合成视频与输入视频进行匹配，以自我监督的方式进行端到端的预训练。各种数据集的实验结果表明，NerfCap 在表面重建精度和新视图合成质量方面都优于先前的工作。\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n  - [Vox-Surf：基于体素的隐式表面表示](https://arxiv.org/abs/2208.10925) | [code]\n    > 虚拟内容创建和交互在 AR 和 VR 等现代 3D 应用中发挥着重要作用。从真实场景中恢复详细的 3D 模型可以显着扩展其应用范围，并且已经在计算机视觉和计算机图形学界进行了数十年的研究。我们提出了 Vox-Surf，一种基于体素的隐式表面表示。我们的 Vox-Surf 将空间划分为有限的有界体素。每个体素在其角顶点中存储几何和外观信息。由于从体素表示继承而来的稀疏性，Vox-Surf 几乎适用于任何场景，并且可以从多个视图图像中轻松训练。我们利用渐进式训练过程逐步提取重要体素进行进一步优化，从而只保留有效体素，这大大减少了采样点的数量并提高了渲染速度。精细体素也可以视为碰撞检测的边界体积。实验表明，与其他方法相比，Vox-Surf 表示可以以更少的内存和更快的渲染速度学习精细的表面细节和准确的颜色。我们还表明，Vox-Surf 在场景编辑和 AR 应用中可以更实用。\n  - [从单目视频中对动画 3D 人体进行神经捕获, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > 我们提出了一种从单目视频输入构建可动画 3D 人体表示的新颖范例，这样它就可以以任何看不见的姿势和视图进行渲染。我们的方法基于动态神经辐射场 (NeRF)，该动态神经辐射场 (NeRF) 由作为几何代理的基于网格的参数化 3D 人体模型装配。以前的方法通常依赖多视图视频或准确的 3D 几何信息作为附加输入；此外，大多数方法在推广到看不见的姿势时质量会下降。我们认为，泛化的关键是用于查询动态 NeRF 的良好输入嵌入：良好的输入嵌入应该定义全体积空间中的单射映射，由姿态变化下的表面网格变形引导。基于这一观察，我们建议嵌入输入查询及其与网格顶点上一组测地最近邻所跨越的局部表面区域的关系。通过包含位置和相对距离信息，我们的嵌入定义了距离保留的变形映射，并很好地推广到看不见的姿势。为了减少对额外输入的依赖，我们首先使用现成的工具初始化每帧 3D 网格，然后提出一个管道来联合优化 NeRF 并细化初始网格。大量实验表明，我们的方法可以在看不见的姿势和视图下合成合理的人类渲染结果。\n## 
Aug7 - Aug13, 2022\n  - [OmniVoxel：一种快速精确的全向神经辐射场重建方法, GCCE 2022](https://arxiv.org/abs/2208.06335) | [code]\n    > 本文提出了一种利用等矩形全向图像重建神经辐射场的方法。具有辐射场的隐式神经场景表示可以在有限的空间区域内连续重建场景的 3D 形状。然而，在商用 PC 硬件上训练完全隐式表示需要大量时间和计算资源（每个场景 15 ~ 20 小时）。因此，我们提出了一种显着加速这一过程的方法（每个场景 20 ∼ 40 分钟）。我们没有使用完全隐式的光线表示来重建辐射场，而是采用包含张量中的密度和颜色特征的特征体素。考虑到全向 equirectangular 输入和相机布局，我们使用球面体素化来表示，而不是三次表示。我们的体素化方法可以平衡内景和外景的重建质量。此外，我们对颜色特征采用轴对齐位置编码方法来提高整体图像质量。我们的方法在具有随机相机姿势的合成数据集上实现了令人满意的经验性能。此外，我们在包含复杂几何形状的真实场景中测试了我们的方法，并实现了最先进的性能。我们的代码和完整的数据集将与论文发表的同时发布。\n  - [通过可微分渲染进行表面捕获的快速梯度下降, 3DV2022](https://hal.inria.fr/hal-03748662/) | [code]\n    > 差分渲染最近已成为一种强大的工具，用于从多个视图进行基于图像的渲染或几何重建，具有非常高的质量。到目前为止，此类方法已在通用对象数据库上进行了基准测试，并有望应用于一些真实数据，但尚未应用于可能受益的特定应用程序。在本文中，我们研究了如何为原始多相机性能捕获制作差分渲染系统。我们以实际可用性和可重复性的方式解决了几个关键问题，例如处理速度、模型的可解释性和一般输出模型质量。这导致我们对差分渲染框架做出了一些贡献。特别是，我们展示了差分渲染和经典优化的统一视图是可能的，从而导致可以分析计算完整的非随机梯度步骤并将完整的每帧数据存储在视频内存中的公式和实现，从而产生简单有效的实现.我们还使用稀疏存储和从粗到细的方案来实现极高的分辨率，同时包含内存和计算时间。我们通过实验表明，在质量上与最先进的多视图人体表面捕获方法相媲美的结果可以在很短的时间内实现，通常每帧大约一分钟。\n  - [PlaneFormers：从稀疏视图平面到 3D 重建, ECCV2022](https://arxiv.org/abs/2208.04307) | [code]\n    > 我们提出了一种从具有有限重叠的图像中对场景进行平面表面重建的方法。这种重建任务具有挑战性，因为它需要联合推理单图像 3D 重建、图像之间的对应关系以及图像之间的相对相机位姿。过去的工作提出了基于优化的方法。我们介绍了一种更简单的方法，PlaneFormer，它使用一个应用于 3D 感知平面令牌的转换器来执行 3D 推理。我们的实验表明，我们的方法比以前的工作要有效得多，并且几个特定于 3D 的设计决策对其成功至关重要。\n  - [PS-NeRV：视频的补丁风格化神经表示](https://arxiv.org/abs/2208.03742) | [code]\n    > 我们研究如何使用隐式神经表示 (INR) 来表示视频。经典的 INR 方法通常利用 MLP 将输入坐标映射到输出像素。虽然最近的一些作品试图用 CNN 直接重建整个图像。然而，我们认为上述像素级和图像级策略都不利于视频数据。相反，我们提出了一种补丁解决方案 PS-NeRV，它将视频表示为补丁和相应补丁坐标的函数。它自然继承了image-wise方法的优点，并以快速的解码速度实现了出色的重建性能。整个方法包括传统的模块，如位置嵌入、MLPs 和 CNNs，同时还引入了 AdaIN 来增强中间特征。这些简单而重要的变化可以帮助网络轻松适应高频细节。大量实验证明了它在视频压缩和视频修复等视频相关任务中的有效性。\n## Jul31 - Aug6, 2022\n  - [PRIF: Primary Ray-based Implicit Function](https://research.google/pubs/pub51556/) | [code]\n    > 我们引入了一种新的隐式形状表示，称为基于初级光线的隐式函数 (PRIF)。与大多数基于符号距离函数 (SDF) 处理空间位置的现有方法相比，我们的表示在定向射线上运行。具体来说，PRIF 被制定为直接生成给定输入射线的表面命中点，而无需昂贵的球体跟踪操作，从而实现高效的形状提取和可微渲染。我们证明了经过训练以编码 PRIF 
的神经网络在各种任务中取得了成功，包括单一形状表示、类别形状生成、稀疏或嘈杂观察的形状补全、相机姿态估计的逆渲染以及颜色的神经渲染。\n## Jul24 - Jul30, 2022\n  - [脱离网格：用于 3D 血管建模的连续隐式神经表示, MICCAI STACOM 2022](https://arxiv.org/abs/2207.14663) | [code]\n    > 个性化 3D 血管模型对于心血管疾病患者的诊断、预后和治疗计划非常有价值。传统上，此类模型是用网格和体素掩码等显式表示或径向基函数或原子（管状）形状等隐式表示构建的。在这里，我们建议在可微的隐式神经表示 (INR) 中通过其有符号距离函数 (SDF) 的零水平集来表示表面。这使我们能够用隐式、连续、轻量级且易于与深度学习算法集成的表示来对复杂的血管结构进行建模。我们在这里通过三个实际示例展示了这种方法的潜力。首先，我们从 CT 图像中获得了腹主动脉瘤 (AAA) 的准确且防水的表面，并从表面上的 200 个点显示出稳健的拟合。其次，我们同时将嵌套的血管壁安装在单个 INR 中，没有交叉点。第三，我们展示了如何将单个动脉的 3D 模型平滑地融合到单个防水表面中。我们的结果表明，INR 是一种灵活的表示形式，具有最小交互注释的潜力复杂血管结构的研究和操作。\n  - [GAUDI：沉浸式 3D 场景生成的神经架构师](https://arxiv.org/abs/2207.13751) | [***``[code]``***](https://github.com/apple/ml-gaudi)\n    > 我们介绍了 GAUDI，这是一种生成模型，能够捕捉复杂而逼真的 3D 场景的分布，可以从移动的相机中沉浸式地渲染。我们用一种可扩展但功能强大的方法来解决这个具有挑战性的问题，我们首先优化一个潜在的表示，以解开辐射场和相机姿势。然后使用这种潜在表示来学习生成模型，该模型可以无条件和有条件地生成 3D 场景.我们的模型通过消除相机姿态分布可以跨样本共享的假设来概括以前专注于单个对象的工作。我们展示了 GAUDI 在跨多个数据集的无条件生成设置中获得了最先进的性能，并允许在给定条件变量（如稀疏图像观察或描述场景的文本）的情况下有条件地生成 3D 场景。\n  - [AlignSDF：用于手对象重建的姿势对齐有符号距离场, ECCV2022](https://arxiv.org/abs/2207.12909) | [***``[code]``***](https://zerchen.github.io/projects/alignsdf.html)\n    > 最近的工作在从单目彩色图像联合重建手和操纵对象方面取得了令人瞩目的进展。现有方法侧重于参数网格或符号距离场 (SDF) 方面的两种替代表示。一方面，参数模型可以从先验知识中受益，但代价是有限的形状变形和网格分辨率。因此，网格模型可能无法精确重建细节，例如手和物体的接触面。另一方面，基于 SDF 的方法可以表示任意细节，但缺乏明确的先验。在这项工作中，我们的目标是使用参数表示提供的先验改进 SDF 模型。特别是，我们提出了一个联合学习框架，可以解开姿势和形状。我们从参数模型中获取手和物体的姿势，并使用它们在 3D 空间中对齐 SDF。我们表明，这种对齐的 SDF 更好地专注于重建形状细节并提高手和物体的重建精度。我们评估了我们的方法，并在具有挑战性的 ObMan 和 DexYCB 基准上展示了对现有技术的显着改进。\n  - [NeuMesh：学习基于解缠结神经网格的隐式场，用于几何和纹理编辑, ECCV2022(oral)](https://arxiv.org/abs/2207.11911) | [code]\n    > 最近，神经隐式渲染技术得到了迅速发展，并在新颖的视图合成和 3D 场景重建中显示出巨大的优势。然而，现有的用于编辑目的的神经渲染方法提供的功能有限，例如，刚性变换，或者不适用于日常生活中一般对象的细粒度编辑。在本文中，我们提出了一种新颖的基于网格的表示，通过在网格顶点上使用解开几何和纹理代码对神经隐场进行编码，这促进了一组编辑功能，包括网格引导的几何编辑、带有纹理交换的指定纹理编辑、填充和绘画操作。为此，我们开发了几种技术包括可学习的符号指标以放大基于网格的表示的空间可区分性，蒸馏和微调机制以实现稳定收敛，以及空间感知优化策略以实现精确的纹理编辑。对真实数据和合成数据的大量实验和编辑示例证明了我们的方法在表示质量和编辑能力方面的优越性。代码可在项目网页上找到：此 https URL。\n## Previous weeks\n  - 
[非刚性神经辐射场：单目视频变形场景的重建和新视图合成，, ICCV2021](https://vcai.mpi-inf.mpg.de/projects/nonrigid_nerf/) | [***``[code]``***](https://github.com/facebookresearch/nonrigid_nerf)\n    > 我们提出了非刚性神经辐射场 (NR-NeRF)，这是一种用于一般非刚性动态场景的重建和新颖的视图合成方法。我们的方法将动态场景的 RGB 图像作为输入（例如，来自单目视频记录），并创建高质量的时空几何和外观表示。我们表明，单个手持消费级相机足以从新颖的虚拟相机视图合成动态场景的复杂渲染，例如一个“子弹时间”的视频效果。 NR-NeRF 将动态场景分解为规范体积及其变形。场景变形被实现为光线弯曲，其中直线光线被非刚性变形。我们还提出了一种新的刚性网络来更好地约束场景的刚性区域，从而获得更稳定的结果。射线弯曲和刚性网络在没有明确监督的情况下进行训练。我们的公式可以实现跨视图和时间的密集对应估计，以及引人注目的视频编辑应用程序，例如运动夸张。我们的代码将是开源的。\n  - [神经关节辐射场, ICCV2021](https://arxiv.org/abs/2104.03110) | [***``[code]``***](https://github.com/nogu-atsu/NARF#code)\n    > 我们提出了神经关节辐射场 (NARF)，这是一种新颖的可变形 3D 表示，用于从图像中学习到的关节对象。虽然 3D 隐式表示的最新进展使得学习复杂对象的模型成为可能，但学习关节对象的姿势可控表示仍然是一个挑战，因为当前的方法需要 3D 形状监督并且无法呈现外观。在制定 3D 关节对象的隐式表示时，我们的方法在求解每个 3D 位置的辐射场时仅考虑最相关对象部分的刚性变换。通过这种方式，所提出的方法可以表示与姿势相关的变化，而不会显着增加计算复杂度。 NARF 是完全可微的，可以从带有姿势注释的图像中训练出来。此外，通过使用自动编码器，它可以学习对象类的多个实例的外观变化。实验表明，所提出的方法是有效的，并且可以很好地推广到新的姿势。\n  - [GRF：学习用于 3D 场景表示和渲染的一般辐射场, ICCV2021(oral)](https://arxiv.org/abs/2010.04595) | [***``[code]``***](https://github.com/alextrevithick/GRF)\n    > 我们提出了一个简单而强大的神经网络，它仅从 2D 观察中隐式表示和渲染 3D 对象和场景。该网络将 3D 几何建模为一般辐射场，它以一组具有相机位姿和内在函数的 2D 图像作为输入，为 3D 空间的每个点构建内部表示，然后渲染该点的相应外观和几何观察从任意位置。我们方法的关键是学习 2D 图像中每个像素的局部特征，然后将这些特征投影到 3D 点，从而产生一般和丰富的点表示。我们还集成了一种注意力机制来聚合来自多个 2D 视图的像素特征，从而隐式考虑视觉遮挡。大量实验表明，我们的方法可以为新物体、看不见的类别和具有挑战性的现实世界场景生成高质量和逼真的新视图。\n  - [MVSNeRF：从多视图立体快速概括辐射场重建, ICCV2021](https://apchenstu.github.io/mvsnerf/) | [***``[code]``***](https://github.com/apchenstu/mvsnerf)\n    > 我们提出了 MVSNeRF，一种新颖的神经渲染方法，可以有效地重建神经辐射场以进行视图合成。与先前的神经辐射场工作考虑对密集捕获的图像进行逐场景优化不同，我们提出了一个通用的深度神经网络，它可以通过快速网络推理仅从三个附近的输入视图重建辐射场。我们的方法利用平面扫描成本体积（广泛用于多视图立体）进行几何感知场景推理，并将其与基于物理的体积渲染相结合用于神经辐射场重建。我们在 DTU 数据集中的真实对象上训练我们的网络，并在三个不同的数据集上对其进行测试，以评估其有效性和普遍性。我们的方法可以跨场景（甚至是室内场景，与我们的对象训练场景完全不同）进行泛化，并仅使用三个输入图像生成逼真的视图合成结果，显着优于可泛化辐射场重建的并行工作。此外，如果捕捉到密集的图像，我们估计的辐射场表示可以很容易地进行微调；与 NeRF 相比，这导致具有更高渲染质量和更短优化时间的快速每场景重建。\n  - [使用 NeRF 实现新视图合成的连续深度 MPI, ICCV2021](https://arxiv.org/abs/2103.14910) 
| [***``[code]``***](https://github.com/vincentfung13/MINE)\n    > 在本文中，我们建议 MINE 通过从单个图像进行密集 3D 重建来执行新颖的视图合成和深度估计。我们的方法是通过引入神经辐射场 (NeRF) 对多平面图像 (MPI) 进行连续深度泛化。给定单个图像作为输入，MINE 预测任意深度值的 4 通道图像（RGB 和体积密度）以联合重建相机平截头体并填充被遮挡的内容。然后可以使用可微分渲染轻松地将重建和修复的截锥体渲染为新颖的 RGB 或深度视图。在 RealEstate10K、KITTI 和 Flowers Light Fields 上进行的大量实验表明，我们的 MINE 在新颖的视图合成中大大优于最先进的技术。我们还在 iBims-1 和 NYU-v2 的深度估计方面取得了具有竞争力的结果，而无需注释深度监督。我们的源代码可在此 https 网址获得\n  - [UNISURF：统一神经隐式表面和辐射场以进行多视图重建, ICCV2021(oral)](https://arxiv.org/abs/2104.10078) | [***``[code]``***](https://github.com/autonomousvision/unisurf)\n    > 神经隐式 3D 表示已成为从多视图图像重建表面和合成新视图的强大范例。不幸的是，DVR 或 IDR 等现有方法需要精确的每像素对象掩码作为监督。同时，神经辐射场已经彻底改变了新的视图合成。然而，NeRF 的估计体积密度不允许精确的表面重建。我们的主要见解是隐式表面模型和辐射场可以以统一的方式制定，从而使用相同的模型实现表面和体积渲染。这种统一的视角实现了新颖、更有效的采样程序，并能够在没有输入掩码的情况下重建准确的表面。我们在 DTU、BlendedMVS 和合成室内数据集上比较我们的方法。我们的实验表明，我们在重建质量方面优于 NeRF，同时在不需要掩码的情况下与 IDR 相当。\n  - [NeuS：通过体渲染学习神经隐式表面以进行多视图重建, NeurIPS2021](https://arxiv.org/abs/2106.10689) | [***``[code]``***](https://github.com/Totoro97/NeuS)\n    > 我们提出了一种新的神经表面重建方法，称为 NeuS，用于从 2D 图像输入中重建具有高保真度的对象和场景。现有的神经表面重建方法，如 DVR 和 IDR，需要前景掩码作为监督，容易陷入局部最小值，因此难以重建具有严重自遮挡或薄结构的物体。同时，最近用于新视图合成的神经方法，例如 NeRF 及其变体，使用体积渲染来生成具有优化鲁棒性的神经场景表示，即使对于高度复杂的对象也是如此。然而，从这种学习到的隐式表示中提取高质量的表面是很困难的，因为表示中没有足够的表面约束。在 NeuS 中，我们建议将表面表示为有符号距离函数 (SDF) 的零级集，并开发一种新的体绘制方法来训练神经 SDF 表示。我们观察到传统的体绘制方法会导致表面重建的固有几何误差（即偏差），因此提出了一种新的公式，该公式在一阶近似中没有偏差，从而即使没有掩模监督也能实现更准确的表面重建.在 DTU 数据集和 BlendedMVS 数据集上的实验表明，NeuS 在高质量表面重建方面优于最先进的技术，特别是对于具有复杂结构和自遮挡的物体和场景。\n  - [神经隐式表面的体积渲染, NeurIPS2021](https://arxiv.org/abs/2106.12052) | [code]\n    > 神经体绘制最近变得越来越流行，因为它成功地从一组稀疏的输入图像中合成了场景的新视图。到目前为止，通过神经体绘制技术学习的几何图形是使用通用密度函数建模的。此外，几何本身是使用密度函数的任意水平集提取的，这会导致嘈杂的、通常是低保真度的重建。本文的目标是改进神经体绘制中的几何表示和重建。我们通过将体积密度建模为几何形状的函数来实现这一点。这与之前将几何建模为体积密度函数的工作形成对比。更详细地说，我们将体积密度函数定义为应用于有符号距离函数 (SDF) 表示的拉普拉斯累积分布函数 (CDF)。这种简单的密度表示具有三个好处：（i）它为在神经体绘制过程中学习的几何图形提供了有用的归纳偏差； (ii) 它有助于限制不透明度近似误差，从而实现对视线的准确采样。准确的采样对于提供几何和辐射的精确耦合很重要； (iii) 
它允许在体积渲染中对形状和外观进行有效的无监督解开。将这种新的密度表示应用于具有挑战性的场景多视图数据集产生了高质量的几何重建，优于相关的基线。此外，由于两者的分离，可以在场景之间切换形状和外观。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/semantic.md",
    "content": "\n每周分类神经辐射场 - semantic ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n=====================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n  - [iLabel：揭示神经领域中的对象, RAL2022](https://ieeexplore.ieee.org/abstract/document/9996585) | [code]\n    > 经过自我监督训练以有效表示 3D 场景的几何形状和颜色的神经场往往会自动将其分解为连贯且准确的类似物体的区域，这些区域可以通过稀疏标记交互来揭示以产生 3D 语义场景分割。 我们的实时 iLabel 系统从手持式 RGB-D 相机获取输入，需要零先验训练数据，并以“开放集”方式工作，语义类别由用户即时定义。 iLabel 的底层模型是一个简单的多层感知器 (MLP)，从头开始训练以学习单个 3D 场景的神经表示。 该模型不断更新并实时可视化，使用户能够专注于交互以实现极其高效的语义分割。 一个房间规模的场景可以准确地标记为 10 多个语义类别，只需大约 100 次点击，耗时不到 5 分钟。 定量标记的准确性随着点击次数的增加而显着增加，并迅速超越标准的预训练语义分割方法。 我们还展示了 iLabel 的分层标签变体和“免提”模式，用户只需为自动生成的位置提供标签名称。\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n  - [NeRF-RPN：NeRF 中对象检测的通用框架](https://arxiv.org/abs/2211.11646) | [code]\n    > 本文介绍了第一个重要的目标检测框架 NeRF-RPN，它直接在 NeRF 上运行。给定预训练的 NeRF 模型，NeRF-RPN 旨在检测场景中对象的所有边界框。通过利用包含多尺度 3D 神经体积特征的新型体素表示，我们证明可以直接回归 NeRF 中对象的 3D 边界框，而无需在任何视点渲染 NeRF。 NeRF-RPN 是一个通用框架，可用于检测没有类标签的对象。我们用各种骨干架构、RPN 头部设计和损失函数对 NeRF-RPN 进行了实验。所有这些都可以以端到端的方式进行训练，以估计高质量的 3D 边界框。为了促进 NeRF 对象检测的未来研究，我们构建了一个新的基准数据集，其中包含经过仔细标记和清理的合成数据和真实数据。请单击此 https URL 以可视化我们的 NeRF-RPN 的 3D 区域提案。代码和数据集将可用。\n  - [SegNeRF：具有神经辐射场的 3D 部分分割](https://arxiv.org/abs/2211.11215) | [code]\n    > 神经辐射场 (NeRF) 的最新进展在生成任务（如新视图合成和 3D 重建）方面表现出色。基于神经辐射场的方法能够通过完全依赖姿势图像隐含地表示 3D 世界。然而，它们很少在 3D 零件分割等判别任务领域进行探索。在这项工作中，我们试图通过提出 SegNeRF 来弥合这一差距：一种将语义场与通常的辐射场集成在一起的神经场表示。 SegNeRF 继承了之前作品执行新视图合成和 3D 重建的能力，并能够从少量图像中进行 
3D 部分分割。我们在 PartNet 上进行的广泛实验表明，SegNeRF 能够同时预测来自摆姿势图像的几何形状、外观和语义信息，即使对于看不见的物体也是如此。预测的语义场允许 SegNeRF 实现 2D 新视图分割的平均 mIoU 为 30.30%，3D 部分分割的平均 mIoU 为 37.46%，与基于点的方法相比，仅使用少量姿势图像具有竞争力的性能。此外，SegNeRF 能够从野外拍摄的物体的单个图像及其相应的部分分割生成显式 3D 模型。\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n  - [CLIP-Fields：机器人记忆的弱监督语义场](https://mahis.life/clip-fields/) | [code]\n    > 我们提出了 CLIP-Fields，这是一种隐式场景模型，可以在没有直接人工监督的情况下进行训练。该模型学习从空间位置到语义嵌入向量的映射。然后，该映射可用于各种任务，例如分割、实例识别、空间语义搜索和视图定位。最重要的是，映射可以通过仅来自网络图像和网络文本训练模型（如 CLIP、Detic 和 Sentence-BERT）的监督进行训练。与 Mask-RCNN 之类的基线相比，我们的方法在 HM3D 数据集上的少量实例识别或语义分割方面表现优于仅一小部分示例。最后，我们展示了使用 CLIP-Fields 作为场景记忆，机器人可以在现实环境中执行语义导航。我们的代码和演示可在此处获得：https://mahis.life/clip-fields/\n## Oct2 - Oct8, 2022\n  - [ViewFool：评估视觉识别对对抗性观点的鲁棒性, NeurIPS2022](https://arxiv.org/abs/2210.03895) | [code]\n    > 最近的研究表明，视觉识别模型对分布变化缺乏鲁棒性。然而，目前的工作主要考虑模型对 2D 图像转换的鲁棒性，而较少探索 3D 世界中的视点变化。一般来说，视点变化在各种实际应用（例如自动驾驶）中很普遍，因此评估视点鲁棒性势在必行。在本文中，我们提出了一种称为 ViewFool 的新方法来寻找误导视觉识别模型的对抗性视点。通过将现实世界中的物体编码为神经辐射场 (NeRF)，ViewFool 在熵正则化器下表征了不同对抗视点的分布，这有助于处理真实相机姿态的波动并减轻真实物体与其神经表示之间的现实差距。实验验证了常见的图像分类器极易受到生成的对抗性视点的影响，这也表现出很高的跨模型可迁移性。基于 ViewFool，我们引入了 ImageNet-V，这是一种新的分布外数据集，用于对图像分类器的视点鲁棒性进行基准测试。对具有不同架构、目标函数和数据增强的 40 个分类器的评估结果显示，在 ImageNet-V 上进行测试时模型性能显着下降，这为利用 ViewFool 作为一种有效的数据增强策略来提高视点鲁棒性提供了可能性。\n  - [用于将图像转换为任意比例的简单插件](https://arxiv.org/abs/2210.03417) | [code]\n    > 现有的超分辨率模型通常专门针对一个尺度，从根本上限制了它们在实际场景中的使用。在本文中，我们的目标是开发一个通用插件，可以插入到现有的超分辨率模型中，方便地增强它们对任意分辨率图像缩放的能力，因此被称为 ARIS。我们做出以下贡献：（i）我们提出了一个基于transformer的插件模块，它使用空间坐标作为查询，通过交叉注意迭代地关注低分辨率图像特征，并为查询的空间位置输出视觉特征，类似于图像的隐式表示； (ii) 我们引入了一种新颖的自我监督训练方案，该方案利用一致性约束来有效地增强模型将图像上采样到看不见的尺度的能力，即不提供真实的高分辨率图像； (iii) 在不失一般性的情况下，我们将提出的 ARIS 插件模块注入到多个现有模型中，即 IPT、SwinIR 和 HAT，表明生成的模型不仅可以在固定比例因子上保持其原始性能，而且可以外推到看不见的模型尺度，在标准基准上大大优于现有的任何尺度超分辨率模型，例如Urban100、DIV2K等\n  - [用于实时、开放集场景理解的特征真实神经融合](https://arxiv.org/abs/2210.03043) | [code]\n    > 
机器人的一般场景理解需要灵活的语义表示，以便可以识别、分割和分组训练时可能不知道的新物体和结构。我们提出了一种算法，该算法在实时 SLAM 期间将来自标准预训练网络的一般学习特征融合到高效的 3D 几何神经场表示中。融合的 3D 特征图继承了神经域几何表示的连贯性。这意味着在运行时交互的少量人类标签使对象甚至对象的一部分能够以开放集的方式稳健而准确地分割。\n  - [神经匹配字段：视觉对应匹配字段的隐式表示, NeurIPS2022](https://arxiv.org/abs/2210.02689) | [***``[code]``***](https://ku-cvlab.github.io/NeMF/)\n    > 现有的语义对应管道通常包括提取高级语义特征以保持对类内变化和背景杂波的不变性。然而，这种架构不可避免地会导致低分辨率匹配字段，该字段还需要临时插值过程作为将其转换为高分辨率的后处理，这肯定会限制匹配结果的整体性能。为了克服这个问题，受隐式神经表示最近成功的启发，我们提出了一种新的语义对应方法，称为神经匹配场 (NeMF)。然而，4D 匹配场的复杂性和高维性是主要障碍，我们提出了一种成本嵌入网络来处理粗略的成本量，以作为通过以下全连接网络建立高精度匹配场的指导。然而，学习高维匹配字段仍然具有挑战性，主要是由于计算复杂性，因为简单的穷举推理需要从 4D 空间中的所有像素中查询以推断像素级对应关系。为了克服这个问题，我们提出了充分的训练和推理程序，在训练阶段，我们随机抽取匹配的候选者，在推理阶段，我们在测试时迭代地执行基于 PatchMatch 的推理和坐标优化。通过这些结合，在语义对应的几个标准基准上获得了具有竞争力的结果。此 https URL 提供了代码和预训练的权重。\n## Sep25 - Oct1, 2022\n  - [了解体素网格 NeRF 模型的纯 CLIP 指导](https://arxiv.org/abs/2209.15172) | [code]\n    > 我们使用 CLIP 探索文本到 3D 对象生成的任务。具体来说，我们在不访问任何数据集的情况下使用 CLIP 进行指导，我们将这种设置称为纯 CLIP 指导。虽然之前的工作采用了这种设置，但没有系统研究防止 CLIP 中产生对抗性生成的机制。我们说明了不同的基于图像的增强如何防止对抗性生成问题，以及生成的结果如何受到影响。我们测试了不同的 CLIP 模型架构，并表明集成不同的模型进行指导可以防止更大模型中的对抗性生成并产生更清晰的结果。此外，我们实现了一个隐式体素网格模型，以展示神经网络如何提供额外的正则化层，从而产生更好的几何结构和生成对象的连贯性。与之前的工作相比，我们以更高的记忆效率和更快的训练速度获得了更连贯的结果。\n  - [具有三层采样和全景表示的城市级增量神经映射](https://arxiv.org/abs/2209.14072) | [code]\n    > 神经隐式表示最近引起了机器人界的广泛关注，因为它们具有表现力、连续性和紧凑性。然而，基于稀疏 LiDAR 输入的城市规模增量隐式密集映射仍然是一个未充分探索的挑战。为此，我们成功构建了第一个具有全景表示的城市规模增量神经映射系统，该系统由环境级和实例级建模组成。给定一个稀疏的 LiDAR 点云流，它维护一个动态生成模型，将 3D 坐标映射到有符号距离场 (SDF) 值。为了解决在城市尺度空间中表示不同层次几何信息的困难，我们提出了一种定制的三层采样策略来动态采样全局、局部和近地表域。同时，为了实现高保真映射，引入了特定类别的先验以更好地对几何细节进行建模，从而实现全景表示。我们评估了公共 SemanticKITTI 数据集，并使用定量和定性结果证明了新提出的三层采样策略和全景表示的重要性。代码和数据将公开。\n  - [360FusionNeRF：具有联合引导的全景神经辐射场](https://arxiv.org/abs/2209.14265) | [code]\n    > 我们提出了一种基于神经辐射场 (NeRF) 从单个 360 度全景图像合成新视图的方法。类似设置中的先前研究依赖于多层感知的邻域插值能力来完成由遮挡引起的缺失区域，这导致其预测中的伪影。我们提出了 360FusionNeRF，这是一个半监督学习框架，我们在其中引入几何监督和语义一致性来指导渐进式训练过程。首先，将输入图像重新投影到 360 度图像，并在其他相机位置提取辅助深度图。除了 NeRF 颜色指导之外，深度监督还改进了合成视图的几何形状。此外，我们引入了语义一致性损失，鼓励对新视图进行逼真的渲染。我们使用预训练的视觉编码器（例如 CLIP）提取这些语义特征，CLIP 
是一种视觉转换器，通过自然语言监督从网络挖掘出的数亿张不同的 2D 照片进行训练。实验表明，我们提出的方法可以在保留场景特征的同时产生未观察到的区域的合理完成。在跨各种场景进行训练时，360FusionNeRF 在转移到合成 Structured3D 数据集（PSNR~5%，SSIM~3% LPIPS~13%）、真实世界的 Matterport3D 数据集（PSNR~3%）时始终保持最先进的性能, SSIM~3% LPIPS~9%) 和 Replica360 数据集 (PSNR~8%, SSIM~2% LPIPS~18%)。\n  - [烘焙特征：通过渲染特征图加速体积分割](https://arxiv.org/abs/2209.12744) | [code]\n    > 最近提出了一些方法，即仅使用彩色图像和专家监督以稀疏语义注释像素的形式将 3D 体积密集分割成类。虽然令人印象深刻，但这些方法仍然需要相对大量的监督，并且在实践中分割对象可能需要几分钟。这样的系统通常只优化它们在它们适合的特定场景上的表示，而不利用来自先前看到的图像的任何先验信息。在本文中，我们建议使用在现有大型数据集上训练的模型提取的特征来提高分割性能。我们通过体积渲染特征图并监督从每个输入图像中提取的特征，将这种特征表示烘焙到神经辐射场 (NeRF) 中。我们表明，通过将这种表示烘焙到 NeRF 中，我们使后续的分类任务变得更加容易。我们的实验表明，与现有方法相比，我们的方法在广泛的场景中以更少的语义注释实现了更高的分割精度。\n## Sep18 - Sep24, 2022\n  - [NeRF-SOS：复杂场景上的任意视图自监督对象分割](https://zhiwenfan.github.io/NeRF-SOS/) | [***``[code]``***](https://github.com/VITA-Group/NeRF-SOS)\n    > 神经体积表示已经显示了多层感知器 (MLP) 可以使用多视图校准图像进行优化以表示场景几何和外观的潜力，而无需明确的 3D 监督。对象分割可以基于学习到的辐射场丰富许多下游应用。然而，引入手工分割来定义复杂现实世界场景中的感兴趣区域并非易事且成本高昂，因为它需要每个视图注释。本文针对复杂的现实世界场景使用 NeRF 进行对象分割的自监督学习探索。我们的框架称为带有自监督对象分割 NeRF-SOS 的 NeRF，它结合了对象分割和神经辐射场来分割场景中任何视图中的对象。通过在外观和几何级别上提出一种新颖的协作对比损失，NeRF-SOS 鼓励 NeRF 模型从其密度场和自我监督的预训练 2D 视觉特征中提取紧凑的几何感知分割簇。自监督对象分割框架可以应用于各种 NeRF 模型，这些模型既可以产生逼真的渲染结果，又可以在室内和室外场景中提供令人信服的分割图。 LLFF、Tank & Temple 和 BlendedMVS 数据集的广泛结果验证了 NeRF-SOS 的有效性。它始终超越其他基于 2D 的自我监督基线，并预测比现有监督对应物更精细的语义掩码。请参阅我们项目页面上的视频以获取更多详细信息：此 https URL。\n  - [医学影像分割的隐式神经表示, MICCAI2022](https://link.springer.com/chapter/10.1007/978-3-031-16443-9_42) | [code]\n    > 医学成像中的 3D 信号（例如 CT 扫描）通常被参数化为体素的离散网格。例如，现有的最先进的器官分割方法学习离散的分割图。不幸的是，这些方法的内存需求随着空间分辨率的增加而呈立方增长，这使得它们不适合处理高分辨率扫描。为了克服这个问题，我们设计了一个隐式器官分割网络 (IOSNet)，它利用连续的隐式神经表示并具有几个有用的属性。首先，IOSNet 解码器内存大致恒定且独立于空间分辨率，因为它将分割图参数化为连续函数。其次，IOSNet 的收敛速度比基于离散体素的方法快得多，因为它能够准确地分割器官而不受器官大小的影响，从而在不需要任何辅助技巧的情况下缓解大小不平衡问题。第三，由于其连续学习表示，IOSNet 自然支持超分辨率（即在推理过程中以任意分辨率采样）。此外，尽管使用了一个简单的轻量级解码器，IOSNet 始终优于离散专业分割架构 UNet。因此，我们的方法表明隐式神经表示非常适合医学成像应用，尤其是处理高分辨率 3D 医学扫描。\n## Sep11 - Sep17, 2022\n## Sep4 - Sep10, 2022\n  - [神经特征融合领域：自监督 2D 图像表示的 3D 蒸馏, 
3DV2022(oral)](https://arxiv.org/abs/2209.03494) | [***``[code]``***](https://github.com/dichotomies/N3F)\n    > 我们提出了神经特征融合场 (N3F)，这是一种在将密集 2D 图像特征提取器应用于可重构为 3D 场景的多张图像分析时改进密集 2D 图像特征提取器的方法。给定一个图像特征提取器，例如使用自我监督进行预训练，N3F 使用它作为教师来学习在 3D 空间中定义的学生网络。 3D 学生网络类似于提取所述特征的神经辐射场，并且可以使用通常的可微渲染机器进行训练。因此，N3F 很容易适用于大多数神经渲染公式，包括 vanilla NeRF 及其对复杂动态场景的扩展。我们表明，我们的方法不仅能够在不使用手动标签的情况下在特定场景的神经领域的上下文中实现语义理解，而且在自我监督的 2D 基线上持续改进。这通过考虑不同序列中的各种任务（例如 2D 对象检索、3D 分割和场景编辑）来证明，包括 EPIC-KITCHENS 基准测试中的以自我为中心的长视频。\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n  - [DreamBooth：为主题驱动生成微调文本到图像的扩散模型](https://dreambooth.github.io/) | [code]\n    > 大型文本到图像模型在人工智能的演进中实现了显着的飞跃，能够从给定的文本提示中对图像进行高质量和多样化的合成。然而，这些模型缺乏模仿给定参考集中对象的外观并在不同上下文中合成它们的新颖再现的能力。在这项工作中，我们提出了一种“个性化”文本到图像扩散模型的新方法（专门针对用户的需求）。给定主题的几张图像作为输入，我们微调预训练的文本到图像模型（Imagen，尽管我们的方法不限于特定模型），以便它学会将唯一标识符与该特定主题绑定.一旦对象被嵌入模型的输出域中，唯一标识符就可以用于合成在不同场景中情境化的对象的完全新颖的真实感图像。通过利用嵌入在模型中的语义先验和新的自生类特定先验保存损失，我们的技术能够在参考图像中没有出现的不同场景、姿势、视图和照明条件下合成主体。我们将我们的技术应用于几个以前无懈可击的任务，包括主题重新上下文化、文本引导视图合成、外观修改和艺术渲染（同时保留主题的关键特征）。项目页面：此 https 网址\n## Aug14 - Aug20, 2022\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n  - [NeSF: 用于 3D 场景的可概括语义分割的神经语义场](https://research.google/pubs/pub51563/) | [code]\n    > 我们提出了 NeSF，一种从预训练的密度场和稀疏的 2D 语义监督产生 3D 语义场的方法。我们的方法通过利用将 3D 信息存储在神经域中的神经表示来避开传统的场景表示。尽管仅由 2D 信号监督，我们的方法能够从新颖的相机姿势生成 3D 一致的语义图，并且可以在任意 3D 点进行查询。值得注意的是，NeSF 与任何产生密度场的方法兼容，并且随着预训练密度场质量的提高，其准确性也会提高。我们的实证分析证明了在令人信服的合成场景上与竞争性 2D 和 3D 语义分割基线相当的质量，同时还提供了现有方法无法提供的功能。\n## Jul24 - Jul30, 2022\n## Previous weeks\n  - [节食 NeRF：语义一致的 Few-Shot 视图合成, ICCV2021](https://www.ajayj.com/dietnerf) | [***``[code]``***](https://github.com/ajayjain/DietNeRF)\n    > 我们提出了 DietNeRF，一种从几张图像估计的 3D 神经场景表示。神经辐射场 (NeRF) 通过多视图一致性学习场景的连续体积表示，并且可以通过光线投射从新颖的视点进行渲染。虽然 NeRF 在给定许多图像的情况下具有令人印象深刻的重建几何和精细细节的能力，对于具有挑战性的 360° 场景最多可重建 100 个，但当只有少数输入视图可用时，它通常会为其图像重建目标找到退化的解决方案。为了提高few-shot质量，我们提出了DietNeRF。我们引入了一种辅助语义一致性损失，它鼓励以新颖的姿势进行逼真的渲染。 DietNeRF 在单个场景上进行训练，以 (1) 从相同的姿势正确渲染给定的输入视图，以及 (2) 在不同的随机姿势中匹配高级语义属性。我们的语义损失使我们能够从任意姿势监督 
DietNeRF。我们使用预训练的视觉编码器提取这些语义，例如 CLIP，这是一种视觉转换器，通过自然语言监督从网络挖掘出的数亿张不同的单视图 2D 照片进行训练。在实验中，DietNeRF 在从头开始学习时提高了少镜头视图合成的感知质量，在多视图数据集上进行预训练时，可以用少至一张观察到的图像渲染新视图，并生成完全未观察到的区域的合理完成。\n  - [物体辐射场的无监督发现, ICLR2022](https://arxiv.org/abs/2107.07905) | [code]\n    > 我们研究从单个图像推断以对象为中心的场景表示的问题，旨在推导出解释图像形成过程的表示，捕捉场景的 3D 性质，并且在没有监督的情况下学习。由于将复杂的 3D 到 2D 图像形成过程集成到强大的推理方案（如深度网络）中存在根本性挑战，大多数现有的场景分解方法都缺乏这些特征中的一个或多个。在本文中，我们提出了对象辐射场 (uORF) 的无监督发现，将神经 3D 场景表示和渲染的最新进展与深度推理网络相结合，用于无监督 3D 场景分解。在没有注释的多视图 RGB 图像上进行训练，uORF 学习从单个图像分解具有不同纹理背景的复杂场景。我们展示了 uORF 在无监督 3D 场景分割、新视图合成和三个数据集上的场景编辑方面表现良好。\n  - [使用隐式场景表示进行就地场景标记和理解, ICCV2021(oral)](https://shuaifengzhi.com/Semantic-NeRF/) | [***``[code]``***](https://github.com/Harry-Zhi/semantic_nerf/)\n    > 语义标签与几何和辐射重建高度相关，因为具有相似形状和外观的场景实体更有可能来自相似的类别。最近的隐式神经重建技术很有吸引力，因为它们不需要事先的训练数据，但同样的完全自我监督的方法对于语义来说是不可能的，因为标签是人类定义的属性。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/texture.md",
    "content": "\n每周分类神经辐射场 - texture ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n====================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n## Nov20 - Nov26, 2022\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n  - [深度外观预过滤, ToG2022](https://dl.acm.org/doi/abs/10.1145/3570327) | [code]\n    > 复杂场景的基于物理的渲染可能成本高得令人望而却步，并且复杂性在渲染图像上的分布可能是无限且不均匀的。理想的细节层次 (LoD) 方法的目标是使渲染成本独立于 3D 场景的复杂性，同时保持场景的外观。然而，由于依赖近似模型和其他启发式方法，当前的预过滤 LoD 方法在它们可以支持的外观方面受到限制。我们提出了第一个全面的多尺度 LoD 框架，用于预过滤具有复杂几何形状和材料（例如 Disney BRDF）的 3D 环境，同时保持与光线追踪参考相关的外观。使用场景的多尺度层次结构，我们执行数据驱动的预过滤步骤以获得每个尺度的外观相位函数和方向覆盖掩码。我们方法的核心是一种新颖的神经表示，它将这些信息编码成一种紧凑的潜在形式，这种形式很容易在基于物理的渲染器中解码。一旦场景被烘焙出来，我们的方法在渲染时不需要原始几何体、材质或纹理。我们证明我们的方法与最先进的预过滤方法相比具有优势，并且可以为复杂场景节省大量内存。\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n  - [IBL-NeRF：基于图像的神经辐射场照明公式](https://arxiv.org/abs/2210.08202) | [code]\n    > 我们提出了 IBL-NeRF，它将大规模室内场景的神经辐射场 (NeRF) 分解为内在成分。以前的 NeRF 逆向渲染方法转换隐式体积以适应显式几何的渲染管道，并使用环境照明近似分割、孤立对象的视图。相比之下，我们的逆渲染扩展了原始的 NeRF 公式，以捕捉场景体积内照明的空间变化，以及表面属性。具体来说，将不同材质的场景分解为基于图像的渲染的内在组件，即反照率、粗糙度、表面法线、辐照度和预过滤辐射度。所有组件都被推断为来自 MLP 的神经图像，可以对大规模的一般场景进行建模。通过采用基于图像的 NeRF 公式，我们的方法继承了合成图像的卓越视觉质量和多视图一致性。我们展示了在具有复杂对象布局和灯光配置的场景上的性能，这些在以前的任何作品中都无法处理。\n  - [NeuralRoom：用于室内场景重建的几何约束神经隐式表面](https://arxiv.org/abs/2210.06853) | [code]\n    > 我们提出了一种称为 NeuralRoom 的新型神经表面重建方法，用于直接从一组 2D 
图像重建房间大小的室内场景。最近，由于其高质量的结果和简单性，隐式神经表示已成为从多视图图像重建表面的有前途的方法。然而，隐式神经表示通常不能很好地重建室内场景，因为它们存在严重的形状-辐射度模糊性。我们假设室内场景由纹理丰富和平坦的无纹理区域组成。在纹理丰富的区域，多视图立体可以获得准确的结果。在平坦区域，正态估计网络通常能获得较好的正态估计。基于上述观察，我们通过可靠的几何先验来减少隐式神经表面可能的空间变化范围，以减轻形状-辐射度的模糊性。具体来说，我们使用多视图立体结果来限制 NeuralRoom 优化空间，然后使用可靠的几何先验来指导 NeuralRoom 训练。然后，NeuralRoom 将生成一个神经场景表示，该表示可以渲染与输入训练图像一致的图像。此外，我们提出了一种称为扰动残差限制的平滑方法来提高平坦区域的准确性和完整性，该方法假设局部表面中的采样点应该与观测中心具有相同的法线和相似的距离。在 ScanNet 数据集上的实验表明，我们的方法可以重建室内场景的无纹理区域，同时保持细节的准确性。我们还将 NeuralRoom 应用于更高级的多视图重建算法，并显着提高了它们的重建质量。\n## Oct2 - Oct8, 2022\n## Sep25 - Oct1, 2022\n## Sep18 - Sep24, 2022\n  - [SG-SRNs：超像素引导的场景表示网络, SignalProcessingLetters](https://ieeexplore.ieee.org/abstract/document/9900405) | [code]\n    > 最近，场景表示网络（SRNs）由于其连续且轻量级的场景表示能力，在计算机视觉领域引起了越来越多的关注。然而，SRN 通常在低纹理图像区域上表现不佳。为了解决这个问题，我们在本文中提出了超像素引导的场景表示网络，称为 SG-SRN，由主干模块 (SRN)、超像素分割模块和超像素正则化模块组成。在所提出的方法中，除了新颖的视图合成任务外，表示感知的超像素分割掩码生成任务由所提出的超像素分割模块实现。然后，超像素正则化模块利用超像素分割掩码以局部平滑的方式引导要学习的主干，并优化局部区域的场景表示，以自监督的方式间接缓解低纹理区域的结构失真.在我们构建的数据集和公共 Synthetic-NeRF 数据集上的广泛实验结果表明，所提出的 SG-SRN 实现了显着更好的 3D 结构表示性能。\n  - [通过神经动画网格进行人体性能建模和渲染](https://arxiv.org/abs/2209.08468) | [code]\n    > 我们最近看到了照片真实人体建模和渲染的神经进步的巨大进步。但是，将它们集成到现有的基于网格的管道中以用于下游应用程序仍然具有挑战性。在本文中，我们提出了一种综合神经方法，用于从密集的多视图视频中对人类表演进行高质量的重建、压缩和渲染。我们的核心直觉是将传统的动画网格工作流程与新型高效神经技术联系起来。我们首先介绍了一种用于在几分钟内生成高质量表面的神经表面重建器。它将截断有符号距离场 (TSDF) 的隐式体积渲染与多分辨率哈希编码结合在一起。我们进一步提出了一种混合神经跟踪器来生成动画网格，它将显式非刚性跟踪与自监督框架中的隐式动态变形相结合。前者将粗略的变形提供回规范空间，而后者隐含的进一步使用我们的重构器中的 4D 哈希编码来预测位移。然后，我们讨论使用获得的动画网格的渲染方案，范围从动态纹理到各种带宽设置下的流明图渲染。为了在质量和带宽之间取得复杂的平衡，我们提出了一种分层解决方案，首先渲染覆盖表演者的 6 个虚拟视图，然后进行遮挡感知神经纹理混合。我们展示了我们的方法在各种基于网格的应用程序和各种平台上逼真的自由视图体验中的有效性，即通过移动 AR 将虚拟人类表演插入真实环境或使用 VR 耳机沉浸式观看才艺表演。\n## Sep11 - Sep17, 2022\n  - [StructNeRF：具有结构提示的室内场景的神经辐射场](https://arxiv.org/abs/2209.05277) | [code]\n    > 神经辐射场 (NeRF) 使用密集捕获的输入图像实现照片般逼真的视图合成。然而，在给定稀疏视图的情况下，NeRF 的几何形状受到极大限制，导致新视图合成质量显着下降。受自监督深度估计方法的启发，我们提出了 StructNeRF，这是一种针对具有稀疏输入的室内场景的新颖视图合成的解决方案。 StructNeRF 利用自然嵌入在多视图输入中的结构提示来处理 NeRF 
中的无约束几何问题。具体来说，它分别处理纹理和非纹理区域：提出了一种基于块的多视图一致光度损失来约束纹理区域的几何形状；对于非纹理平面，我们明确将它们限制为 3D 一致平面。通过密集的自监督深度约束，我们的方法提高了 NeRF 的几何和视图合成性能，而无需对外部数据进行任何额外的训练。对几个真实世界数据集的广泛实验表明，StructNeRF 在数量和质量上都超过了用于室内场景的最先进的方法。\n## Sep4 - Sep10, 2022\n  - [具有学习几何先验的 3D 纹理形状恢复](https://arxiv.org/abs/2209.03254) | [code]\n    > 从部分扫描中恢复 3D 纹理形状对于许多实际应用至关重要。现有方法已经证明了隐式函数表示的有效性，但它们存在严重遮挡和不同对象类型的部分输入，这极大地阻碍了它们在现实世界中的应用价值。本技术报告介绍了我们通过结合学习几何先验来解决这些限制的方法。为此，我们从学习的姿势预测中生成一个 SMPL 模型，并将其融合到部分输入中，以添加人体的先验知识。我们还提出了一种新颖的完整性感知边界框自适应，用于处理不同级别的尺度和部分扫描的局部性。\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n## Aug7 - Aug13, 2022\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n  - [ShAPO：多对象形状、外观和姿势优化的隐式表示, ECCV2022](https://arxiv.org/abs/2207.13691) | [***``[code]``***](https://zubair-irshad.github.io/projects/ShAPO.html)\n    > 我们的方法从单个 RGB-D 观察中研究以对象为中心的 3D 理解的复杂任务。由于这是一个不适定问题，现有方法在具有遮挡的复杂多对象场景中的 3D 形状和 6D 姿势和尺寸估计性能低下。我们提出了 ShaAPO，一种用于联合多对象检测、3D 纹理重建、6D 对象姿态和大小估计的方法。 ShAPO 的关键是一个单次管道，用于回归形状、外观和姿势潜在代码以及每个对象实例的掩码，然后以稀疏到密集的方式进一步细化。首先学习了一种新的解开的先验形状和外观数据库，以将对象嵌入到它们各自的形状和外观空间中。我们还提出了一种新颖的、基于八叉树的可微优化步骤，使我们能够以综合分析的方式在学习的潜在空间下同时进一步改进对象形状、姿势和外观。我们新颖的联合隐式纹理对象表示使我们能够准确地识别和重建新的看不见的对象，而无需访问它们的 3D 网格。通过广泛的实验，我们证明了我们的方法在模拟室内场景上进行训练，能够以最少的微调准确地回归现实世界中新物体的形状、外观和姿势。我们的方法显着优于 NOCS 数据集上的所有基线，6D 姿态估计的 mAP 绝对提高了 8%。\n  - [NeuMesh：学习基于解缠结神经网格的隐式场，用于几何和纹理编辑, ECCV2022(oral)](https://arxiv.org/abs/2207.11911) | [code]\n    > 最近，神经隐式渲染技术得到了迅速发展，并在新颖的视图合成和 3D 场景重建中显示出巨大的优势。然而，现有的用于编辑目的的神经渲染方法提供的功能有限，例如，刚性变换，或者不适用于日常生活中一般对象的细粒度编辑。在本文中，我们提出了一种新颖的基于网格的表示，通过在网格顶点上使用解开几何和纹理代码对神经隐场进行编码，这促进了一组编辑功能，包括网格引导的几何编辑、带有纹理交换的指定纹理编辑、填充和绘画操作。为此，我们开发了几种技术包括可学习的符号指标以放大基于网格的表示的空间可区分性，蒸馏和微调机制以实现稳定收敛，以及空间感知优化策略以实现精确的纹理编辑。对真实数据和合成数据的大量实验和编辑示例证明了我们的方法在表示质量和编辑能力方面的优越性。代码可在项目网页上找到：此 https URL。\n## Previous weeks\n  - [CodeNeRF：对象类别的解开神经辐射场, ICCV2021(oral)](https://www.google.com/url?q=https%3A%2F%2Farxiv.org%2Fpdf%2F2109.01750.pdf&sa=D&sntz=1&usg=AOvVaw1Fnir0e4aRa22Nt0HoXDWh) | 
[***``[code]``***](https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fwbjang%2Fcode-nerf&sa=D&sntz=1&usg=AOvVaw2eD5ZoRbk2aWFuwUSHlh5_)\n    > CodeNeRF 是一种隐式 3D 神经表示，它学习对象形状和纹理在一个类别中的变化，并且可以从一组姿势图像中进行训练，以合成看不见的对象的新视图。与特定场景的原始 NeRF 不同，CodeNeRF 通过学习单独的嵌入来学习解开形状和纹理。在测试时，给定一个看不见的物体的单个未定位图像，CodeNeRF 通过优化联合估计相机视点、形状和外观代码。看不见的物体可以从单个图像中重建，然后从新的视点渲染，或者通过改变潜在代码编辑它们的形状和纹理。我们在 SRN 基准上进行了实验，结果表明 CodeNeRF 可以很好地泛化到看不见的对象，并且在测试时需要已知相机姿态的方法达到同等性能。我们在真实世界图像上的结果表明，CodeNeRF 可以弥合模拟到真实的差距。\n  - [物体辐射场的无监督发现, ICLR2022](https://arxiv.org/abs/2107.07905) | [code]\n    > 我们研究从单个图像推断以对象为中心的场景表示的问题，旨在推导出解释图像形成过程的表示，捕捉场景的 3D 性质，并且在没有监督的情况下学习。由于将复杂的 3D 到 2D 图像形成过程集成到强大的推理方案（如深度网络）中存在根本性挑战，大多数现有的场景分解方法都缺乏这些特征中的一个或多个。在本文中，我们提出了对象辐射场 (uORF) 的无监督发现，将神经 3D 场景表示和渲染的最新进展与深度推理网络相结合，用于无监督 3D 场景分解。在没有注释的多视图 RGB 图像上进行训练，uORF 学习从单个图像分解具有不同纹理背景的复杂场景。我们展示了 uORF 在无监督 3D 场景分割、新视图合成和三个数据集上的场景编辑方面表现良好。\n  - [NeRF-Tex：神经反射场纹理, EGSR2021](https://developer.nvidia.com/blog/nvidia-research-nerf-tex-neural-reflectance-field-textures/) | [***``[code]``***](https://github.com/hbaatz/nerf-tex)\n    > 我们研究使用神经场来模拟不同的中尺度结构，例如毛皮、织物和草。我们建议使用由神经反射场 (NeRF-Tex) 表示的多功能体积基元，而不是使用经典的图形基元来建模结构，它联合建模材料的几何形状及其对照明的响应。 NeRF-Tex 原语可以在基础网格上实例化，以使用所需的细观和微尺度外观对其进行“纹理化”。我们根据控制外观的用户定义参数来调节反射率场。因此，单个 NeRF 纹理捕获了反射场的整个空间，而不是一个特定的结构。这增加了可以建模的外观范围，并提供了一种解决重复纹理伪影的解决方案。我们还证明了 NeRF 纹理自然地促进了连续的细节层次渲染。我们的方法将神经网络的多功能性和建模能力与虚拟场景精确建模所需的艺术控制相结合。虽然我们所有的训练数据目前都是合成的，但我们的工作提供了一个方法，可以进一步扩展以从真实图像中提取复杂、难以建模的外观。\n"
  },
  {
    "path": "docs/classified_weekly_nerf_cn/video.md",
    "content": "\n每周分类神经辐射场 - video ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n==================================================================================================================================\n## 按类别筛选: \n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | [光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \n## Dec27 - Jan3, 2023\n  - [使用基于体素的轨迹感知预训练增强无人机跟踪, RAL2022](https://ieeexplore.ieee.org/abstract/document/10015867) | [code]\n    > 基于 Siamese 网络的目标跟踪显着提升了高度机动无人机 (UAV) 的自动化能力。 然而，前沿的跟踪框架往往依赖于模板匹配，这使得它在面对连续帧中的多个对象视图时陷入困境。 此外，一般的图像级预训练主干可能会过度适应整体表示，导致在无人机跟踪中学习对象级属性时出现错位。 为了解决这些问题，这项工作提出了 TRTrack，这是一个全面的框架，可以充分利用无人机跟踪的立体表示。 具体来说，提出了一种新的预训练范式方法。 通过轨迹感知重建训练（TRT），在不增加任何参数的情况下，增强了主干提取立体结构特征的能力。 因此，提出了一种创新的分层自注意力 Transformer 来捕获局部细节信息和全局结构知识。 为了优化相关图，我们提出了一种新的空间相关细化（SCR）模块，它提高了对远程空间依赖性进行建模的能力。 三个具有挑战性的无人机基准测试的综合实验表明，所提出的 TRTrack 在精度和效率方面都实现了卓越的无人机跟踪性能。 现实环境中的定量测试充分证明了我们工作的有效性。\n## Dec25 - Dec31, 2022\n## Dec18 - Dec24, 2022\n## Dec11 - Dec17, 2022\n## Dec4 - Dec10, 2022\n## Nov27 - Dec3, 2022\n  - [QuadStream：一种用于新视点重建的基于 Quad 的场景流架构, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555524) | [code]\n    > 通过网络将渲染的 3D 内容流式传输到手机或 VR/AR 耳机等瘦客户端设备，将高保真图形带到通常由于热量、功率或成本限制而无法实现的平台。 流式 3D 内容必须以对延迟和潜在网络丢失都具有鲁棒性的表示形式进行传输。 在存在遮挡事件的情况下，传输视频流并重新投影以纠正不断变化的视点失败； 在功率有限的移动 GPU 上无法在客户端流式传输场景几何体和执行高质量渲染。 为了平衡消除遮挡稳健性和最小客户端工作量这两个相互竞争的目标，我们引入了 QuadStream，这是一种新的流媒体内容表示，它通过允许客户端有效地渲染新颖的视图而没有由消除遮挡事件引起的伪影来减少运动到光子的延迟。 受视频编解码器设计的传统宏块方法的启发，我们将从视图单元中的位置看到的场景分解为一系列四边形代理，或来自多个视图的视图对齐四边形。 通过在光栅化 G-Buffer 上操作，我们的方法独立于场景本身的表示； 生成的 QuadStream 是场景的近似几何表示，可以由瘦客户端重建以呈现当前视图和附近的相邻视图。 我们的技术贡献是一种有效的并行四边形生成、合并和打包策略，用于覆盖场景中潜在客户移动的代理视图； 一种打包和编码策略，允许将具有深度信息的掩码四边形作为帧相干流传输； 以及一种高效的渲染方法，用于将我们的 QuadStream 表示渲染为瘦客户端上的全新视图。 
我们表明，与视频数据流方法和基于几何的流媒体相比，我们的方法实现了卓越的质量。\n## Nov20 - Nov26, 2022\n## Nov13 - Nov19, 2022\n## Nov6 - Nov12, 2022\n## Oct30 - Nov5, 2022\n## Oct23 - Oct29, 2022\n## Oct16 - Oct22, 2022\n## Oct9 - Oct15, 2022\n## Oct2 - Oct8, 2022\n## Sep25 - Oct1, 2022\n  - [MonoNeuralFusion：具有几何先验的在线单目神经 3D 重建](https://arxiv.org/abs/2209.15153) | [code]\n    > 从单目视频重建高保真 3D 场景仍然具有挑战性，特别是对于完整和细粒度的几何重建。先前具有神经隐式表示的 3D 重建方法已显示出完整场景重建的有希望的能力，但它们的结果通常过于平滑且缺乏足够的几何细节。本文介绍了一种新颖的神经隐式场景表示法，用于从单目视频中进行高保真在线 3D 场景重建的体积渲染。对于细粒度重建，我们的关键见解是将几何先验纳入神经隐式场景表示和神经体绘制，从而产生基于体绘制优化的有效几何学习机制。受益于此，我们提出了 MonoNeuralFusion 来从单目视频执行在线神经 3D 重建，从而在动态 3D 单目扫描期间有效地生成和优化 3D 场景几何图形。与最先进方法的广泛比较表明，我们的 MonoNeuralFusion 在数量和质量上始终生成更好的完整和细粒度的重建结果。\n## Sep18 - Sep24, 2022\n## Sep11 - Sep17, 2022\n## Sep4 - Sep10, 2022\n## Aug28 - Sep3, 2022\n## Aug21 - Aug27, 2022\n## Aug14 - Aug20, 2022\n  - [通过多平面图像的 3D 对象运动估计动态场景的时间视图合成, ISMAR2022](https://arxiv.org/abs/2208.09463) | [***``[code]``***](https://github.com/NagabhushanSN95/DeCOMPnet)\n    > 在低计算设备上以图形方式渲染高帧率视频的挑战可以通过对未来帧的定期预测来解决，以增强虚拟现实应用程序中的用户体验。这是通过时间视图合成 (TVS) 的问题来研究的，其目标是在给定前一帧以及前一帧和下一帧的头部姿势的情况下预测视频的下一帧。在这项工作中，我们考虑了用户和对象都在移动的动态场景的 TVS。我们设计了一个框架，将运动解耦为用户和对象运动，以在预测下一帧的同时有效地使用可用的用户运动。我们通过隔离和估计过去帧中的 3D 对象运动然后外推来预测对象的运动。我们使用多平面图像 (MPI) 作为场景的 3D 表示，并将对象运动建模为 MPI 表示中对应点之间的 3D 位移。为了在估计运动时处理 MPI 中的稀疏性，我们结合了部分卷积和掩蔽相关层来估计对应点。然后将预测的对象运动与给定的用户或相机运动集成以生成下一帧。使用遮蔽填充模块，我们合成由于相机和物体运动而未覆盖的区域。我们为包含 800 个全高清分辨率视频的动态场景 TVS 开发了一个新的合成数据集。我们通过对我们的数据集和 MPI Sintel 数据集的实验表明，我们的模型优于文献中的所有竞争方法。\n  - [从单目视频中对动画 3D 人体进行神经捕获, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > 我们提出了一种从单目视频输入构建可动画 3D 人体表示的新颖范例，这样它就可以以任何看不见的姿势和视图进行渲染。我们的方法基于动态神经辐射场 (NeRF)，该动态神经辐射场 (NeRF) 由作为几何代理的基于网格的参数化 3D 人体模型装配。以前的方法通常依赖多视图视频或准确的 3D 几何信息作为附加输入；此外，大多数方法在推广到看不见的姿势时质量会下降。我们认为，泛化的关键是用于查询动态 NeRF 的良好输入嵌入：良好的输入嵌入应该定义全体积空间中的单射映射，由姿态变化下的表面网格变形引导。基于这一观察，我们建议嵌入输入查询及其与网格顶点上一组测地最近邻所跨越的局部表面区域的关系。通过包含位置和相对距离信息，我们的嵌入定义了距离保留的变形映射，并很好地推广到看不见的姿势。为了减少对额外输入的依赖，我们首先使用现成的工具初始化每帧 3D 网格，然后提出一个管道来联合优化 NeRF 
并细化初始网格。大量实验表明，我们的方法可以在看不见的姿势和视图下合成合理的人类渲染结果。\n  - [从全向图像中捕捉休闲室内 HDR 辐射](https://arxiv.org/abs/2208.07903) | [code]\n    > 我们提出了 PanoHDR-NeRF，这是一种新颖的管道，可以随意捕获大型室内场景的合理全 HDR 辐射场，而无需精心设置或复杂的捕获协议。首先，用户通过在场景周围自由挥动现成的相机来捕捉场景的低动态范围 (LDR) 全向视频。 然后，LDR2HDR 网络将捕获的 LDR 帧提升为 HDR，随后用于训练定制的 NeRF++ 模型。 由此产生的 PanoHDR-NeRF 管道可以从场景的任何位置估计完整的 HDR 全景图。 通过对各种真实场景的新测试数据集进行实验，在训练期间未看到的位置捕获地面实况 HDR 辐射，我们表明 PanoHDR-NeRF 可以预测来自任何场景点的合理辐射。我们还表明，由 PanoHDR-NeRF 生成的 HDR 图像可以合成正确的照明效果，从而能够使用正确照明的合成对象来增强室内场景。\n## Aug7 - Aug13, 2022\n  - [PS-NeRV：视频的补丁风格化神经表示](https://arxiv.org/abs/2208.03742) | [code]\n    > 我们研究如何使用隐式神经表示 (INR) 来表示视频。经典的 INR 方法通常利用 MLP 将输入坐标映射到输出像素。虽然最近的一些作品试图用 CNN 直接重建整个图像。然而，我们认为上述像素级和图像级策略都不利于视频数据。相反，我们提出了一种补丁解决方案 PS-NeRV，它将视频表示为补丁和相应补丁坐标的函数。它自然继承了image-wise方法的优点，并以快速的解码速度实现了出色的重建性能。整个方法包括传统的模块，如位置嵌入、MLPs 和 CNNs，同时还引入了 AdaIN 来增强中间特征。这些简单而重要的变化可以帮助网络轻松适应高频细节。大量实验证明了它在视频压缩和视频修复等视频相关任务中的有效性。\n## Jul31 - Aug6, 2022\n## Jul24 - Jul30, 2022\n## Previous weeks\n  - [﻿Plenoxels：没有神经网络的辐射场, CVPR2022(oral)](https://arxiv.org/abs/2112.05131) | [***``[code]``***](https://alexyu.net/plenoxels)\n    > 我们介绍了 Plenoxels（全光体素），一种用于照片级真实视图合成的系统。 Plenoxels 将场景表示为具有球谐函数的稀疏 3D 网格。这种表示可以通过梯度方法和正则化从校准图像中优化，而无需任何神经组件。在标准的基准任务中，Plenoxels 的优化速度比神经辐射场快两个数量级，而视觉质量没有损失。\n  - [用于动态场景时空视图合成的神经场景流场, CVPR2021](http://www.cs.cornell.edu/~zl548/NSFF/) | [***``[code]``***](https://github.com/zhengqili/Neural-Scene-Flow-Fields)\n    > 我们提出了一种方法来执行动态场景的新颖视图和时间合成，只需要具有已知相机姿势的单目视频作为输入。为此，我们引入了神经场景流场，这是一种将动态场景建模为外观、几何和 3D 场景运动的时变连续函数的新表示。我们的表示通过神经网络进行优化，以适应观察到的输入视图。我们表明，我们的表示可用于复杂的动态场景，包括薄结构、视图相关效果和自然运动度。我们进行了许多实验，证明我们的方法明显优于最近的单目视图合成方法，并展示了各种真实世界视频的时空视图合成的定性结果。\n  - [来自多视图视频的神经 3D 视频合成, CVPR2022(oral)](https://neural-3d-video.github.io/) | [code]\n    > 我们提出了一种新颖的 3D 视频合成方法，能够以紧凑但富有表现力的表示形式表示动态真实世界场景的多视图视频记录，从而实现高质量的视图合成和运动插值。我们的方法将静态神经辐射场的高质量和紧凑性带到了一个新的方向：无模型的动态设置。我们方法的核心是一种新颖的时间条件神经辐射场，它使用一组紧凑的潜在代码来表示场景动态。为了利用视频相邻帧之间的变化通常很小且局部一致的事实，我们提出了两种有效训练神经网络的新策略：1）有效的分层训练方案，以及 
2）选择根据输入视频的时间变化进行训练的下一条光线。结合起来，这两种策略显着提高了训练速度，导致训练过程快速收敛，并获得高质量的结果。我们学习的表示非常紧凑，能够表示由 18 个摄像机录制的 10 秒 30 FPS 多视图视频，模型大小仅为 28MB。我们证明了我们的方法可以以超过 1K 的分辨率渲染高保真广角新颖视图，即使对于高度复杂和动态的场景也是如此。我们进行了广泛的定性和定量评估，表明我们的方法优于当前的技术水平。项目网站：https://neural-3d-video.github.io。\n  - [动态单目视频的动态视图合成, ICCV2021](https://free-view-video.github.io/) | [***``[code]``***](https://github.com/gaochen315/DynamicNeRF)\n    > 我们提出了一种算法，用于在给定动态场景的单目视频的任意视点和任何输入时间步长处生成新视图。我们的工作建立在神经隐式表示的最新进展的基础上，并使用连续和可微的函数来建模时变结构和场景的外观。我们联合训练一个时不变的静态 NeRF 和一个时变的动态 NeRF，并学习如何以无监督的方式混合结果。然而，从单个视频中学习这个隐式函数是非常不适定的（与输入视频匹配的解决方案有无限多）。为了解决歧义，我们引入了正则化损失以鼓励更合理的解决方案。我们展示了从随意捕获的视频中进行动态视图合成的广泛定量和定性结果。\n  - [使用分层神经表示的可编辑自由视点视频, SIGGRAPH2021](https://jiakai-zhang.github.io/st-nerf/) | [***``[code]``***](https://jiakai-zhang.github.io/st-nerf/#code)\n    > 生成自由视点视频对于沉浸式 VR/AR 体验至关重要，但最近的神经学进展仍然缺乏编辑能力来操纵大型动态场景的视觉感知。为了填补这一空白，在本文中，我们提出了第一种仅使用稀疏的 16 个摄像头为大规模动态场景生成可编辑照片般逼真的自由视点视频的方法。我们方法的核心是一种新的分层神经表示，其中包括环境本身的每个动态实体都被制定为称为 ST-NeRF 的时空相干神经分层辐射表示。这种分层表示支持对动态场景的完全感知和真实操作，同时仍支持大范围的自由观看体验。在我们的 ST-NeRF 中，动态实体/层被表示为连续函数，以连续和自监督的方式实现动态实体的位置、变形以及外观的解耦。我们提出了一个场景解析 4D 标签映射跟踪来显式地解开空间信息，以及一个连续变形模块来隐式地解开时间运动。进一步引入了一种对象感知体绘制方案，用于重新组装所有神经层。我们采用了一种新颖的分层损失和运动感知光线采样策略，以实现对具有多个表演者的大型动态场景的有效训练，我们的框架进一步实现了各种编辑功能，即操纵规模和位置，复制或重新定时单个神经层在保持高度真实感的同时创造众多视觉效果。大量实验证明了我们的方法在为动态场景生成高质量、照片般逼真和可编辑的自由视点视频方面的有效性。\n"
  },
  {
    "path": "docs/contribute_weekly_nerf.md",
    "content": "# How to contribute to weekly classified NeRF\n\n1. Add or revise items in the meta data: [weekly_nerf_meta_data.xlsx](./weekly_nerf_meta_data.xlsx).\n\n2. Run the following commands to render views:\n\n```bash\npip install pydoc mdutils\npython docs/render_docs_from_csv.py\n```\n\n3. Create a pull request and briefly describe what you have done. Thanks!\n\n4. Indicate whether you wish to display your name in our main README as a contributor."
  },
  {
    "path": "docs/get_pytorch_waymo_dataset.md",
    "content": "# How to generate pytorch_block_nerf_dataset\n\n1. download the Waymo Block dataset via the following command:\n\n\t```bash\n\tpip install gdown # download google drive download.\n\tcd data\n\tgdown --id 1iRqO4-GMqZAYFNvHLlBfjTcXY-l3qMN5 --no-cache \n\tunzip v1.0.zip\n\tcd ../\n\t```\n   The Google cloud may [limit the download speed in this operation](https://stackoverflow.com/questions/16856102/google-drive-limit-number-of-download). You can instead:\n   (1) Downloading in your browser by clicking [this link](https://drive.google.com/file/d/1iRqO4-GMqZAYFNvHLlBfjTcXY-l3qMN5/view). (2) Alternatively, you can directly download from the official [Waymo](https://waymo.com/research/block-nerf/licensing/) website. However, this download may needs the sudo access to install the [gsutil tool](https://cloud.google.com/storage/docs/gsutil_install#deb) (if you don't have sudo access, you can download from your local laptop and then transport it to your server). The reference script is as follows:\n\n\t```bash\n\t# install gsutil tool\n\tsudo apt-get install apt-transport-https ca-certificates gnupg # needs sudo access\n\techo \"deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main\" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list\n\tcurl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -\n\tsudo apt-get update && sudo apt-get install google-cloud-cli # needs sudo access\n\tgcloud init # login your google account then\n\tcd data\n\tgsutil -m cp -r \\\n\t  \"gs://waymo-block-nerf/v1.0\" \\\n\t  .\n\tunzip v1.0.zip\n\tcd ..\n\t```\n   You may otherwise symbol link the downloaded dataset (\"v1.0\") under the \"data\" folder. The Waymo official files (e.g., v1.0/v1.0_waymo_block_nerf_mission_bay_train.tfrecord-00000-of-01063) would be put under the data folder. \n\n2. 
Transfer the original data in TF to the pytorch format via the following command:\n\n   ```bash\n   python data_preprocess/fetch_data_from_tf_record.py\n   ```\n\n3. Split the waymo dataset into blocks and extract corresponding information.\n\n\t```bash\n\tpython data_preprocess/split_block.py\n\t```\n\nNow you have finished the waymo data preprocess procedure and you can start training."
  },
  {
    "path": "docs/parse_markdown.py",
    "content": "import markdown\nimport pdb\n\nf = open('docs/weekly_nerf.md', 'r')\nhtmlmarkdown=markdown.markdown( f.read() )\n\nfrom html.parser import HTMLParser\n\nclass MyHTMLParser(HTMLParser):\n    def __init__(self, *, convert_charrefs: bool = ...) -> None:\n        super().__init__(convert_charrefs=convert_charrefs)\n        \n\n    def handle_starttag(self, tag, attrs):\n        # print(\"Encountered a start tag:\", tag)\n        # print(attrs)\n        # if len(attrs) < 1 or len(attrs[0]) < 2:\n        #     return\n        # if attrs[0][0] == 'href':\n        #     link = attrs[0][1]\n        #     if 'github' not in link:\n        #         print(link)\n        pass\n\n    def handle_endtag(self, tag):\n        # print(\"Encountered an end tag :\", tag)\n        pass\n\n    def handle_data(self, data):\n        if len(data) < 5 or '2022' in data:\n            return\n        elif '>' in data:\n            pass\n            # abstract = data.replace(\"[code]\", \"\").replace(\"|\", \"\").replace(\">\", \"\").replace(\"> \", \"\").strip(\" \").replace(\"\\n\", \"\")\n            # abstract += \"\\n\"\n            # text_file = open(\"data/abstract.txt\", \"a\")\n            # n = text_file.write(abstract)\n            # text_file.close()\n        else:\n            print(data) # print titles here\n            # pass\n\nparser = MyHTMLParser()\nparser.feed(htmlmarkdown)\npdb.set_trace()"
  },
  {
    "path": "docs/render_docs_from_csv.py",
    "content": "from pydoc import allmethods\nimport pandas as pd\nimport pdb\nimport math\nfrom mdutils.mdutils import MdUtils\nimport numpy as np\nimport os\n\n\ndef check_nan(item):\n    if (type(item) == float or type(item) == np.float64 or type(item) == np.float32) and math.isnan(item):\n        return True\n    else:\n        return False\n\ndef title_by_cls(cls_str, cn):\n    if not cn:\n        ret = f'Weekly Classified Neural Radiance Fields - {cls_str} ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)'\n    else:\n        ret = f'每周分类神经辐射场 - {cls_str} ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)'\n    return ret\n\ndef write_item_to_md(md, data, idx, cn=False):\n    if cn:\n        cur_title = data['title_cn'][idx]\n    else:\n        cur_title = data['title'][idx]\n    cur_publisher = data['publisher'][idx]\n    if not check_nan(cur_publisher):\n        cur_title = cur_title + \", \" + cur_publisher\n    cur_link = data['link'][idx]\n    md.write('  - ' + md.new_inline_link(link=cur_link, text=cur_title) + \" | \")\n    cur_code = data['code'][idx]\n    if not check_nan(cur_code):\n        md.write(md.new_inline_link(link=cur_code, text='[code]', bold_italics_code='cbi'))\n        md.write(\"\\n\")\n    else:\n        md.write(\"[code]\\n\")\n    if cn:\n        cur_abstract = data['abstract_cn'][idx]\n    else:\n        cur_abstract = data['abstract'][idx]\n    md.write(\"    > \" + cur_abstract + \"\\n\")\n    return\n\ndef render_main_doc(meta_data_path=\"docs/weekly_nerf_meta_data.xlsx\", cn=False):\n    excel_data = pd.read_excel(meta_data_path)\n    # Read the values of the file in the dataframe\n    classes = ['lighting',\t'editing', 'fast',\t'dynamic',\t'generalization', 'reconstruction',\t'pose-slam', 'texture',\t'semantic',\t'human', 'video', 'others']\n    data = pd.DataFrame(excel_data, columns=['week', 'title', 
'publisher', 'abstract', 'link', 'code', 'title_cn', 'abstract_cn'] + classes)\n    classified_path = \"docs/classified_weekly_nerf\"\n    os.makedirs(classified_path, exist_ok=True)\n    classified_path_cn = \"docs/classified_weekly_nerf_cn\"\n    os.makedirs(classified_path_cn, exist_ok=True)\n    if not cn:\n        general_title = 'Weekly Classified Neural Radiance Fields ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)'\n    else:\n        general_title = '每周分类神经辐射场 ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)'\n    all_md_fn = 'docs/weekly_nerf' if not cn else 'docs/weekly_nerf_cn'\n    all_md = MdUtils(file_name=all_md_fn, title=general_title)\n    if not cn:\n        cls_md = {one_cls: MdUtils(file_name=os.path.join('docs/classified_weekly_nerf', one_cls),title=title_by_cls(one_cls, cn)) for one_cls in classes}\n        cls_head = \"## Filter by classes: \\n [all](./weekly_nerf.md) | [dynamic](./classified_weekly_nerf/dynamic.md) | [editing](./classified_weekly_nerf/editing.md) | [fast](./classified_weekly_nerf/fast.md) | [generalization](./classified_weekly_nerf/generalization.md) | [human](./classified_weekly_nerf/human.md) | [video](./classified_weekly_nerf/video.md) | \"\n        cls_head2 = \"[lighting](./classified_weekly_nerf/lighting.md) | [reconstruction](./classified_weekly_nerf/reconstruction.md) | [texture](./classified_weekly_nerf/texture.md) | [semantic](./classified_weekly_nerf/semantic.md) | [pose-slam](./classified_weekly_nerf/pose-slam.md) | [others](./classified_weekly_nerf/others.md) \\n\"\n    else:\n        cls_md = {one_cls: MdUtils(file_name=os.path.join('docs/classified_weekly_nerf_cn', one_cls),title=title_by_cls(one_cls, cn)) for one_cls in classes}\n        all_md.write(\"\\n NeRF研究QQ大群（300+成员）：706949479 \\n\")\n        cls_head = \"## 按类别筛选: \\n [全部](./weekly_nerf_cn.md) | 
[动态](./classified_weekly_nerf_cn/dynamic.md) | [编辑](./classified_weekly_nerf_cn/editing.md) | [快速](./classified_weekly_nerf_cn/fast.md) | [泛化](./classified_weekly_nerf_cn/generalization.md) | [人体](./classified_weekly_nerf_cn/human.md) | [视频](./classified_weekly_nerf_cn/video.md) | \"\n        cls_head2 = \"[光照](./classified_weekly_nerf_cn/lighting.md) | [重建](./classified_weekly_nerf_cn/reconstruction.md) | [纹理](./classified_weekly_nerf_cn/texture.md) | [语义](./classified_weekly_nerf_cn/semantic.md) | [姿态-SLAM](./classified_weekly_nerf_cn/pose-slam.md) | [其他](./classified_weekly_nerf_cn/others.md) \\n\"\n\n    all_md.write(cls_head)\n    all_md.write(cls_head2)\n\n    # if cn:\n        # all_md.write(\"## 大部分为机器翻译，少数论文手动翻译，有翻译错误可以PR修复。\\n\")\n    # note this is different from above because the following is used in classified NeRFs\n    if not cn:\n        cls_head = \"## Filter by classes: \\n [all](../weekly_nerf.md) | [dynamic](./dynamic.md) | [editing](./editing.md) | [fast](./fast.md) | [generalization](./generalization.md) | [human](./human.md) | [video](./video.md) | \"\n        cls_head2 = \"[lighting](./lighting.md) | [reconstruction](./reconstruction.md) | [texture](./texture.md) | [semantic](./semantic.md) | [pose-slam](./pose-slam.md) | [others](./others.md) \\n\"\n    else:\n        cls_head = \"## 按类别筛选: \\n [全部](../weekly_nerf_cn.md) | [动态](./dynamic.md) | [编辑](./editing.md) | [快速](./fast.md) | [泛化](./generalization.md) | [人体](./human.md) | [视频](./video.md) | \"\n        cls_head2 = \"[光照](./lighting.md) | [重建](./reconstruction.md) | [纹理](./texture.md) | [语义](./semantic.md) | [姿态-SLAM](./pose-slam.md) | [其他](./others.md) \\n\"\n    for cls in cls_md:\n        cls_md[cls].write(cls_head)\n        cls_md[cls].write(cls_head2)\n        # if cn:\n        #     cls_md[cls].write(\"## 大部分为机器翻译，少数论文手动翻译，有翻译错误可以PR修复。\\n\")\n    data_len = len(data['week'])\n    week = \"\"\n    for idx in range(data_len):\n        print(f\"Generating {idx} / {data_len} ...\")\n 
       cur_week = data['week'][idx]\n        if cur_week != week:\n            week = cur_week\n            all_md.write(\"## \" + week + \"\\n\")\n            for cls in cls_md:\n                cls_md[cls].write(\"## \" + week + \"\\n\")\n        write_item_to_md(all_md, data, idx, cn)\n        for cls in classes:\n            cur_cls = data[cls][idx]\n            if not check_nan(cur_cls):\n                write_item_to_md(cls_md[cls], data, idx, cn)\n    # if not cn:\n    #     all_md.write(\"## \" + 'Old papers\\n')\n    #     all_md.write(\"Refer to the [awesome-NeRF code repo](https://github.com/yenchenlin/awesome-NeRF).\\n\")\n    # else:\n    #     all_md.write(\"## \" + '旧论文\\n')\n    #     all_md.write(\"参考这个仓库： [awesome-NeRF](https://github.com/yenchenlin/awesome-NeRF).\\n\")\n    all_md.create_md_file()\n    for cls in cls_md:\n        cls_md[cls].create_md_file()\n\n\nif __name__ == '__main__':\n    render_main_doc(cn=False)\n    render_main_doc(cn=True)\n"
  },
  {
    "path": "docs/sample_logs/create_cluster_mask.txt",
    "content": "Number of images in dir: torch.Size([1940, 3])\nCoord range: tensor([-0.0140, -0.5812, -0.9744]) tensor([0.0140, 0.5812, 0.9744])\nCentroids shape: torch.Size([8, 3])\n\r  0%|          | 0/1920 [00:00<?, ?it/s]\n\r  0%|          | 0/324 [00:00<?, ?it/s]\u001b[A\n\r  5%|▍         | 16/324 [00:00<00:02, 112.60it/s]\u001b[A\n\r  9%|▊         | 28/324 [00:01<00:14, 20.97it/s] \u001b[A\n\r 10%|█         | 34/324 [00:01<00:16, 17.42it/s]\u001b[A\n\r 12%|█▏        | 38/324 [00:01<00:17, 15.93it/s]\u001b[A\n\r 13%|█▎        | 41/324 [00:02<00:18, 15.03it/s]\u001b[A\n\r 14%|█▎        | 44/324 [00:02<00:19, 14.27it/s]\u001b[A\n\r 14%|█▍        | 46/324 [00:02<00:20, 13.84it/s]\u001b[A\n\r 15%|█▍        | 48/324 [00:02<00:20, 13.43it/s]\u001b[A\n\r 15%|█▌        | 50/324 [00:02<00:20, 13.09it/s]\u001b[A\n\r 16%|█▌        | 52/324 [00:03<00:21, 12.80it/s]\u001b[A\n\r 17%|█▋        | 54/324 [00:03<00:21, 12.58it/s]\u001b[A\n\r 17%|█▋        | 56/324 [00:03<00:21, 12.40it/s]\u001b[A\n\r 18%|█▊        | 58/324 [00:03<00:21, 12.28it/s]\u001b[A\n\r 19%|█▊        | 60/324 [00:03<00:21, 12.18it/s]\u001b[A\n\r 28%|██▊       | 92/324 [00:06<00:19, 11.82it/s]\u001b[A\n\r 29%|██▉       | 94/324 [00:06<00:19, 11.81it/s]\u001b[A\n\r 30%|██▉       | 96/324 [00:06<00:19, 11.82it/s]\u001b[A\n\r 30%|███       | 98/324 [00:07<00:19, 11.81it/s]\u001b[A\n\r 31%|███       | 100/324 [00:07<00:18, 11.81it/s]\u001b[A\n\r 31%|███▏      | 102/324 [00:07<00:18, 11.81it/s]\u001b[A\n\r 32%|███▏      | 104/324 [00:07<00:18, 11.81it/s]\u001b[A\n\r 33%|███▎      | 106/324 [00:07<00:18, 11.81it/s]\u001b[A\n\r 33%|███▎      | 108/324 [00:07<00:18, 11.82it/s]\u001b[A\n\r 34%|███▍      | 110/324 [00:08<00:18, 11.81it/s]\u001b[A\n\r 25%|██▍       | 80/324 [00:05<00:20, 11.82it/s]\u001b[A\n\r 25%|██▌       | 82/324 [00:05<00:20, 11.82it/s]\u001b[A\n\r 26%|██▌       | 84/324 [00:05<00:20, 11.82it/s]\u001b[A\n\r 27%|██▋       | 86/324 [00:06<00:20, 11.82it/s]\u001b[A\n\r 27%|██▋       | 88/324 
[00:06<00:20, 11.79it/s]\u001b[A\n\r 28%|██▊       | 90/324 [00:06<00:19, 11.81it/s]\u001b[A\n\r 28%|██▊       | 92/324 [00:06<00:19, 11.82it/s]\u001b[A\n\r 29%|██▉       | 94/324 [00:06<00:19, 11.82it/s]\u001b[A\n\r 30%|██▉       | 96/324 [00:06<00:19, 11.82it/s]\u001b[A\n\r 30%|███       | 98/324 [00:07<00:19, 11.81it/s]\u001b[A\n\r 31%|███       | 100/324 [00:07<00:19, 11.71it/s]\u001b[A\n\r 31%|███▏      | 102/324 [00:07<00:18, 11.78it/s]\u001b[A\n\r 32%|███▏      | 104/324 [00:07<00:18, 11.85it/s]\u001b[A\n\r 33%|███▎      | 106/324 [00:07<00:18, 11.84it/s]\u001b[A\n\r 33%|███▎      | 108/324 [00:07<00:18, 11.83it/s]\u001b[A\n\r 34%|███▍      | 110/324 [00:08<00:18, 11.70it/s]\u001b[A\n\r 35%|███▍      | 112/324 [00:08<00:17, 11.85it/s]\u001b[A\n\r 35%|███▌      | 114/324 [00:08<00:17, 11.84it/s]\u001b[A\n\r 36%|███▌      | 116/324 [00:08<00:17, 11.84it/s]\u001b[A\n\r 36%|███▋      | 118/324 [00:08<00:17, 11.83it/s]\u001b[A\n\r 37%|███▋      | 120/324 [00:08<00:17, 11.82it/s]\u001b[A\n\r 38%|███▊      | 122/324 [00:09<00:17, 11.82it/s]\u001b[A\n\r 38%|███▊      | 124/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 39%|███▉      | 126/324 [00:09<00:16, 11.81it/s]\u001b[A\n\r 40%|███▉      | 128/324 [00:09<00:16, 11.81it/s]\u001b[A\n\r 40%|████      | 130/324 [00:09<00:16, 11.81it/s]\u001b[A\n\r 41%|████      | 132/324 [00:09<00:16, 11.81it/s]\u001b[A\n\r 41%|████▏     | 134/324 [00:10<00:16, 11.81it/s]\u001b[A\n\r 42%|████▏     | 136/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 43%|████▎     | 138/324 [00:10<00:15, 11.80it/s]\u001b[A\n\r 43%|████▎     | 140/324 [00:10<00:15, 11.80it/s]\u001b[A\n\r 44%|████▍     | 142/324 [00:10<00:15, 11.80it/s]\u001b[A\n\r 44%|████▍     | 144/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 45%|████▌     | 146/324 [00:11<00:15, 11.81it/s]\u001b[A\n\r 46%|████▌     | 148/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 46%|████▋     | 150/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 47%|████▋     | 152/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 48%|████▊     | 
154/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 48%|████▊     | 156/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 49%|████▉     | 158/324 [00:12<00:14, 11.81it/s]\u001b[A\n\r 49%|████▉     | 160/324 [00:12<00:13, 11.80it/s]\u001b[A\n\r 50%|█████     | 162/324 [00:12<00:13, 11.81it/s]\u001b[A\n\r 51%|█████     | 164/324 [00:12<00:13, 11.81it/s]\u001b[A\n\r 51%|█████     | 166/324 [00:12<00:13, 11.81it/s]\u001b[A\n\r 52%|█████▏    | 168/324 [00:13<00:13, 11.81it/s]\u001b[A\n\r 52%|█████▏    | 170/324 [00:13<00:13, 11.81it/s]\u001b[A\n\r 53%|█████▎    | 172/324 [00:13<00:12, 11.81it/s]\u001b[A\n\r 54%|█████▎    | 174/324 [00:13<00:12, 11.81it/s]\u001b[A\n\r 54%|█████▍    | 176/324 [00:13<00:12, 11.81it/s]\u001b[A\n\r 55%|█████▍    | 178/324 [00:13<00:12, 11.81it/s]\u001b[A\n\r 56%|█████▌    | 180/324 [00:14<00:12, 11.82it/s]\u001b[A\n\r 56%|█████▌    | 182/324 [00:14<00:12, 11.81it/s]\u001b[A\n\r 57%|█████▋    | 184/324 [00:14<00:11, 11.81it/s]\u001b[A\n\r 57%|█████▋    | 186/324 [00:14<00:11, 11.81it/s]\u001b[A\n\r 58%|█████▊    | 188/324 [00:14<00:11, 11.72it/s]\u001b[A\n\r 59%|█████▊    | 190/324 [00:14<00:11, 11.83it/s]\u001b[A\n\r 59%|█████▉    | 192/324 [00:15<00:11, 11.83it/s]\u001b[A\n\r 60%|█████▉    | 194/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 60%|██████    | 196/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 61%|██████    | 198/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 62%|██████▏   | 200/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 62%|██████▏   | 202/324 [00:15<00:10, 11.81it/s]\u001b[A\n\r 63%|██████▎   | 204/324 [00:16<00:10, 11.81it/s]\u001b[A\n\r 64%|██████▎   | 206/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 64%|██████▍   | 208/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 65%|██████▍   | 210/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 65%|██████▌   | 212/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 66%|██████▌   | 214/324 [00:16<00:09, 11.80it/s]\u001b[A\n\r 67%|██████▋   | 216/324 [00:17<00:09, 11.81it/s]\u001b[A\n\r 67%|██████▋   | 218/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 
68%|██████▊   | 220/324 [00:17<00:08, 11.80it/s]\u001b[A\n\r 69%|██████▊   | 222/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 69%|██████▉   | 224/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 70%|██████▉   | 226/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 70%|███████   | 228/324 [00:18<00:08, 11.81it/s]\u001b[A\n\r 71%|███████   | 230/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 232/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 234/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 236/324 [00:18<00:07, 11.82it/s]\u001b[A\n\r 73%|███████▎  | 238/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 74%|███████▍  | 240/324 [00:19<00:07, 11.82it/s]\u001b[A\n\r 75%|███████▍  | 242/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 75%|███████▌  | 244/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 76%|███████▌  | 246/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 77%|███████▋  | 248/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 77%|███████▋  | 250/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 252/324 [00:20<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 254/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 79%|███████▉  | 256/324 [00:20<00:05, 11.80it/s]\u001b[A\n\r 80%|███████▉  | 258/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 80%|████████  | 260/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████  | 262/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████▏ | 264/324 [00:21<00:05, 11.81it/s]\u001b[A\n\r 82%|████████▏ | 266/324 [00:21<00:04, 11.82it/s]\u001b[A\n\r 83%|████████▎ | 268/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 83%|████████▎ | 270/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 84%|████████▍ | 272/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 85%|████████▍ | 274/324 [00:21<00:04, 11.80it/s]\u001b[A\n\r 85%|████████▌ | 276/324 [00:22<00:04, 11.80it/s]\u001b[A\n\r 86%|████████▌ | 278/324 [00:22<00:03, 11.80it/s]\u001b[A\n\r 86%|████████▋ | 280/324 [00:22<00:03, 11.80it/s]\u001b[A\n\r 87%|████████▋ | 282/324 [00:22<00:03, 11.80it/s]\u001b[A\n\r 88%|████████▊ | 284/324 [00:22<00:03, 
11.81it/s]\u001b[A\n\r 88%|████████▊ | 286/324 [00:23<00:03, 11.81it/s]\u001b[A\n\r 89%|████████▉ | 288/324 [00:23<00:03, 11.81it/s]\u001b[A\n\r 90%|████████▉ | 290/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 90%|█████████ | 292/324 [00:23<00:02, 11.80it/s]\u001b[A\n\r 91%|█████████ | 294/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 91%|█████████▏| 296/324 [00:23<00:02, 11.80it/s]\u001b[A\n\r 92%|█████████▏| 298/324 [00:24<00:02, 11.80it/s]\u001b[A\n\r 93%|█████████▎| 300/324 [00:24<00:02, 11.80it/s]\u001b[A\n\r 93%|█████████▎| 302/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 304/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 306/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 95%|█████████▌| 308/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▌| 310/324 [00:25<00:01, 11.80it/s]\u001b[A\n\r 96%|█████████▋| 312/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 97%|█████████▋| 314/324 [00:25<00:00, 11.80it/s]\u001b[A\n\r 98%|█████████▊| 316/324 [00:25<00:00, 11.80it/s]\u001b[A\n\r 98%|█████████▊| 318/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 320/324 [00:25<00:00, 11.80it/s]\u001b[A\n\r 99%|█████████▉| 322/324 [00:26<00:00, 11.81it/s]\u001b[A\n\r100%|██████████| 324/324 [00:26<00:00, 11.81it/s]\u001b[A\r100%|██████████| 324/324 [00:26<00:00, 12.35it/s]\n\r100%|██████████| 480/480 [4:05:10<00:00, 30.52s/it]\r100%|██████████| 480/480 [4:05:10<00:00, 30.65s/it]\n\r  0%|          | 0/5 [00:00<?, ?it/s]\n\r  0%|          | 0/324 [00:00<?, ?it/s]\u001b[A\n\r  5%|▍         | 16/324 [00:00<00:02, 111.59it/s]\u001b[A\n\r  9%|▊         | 28/324 [00:01<00:14, 20.81it/s] \u001b[A\n\r 10%|█         | 34/324 [00:01<00:16, 17.26it/s]\u001b[A\n\r 12%|█▏        | 38/324 [00:02<00:18, 15.76it/s]\u001b[A\n\r 13%|█▎        | 41/324 [00:02<00:19, 14.81it/s]\u001b[A\n\r 14%|█▎        | 44/324 [00:02<00:19, 14.15it/s]\u001b[A\n\r 14%|█▍        | 46/324 [00:02<00:20, 13.68it/s]\u001b[A\n\r 15%|█▍        | 48/324 [00:02<00:20, 13.31it/s]\u001b[A\n\r 15%|█▌        | 50/324 
[00:03<00:21, 12.96it/s]\u001b[A\n\r 16%|█▌        | 52/324 [00:03<00:21, 12.67it/s]\u001b[A\n\r 17%|█▋        | 54/324 [00:03<00:21, 12.45it/s]\u001b[A\n\r 17%|█▋        | 56/324 [00:03<00:21, 12.27it/s]\u001b[A\n\r 18%|█▊        | 58/324 [00:03<00:21, 12.14it/s]\u001b[A\n\r 19%|█▊        | 60/324 [00:03<00:21, 12.05it/s]\u001b[A\n\r 19%|█▉        | 62/324 [00:04<00:21, 11.98it/s]\u001b[A\n\r 20%|█▉        | 64/324 [00:04<00:21, 11.93it/s]\u001b[A\n\r 20%|██        | 66/324 [00:04<00:21, 11.89it/s]\u001b[A\n\r 21%|██        | 68/324 [00:04<00:21, 11.87it/s]\u001b[A\n\r 22%|██▏       | 70/324 [00:04<00:21, 11.81it/s]\u001b[A\n\r 22%|██▏       | 72/324 [00:04<00:21, 11.79it/s]\u001b[A\n\r 23%|██▎       | 74/324 [00:05<00:21, 11.85it/s]\u001b[A\n\r 23%|██▎       | 76/324 [00:05<00:20, 11.83it/s]\u001b[A\n\r 24%|██▍       | 78/324 [00:05<00:20, 11.83it/s]\u001b[A\n\r 25%|██▍       | 80/324 [00:05<00:20, 11.71it/s]\u001b[A\n\r 25%|██▌       | 82/324 [00:05<00:20, 11.85it/s]\u001b[A\n\r 26%|██▌       | 84/324 [00:05<00:20, 11.84it/s]\u001b[A\n\r 27%|██▋       | 86/324 [00:06<00:20, 11.70it/s]\u001b[A\n\r 27%|██▋       | 88/324 [00:06<00:20, 11.77it/s]\u001b[A\n\r 28%|██▊       | 90/324 [00:06<00:19, 11.79it/s]\u001b[A\n\r 28%|██▊       | 92/324 [00:06<00:19, 11.88it/s]\u001b[A\n\r 29%|██▉       | 94/324 [00:06<00:19, 11.86it/s]\u001b[A\n\r 30%|██▉       | 96/324 [00:06<00:19, 11.84it/s]\u001b[A\n\r 30%|███       | 98/324 [00:07<00:19, 11.83it/s]\u001b[A\n\r 31%|███       | 100/324 [00:07<00:18, 11.82it/s]\u001b[A\n\r 31%|███▏      | 102/324 [00:07<00:18, 11.81it/s]\u001b[A\n\r 32%|███▏      | 104/324 [00:07<00:18, 11.69it/s]\u001b[A\n\r 33%|███▎      | 106/324 [00:07<00:18, 11.77it/s]\u001b[A\n\r 33%|███▎      | 108/324 [00:07<00:18, 11.80it/s]\u001b[A\n\r 34%|███▍      | 110/324 [00:08<00:18, 11.87it/s]\u001b[A\n\r 35%|███▍      | 112/324 [00:08<00:17, 11.85it/s]\u001b[A\n\r 35%|███▌      | 114/324 [00:08<00:17, 11.83it/s]\u001b[A\n\r 36%|███▌      | 116/324 
[00:08<00:17, 11.79it/s]\u001b[A\n\r 36%|███▋      | 118/324 [00:08<00:17, 11.80it/s]\u001b[A\n\r 37%|███▋      | 120/324 [00:08<00:17, 11.78it/s]\u001b[A\n\r 38%|███▊      | 122/324 [00:09<00:17, 11.83it/s]\u001b[A\n\r 38%|███▊      | 124/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 39%|███▉      | 126/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 40%|███▉      | 128/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 40%|████      | 130/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 41%|████      | 132/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 41%|████▏     | 134/324 [00:10<00:16, 11.81it/s]\u001b[A\n\r 42%|████▏     | 136/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 43%|████▎     | 138/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 43%|████▎     | 140/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 44%|████▍     | 142/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 44%|████▍     | 144/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 45%|████▌     | 146/324 [00:11<00:15, 11.81it/s]\u001b[A\n\r 46%|████▌     | 148/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 46%|████▋     | 150/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 47%|████▋     | 152/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 48%|████▊     | 154/324 [00:11<00:14, 11.80it/s]\u001b[A\n\r 48%|████▊     | 156/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 49%|████▉     | 158/324 [00:12<00:14, 11.81it/s]\u001b[A\n\r 49%|████▉     | 160/324 [00:12<00:13, 11.80it/s]\u001b[A\n\r 50%|█████     | 162/324 [00:12<00:13, 11.81it/s]\u001b[A\n\r 51%|█████     | 164/324 [00:12<00:13, 11.80it/s]\u001b[A\n\r 51%|█████     | 166/324 [00:12<00:13, 11.81it/s]\u001b[A\n\r 52%|█████▏    | 168/324 [00:13<00:13, 11.81it/s]\u001b[A\n\r 52%|█████▏    | 170/324 [00:13<00:13, 11.74it/s]\u001b[A\n\r 53%|█████▎    | 172/324 [00:13<00:12, 11.83it/s]\u001b[A\n\r 54%|█████▎    | 174/324 [00:13<00:12, 11.82it/s]\u001b[A\n\r 54%|█████▍    | 176/324 [00:13<00:12, 11.82it/s]\u001b[A\n\r 55%|█████▍    | 178/324 [00:13<00:12, 11.82it/s]\u001b[A\n\r 56%|█████▌    | 180/324 [00:14<00:12, 11.82it/s]\u001b[A\n\r 56%|█████▌ 
   | 182/324 [00:14<00:12, 11.81it/s]\u001b[A\n\r 57%|█████▋    | 184/324 [00:14<00:11, 11.81it/s]\u001b[A\n\r 57%|█████▋    | 186/324 [00:14<00:11, 11.81it/s]\u001b[A\n\r 58%|█████▊    | 188/324 [00:14<00:11, 11.81it/s]\u001b[A\n\r 59%|█████▊    | 190/324 [00:14<00:11, 11.76it/s]\u001b[A\n\r 59%|█████▉    | 192/324 [00:15<00:11, 11.82it/s]\u001b[A\n\r 60%|█████▉    | 194/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 60%|██████    | 196/324 [00:15<00:10, 11.81it/s]\u001b[A\n\r 61%|██████    | 198/324 [00:15<00:10, 11.81it/s]\u001b[A\n\r 62%|██████▏   | 200/324 [00:15<00:10, 11.81it/s]\u001b[A\n\r 62%|██████▏   | 202/324 [00:15<00:10, 11.81it/s]\u001b[A\n\r 63%|██████▎   | 204/324 [00:16<00:10, 11.81it/s]\u001b[A\n\r 64%|██████▎   | 206/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 64%|██████▍   | 208/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 65%|██████▍   | 210/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 65%|██████▌   | 212/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 66%|██████▌   | 214/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 67%|██████▋   | 216/324 [00:17<00:09, 11.81it/s]\u001b[A\n\r 67%|██████▋   | 218/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 68%|██████▊   | 220/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 69%|██████▊   | 222/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 69%|██████▉   | 224/324 [00:17<00:08, 11.80it/s]\u001b[A\n\r 70%|██████▉   | 226/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 70%|███████   | 228/324 [00:18<00:08, 11.81it/s]\u001b[A\n\r 71%|███████   | 230/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 232/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 234/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 236/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 238/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 74%|███████▍  | 240/324 [00:19<00:07, 11.81it/s]\u001b[A\n\r 75%|███████▍  | 242/324 [00:19<00:06, 11.80it/s]\u001b[A\n\r 75%|███████▌  | 244/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 76%|███████▌  | 246/324 [00:19<00:06, 
11.81it/s]\u001b[A\n\r 77%|███████▋  | 248/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 77%|███████▋  | 250/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 252/324 [00:20<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 254/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 79%|███████▉  | 256/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 80%|███████▉  | 258/324 [00:20<00:05, 11.82it/s]\u001b[A\n\r 80%|████████  | 260/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████  | 262/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████▏ | 264/324 [00:21<00:05, 11.82it/s]\u001b[A\n\r 82%|████████▏ | 266/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 83%|████████▎ | 268/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 83%|████████▎ | 270/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 84%|████████▍ | 272/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 85%|████████▍ | 274/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 85%|████████▌ | 276/324 [00:22<00:04, 11.81it/s]\u001b[A\n\r 86%|████████▌ | 278/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 86%|████████▋ | 280/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 87%|████████▋ | 282/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 88%|████████▊ | 284/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 88%|████████▊ | 286/324 [00:23<00:03, 11.81it/s]\u001b[A\n\r 89%|████████▉ | 288/324 [00:23<00:03, 11.81it/s]\u001b[A\n\r 90%|████████▉ | 290/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 90%|█████████ | 292/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 91%|█████████ | 294/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 91%|█████████▏| 296/324 [00:23<00:02, 11.80it/s]\u001b[A\n\r 92%|█████████▏| 298/324 [00:24<00:02, 11.81it/s]\u001b[A\n\r 93%|█████████▎| 300/324 [00:24<00:02, 11.81it/s]\u001b[A\n\r 93%|█████████▎| 302/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 304/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 306/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 95%|█████████▌| 308/324 [00:24<00:01, 11.80it/s]\u001b[A\n\r 96%|█████████▌| 310/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▋| 312/324 
[00:25<00:01, 11.81it/s]\u001b[A\n\r 97%|█████████▋| 314/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 98%|█████████▊| 316/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 98%|█████████▊| 318/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 320/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 322/324 [00:26<00:00, 11.81it/s]\u001b[A\n\r100%|██████████| 324/324 [00:26<00:00, 11.81it/s]\u001b[A\r100%|██████████| 324/324 [00:26<00:00, 12.35it/s]\n\r 20%|██        | 1/5 [00:30<02:02, 30.62s/it]\n\r  0%|          | 0/324 [00:00<?, ?it/s]\u001b[A\n\r  5%|▍         | 16/324 [00:00<00:02, 112.20it/s]\u001b[A\n\r  9%|▊         | 28/324 [00:01<00:14, 20.72it/s] \u001b[A\n\r 10%|█         | 34/324 [00:01<00:16, 17.28it/s]\u001b[A\n\r 12%|█▏        | 38/324 [00:02<00:18, 15.71it/s]\u001b[A\n\r 13%|█▎        | 41/324 [00:02<00:18, 14.91it/s]\u001b[A\n\r 14%|█▎        | 44/324 [00:02<00:19, 14.12it/s]\u001b[A\n\r 14%|█▍        | 46/324 [00:02<00:20, 13.71it/s]\u001b[A\n\r 15%|█▍        | 48/324 [00:02<00:20, 13.31it/s]\u001b[A\n\r 15%|█▌        | 50/324 [00:03<00:21, 12.97it/s]\u001b[A\n\r 16%|█▌        | 52/324 [00:03<00:21, 12.68it/s]\u001b[A\n\r 17%|█▋        | 54/324 [00:03<00:21, 12.45it/s]\u001b[A\n\r 17%|█▋        | 56/324 [00:03<00:21, 12.28it/s]\u001b[A\n\r 18%|█▊        | 58/324 [00:03<00:21, 12.15it/s]\u001b[A\n\r 19%|█▊        | 60/324 [00:03<00:21, 12.05it/s]\u001b[A\n\r 19%|█▉        | 62/324 [00:04<00:21, 11.98it/s]\u001b[A\n\r 20%|█▉        | 64/324 [00:04<00:21, 11.93it/s]\u001b[A\n\r 20%|██        | 66/324 [00:04<00:21, 11.89it/s]\u001b[A\n\r 21%|██        | 68/324 [00:04<00:21, 11.87it/s]\u001b[A\n\r 22%|██▏       | 70/324 [00:04<00:21, 11.86it/s]\u001b[A\n\r 22%|██▏       | 72/324 [00:04<00:21, 11.84it/s]\u001b[A\n\r 23%|██▎       | 74/324 [00:05<00:21, 11.83it/s]\u001b[A\n\r 23%|██▎       | 76/324 [00:05<00:20, 11.82it/s]\u001b[A\n\r 24%|██▍       | 78/324 [00:05<00:20, 11.81it/s]\u001b[A\n\r 25%|██▍       | 80/324 [00:05<00:20, 11.70it/s]\u001b[A\n\r 
25%|██▌       | 82/324 [00:05<00:20, 11.86it/s]\u001b[A\n\r 26%|██▌       | 84/324 [00:05<00:20, 11.84it/s]\u001b[A\n\r 27%|██▋       | 86/324 [00:06<00:20, 11.84it/s]\u001b[A\n\r 27%|██▋       | 88/324 [00:06<00:20, 11.72it/s]\u001b[A\n\r 28%|██▊       | 90/324 [00:06<00:19, 11.84it/s]\u001b[A\n\r 28%|██▊       | 92/324 [00:06<00:19, 11.85it/s]\u001b[A\n\r 29%|██▉       | 94/324 [00:06<00:19, 11.84it/s]\u001b[A\n\r 30%|██▉       | 96/324 [00:06<00:19, 11.83it/s]\u001b[A\n\r 30%|███       | 98/324 [00:07<00:19, 11.83it/s]\u001b[A\n\r 31%|███       | 100/324 [00:07<00:18, 11.82it/s]\u001b[A\n\r 31%|███▏      | 102/324 [00:07<00:18, 11.82it/s]\u001b[A\n\r 32%|███▏      | 104/324 [00:07<00:18, 11.82it/s]\u001b[A\n\r 33%|███▎      | 106/324 [00:07<00:18, 11.79it/s]\u001b[A\n\r 33%|███▎      | 108/324 [00:07<00:18, 11.82it/s]\u001b[A\n\r 34%|███▍      | 110/324 [00:08<00:18, 11.81it/s]\u001b[A\n\r 35%|███▍      | 112/324 [00:08<00:17, 11.81it/s]\u001b[A\n\r 35%|███▌      | 114/324 [00:08<00:17, 11.80it/s]\u001b[A\n\r 36%|███▌      | 116/324 [00:08<00:17, 11.82it/s]\u001b[A\n\r 36%|███▋      | 118/324 [00:08<00:17, 11.81it/s]\u001b[A\n\r 37%|███▋      | 120/324 [00:08<00:17, 11.81it/s]\u001b[A\n\r 38%|███▊      | 122/324 [00:09<00:17, 11.81it/s]\u001b[A\n\r 38%|███▊      | 124/324 [00:09<00:16, 11.81it/s]\u001b[A\n\r 39%|███▉      | 126/324 [00:09<00:17, 11.60it/s]\u001b[A\n\r 40%|███▉      | 128/324 [00:09<00:16, 11.88it/s]\u001b[A\n\r 40%|████      | 130/324 [00:09<00:16, 11.86it/s]\u001b[A\n\r 41%|████      | 132/324 [00:09<00:16, 11.84it/s]\u001b[A\n\r 41%|████▏     | 134/324 [00:10<00:16, 11.83it/s]\u001b[A\n\r 42%|████▏     | 136/324 [00:10<00:15, 11.83it/s]\u001b[A\n\r 43%|████▎     | 138/324 [00:10<00:15, 11.82it/s]\u001b[A\n\r 43%|████▎     | 140/324 [00:10<00:15, 11.82it/s]\u001b[A\n\r 44%|████▍     | 142/324 [00:10<00:15, 11.82it/s]\u001b[A\n\r 44%|████▍     | 144/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 45%|████▌     | 146/324 [00:11<00:15, 
11.82it/s]\u001b[A\n\r 46%|████▌     | 148/324 [00:11<00:14, 11.82it/s]\u001b[A\n\r 46%|████▋     | 150/324 [00:11<00:14, 11.75it/s]\u001b[A\n\r 47%|████▋     | 152/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 48%|████▊     | 154/324 [00:11<00:14, 11.84it/s]\u001b[A\n\r 48%|████▊     | 156/324 [00:11<00:14, 11.82it/s]\u001b[A\n\r 49%|████▉     | 158/324 [00:12<00:14, 11.82it/s]\u001b[A\n\r 49%|████▉     | 160/324 [00:12<00:13, 11.80it/s]\u001b[A\n\r 50%|█████     | 162/324 [00:12<00:13, 11.76it/s]\u001b[A\n\r 51%|█████     | 164/324 [00:12<00:13, 11.83it/s]\u001b[A\n\r 51%|█████     | 166/324 [00:12<00:13, 11.82it/s]\u001b[A\n\r 52%|█████▏    | 168/324 [00:13<00:13, 11.82it/s]\u001b[A\n\r 52%|█████▏    | 170/324 [00:13<00:13, 11.82it/s]\u001b[A\n\r 53%|█████▎    | 172/324 [00:13<00:12, 11.82it/s]\u001b[A\n\r 54%|█████▎    | 174/324 [00:13<00:12, 11.82it/s]\u001b[A\n\r 54%|█████▍    | 176/324 [00:13<00:12, 11.82it/s]\u001b[A\n\r 55%|█████▍    | 178/324 [00:13<00:12, 11.82it/s]\u001b[A\n\r 56%|█████▌    | 180/324 [00:14<00:12, 11.73it/s]\u001b[A\n\r 56%|█████▌    | 182/324 [00:14<00:11, 11.84it/s]\u001b[A\n\r 57%|█████▋    | 184/324 [00:14<00:12, 11.66it/s]\u001b[A\n\r 57%|█████▋    | 186/324 [00:14<00:11, 11.88it/s]\u001b[A\n\r 58%|█████▊    | 188/324 [00:14<00:11, 11.86it/s]\u001b[A\n\r 59%|█████▊    | 190/324 [00:14<00:11, 11.84it/s]\u001b[A\n\r 59%|█████▉    | 192/324 [00:15<00:11, 11.83it/s]\u001b[A\n\r 60%|█████▉    | 194/324 [00:15<00:11, 11.74it/s]\u001b[A\n\r 60%|██████    | 196/324 [00:15<00:10, 11.85it/s]\u001b[A\n\r 61%|██████    | 198/324 [00:15<00:10, 11.83it/s]\u001b[A\n\r 62%|██████▏   | 200/324 [00:15<00:10, 11.83it/s]\u001b[A\n\r 62%|██████▏   | 202/324 [00:15<00:10, 11.83it/s]\u001b[A\n\r 63%|██████▎   | 204/324 [00:16<00:10, 11.82it/s]\u001b[A\n\r 64%|██████▎   | 206/324 [00:16<00:09, 11.82it/s]\u001b[A\n\r 64%|██████▍   | 208/324 [00:16<00:09, 11.82it/s]\u001b[A\n\r 65%|██████▍   | 210/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 65%|██████▌   | 212/324 
[00:16<00:09, 11.82it/s]\u001b[A\n\r 66%|██████▌   | 214/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 67%|██████▋   | 216/324 [00:17<00:09, 11.81it/s]\u001b[A\n\r 67%|██████▋   | 218/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 68%|██████▊   | 220/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 69%|██████▊   | 222/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 69%|██████▉   | 224/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 70%|██████▉   | 226/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 70%|███████   | 228/324 [00:18<00:08, 11.81it/s]\u001b[A\n\r 71%|███████   | 230/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 232/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 234/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 236/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 238/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 74%|███████▍  | 240/324 [00:19<00:07, 11.81it/s]\u001b[A\n\r 75%|███████▍  | 242/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 75%|███████▌  | 244/324 [00:19<00:06, 11.82it/s]\u001b[A\n\r 76%|███████▌  | 246/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 77%|███████▋  | 248/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 77%|███████▋  | 250/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 252/324 [00:20<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 254/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 79%|███████▉  | 256/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 80%|███████▉  | 258/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 80%|████████  | 260/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████  | 262/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████▏ | 264/324 [00:21<00:05, 11.81it/s]\u001b[A\n\r 82%|████████▏ | 266/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 83%|████████▎ | 268/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 83%|████████▎ | 270/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 84%|████████▍ | 272/324 [00:21<00:04, 11.82it/s]\u001b[A\n\r 85%|████████▍ | 274/324 [00:21<00:04, 11.82it/s]\u001b[A\n\r 85%|████████▌ | 276/324 [00:22<00:04, 11.81it/s]\u001b[A\n\r 
86%|████████▌ | 278/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 86%|████████▋ | 280/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 87%|████████▋ | 282/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 88%|████████▊ | 284/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 88%|████████▊ | 286/324 [00:23<00:03, 11.81it/s]\u001b[A\n\r 89%|████████▉ | 288/324 [00:23<00:03, 11.81it/s]\u001b[A\n\r 90%|████████▉ | 290/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 90%|█████████ | 292/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 91%|█████████ | 294/324 [00:23<00:02, 11.79it/s]\u001b[A\n\r 91%|█████████▏| 296/324 [00:23<00:02, 11.82it/s]\u001b[A\n\r 92%|█████████▏| 298/324 [00:24<00:02, 11.82it/s]\u001b[A\n\r 93%|█████████▎| 300/324 [00:24<00:02, 11.81it/s]\u001b[A\n\r 93%|█████████▎| 302/324 [00:24<00:01, 11.82it/s]\u001b[A\n\r 94%|█████████▍| 304/324 [00:24<00:01, 11.82it/s]\u001b[A\n\r 94%|█████████▍| 306/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 95%|█████████▌| 308/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▌| 310/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▋| 312/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 97%|█████████▋| 314/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 98%|█████████▊| 316/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 98%|█████████▊| 318/324 [00:25<00:00, 11.80it/s]\u001b[A\n\r 99%|█████████▉| 320/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 322/324 [00:26<00:00, 11.81it/s]\u001b[A\n\r100%|██████████| 324/324 [00:26<00:00, 11.81it/s]\u001b[A\r100%|██████████| 324/324 [00:26<00:00, 12.36it/s]\n\r 40%|████      | 2/5 [01:01<01:31, 30.59s/it]\n\r  0%|          | 0/324 [00:00<?, ?it/s]\u001b[A\n\r  5%|▍         | 16/324 [00:00<00:02, 111.46it/s]\u001b[A\n\r  9%|▊         | 28/324 [00:01<00:14, 20.56it/s] \u001b[A\n\r 10%|█         | 34/324 [00:01<00:16, 17.23it/s]\u001b[A\n\r 12%|█▏        | 38/324 [00:02<00:18, 15.75it/s]\u001b[A\n\r 13%|█▎        | 41/324 [00:02<00:19, 14.86it/s]\u001b[A\n\r 14%|█▎        | 44/324 [00:02<00:19, 14.11it/s]\u001b[A\n\r 14%|█▍        | 46/324 
[00:02<00:20, 13.68it/s]\u001b[A\n\r 15%|█▍        | 48/324 [00:02<00:20, 13.29it/s]\u001b[A\n\r 15%|█▌        | 50/324 [00:03<00:21, 12.94it/s]\u001b[A\n\r 16%|█▌        | 52/324 [00:03<00:21, 12.66it/s]\u001b[A\n\r 17%|█▋        | 54/324 [00:03<00:21, 12.44it/s]\u001b[A\n\r 17%|█▋        | 56/324 [00:03<00:21, 12.26it/s]\u001b[A\n\r 18%|█▊        | 58/324 [00:03<00:21, 12.13it/s]\u001b[A\n\r 19%|█▊        | 60/324 [00:03<00:21, 12.04it/s]\u001b[A\n\r 19%|█▉        | 62/324 [00:04<00:21, 11.97it/s]\u001b[A\n\r 20%|█▉        | 64/324 [00:04<00:21, 11.92it/s]\u001b[A\n\r 20%|██        | 66/324 [00:04<00:21, 11.89it/s]\u001b[A\n\r 21%|██        | 68/324 [00:04<00:21, 11.86it/s]\u001b[A\n\r 22%|██▏       | 70/324 [00:04<00:21, 11.85it/s]\u001b[A\n\r 22%|██▏       | 72/324 [00:04<00:21, 11.83it/s]\u001b[A\n\r 23%|██▎       | 74/324 [00:05<00:21, 11.82it/s]\u001b[A\n\r 23%|██▎       | 76/324 [00:05<00:21, 11.69it/s]\u001b[A\n\r 24%|██▍       | 78/324 [00:05<00:20, 11.86it/s]\u001b[A\n\r 25%|██▍       | 80/324 [00:05<00:20, 11.84it/s]\u001b[A\n\r 25%|██▌       | 82/324 [00:05<00:20, 11.80it/s]\u001b[A\n\r 26%|██▌       | 84/324 [00:05<00:20, 11.83it/s]\u001b[A\n\r 27%|██▋       | 86/324 [00:06<00:20, 11.83it/s]\u001b[A\n\r 27%|██▋       | 88/324 [00:06<00:19, 11.82it/s]\u001b[A\n\r 28%|██▊       | 90/324 [00:06<00:19, 11.82it/s]\u001b[A\n\r 28%|██▊       | 92/324 [00:06<00:19, 11.76it/s]\u001b[A\n\r 29%|██▉       | 94/324 [00:06<00:19, 11.83it/s]\u001b[A\n\r 30%|██▉       | 96/324 [00:06<00:19, 11.82it/s]\u001b[A\n\r 30%|███       | 98/324 [00:07<00:19, 11.82it/s]\u001b[A\n\r 31%|███       | 100/324 [00:07<00:18, 11.82it/s]\u001b[A\n\r 31%|███▏      | 102/324 [00:07<00:18, 11.82it/s]\u001b[A\n\r 32%|███▏      | 104/324 [00:07<00:18, 11.81it/s]\u001b[A\n\r 33%|███▎      | 106/324 [00:07<00:18, 11.78it/s]\u001b[A\n\r 33%|███▎      | 108/324 [00:07<00:18, 11.74it/s]\u001b[A\n\r 34%|███▍      | 110/324 [00:08<00:18, 11.61it/s]\u001b[A\n\r 35%|███▍      | 112/324 
[00:08<00:17, 11.87it/s]\u001b[A\n\r 35%|███▌      | 114/324 [00:08<00:17, 11.86it/s]\u001b[A\n\r 36%|███▌      | 116/324 [00:08<00:17, 11.86it/s]\u001b[A\n\r 36%|███▋      | 118/324 [00:08<00:17, 11.85it/s]\u001b[A\n\r 37%|███▋      | 120/324 [00:08<00:17, 11.84it/s]\u001b[A\n\r 38%|███▊      | 122/324 [00:09<00:17, 11.79it/s]\u001b[A\n\r 38%|███▊      | 124/324 [00:09<00:17, 11.71it/s]\u001b[A\n\r 39%|███▉      | 126/324 [00:09<00:16, 11.87it/s]\u001b[A\n\r 40%|███▉      | 128/324 [00:09<00:16, 11.85it/s]\u001b[A\n\r 40%|████      | 130/324 [00:09<00:16, 11.84it/s]\u001b[A\n\r 41%|████      | 132/324 [00:09<00:16, 11.83it/s]\u001b[A\n\r 41%|████▏     | 134/324 [00:10<00:16, 11.82it/s]\u001b[A\n\r 42%|████▏     | 136/324 [00:10<00:15, 11.82it/s]\u001b[A\n\r 43%|████▎     | 138/324 [00:10<00:15, 11.82it/s]\u001b[A\n\r 43%|████▎     | 140/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 44%|████▍     | 142/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 44%|████▍     | 144/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 45%|████▌     | 146/324 [00:11<00:15, 11.81it/s]\u001b[A\n\r 46%|████▌     | 148/324 [00:11<00:15, 11.71it/s]\u001b[A\n\r 46%|████▋     | 150/324 [00:11<00:14, 11.84it/s]\u001b[A\n\r 47%|████▋     | 152/324 [00:11<00:14, 11.83it/s]\u001b[A\n\r 48%|████▊     | 154/324 [00:11<00:14, 11.72it/s]\u001b[A\n\r 48%|████▊     | 156/324 [00:12<00:14, 11.86it/s]\u001b[A\n\r 49%|████▉     | 158/324 [00:12<00:14, 11.84it/s]\u001b[A\n\r 49%|████▉     | 160/324 [00:12<00:13, 11.84it/s]\u001b[A\n\r 50%|█████     | 162/324 [00:12<00:13, 11.83it/s]\u001b[A\n\r 51%|█████     | 164/324 [00:12<00:13, 11.83it/s]\u001b[A\n\r 51%|█████     | 166/324 [00:12<00:13, 11.82it/s]\u001b[A\n\r 52%|█████▏    | 168/324 [00:13<00:13, 11.82it/s]\u001b[A\n\r 52%|█████▏    | 170/324 [00:13<00:13, 11.79it/s]\u001b[A\n\r 53%|█████▎    | 172/324 [00:13<00:12, 11.83it/s]\u001b[A\n\r 54%|█████▎    | 174/324 [00:13<00:12, 11.82it/s]\u001b[A\n\r 54%|█████▍    | 176/324 [00:13<00:12, 11.81it/s]\u001b[A\n\r 55%|█████▍ 
   | 178/324 [00:13<00:12, 11.71it/s]\u001b[A\n\r 56%|█████▌    | 180/324 [00:14<00:12, 11.84it/s]\u001b[A\n\r 56%|█████▌    | 182/324 [00:14<00:12, 11.83it/s]\u001b[A\n\r 57%|█████▋    | 184/324 [00:14<00:11, 11.83it/s]\u001b[A\n\r 57%|█████▋    | 186/324 [00:14<00:11, 11.82it/s]\u001b[A\n\r 58%|█████▊    | 188/324 [00:14<00:11, 11.81it/s]\u001b[A\n\r 59%|█████▊    | 190/324 [00:14<00:11, 11.79it/s]\u001b[A\n\r 59%|█████▉    | 192/324 [00:15<00:11, 11.82it/s]\u001b[A\n\r 60%|█████▉    | 194/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 60%|██████    | 196/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 61%|██████    | 198/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 62%|██████▏   | 200/324 [00:15<00:10, 11.81it/s]\u001b[A\n\r 62%|██████▏   | 202/324 [00:15<00:10, 11.81it/s]\u001b[A\n\r 63%|██████▎   | 204/324 [00:16<00:10, 11.81it/s]\u001b[A\n\r 64%|██████▎   | 206/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 64%|██████▍   | 208/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 65%|██████▍   | 210/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 65%|██████▌   | 212/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 66%|██████▌   | 214/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 67%|██████▋   | 216/324 [00:17<00:09, 11.80it/s]\u001b[A\n\r 67%|██████▋   | 218/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 68%|██████▊   | 220/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 69%|██████▊   | 222/324 [00:17<00:08, 11.80it/s]\u001b[A\n\r 69%|██████▉   | 224/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 70%|██████▉   | 226/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 70%|███████   | 228/324 [00:18<00:08, 11.81it/s]\u001b[A\n\r 71%|███████   | 230/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 232/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 234/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 236/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 238/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 74%|███████▍  | 240/324 [00:19<00:07, 11.80it/s]\u001b[A\n\r 75%|███████▍  | 242/324 [00:19<00:06, 
11.81it/s]\u001b[A\n\r 75%|███████▌  | 244/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 76%|███████▌  | 246/324 [00:19<00:06, 11.80it/s]\u001b[A\n\r 77%|███████▋  | 248/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 77%|███████▋  | 250/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 252/324 [00:20<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 254/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 79%|███████▉  | 256/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 80%|███████▉  | 258/324 [00:20<00:05, 11.80it/s]\u001b[A\n\r 80%|████████  | 260/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████  | 262/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████▏ | 264/324 [00:21<00:05, 11.81it/s]\u001b[A\n\r 82%|████████▏ | 266/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 83%|████████▎ | 268/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 83%|████████▎ | 270/324 [00:21<00:04, 11.80it/s]\u001b[A\n\r 84%|████████▍ | 272/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 85%|████████▍ | 274/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 85%|████████▌ | 276/324 [00:22<00:04, 11.81it/s]\u001b[A\n\r 86%|████████▌ | 278/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 86%|████████▋ | 280/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 87%|████████▋ | 282/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 88%|████████▊ | 284/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 88%|████████▊ | 286/324 [00:23<00:03, 11.81it/s]\u001b[A\n\r 89%|████████▉ | 288/324 [00:23<00:03, 11.80it/s]\u001b[A\n\r 90%|████████▉ | 290/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 90%|█████████ | 292/324 [00:23<00:02, 11.80it/s]\u001b[A\n\r 91%|█████████ | 294/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 91%|█████████▏| 296/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 92%|█████████▏| 298/324 [00:24<00:02, 11.80it/s]\u001b[A\n\r 93%|█████████▎| 300/324 [00:24<00:02, 11.80it/s]\u001b[A\n\r 93%|█████████▎| 302/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 304/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 306/324 [00:24<00:01, 11.80it/s]\u001b[A\n\r 95%|█████████▌| 308/324 
[00:24<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▌| 310/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▋| 312/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 97%|█████████▋| 314/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 98%|█████████▊| 316/324 [00:25<00:00, 11.80it/s]\u001b[A\n\r 98%|█████████▊| 318/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 320/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 322/324 [00:26<00:00, 11.81it/s]\u001b[A\n\r100%|██████████| 324/324 [00:26<00:00, 11.81it/s]\u001b[A\r100%|██████████| 324/324 [00:26<00:00, 12.35it/s]\n\r 60%|██████    | 3/5 [01:31<01:01, 30.61s/it]\n\r  0%|          | 0/324 [00:00<?, ?it/s]\u001b[A\n\r  5%|▍         | 16/324 [00:00<00:02, 111.38it/s]\u001b[A\n\r  9%|▊         | 28/324 [00:01<00:14, 20.70it/s] \u001b[A\n\r 10%|█         | 34/324 [00:01<00:16, 17.17it/s]\u001b[A\n\r 12%|█▏        | 38/324 [00:02<00:18, 15.74it/s]\u001b[A\n\r 13%|█▎        | 41/324 [00:02<00:19, 14.78it/s]\u001b[A\n\r 14%|█▎        | 44/324 [00:02<00:19, 14.13it/s]\u001b[A\n\r 14%|█▍        | 46/324 [00:02<00:20, 13.69it/s]\u001b[A\n\r 15%|█▍        | 48/324 [00:02<00:20, 13.29it/s]\u001b[A\n\r 15%|█▌        | 50/324 [00:03<00:21, 12.94it/s]\u001b[A\n\r 16%|█▌        | 52/324 [00:03<00:21, 12.61it/s]\u001b[A\n\r 17%|█▋        | 54/324 [00:03<00:21, 12.45it/s]\u001b[A\n\r 17%|█▋        | 56/324 [00:03<00:21, 12.28it/s]\u001b[A\n\r 18%|█▊        | 58/324 [00:03<00:21, 12.14it/s]\u001b[A\n\r 19%|█▊        | 60/324 [00:03<00:21, 12.05it/s]\u001b[A\n\r 19%|█▉        | 62/324 [00:04<00:21, 11.98it/s]\u001b[A\n\r 20%|█▉        | 64/324 [00:04<00:21, 11.93it/s]\u001b[A\n\r 20%|██        | 66/324 [00:04<00:21, 11.89it/s]\u001b[A\n\r 21%|██        | 68/324 [00:04<00:21, 11.86it/s]\u001b[A\n\r 22%|██▏       | 70/324 [00:04<00:21, 11.84it/s]\u001b[A\n\r 22%|██▏       | 72/324 [00:04<00:21, 11.79it/s]\u001b[A\n\r 23%|██▎       | 74/324 [00:05<00:21, 11.84it/s]\u001b[A\n\r 23%|██▎       | 76/324 [00:05<00:20, 11.83it/s]\u001b[A\n\r 
24%|██▍       | 78/324 [00:05<00:20, 11.82it/s]\u001b[A\n\r 25%|██▍       | 80/324 [00:05<00:20, 11.70it/s]\u001b[A\n\r 25%|██▌       | 82/324 [00:05<00:20, 11.85it/s]\u001b[A\n\r 26%|██▌       | 84/324 [00:05<00:20, 11.69it/s]\u001b[A\n\r 27%|██▋       | 86/324 [00:06<00:20, 11.88it/s]\u001b[A\n\r 27%|██▋       | 88/324 [00:06<00:20, 11.79it/s]\u001b[A\n\r 28%|██▊       | 90/324 [00:06<00:19, 11.81it/s]\u001b[A\n\r 28%|██▊       | 92/324 [00:06<00:19, 11.86it/s]\u001b[A\n\r 29%|██▉       | 94/324 [00:06<00:19, 11.84it/s]\u001b[A\n\r 30%|██▉       | 96/324 [00:06<00:19, 11.73it/s]\u001b[A\n\r 30%|███       | 98/324 [00:07<00:19, 11.80it/s]\u001b[A\n\r 31%|███       | 100/324 [00:07<00:19, 11.79it/s]\u001b[A\n\r 31%|███▏      | 102/324 [00:07<00:18, 11.84it/s]\u001b[A\n\r 32%|███▏      | 104/324 [00:07<00:18, 11.86it/s]\u001b[A\n\r 33%|███▎      | 106/324 [00:07<00:18, 11.69it/s]\u001b[A\n\r 33%|███▎      | 108/324 [00:07<00:18, 11.88it/s]\u001b[A\n\r 34%|███▍      | 110/324 [00:08<00:18, 11.85it/s]\u001b[A\n\r 35%|███▍      | 112/324 [00:08<00:18, 11.70it/s]\u001b[A\n\r 35%|███▌      | 114/324 [00:08<00:17, 11.75it/s]\u001b[A\n\r 36%|███▌      | 116/324 [00:08<00:17, 11.83it/s]\u001b[A\n\r 36%|███▋      | 118/324 [00:08<00:17, 11.88it/s]\u001b[A\n\r 37%|███▋      | 120/324 [00:08<00:17, 11.86it/s]\u001b[A\n\r 38%|███▊      | 122/324 [00:09<00:17, 11.85it/s]\u001b[A\n\r 38%|███▊      | 124/324 [00:09<00:16, 11.84it/s]\u001b[A\n\r 39%|███▉      | 126/324 [00:09<00:16, 11.83it/s]\u001b[A\n\r 40%|███▉      | 128/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 40%|████      | 130/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 41%|████      | 132/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 41%|████▏     | 134/324 [00:10<00:16, 11.81it/s]\u001b[A\n\r 42%|████▏     | 136/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 43%|████▎     | 138/324 [00:10<00:15, 11.80it/s]\u001b[A\n\r 43%|████▎     | 140/324 [00:10<00:15, 11.80it/s]\u001b[A\n\r 44%|████▍     | 142/324 [00:10<00:15, 
11.81it/s]\u001b[A\n\r 44%|████▍     | 144/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 45%|████▌     | 146/324 [00:11<00:15, 11.81it/s]\u001b[A\n\r 46%|████▌     | 148/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 46%|████▋     | 150/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 47%|████▋     | 152/324 [00:11<00:14, 11.76it/s]\u001b[A\n\r 48%|████▊     | 154/324 [00:11<00:14, 11.83it/s]\u001b[A\n\r 48%|████▊     | 156/324 [00:12<00:14, 11.82it/s]\u001b[A\n\r 49%|████▉     | 158/324 [00:12<00:14, 11.81it/s]\u001b[A\n\r 49%|████▉     | 160/324 [00:12<00:14, 11.69it/s]\u001b[A\n\r 50%|█████     | 162/324 [00:12<00:13, 11.70it/s]\u001b[A\n\r 51%|█████     | 164/324 [00:12<00:13, 11.82it/s]\u001b[A\n\r 51%|█████     | 166/324 [00:12<00:13, 11.79it/s]\u001b[A\n\r 52%|█████▏    | 168/324 [00:13<00:13, 11.82it/s]\u001b[A\n\r 52%|█████▏    | 170/324 [00:13<00:12, 11.88it/s]\u001b[A\n\r 53%|█████▎    | 172/324 [00:13<00:12, 11.86it/s]\u001b[A\n\r 54%|█████▎    | 174/324 [00:13<00:12, 11.84it/s]\u001b[A\n\r 54%|█████▍    | 176/324 [00:13<00:12, 11.83it/s]\u001b[A\n\r 55%|█████▍    | 178/324 [00:13<00:12, 11.75it/s]\u001b[A\n\r 56%|█████▌    | 180/324 [00:14<00:12, 11.71it/s]\u001b[A\n\r 56%|█████▌    | 182/324 [00:14<00:12, 11.79it/s]\u001b[A\n\r 57%|█████▋    | 184/324 [00:14<00:11, 11.82it/s]\u001b[A\n\r 57%|█████▋    | 186/324 [00:14<00:11, 11.87it/s]\u001b[A\n\r 58%|█████▊    | 188/324 [00:14<00:11, 11.85it/s]\u001b[A\n\r 59%|█████▊    | 190/324 [00:14<00:11, 11.84it/s]\u001b[A\n\r 59%|█████▉    | 192/324 [00:15<00:11, 11.83it/s]\u001b[A\n\r 60%|█████▉    | 194/324 [00:15<00:10, 11.83it/s]\u001b[A\n\r 60%|██████    | 196/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 61%|██████    | 198/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 62%|██████▏   | 200/324 [00:15<00:10, 11.78it/s]\u001b[A\n\r 62%|██████▏   | 202/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 63%|██████▎   | 204/324 [00:16<00:10, 11.82it/s]\u001b[A\n\r 64%|██████▎   | 206/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 64%|██████▍   | 208/324 
[00:16<00:09, 11.81it/s]\u001b[A\n\r 65%|██████▍   | 210/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 65%|██████▌   | 212/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 66%|██████▌   | 214/324 [00:16<00:09, 11.81it/s]\u001b[A\n\r 67%|██████▋   | 216/324 [00:17<00:09, 11.80it/s]\u001b[A\n\r 67%|██████▋   | 218/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 68%|██████▊   | 220/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 69%|██████▊   | 222/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 69%|██████▉   | 224/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 70%|██████▉   | 226/324 [00:17<00:08, 11.81it/s]\u001b[A\n\r 70%|███████   | 228/324 [00:18<00:08, 11.80it/s]\u001b[A\n\r 71%|███████   | 230/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 232/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 234/324 [00:18<00:07, 11.80it/s]\u001b[A\n\r 73%|███████▎  | 236/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 238/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 74%|███████▍  | 240/324 [00:19<00:07, 11.80it/s]\u001b[A\n\r 75%|███████▍  | 242/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 75%|███████▌  | 244/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 76%|███████▌  | 246/324 [00:19<00:06, 11.80it/s]\u001b[A\n\r 77%|███████▋  | 248/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 77%|███████▋  | 250/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 252/324 [00:20<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 254/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 79%|███████▉  | 256/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 80%|███████▉  | 258/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 80%|████████  | 260/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████  | 262/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████▏ | 264/324 [00:21<00:05, 11.80it/s]\u001b[A\n\r 82%|████████▏ | 266/324 [00:21<00:04, 11.80it/s]\u001b[A\n\r 83%|████████▎ | 268/324 [00:21<00:04, 11.80it/s]\u001b[A\n\r 83%|████████▎ | 270/324 [00:21<00:04, 11.80it/s]\u001b[A\n\r 84%|████████▍ | 272/324 [00:21<00:04, 11.80it/s]\u001b[A\n\r 
85%|████████▍ | 274/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 85%|████████▌ | 276/324 [00:22<00:04, 11.81it/s]\u001b[A\n\r 86%|████████▌ | 278/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 86%|████████▋ | 280/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 87%|████████▋ | 282/324 [00:22<00:03, 11.80it/s]\u001b[A\n\r 88%|████████▊ | 284/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 88%|████████▊ | 286/324 [00:23<00:03, 11.80it/s]\u001b[A\n\r 89%|████████▉ | 288/324 [00:23<00:03, 11.80it/s]\u001b[A\n\r 90%|████████▉ | 290/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 90%|█████████ | 292/324 [00:23<00:02, 11.80it/s]\u001b[A\n\r 91%|█████████ | 294/324 [00:23<00:02, 11.80it/s]\u001b[A\n\r 91%|█████████▏| 296/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 92%|█████████▏| 298/324 [00:24<00:02, 11.81it/s]\u001b[A\n\r 93%|█████████▎| 300/324 [00:24<00:02, 11.81it/s]\u001b[A\n\r 93%|█████████▎| 302/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 304/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 306/324 [00:24<00:01, 11.80it/s]\u001b[A\n\r 95%|█████████▌| 308/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▌| 310/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▋| 312/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 97%|█████████▋| 314/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 98%|█████████▊| 316/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 98%|█████████▊| 318/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 320/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 322/324 [00:26<00:00, 11.81it/s]\u001b[A\n\r100%|██████████| 324/324 [00:26<00:00, 11.80it/s]\u001b[A\r100%|██████████| 324/324 [00:26<00:00, 12.35it/s]\n\r 80%|████████  | 4/5 [02:02<00:30, 30.58s/it]\n\r  0%|          | 0/324 [00:00<?, ?it/s]\u001b[A\n\r  5%|▍         | 16/324 [00:00<00:02, 112.96it/s]\u001b[A\n\r  9%|▊         | 28/324 [00:01<00:14, 20.80it/s] \u001b[A\n\r 10%|█         | 34/324 [00:01<00:16, 17.25it/s]\u001b[A\n\r 12%|█▏        | 38/324 [00:02<00:18, 15.76it/s]\u001b[A\n\r 13%|█▎        | 
41/324 [00:02<00:19, 14.88it/s]\u001b[A\n\r 14%|█▎        | 44/324 [00:02<00:19, 14.13it/s]\u001b[A\n\r 14%|█▍        | 46/324 [00:02<00:20, 13.66it/s]\u001b[A\n\r 15%|█▍        | 48/324 [00:02<00:20, 13.30it/s]\u001b[A\n\r 15%|█▌        | 50/324 [00:03<00:21, 12.89it/s]\u001b[A\n\r 16%|█▌        | 52/324 [00:03<00:21, 12.69it/s]\u001b[A\n\r 17%|█▋        | 54/324 [00:03<00:21, 12.36it/s]\u001b[A\n\r 17%|█▋        | 56/324 [00:03<00:21, 12.31it/s]\u001b[A\n\r 18%|█▊        | 58/324 [00:03<00:21, 12.17it/s]\u001b[A\n\r 19%|█▊        | 60/324 [00:03<00:21, 12.07it/s]\u001b[A\n\r 19%|█▉        | 62/324 [00:04<00:21, 12.00it/s]\u001b[A\n\r 20%|█▉        | 64/324 [00:04<00:21, 11.94it/s]\u001b[A\n\r 20%|██        | 66/324 [00:04<00:21, 11.91it/s]\u001b[A\n\r 21%|██        | 68/324 [00:04<00:21, 11.88it/s]\u001b[A\n\r 22%|██▏       | 70/324 [00:04<00:21, 11.86it/s]\u001b[A\n\r 22%|██▏       | 72/324 [00:04<00:21, 11.85it/s]\u001b[A\n\r 23%|██▎       | 74/324 [00:05<00:21, 11.75it/s]\u001b[A\n\r 23%|██▎       | 76/324 [00:05<00:20, 11.85it/s]\u001b[A\n\r 24%|██▍       | 78/324 [00:05<00:20, 11.73it/s]\u001b[A\n\r 25%|██▍       | 80/324 [00:05<00:20, 11.86it/s]\u001b[A\n\r 25%|██▌       | 82/324 [00:05<00:20, 11.85it/s]\u001b[A\n\r 26%|██▌       | 84/324 [00:05<00:20, 11.84it/s]\u001b[A\n\r 27%|██▋       | 86/324 [00:06<00:20, 11.83it/s]\u001b[A\n\r 27%|██▋       | 88/324 [00:06<00:19, 11.82it/s]\u001b[A\n\r 28%|██▊       | 90/324 [00:06<00:19, 11.79it/s]\u001b[A\n\r 28%|██▊       | 92/324 [00:06<00:19, 11.71it/s]\u001b[A\n\r 29%|██▉       | 94/324 [00:06<00:19, 11.79it/s]\u001b[A\n\r 30%|██▉       | 96/324 [00:06<00:19, 11.86it/s]\u001b[A\n\r 30%|███       | 98/324 [00:07<00:19, 11.83it/s]\u001b[A\n\r 31%|███       | 100/324 [00:07<00:19, 11.71it/s]\u001b[A\n\r 31%|███▏      | 102/324 [00:07<00:19, 11.68it/s]\u001b[A\n\r 32%|███▏      | 104/324 [00:07<00:18, 11.91it/s]\u001b[A\n\r 33%|███▎      | 106/324 [00:07<00:18, 11.88it/s]\u001b[A\n\r 33%|███▎      | 108/324 
[00:07<00:18, 11.86it/s]\u001b[A\n\r 34%|███▍      | 110/324 [00:08<00:18, 11.82it/s]\u001b[A\n\r 35%|███▍      | 112/324 [00:08<00:17, 11.84it/s]\u001b[A\n\r 35%|███▌      | 114/324 [00:08<00:17, 11.81it/s]\u001b[A\n\r 36%|███▌      | 116/324 [00:08<00:17, 11.84it/s]\u001b[A\n\r 36%|███▋      | 118/324 [00:08<00:17, 11.83it/s]\u001b[A\n\r 37%|███▋      | 120/324 [00:08<00:17, 11.83it/s]\u001b[A\n\r 38%|███▊      | 122/324 [00:09<00:17, 11.83it/s]\u001b[A\n\r 38%|███▊      | 124/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 39%|███▉      | 126/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 40%|███▉      | 128/324 [00:09<00:16, 11.82it/s]\u001b[A\n\r 40%|████      | 130/324 [00:09<00:16, 11.81it/s]\u001b[A\n\r 41%|████      | 132/324 [00:09<00:16, 11.81it/s]\u001b[A\n\r 41%|████▏     | 134/324 [00:10<00:16, 11.81it/s]\u001b[A\n\r 42%|████▏     | 136/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 43%|████▎     | 138/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 43%|████▎     | 140/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 44%|████▍     | 142/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 44%|████▍     | 144/324 [00:10<00:15, 11.81it/s]\u001b[A\n\r 45%|████▌     | 146/324 [00:11<00:15, 11.81it/s]\u001b[A\n\r 46%|████▌     | 148/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 46%|████▋     | 150/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 47%|████▋     | 152/324 [00:11<00:14, 11.80it/s]\u001b[A\n\r 48%|████▊     | 154/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 48%|████▊     | 156/324 [00:11<00:14, 11.81it/s]\u001b[A\n\r 49%|████▉     | 158/324 [00:12<00:14, 11.81it/s]\u001b[A\n\r 49%|████▉     | 160/324 [00:12<00:14, 11.69it/s]\u001b[A\n\r 50%|█████     | 162/324 [00:12<00:13, 11.77it/s]\u001b[A\n\r 51%|█████     | 164/324 [00:12<00:13, 11.85it/s]\u001b[A\n\r 51%|█████     | 166/324 [00:12<00:13, 11.84it/s]\u001b[A\n\r 52%|█████▏    | 168/324 [00:13<00:13, 11.83it/s]\u001b[A\n\r 52%|█████▏    | 170/324 [00:13<00:13, 11.82it/s]\u001b[A\n\r 53%|█████▎    | 172/324 [00:13<00:13, 11.62it/s]\u001b[A\n\r 54%|█████▎ 
   | 174/324 [00:13<00:12, 11.88it/s]\u001b[A\n\r 54%|█████▍    | 176/324 [00:13<00:12, 11.86it/s]\u001b[A\n\r 55%|█████▍    | 178/324 [00:13<00:12, 11.80it/s]\u001b[A\n\r 56%|█████▌    | 180/324 [00:14<00:12, 11.85it/s]\u001b[A\n\r 56%|█████▌    | 182/324 [00:14<00:11, 11.84it/s]\u001b[A\n\r 57%|█████▋    | 184/324 [00:14<00:11, 11.83it/s]\u001b[A\n\r 57%|█████▋    | 186/324 [00:14<00:11, 11.83it/s]\u001b[A\n\r 58%|█████▊    | 188/324 [00:14<00:11, 11.82it/s]\u001b[A\n\r 59%|█████▊    | 190/324 [00:14<00:11, 11.82it/s]\u001b[A\n\r 59%|█████▉    | 192/324 [00:15<00:11, 11.82it/s]\u001b[A\n\r 60%|█████▉    | 194/324 [00:15<00:11, 11.81it/s]\u001b[A\n\r 60%|██████    | 196/324 [00:15<00:10, 11.81it/s]\u001b[A\n\r 61%|██████    | 198/324 [00:15<00:10, 11.80it/s]\u001b[A\n\r 62%|██████▏   | 200/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 62%|██████▏   | 202/324 [00:15<00:10, 11.82it/s]\u001b[A\n\r 63%|██████▎   | 204/324 [00:16<00:10, 11.82it/s]\u001b[A\n\r 64%|██████▎   | 206/324 [00:16<00:10, 11.66it/s]\u001b[A\n\r 64%|██████▍   | 208/324 [00:16<00:09, 11.73it/s]\u001b[A\n\r 65%|██████▍   | 210/324 [00:16<00:09, 11.88it/s]\u001b[A\n\r 65%|██████▌   | 212/324 [00:16<00:09, 11.86it/s]\u001b[A\n\r 66%|██████▌   | 214/324 [00:16<00:09, 11.85it/s]\u001b[A\n\r 67%|██████▋   | 216/324 [00:17<00:09, 11.84it/s]\u001b[A\n\r 67%|██████▋   | 218/324 [00:17<00:08, 11.83it/s]\u001b[A\n\r 68%|██████▊   | 220/324 [00:17<00:08, 11.83it/s]\u001b[A\n\r 69%|██████▊   | 222/324 [00:17<00:08, 11.82it/s]\u001b[A\n\r 69%|██████▉   | 224/324 [00:17<00:08, 11.82it/s]\u001b[A\n\r 70%|██████▉   | 226/324 [00:17<00:08, 11.82it/s]\u001b[A\n\r 70%|███████   | 228/324 [00:18<00:08, 11.81it/s]\u001b[A\n\r 71%|███████   | 230/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 232/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 72%|███████▏  | 234/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 236/324 [00:18<00:07, 11.81it/s]\u001b[A\n\r 73%|███████▎  | 238/324 [00:18<00:07, 
11.81it/s]\u001b[A\n\r 74%|███████▍  | 240/324 [00:19<00:07, 11.81it/s]\u001b[A\n\r 75%|███████▍  | 242/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 75%|███████▌  | 244/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 76%|███████▌  | 246/324 [00:19<00:06, 11.81it/s]\u001b[A\n\r 77%|███████▋  | 248/324 [00:19<00:06, 11.80it/s]\u001b[A\n\r 77%|███████▋  | 250/324 [00:19<00:06, 11.80it/s]\u001b[A\n\r 78%|███████▊  | 252/324 [00:20<00:06, 11.81it/s]\u001b[A\n\r 78%|███████▊  | 254/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 79%|███████▉  | 256/324 [00:20<00:05, 11.80it/s]\u001b[A\n\r 80%|███████▉  | 258/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 80%|████████  | 260/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████  | 262/324 [00:20<00:05, 11.81it/s]\u001b[A\n\r 81%|████████▏ | 264/324 [00:21<00:05, 11.81it/s]\u001b[A\n\r 82%|████████▏ | 266/324 [00:21<00:04, 11.82it/s]\u001b[A\n\r 83%|████████▎ | 268/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 83%|████████▎ | 270/324 [00:21<00:04, 11.81it/s]\u001b[A\n\r 84%|████████▍ | 272/324 [00:21<00:04, 11.82it/s]\u001b[A\n\r 85%|████████▍ | 274/324 [00:21<00:04, 11.82it/s]\u001b[A\n\r 85%|████████▌ | 276/324 [00:22<00:04, 11.81it/s]\u001b[A\n\r 86%|████████▌ | 278/324 [00:22<00:03, 11.82it/s]\u001b[A\n\r 86%|████████▋ | 280/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 87%|████████▋ | 282/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 88%|████████▊ | 284/324 [00:22<00:03, 11.81it/s]\u001b[A\n\r 88%|████████▊ | 286/324 [00:23<00:03, 11.81it/s]\u001b[A\n\r 89%|████████▉ | 288/324 [00:23<00:03, 11.81it/s]\u001b[A\n\r 90%|████████▉ | 290/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 90%|█████████ | 292/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 91%|█████████ | 294/324 [00:23<00:02, 11.81it/s]\u001b[A\n\r 91%|█████████▏| 296/324 [00:23<00:02, 11.82it/s]\u001b[A\n\r 92%|█████████▏| 298/324 [00:24<00:02, 11.81it/s]\u001b[A\n\r 93%|█████████▎| 300/324 [00:24<00:02, 11.81it/s]\u001b[A\n\r 93%|█████████▎| 302/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 304/324 
[00:24<00:01, 11.81it/s]\u001b[A\n\r 94%|█████████▍| 306/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 95%|█████████▌| 308/324 [00:24<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▌| 310/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 96%|█████████▋| 312/324 [00:25<00:01, 11.81it/s]\u001b[A\n\r 97%|█████████▋| 314/324 [00:25<00:00, 11.82it/s]\u001b[A\n\r 98%|█████████▊| 316/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 98%|█████████▊| 318/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 320/324 [00:25<00:00, 11.81it/s]\u001b[A\n\r 99%|█████████▉| 322/324 [00:26<00:00, 11.81it/s]\u001b[A\n\r100%|██████████| 324/324 [00:26<00:00, 11.81it/s]\u001b[A\r100%|██████████| 324/324 [00:26<00:00, 12.36it/s]\r100%|██████████| 5/5 [02:32<00:00, 30.30s/it]\r100%|██████████| 5/5 [02:32<00:00, 30.43s/it]\n"
  },
  {
    "path": "docs/sample_logs/merge_sub_modules.txt",
    "content": "fg test eval: tensor([[0.6409, 0.6197, 0.5834, 0.5288]], device='cuda:0')\nbg test eval: tensor([[ 0.2126,  0.2257,  0.1091, 55.9907]], device='cuda:0')\n"
  },
  {
    "path": "docs/weekly_nerf.md",
    "content": "\nWeekly Classified Neural Radiance Fields ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n=========================================================================================================================================================\n## Filter by classes: \n [all](./weekly_nerf.md) | [dynamic](./classified_weekly_nerf/dynamic.md) | [editing](./classified_weekly_nerf/editing.md) | [fast](./classified_weekly_nerf/fast.md) | [generalization](./classified_weekly_nerf/generalization.md) | [human](./classified_weekly_nerf/human.md) | [video](./classified_weekly_nerf/video.md) | [lighting](./classified_weekly_nerf/lighting.md) | [reconstruction](./classified_weekly_nerf/reconstruction.md) | [texture](./classified_weekly_nerf/texture.md) | [semantic](./classified_weekly_nerf/semantic.md) | [pose-slam](./classified_weekly_nerf/pose-slam.md) | [others](./classified_weekly_nerf/others.md) \n## Dec27 - Jan3, 2023\n  - [Boosting UAV Tracking with Voxel-Based Trajectory-Aware Pre-Training, RAL2022](https://ieeexplore.ieee.org/abstract/document/10015867) | [code]\n    > Siamese network-based object tracking has remarkably promoted the automatic capability for highly-maneuvered unmanned aerial vehicles (UAVs). However, the leading-edge tracking framework often depends on template matching, making it trapped when facing multiple views of object in consecutive frames. Moreover, the general image-level pretrained backbone can overfit to holistic representations, causing the misalignment to learn object-level properties in UAV tracking. To tackle these issues, this work presents TRTrack , a comprehensive framework to fully exploit the stereoscopic representation for UAV tracking. Specifically, a novel pre-training paradigm method is proposed. 
Through trajectory-aware reconstruction training (TRT), the capability of the backbone to extract stereoscopic structure feature is strengthened without any parameter increment. Accordingly, an innovative hierarchical self-attention Transformer is proposed to capture the local detail information and global structure knowledge. For optimizing the correlation map, we proposed a novel spatial correlation refinement (SCR) module, which promotes the capability of modeling the long-range spatial dependencies. Comprehensive experiments on three UAV challenging benchmarks demonstrate that the proposed TRTrack achieves superior UAV tracking performance in both precision and efficiency. Quantitative tests in real-world settings fully prove the effectiveness of our work.\n## Dec25 - Dec31, 2022\n  - [Neural Radiance Fields from Sparse RGB-D Images for High-Quality View Synthesis, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9999509) | [code]\n    > The recently proposed neural radiance fields (NeRF) use a continuous function formulated as a multi-layer perceptron (MLP) to model the appearance and geometry of a 3D scene. This enables realistic synthesis of novel views, even for scenes with view dependent appearance. Many follow-up works have since extended NeRFs in different ways. However, a fundamental restriction of the method remains that it requires a large number of images captured from densely placed viewpoints for high-quality synthesis and the quality of the results quickly degrades when the number of captured views is insufficient. To address this problem, we propose a novel NeRF-based framework capable of high-quality view synthesis using only a sparse set of RGB-D images, which can be easily captured using cameras and LiDAR sensors on current consumer devices. First, a geometric proxy of the scene is reconstructed from the captured RGB-D images. 
Renderings of the reconstructed scene along with precise camera parameters can then be used to pre-train a network. Finally, the network is fine-tuned with a small number of real captured images. We further introduce a patch discriminator to supervise the network under novel views during fine-tuning, as well as a 3D color prior to improve synthesis quality. We demonstrate that our method can generate arbitrary novel views of a 3D scene from as few as 6 RGB-D images. Extensive experiments show the improvements of our method compared with the existing NeRF-based methods, including approaches that also aim to reduce the number of input images.\n## Dec18 - Dec24, 2022\n  - [Removing Objects From Neural Radiance Fields](https://arxiv.org/abs/2212.11966) | [code]\n    > Neural Radiance Fields (NeRFs) are emerging as a ubiquitous scene representation that allows for novel view synthesis. Increasingly, NeRFs will be shareable with other people. Before sharing a NeRF, though, it might be desirable to remove personal information or unsightly objects. Such removal is not easily achieved with the current NeRF editing frameworks. We propose a framework to remove objects from a NeRF representation created from an RGB-D sequence. Our NeRF inpainting method leverages recent work in 2D image inpainting and is guided by a user-provided mask. Our algorithm is underpinned by a confidence based view selection procedure. It chooses which of the individual 2D inpainted images to use in the creation of the NeRF, so that the resulting inpainted NeRF is 3D consistent. We show that our method for NeRF editing is effective for synthesizing plausible inpaintings in a multi-view coherent manner. 
We validate our approach using a new and still-challenging dataset for the task of NeRF inpainting.\n  - [iLabel: Revealing Objects in Neural Fields, RAL2022](https://ieeexplore.ieee.org/abstract/document/9996585) | [code]\n    > A neural field trained with self-supervision to efficiently represent the geometry and colour of a 3D scene tends to automatically decompose it into coherent and accurate object-like regions, which can be revealed with sparse labelling interactions to produce a 3D semantic scene segmentation. Our real-time iLabel system takes input from a hand-held RGB-D camera, requires zero prior training data, and works in an ‘open set’ manner, with semantic classes defined on the fly by the user. iLabel's underlying model is a simple multilayer perceptron (MLP), trained from scratch to learn a neural representation of a single 3D scene. The model is updated continually and visualised in real-time, allowing the user to focus interactions to achieve extremely efficient semantic segmentation. A room-scale scene can be accurately labelled into 10+ semantic categories with around 100 clicks, taking less than 5 minutes. Quantitative labelling accuracy scales powerfully with the number of clicks, and rapidly surpasses standard pre-trained semantic segmentation methods. We also demonstrate a hierarchical labelling variant of iLabel and a ‘hands-free’ mode where the user only needs to supply label names for automatically-generated locations.\n  - [Masked Wavelet Representation for Compact Neural Radiance Fields](https://arxiv.org/abs/2212.09069) | [***``[code]``***](https://github.com/daniel03c1/masked_wavelet_nerf)\n    > Neural radiance fields (NeRF) have demonstrated the potential of coordinate-based neural representation (neural fields or implicit neural representation) in neural rendering. However, using a multi-layer perceptron (MLP) to represent a 3D scene or object requires enormous computational resources and time. 
There have been recent studies on how to reduce these computational inefficiencies by using additional data structures, such as grids or trees. Despite the promising performance, the explicit data structure necessitates a substantial amount of memory. In this work, we present a method to reduce the size without compromising the advantages of having additional data structures. In detail, we propose using the wavelet transform on grid-based neural fields. Grid-based neural fields are for fast convergence, and the wavelet transform, whose efficiency has been demonstrated in high-performance standard codecs, is to improve the parameter efficiency of grids. Furthermore, in order to achieve a higher sparsity of grid coefficients while maintaining reconstruction quality, we present a novel trainable masking approach. Experimental results demonstrate that non-spatial grid coefficients, such as wavelet coefficients, are capable of attaining a higher level of sparsity than spatial grid coefficients, resulting in a more compact representation. With our proposed mask and compression pipeline, we achieved state-of-the-art performance within a memory budget of 2 MB. Our code is available at this https URL.\n## Dec11 - Dec17, 2022\n  - [NeRF-Art: Text-Driven Neural Radiance Fields Stylization](https://arxiv.org/abs/2212.08070) | [***``[code]``***](https://cassiepython.github.io/nerfart/)\n    > As a powerful representation of 3D scenes, the neural radiance field (NeRF) enables high-quality novel view synthesis from multi-view images. Stylizing NeRF, however, remains challenging, especially on simulating a text-guided style with both the appearance and the geometry altered simultaneously. In this paper, we present NeRF-Art, a text-guided NeRF stylization approach that manipulates the style of a pre-trained NeRF model with a simple text prompt. 
Unlike previous approaches that either lack sufficient geometry deformations and texture details or require meshes to guide the stylization, our method can shift a 3D scene to the target style characterized by desired geometry and appearance variations without any mesh guidance. This is achieved by introducing a novel global-local contrastive learning strategy, combined with the directional constraint to simultaneously control both the trajectory and the strength of the target style. Moreover, we adopt a weight regularization method to effectively suppress cloudy artifacts and geometry noises which arise easily when the density field is transformed during geometry stylization. Through extensive experiments on various styles, we demonstrate that our method is effective and robust regarding both single-view stylization quality and cross-view consistency. The code and more results can be found in our project page: this https URL.\n## Dec4 - Dec10, 2022\n  - [4K-NeRF: High Fidelity Neural Radiance Fields at Ultra High Resolutions](https://arxiv.org/abs/2212.04701) | [***``[code]``***](https://github.com/frozoul/4K-NeRF)\n    > In this paper, we present a novel and effective framework, named 4K-NeRF, to pursue high fidelity view synthesis on the challenging scenarios of ultra high resolutions, building on the methodology of neural radiance fields (NeRF). The rendering procedure of NeRF-based methods typically relies on a pixel wise manner in which rays (or pixels) are treated independently on both training and inference phases, limiting its representational ability on describing subtle details especially when lifting to a extremely high resolution. We address the issue by better exploring ray correlation for enhancing high-frequency details benefiting from the use of geometry-aware local context. 
Particularly, we use the view-consistent encoder to model geometric information effectively in a lower resolution space and recover fine details through the view-consistent decoder, conditioned on ray features and depths estimated by the encoder. Joint training with patch-based sampling further facilitates our method incorporating the supervision from perception oriented regularization beyond pixel wise loss. Quantitative and qualitative comparisons with modern NeRF methods demonstrate that our method can significantly boost rendering quality for retaining high-frequency details, achieving the state-of-the-art visual quality on 4K ultra-high-resolution scenario. Code Available at \\url{this https URL}\n  - [Diffusion Guided Domain Adaptation of Image Generators](https://arxiv.org/abs/2212.04473) | [code]\n    > Can a text-to-image diffusion model be used as a training objective for adapting a GAN generator to another domain? In this paper, we show that the classifier-free guidance can be leveraged as a critic and enable generators to distill knowledge from large-scale text-to-image diffusion models. Generators can be efficiently shifted into new domains indicated by text prompts without access to groundtruth samples from target domains. We demonstrate the effectiveness and controllability of our method through extensive experiments. Although not trained to minimize CLIP loss, our model achieves equally high CLIP scores and significantly lower FID than prior work on short prompts, and outperforms the baseline qualitatively and quantitatively on long and complicated prompts. To our best knowledge, the proposed method is the first attempt at incorporating large-scale pre-trained diffusion models and distillation sampling for text-driven image generator domain adaptation and gives a quality previously beyond possible. 
Moreover, we extend our work to 3D-aware style-based generators and DreamBooth guidance.\n  - [Ref-NPR: Reference-Based Non-Photorealistic Radiance Fields](https://arxiv.org/abs/2212.02766) | [code]\n    > Existing 3D scene stylization methods employ an arbitrary style reference to transfer textures and colors as styles without establishing meaningful semantic correspondences. We present Reference-Based Non-Photorealistic Radiance Fields, i.e., Ref-NPR. It is a controllable scene stylization method utilizing radiance fields to stylize a 3D scene, with a single stylized 2D view taken as reference. To achieve decent results, we propose a ray registration process based on the stylized reference view to obtain pseudo-ray supervision in novel views, and exploit the semantic correspondence in content images to fill occluded regions with perceptually similar styles. Combining these operations, Ref-NPR generates non-photorealistic and continuous novel view sequences with a single reference while obtaining reasonable stylization in occluded regions. Experiments show that Ref-NPR significantly outperforms other scene and video stylization methods in terms of both visual quality and semantic correspondence. Code and data will be made publicly available.\n  - [NeRDi: Single-View NeRF Synthesis with Language-Guided Diffusion as General Image Priors](https://arxiv.org/abs/2212.03267) | [code]\n    > 2D-to-3D reconstruction is an ill-posed problem, yet humans are good at solving this problem due to their prior knowledge of the 3D world developed over years. Driven by this observation, we propose NeRDi, a single-view NeRF synthesis framework with general image priors from 2D diffusion models. Formulating single-view reconstruction as an image-conditioned 3D generation problem, we optimize the NeRF representations by minimizing a diffusion loss on its arbitrary view renderings with a pretrained image diffusion model under the input-view constraint. 
We leverage off-the-shelf vision-language models and introduce a two-section language guidance as conditioning inputs to the diffusion model. This is essentially helpful for improving multiview content coherence as it narrows down the general image prior conditioned on the semantic and visual features of the single-view input image. Additionally, we introduce a geometric loss based on estimated depth maps to regularize the underlying 3D geometry of the NeRF. Experimental results on the DTU MVS dataset show that our method can synthesize novel views with higher quality even compared to existing methods trained on this dataset. We also demonstrate our generalizability in zero-shot NeRF synthesis for in-the-wild images.\n  - [GARF:Geometry-Aware Generalized Neural Radiance Field](https://arxiv.org/abs/2212.02280) | [code]\n    > Neural Radiance Field (NeRF) has revolutionized free viewpoint rendering tasks and achieved impressive results. However, the efficiency and accuracy problems hinder its wide applications. To address these issues, we propose Geometry-Aware Generalized Neural Radiance Field (GARF) with a geometry-aware dynamic sampling (GADS) strategy to perform real-time novel view rendering and unsupervised depth estimation on unseen scenes without per-scene optimization. Distinct from most existing generalized NeRFs, our framework infers the unseen scenes on both pixel-scale and geometry-scale with only a few input images. More specifically, our method learns common attributes of novel-view synthesis by an encoder-decoder structure and a point-level learnable multi-view feature fusion module which helps avoid occlusion. To preserve scene characteristics in the generalized model, we introduce an unsupervised depth estimation module to derive the coarse geometry, narrow down the ray sampling interval to proximity space of the estimated surface and sample in expectation maximum position, constituting Geometry-Aware Dynamic Sampling strategy (GADS). 
Moreover, we introduce a Multi-level Semantic Consistency loss (MSC) to assist more informative representation learning. Extensive experiments on indoor and outdoor datasets show that comparing with state-of-the-art generalized NeRF methods, GARF reduces samples by more than 25\\%, while improving rendering quality and 3D geometry estimation.\n  - [Fast and Lightweight Scene Regressor for Camera Relocalization](https://arxiv.org/abs/2212.01830) | [***``[code]``***](https://github.com/aislab/feat2map)\n    > Camera relocalization involving a prior 3D reconstruction plays a crucial role in many mixed reality and robotics applications. Estimating the camera pose directly with respect to pre-built 3D models can be prohibitively expensive for several applications with limited storage and/or communication bandwidth. Although recent scene and absolute pose regression methods have become popular for efficient camera localization, most of them are computation-resource intensive and difficult to obtain a real-time inference with high accuracy constraints. This study proposes a simple scene regression method that requires only a multi-layer perceptron network for mapping scene coordinates to achieve accurate camera pose estimations. The proposed approach uses sparse descriptors to regress the scene coordinates, instead of a dense RGB image. The use of sparse features provides several advantages. First, the proposed regressor network is substantially smaller than those reported in previous studies. This makes our system highly efficient and scalable. Second, the pre-built 3D models provide the most reliable and robust 2D-3D matches. Therefore, learning from them can lead to an awareness of equivalent features and substantially improve the generalization performance. A detailed analysis of our approach and extensive evaluations using existing datasets are provided to support the proposed method. 
The implementation detail is available at this https URL\n## Nov27 - Dec3, 2022\n  - [StegaNeRF: Embedding Invisible Information within Neural Radiance Fields](https://arxiv.org/abs/2212.01602) | [***``[code]``***](https://github.com/XGGNet/StegaNeRF)\n    > Recent advances in neural rendering imply a future of widespread visual data distributions through sharing NeRF model weights. However, while common visual data (images and videos) have standard approaches to embed ownership or copyright information explicitly or subtly, the problem remains unexplored for the emerging NeRF format. We present StegaNeRF, a method for steganographic information embedding in NeRF renderings. We design an optimization framework allowing accurate hidden information extractions from images rendered by NeRF, while preserving its original visual quality. We perform experimental evaluations of our method under several potential deployment scenarios, and we further discuss the insights discovered through our analysis. StegaNeRF signifies an initial exploration into the novel problem of instilling customizable, imperceptible, and recoverable information to NeRF renderings, with minimal impact to rendered images. Project page: this https URL.\n  - [QFF: Quantized Fourier Features for Neural Field Representations](https://arxiv.org/abs/2212.00914) | [code]\n    > Multilayer perceptrons (MLPs) learn high frequencies slowly. Recent approaches encode features in spatial bins to improve speed of learning details, but at the cost of larger model size and loss of continuity. Instead, we propose to encode features in bins of Fourier features that are commonly used for positional encoding. We call these Quantized Fourier Features (QFF). 
As a naturally multiresolution and periodic representation, our experiments show that using QFF can result in smaller model size, faster training, and better quality outputs for several applications, including Neural Image Representations (NIR), Neural Radiance Field (NeRF) and Signed Distance Function (SDF) modeling. QFF are easy to code, fast to compute, and serve as a simple drop-in addition to many neural field representations.\n  - [3D-TOGO: Towards Text-Guided Cross-Category 3D Object Generation, AAAI2023](https://arxiv.org/abs/2212.01103) | [code]\n    > Text-guided 3D object generation aims to generate 3D objects described by user-defined captions, which paves a flexible way to visualize what we imagined. Although some works have been devoted to solving this challenging task, these works either utilize some explicit 3D representations (e.g., mesh), which lack texture and require post-processing for rendering photo-realistic views; or require individual time-consuming optimization for every single case. Here, we make the first attempt to achieve generic text-guided cross-category 3D object generation via a new 3D-TOGO model, which integrates a text-to-views generation module and a views-to-3D generation module. The text-to-views generation module is designed to generate different views of the target 3D object given an input caption. Prior-guidance, caption-guidance and view contrastive learning are proposed for achieving better view-consistency and caption similarity. Meanwhile, a pixelNeRF model is adopted for the views-to-3D generation module to obtain the implicit 3D neural representation from the previously-generated views. Our 3D-TOGO model generates 3D objects in the form of the neural radiance field with good texture and requires no time-cost optimization for every single caption. Besides, 3D-TOGO can control the category, color and shape of generated 3D objects with the input caption. 
Extensive experiments on the largest 3D object dataset (i.e., ABO) are conducted to verify that 3D-TOGO can better generate high-quality 3D objects according to the input captions across 98 different categories, in terms of PSNR, SSIM, LPIPS and CLIP-score, compared with text-NeRF and Dreamfields.\n  - [LatentSwap3D: Semantic Edits on 3D Image GANs](https://arxiv.org/abs/2212.01381) | [***``[code]``***](https://github.com/enisimsar/latentswap3d)\n    > Recent 3D-aware GANs rely on volumetric rendering techniques to disentangle the pose and appearance of objects, de facto generating entire 3D volumes rather than single-view 2D images from a latent code. Complex image editing tasks can be performed in standard 2D-based GANs (e.g., StyleGAN models) as manipulation of latent dimensions. However, to the best of our knowledge, similar properties have only been partially explored for 3D-aware GAN models. This work aims to fill this gap by showing the limitations of existing methods and proposing LatentSwap3D, a model-agnostic approach designed to enable attribute editing in the latent space of pre-trained 3D-aware GANs. We first identify the most relevant dimensions in the latent space of the model controlling the targeted attribute by relying on the feature importance ranking of a random forest classifier. Then, to apply the transformation, we swap the top-K most relevant latent dimensions of the image being edited with an image exhibiting the desired attribute. Despite its simplicity, LatentSwap3D provides remarkable semantic edits in a disentangled manner and outperforms alternative approaches both qualitatively and quantitatively. We demonstrate our semantic edit approach on various 3D-aware generative models such as pi-GAN, GIRAFFE, StyleSDF, MVCGAN, EG3D and VolumeGAN, and on diverse datasets, such as FFHQ, AFHQ, Cats, MetFaces, and CompCars. 
The project page can be found: \\url{this https URL}.\n  - [DiffRF: Rendering-Guided 3D Radiance Field Diffusion](https://arxiv.org/abs/2212.01206) | [code]\n    > We introduce DiffRF, a novel approach for 3D radiance field synthesis based on denoising diffusion probabilistic models. While existing diffusion-based methods operate on images, latent codes, or point cloud data, we are the first to directly generate volumetric radiance fields. To this end, we propose a 3D denoising model which directly operates on an explicit voxel grid representation. However, as radiance fields generated from a set of posed images can be ambiguous and contain artifacts, obtaining ground truth radiance field samples is non-trivial. We address this challenge by pairing the denoising formulation with a rendering loss, enabling our model to learn a deviated prior that favours good image quality instead of trying to replicate fitting errors like floating artifacts. In contrast to 2D-diffusion models, our model learns multi-view consistent priors, enabling free-view synthesis and accurate shape generation. Compared to 3D GANs, our diffusion-based approach naturally enables conditional generation such as masked completion or single-view 3D synthesis at inference time.\n  - [NeuWigs: A Neural Dynamic Model for Volumetric Hair Capture and Animation](https://arxiv.org/abs/2212.00613) | [code]\n    > The capture and animation of human hair are two of the major challenges in the creation of realistic avatars for the virtual reality. Both problems are highly challenging, because hair has complex geometry and appearance, as well as exhibits challenging motion. In this paper, we present a two-stage approach that models hair independently from the head to address these challenges in a data-driven manner. The first stage, state compression, learns a low-dimensional latent space of 3D hair states containing motion and appearance, via a novel autoencoder-as-a-tracker strategy. 
To better disentangle the hair and head in appearance learning, we employ multi-view hair segmentation masks in combination with a differentiable volumetric renderer. The second stage learns a novel hair dynamics model that performs temporal hair transfer based on the discovered latent codes. To enforce higher stability while driving our dynamics model, we employ the 3D point-cloud autoencoder from the compression stage for de-noising of the hair state. Our model outperforms the state of the art in novel view synthesis and is capable of creating novel hair animations without having to rely on hair observations as a driving signal. Project page is here this https URL.\n  - [SparseFusion: Distilling View-conditioned Diffusion for 3D Reconstruction](https://arxiv.org/abs/2212.00792) | [code]\n    > We propose SparseFusion, a sparse view 3D reconstruction approach that unifies recent advances in neural rendering and probabilistic image generation. Existing approaches typically build on neural rendering with re-projected features but fail to generate unseen regions or handle uncertainty under large viewpoint changes. Alternate methods treat this as a (probabilistic) 2D synthesis task, and while they can generate plausible 2D images, they do not infer a consistent underlying 3D. However, we find that this trade-off between 3D consistency and probabilistic image generation does not need to exist. In fact, we show that geometric consistency and generative inference can be complementary in a mode-seeking behavior. By distilling a 3D consistent scene representation from a view-conditioned latent diffusion model, we are able to recover a plausible 3D representation whose renderings are both accurate and realistic. 
We evaluate our approach across 51 categories in the CO3D dataset and show that it outperforms existing methods, in both distortion and perception metrics, for sparse-view novel view synthesis.\n  - [Mixed Neural Voxels for Fast Multi-view Video Synthesis](https://arxiv.org/abs/2212.00190) | [code]\n    > Synthesizing high-fidelity videos from real-world multi-view input is challenging because of the complexities of real-world environments and highly dynamic motions. Previous works based on neural radiance fields have demonstrated high-quality reconstructions of dynamic scenes. However, training such models on real-world scenes is time-consuming, usually taking days or weeks. In this paper, we present a novel method named MixVoxels to better represent the dynamic scenes with fast training speed and competitive rendering qualities. The proposed MixVoxels represents the 4D dynamic scenes as a mixture of static and dynamic voxels and processes them with different networks. In this way, the computation of the required modalities for static voxels can be processed by a lightweight model, which essentially reduces the amount of computation, especially for many daily dynamic scenes dominated by the static background. To separate the two kinds of voxels, we propose a novel variation field to estimate the temporal variance of each voxel. For the dynamic voxels, we design an inner-product time query method to efficiently query multiple time steps, which is essential to recover the high-dynamic motions. As a result, with 15 minutes of training for dynamic scenes with inputs of 300-frame videos, MixVoxels achieves better PSNR than previous methods. Codes and trained models are available at this https URL\n  - [Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation](https://arxiv.org/abs/2212.00774) | [code]\n    > A diffusion model learns to predict a vector field of gradients. 
We propose to apply chain rule on the learned gradients, and back-propagate the score of a diffusion model through the Jacobian of a differentiable renderer, which we instantiate to be a voxel radiance field. This setup aggregates 2D scores at multiple camera viewpoints into a 3D score, and repurposes a pretrained 2D model for 3D data generation. We identify a technical challenge of distribution mismatch that arises in this application, and propose a novel estimation mechanism to resolve it. We run our algorithm on several off-the-shelf diffusion image generative models, including the recently released Stable Diffusion trained on the large-scale LAION dataset.\n  - [Neural Subspaces for Light Fields, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9968104) | [code]\n    > We introduce a framework for compactly representing light field content with the novel concept of neural subspaces. While the recently proposed neural light field representation achieves great compression results by encoding a light field into a single neural network, the unified design is not optimized for the composite structures exhibited in light fields. Moreover, encoding every part of the light field into one network is not ideal for applications that require rapid transmission and decoding. We recognize this problem's connection to subspace learning. We present a method that uses several small neural networks, specializing in learning the neural subspace for a particular light field segment. Moreover, we propose an adaptive weight sharing strategy among those small networks, improving parameter efficiency. In effect, this strategy enables a concerted way to track the similarity among nearby neural subspaces by leveraging the layered structure of neural networks. Furthermore, we develop a soft-classification technique to enhance the color prediction accuracy of neural representations. 
Our experimental results show that our method better reconstructs the light field than previous methods on various light field scenes. We further demonstrate its successful deployment on encoding light fields with irregular viewpoint layout and dynamic scene content.\n  - [3D-LDM: Neural Implicit 3D Shape Generation with Latent Diffusion Models](https://arxiv.org/abs/2212.00842) | [code]\n    > Diffusion models have shown great promise for image generation, beating GANs in terms of generation diversity, with comparable image quality. However, their application to 3D shapes has been limited to point or voxel representations that can in practice not accurately represent a 3D surface. We propose a diffusion model for neural implicit representations of 3D shapes that operates in the latent space of an auto-decoder. This allows us to generate diverse and high quality 3D surfaces. We additionally show that we can condition our model on images or text to enable image-to-3D generation and text-to-3D generation using CLIP embeddings. Furthermore, adding noise to the latent codes of existing shapes allows us to explore shape variations.\n  - [SinGRAF: Learning a 3D Generative Radiance Field for a Single Scene](https://arxiv.org/abs/2211.17260) | [code]\n    > Generative models have shown great promise in synthesizing photorealistic 3D objects, but they require large amounts of training data. We introduce SinGRAF, a 3D-aware generative model that is trained with a few input images of a single scene. Once trained, SinGRAF generates different realizations of this 3D scene that preserve the appearance of the input while varying scene layout. For this purpose, we build on recent progress in 3D GAN architectures and introduce a novel progressive-scale patch discrimination approach during training. 
With several experiments, we demonstrate that the results produced by SinGRAF outperform the closest related works in both quality and diversity by a large margin.\n  - [NeAF: Learning Neural Angle Fields for Point Normal Estimation, AAAI2023](https://arxiv.org/abs/2211.16869) | [***``[code]``***](https://github.com/lisj575/NeAF)\n    > Normal estimation for unstructured point clouds is an important task in 3D computer vision. Current methods achieve encouraging results by mapping local patches to normal vectors or learning local surface fitting using neural networks. However, these methods are not generalized well to unseen scenarios and are sensitive to parameter settings. To resolve these issues, we propose an implicit function to learn an angle field around the normal of each point in the spherical coordinate system, which is dubbed as Neural Angle Fields (NeAF). Instead of directly predicting the normal of an input point, we predict the angle offset between the ground truth normal and a randomly sampled query normal. This strategy pushes the network to observe more diverse samples, which leads to higher prediction accuracy in a more robust manner. To predict normals from the learned angle fields at inference time, we randomly sample query vectors in a unit spherical space and take the vectors with minimal angle values as the predicted normals. To further leverage the prior learned by NeAF, we propose to refine the predicted normal vectors by minimizing the angle offsets. The experimental results with synthetic data and real scans show significant improvements over the state-of-the-art under widely used benchmarks.\n  - [SNAF: Sparse-view CBCT Reconstruction with Neural Attenuation Fields](https://arxiv.org/abs/2211.17048) | [code]\n    > Cone beam computed tomography (CBCT) has been widely used in clinical practice, especially in dental clinics, while the radiation dose of X-rays when capturing has been a long concern in CBCT imaging. 
Several research works have been proposed to reconstruct high-quality CBCT images from sparse-view 2D projections, but the current state-of-the-arts suffer from artifacts and the lack of fine details. In this paper, we propose SNAF for sparse-view CBCT reconstruction by learning the neural attenuation fields, where we have invented a novel view augmentation strategy to overcome the challenges introduced by insufficient data from sparse input views. Our approach achieves superior performance in terms of high reconstruction quality (30+ PSNR) with only 20 input views (25 times fewer than clinical collections), which outperforms the state-of-the-arts. We have further conducted comprehensive experiments and ablation analysis to validate the effectiveness of our approach.\n  - [NeRFInvertor: High Fidelity NeRF-GAN Inversion for Single-shot Real Image Animation](https://arxiv.org/abs/2211.17235) | [code]\n    > Nerf-based Generative models have shown impressive capacity in generating high-quality images with consistent 3D geometry. Despite successful synthesis of fake identity images randomly sampled from latent space, adopting these models for generating face images of real subjects is still a challenging task due to its so-called inversion issue. In this paper, we propose a universal method to surgically fine-tune these NeRF-GAN models in order to achieve high-fidelity animation of real subjects only by a single image. Given the optimized latent code for an out-of-domain real image, we employ 2D loss functions on the rendered image to reduce the identity gap. Furthermore, our method leverages explicit and implicit 3D regularizations using the in-domain neighborhood samples around the optimized latent code to remove geometrical and visual artifacts. 
Our experiments confirm the effectiveness of our method in realistic, high-fidelity, and 3D consistent animation of real faces on multiple NeRF-GAN models across different datasets.\n  - [Differentiable Rendering Using RGBXY Derivatives and Optimal Transport, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555479) | [code]\n    > Traditional differentiable rendering approaches are usually hard to converge in inverse rendering optimizations, especially when initial and target object locations are not so close. Inspired by Lagrangian fluid simulation, we present a novel differentiable rendering method to address this problem. We associate each screen-space pixel with the visible 3D geometric point covered by the center of the pixel and compute derivatives on geometric points rather than on pixels. We refer to the associated geometric points as point proxies of pixels. For each point proxy, we compute its 5D RGBXY derivatives which measures how its 3D RGB color and 2D projected screen-space position change with respect to scene parameters. Furthermore, in order to capture global and long-range object motions, we utilize optimal transport based pixel matching to design a more sophisticated loss function. We have conducted experiments to evaluate the effectiveness of our proposed method on various inverse rendering applications and have demonstrated superior convergence behavior compared to state-of-the-art baselines.\n  - [Efficient Light Probes for Real-Time Global Illumination, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555452) | [code]\n    > Reproducing physically-based global illumination (GI) effects has been a long-standing demand for many real-time graphical applications. In pursuit of this goal, many recent engines resort to some form of light probes baked in a precomputation stage. Unfortunately, the GI effects stemming from the precomputed probes are rather limited due to the constraints in the probe storage, representation or query. 
In this paper, we propose a new method for probe-based GI rendering which can generate a wide range of GI effects, including glossy reflection with multiple bounces, in complex scenes. The key contributions behind our work include a gradient-based search algorithm and a neural image reconstruction method. The search algorithm is designed to reproject the probes' contents to any query viewpoint, without introducing parallax errors, and converges fast to the optimal solution. The neural image reconstruction method, based on a dedicated neural network and several G-buffers, tries to recover high-quality images from low-quality inputs due to limited resolution or (potential) low sampling rate of the probes. This neural method makes the generation of light probes efficient. Moreover, a temporal reprojection strategy and a temporal loss are employed to improve temporal stability for animation sequences. The whole pipeline runs in realtime (>30 frames per second) even for high-resolution (1920×1080) outputs, thanks to the fast convergence rate of the gradient-based search algorithm and a light-weight design of the neural network. Extensive experiments on multiple complex scenes have been conducted to show the superiority of our method over the state-of-the-arts.\n  - [LaplacianFusion: Detailed 3D Clothed-Human Body Reconstruction, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555511) | [code]\n    > We propose LaplacianFusion, a novel approach that reconstructs detailed and controllable 3D clothed-human body shapes from an input depth or 3D point cloud sequence. The key idea of our approach is to use Laplacian coordinates, well-known differential coordinates that have been used for mesh editing, for representing the local structures contained in the input scans, instead of implicit 3D functions or vertex displacements used previously. 
Our approach reconstructs a controllable base mesh using SMPL, and learns a surface function that predicts Laplacian coordinates representing surface details on the base mesh. For a given pose, we first build and subdivide a base mesh, which is a deformed SMPL template, and then estimate Laplacian coordinates for the mesh vertices using the surface function. The final reconstruction for the pose is obtained by integrating the estimated Laplacian coordinates as a whole. Experimental results show that our approach based on Laplacian coordinates successfully reconstructs more visually pleasing shape details than previous methods. The approach also enables various surface detail manipulations, such as detail transfer and enhancement.\n  - [QuadStream: A Quad-Based Scene Streaming Architecture for Novel Viewpoint Reconstruction, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555524) | [code]\n    > Streaming rendered 3D content over a network to a thin client device, such as a phone or a VR/AR headset, brings high-fidelity graphics to platforms where it would not normally be possible due to thermal, power, or cost constraints. Streamed 3D content must be transmitted with a representation that is both robust to latency and potential network dropouts. Transmitting a video stream and reprojecting to correct for changing viewpoints fails in the presence of disocclusion events; streaming scene geometry and performing high-quality rendering on the client is not possible on limited-power mobile GPUs. To balance the competing goals of disocclusion robustness and minimal client workload, we introduce QuadStream, a new streaming content representation that reduces motion-to-photon latency by allowing clients to efficiently render novel views without artifacts caused by disocclusion events. 
Motivated by traditional macroblock approaches to video codec design, we decompose the scene seen from positions in a view cell into a series of quad proxies, or view-aligned quads from multiple views. By operating on a rasterized G-Buffer, our approach is independent of the representation used for the scene itself; the resulting QuadStream is an approximate geometric representation of the scene that can be reconstructed by a thin client to render both the current view and nearby adjacent views. Our technical contributions are an efficient parallel quad generation, merging, and packing strategy for proxy views covering potential client movement in a scene; a packing and encoding strategy that allows masked quads with depth information to be transmitted as a frame-coherent stream; and an efficient rendering approach for rendering our QuadStream representation into entirely novel views on thin clients. We show that our approach achieves superior quality compared both to video data streaming methods, and to geometry-based streaming.\n  - [DINER: Depth-aware Image-based NEural Radiance Fields](https://arxiv.org/abs/2211.16630) | [code]\n    > We present Depth-aware Image-based NEural Radiance fields (DINER). Given a sparse set of RGB input views, we predict depth and feature maps to guide the reconstruction of a volumetric scene representation that allows us to render 3D objects under novel views. Specifically, we propose novel techniques to incorporate depth information into feature fusion and efficient scene sampling. In comparison to the previous state of the art, DINER achieves higher synthesis quality and can process input views with greater disparity. This allows us to capture scenes more completely without changing capturing hardware requirements and ultimately enables larger viewpoint changes during novel view synthesis. 
We evaluate our method by synthesizing novel views, both for human heads and for general objects, and observe significantly improved qualitative results and increased perceptual metrics compared to the previous state of the art. The code will be made publicly available for research purposes.\n  - [Reconstructing Hand-Held Objects from Monocular Video, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555401) | [code]\n    > This paper presents an approach that reconstructs a hand-held object from a monocular video. In contrast to many recent methods that directly predict object geometry by a trained network, the proposed approach does not require any learned prior about the object and is able to recover more accurate and detailed object geometry. The key idea is that the hand motion naturally provides multiple views of the object and the motion can be reliably estimated by a hand pose tracker. Then, the object geometry can be recovered by solving a multi-view reconstruction problem. We devise an implicit neural representation-based method to solve the reconstruction problem and address the issues of imprecise hand pose estimation, relative hand-object motion, and insufficient geometry optimization for small objects. We also provide a newly collected dataset with 3D ground truth to validate the proposed approach. The dataset and code will be released at https://dihuangdh.github.io/hhor.\n  - [Dr.3D: Adapting 3D GANs to Artistic Drawings, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555422) | [code]\n    > While 3D GANs have recently demonstrated the high-quality synthesis of multi-view consistent images and 3D shapes, they are mainly restricted to photo-realistic human portraits. This paper aims to extend 3D GANs to a different, but meaningful visual form: artistic portrait drawings. However, extending existing 3D GANs to drawings is challenging due to the inevitable geometric ambiguity present in drawings. 
To tackle this, we present Dr.3D, a novel adaptation approach that adapts an existing 3D GAN to artistic drawings. Dr.3D is equipped with three novel components to handle the geometric ambiguity: a deformation-aware 3D synthesis network, an alternating adaptation of pose estimation and image synthesis, and geometric priors. Experiments show that our approach can successfully adapt 3D GANs to drawings and enable multi-view consistent semantic editing of drawings.\n  - [Efficient Neural Radiance Fields for Interactive Free-viewpoint Video, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555376) | [code]\n    > This paper aims to tackle the challenge of efficiently producing interactive free-viewpoint videos. Some recent works equip neural radiance fields with image encoders, enabling them to generalize across scenes. When processing dynamic scenes, they can simply treat each video frame as an individual scene and perform novel view synthesis to generate free-viewpoint videos. However, their rendering process is slow and cannot support interactive applications. A major factor is that they sample lots of points in empty space when inferring radiance fields. We propose a novel scene representation, called ENeRF, for the fast creation of interactive free-viewpoint videos. Specifically, given multi-view images at one frame, we first build the cascade cost volume to predict the coarse geometry of the scene. The coarse geometry allows us to sample few points near the scene surface, thereby significantly improving the rendering speed. This process is fully differentiable, enabling us to jointly learn the depth prediction and radiance field networks from RGB images. 
Experiments on multiple benchmarks show that our approach exhibits competitive performance while being at least 60 times faster than previous generalizable radiance field methods.\n  - [NeuLighting: Neural Lighting for Free Viewpoint Outdoor Scene Relighting with Unconstrained Photo Collections, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555384) | [code]\n    > We propose NeuLighting, a new framework for free viewpoint outdoor scene relighting from a sparse set of unconstrained in-the-wild photo collections. Our framework represents all the scene components as continuous functions parameterized by MLPs that take a 3D location and the lighting condition as input and output reflectance and necessary outdoor illumination properties. Unlike object-level relighting methods which often leverage training images with controllable and consistent indoor illumination, we concentrate on the more challenging outdoor situation where all the images are captured under arbitrary unknown illumination. The key to our method includes a neural lighting representation that compresses the per-image illumination into a disentangled latent vector, and a new free viewpoint relighting scheme that is robust to arbitrary lighting variations across images. The lighting representation is compressive to explain a wide range of illumination and can be easily fed into the query-based NeuLighting framework, enabling efficient shading effect evaluation under any kind of novel illumination. Furthermore, to produce high-quality cast shadows, we estimate the sun visibility map to indicate the shadow regions according to the scene geometry and the sun direction. Thanks to the flexible and explainable neural lighting representation, our system supports outdoor relighting with many different illumination sources, including natural images, environment maps, and time-lapse videos. 
The high-fidelity renderings under novel views and illumination prove the superiority of our method against state-of-the-art relighting solutions.\n  - [Lightweight Neural Basis Functions for All-Frequency Shading, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555386) | [code]\n    > Basis functions provide both the abilities for compact representation and the properties for efficient computation. Therefore, they are pervasively used in rendering to perform all-frequency shading. However, common basis functions, including spherical harmonics (SH), wavelets, and spherical Gaussians (SG) all have their own limitations, such as low-frequency for SH, not rotationally invariant for wavelets, and no multiple product support for SG. In this paper, we present neural basis functions, an implicit and data-driven set of basis functions that circumvents the limitations with all desired properties. We first introduce a representation neural network that takes any general 2D spherical function (e.g. environment lighting, BRDF, and visibility) as input and projects it onto the latent space as coefficients of our neural basis functions. Then, we design several lightweight neural networks that perform different types of computation, giving our basis functions different computational properties such as double/triple product integrals and rotations. 
We demonstrate the practicality of our neural basis functions by integrating them into all-frequency shading applications, showing that our method not only achieves a compression rate of 10×-40× and better performance than wavelets at equal quality, but also renders all-frequency lighting effects in real-time without the aforementioned limitations from classic basis functions.\n  - [DeepMVSHair: Deep Hair Modeling from Sparse Views, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555385) | [code]\n    > We present DeepMVSHair, the first deep learning-based method for multi-view hair strand reconstruction. The key component of our pipeline is HairMVSNet, a differentiable neural architecture which represents a spatial hair structure as a continuous 3D hair growing direction field implicitly. Specifically, given a 3D query point, we decide its occupancy value and direction from observed 2D structure features. With the query point’s pixel-aligned features from each input view, we utilize a view-aware transformer encoder to aggregate anisotropic structure features to an integrated representation, which is decoded to yield 3D occupancy and direction at the query point. HairMVSNet effectively gathers multi-view hair structure features and preserves high-frequency details based on this implicit representation. Guided by HairMVSNet, our hair-growing algorithm produces results faithful to input multi-view images. We propose a novel image-guided multi-view strand deformation algorithm to enrich modeling details further. Extensive experiments show that the results by our sparse-view method are comparable to those by state-of-the-art dense multi-view methods and significantly better than those by single-view and sparse-view methods. 
In addition, our method is an order of magnitude faster than previous multi-view hair modeling methods.\n  - [A Light Touch Approach to Teaching Transformers Multi-view Geometry](https://arxiv.org/abs/2211.15107) | [code]\n    > Transformers are powerful visual learners, in large part due to their conspicuous lack of manually-specified priors. This flexibility can be problematic in tasks that involve multiple-view geometry, due to the near-infinite possible variations in 3D shapes and viewpoints (requiring flexibility), and the precise nature of projective geometry (obeying rigid laws). To resolve this conundrum, we propose a \"light touch\" approach, guiding visual Transformers to learn multiple-view geometry but allowing them to break free when needed. We achieve this by using epipolar lines to guide the Transformer's cross-attention maps, penalizing attention values outside the epipolar lines and encouraging higher attention along these lines since they contain geometrically plausible matches. Unlike previous methods, our proposal does not require any camera pose information at test-time. We focus on pose-invariant object instance retrieval, where standard Transformer networks struggle, due to the large differences in viewpoint between query and retrieved images. Experimentally, our method outperforms state-of-the-art approaches at object retrieval, without needing pose information at test-time.\n  - [Fast-SNARF: A Fast Deformer for Articulated Neural Fields](https://arxiv.org/abs/2211.15601) | [code]\n    > Neural fields have revolutionized the area of 3D reconstruction and novel view synthesis of rigid scenes. A key challenge in making such methods applicable to articulated objects, such as the human body, is to model the deformation of 3D locations between the rest pose (a canonical space) and the deformed space. 
We propose a new articulation module for neural fields, Fast-SNARF, which finds accurate correspondences between canonical space and posed space via iterative root finding. Fast-SNARF is a drop-in replacement in functionality to our previous work, SNARF, while significantly improving its computational efficiency. We contribute several algorithmic and implementation improvements over SNARF, yielding a speed-up of 150×. These improvements include voxel-based correspondence search, pre-computing the linear blend skinning function, and an efficient software implementation with CUDA kernels. Fast-SNARF enables efficient and simultaneous optimization of shape and skinning weights given deformed observations without correspondences (e.g. 3D meshes). Because learning of deformation maps is a crucial component in many 3D human avatar methods and since Fast-SNARF provides a computationally efficient solution, we believe that this work represents a significant step towards the practical creation of 3D virtual humans.\n  - [Non-uniform Sampling Strategies for NeRF on 360° images, BMVC2022](https://arxiv.org/abs/2212.03635) | [code]\n    > In recent years, the performance of novel view synthesis using perspective images has dramatically improved with the advent of neural radiance fields (NeRF). This study proposes two novel techniques that effectively build NeRF for 360{\\textdegree} omnidirectional images. Due to the characteristics of a 360{\\textdegree} image of ERP format that has spatial distortion in their high latitude regions and a 360{\\textdegree} wide viewing angle, NeRF's general ray sampling strategy is ineffective. Hence, the view synthesis accuracy of NeRF is limited and learning is not efficient. We propose two non-uniform ray sampling schemes for NeRF to suit 360{\\textdegree} images - distortion-aware ray sampling and content-aware ray sampling. 
We created an evaluation dataset Synth360 using Replica and SceneCity models of indoor and outdoor scenes, respectively. In experiments, we show that our proposal successfully builds 360{\\textdegree} image NeRF in terms of both accuracy and efficiency. The proposal is widely applicable to advanced variants of NeRF. DietNeRF, AugNeRF, and NeRF++ combined with the proposed techniques further improve the performance. Moreover, we show that our proposed method enhances the quality of real-world scenes in 360{\\textdegree} images. Synth360: this https URL.\n  - [High-fidelity 3D GAN Inversion by Pseudo-multi-view Optimization](https://arxiv.org/abs/2211.15662) | [***``[code]``***](https://github.com/jiaxinxie97/HFGI3D)\n    > We present a high-fidelity 3D generative adversarial network (GAN) inversion framework that can synthesize photo-realistic novel views while preserving specific details of the input image. High-fidelity 3D GAN inversion is inherently challenging due to the geometry-texture trade-off in high-fidelity 3D inversion, where overfitting to a single view input image often damages the estimated geometry during the latent optimization. To solve this challenge, we propose a novel pipeline that builds on the pseudo-multi-view estimation with visibility analysis. We keep the original textures for the visible parts and utilize generative priors for the occluded parts. Extensive experiments show that our approach achieves advantageous reconstruction and novel view synthesis quality over state-of-the-art methods, even for images with out-of-distribution textures. The proposed pipeline also enables image attribute editing with the inverted latent code and 3D-aware texture modification. 
Our approach enables high-fidelity 3D rendering from a single image, which is promising for various applications of AI-generated 3D content.\n## Nov20 - Nov26, 2022\n  - [ResNeRF: Geometry-Guided Residual Neural Radiance Field for Indoor Scene Novel View Synthesis](https://arxiv.org/abs/2211.16211) | [code]\n    > We represent the ResNeRF, a novel geometry-guided two-stage framework for indoor scene novel view synthesis. Be aware of that a good geometry would greatly boost the performance of novel view synthesis, and to avoid the geometry ambiguity issue, we propose to characterize the density distribution of the scene based on a base density estimated from scene geometry and a residual density parameterized by the geometry. In the first stage, we focus on geometry reconstruction based on SDF representation, which would lead to a good geometry surface of the scene and also a sharp density. In the second stage, the residual density is learned based on the SDF learned in the first stage for encoding more details about the appearance. In this way, our method can better learn the density distribution with the geometry prior for high-fidelity novel view synthesis while preserving the 3D structures. Experiments on large-scale indoor scenes with many less-observed and textureless areas show that with the good 3D surface, our method achieves state-of-the-art performance for novel view synthesis.\n  - [RUST: Latent Neural Scene Representations from Unposed Imagery](https://arxiv.org/abs/2211.14306) | [code]\n    > Inferring the structure of 3D scenes from 2D observations is a fundamental challenge in computer vision. Recently popularized approaches based on neural scene representations have achieved tremendous impact and have been applied across a variety of applications. One of the major remaining challenges in this space is training a single model which can provide latent representations which effectively generalize beyond a single scene. 
Scene Representation Transformer (SRT) has shown promise in this direction, but scaling it to a larger set of diverse scenes is challenging and necessitates accurately posed ground truth data. To address this problem, we propose RUST (Really Unposed Scene representation Transformer), a pose-free approach to novel view synthesis trained on RGB images alone. Our main insight is that one can train a Pose Encoder that peeks at the target image and learns a latent pose embedding which is used by the decoder for view synthesis. We perform an empirical investigation into the learned latent pose structure and show that it allows meaningful test-time camera transformations and accurate explicit pose readouts. Perhaps surprisingly, RUST achieves similar quality as methods which have access to perfect camera pose, thereby unlocking the potential for large-scale training of amortized neural scene representations.\n  - [Unsupervised Continual Semantic Adaptation through Neural Rendering](https://arxiv.org/abs/2211.13969) | [code]\n    > An increasing amount of applications rely on data-driven models that are deployed for perception tasks across a sequence of scenes. Due to the mismatch between training and deployment data, adapting the model on the new scenes is often crucial to obtain good performance. In this work, we study continual multi-scene adaptation for the task of semantic segmentation, assuming that no ground-truth labels are available during deployment and that performance on the previous scenes should be maintained. We propose training a Semantic-NeRF network for each scene by fusing the predictions of a segmentation model and then using the view-consistent rendered semantic labels as pseudo-labels to adapt the model. Through joint training with the segmentation model, the Semantic-NeRF model effectively enables 2D-3D knowledge transfer. 
Furthermore, due to its compact size, it can be stored in a long-term memory and subsequently used to render data from arbitrary viewpoints to reduce forgetting. We evaluate our approach on ScanNet, where we outperform both a voxel-based baseline and a state-of-the-art unsupervised domain adaptation method.\n  - [ShadowNeuS: Neural SDF Reconstruction by Shadow Ray Supervision](https://arxiv.org/abs/2211.14086) | [code]\n    > By supervising camera rays between a scene and multi-view image planes, NeRF reconstructs a neural scene representation for the task of novel view synthesis. On the other hand, shadow rays between the light source and the scene have yet to be considered. Therefore, we propose a novel shadow ray supervision scheme that optimizes both the samples along the ray and the ray location. By supervising shadow rays, we successfully reconstruct a neural SDF of the scene from single-view pure shadow or RGB images under multiple lighting conditions. Given single-view binary shadows, we train a neural network to reconstruct a complete scene not limited by the camera's line of sight. By further modeling the correlation between the image colors and the shadow rays, our technique can also be effectively extended to RGB inputs. We compare our method with previous works on challenging tasks of shape reconstruction from single-view binary shadow or RGB images and observe significant improvements. The code and data will be released.\n  - [Dynamic Neural Portraits, WACV2023](https://arxiv.org/abs/2211.13994) | [code]\n    > We present Dynamic Neural Portraits, a novel approach to the problem of full-head reenactment. Our method generates photo-realistic video portraits by explicitly controlling head pose, facial expressions and eye gaze. Our proposed architecture is different from existing methods that rely on GAN-based image-to-image translation networks for transforming renderings of 3D faces into photo-realistic images. 
Instead, we build our system upon a 2D coordinate-based MLP with controllable dynamics. Our intuition to adopt a 2D-based representation, as opposed to recent 3D NeRF-like systems, stems from the fact that video portraits are captured by monocular stationary cameras, therefore, only a single viewpoint of the scene is available. Primarily, we condition our generative model on expression blendshapes, nonetheless, we show that our system can be successfully driven by audio features as well. Our experiments demonstrate that the proposed method is 270 times faster than recent NeRF-based reenactment methods, with our networks achieving speeds of 24 fps for resolutions up to 1024 x 1024, while outperforming prior works in terms of visual quality.\n  - [ScanNeRF: a Scalable Benchmark for Neural Radiance Fields, WACV2023](https://arxiv.org/abs/2211.13762) | [code]\n    > In this paper, we propose the first-ever real benchmark thought for evaluating Neural Radiance Fields (NeRFs) and, in general, Neural Rendering (NR) frameworks. We design and implement an effective pipeline for scanning real objects in quantity and effortlessly. Our scan station is built with less than 500$ hardware budget and can collect roughly 4000 images of a scanned object in just 5 minutes. Such a platform is used to build ScanNeRF, a dataset characterized by several train/val/test splits aimed at benchmarking the performance of modern NeRF methods under different conditions. Accordingly, we evaluate three cutting-edge NeRF variants on it to highlight their strengths and weaknesses. The dataset is available on our project page, together with an online benchmark to foster the development of better and better NeRFs.\n  - [DiffusionSDF: Conditional Generative Modeling of Signed Distance Functions](https://arxiv.org/abs/2211.13757) | [code]\n    > Probabilistic diffusion models have achieved state-of-the-art results for image synthesis, inpainting, and text-to-image tasks. 
However, they are still in the early stages of generating complex 3D shapes. This work proposes DiffusionSDF, a generative model for shape completion, single-view reconstruction, and reconstruction of real-scanned point clouds. We use neural signed distance functions (SDFs) as our 3D representation to parameterize the geometry of various signals (e.g., point clouds, 2D images) through neural networks. Neural SDFs are implicit functions and diffusing them amounts to learning the reversal of their neural network weights, which we solve using a custom modulation module. Extensive experiments show that our method is capable of both realistic unconditional generation and conditional generation from partial inputs. This work expands the domain of diffusion models from learning 2D, explicit representations, to 3D, implicit representations.\n  - [Immersive Neural Graphics Primitives](https://arxiv.org/abs/2211.13494) | [code]\n    > Neural radiance field (NeRF), in particular its extension by instant neural graphics primitives, is a novel rendering method for view synthesis that uses real-world images to build photo-realistic immersive virtual scenes. Despite its potential, research on the combination of NeRF and virtual reality (VR) remains sparse. Currently, there is no integration into typical VR systems available, and the performance and suitability of NeRF implementations for VR have not been evaluated, for instance, for different scene complexities or screen resolutions. In this paper, we present and evaluate a NeRF-based framework that is capable of rendering scenes in immersive VR allowing users to freely move their heads to explore complex real-world scenes. We evaluate our framework by benchmarking three different NeRF scenes concerning their rendering performance at different scene complexities and resolutions. Utilizing super-resolution, our approach can yield a frame rate of 30 frames per second with a resolution of 1280x720 pixels per eye. 
We discuss potential applications of our framework and provide an open source implementation online.\n  - [BAD-NeRF: Bundle Adjusted Deblur Neural Radiance Fields](https://arxiv.org/abs/2211.12853) | [code]\n    > Neural Radiance Fields (NeRF) have received considerable attention recently, due to its impressive capability in photo-realistic 3D reconstruction and novel view synthesis, given a set of posed camera images. Earlier work usually assumes the input images are in good quality. However, image degradation (e.g. image motion blur in low-light conditions) can easily happen in real-world scenarios, which would further affect the rendering quality of NeRF. In this paper, we present a novel bundle adjusted deblur Neural Radiance Fields (BAD-NeRF), which can be robust to severe motion blurred images and inaccurate camera poses. Our approach models the physical image formation process of a motion blurred image, and jointly learns the parameters of NeRF and recovers the camera motion trajectories during exposure time. In experiments, we show that by directly modeling the real physical image formation process, BAD-NeRF achieves superior performance over prior works on both synthetic and real datasets.\n  - [Peekaboo: Text to Image Diffusion Models are Zero-Shot Segmentors](https://arxiv.org/abs/2211.13224) | [code]\n    > Recent diffusion-based generative models combined with vision-language models are capable of creating realistic images from natural language prompts. While these models are trained on large internet-scale datasets, such pre-trained models are not directly introduced to any semantic localization or grounding. Most current approaches for localization or grounding rely on human-annotated localization information in the form of bounding boxes or segmentation masks. The exceptions are a few unsupervised methods that utilize architectures or loss functions geared towards localization, but they need to be trained separately. 
In this work, we explore how off-the-shelf diffusion models, trained with no exposure to such localization information, are capable of grounding various semantic phrases with no segmentation-specific re-training. An inference time optimization process is introduced, that is capable of generating segmentation masks conditioned on natural language. We evaluate our proposal Peekaboo for unsupervised semantic segmentation on the Pascal VOC dataset. In addition, we evaluate for referring segmentation on the RefCOCO dataset. In summary, we present a first zero-shot, open-vocabulary, unsupervised (no localization information), semantic grounding technique leveraging diffusion-based generative models with no re-training. Our code will be released publicly.\n  - [OReX: Object Reconstruction from Planar Cross-sections Using Neural Fields](https://arxiv.org/abs/2211.12886) | [code]\n    > Reconstructing 3D shapes from planar cross-sections is a challenge inspired by downstream applications like medical imaging and geographic informatics. The input is an in/out indicator function fully defined on a sparse collection of planes in space, and the output is an interpolation of the indicator function to the entire volume. Previous works addressing this sparse and ill-posed problem either produce low quality results, or rely on additional priors such as target topology, appearance information, or input normal directions. In this paper, we present OReX, a method for 3D shape reconstruction from slices alone, featuring a Neural Field as the interpolation prior. A simple neural network is trained on the input planes to receive a 3D coordinate and return an inside/outside estimate for the query point. This prior is powerful in inducing smoothness and self-similarities. The main challenge for this approach is high-frequency details, as the neural prior is overly smoothing. 
To alleviate this, we offer an iterative estimation architecture and a hierarchical input sampling scheme that encourage coarse-to-fine training, allowing focusing on high frequencies at later stages. In addition, we identify and analyze a common ripple-like effect stemming from the mesh extraction step. We mitigate it by regularizing the spatial gradients of the indicator function around input in/out boundaries, cutting the problem at the root.\n  - [PANeRF: Pseudo-view Augmentation for Improved Neural Radiance Fields Based on Few-shot Inputs](https://arxiv.org/abs/2211.12758) | [code]\n    > The method of neural radiance fields (NeRF) has been developed in recent years, and this technology has promising applications for synthesizing novel views of complex scenes. However, NeRF requires dense input views, typically numbering in the hundreds, for generating high-quality images. With a decrease in the number of input views, the rendering quality of NeRF for unseen viewpoints tends to degenerate drastically. To overcome this challenge, we propose pseudo-view augmentation of NeRF, a scheme that expands a sufficient amount of data by considering the geometry of few-shot inputs. We first initialized the NeRF network by leveraging the expanded pseudo-views, which efficiently minimizes uncertainty when rendering unseen views. Subsequently, we fine-tuned the network by utilizing sparse-view inputs containing precise geometry and color information. Through experiments under various settings, we verified that our model faithfully synthesizes novel-view images of superior quality and outperforms existing methods for multi-view datasets.\n  - [ActiveRMAP: Radiance Field for Active Mapping And Planning](https://arxiv.org/abs/2211.12656) | [code]\n    > A high-quality 3D reconstruction of a scene from a collection of 2D images can be achieved through offline/online mapping methods. 
In this paper, we explore active mapping from the perspective of implicit representations, which have recently produced compelling results in a variety of applications. One of the most popular implicit representations - Neural Radiance Field (NeRF), first demonstrated photorealistic rendering results using multi-layer perceptrons, with promising offline 3D reconstruction as a by-product of the radiance field. More recently, researchers also applied this implicit representation for online reconstruction and localization (i.e. implicit SLAM systems). However, the study on using implicit representation for active vision tasks is still very limited. In this paper, we are particularly interested in applying the neural radiance field for active mapping and planning problems, which are closely coupled tasks in an active system. We, for the first time, present an RGB-only active vision framework using radiance field representation for active 3D reconstruction and planning in an online manner. Specifically, we formulate this joint task as an iterative dual-stage optimization problem, where we alternatively optimize for the radiance field representation and path planning. Experimental results suggest that the proposed method achieves competitive results compared to other offline methods and outperforms active reconstruction methods using NeRFs.\n  - [Zero NeRF: Registration with Zero Overlap](https://arxiv.org/abs/2211.12544) | [code]\n    > We present Zero-NeRF, a projective surface registration method that, to the best of our knowledge, offers the first general solution capable of alignment between scene representations with minimal or zero visual correspondence. To do this, we enforce consistency between visible surfaces of partial and complete reconstructions, which allows us to constrain occluded geometry. We use a NeRF as our surface representation and the NeRF rendering pipeline to perform this alignment. 
To demonstrate the efficacy of our method, we register real-world scenes from opposite sides with infinitesimal overlaps that cannot be accurately registered using prior methods, and we compare these results against widely used registration methods.\n  - [FLNeRF: 3D Facial Landmarks Estimation in Neural Radiance Fields](https://arxiv.org/abs/2211.11202) | [code]\n    > This paper presents the first significant work on directly predicting 3D face landmarks on neural radiance fields (NeRFs), without using intermediate representations such as 2D images, depth maps, or point clouds. Our 3D coarse-to-fine Face Landmarks NeRF (FLNeRF) model efficiently samples from the NeRF on the whole face with individual facial features for accurate landmarks. To mitigate the limited number of facial expressions in the available data, local and non-linear NeRF warp is applied at facial features in fine scale to simulate large emotions range, including exaggerated facial expressions (e.g., cheek blowing, wide opening mouth, eye blinking), for training FLNeRF. With such expression augmentation, our model can predict 3D landmarks not limited to the 20 discrete expressions given in the data. Robust 3D NeRF facial landmarks contribute to many downstream tasks. As an example, we modify MoFaNeRF to enable high-quality face editing and swapping using face landmarks on NeRF, allowing more direct control and wider range of complex expressions. Experiments show that the improved model using landmarks achieves comparable to better results.\n  - [SPARF: Neural Radiance Fields from Sparse and Noisy Poses](https://arxiv.org/abs/2211.11738) | [code]\n    > Neural Radiance Field (NeRF) has recently emerged as a powerful representation to synthesize photorealistic novel views. While showing impressive performance, it relies on the availability of dense input views with highly accurate camera poses, thus limiting its application in real-world scenarios. 
In this work, we introduce Sparse Pose Adjusting Radiance Field (SPARF), to address the challenge of novel-view synthesis given only few wide-baseline input images (as low as 3) with noisy camera poses. Our approach exploits multi-view geometry constraints in order to jointly learn the NeRF and refine the camera poses. By relying on pixel matches extracted between the input views, our multi-view correspondence objective enforces the optimized scene and camera poses to converge to a global and geometrically accurate solution. Our depth consistency loss further encourages the reconstructed scene to be consistent from any viewpoint. Our approach sets a new state of the art in the sparse-view regime on multiple challenging datasets.\n  - [Tensor4D : Efficient Neural 4D Decomposition for High-fidelity Dynamic Reconstruction and Rendering](https://arxiv.org/abs/2211.11610) | [code]\n    > We present Tensor4D, an efficient yet effective approach to dynamic scene modeling. The key of our solution is an efficient 4D tensor decomposition method so that the dynamic scene can be directly represented as a 4D spatio-temporal tensor. To tackle the accompanying memory issue, we decompose the 4D tensor hierarchically by projecting it first into three time-aware volumes and then nine compact feature planes. In this way, spatial information over time can be simultaneously captured in a compact and memory-efficient manner. When applying Tensor4D for dynamic scene reconstruction and rendering, we further factorize the 4D fields to different scales in the sense that structural motions and dynamic detailed changes can be learned from coarse to fine. The effectiveness of our method is validated on both synthetic and real-world scenes. Extensive experiments show that our method is able to achieve high-quality dynamic reconstruction and rendering from sparse-view camera rigs or even a monocular camera. 
The code and dataset will be released at this https URL.\n  - [NeRF-RPN: A general framework for object detection in NeRFs](https://arxiv.org/abs/2211.11646) | [code]\n    > This paper presents the first significant object detection framework, NeRF-RPN, which directly operates on NeRF. Given a pre-trained NeRF model, NeRF-RPN aims to detect all bounding boxes of objects in a scene. By exploiting a novel voxel representation that incorporates multi-scale 3D neural volumetric features, we demonstrate it is possible to regress the 3D bounding boxes of objects in NeRF directly without rendering the NeRF at any viewpoint. NeRF-RPN is a general framework and can be applied to detect objects without class labels. We experimented the NeRF-RPN with various backbone architectures, RPN head designs and loss functions. All of them can be trained in an end-to-end manner to estimate high quality 3D bounding boxes. To facilitate future research in object detection for NeRF, we built a new benchmark dataset which consists of both synthetic and real-world data with careful labeling and clean up. Please click this https URL for visualizing the 3D region proposals by our NeRF-RPN. Code and dataset will be made available.\n  - [Local-to-Global Registration for Bundle-Adjusting Neural Radiance Fields](https://arxiv.org/abs/2211.11505) | [***``[code]``***](https://github.com/rover-xingyu/L2G-NeRF)\n    > Neural Radiance Fields (NeRF) have achieved photorealistic novel views synthesis; however, the requirement of accurate camera poses limits its application. Despite analysis-by-synthesis extensions for jointly learning neural 3D representations and registering camera frames exist, they are susceptible to suboptimal solutions if poorly initialized. We propose L2G-NeRF, a Local-to-Global registration method for bundle-adjusting Neural Radiance Fields: first, a pixel-wise flexible alignment, followed by a frame-wise constrained parametric alignment. 
Pixel-wise local alignment is learned in an unsupervised way via a deep network which optimizes photometric reconstruction errors. Frame-wise global alignment is performed using differentiable parameter estimation solvers on the pixel-wise correspondences to find a global transformation. Experiments on synthetic and real-world data show that our method outperforms the current state-of-the-art in terms of high-fidelity reconstruction and resolving large camera pose misalignment. Our module is an easy-to-use plugin that can be applied to NeRF variants and other neural field applications. The Code and supplementary materials are available at this https URL.\n  - [SegNeRF: 3D Part Segmentation with Neural Radiance Fields](https://arxiv.org/abs/2211.11215) | [code]\n    > Recent advances in Neural Radiance Fields (NeRF) boast impressive performances for generative tasks such as novel view synthesis and 3D reconstruction. Methods based on neural radiance fields are able to represent the 3D world implicitly by relying exclusively on posed images. Yet, they have seldom been explored in the realm of discriminative tasks such as 3D part segmentation. In this work, we attempt to bridge that gap by proposing SegNeRF: a neural field representation that integrates a semantic field along with the usual radiance field. SegNeRF inherits from previous works the ability to perform novel view synthesis and 3D reconstruction, and enables 3D part segmentation from a few images. Our extensive experiments on PartNet show that SegNeRF is capable of simultaneously predicting geometry, appearance, and semantic information from posed images, even for unseen objects. The predicted semantic fields allow SegNeRF to achieve an average mIoU of 30.30% for 2D novel view segmentation, and 37.46% for 3D part segmentation, boasting competitive performance against point-based methods by using only a few posed images. 
Additionally, SegNeRF is able to generate an explicit 3D model from a single image of an object taken in the wild, with its corresponding part segmentation.\n  - [Shape, Pose, and Appearance from a Single Image via Bootstrapped Radiance Field Inversion](https://arxiv.org/abs/2211.11674) | [code]\n    > Neural Radiance Fields (NeRF) coupled with GANs represent a promising direction in the area of 3D reconstruction from a single view, owing to their ability to efficiently model arbitrary topologies. Recent work in this area, however, has mostly focused on synthetic datasets where exact ground-truth poses are known, and has overlooked pose estimation, which is important for certain downstream applications such as augmented reality (AR) and robotics. We introduce a principled end-to-end reconstruction framework for natural images, where accurate ground-truth poses are not available. Our approach recovers an SDF-parameterized 3D shape, pose, and appearance from a single image of an object, without exploiting multiple views during training. More specifically, we leverage an unconditional 3D-aware generator, to which we apply a hybrid inversion scheme where a model produces a first guess of the solution which is then refined via optimization. Our framework can de-render an image in as few as 10 steps, enabling its use in practical scenarios. We demonstrate state-of-the-art results on a variety of real and synthetic benchmarks.\n  - [Recovering Fine Details for Neural Implicit Surface Reconstruction](https://arxiv.org/abs/2211.11320) | [code]\n    > Recent works on implicit neural representations have made significant strides. Learning implicit neural surfaces using volume rendering has gained popularity in multi-view reconstruction without 3D supervision. However, accurately recovering fine details is still challenging, due to the underlying ambiguity of geometry and appearance representation. 
In this paper, we present D-NeuS, a volume rendering-based neural implicit surface reconstruction method capable of recovering fine geometry details, which extends NeuS by two additional loss functions targeting enhanced reconstruction quality. First, we encourage the rendered surface points from alpha compositing to have zero signed distance values, alleviating the geometry bias arising from transforming SDF to density for volume rendering. Second, we impose multi-view feature consistency on the surface points, derived by interpolating SDF zero-crossings from sampled points along rays. Extensive quantitative and qualitative results demonstrate that our method reconstructs high-accuracy surfaces with details, and outperforms the state of the art.\n  - [Neural Puppeteer: Keypoint-Based Neural Rendering of Dynamic Shapes, ACCV2022](https://openaccess.thecvf.com/content/ACCV2022/html/Giebenhain_Neural_Puppeteer_Keypoint-Based_Neural_Rendering_of_Dynamic_Shapes_ACCV_2022_paper.html) | [***``[code]``***](https://github.com/urs-waldmann/NePu/)\n    > We introduce Neural Puppeteer, an efficient neural rendering pipeline for articulated shapes. By inverse rendering, we can predict 3D keypoints from multi-view 2D silhouettes alone, without requiring texture information. Furthermore, we can easily predict 3D keypoints of the same class of shapes with one and the same trained model and generalize more easily from training with synthetic data which we demonstrate by successfully applying zero-shot synthetic to real-world experiments. We demonstrate the flexibility of our method by fitting models to synthetic videos of different animals and a human, and achieve quantitative results which outperform our baselines. Our method uses 3D keypoints in conjunction with individual local feature vectors and a global latent code to allow for an efficient representation of time-varying and articulated shapes such as humans and animals. 
In contrast to previous work, we do not perform reconstruction in the 3D domain, but project the 3D features into 2D cameras and perform reconstruction of 2D RGB-D images from these projected features, which is significantly faster than volumetric rendering. Our synthetic dataset will be publicly available, to further develop the evolving field of animal pose and shape reconstruction.\n  - [DynIBaR: Neural Dynamic Image-Based Rendering, -](https://arxiv.org/abs/2211.11082) | [code]\n    > We address the problem of synthesizing novel views from a monocular video depicting a complex dynamic scene. State-of-the-art methods based on temporally varying Neural Radiance Fields (aka dynamic NeRFs) have shown impressive results on this task. However, for long videos with complex object motions and uncontrolled camera trajectories, these methods can produce blurry or inaccurate renderings, hampering their use in real-world applications. Instead of encoding the entire dynamic scene within the weights of an MLP, we present a new approach that addresses these limitations by adopting a volumetric image-based rendering framework that synthesizes new viewpoints by aggregating features from nearby views in a scene-motion-aware manner. Our system retains the advantages of prior methods in its ability to model complex scenes and view-dependent effects, but also enables synthesizing photo-realistic novel views from long videos featuring complex scene dynamics with unconstrained camera trajectories. We demonstrate significant improvements over state-of-the-art methods on dynamic scene datasets, and also apply our approach to in-the-wild videos with challenging camera and object motion, where prior methods fail to produce high-quality renderings. 
Our project webpage is at this http URL.\n  - [Sampling Neural Radiance Fields for Refractive Objects, SIGGRAPH-Asia2022](https://arxiv.org/abs/2211.14799) | [***``[code]``***](https://github.com/alexkeroro86/SampleNeRFRO)\n    > Recently, differentiable volume rendering in neural radiance fields (NeRF) has gained a lot of popularity, and its variants have attained many impressive results. However, existing methods usually assume the scene is a homogeneous volume so that a ray is cast along the straight path. In this work, the scene is instead a heterogeneous volume with a piecewise-constant refractive index, where the path will be curved if it intersects the different refractive indices. For novel view synthesis of refractive objects, our NeRF-based framework aims to optimize the radiance fields of bounded volume and boundary from multi-view posed images with refractive object silhouettes. To tackle this challenging problem, the refractive index of a scene is reconstructed from silhouettes. Given the refractive index, we extend the stratified and hierarchical sampling techniques in NeRF to allow drawing samples along a curved path tracked by the Eikonal equation. The results indicate that our framework outperforms the state-of-the-art method both quantitatively and qualitatively, demonstrating better performance on the perceptual similarity metric and an apparent improvement in the rendering quality on several synthetic and real scenes.\n## Nov13 - Nov19, 2022\n  - [Real-Time Omnidirectional Roaming in Large Scale Indoor Scenes, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550340.3564222) | [code]\n    > Neural radiance field (NeRF) has recently achieved impressive results in novel view synthesis. However, previous works on NeRF mainly focus on object-centric scenarios. They would suffer observable performance degradation in outward-facing and large-scale scenes due to limiting positional encoding capacity. 
To narrow the gap, we explore radiance fields in a geometry-aware fashion. We estimate explicit geometry from the omnidirectional neural radiance field that was learned from multiple 360° images. Relying on the recovered geometry, we use an adaptive divide-and-conquer strategy to slim and fine-tune the radiance fields and further improve render speed and quality. Quantitative and qualitative comparisons among baselines illustrated our predominant performance in large-scale indoor scenes and our system supports real-time VR roaming.\n  - [Magic3D: High-Resolution Text-to-3D Content Creation](https://arxiv.org/abs/2211.10440) | [code]\n    > DreamFusion has recently demonstrated the utility of a pre-trained text-to-image diffusion model to optimize Neural Radiance Fields (NeRF), achieving remarkable text-to-3D synthesis results. However, the method has two inherent limitations: (a) extremely slow optimization of NeRF and (b) low-resolution image space supervision on NeRF, leading to low-quality 3D models with a long processing time. In this paper, we address these limitations by utilizing a two-stage optimization framework. First, we obtain a coarse model using a low-resolution diffusion prior and accelerate with a sparse 3D hash grid structure. Using the coarse representation as the initialization, we further optimize a textured 3D mesh model with an efficient differentiable renderer interacting with a high-resolution latent diffusion model. Our method, dubbed Magic3D, can create high quality 3D mesh models in 40 minutes, which is 2x faster than DreamFusion (reportedly taking 1.5 hours on average), while also achieving higher resolution. User studies show 61.7% raters to prefer our approach over DreamFusion. 
Together with the image-conditioned generation capabilities, we provide users with new ways to control 3D synthesis, opening up new avenues to various creative applications.\n  - [AligNeRF: High-Fidelity Neural Radiance Fields via Alignment-Aware Training](https://arxiv.org/abs/2211.09682) | [code]\n    > Neural Radiance Fields (NeRFs) are a powerful representation for modeling a 3D scene as a continuous function. Though NeRF is able to render complex 3D scenes with view-dependent effects, few efforts have been devoted to exploring its limits in a high-resolution setting. Specifically, existing NeRF-based methods face several limitations when reconstructing high-resolution real scenes, including a very large number of parameters, misaligned input data, and overly smooth details. In this work, we conduct the first pilot study on training NeRF with high-resolution data and propose the corresponding solutions: 1) marrying the multilayer perceptron (MLP) with convolutional layers which can encode more neighborhood information while reducing the total number of parameters; 2) a novel training strategy to address misalignment caused by moving objects or small camera calibration errors; and 3) a high-frequency aware loss. Our approach is nearly free without introducing obvious training/testing costs, while experiments on different datasets demonstrate that it can recover more high-frequency details compared with the current state-of-the-art NeRF models. Project page: \\url{this https URL.}\n  - [3DLatNav: Navigating Generative Latent Spaces for Semantic-Aware 3D Object Manipulation](https://arxiv.org/abs/2211.09770) | [code]\n    > 3D generative models have been recently successful in generating realistic 3D objects in the form of point clouds. However, most models do not offer controllability to manipulate the shape semantics of component object parts without extensive semantic attribute labels or other reference point clouds. 
Moreover, beyond the ability to perform simple latent vector arithmetic or interpolations, there is a lack of understanding of how part-level semantics of 3D shapes are encoded in their corresponding generative latent spaces. In this paper, we propose 3DLatNav; a novel approach to navigating pretrained generative latent spaces to enable controlled part-level semantic manipulation of 3D objects. First, we propose a part-level weakly-supervised shape semantics identification mechanism using latent representations of 3D shapes. Then, we transfer that knowledge to a pretrained 3D object generative latent space to unravel disentangled embeddings to represent different shape semantics of component parts of an object in the form of linear subspaces, despite the unavailability of part-level labels during the training. Finally, we utilize those identified subspaces to show that controllable 3D object part manipulation can be achieved by applying the proposed framework to any pretrained 3D generative model. With two novel quantitative metrics to evaluate the consistency and localization accuracy of part-level manipulations, we show that 3DLatNav outperforms existing unsupervised latent disentanglement methods in identifying latent directions that encode part-level shape semantics of 3D objects. With multiple ablation studies and testing on state-of-the-art generative models, we show that 3DLatNav can implement controlled part-level semantic manipulations on an input point cloud while preserving other features and the realistic nature of the object.\n  - [RenderDiffusion: Image Diffusion for 3D Reconstruction, Inpainting and Generation](https://arxiv.org/abs/2211.09869) | [code]\n    > Diffusion models currently achieve state-of-the-art performance for both conditional and unconditional image generation. However, so far, image diffusion models do not support tasks required for 3D understanding, such as view-consistent 3D generation or single-view object reconstruction. 
In this paper, we present RenderDiffusion as the first diffusion model for 3D generation and inference that can be trained using only monocular 2D supervision. At the heart of our method is a novel image denoising architecture that generates and renders an intermediate three-dimensional representation of a scene in each denoising step. This enforces a strong inductive structure into the diffusion process that gives us a 3D consistent representation while only requiring 2D supervision. The resulting 3D representation can be rendered from any viewpoint. We evaluate RenderDiffusion on ShapeNet and Clevr datasets and show competitive performance for generation of 3D scenes and inference of 3D scenes from 2D images. Additionally, our diffusion-based approach allows us to use 2D inpainting to edit 3D scenes. We believe that our work promises to enable full 3D generation at scale when trained on massive image collections, thus circumventing the need to have large-scale 3D model collections for supervision.\n  - [DINER: Disorder-Invariant Implicit Neural Representation](https://arxiv.org/abs/2211.07871) | [code]\n    > Implicit neural representation (INR) characterizes the attributes of a signal as a function of corresponding coordinates which emerges as a sharp weapon for solving inverse problems. However, the capacity of INR is limited by the spectral bias in the network training. In this paper, we find that such a frequency-related problem could be largely solved by re-arranging the coordinates of the input signal, for which we propose the disorder-invariant implicit neural representation (DINER) by augmenting a hash-table to a traditional INR backbone. Given discrete signals sharing the same histogram of attributes and different arrangement orders, the hash-table could project the coordinates into the same distribution for which the mapped signal can be better modeled using the subsequent INR network, leading to significantly alleviated spectral bias. 
Experiments not only reveal the generalization of the DINER for different INR backbones (MLP vs. SIREN) and various tasks (image/video representation, phase retrieval, and refractive index recovery) but also show the superiority over the state-of-the-art algorithms both in quality and speed.\n  - [Latent-NeRF for Shape-Guided Generation of 3D Shapes and Textures](https://arxiv.org/abs/2211.07600) | [code]\n    > Text-guided image generation has progressed rapidly in recent years, inspiring major breakthroughs in text-guided shape generation. Recently, it has been shown that using score distillation, one can successfully text-guide a NeRF model to generate a 3D object. We adapt the score distillation to the publicly available, and computationally efficient, Latent Diffusion Models, which apply the entire diffusion process in a compact latent space of a pretrained autoencoder. As NeRFs operate in image space, a naive solution for guiding them with latent score distillation would require encoding to the latent space at each guidance step. Instead, we propose to bring the NeRF to the latent space, resulting in a Latent-NeRF. Analyzing our Latent-NeRF, we show that while Text-to-3D models can generate impressive results, they are inherently unconstrained and may lack the ability to guide or enforce a specific 3D structure. To assist and direct the 3D generation, we propose to guide our Latent-NeRF using a Sketch-Shape: an abstract geometry that defines the coarse structure of the desired object. Then, we present means to integrate such a constraint directly into a Latent-NeRF. This unique combination of text and shape guidance allows for increased control over the generation process. We also show that latent score distillation can be successfully applied directly on 3D meshes. This allows for generating high-quality textures on a given geometry. Our experiments validate the power of our different forms of guidance and the efficiency of using latent rendering. 
Implementation is available at this https URL\n  - [AsyncNeRF: Learning Large-scale Radiance Fields from Asynchronous RGB-D Sequences with Time-Pose Function](https://arxiv.org/abs/2211.07459) | [code]\n    > Large-scale radiance fields are promising mapping tools for smart transportation applications like autonomous driving or drone delivery. But for large-scale scenes, compact synchronized RGB-D cameras are not applicable due to limited sensing range, and using separate RGB and depth sensors inevitably leads to unsynchronized sequences. Inspired by the recent success of self-calibrating radiance field training methods that do not require known intrinsic or extrinsic parameters, we propose the first solution that self-calibrates the mismatch between RGB and depth frames. We leverage the important domain-specific fact that RGB and depth frames are actually sampled from the same trajectory and develop a novel implicit network called the time-pose function. Combining it with a large-scale radiance field leads to an architecture that cascades two implicit representation networks. To validate its effectiveness, we construct a diverse and photorealistic dataset that covers various RGB-D mismatch scenarios. Through a comprehensive benchmarking on this dataset, we demonstrate the flexibility of our method in different scenarios and superior performance over applicable prior counterparts. Codes, data, and models will be made publicly available.\n## Nov6 - Nov12, 2022\n  - [NeXT: Towards High Quality Neural Radiance Fields via Multi-skip Transformer, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19824-3_5) | [***``[code]``***](https://github.com/Crishawy/NeXT)\n    > Neural Radiance Fields (NeRF) methods show impressive performance for novel view synthesis by representing a scene via a neural network. 
However, most existing NeRF based methods, including its variants, treat each sample point individually as input, while ignoring the inherent relationships between adjacent sample points from the corresponding rays, thus hindering the reconstruction performance. To address this issue, we explore a brand new scheme, namely NeXT, introducing a multi-skip transformer to capture the rich relationships between various sample points in a ray-level query. Specifically, ray tokenization is proposed to represent each ray as a sequence of point embeddings which is taken as input of our proposed NeXT. In this way, relationships between sample points are captured via the built-in self-attention mechanism to promote the reconstruction. Besides, our proposed NeXT can be easily combined with other NeRF based methods to improve their rendering quality. Extensive experiments conducted on three datasets demonstrate that NeXT significantly outperforms all previous state-of-the-art work by a large margin. In particular, the proposed NeXT surpasses the strong NeRF baseline by 2.74 dB of PSNR on Blender dataset. The code is available at https://github.com/Crishawy/NeXT.\n  - [Directed Ray Distance Functions for 3D Scene Reconstruction, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-20086-1_12) | [code]\n    > We present an approach for full 3D scene reconstruction from a single unseen image. We trained on a dataset of realistic non-watertight scans of scenes. Our approach uses a predicted distance function, since these have shown promise in handling complex topologies and large spaces. We identify and analyze two key challenges for predicting such image conditioned distance functions that have prevented their success on real 3D scene data. First, we show that predicting a conventional scene distance from an image requires reasoning over a large receptive field. 
Second, we analytically show that the optimal output of the network trained to predict these distance functions does not obey all the distance function properties. We propose an alternate distance function, the Directed Ray Distance Function (DRDF), that tackles both challenges. We show that a deep network trained to predict DRDFs outperforms all other methods quantitatively and qualitatively on 3D reconstruction from single image on Matterport3D, 3DFront, and ScanNet. (Project Page: https://nileshkulkarni.github.io/scene_drdf)\n  - [ParticleNeRF: A Particle-Based Encoding for Online Neural Radiance Fields in Dynamic Scenes](https://arxiv.org/abs/2211.04041) | [code]\n    > Neural Radiance Fields (NeRFs) learn implicit representations of - typically static - environments from images. Our paper extends NeRFs to handle dynamic scenes in an online fashion. We propose ParticleNeRF that adapts to changes in the geometry of the environment as they occur, learning a new up-to-date representation every 350 ms. ParticleNeRF can represent the current state of dynamic environments with much higher fidelity as other NeRF frameworks. To achieve this, we introduce a new particle-based parametric encoding, which allows the intermediate NeRF features - now coupled to particles in space - to move with the dynamic geometry. This is possible by backpropagating the photometric reconstruction loss into the position of the particles. The position gradients are interpreted as particle velocities and integrated into positions using a position-based dynamics (PBS) physics system. 
Introducing PBS into the NeRF formulation allows us to add collision constraints to the particle motion and creates future opportunities to add other movement priors into the system, such as rigid and deformable body\n  - [Temporal Coherence-Based Distributed Ray Tracing of Massive Scenes, ToG2022](https://ieeexplore.ieee.org/abstract/document/9940545) | [code]\n    > Distributed ray tracing algorithms are widely used when rendering massive scenes, where data utilization and load balancing are the keys to improving performance. One essential observation is that rays are temporally coherent, which indicates that temporal information can be used to improve computational efficiency. In this paper, we use temporal coherence to optimize the performance of distributed ray tracing. First, we propose a temporal coherence-based scheduling algorithm to guide the task/data assignment and scheduling. Then, we propose a virtual portal structure to predict the radiance of rays based on the previous frame, and send the rays with low radiance to a precomputed simplified model for further tracing, which can dramatically reduce the traversal complexity and the overhead of network data transmission. The approach was validated on scenes of sizes up to 355 GB. Our algorithm can achieve a speedup of up to 81% compared to previous algorithms, with a very small mean squared error.\n  - [QRF: Implicit Neural Representations with Quantum Radiance Fields](https://arxiv.org/abs/2211.03418) | [code]\n    > Photorealistic rendering of real-world scenes is a tremendous challenge with a wide range of applications, including mixed reality (MR), and virtual reality (VR). Neural networks, which have long been investigated in the context of solving differential equations, have previously been introduced as implicit representations for photorealistic rendering. 
However, realistic rendering using classic computing is challenging because it requires time-consuming optical ray marching, and suffers from computational bottlenecks due to the curse of dimensionality. In this paper, we propose Quantum Radiance Fields (QRF), which integrate the quantum circuit, quantum activation function, and quantum volume rendering for implicit scene representation. The results indicate that QRF not only exploits the advantage of quantum computing, such as high speed, fast convergence, and high parallelism, but also ensures high quality of volume rendering.\n  - [Common Pets in 3D: Dynamic New-View Synthesis of Real-Life Deformable Categories](https://arxiv.org/abs/2211.03889) | [code]\n    > Obtaining photorealistic reconstructions of objects from sparse views is inherently ambiguous and can only be achieved by learning suitable reconstruction priors. Earlier works on sparse rigid object reconstruction successfully learned such priors from large datasets such as CO3D. In this paper, we extend this approach to dynamic objects. We use cats and dogs as a representative example and introduce Common Pets in 3D (CoP3D), a collection of crowd-sourced videos showing around 4,200 distinct pets. CoP3D is one of the first large-scale datasets for benchmarking non-rigid 3D reconstruction \"in the wild\". We also propose Tracker-NeRF, a method for learning 4D reconstruction from our dataset. At test time, given a small number of video frames of an unseen object, Tracker-NeRF predicts the trajectories of its 3D points and generates new views, interpolating viewpoint and time. 
Results on CoP3D reveal significantly better non-rigid new-view synthesis performance than existing baselines.\n  - [Learning-based Inverse Rendering of Complex Indoor Scenes with Differentiable Monte Carlo Raytracing, SIGGRAPH-Asia2022](https://jingsenzhu.github.io/invrend/) | [code]\n    > We present a learning-based approach for inverse rendering of complex indoor scenes with differentiable Monte Carlo raytracing. Our method takes a single indoor scene RGB image as input and automatically infers its underlying surface reflectance, geometry, and spatially-varying illumination. This enables us to perform photorealistic editing of the scene, such as inserting multiple complex virtual objects and editing surface materials faithfully with global illumination.\n## Oct30 - Nov5, 2022\n  - [Deep Appearance Prefiltering, ToG2022](https://dl.acm.org/doi/abs/10.1145/3570327) | [code]\n    > Physically based rendering of complex scenes can be prohibitively costly with a potentially unbounded and uneven distribution of complexity across the rendered image. The goal of an ideal level of detail (LoD) method is to make rendering costs independent of the 3D scene complexity, while preserving the appearance of the scene. However, current prefiltering LoD methods are limited in the appearances they can support due to their reliance on approximate models and other heuristics. We propose the first comprehensive multi-scale LoD framework for prefiltering 3D environments with complex geometry and materials (e.g., the Disney BRDF), while maintaining the appearance with respect to the ray-traced reference. Using a multi-scale hierarchy of the scene, we perform a data-driven prefiltering step to obtain an appearance phase function and directional coverage mask at each scale. At the heart of our approach is a novel neural representation that encodes this information into a compact latent form that is easy to decode inside a physically based renderer. 
Once a scene is baked out, our method requires no original geometry, materials, or textures at render time. We demonstrate that our approach compares favorably to state-of-the-art prefiltering methods and achieves considerable savings in memory for complex scenes.\n  - [Neural Grasp Distance Fields for Robot Manipulation](https://arxiv.org/abs/2211.02647) | [code]\n    > We formulate grasp learning as a neural field and present Neural Grasp Distance Fields (NGDF). Here, the input is a 6D pose of a robot end effector and output is a distance to a continuous manifold of valid grasps for an object. In contrast to current approaches that predict a set of discrete candidate grasps, the distance-based NGDF representation is easily interpreted as a cost, and minimizing this cost produces a successful grasp pose. This grasp distance cost can be incorporated directly into a trajectory optimizer for joint optimization with other costs such as trajectory smoothness and collision avoidance. During optimization, as the various costs are balanced and minimized, the grasp target is allowed to smoothly vary, as the learned grasp field is continuous. In simulation benchmarks with a Franka arm, we find that joint grasping and planning with NGDF outperforms baselines by 63% execution success while generalizing to unseen query poses and unseen object shapes. Project page: this https URL.\n  - [nerf2nerf: Pairwise Registration of Neural Radiance Fields](https://arxiv.org/abs/2211.01600) | [code]\n    > We introduce a technique for pairwise registration of neural fields that extends classical optimization-based local registration (i.e. ICP) to operate on Neural Radiance Fields (NeRF) -- neural 3D scene representations trained from collections of calibrated images. 
NeRF does not decompose illumination and color, so to make registration invariant to illumination, we introduce the concept of a ''surface field'' -- a field distilled from a pre-trained NeRF model that measures the likelihood of a point being on the surface of an object. We then cast nerf2nerf registration as a robust optimization that iteratively seeks a rigid transformation that aligns the surface fields of the two scenes. We evaluate the effectiveness of our technique by introducing a dataset of pre-trained NeRF scenes -- our synthetic scenes enable quantitative evaluations and comparisons to classical registration techniques, while our real scenes demonstrate the validity of our technique in real-world scenarios. Additional results available at: this https URL\n  - [HyperSound: Generating Implicit Neural Representations of Audio Signals with Hypernetworks](https://arxiv.org/abs/2211.01839) | [code]\n    > Implicit neural representations (INRs) are a rapidly growing research field, which provides alternative ways to represent multimedia signals. Recent applications of INRs include image super-resolution, compression of high-dimensional signals, or 3D rendering. However, these solutions usually focus on visual data, and adapting them to the audio domain is not trivial. Moreover, it requires a separately trained model for every data sample. To address this limitation, we propose HyperSound, a meta-learning method leveraging hypernetworks to produce INRs for audio signals unseen at training time. We show that our approach can reconstruct sound waves with quality comparable to other state-of-the-art models.\n  - [Attention-based Neural Cellular Automata, NeurIPS2022](https://arxiv.org/abs/2211.01233) | [code]\n    > Recent extensions of Cellular Automata (CA) have incorporated key ideas from modern deep learning, dramatically extending their capabilities and catalyzing a new family of Neural Cellular Automata (NCA) techniques. 
Inspired by Transformer-based architectures, our work presents a new class of attention-based NCAs formed using a spatially localized—yet globally organized—self-attention scheme. We introduce an instance of this class named Vision Transformer Cellular Automata (ViTCA). We present quantitative and qualitative results on denoising autoencoding across six benchmark datasets, comparing ViTCA to a U-Net, a U-Net-based CA baseline (UNetCA), and a Vision Transformer (ViT). When comparing across architectures configured to similar parameter complexity, ViTCA architectures yield superior performance across all benchmarks and for nearly every evaluation metric. We present an ablation study on various architectural configurations of ViTCA, an analysis of its effect on cell states, and an investigation on its inductive biases. Finally, we examine its learned representations via linear probes on its converged cell state hidden representations, yielding, on average, superior results when compared to our U-Net, ViT, and UNetCA baselines.\n  - [GARF: Gaussian Activated Radiance Fields for High Fidelity Reconstruction and Pose Estimation, ECCV2022](https://arxiv.org/abs/2204.05735) | [code]\n    > Despite Neural Radiance Fields (NeRF) showing compelling results in photorealistic novel views synthesis of real-world scenes, most existing approaches require accurate prior camera poses. Although approaches for jointly recovering the radiance field and camera pose exist (BARF), they rely on a cumbersome coarse-to-fine auxiliary positional embedding to ensure good performance. 
We present Gaussian Activated neural Radiance Fields (GARF), a new positional embedding-free neural radiance field architecture - employing Gaussian activations - that outperforms the current state-of-the-art in terms of high fidelity reconstruction and pose estimation.\n  - [Learning Neural Implicit Representations with Surface Signal Parameterizations](https://arxiv.org/abs/2211.00519) | [code]\n    > Neural implicit surface representations have recently emerged as popular alternative to explicit 3D object encodings, such as polygonal meshes, tabulated points, or voxels. While significant work has improved the geometric fidelity of these representations, much less attention is given to their final appearance. Traditional explicit object representations commonly couple the 3D shape data with auxiliary surface-mapped image data, such as diffuse color textures and fine-scale geometric details in normal maps that typically require a mapping of the 3D surface onto a plane, i.e., a surface parameterization; implicit representations, on the other hand, cannot be easily textured due to lack of configurable surface parameterization. Inspired by this digital content authoring methodology, we design a neural network architecture that implicitly encodes the underlying surface parameterization suitable for appearance data. As such, our model remains compatible with existing mesh-based digital content with appearance data. Motivated by recent work that overfits compact networks to individual 3D objects, we present a new weight-encoded neural implicit representation that extends the capability of neural implicit surfaces to enable various common and important applications of texture mapping. Our method outperforms reasonable baselines and state-of-the-art alternatives.\n  - [gCoRF: Generative Compositional Radiance Fields, 3DV2022](https://vcai.mpi-inf.mpg.de/projects/gCoRF/) | [code]\n    > 3D generative models of objects enable photorealistic image synthesis with 3D control. 
Existing methods model the scene as a global scene representation, ignoring the compositional aspect of the scene. Compositional reasoning can enable a wide variety of editing applications, in addition to enabling generalizable 3D reasoning. In this paper, we present a compositional generative model, where each semantic part of the object is represented as an independent 3D representation learnt from only in-the-wild 2D data. We start with a global generative model (GAN) and learn to decompose it into different semantic parts using supervision from 2D segmentation masks. We then learn to composite independently sampled parts in order to create coherent global scenes. Different parts can be independently sampled, while keeping rest of the object fixed. We evaluate our method on a wide variety of objects and parts, and demonstrate editing applications.\n  - [Digging into Radiance Grid for Real-Time View Synthesis with Detail Preservation, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19784-0_42) | [code]\n    > Neural Radiance Fields (NeRF) [31] series are impressive in representing scenes and synthesizing high-quality novel views. However, most previous works fail to preserve texture details and suffer from slow training speed. A recent method SNeRG [11] demonstrates that baking a trained NeRF as a Sparse Neural Radiance Grid enables real-time view synthesis with slight scarification of rendering quality. In this paper, we dig into the Radiance Grid representation and present a set of improvements, which together result in boosted performance in terms of both speed and quality. First, we propose an HieRarchical Sparse Radiance Grid (HrSRG) representation that has higher voxel resolution for informative spaces and fewer voxels for other spaces. HrSRG leverages a hierarchical voxel grid building process inspired by [30, 55], and can describe a scene at high resolution without excessive memory footprint. 
Furthermore, we show that directly optimizing the voxel grid leads to surprisingly good texture details in rendered images. This direct optimization is memory-friendly and requires multiple orders of magnitude less time than conventional NeRFs as it only involves a tiny MLP. Finally, we find that a critical factor that prevents fine details restoration is the misaligned 2D pixels among images caused by camera pose errors. We propose to use the perceptual loss to add tolerance to misalignments, leading to the improved visual quality of rendered images.\n## Oct23 - Oct29, 2022\n  - [NeX360: Real-time All-around View Synthesis with Neural Basis Expansion, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9931981) | [code]\n    > We present NeX, a new approach to novel view synthesis based on enhancements of multiplane images (MPI) that can reproduce view-dependent effects in real time. Unlike traditional MPI, our technique parameterizes each pixel as a linear combination of spherical basis functions learned from a neural network to model view-dependent effects and uses a hybrid implicit-explicit modeling strategy to improve fine detail. Moreover, we also present an extension to NeX, which leverages knowledge distillation to train multiple MPIs for unbounded 360° scenes. Our method is evaluated on several benchmark datasets: NeRF-Synthetic dataset, Light Field dataset, Real Forward-Facing dataset, Space dataset, as well as Shiny, our new dataset that contains significantly more challenging view-dependent effects, such as the rainbow reflections on the CD. Our method outperforms other real-time rendering approaches on PSNR, SSIM, and LPIPS and can render unbounded 360° scenes in real time.\n  - [NeRFPlayer: A Streamable Dynamic Scene Representation with Decomposed Neural Radiance Fields](https://arxiv.org/abs/2210.15947) | [code]\n    > Visually exploring in a real-world 4D spatiotemporal space freely in VR has been a long-term quest. 
The task is especially appealing when only a few or even single RGB cameras are used for capturing the dynamic scene. To this end, we present an efficient framework capable of fast reconstruction, compact modeling, and streamable rendering. First, we propose to decompose the 4D spatiotemporal space according to temporal characteristics. Points in the 4D space are associated with probabilities of belonging to three categories: static, deforming, and new areas. Each area is represented and regularized by a separate neural field. Second, we propose a hybrid representations based feature streaming scheme for efficiently modeling the neural fields. Our approach, coined NeRFPlayer, is evaluated on dynamic scenes captured by single hand-held cameras and multi-camera arrays, achieving comparable or superior rendering performance in terms of quality and speed comparable to recent state-of-the-art methods, achieving reconstruction in 10 seconds per frame and real-time rendering.\n  - [Vox-Fusion: Dense Tracking and Mapping with Voxel-based Neural Implicit Representation](https://arxiv.org/abs/2210.15858) | [***``[code]``***](https://github.com/zju3dv/Vox-Fusion)\n    > In this work, we present a dense tracking and mapping system named Vox-Fusion, which seamlessly fuses neural implicit representations with traditional volumetric fusion methods. Our approach is inspired by the recently developed implicit mapping and positioning system and further extends the idea so that it can be freely applied to practical scenarios. Specifically, we leverage a voxel-based neural implicit surface representation to encode and optimize the scene inside each voxel. Furthermore, we adopt an octree-based structure to divide the scene and support dynamic expansion, enabling our system to track and map arbitrary scenes without knowing the environment like in previous works. 
Moreover, we proposed a high-performance multi-process framework to speed up the method, thus supporting some applications that require real-time performance. The evaluation results show that our methods can achieve better accuracy and completeness than previous methods. We also show that our Vox-Fusion can be used in augmented reality and virtual reality applications. Our source code is publicly available at this https URL.\n  - [Boosting Point Clouds Rendering via Radiance Mapping](https://arxiv.org/abs/2210.15107) | [code]\n    > Recent years we have witnessed rapid development in NeRF-based image rendering due to its high quality. However, point clouds rendering is somehow less explored. Compared to NeRF-based rendering which suffers from dense spatial sampling, point clouds rendering is naturally less computation intensive, which enables its deployment in mobile computing device. In this work, we focus on boosting the image quality of point clouds rendering with a compact model design. We first analyze the adaption of the volume rendering formulation on point clouds. Based on the analysis, we simplify the NeRF representation to a spatial mapping function which only requires single evaluation per pixel. Further, motivated by ray marching, we rectify the noisy raw point clouds to the estimated intersection between rays and surfaces as queried coordinates, which could avoid spatial frequency collapse and neighbor point disturbance. Composed of rasterization, spatial mapping and the refinement stages, our method achieves the state-of-the-art performance on point clouds rendering, outperforming prior works by notable margins, with a smaller model size. We obtain a PSNR of 31.74 on NeRF-Synthetic, 25.88 on ScanNet and 30.81 on DTU. 
Code and data would be released soon.\n  - [Streaming Radiance Fields for 3D Video Synthesis, NeurIPS2022](https://arxiv.org/abs/2210.14831) | [code]\n    > We present an explicit-grid based method for efficiently reconstructing streaming radiance fields for novel view synthesis of real world dynamic scenes. Instead of training a single model that combines all the frames, we formulate the dynamic modeling problem with an incremental learning paradigm in which per-frame model difference is trained to complement the adaption of a base model on the current frame. By exploiting the simple yet effective tuning strategy with narrow bands, the proposed method realizes a feasible framework for handling video sequences on-the-fly with high training efficiency. The storage overhead induced by using explicit grid representations can be significantly reduced through the use of model difference based compression. We also introduce an efficient strategy to further accelerate model optimization for each frame. Experiments on challenging video sequences demonstrate that our approach is capable of achieving a training speed of 15 seconds per-frame with competitive rendering quality, which attains 1000× speedup over the state-of-the-art implicit methods. Code is available at this https URL.\n  - [EpipolarNVS: leveraging on Epipolar geometry for single-image Novel View Synthesis, BMVC2022](https://arxiv.org/abs/2210.13077) | [code]\n    > Novel-view synthesis (NVS) can be tackled through different approaches, depending on the general setting: a single source image to a short video sequence, exact or noisy camera pose information, 3D-based information such as point clouds etc. The most challenging scenario, the one where we stand in this work, only considers a unique source image to generate a novel one from another viewpoint. However, in such a tricky situation, the latest learning-based solutions often struggle to integrate the camera viewpoint transformation. 
Indeed, the extrinsic information is often passed as-is, through a low-dimensional vector. It might even occur that such a camera pose, when parametrized as Euler angles, is quantized through a one-hot representation. This vanilla encoding choice prevents the learnt architecture from inferring novel views on a continuous basis (from a camera pose perspective). We claim it exists an elegant way to better encode relative camera pose, by leveraging 3D-related concepts such as the epipolar constraint. We, therefore, introduce an innovative method that encodes the viewpoint transformation as a 2D feature image. Such a camera encoding strategy gives meaningful insights to the network regarding how the camera has moved in space between the two views. By encoding the camera pose information as a finite number of coloured epipolar lines, we demonstrate through our experiments that our strategy outperforms vanilla encoding.\n  - [NeRF-SLAM: Real-Time Dense Monocular SLAM with Neural Radiance Fields](https://arxiv.org/abs/2210.13641) | [code]\n    > We propose a novel geometric and photometric 3D mapping pipeline for accurate and real-time scene reconstruction from monocular images. To achieve this, we leverage recent advances in dense monocular SLAM and real-time hierarchical volumetric neural radiance fields. Our insight is that dense monocular SLAM provides the right information to fit a neural radiance field of the scene in real-time, by providing accurate pose estimates and depth-maps with associated uncertainty. With our proposed uncertainty-based depth loss, we achieve not only good photometric accuracy, but also great geometric accuracy. 
In fact, our proposed pipeline achieves better geometric and photometric accuracy than competing approaches (up to 179% better PSNR and 86% better L1 depth), while working in real-time and using only monocular images.\n  - [Compressing Explicit Voxel Grid Representations: fast NeRFs become also small](https://arxiv.org/abs/2210.12782) | [code]\n    > NeRFs have revolutionized the world of per-scene radiance field reconstruction because of their intrinsic compactness. One of the main limitations of NeRFs is their slow rendering speed, both at training and inference time. Recent research focuses on the optimization of an explicit voxel grid (EVG) that represents the scene, which can be paired with neural networks to learn radiance fields. This approach significantly enhances the speed both at train and inference time, but at the cost of large memory occupation. In this work we propose Re:NeRF, an approach that specifically targets EVG-NeRFs compressibility, aiming to reduce memory storage of NeRF models while maintaining comparable performance. We benchmark our approach with three different EVG-NeRF architectures on four popular benchmarks, showing Re:NeRF's broad usability and effectiveness.\n## Oct16 - Oct22, 2022\n  - [Compressing multidimensional weather and climate data into neural networks](https://arxiv.org/abs/2210.12538) | [code]\n    > Weather and climate simulations produce petabytes of high-resolution data that are later analyzed by researchers in order to understand climate change or severe weather. We propose a new method of compressing this multidimensional weather and climate data: a coordinate-based neural network is trained to overfit the data, and the resulting parameters are taken as a compact representation of the original grid-based data. While compression ratios range from 300x to more than 3,000x, our method outperforms the state-of-the-art compressor SZ3 in terms of weighted RMSE, MAE. 
It can faithfully preserve important large scale atmosphere structures and does not introduce artifacts. When using the resulting neural network as a 790x compressed dataloader to train the WeatherBench forecasting model, its RMSE increases by less than 2%. The three orders of magnitude compression democratizes access to high-resolution climate data and enables numerous new research directions.\n  - [NeARportation: A Remote Real-time Neural Rendering Framework, VRST22](https://arxiv.org/abs/2210.12398) | [code]\n    > While the presentation of photo-realistic appearance plays a major role in immersion in an augmented virtuality environment, displaying the photo-realistic appearance of real objects remains a challenging problem. Recent developments in photogrammetry have facilitated the incorporation of real objects into virtual space. However, photo-realistic photogrammetry requires a dedicated measurement environment, and there is a trade-off between measurement cost and quality. Furthermore, even with photo-realistic appearance measurements, there is a trade-off between rendering quality and framerate. There is no framework that could resolve these trade-offs and easily provide a photo-realistic appearance in real-time. Our NeARportation framework combines server-client bidirectional communication and neural rendering to resolve these trade-offs. Neural rendering on the server receives the client's head posture and generates a novel-view image with realistic appearance reproduction, which is streamed onto the client's display. 
By applying our framework to a stereoscopic display, we confirmed that it could display a high-fidelity appearance on full-HD stereo videos at 35-40 frames-per-second (fps), according to the user's head motion.\n  - [Neural Sound Field Decomposition with Super-resolution of Sound Direction](https://arxiv.org/abs/2210.12345) | [code]\n    > Sound field decomposition predicts waveforms in arbitrary directions using signals from a limited number of microphones as inputs. Sound field decomposition is fundamental to downstream tasks, including source localization, source separation, and spatial audio reproduction. Conventional sound field decomposition methods such as Ambisonics have limited spatial decomposition resolution. This paper proposes a learning-based Neural Sound field Decomposition (NeSD) framework to allow sound field decomposition with fine spatial direction resolution, using recordings from microphone capsules of a few microphones at arbitrary positions. The inputs of a NeSD system include microphone signals, microphone positions, and queried directions. The outputs of a NeSD include the waveform and the presence probability of a queried position. We model the NeSD systems respectively with different neural networks, including fully connected, time delay, and recurrent neural networks. We show that the NeSD systems outperform conventional Ambisonics and DOANet methods in sound field decomposition and source localization on speech, music, and sound events datasets. Demos are available at this https URL.\n  - [An Exploration of Neural Radiance Field Scene Reconstruction: Synthetic, Real-world and Dynamic Scenes](https://arxiv.org/abs/2210.12268) | [code]\n    > This project presents an exploration into 3D scene reconstruction of synthetic and real-world scenes using Neural Radiance Field (NeRF) approaches. 
We primarily take advantage of the reduction in training and rendering time of neural graphic primitives multi-resolution hash encoding, to reconstruct static video game scenes and real-world scenes, comparing and observing reconstruction detail and limitations. Additionally, we explore dynamic scene reconstruction using Neural Radiance Fields for Dynamic Scenes(D-NeRF). Finally, we extend the implementation of D-NeRF, originally constrained to handle synthetic scenes to also handle real-world dynamic scenes.\n  - [Generative Range Imaging for Learning Scene Priors of 3D LiDAR Data, WACV2023](https://arxiv.org/abs/2210.11750) | [code]\n    > 3D LiDAR sensors are indispensable for the robust vision of autonomous mobile robots. However, deploying LiDAR-based perception algorithms often fails due to a domain gap from the training environment, such as inconsistent angular resolution and missing properties. Existing studies have tackled the issue by learning inter-domain mapping, while the transferability is constrained by the training configuration and the training is susceptible to peculiar lossy noises called ray-drop. To address the issue, this paper proposes a generative model of LiDAR range images applicable to the data-level domain transfer. Motivated by the fact that LiDAR measurement is based on point-by-point range imaging, we train an implicit image representation-based generative adversarial networks along with a differentiable ray-drop effect. We demonstrate the fidelity and diversity of our model in comparison with the point-based and image-based state-of-the-art generative models. We also showcase upsampling and restoration applications. Furthermore, we introduce a Sim2Real application for LiDAR semantic segmentation. 
We demonstrate that our method is effective as a realistic ray-drop simulator and outperforms state-of-the-art methods.\n  - [HDHumans: A Hybrid Approach for High-fidelity Digital Humans](https://arxiv.org/abs/2210.12003) | [code]\n    > Photo-real digital human avatars are of enormous importance in graphics, as they enable immersive communication over the globe, improve gaming and entertainment experiences, and can be particularly beneficial for AR and VR settings. However, current avatar generation approaches either fall short in high-fidelity novel view synthesis, generalization to novel motions, reproduction of loose clothing, or they cannot render characters at the high resolution offered by modern displays. To this end, we propose HDHumans, which is the first method for HD human character synthesis that jointly produces an accurate and temporally coherent 3D deforming surface and highly photo-realistic images of arbitrary novel views and of motions not seen at training time. At the technical core, our method tightly integrates a classical deforming character template with neural radiance fields (NeRF). Our method is carefully designed to achieve a synergy between classical surface deformation and NeRF. First, the template guides the NeRF, which allows synthesizing novel views of a highly dynamic and articulated character and even enables the synthesis of novel motions. Second, we also leverage the dense pointclouds resulting from NeRF to further improve the deforming surface via 3D-to-3D supervision. We outperform the state of the art quantitatively and qualitatively in terms of synthesis quality and resolution, as well as the quality of 3D surface reconstruction.\n  - [High-Quality RGB-D Reconstruction via Multi-View Uncalibrated Photometric Stereo and Gradient-SDF, WACV2023](https://arxiv.org/abs/2210.12202) | [code]\n    > Fine-detailed reconstructions are in high demand in many applications. 
However, most of the existing RGB-D reconstruction methods rely on pre-calculated accurate camera poses to recover the detailed surface geometry, where the representation of a surface needs to be adapted when optimizing different quantities. In this paper, we present a novel multi-view RGB-D based reconstruction method that tackles camera pose, lighting, albedo, and surface normal estimation via the utilization of a gradient signed distance field (gradient-SDF). The proposed method formulates the image rendering process using specific physically-based model(s) and optimizes the surface's quantities on the actual surface using its volumetric representation, as opposed to other works which estimate surface quantities only near the actual surface. To validate our method, we investigate two physically-based image formation models for natural light and point light source applications. The experimental results on synthetic and real-world datasets demonstrate that the proposed method can recover high-quality geometry of the surface more faithfully than the state-of-the-art and further improves the accuracy of estimated camera poses.\n  - [Neural Fields for Robotic Object Manipulation from a Single Image, ICRA2023](https://arxiv.org/abs/2210.12126) | [code]\n    > We present a unified and compact representation for object rendering, 3D reconstruction, and grasp pose prediction that can be inferred from a single image within a few seconds. We achieve this by leveraging recent advances in the Neural Radiance Field (NeRF) literature that learn category-level priors and fine-tune on novel objects with minimal data and time. Our insight is that we can learn a compact shape representation and extract meaningful additional information from it, such as grasping poses. We believe this to be the first work to retrieve grasping poses directly from a NeRF-based representation using a single viewpoint (RGB-only), rather than going through a secondary network and/or representation. 
When compared to prior art, our method is two to three orders of magnitude smaller while achieving comparable performance at view reconstruction and grasping. Accompanying our method, we also propose a new dataset of rendered shoes for training a sim-2-real NeRF method with grasping poses for different widths of grippers.\n  - [TANGO: Text-driven Photorealistic and Robust 3D Stylization via Lighting Decomposition, NeurIPS2022](https://arxiv.org/abs/2210.11277) | [***``[code]``***](https://cyw-3d.github.io/tango/)\n    > Creation of 3D content by stylization is a promising yet challenging problem in computer vision and graphics research. In this work, we focus on stylizing photorealistic appearance renderings of a given surface mesh of arbitrary topology. Motivated by the recent surge of cross-modal supervision of the Contrastive Language-Image Pre-training (CLIP) model, we propose TANGO, which transfers the appearance style of a given 3D shape according to a text prompt in a photorealistic manner. Technically, we propose to disentangle the appearance style as the spatially varying bidirectional reflectance distribution function, the local geometric variation, and the lighting condition, which are jointly optimized, via supervision of the CLIP loss, by a spherical Gaussians based differentiable renderer. As such, TANGO enables photorealistic 3D style transfer by automatically predicting reflectance effects even for bare, low-quality meshes, without training on a task-specific dataset. Extensive experiments show that TANGO outperforms existing methods of text-driven 3D style transfer in terms of photorealistic quality, consistency of 3D geometry, and robustness when stylizing low-quality meshes. 
Our codes and results are available at our project webpage this https URL.\n  - [Coordinates Are NOT Lonely -- Codebook Prior Helps Implicit Neural 3D Representations, NeurIPS2022](https://arxiv.org/abs/2210.11170) | [code]\n    > Implicit neural 3D representation has achieved impressive results in surface or scene reconstruction and novel view synthesis, which typically uses the coordinate-based multi-layer perceptrons (MLPs) to learn a continuous scene representation. However, existing approaches, such as Neural Radiance Field (NeRF) and its variants, usually require dense input views (i.e. 50-150) to obtain decent results. To relieve the over-dependence on massive calibrated images and enrich the coordinate-based feature representation, we explore injecting the prior information into the coordinate-based network and introduce a novel coordinate-based model, CoCo-INR, for implicit neural 3D representation. The cores of our method are two attention modules: codebook attention and coordinate attention. The former extracts the useful prototypes containing rich geometry and appearance information from the prior codebook, and the latter propagates such prior information into each coordinate and enriches its feature representation for a scene or object surface. With the help of the prior information, our method can render 3D views with more photo-realistic appearance and geometries than the current methods using fewer calibrated images available. Experiments on various scene reconstruction datasets, including DTU and BlendedMVS, and the full 3D head reconstruction dataset, H3DS, demonstrate the robustness under fewer input views and fine detail-preserving capability of our proposed method.\n  - [Parallel Inversion of Neural Radiance Fields for Robust Pose Estimation, ICRA2023](https://arxiv.org/abs/2210.10108) | [code]\n    > We present a parallelized optimization method based on fast Neural Radiance Fields (NeRF) for estimating 6-DoF target poses. 
Given a single observed RGB image of the target, we can predict the translation and rotation of the camera by minimizing the residual between pixels rendered from a fast NeRF model and pixels in the observed image. We integrate a momentum-based camera extrinsic optimization procedure into Instant Neural Graphics Primitives, a recent exceptionally fast NeRF implementation. By introducing parallel Monte Carlo sampling into the pose estimation task, our method overcomes local minima and improves efficiency in a more extensive search space. We also show the importance of adopting a more robust pixel-based loss function to reduce error. Experiments demonstrate that our method can achieve improved generalization and robustness on both synthetic and real-world benchmarks.\n  - [Neural Contact Fields: Tracking Extrinsic Contact with Tactile Sensing](https://arxiv.org/abs/2210.09297) | [code]\n    > We present Neural Contact Fields, a method that brings together neural fields and tactile sensing to address the problem of tracking extrinsic contact between object and environment. Knowing where the external contact occurs is a first step towards methods that can actively control it in facilitating downstream manipulation tasks. Prior work for localizing environmental contacts typically assume a contact type (e.g. point or line), does not capture contact/no-contact transitions, and only works with basic geometric-shaped objects. Neural Contact Fields are the first method that can track arbitrary multi-modal extrinsic contacts without making any assumptions about the contact type. Our key insight is to estimate the probability of contact for any 3D point in the latent space of object shapes, given vision-based tactile inputs that sense the local motion resulting from the external contact. 
In experiments, we find that Neural Contact Fields are able to localize multiple contact patches without making any assumptions about the geometry of the contact, and capture contact/no-contact transitions for known categories of objects with unseen shapes in unseen environment configurations. In addition to Neural Contact Fields, we also release our YCB-Extrinsic-Contact dataset of simulated extrinsic contact interactions to enable further research in this area. Project repository: this https URL\n  - [S3-NeRF: Neural Reflectance Field from Shading and Shadow under a Single Viewpoint, NeurIPS2022](https://arxiv.org/abs/2210.08936) | [***``[code]``***](https://github.com/ywq/s3nerf)\n    > In this paper, we address the \"dual problem\" of multi-view scene reconstruction in which we utilize single-view images captured under different point lights to learn a neural scene representation. Different from existing single-view methods which can only recover a 2.5D scene representation (i.e., a normal / depth map for the visible surface), our method learns a neural reflectance field to represent the 3D geometry and BRDFs of a scene. Instead of relying on multi-view photo-consistency, our method exploits two information-rich monocular cues, namely shading and shadow, to infer scene geometry. Experiments on multiple challenging datasets show that our method is capable of recovering 3D geometry, including both visible and invisible parts, of a scene from single-view images. Thanks to the neural reflectance field representation, our method is robust to depth discontinuities. It supports applications like novel-view synthesis and relighting. 
Our code and model can be found at this https URL.\n  - [Differentiable Physics Simulation of Dynamics-Augmented Neural Objects](https://arxiv.org/abs/2210.09420) | [code]\n    > We present a differentiable pipeline for simulating the motion of objects that represent their geometry as a continuous density field parameterized as a deep network. This includes Neural Radiance Fields (NeRFs), and other related models. From the density field, we estimate the dynamical properties of the object, including its mass, center of mass, and inertia matrix. We then introduce a differentiable contact model based on the density field for computing normal and friction forces resulting from collisions. This allows a robot to autonomously build object models that are visually and dynamically accurate from still images and videos of objects in motion. The resulting Dynamics-Augmented Neural Objects (DANOs) are simulated with an existing differentiable simulation engine, Dojo, interacting with other standard simulation objects, such as spheres, planes, and robots specified as URDFs. A robot can use this simulation to optimize grasps and manipulation trajectories of neural objects, or to improve the neural object models through gradient-based real-to-simulation transfer. We demonstrate the pipeline to learn the coefficient of friction of a bar of soap from a real video of the soap sliding on a table. We also learn the coefficient of friction and mass of a Stanford bunny through interactions with a Panda robot arm from synthetic data, and we optimize trajectories in simulation for the Panda arm to push the bunny to a goal location.\n## Oct9 - Oct15, 2022\n  - [LB-NERF: Light Bending Neural Radiance Fields for Transparent Medium, ICIP2022](https://ieeexplore.ieee.org/abstract/document/9897642) | [code]\n    > Neural radiance fields (NeRFs) have been proposed as methods of novel view synthesis and have been used to address various problems because of its versatility. 
NeRF can represent colors and densities in 3D space using neural rendering assuming a straight light path. However, a medium with a different refractive index in the scene, such as a transparent medium, causes light refraction and breaks the assumption of the straight path of light. Therefore, the NeRFs cannot be learned consistently across multi-view images. To solve this problem, this study proposes a method to learn consistent radiance fields across multiple viewpoints by introducing the light refraction effect as an offset from the straight line originating from the camera center. The experimental results quantitatively and qualitatively verified that our method can interpolate viewpoints better than the conventional NeRF method when considering the refraction of transparent objects.\n  - [IBL-NeRF: Image-Based Lighting Formulation of Neural Radiance Fields](https://arxiv.org/abs/2210.08202) | [code]\n    > We propose IBL-NeRF, which decomposes the neural radiance fields (NeRF) of large-scale indoor scenes into intrinsic components. Previous approaches for the inverse rendering of NeRF transform the implicit volume to fit the rendering pipeline of explicit geometry, and approximate the views of segmented, isolated objects with environment lighting. In contrast, our inverse rendering extends the original NeRF formulation to capture the spatial variation of lighting within the scene volume, in addition to surface properties. Specifically, the scenes of diverse materials are decomposed into intrinsic components for image-based rendering, namely, albedo, roughness, surface normal, irradiance, and prefiltered radiance. All of the components are inferred as neural images from MLP, which can model large-scale general scenes. By adopting the image-based formulation of NeRF, our approach inherits superior visual quality and multi-view consistency for synthesized images. 
We demonstrate the performance on scenes with complex object layouts and light configurations, which could not be processed in any of the previous works.\n  - [ExAug: Robot-Conditioned Navigation Policies via Geometric Experience Augmentation](https://arxiv.org/abs/2210.07450) | [code]\n    > Machine learning techniques rely on large and diverse datasets for generalization. Computer vision, natural language processing, and other applications can often reuse public datasets to train many different models. However, due to differences in physical configurations, it is challenging to leverage public datasets for training robotic control policies on new robot platforms or for new tasks. In this work, we propose a novel framework, ExAug to augment the experiences of different robot platforms from multiple datasets in diverse environments. ExAug leverages a simple principle: by extracting 3D information in the form of a point cloud, we can create much more complex and structured augmentations, utilizing both generating synthetic images and geometric-aware penalization that would have been suitable in the same situation for a different robot, with different size, turning radius, and camera placement. The trained policy is evaluated on two new robot platforms with three different cameras in indoor and outdoor environments with obstacles.\n  - [Lightweight Stepless Super-Resolution of Remote Sensing Images via Saliency-Aware Dynamic Routing Strategy](https://arxiv.org/abs/2210.07598) | [***``[code]``***](https://github.com/hanlinwu/SalDRN)\n    > Deep learning-based algorithms have greatly improved the performance of remote sensing image (RSI) super-resolution (SR). However, increasing network depth and parameters cause a huge burden of computing and storage. Directly reducing the depth or width of existing models results in a large performance drop. 
We observe that the SR difficulty of different regions in an RSI varies greatly, and existing methods use the same deep network to process all regions in an image, resulting in a waste of computing resources. In addition, existing SR methods generally predefine integer scale factors and cannot perform stepless SR, i.e., a single model can deal with any potential scale factor. Retraining the model on each scale factor wastes considerable computing resources and model storage space. To address the above problems, we propose a saliency-aware dynamic routing network (SalDRN) for lightweight and stepless SR of RSIs. First, we introduce visual saliency as an indicator of region-level SR difficulty and integrate a lightweight saliency detector into the SalDRN to capture pixel-level visual characteristics. Then, we devise a saliency-aware dynamic routing strategy that employs path selection switches to adaptively select feature extraction paths of appropriate depth according to the SR difficulty of sub-image patches. Finally, we propose a novel lightweight stepless upsampling module whose core is an implicit feature function for realizing mapping from low-resolution feature space to high-resolution feature space. Comprehensive experiments verify that the SalDRN can achieve a good trade-off between performance and complexity. The code is available at \\url{this https URL}.\n  - [NOCaL: Calibration-Free Semi-Supervised Learning of Odometry and Camera Intrinsics](https://arxiv.org/abs/2210.07435) | [code]\n    > There are a multitude of emerging imaging technologies that could benefit robotics. However the need for bespoke models, calibration and low-level processing represents a key barrier to their adoption. In this work we present NOCaL, Neural odometry and Calibration using Light fields, a semi-supervised learning architecture capable of interpreting previously unseen cameras without calibration. 
NOCaL learns to estimate camera parameters, relative pose, and scene appearance. It employs a scene-rendering hypernetwork pretrained on a large number of existing cameras and scenes, and adapts to previously unseen cameras using a small supervised training set to enforce metric scale. We demonstrate NOCaL on rendered and captured imagery using conventional cameras, demonstrating calibration-free odometry and novel view synthesis. This work represents a key step toward automating the interpretation of general camera geometries and emerging imaging technologies.\n  - [Multi-View Photometric Stereo Revisited, WACV2023](https://arxiv.org/abs/2210.07670) | [code]\n    > Multi-view photometric stereo (MVPS) is a preferred method for detailed and precise 3D acquisition of an object from images. Although popular methods for MVPS can provide outstanding results, they are often complex to execute and limited to isotropic material objects. To address such limitations, we present a simple, practical approach to MVPS, which works well for isotropic as well as other object material types such as anisotropic and glossy. The proposed approach in this paper exploits the benefit of uncertainty modeling in a deep neural network for a reliable fusion of photometric stereo (PS) and multi-view stereo (MVS) network predictions. Yet, contrary to the recently proposed state-of-the-art, we introduce neural volume rendering methodology for a trustworthy fusion of MVS and PS measurements. The advantage of introducing neural volume rendering is that it helps in the reliable modeling of objects with diverse material types, where existing MVS methods, PS methods, or both may fail. Furthermore, it allows us to work on neural 3D shape representation, which has recently shown outstanding results for many geometric processing tasks. 
Our suggested new loss function aims to fit the zero level set of the implicit neural function using the most certain MVS and PS network predictions coupled with weighted neural volume rendering cost. The proposed approach shows state-of-the-art results when tested extensively on several benchmark datasets.\n  - [Controllable Style Transfer via Test-time Training of Implicit Neural Representation](https://arxiv.org/abs/2210.07762) | [code]\n    > We propose a controllable style transfer framework based on Implicit Neural Representation that pixel-wisely controls the stylized output via test-time training. Unlike traditional image optimization methods that often suffer from unstable convergence and learning-based methods that require intensive training and have limited generalization ability, we present a model optimization framework that optimizes the neural networks during test-time with explicit loss functions for style transfer. After being test-time trained once, thanks to the flexibility of the INR-based model, our framework can precisely control the stylized images in a pixel-wise manner and freely adjust image resolution without further optimization or training. We demonstrate several applications.\n  - [Scalable Neural Video Representations with Learnable Positional Features, NeurIPS2022](https://arxiv.org/abs/2210.06823) | [***``[code]``***](https://github.com/subin-kim-cv/NVP)\n    > Succinct representation of complex signals using coordinate-based neural representations (CNRs) has seen great progress, and several recent efforts focus on extending them for handling videos. Here, the main challenge is how to (a) alleviate a compute-inefficiency in training CNRs to (b) achieve high-quality video encoding while (c) maintaining the parameter-efficiency. 
To meet all requirements (a), (b), and (c) simultaneously, we propose neural video representations with learnable positional features (NVP), a novel CNR by introducing \"learnable positional features\" that effectively amortize a video as latent codes. Specifically, we first present a CNR architecture based on designing 2D latent keyframes to learn the common video contents across each spatio-temporal axis, which dramatically improves all of those three requirements. Then, we propose to utilize existing powerful image and video codecs as a compute-/memory-efficient compression procedure of latent codes. We demonstrate the superiority of NVP on the popular UVG benchmark; compared with prior arts, NVP not only trains 2 times faster (less than 5 minutes) but also exceeds their encoding quality as 34.07→34.57 (measured with the PSNR metric), even using >8 times fewer parameters. We also show intriguing properties of NVP, e.g., video inpainting, video frame interpolation, etc.\n  - [NeuralRoom: Geometry-Constrained Neural Implicit Surfaces for Indoor Scene Reconstruction](https://arxiv.org/abs/2210.06853) | [code]\n    > We present a novel neural surface reconstruction method called NeuralRoom for reconstructing room-sized indoor scenes directly from a set of 2D images. Recently, implicit neural representations have become a promising way to reconstruct surfaces from multiview images due to their high-quality results and simplicity. However, implicit neural representations usually cannot reconstruct indoor scenes well because they suffer severe shape-radiance ambiguity. We assume that the indoor scene consists of texture-rich and flat texture-less regions. In texture-rich regions, the multiview stereo can obtain accurate results. In the flat area, normal estimation networks usually obtain a good normal estimation. 
Based on the above observations, we reduce the possible spatial variation range of implicit neural surfaces by reliable geometric priors to alleviate shape-radiance ambiguity. Specifically, we use multiview stereo results to limit the NeuralRoom optimization space and then use reliable geometric priors to guide NeuralRoom training. Then the NeuralRoom would produce a neural scene representation that can render an image consistent with the input training images. In addition, we propose a smoothing method called perturbation-residual restrictions to improve the accuracy and completeness of the flat region, which assumes that the sampling points in a local surface should have the same normal and similar distance to the observation center. Experiments on the ScanNet dataset show that our method can reconstruct the texture-less area of indoor scenes while maintaining the accuracy of detail. We also apply NeuralRoom to more advanced multiview reconstruction algorithms and significantly improve their reconstruction quality.\n  - [CUF: Continuous Upsampling Filters](https://arxiv.org/abs/2210.06965) | [code]\n    > Neural fields have rapidly been adopted for representing 3D signals, but their application to more classical 2D image-processing has been relatively limited. In this paper, we consider one of the most important operations in image processing: upsampling. In deep learning, learnable upsampling layers have extensively been used for single image super-resolution. We propose to parameterize upsampling kernels as neural fields. This parameterization leads to a compact architecture that obtains a 40-fold reduction in the number of parameters when compared with competing arbitrary-scale super-resolution architectures. When upsampling images of size 256x256 we show that our architecture is 2x-10x more efficient than competing arbitrary-scale super-resolution architectures, and more efficient than sub-pixel convolutions when instantiated to a single-scale model. 
In the general setting, these gains grow polynomially with the square of the target scale. We validate our method on standard benchmarks showing such efficiency gains can be achieved without sacrifices in super-resolution performance.\n  - [GeoAug: Data Augmentation for Few-Shot NeRF with Geometry Constraints, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19790-1_20) | [code]\n    > Neural Radiance Fields (NeRF) show remarkable ability to render novel views of a certain scene by learning an implicit volumetric representation with only posed RGB images. Despite its impressiveness and simplicity, NeRF usually converges to sub-optimal solutions with incorrect geometries given few training images. We hereby present GeoAug: a data augmentation method for NeRF, which enriches training data based on multi-view geometric constraint. GeoAug provides random artificial (novel pose, RGB image) pairs for training, where the RGB image is from a nearby training view. The rendering of a novel pose is warped to the nearby training view with depth map and relative pose to match the RGB image supervision. Our method reduces the risk of over-fitting by introducing more data during training, while also providing additional implicit supervision for depth maps. In experiments, our method significantly boosts the performance of neural radiance fields conditioned on few training views.\n  - [Photo-realistic Neural Domain Randomization, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19806-9_18) | [code]\n    > Synthetic data is a scalable alternative to manual supervision, but it requires overcoming the sim-to-real domain gap. This discrepancy between virtual and real worlds is addressed by two seemingly opposed approaches: improving the realism of simulation or foregoing realism entirely via domain randomization. 
In this paper, we show that the recent progress in neural rendering enables a new unified approach we call Photo-realistic Neural Domain Randomization (PNDR). We propose to learn a composition of neural networks that acts as a physics-based ray tracer generating high-quality renderings from scene geometry alone. Our approach is modular, composed of different neural networks for materials, lighting, and rendering, thus enabling randomization of different key image generation components in a differentiable pipeline. Once trained, our method can be combined with other methods and used to generate photo-realistic image augmentations online and significantly more efficiently than via traditional ray-tracing. We demonstrate the usefulness of PNDR through two downstream tasks: 6D object detection and monocular depth estimation. Our experiments show that training with PNDR enables generalization to novel scenes and significantly outperforms the state of the art in terms of real-world transfer.\n  - [AniFaceGAN: Animatable 3D-Aware Face Image Generation for Video Avatars, NeurIPS2022](https://arxiv.org/abs/2210.06465) | [***``[code]``***](https://yuewuhkust.github.io/AniFaceGAN/files/github_icon.jpeg)\n    > Although 2D generative models have made great progress in face image generation and animation, they often suffer from undesirable artifacts such as 3D inconsistency when rendering images from different camera viewpoints. This prevents them from synthesizing video animations indistinguishable from real ones. Recently, 3D-aware GANs extend 2D GANs for explicit disentanglement of camera pose by leveraging 3D scene representations. These methods can well preserve the 3D consistency of the generated images across different views, yet they cannot achieve fine-grained control over other attributes, among which facial expression control is arguably the most useful and desirable for face animation. 
In this paper, we propose an animatable 3D-aware GAN for multiview consistent face animation generation. The key idea is to decompose the 3D representation of the 3D-aware GAN into a template field and a deformation field, where the former represents different identities with a canonical expression, and the latter characterizes expression variations of each identity. To achieve meaningful control over facial expressions via deformation, we propose a 3D-level imitative learning scheme between the generator and a parametric 3D face model during adversarial training of the 3D-aware GAN. This helps our method achieve high-quality animatable face image generation with strong visual 3D consistency, even though trained with only unstructured 2D images. Extensive experiments demonstrate our superior performance over prior works. Project page: this https URL\n  - [Reconstructing Personalized Semantic Facial NeRF Models From Monocular Video, SIGGRAPH-Asia2022](https://arxiv.org/abs/2210.06108) | [***``[code]``***](https://github.com/USTC3DV/NeRFBlendShape-code)\n    > We present a novel semantic model for human head defined with neural radiance field. The 3D-consistent head model consists of a set of disentangled and interpretable bases, and can be driven by low-dimensional expression coefficients. Thanks to the powerful representation ability of neural radiance field, the constructed model can represent complex facial attributes including hair, wearings, which can not be represented by traditional mesh blendshape. To construct the personalized semantic facial model, we propose to define the bases as several multi-level voxel fields. With a short monocular RGB video as input, our method can construct the subject's semantic facial NeRF model with only ten to twenty minutes, and can render a photo-realistic human head image in tens of milliseconds with a given expression coefficient and view direction. 
With this novel representation, we apply it to many tasks like facial retargeting and expression editing. Experimental results demonstrate its strong representation ability and training/inference speed. Demo videos and released code are provided in our project page: this https URL\n  - [LION: Latent Point Diffusion Models for 3D Shape Generation, NeurIPS2022](https://arxiv.org/abs/2210.06978) | [***``[code]``***](https://nv-tlabs.github.io/LION)\n    > Denoising diffusion models (DDMs) have shown promising results in 3D point cloud synthesis. To advance 3D DDMs and make them useful for digital artists, we require (i) high generation quality, (ii) flexibility for manipulation and applications such as conditional synthesis and shape interpolation, and (iii) the ability to output smooth surfaces or meshes. To this end, we introduce the hierarchical Latent Point Diffusion Model (LION) for 3D shape generation. LION is set up as a variational autoencoder (VAE) with a hierarchical latent space that combines a global shape latent representation with a point-structured latent space. For generation, we train two hierarchical DDMs in these latent spaces. The hierarchical VAE approach boosts performance compared to DDMs that operate on point clouds directly, while the point-structured latents are still ideally suited for DDM-based modeling. Experimentally, LION achieves state-of-the-art generation performance on multiple ShapeNet benchmarks. Furthermore, our VAE framework allows us to easily use LION for different relevant tasks: LION excels at multimodal shape denoising and voxel-conditioned synthesis, and it can be adapted for text- and image-driven 3D generation. We also demonstrate shape autoencoding and latent shape interpolation, and we augment LION with modern surface reconstruction techniques to generate smooth 3D meshes. 
We hope that LION provides a powerful tool for artists working with 3D shapes due to its high-quality generation, flexibility, and surface reconstruction. Project page and code: this https URL.\n  - [GraspNeRF: Multiview-based 6-DoF Grasp Detection for Transparent and Specular Objects Using Generalizable NeRF](https://arxiv.org/abs/2210.06575) | [code]\n    > In this work, we tackle 6-DoF grasp detection for transparent and specular objects, which is an important yet challenging problem in vision-based robotic systems, due to the failure of depth cameras in sensing their geometry. We, for the first time, propose a multiview RGB-based 6-DoF grasp detection network, GraspNeRF, that leverages the generalizable neural radiance field (NeRF) to achieve material-agnostic object grasping in clutter. Compared to the existing NeRF-based 3-DoF grasp detection methods that rely on densely captured input images and time-consuming per-scene optimization, our system can perform zero-shot NeRF construction with sparse RGB inputs and reliably detect 6-DoF grasps, both in real-time. The proposed framework jointly learns generalizable NeRF and grasp detection in an end-to-end manner, optimizing the scene representation construction for the grasping. For training data, we generate a large-scale photorealistic domain-randomized synthetic dataset of grasping in cluttered tabletop scenes that enables direct transfer to the real world. Our extensive experiments in synthetic and real-world environments demonstrate that our method significantly outperforms all the baselines in all the experiments while remaining in real-time.\n  - [X-NeRF: Explicit Neural Radiance Field for Multi-Scene 360∘ Insufficient RGB-D Views, WACV2023](https://arxiv.org/abs/2210.05135) | [***``[code]``***](https://github.com/HaoyiZhu/XNeRF)\n    > Neural Radiance Fields (NeRFs), despite their outstanding performance on novel view synthesis, often need dense input views. 
Many papers train one model for each scene respectively and few of them explore incorporating multi-modal data into this problem. In this paper, we focus on a rarely discussed but important setting: can we train one model that can represent multiple scenes, with 360∘ insufficient views and RGB-D images? We refer to insufficient views as few extremely sparse and almost non-overlapping views. To deal with it, X-NeRF, a fully explicit approach which learns a general scene completion process instead of a coordinate-based mapping, is proposed. Given a few insufficient RGB-D input views, X-NeRF first transforms them to a sparse point cloud tensor and then applies a 3D sparse generative Convolutional Neural Network (CNN) to complete it to an explicit radiance field whose volumetric rendering can be conducted fast without running networks during inference. To avoid overfitting, besides common rendering loss, we apply perceptual loss as well as view augmentation through random rotation on point clouds. The proposed methodology significantly outperforms previous implicit methods in our setting, indicating the great potential of the proposed problem and approach. Codes and data are available at this https URL.\n  - [Multi-Object Navigation with dynamically learned neural implicit representations](https://arxiv.org/abs/2210.05129) | [code]\n    > Understanding and mapping a new environment are core abilities of any autonomously navigating agent. While classical robotics usually estimates maps in a stand-alone manner with SLAM variants, which maintain a topological or metric representation, end-to-end learning of navigation keeps some form of memory in a neural network. Networks are typically imbued with inductive biases, which can range from vectorial representations to birds-eye metric tensors or topological structures. 
In this work, we propose to structure neural networks with two neural implicit representations, which are learned dynamically during each episode and map the content of the scene: (i) the Semantic Finder predicts the position of a previously seen queried object; (ii) the Occupancy and Exploration Implicit Representation encapsulates information about explored area and obstacles, and is queried with a novel global read mechanism which directly maps from function space to a usable embedding space. Both representations are leveraged by an agent trained with Reinforcement Learning (RL) and learned online during each episode. We evaluate the agent on Multi-Object Navigation and show the high impact of using neural implicit representations as a memory source.\n  - [CLIP-Fields: Weakly Supervised Semantic Fields for Robotic Memory](https://mahis.life/clip-fields/) | [code]\n    > We propose CLIP-Fields, an implicit scene model that can be trained with no direct human supervision. This model learns a mapping from spatial locations to semantic embedding vectors. The mapping can then be used for a variety of tasks, such as segmentation, instance identification, semantic search over space, and view localization. Most importantly, the mapping can be trained with supervision coming only from web-image and web-text trained models such as CLIP, Detic, and Sentence-BERT. When compared to baselines like Mask-RCNN, our method outperforms on few-shot instance identification or semantic segmentation on the HM3D dataset with only a fraction of the examples. Finally, we show that using CLIP-Fields as a scene memory, robots can perform semantic navigation in real-world environments. 
Our code and demonstrations are available here: https://mahis.life/clip-fields/\n  - [Neural Shape Deformation Priors, NeurIPS2022](https://arxiv.org/abs/2210.05616) | [code]\n    > We present Neural Shape Deformation Priors, a novel method for shape manipulation that predicts mesh deformations of non-rigid objects from user-provided handle movements. State-of-the-art methods cast this problem as an optimization task, where the input source mesh is iteratively deformed to minimize an objective function according to hand-crafted regularizers such as ARAP. In this work, we learn the deformation behavior based on the underlying geometric properties of a shape, while leveraging a large-scale dataset containing a diverse set of non-rigid deformations. Specifically, given a source mesh and desired target locations of handles that describe the partial surface deformation, we predict a continuous deformation field that is defined in 3D space to describe the space deformation. To this end, we introduce transformer-based deformation networks that represent a shape deformation as a composition of local surface deformations. It learns a set of local latent codes anchored in 3D space, from which we can learn a set of continuous deformation functions for local surfaces. Our method can be applied to challenging deformations and generalizes well to unseen deformations. We validate our approach in experiments using the DeformingThing4D dataset, and compare to both classic optimization-based and recent neural network-based methods.\n  - [Controllable Radiance Fields for Dynamic Face Synthesis, 3DV2022](https://arxiv.org/abs/2210.05825) | [code]\n    > Recent work on 3D-aware image synthesis has achieved compelling results using advances in neural rendering. However, 3D-aware synthesis of face dynamics hasn't received much attention. 
Here, we study how to explicitly control generative model synthesis of face dynamics exhibiting non-rigid motion (e.g., facial expression change), while simultaneously ensuring 3D-awareness. For this we propose a Controllable Radiance Field (CoRF): 1) Motion control is achieved by embedding motion features within the layered latent motion space of a style-based generator; 2) To ensure consistency of background, motion features and subject-specific attributes such as lighting, texture, shapes, albedo, and identity, a face parsing net, a head regressor and an identity encoder are incorporated. On head image/video data we show that CoRFs are 3D-aware while enabling editing of identity, viewing directions, and motion.\n  - [Continuous conditional video synthesis by neural processes](https://arxiv.org/abs/2210.05810) | [***``[code]``***](https://github.com/NPVS/NPVS)\n    > We propose a unified model for multiple conditional video synthesis tasks, including video prediction and video frame interpolation. We show that conditional video synthesis can be formulated as a neural process, which maps input spatio-temporal coordinates to target pixel values given context spatio-temporal coordinates and pixels values. Specifically, we feed an implicit neural representations of coordinates into a Transformer-based non-autoregressive conditional video synthesis model. Our task-specific models outperform previous work for video interpolation on multiple datasets and reach a competitive performance with the state-of-the-art models for video prediction. Importantly, the model is able to interpolate or predict with an arbitrary high frame rate, i.e., continuous synthesis. 
Our source code is available at this https URL.\n  - [SiNeRF: Sinusoidal Neural Radiance Fields for Joint Pose Estimation and Scene Reconstruction, BMVC2022](https://arxiv.org/abs/2210.04553) | [***``[code]``***](https://github.com/yitongx/sinerf)\n    > NeRFmm is the Neural Radiance Fields (NeRF) that deal with Joint Optimization tasks, i.e., reconstructing real-world scenes and registering camera parameters simultaneously. Despite NeRFmm producing precise scene synthesis and pose estimations, it still struggles to outperform the full-annotated baseline on challenging scenes. In this work, we identify that there exists a systematic sub-optimality in joint optimization and further identify multiple potential sources for it. To diminish the impacts of potential sources, we propose Sinusoidal Neural Radiance Fields (SiNeRF) that leverage sinusoidal activations for radiance mapping and a novel Mixed Region Sampling (MRS) for selecting ray batch efficiently. Quantitative and qualitative results show that compared to NeRFmm, SiNeRF achieves comprehensive significant improvements in image synthesis quality and pose estimation accuracy. Codes are available at this https URL.\n  - [NerfAcc: A General NeRF Acceleration Toolbox](https://arxiv.org/abs/2210.04847) | [***``[code]``***](https://github.com/KAIR-BAIR/nerfacc)\n    > We propose NerfAcc, a toolbox for efficient volumetric rendering of radiance fields. We build on the techniques proposed in Instant-NGP, and extend these techniques to not only support bounded static scenes, but also for dynamic scenes and unbounded scenes. NerfAcc comes with a user-friendly Python API, and is ready for plug-and-play acceleration of most NeRFs. Various examples are provided to show how to use this toolbox. 
Code can be found here: this https URL.\n  - [Self-Supervised 3D Human Pose Estimation in Static Video Via Neural Rendering](https://arxiv.org/abs/2210.04514) | [code]\n    > Inferring 3D human pose from 2D images is a challenging and long-standing problem in the field of computer vision with many applications including motion capture, virtual reality, surveillance or gait analysis for sports and medicine. We present preliminary results for a method to estimate 3D pose from 2D video containing a single person and a static background without the need for any manual landmark annotations. We achieve this by formulating a simple yet effective self-supervision task: our model is required to reconstruct a random frame of a video given a frame from another timepoint and a rendered image of a transformed human shape template. Crucially for optimisation, our ray casting based rendering pipeline is fully differentiable, enabling end to end training solely based on the reconstruction task.\n  - [MVSPlenOctree: Fast and Generic Reconstruction of Radiance Fields in PlenOctree from Multi-view Stereo, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547795) | [code]\n    > We present MVSPlenOctree, a novel approach that can efficiently reconstruct radiance fields for view synthesis. Unlike previous scene-specific radiance fields reconstruction methods, we present a generic pipeline that can efficiently reconstruct 360-degree-renderable radiance fields via multi-view stereo (MVS) inference from tens of sparse-spread out images. Our approach leverages variance-based statistic features for MVS inference, and combines this with image based rendering and volume rendering for radiance field reconstruction. We first train a MVS Machine for reasoning scene's density and appearance. 
Then, based on the spatial hierarchy of the PlenOctree and coarse-to-fine dense sampling mechanism, we design a robust and efficient sampling strategy for PlenOctree reconstruction, which handles occlusion robustly. A 360-degree-renderable radiance fields can be reconstructed in PlenOctree from MVS Machine in an efficient single forward pass. We trained our method on real-world DTU, LLFF datasets, and synthetic datasets. We validate its generalizability by evaluating on the test set of DTU dataset which are unseen in training. In summary, our radiance field reconstruction method is both efficient and generic, a coarse 360-degree-renderable radiance field can be reconstructed in seconds and a dense one within minutes. Please visit the project page for more details: https://derry-xing.github.io/projects/MVSPlenOctree.\n  - [ParseMVS: Learning Primitive-aware Surface Representations for Sparse Multi-view Stereopsis, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547920) | [code]\n    > Multi-view stereopsis (MVS) recovers 3D surfaces by finding dense photo-consistent correspondences from densely sampled images. In this paper, we tackle the challenging MVS task from sparsely sampled views (up to an order of magnitude fewer images), which is more practical and cost-efficient in applications. The major challenge comes from the significant correspondence ambiguity introduced by the severe occlusions and the highly skewed patches. On the other hand, such ambiguity can be resolved by incorporating geometric cues from the global structure. In light of this, we propose ParseMVS, boosting sparse MVS by learning the P rimitive-A waR e S urface rE presentation. In particular, on top of being aware of global structure, our novel representation further allows for the preservation of fine details including geometry, texture, and visibility. More specifically, the whole scene is parsed into multiple geometric primitives. 
On each of them, the geometry is defined as the displacement along the primitives' normal directions, together with the texture and visibility along each view direction. An unsupervised neural network is trained to learn these factors by progressively increasing the photo-consistency and render-consistency among all input images. Since the surface properties are changed locally in the 2D space of each primitive, ParseMVS can preserve global primitive structures while optimizing local details, handling the 'incompleteness' and the 'inaccuracy' problems. We experimentally demonstrate that ParseMVS constantly outperforms the state-of-the-art surface reconstruction method in both completeness and the overall score under varying sampling sparsity, especially under the extreme sparse-MVS settings. Beyond that, ParseMVS also shows great potential in compression, robustness, and efficiency.\n  - [Self-Supervised Multi-view Stereo via Adjacent Geometry Guided Volume Completion, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547926) | [code]\n    > Existing self-supervised multi-view stereo (MVS) approaches largely rely on photometric consistency for geometry inference, and hence suffer from low-texture or non-Lambertian appearances. In this paper, we observe that adjacent geometry shares certain commonality that can help to infer the correct geometry of the challenging or low-confident regions. Yet exploiting such property in a non-supervised MVS approach remains challenging for the lacking of training data and necessity of ensuring consistency between views. To address the issues, we propose a novel geometry inference training scheme by selectively masking regions with rich textures, where geometry can be well recovered and used for supervisory signal, and then lead a deliberately designed cost volume completion network to learn how to recover geometry of the masked regions. 
During inference, we then mask the low-confident regions instead and use the cost volume completion network for geometry correction. To deal with the different depth hypotheses of the cost volume pyramid, we design a three-branch volume inference structure for the completion network. Further, by considering plane as a special geometry, we first identify planar regions from pseudo labels and then correct the low-confident pixels by high-confident labels through plane normal consistency. Extensive experiments on DTU and Tanks & Temples demonstrate the effectiveness of the proposed framework and the state-of-the-art performance.\n  - [Geometric Warping Error Aware CNN for DIBR Oriented View Synthesis, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547946) | [code]\n    > Depth Image based Rendering (DIBR) oriented view synthesis is an important virtual view generation technique. It warps the reference view images to the target viewpoint based on their depth maps, without requiring many available viewpoints. However, in the 3D warping process, pixels are warped to fractional pixel locations and then rounded (or interpolated) to integer pixels, resulting in geometric warping error and reducing the image quality. This resembles, to some extent, the image super-resolution problem, but with unfixed fractional pixel locations. To address this problem, we propose a geometric warping error aware CNN (GWEA) framework to enhance the DIBR oriented view synthesis. First, a deformable convolution based geometric warping error aware alignment (GWEA-DCA) module is developed, by taking advantage of the geometric warping error preserved in the DIBR module. The offset learned in the deformable convolution can account for the geometric warping error to facilitate the mapping from the fractional pixels to integer pixels. 
Moreover, in view that the pixels in the warped images are of different qualities due to the different strengths of warping errors, an attention enhanced view blending (GWEA-AttVB) module is further developed to adaptively fuse the pixels from different warped images. Finally, a partial convolution based hole filling and refinement module fills the remaining holes and improves the quality of the overall image. Experiments show that our model can synthesize higher-quality images than the existing methods, and ablation study is also conducted, validating the effectiveness of each proposed module.\n  - [ReFu: Refine and Fuse the Unobserved View for Detail-Preserving Single-Image 3D Human Reconstruction](https://dl.acm.org/doi/abs/10.1145/3503161.3547971) | [code]\n    > Single-image 3D human reconstruction aims to reconstruct the 3D textured surface of the human body given a single image. While implicit function-based methods recently achieved reasonable reconstruction performance, they still bear limitations showing degraded quality in both surface geometry and texture from an unobserved view. In response, to generate a realistic textured surface, we propose ReFu, a coarse-to-fine approach that refines the projected backside view image and fuses the refined image to predict the final human body. To suppress the diffused occupancy that causes noise in projection images and reconstructed meshes, we propose to train occupancy probability by simultaneously utilizing 2D and 3D supervisions with occupancy-based volume rendering. We also introduce a refinement architecture that generates detail-preserving backside-view images with front-to-back warping. 
Extensive experiments demonstrate that our method achieves state-of-the-art performance in 3D human reconstruction from a single image, showing enhanced geometry and texture quality from an unobserved view.\n  - [NeRF2Real: Sim2real Transfer of Vision-guided Bipedal Motion Skills using Neural Radiance Fields](https://arxiv.org/abs/2210.04932) | [code]\n    > We present a system for applying sim2real approaches to \"in the wild\" scenes with realistic visuals, and to policies which rely on active perception using RGB cameras. Given a short video of a static scene collected using a generic phone, we learn the scene's contact geometry and a function for novel view synthesis using a Neural Radiance Field (NeRF). We augment the NeRF rendering of the static scene by overlaying the rendering of other dynamic objects (e.g. the robot's own body, a ball). A simulation is then created using the rendering engine in a physics simulator which computes contact dynamics from the static scene geometry (estimated from the NeRF volume density) and the dynamic objects' geometry and physical properties (assumed known). We demonstrate that we can use this simulation to learn vision-based whole body navigation and ball pushing policies for a 20 degrees of freedom humanoid robot with an actuated head-mounted RGB camera, and we successfully transfer these policies to a real robot. Project video is available at this https URL\n  - [Uncertainty-Aware Semi-Supervised Learning of 3D Face Rigging from Single Image, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3548285) | [code]\n    > We present a method to rig 3D faces via Action Units (AUs), viewpoint and light direction, from single input image. Existing 3D methods for face synthesis and animation rely heavily on 3D morphable model (3DMM), which was built on 3D data and cannot provide intuitive expression parameters, while AU-driven 2D methods cannot handle head pose and lighting effect. 
We bridge the gap by integrating a recent 3D reconstruction method with 2D AU-driven method in a semi-supervised fashion. Built upon the auto-encoding 3D face reconstruction model that decouples depth, albedo, viewpoint and light without any supervision, we further decouple expression from identity for depth and albedo with a novel conditional feature translation module and pretrained critics for AU intensity estimation and image classification. Novel objective functions are designed using unlabeled in-the-wild images and in-door images with AU labels. We also leverage uncertainty losses to model the probably changing AU region of images as input noise for synthesis, and model the noisy AU intensity labels for intensity estimation of the AU critic. Experiments with face editing and animation on four datasets show that, compared with six state-of-the-art methods, our proposed method is superior and effective on expression consistency, identity similarity and pose similarity.\n  - [Robustifying the Multi-Scale Representation of Neural Radiance Fields, BMVC2022](https://arxiv.org/abs/2210.04233) | [code]\n    > Neural Radiance Fields (NeRF) recently emerged as a new paradigm for object representation from multi-view (MV) images. Yet, it cannot handle multi-scale (MS) images and camera pose estimation errors, which generally is the case with multi-view images captured from a day-to-day commodity camera. Although recently proposed Mip-NeRF could handle multi-scale imaging problems with NeRF, it cannot handle camera pose estimation error. On the other hand, the newly proposed BARF can solve the camera pose problem with NeRF but fails if the images are multi-scale in nature. This paper presents a robust multi-scale neural radiance fields representation approach to simultaneously overcome both real-world imaging issues. 
Our method handles multi-scale imaging effects and camera-pose estimation problems with NeRF-inspired approaches by leveraging the fundamentals of scene rigidity. To reduce unpleasant aliasing artifacts due to multi-scale images in the ray space, we leverage Mip-NeRF multi-scale representation. For joint estimation of robust camera pose, we propose graph-neural network-based multiple motion averaging in the neural volume rendering framework. We demonstrate, with examples, that for an accurate neural representation of an object from day-to-day acquired multi-view images, it is crucial to have precise camera-pose estimates. Without considering robustness measures in the camera pose estimation, modeling for multi-scale aliasing artifacts via conical frustum can be counterproductive. We present extensive experiments on the benchmark datasets to demonstrate that our approach provides better results than the recent NeRF-inspired approaches for such realistic settings.\n  - [Estimating Neural Reflectance Field from Radiance Field using Tree Structures](https://arxiv.org/abs/2210.04217) | [code]\n    > We present a new method for estimating the Neural Reflectance Field (NReF) of an object from a set of posed multi-view images under unknown lighting. NReF represents 3D geometry and appearance of objects in a disentangled manner, and are hard to be estimated from images only. Our method solves this problem by exploiting the Neural Radiance Field (NeRF) as a proxy representation, from which we perform further decomposition. A high-quality NeRF decomposition relies on good geometry information extraction as well as good prior terms to properly resolve ambiguities between different components. To extract high-quality geometry information from radiance fields, we re-design a new ray-casting based method for surface point extraction. 
To efficiently compute and apply prior terms, we convert different prior terms into different type of filter operations on the surface extracted from radiance field. We then employ two type of auxiliary data structures, namely Gaussian KD-tree and octree, to support fast querying of surface points and efficient computation of surface filters during training. Based on this, we design a multi-stage decomposition optimization pipeline for estimating neural reflectance field from neural radiance fields. Extensive experiments show our method outperforms other state-of-the-art methods on different data, and enable high-quality free-view relighting as well as material editing tasks.\n  - [Towards Efficient Neural Scene Graphs by Learning Consistency Fields, BMVC2022](https://arxiv.org/abs/2210.04127) | [***``[code]``***](https://github.com/ldynx/CF-NSG)\n    > Neural Radiance Fields (NeRF) achieves photo-realistic image rendering from novel views, and the Neural Scene Graphs (NSG) \\cite{ost2021neural} extends it to dynamic scenes (video) with multiple objects. Nevertheless, computationally heavy ray marching for every image frame becomes a huge burden. In this paper, taking advantage of significant redundancy across adjacent frames in videos, we propose a feature-reusing framework. From the first try of naively reusing the NSG features, however, we learn that it is crucial to disentangle object-intrinsic properties consistent across frames from transient ones. Our proposed method, \\textit{Consistency-Field-based NSG (CF-NSG)}, reformulates neural radiance fields to additionally consider \\textit{consistency fields}. With disentangled representations, CF-NSG takes full advantage of the feature-reusing scheme and performs an extended degree of scene manipulation in a more controllable manner. We empirically verify that CF-NSG greatly improves the inference efficiency by using 85\\% less queries than NSG without notable degradation in rendering quality. 
Code will be available at: this https URL\n## Oct2 - Oct8, 2022\n  - [ViewFool: Evaluating the Robustness of Visual Recognition to Adversarial Viewpoints, NeurIPS2022](https://arxiv.org/abs/2210.03895) | [code]\n    > Recent studies have demonstrated that visual recognition models lack robustness to distribution shift. However, current work mainly considers model robustness to 2D image transformations, leaving viewpoint changes in the 3D world less explored. In general, viewpoint changes are prevalent in various real-world applications (e.g., autonomous driving), making it imperative to evaluate viewpoint robustness. In this paper, we propose a novel method called ViewFool to find adversarial viewpoints that mislead visual recognition models. By encoding real-world objects as neural radiance fields (NeRF), ViewFool characterizes a distribution of diverse adversarial viewpoints under an entropic regularizer, which helps to handle the fluctuations of the real camera pose and mitigate the reality gap between the real objects and their neural representations. Experiments validate that the common image classifiers are extremely vulnerable to the generated adversarial viewpoints, which also exhibit high cross-model transferability. Based on ViewFool, we introduce ImageNet-V, a new out-of-distribution dataset for benchmarking viewpoint robustness of image classifiers. Evaluation results on 40 classifiers with diverse architectures, objective functions, and data augmentations reveal a significant drop in model performance when tested on ImageNet-V, which provides a possibility to leverage ViewFool as an effective data augmentation strategy to improve viewpoint robustness.\n  - [Novel View Synthesis for Surgical Recording](https://link.springer.com/chapter/10.1007/978-3-031-18576-2_7) | [code]\n    > Recording surgery in operating rooms is one of the essential tasks for education and evaluation of medical treatment. 
However, recording the fields which depict the surgery is difficult because the targets are heavily occluded during surgery by the heads or hands of doctors or nurses. We use a recording system which multiple cameras embedded in the surgical lamp, assuming that at least one camera is recording the target without occlusion. In this paper, we propose Conditional-BARF (C-BARF) to generate occlusion-free images by synthesizing novel view images from the camera, aiming to generate videos with smooth camera pose transitions. To the best of our knowledge, this is the first work to tackle the problem of synthesizing a novel view image from multiple images for the surgery scene. We conduct experiments using an original dataset of three different types of surgeries. Our experiments show that we can successfully synthesize novel views from the images recorded by the multiple cameras embedded in the surgical lamp.\n  - [A Keypoint Based Enhancement Method for Audio Driven Free View Talking Head Synthesis](https://arxiv.org/abs/2210.03335) | [code]\n    > Audio driven talking head synthesis is a challenging task that attracts increasing attention in recent years. Although existing methods based on 2D landmarks or 3D face models can synthesize accurate lip synchronization and rhythmic head pose for arbitrary identity, they still have limitations, such as the cut feeling in the mouth mapping and the lack of skin highlights. The morphed region is blurry compared to the surrounding face. A Keypoint Based Enhancement (KPBE) method is proposed for audio driven free view talking head synthesis to improve the naturalness of the generated video. Firstly, existing methods were used as the backend to synthesize intermediate results. Then we used keypoint decomposition to extract video synthesis controlling parameters from the backend output and the source image. After that, the controlling parameters were composited to the source keypoints and the driving keypoints. 
A motion field based method was used to generate the final image from the keypoint representation. With keypoint representation, we overcame the cut feeling in the mouth mapping and the lack of skin highlights. Experiments show that our proposed enhancement method improved the quality of talking-head videos in terms of mean opinion score.\n  - [A Simple Plugin for Transforming Images to Arbitrary Scales](https://arxiv.org/abs/2210.03417) | [code]\n    > Existing models on super-resolution often specialized for one scale, fundamentally limiting their use in practical scenarios. In this paper, we aim to develop a general plugin that can be inserted into existing super-resolution models, conveniently augmenting their ability towards Arbitrary Resolution Image Scaling, thus termed ARIS. We make the following contributions: (i) we propose a transformer-based plugin module, which uses spatial coordinates as query, iteratively attend the low-resolution image feature through cross-attention, and output visual feature for the queried spatial location, resembling an implicit representation for images; (ii) we introduce a novel self-supervised training scheme, that exploits consistency constraints to effectively augment the model's ability for upsampling images towards unseen scales, i.e. ground-truth high-resolution images are not available; (iii) without loss of generality, we inject the proposed ARIS plugin module into several existing models, namely, IPT, SwinIR, and HAT, showing that the resulting models can not only maintain their original performance on fixed scale factor but also extrapolate to unseen scales, substantially outperforming existing any-scale super-resolution models on standard benchmarks, e.g. 
Urban100, DIV2K, etc.\n  - [Feature-Realistic Neural Fusion for Real-Time, Open Set Scene Understanding](https://arxiv.org/abs/2210.03043) | [code]\n    > General scene understanding for robotics requires flexible semantic representation, so that novel objects and structures which may not have been known at training time can be identified, segmented and grouped. We present an algorithm which fuses general learned features from a standard pre-trained network into a highly efficient 3D geometric neural field representation during real-time SLAM. The fused 3D feature maps inherit the coherence of the neural field's geometry representation. This means that tiny amounts of human labelling interacting at runtime enable objects or even parts of objects to be robustly and accurately segmented in an open set manner.\n  - [XDGAN: Multi-Modal 3D Shape Generation in 2D Space](https://arxiv.org/abs/2210.03007) | [code]\n    > Generative models for 2D images has recently seen tremendous progress in quality, resolution and speed as a result of the efficiency of 2D convolutional architectures. However it is difficult to extend this progress into the 3D domain since most current 3D representations rely on custom network components. This paper addresses a central question: Is it possible to directly leverage 2D image generative models to generate 3D shapes instead? To answer this, we propose XDGAN, an effective and fast method for applying 2D image GAN architectures to the generation of 3D object geometry combined with additional surface attributes, like color textures and normals. Specifically, we propose a novel method to convert 3D shapes into compact 1-channel geometry images and leverage StyleGAN3 and image-to-image translation networks to generate 3D objects in 2D space. The generated geometry images are quick to convert to 3D meshes, enabling real-time 3D object synthesis, visualization and interactive editing. 
Moreover, the use of standard 2D architectures can help bring more 2D advances into the 3D realm. We show both quantitatively and qualitatively that our method is highly effective at various tasks such as 3D shape generation, single view reconstruction and shape manipulation, while being significantly faster and more flexible compared to recent 3D generative models.\n  - [A Real2Sim2Real Method for Robust Object Grasping with Neural Surface Reconstruction](https://arxiv.org/abs/2210.02685) | [code]\n    > Recent 3D-based manipulation methods either directly predict the grasp pose using 3D neural networks, or solve the grasp pose using similar objects retrieved from shape databases. However, the former faces generalizability challenges when testing with new robot arms or unseen objects; and the latter assumes that similar objects exist in the databases. We hypothesize that recent 3D modeling methods provides a path towards building digital replica of the evaluation scene that affords physical simulation and supports robust manipulation algorithm learning. We propose to reconstruct high-quality meshes from real-world point clouds using state-of-the-art neural surface reconstruction method (the Real2Sim step). Because most simulators take meshes for fast simulation, the reconstructed meshes enable grasp pose labels generation without human efforts. The generated labels can train grasp network that performs robustly in the real evaluation scene (the Sim2Real step). In synthetic and real experiments, we show that the Real2Sim2Real pipeline performs better than baseline grasp networks trained with a large dataset and a grasp sampling method with retrieval-based reconstruction. 
The benefit of the Real2Sim2Real pipeline comes from 1) decoupling scene modeling and grasp sampling into sub-problems, and 2) both sub-problems can be solved with sufficiently high quality using recent 3D learning algorithms and mesh-based physical simulation techniques.\n  - [Feature-Realistic Neural Fusion for Real-Time, Open Set Scene Understanding](https://arxiv.org/abs/2210.03043) | [code]\n    > General scene understanding for robotics requires flexible semantic representation, so that novel objects and structures which may not have been known at training time can be identified, segmented and grouped. We present an algorithm which fuses general learned features from a standard pre-trained network into a highly efficient 3D geometric neural field representation during real-time SLAM. The fused 3D feature maps inherit the coherence of the neural field's geometry representation. This means that tiny amounts of human labelling interacting at runtime enable objects or even parts of objects to be robustly and accurately segmented in an open set manner.\n  - [Neural Matching Fields: Implicit Representation of Matching Fields for Visual Correspondence, NeurIPS2022](https://arxiv.org/abs/2210.02689) | [***``[code]``***](https://ku-cvlab.github.io/NeMF/)\n    > Existing pipelines of semantic correspondence commonly include extracting high-level semantic features for the invariance against intra-class variations and background clutters. This architecture, however, inevitably results in a low-resolution matching field that additionally requires an ad-hoc interpolation process as a post-processing for converting it into a high-resolution one, certainly limiting the overall performance of matching results. To overcome this, inspired by recent success of implicit neural representation, we present a novel method for semantic correspondence, called Neural Matching Field (NeMF). 
However, complicacy and high-dimensionality of a 4D matching field are the major hindrances, which we propose a cost embedding network to process a coarse cost volume to use as a guidance for establishing high-precision matching field through the following fully-connected network. Nevertheless, learning a high-dimensional matching field remains challenging mainly due to computational complexity, since a naive exhaustive inference would require querying from all pixels in the 4D space to infer pixel-wise correspondences. To overcome this, we propose adequate training and inference procedures, which in the training phase, we randomly sample matching candidates and in the inference phase, we iteratively performs PatchMatch-based inference and coordinate optimization at test time. With these combined, competitive results are attained on several standard benchmarks for semantic correspondence. Code and pre-trained weights are available at this https URL.\n  - [IR-MCL: Implicit Representation-Based Online Global Localization](https://arxiv.org/abs/2210.03113) | [***``[code]``***](https://github.com/PRBonn/ir-mcl)\n    > Determining the state of a mobile robot is an essential building block of robot navigation systems. In this paper, we address the problem of estimating the robots pose in an indoor environment using 2D LiDAR data and investigate how modern environment models can improve gold standard Monte-Carlo localization (MCL) systems. We propose a neural occupancy field (NOF) to implicitly represent the scene using a neural network. With the pretrained network, we can synthesize 2D LiDAR scans for an arbitrary robot pose through volume rendering. Based on the implicit representation, we can obtain the similarity between a synthesized and actual scan as an observation model and integrate it into an MCL system to perform accurate localization. We evaluate our approach on five sequences of a self-recorded dataset and three publicly available datasets. 
We show that we can accurately and efficiently localize a robot using our approach surpassing the localization performance of state-of-the-art methods. The experiments suggest that the presented implicit representation is able to predict more accurate 2D LiDAR scans leading to an improved observation model for our particle filter-based localization. The code of our approach is released at: this https URL.\n  - [SelfNeRF: Fast Training NeRF for Human from Monocular Self-rotating Video](https://arxiv.org/abs/2210.01651) | [code]\n    > In this paper, we propose SelfNeRF, an efficient neural radiance field based novel view synthesis method for human performance. Given monocular self-rotating videos of human performers, SelfNeRF can train from scratch and achieve high-fidelity results in about twenty minutes. Some recent works have utilized the neural radiance field for dynamic human reconstruction. However, most of these methods need multi-view inputs and require hours of training, making it still difficult for practical use. To address this challenging problem, we introduce a surface-relative representation based on multi-resolution hash encoding that can greatly improve the training speed and aggregate inter-frame information. Extensive experimental results on several different datasets demonstrate the effectiveness and efficiency of SelfNeRF to challenging monocular videos.\n  - [Capturing and Animation of Body and Clothing from Monocular Video](https://arxiv.org/abs/2210.01868) | [code]\n    > While recent work has shown progress on extracting clothed 3D human avatars from a single image, video, or a set of 3D scans, several limitations remain. Most methods use a holistic representation to jointly model the body and clothing, which means that the clothing and body cannot be separated for applications like virtual try-on. 
Other methods separately model the body and clothing, but they require training from a large set of 3D clothed human meshes obtained from 3D/4D scanners or physics simulations. Our insight is that the body and clothing have different modeling requirements. While the body is well represented by a mesh-based parametric 3D model, implicit representations and neural radiance fields are better suited to capturing the large variety in shape and appearance present in clothing. Building on this insight, we propose SCARF (Segmented Clothed Avatar Radiance Field), a hybrid model combining a mesh-based body with a neural radiance field. Integrating the mesh into the volumetric rendering in combination with a differentiable rasterizer enables us to optimize SCARF directly from monocular videos, without any 3D supervision. The hybrid modeling enables SCARF to (i) animate the clothed body avatar by changing body poses (including hand articulation and facial expressions), (ii) synthesize novel views of the avatar, and (iii) transfer clothing between avatars in virtual try-on applications. We demonstrate that SCARF reconstructs clothing with higher visual quality than existing methods, that the clothing deforms with changing body pose and body shape, and that clothing can be successfully transferred between avatars of different subjects. The code and models are available at this https URL.\n  - [Learning Perception-Aware Agile Flight in Cluttered Environments](https://arxiv.org/abs/2210.01841) | [code]\n    > Recently, neural control policies have outperformed existing model-based planning-and-control methods for autonomously navigating quadrotors through cluttered environments in minimum time. However, they are not perception aware, a crucial requirement in vision-based navigation due to the camera's limited field of view and the underactuated nature of a quadrotor. 
We propose a method to learn neural network policies that achieve perception-aware, minimum-time flight in cluttered environments. Our method combines imitation learning and reinforcement learning (RL) by leveraging a privileged learning-by-cheating framework. Using RL, we first train a perception-aware teacher policy with full-state information to fly in minimum time through cluttered environments. Then, we use imitation learning to distill its knowledge into a vision-based student policy that only perceives the environment via a camera. Our approach tightly couples perception and control, showing a significant advantage in computation speed (10x faster) and success rate. We demonstrate the closed-loop control performance using a physical quadrotor and hardware-in-the-loop simulation at speeds up to 50km/h.\n  - [Differentiable Raycasting for Self-supervised Occupancy Forecasting, ECCV2022](https://arxiv.org/abs/2210.01917) | [***``[code]``***](https://github.com/tarashakhurana/emergent-occ-forecasting)\n    > Motion planning for safe autonomous driving requires learning how the environment around an ego-vehicle evolves with time. Ego-centric perception of driveable regions in a scene not only changes with the motion of actors in the environment, but also with the movement of the ego-vehicle itself. Self-supervised representations proposed for large-scale planning, such as ego-centric freespace, confound these two motions, making the representation difficult to use for downstream motion planners. In this paper, we use geometric occupancy as a natural alternative to view-dependent representations such as freespace. Occupancy maps naturally disentangle the motion of the environment from the motion of the ego-vehicle. However, one cannot directly observe the full 3D occupancy of a scene (due to occlusion), making it difficult to use as a signal for learning. 
Our key insight is to use differentiable raycasting to \"render\" future occupancy predictions into future LiDAR sweep predictions, which can be compared with ground-truth sweeps for self-supervised learning. The use of differentiable raycasting allows occupancy to emerge as an internal representation within the forecasting network. In the absence of groundtruth occupancy, we quantitatively evaluate the forecasting of raycasted LiDAR sweeps and show improvements of upto 15 F1 points. For downstream motion planners, where emergent occupancy can be directly used to guide non-driveable regions, this representation relatively reduces the number of collisions with objects by up to 17% as compared to freespace-centric motion planners.\n  - [Self-improving Multiplane-to-layer Images for Novel View Synthesis, WACV2023](https://samsunglabs.github.io/MLI/) | [***``[code]``***](https://github.com/SamsungLabs/MLI)\n    > We present a new method for lightweight novel-view synthesis that generalizes to an arbitrary forward-facing scene. Recent approaches are computationally expensive, require per-scene optimization, or produce a memory-expensive representation. We start by representing the scene with a set of fronto-parallel semitransparent planes and afterward convert them to deformable layers in an end-to-end manner. Additionally, we employ a feed-forward refinement procedure that corrects the estimated representation by aggregating information from input views. Our method does not require fine-tuning when a new scene is processed and can handle an arbitrary number of views without restrictions. 
Experimental results show that our approach surpasses recent models in terms of common metrics and human evaluation, with the noticeable advantage in inference speed and compactness of the inferred layered geometry, see this https URL\n  - [Uncertainty-Driven Active Vision for Implicit Scene Reconstruction](https://arxiv.org/abs/2210.00978) | [code]\n    > Multi-view implicit scene reconstruction methods have become increasingly popular due to their ability to represent complex scene details. Recent efforts have been devoted to improving the representation of input information and to reducing the number of views required to obtain high quality reconstructions. Yet, perhaps surprisingly, the study of which views to select to maximally improve scene understanding remains largely unexplored. We propose an uncertainty-driven active vision approach for implicit scene reconstruction, which leverages occupancy uncertainty accumulated across the scene using volume rendering to select the next view to acquire. To this end, we develop an occupancy-based reconstruction method which accurately represents scenes using either 2D or 3D supervision. We evaluate our proposed approach on the ABC dataset and the in the wild CO3D dataset, and show that: (1) we are able to obtain high quality state-of-the-art occupancy reconstructions; (2) our perspective conditioned uncertainty definition is effective to drive improvements in next best view selection and outperforms strong baseline approaches; and (3) we can further improve shape understanding by performing a gradient-based search on the view selection candidates. 
Overall, our results highlight the importance of view selection for implicit scene reconstruction, making it a promising avenue to explore further.\n  - [NARF22: Neural Articulated Radiance Fields for Configuration-Aware Rendering, IROS2022](https://progress.eecs.umich.edu/projects/narf/) | [code]\n    > Articulated objects pose a unique challenge for robotic perception and manipulation. Their increased number of degrees-of-freedom makes tasks such as localization computationally difficult, while also making the process of real-world dataset collection unscalable. With the aim of addressing these scalability issues, we propose Neural Articulated Radiance Fields (NARF22), a pipeline which uses a fully-differentiable, configuration-parameterized Neural Radiance Field (NeRF) as a means of providing high quality renderings of articulated objects. NARF22 requires no explicit knowledge of the object structure at inference time. We propose a two-stage parts-based training mechanism which allows the object rendering models to generalize well across the configuration space even if the underlying training data has as few as one configuration represented. We demonstrate the efficacy of NARF22 by training configurable renderers on a real-world articulated tool dataset collected via a Fetch mobile manipulation robot. We show the applicability of the model to gradient-based inference methods through a configuration estimation and 6 degree-of-freedom pose refinement task. The project webpage is available at: this https URL.\n  - [Probabilistic Volumetric Fusion for Dense Monocular SLAM](https://arxiv.org/abs/2210.01276) | [code]\n    > We present a novel method to reconstruct 3D scenes from images by leveraging deep dense monocular SLAM and fast uncertainty propagation. The proposed approach is able to 3D reconstruct scenes densely, accurately, and in real-time while being robust to extremely noisy depth estimates coming from dense monocular SLAM. 
Differently from previous approaches, that either use ad-hoc depth filters, or that estimate the depth uncertainty from RGB-D cameras' sensor models, our probabilistic depth uncertainty derives directly from the information matrix of the underlying bundle adjustment problem in SLAM. We show that the resulting depth uncertainty provides an excellent signal to weight the depth-maps for volumetric fusion. Without our depth uncertainty, the resulting mesh is noisy and with artifacts, while our approach generates an accurate 3D mesh with significantly fewer artifacts. We provide results on the challenging Euroc dataset, and show that our approach achieves 92% better accuracy than directly fusing depths from monocular SLAM, and up to 90% improvements compared to the best competing approach.\n  - [SinGRAV: Learning a Generative Radiance Volume from a Single Natural Scene](https://arxiv.org/abs/2210.01202) | [code]\n    > We present a 3D generative model for general natural scenes. Lacking necessary volumes of 3D data characterizing the target scene, we propose to learn from a single scene. Our key insight is that a natural scene often contains multiple constituents whose geometry, texture, and spatial arrangements follow some clear patterns, but still exhibit rich variations over different regions within the same scene. This suggests localizing the learning of a generative model on substantial local regions. Hence, we exploit a multi-scale convolutional network, which possesses the spatial locality bias in nature, to learn from the statistics of local regions at multiple scales within a single scene. In contrast to existing methods, our learning setup bypasses the need to collect data from many homogeneous 3D scenes for learning common features. We coin our method SinGRAV, for learning a Generative RAdiance Volume from a Single natural scene. 
We demonstrate the ability of SinGRAV in generating plausible and diverse variations from a single scene, the merits of SinGRAV over state-of-the-art generative neural scene methods, as well as the versatility of SinGRAV by its use in a variety of applications, spanning 3D scene editing, composition, and animation. Code and data will be released to facilitate further research.\n  - [IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable Novel View Synthesis](https://arxiv.org/abs/2210.00647) | [***``[code]``***](https://github.com/zju3dv/IntrinsicNeRF)\n    > We present intrinsic neural radiance fields, dubbed IntrinsicNeRF, that introduce intrinsic decomposition into the NeRF-based~\\cite{mildenhall2020nerf} neural rendering method and can perform editable novel view synthesis in room-scale scenes while existing inverse rendering combined with neural rendering methods~\\cite{zhang2021physg, zhang2022modeling} can only work on object-specific scenes. Given that intrinsic decomposition is a fundamentally ambiguous and under-constrained inverse problem, we propose a novel distance-aware point sampling and adaptive reflectance iterative clustering optimization method that enables IntrinsicNeRF with traditional intrinsic decomposition constraints to be trained in an unsupervised manner, resulting in temporally consistent intrinsic decomposition results. To cope with the problem of different adjacent instances of similar reflectance in a scene being incorrectly clustered together, we further propose a hierarchical clustering method with coarse-to-fine optimization to obtain a fast hierarchical indexing representation. It enables compelling real-time augmented reality applications such as scene recoloring, material editing, and illumination variation. 
Extensive experiments on Blender Object and Replica Scene demonstrate that we can obtain high-quality, consistent intrinsic decomposition results and high-fidelity novel view synthesis even for challenging sequences. Code and data are available on the project webpage: this https URL.\n  - [Unsupervised Multi-View Object Segmentation Using Radiance Field Propagation, NeurIPS2022](https://arxiv.org/abs/2210.00489) | [code]\n    > We present radiance field propagation (RFP), a novel approach to segmenting objects in 3D during reconstruction given only unlabeled multi-view images of a scene. RFP is derived from emerging neural radiance field-based techniques, which jointly encodes semantics with appearance and geometry. The core of our method is a novel propagation strategy for individual objects' radiance fields with a bidirectional photometric loss, enabling an unsupervised partitioning of a scene into salient or meaningful regions corresponding to different object instances. To better handle complex scenes with multiple objects and occlusions, we further propose an iterative expectation-maximization algorithm to refine object masks. To the best of our knowledge, RFP is the first unsupervised approach for tackling 3D scene object segmentation for neural radiance field (NeRF) without any supervision, annotations, or other cues such as 3D bounding boxes and prior knowledge of object class. Experiments demonstrate that RFP achieves feasible segmentation results that are more accurate than previous unsupervised image/scene segmentation approaches, and are comparable to existing supervised NeRF-based methods. The segmented object representations enable individual 3D object editing operations.\n  - [MonoNHR: Monocular Neural Human Renderer](https://arxiv.org/abs/2210.00627) | [code]\n    > Existing neural human rendering methods struggle with a single image input due to the lack of information in invisible areas and the depth ambiguity of pixels in visible areas. 
In this regard, we propose Monocular Neural Human Renderer (MonoNHR), a novel approach that renders robust free-viewpoint images of an arbitrary human given only a single image. MonoNHR is the first method that (i) renders human subjects never seen during training in a monocular setup, and (ii) is trained in a weakly-supervised manner without geometry supervision. First, we propose to disentangle 3D geometry and texture features and to condition the texture inference on the 3D geometry features. Second, we introduce a Mesh Inpainter module that inpaints the occluded parts exploiting human structural priors such as symmetry. Experiments on ZJU-MoCap, AIST, and HUMBI datasets show that our approach significantly outperforms the recent methods adapted to the monocular case.\n  - [NeRF: Neural Radiance Field in 3D Vision, A Comprehensive Review](https://arxiv.org/abs/2210.00379) | [code]\n    > Neural Radiance Field (NeRF), a new novel view synthesis with implicit scene representation has taken the field of Computer Vision by storm. As a novel view synthesis and 3D reconstruction method, NeRF models find applications in robotics, urban mapping, autonomous navigation, virtual reality/augmented reality, and more. Since the original paper by Mildenhall et al., more than 250 preprints were published, with more than 100 eventually being accepted in tier one Computer Vision Conferences. Given NeRF popularity and the current interest in this research area, we believe it necessary to compile a comprehensive survey of NeRF papers from the past two years, which we organized into both architecture, and application based taxonomies. We also provide an introduction to the theory of NeRF based novel view synthesis, and a benchmark comparison of the performance and speed of key NeRF models. 
By creating this survey, we hope to introduce new researchers to NeRF, provide a helpful reference for influential works in this field, as well as motivate future research directions with our discussion section.\n## Sep25 - Oct1, 2022\n  - [Structure-Aware NeRF without Posed Camera via Epipolar Constraint](https://arxiv.org/abs/2210.00183) | [***``[code]``***](https://github.com/XTU-PR-LAB/SaNerf)\n    > The neural radiance field (NeRF) for realistic novel view synthesis requires camera poses to be pre-acquired by a structure-from-motion (SfM) approach. This two-stage strategy is not convenient to use and degrades the performance because the error in the pose extraction can propagate to the view synthesis. We integrate the pose extraction and view synthesis into a single end-to-end procedure so they can benefit from each other. For training NeRF models, only RGB images are given, without pre-known camera poses. The camera poses are obtained by the epipolar constraint in which the identical feature in different views has the same world coordinates transformed from the local camera coordinates according to the extracted poses. The epipolar constraint is jointly optimized with pixel color constraint. The poses are represented by a CNN-based deep network, whose input is the related frames. This joint optimization enables NeRF to be aware of the scene's structure that has an improved generalization performance. Extensive experiments on a variety of scenes demonstrate the effectiveness of the proposed approach. Code is available at this https URL.\n  - [SCI: A spectrum concentrated implicit neural compression for biomedical data](https://arxiv.org/abs/2209.15180) | [code]\n    > Massive collection and explosive growth of the huge amount of medical data, demands effective compression for efficient storage, transmission and sharing. 
Readily available visual data compression techniques have been studied extensively but tailored for nature images/videos, and thus show limited performance on medical data which are of different characteristics. Emerging implicit neural representation (INR) is gaining momentum and demonstrates high promise for fitting diverse visual data in target-data-specific manner, but a general compression scheme covering diverse medical data is so far absent. To address this issue, we firstly derive a mathematical explanation for INR's spectrum concentration property and an analytical insight on the design of compression-oriented INR architecture. Further, we design a funnel shaped neural network capable of covering broad spectrum of complex medical data and achieving high compression ratio. Based on this design, we conduct compression via optimization under given budget and propose an adaptive compression approach SCI, which adaptively partitions the target data into blocks matching the concentrated spectrum envelop of the adopted INR, and allocates parameter with high representation accuracy under given compression ratio. The experiments show SCI's superior performance over conventional techniques and wide applicability across diverse medical data.\n  - [Improving 3D-aware Image Synthesis with A Geometry-aware Discriminator, NeurIPS2022](https://arxiv.org/abs/2209.15637) | [***``[code]``***](https://github.com/vivianszf/geod)\n    > 3D-aware image synthesis aims at learning a generative model that can render photo-realistic 2D images while capturing decent underlying 3D shapes. A popular solution is to adopt the generative adversarial network (GAN) and replace the generator with a 3D renderer, where volume rendering with neural radiance field (NeRF) is commonly used. Despite the advancement of synthesis quality, existing methods fail to obtain moderate 3D shapes. 
We argue that, considering the two-player game in the formulation of GANs, only making the generator 3D-aware is not enough. In other words, displacing the generative mechanism only offers the capability, but not the guarantee, of producing 3D-aware images, because the supervision of the generator primarily comes from the discriminator. To address this issue, we propose GeoD through learning a geometry-aware discriminator to improve 3D-aware GANs. Concretely, besides differentiating real and fake samples from the 2D image space, the discriminator is additionally asked to derive the geometry information from the inputs, which is then applied as the guidance of the generator. Such a simple yet effective design facilitates learning substantially more accurate 3D shapes. Extensive experiments on various generator architectures and training datasets verify the superiority of GeoD over state-of-the-art alternatives. Moreover, our approach is registered as a general framework such that a more capable discriminator (i.e., with a third task of novel view synthesis beyond domain classification and geometry extraction) can further assist the generator with a better multi-view consistency.\n  - [Understanding Pure CLIP Guidance for Voxel Grid NeRF Models](https://arxiv.org/abs/2209.15172) | [code]\n    > We explore the task of text to 3D object generation using CLIP. Specifically, we use CLIP for guidance without access to any datasets, a setting we refer to as pure CLIP guidance. While prior work has adopted this setting, there is no systematic study of mechanics for preventing adversarial generations within CLIP. We illustrate how different image-based augmentations prevent the adversarial generation problem, and how the generated results are impacted. We test different CLIP model architectures and show that ensembling different models for guidance can prevent adversarial generations within bigger models and generate sharper results. 
Furthermore, we implement an implicit voxel grid model to show how neural networks provide an additional layer of regularization, resulting in better geometrical structure and coherency of generated objects. Compared to prior work, we achieve more coherent results with higher memory efficiency and faster training speeds.\n  - [Distilling Style from Image Pairs for Global Forward and Inverse Tone Mapping, CVMP2022](https://arxiv.org/abs/2209.15165) | [code]\n    > Many image enhancement or editing operations, such as forward and inverse tone mapping or color grading, do not have a unique solution, but instead a range of solutions, each representing a different style. Despite this, existing learning-based methods attempt to learn a unique mapping, disregarding this style. In this work, we show that information about the style can be distilled from collections of image pairs and encoded into a 2- or 3-dimensional vector. This gives us not only an efficient representation but also an interpretable latent space for editing the image style. We represent the global color mapping between a pair of images as a custom normalizing flow, conditioned on a polynomial basis of the pixel color. We show that such a network is more effective than PCA or VAE at encoding image style in low-dimensional space and lets us obtain an accuracy close to 40 dB, which is about 7-10 dB improvement over the state-of-the-art methods.\n  - [Sphere-Guided Training of Neural Implicit Surfaces](https://arxiv.org/abs/2209.15511) | [code]\n    > In recent years, surface modeling via neural implicit functions has become one of the main techniques for multi-view 3D reconstruction. However, the state-of-the-art methods rely on the implicit functions to model an entire volume of the scene, leading to reduced reconstruction fidelity in the areas with thin objects or high-frequency details. 
To address that, we present a method for jointly training neural implicit surfaces alongside an auxiliary explicit shape representation, which acts as surface guide. In our approach, this representation encapsulates the surface region of the scene and enables us to boost the efficiency of the implicit function training by only modeling the volume in that region. We propose using a set of learnable spherical primitives as a learnable surface guidance since they can be efficiently trained alongside the neural surface function using its gradients. Our training pipeline consists of iterative updates of the spheres' centers using the gradients of the implicit function and then fine-tuning the latter to the updated surface region of the scene. We show that such modification to the training procedure can be plugged into several popular implicit reconstruction methods, improving the quality of the results over multiple 3D reconstruction benchmarks.\n  - [Towards Multi-spatiotemporal-scale Generalized PDE Modeling](https://arxiv.org/abs/2209.15616) | [code]\n    > Partial differential equations (PDEs) are central to describing complex physical system simulations. Their expensive solution techniques have led to an increased interest in deep neural network based surrogates. However, the practical utility of training such surrogates is contingent on their ability to model complex multi-scale spatio-temporal phenomena. Various neural network architectures have been proposed to target such phenomena, most notably Fourier Neural Operators (FNOs) which give a natural handle over local \\& global spatial information via parameterization of different Fourier modes, and U-Nets which treat local and global information via downsampling and upsampling paths. However, generalizing across different equation parameters or different time-scales still remains a challenge. 
In this work, we make a comprehensive comparison between various FNO and U-Net like approaches on fluid mechanics problems in both vorticity-stream and velocity function form. For U-Nets, we transfer recent architectural improvements from computer vision, most notably from object segmentation and generative modeling. We further analyze the design considerations for using FNO layers to improve performance of U-Net architectures without major degradation of computational performance. Finally, we show promising results on generalization to different PDE parameters and time-scales with a single surrogate model.\n  - [MonoNeuralFusion: Online Monocular Neural 3D Reconstruction with Geometric Priors](https://arxiv.org/abs/2209.15153) | [code]\n    > High-fidelity 3D scene reconstruction from monocular videos continues to be challenging, especially for complete and fine-grained geometry reconstruction. The previous 3D reconstruction approaches with neural implicit representations have shown a promising ability for complete scene reconstruction, while their results are often over-smooth and lack enough geometric details. This paper introduces a novel neural implicit scene representation with volume rendering for high-fidelity online 3D scene reconstruction from monocular videos. For fine-grained reconstruction, our key insight is to incorporate geometric priors into both the neural implicit scene representation and neural volume rendering, thus leading to an effective geometry learning mechanism based on volume rendering optimization. Benefiting from this, we present MonoNeuralFusion to perform the online neural 3D reconstruction from monocular videos, by which the 3D scene geometry is efficiently generated and optimized during the on-the-fly 3D monocular scanning. 
The extensive comparisons with state-of-the-art approaches show that our MonoNeuralFusion consistently generates much better complete and fine-grained reconstruction results, both quantitatively and qualitatively.\n  - [Implicit Neural Spatial Representations for Time-dependent PDEs](https://arxiv.org/abs/2210.00124) | [code]\n    > Numerically solving partial differential equations (PDEs) often entails spatial and temporal discretizations. Traditional methods (e.g., finite difference, finite element, smoothed-particle hydrodynamics) frequently adopt explicit spatial discretizations, such as grids, meshes, and point clouds, where each degree-of-freedom corresponds to a location in space. While these explicit spatial correspondences are intuitive to model and understand, these representations are not necessarily optimal for accuracy, memory-usage, or adaptivity. In this work, we explore implicit neural representation as an alternative spatial discretization, where spatial information is implicitly stored in the neural network weights. With implicit neural spatial representation, PDE-constrained time-stepping translates into updating neural network weights, which naturally integrates with commonly adopted optimization time integrators. We validate our approach on a variety of classic PDEs with examples involving large elastic deformations, turbulent fluids, and multiscale phenomena. While slower to compute than traditional representations, our approach exhibits higher accuracy, lower memory consumption, and dynamically adaptive allocation of degrees of freedom without complex remeshing.\n  - [Continuous PDE Dynamics Forecasting with Implicit Neural Representations](https://arxiv.org/abs/2209.14855) | [code]\n    > Effective data-driven PDE forecasting methods often rely on fixed spatial and / or temporal discretizations. 
This raises limitations in real-world applications like weather prediction where flexible extrapolation at arbitrary spatiotemporal locations is required. We address this problem by introducing a new data-driven approach, DINo, that models a PDE's flow with continuous-time dynamics of spatially continuous functions. This is achieved by embedding spatial observations independently of their discretization via Implicit Neural Representations in a small latent space temporally driven by a learned ODE. This separate and flexible treatment of time and space makes DINo the first data-driven model to combine the following advantages. It extrapolates at arbitrary spatial and temporal locations; it can learn from sparse irregular grids or manifolds; at test time, it generalizes to new grids or resolutions. DINo outperforms alternative neural PDE forecasters in a variety of challenging generalization scenarios on representative PDE systems.\n  - [SymmNeRF: Learning to Explore Symmetry Prior for Single-View View Synthesis, ACCV2022](https://arxiv.org/abs/2209.14819) | [***``[code]``***](https://github.com/xingyi-li/SymmNeRF)\n    > We study the problem of novel view synthesis of objects from a single image. Existing methods have demonstrated the potential in single-view view synthesis. However, they still fail to recover the fine appearance details, especially in self-occluded areas. This is because a single view only provides limited information. We observe that manmade objects usually exhibit symmetric appearances, which introduce additional prior knowledge. Motivated by this, we investigate the potential performance gains of explicitly embedding symmetry into the scene representation. In this paper, we propose SymmNeRF, a neural radiance field (NeRF) based framework that combines local and global conditioning under the introduction of symmetry priors. 
In particular, SymmNeRF takes the pixel-aligned image features and the corresponding symmetric features as extra inputs to the NeRF, whose parameters are generated by a hypernetwork. As the parameters are conditioned on the image-encoded latent codes, SymmNeRF is thus scene-independent and can generalize to new scenes. Experiments on synthetic and real-world datasets show that SymmNeRF synthesizes novel views with more details regardless of the pose transformation, and demonstrates good generalization when applied to unseen objects. Code is available at: this https URL.\n  - [Towards General-Purpose Representation Learning of Polygonal Geometries, GeoInformatica](https://arxiv.org/abs/2209.15458) | [code]\n    > Neural network representation learning for spatial data is a common need for geographic artificial intelligence (GeoAI) problems. In recent years, many advancements have been made in representation learning for points, polylines, and networks, whereas little progress has been made for polygons, especially complex polygonal geometries. In this work, we focus on developing a general-purpose polygon encoding model, which can encode a polygonal geometry (with or without holes, single or multipolygons) into an embedding space. The result embeddings can be leveraged directly (or finetuned) for downstream tasks such as shape classification, spatial relation prediction, and so on. To achieve model generalizability guarantees, we identify a few desirable properties: loop origin invariance, trivial vertex invariance, part permutation invariance, and topology awareness. We explore two different designs for the encoder: one derives all representations in the spatial domain; the other leverages spectral domain representations. For the spatial domain approach, we propose ResNet1D, a 1D CNN-based polygon encoder, which uses circular padding to achieve loop origin invariance on simple polygons. 
For the spectral domain approach, we develop NUFTspec based on Non-Uniform Fourier Transformation (NUFT), which naturally satisfies all the desired properties. We conduct experiments on two tasks: 1) shape classification based on MNIST; 2) spatial relation prediction based on two new datasets - DBSR-46K and DBSR-cplx46K. Our results show that NUFTspec and ResNet1D outperform multiple existing baselines with significant margins. While ResNet1D suffers from model performance degradation after shape-invariance geometry modifications, NUFTspec is very robust to these modifications due to the nature of the NUFT.\n  - [City-scale Incremental Neural Mapping with Three-layer Sampling and Panoptic Representation](https://arxiv.org/abs/2209.14072) | [code]\n    > Neural implicit representations are drawing a lot of attention from the robotics community recently, as they are expressive, continuous and compact. However, city-scale incremental implicit dense mapping based on sparse LiDAR input is still an under-explored challenge. To this end,we successfully build the first city-scale incremental neural mapping system with a panoptic representation that consists of both environment-level and instance-level modelling. Given a stream of sparse LiDAR point cloud, it maintains a dynamic generative model that maps 3D coordinates to signed distance field (SDF) values. To address the difficulty of representing geometric information at different levels in city-scale space, we propose a tailored three-layer sampling strategy to dynamically sample the global, local and near-surface domains. Meanwhile, to realize high fidelity mapping, category-specific prior is introduced to better model the geometric details, leading to a panoptic representation. We evaluate on the public SemanticKITTI dataset and demonstrate the significance of the newly proposed three-layer sampling strategy and panoptic representation, using both quantitative and qualitative results. 
Codes and data will be publicly available.\n  - [360FusionNeRF: Panoramic Neural Radiance Fields with Joint Guidance](https://arxiv.org/abs/2209.14265) | [code]\n    > We present a method to synthesize novel views from a single 360° panorama image based on the neural radiance field (NeRF). Prior studies in a similar setting rely on the neighborhood interpolation capability of multi-layer perceptrons to complete missing regions caused by occlusion, which leads to artifacts in their predictions. We propose 360FusionNeRF, a semi-supervised learning framework where we introduce geometric supervision and semantic consistency to guide the progressive training process. Firstly, the input image is re-projected to 360° images, and auxiliary depth maps are extracted at other camera positions. The depth supervision, in addition to the NeRF color guidance, improves the geometry of the synthesized views. Additionally, we introduce a semantic consistency loss that encourages realistic renderings of novel views. We extract these semantic features using a pre-trained visual encoder such as CLIP, a Vision Transformer trained on hundreds of millions of diverse 2D photographs mined from the web with natural language supervision. Experiments indicate that our proposed method can produce plausible completions of unobserved regions while preserving the features of the scene. When trained across various scenes, 360FusionNeRF consistently achieves the state-of-the-art performance when transferring to synthetic Structured3D dataset (PSNR~5%, SSIM~3% LPIPS~13%), real-world Matterport3D dataset (PSNR~3%, SSIM~3% LPIPS~9%) and Replica360 dataset (PSNR~8%, SSIM~2% LPIPS~18%).\n  - [Orbeez-SLAM: A Real-time Monocular Visual SLAM with ORB Features and NeRF-realized Mapping](https://arxiv.org/abs/2209.13274) | [code]\n    > A spatial AI that can perform complex tasks through visual signals and cooperate with humans is highly anticipated. 
To achieve this, we need a visual SLAM that easily adapts to new scenes without pre-training and generates dense maps for downstream tasks in real-time. None of the previous learning-based and non-learning-based visual SLAMs satisfy all needs due to the intrinsic limitations of their components. In this work, we develop a visual SLAM named Orbeez-SLAM, which successfully collaborates with implicit neural representation (NeRF) and visual odometry to achieve our goals. Moreover, Orbeez-SLAM can work with the monocular camera since it only needs RGB inputs, making it widely applicable to the real world. We validate its effectiveness on various challenging benchmarks. Results show that our SLAM is up to 800x faster than the strong baseline with superior rendering outcomes.\n  - [Enforcing safety for vision-based controllers via Control Barrier Functions and Neural Radiance Fields](https://arxiv.org/abs/2209.12266) | [code]\n    > To navigate complex environments, robots must increasingly use high-dimensional visual feedback (e.g. images) for control. However, relying on high-dimensional image data to make control decisions raises important questions; particularly, how might we prove the safety of a visual-feedback controller? Control barrier functions (CBFs) are powerful tools for certifying the safety of feedback controllers in the state-feedback setting, but CBFs have traditionally been poorly-suited to visual feedback control due to the need to predict future observations in order to evaluate the barrier function. In this work, we solve this issue by leveraging recent advances in neural radiance fields (NeRFs), which learn implicit representations of 3D scenes and can render images from previously-unseen camera perspectives, to provide single-step visual foresight for a CBF-based controller. This novel combination is able to filter out unsafe actions and intervene to preserve safety. 
We demonstrate the effect of our controller in real-time simulation experiments where it successfully prevents the robot from taking dangerous actions.\n  - [Efficient View Path Planning for Autonomous Implicit Reconstruction](https://arxiv.org/abs/2209.13159) | [code]\n    > Implicit neural representations have shown promising potential for the 3D scene reconstruction. Recent work applies it to autonomous 3D reconstruction by learning information gain for view path planning. Effective as it is, the computation of the information gain is expensive, and compared with that using volumetric representations, collision checking using the implicit representation for a 3D point is much slower. In the paper, we propose to 1) leverage a neural network as an implicit function approximator for the information gain field and 2) combine the implicit fine-grained representation with coarse volumetric representations to improve efficiency. Further with the improved efficiency, we propose a novel informative path planning based on a graph-based planner. Our method demonstrates significant improvements in the reconstruction quality and planning efficiency compared with autonomous reconstructions with implicit and explicit representations. We deploy the method on a real UAV and the results show that our method can plan informative views and reconstruct a scene with high quality.\n  - [WaterNeRF: Neural Radiance Fields for Underwater Scenes](https://arxiv.org/abs/2209.13091) | [code]\n    > Underwater imaging is a critical task performed by marine robots for a wide range of applications including aquaculture, marine infrastructure inspection, and environmental monitoring. However, water column effects, such as attenuation and backscattering, drastically change the color and quality of imagery captured underwater. Due to varying water conditions and range-dependency of these effects, restoring underwater imagery is a challenging problem. 
This impacts downstream perception tasks including depth estimation and 3D reconstruction. In this paper, we advance state-of-the-art in neural radiance fields (NeRFs) to enable physics-informed dense depth estimation and color correction. Our proposed method, WaterNeRF, estimates parameters of a physics-based model for underwater image formation, leading to a hybrid data-driven and model-based solution. After determining the scene structure and radiance field, we can produce novel views of degraded as well as corrected underwater images, along with dense depth of the scene. We evaluate the proposed method qualitatively and quantitatively on a real underwater dataset.\n  - [Neural Global Illumination: Interactive Indirect Illumination Prediction under Dynamic Area Lights, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9904431) | [code]\n    > We propose neural global illumination, a novel method for fast rendering full global illumination in static scenes with dynamic viewpoint and area lighting. The key idea of our method is to utilize a deep rendering network to model the complex mapping from each shading point to global illumination. To efficiently learn the mapping, we propose a neural-network-friendly input representation including attributes of each shading point, viewpoint information, and a combinational lighting representation that enables high-quality fitting with a compact neural network. To synthesize high-frequency global illumination effects, we transform the low-dimension input to higher-dimension space by positional encoding and model the rendering network as a deep fully-connected network. Besides, we feed a screen-space neural buffer to our rendering network to share global information between objects in the screen-space to each shading point. 
We have demonstrated our neural global illumination method in rendering a wide variety of scenes exhibiting complex and all-frequency global illumination effects such as multiple-bounce glossy interreflection, color bleeding, and caustics.\n  - [Baking in the Feature: Accelerating Volumetric Segmentation by Rendering Feature Maps](https://arxiv.org/abs/2209.12744) | [code]\n    > Methods have recently been proposed that densely segment 3D volumes into classes using only color images and expert supervision in the form of sparse semantically annotated pixels. While impressive, these methods still require a relatively large amount of supervision and segmenting an object can take several minutes in practice. Such systems typically only optimize their representation on the particular scene they are fitting, without leveraging any prior information from previously seen images. In this paper, we propose to use features extracted with models trained on large existing datasets to improve segmentation performance. We bake this feature representation into a Neural Radiance Field (NeRF) by volumetrically rendering feature maps and supervising on features extracted from each input image. We show that by baking this representation into the NeRF, we make the subsequent classification task much easier. Our experiments show that our method achieves higher segmentation accuracy with fewer semantic annotations than existing methods over a wide range of scenes.\n## Sep18 - Sep24, 2022\n  - [Local_INN: Implicit Map Representation and Localization with Invertible Neural Networks](https://arxiv.org/abs/2209.11925) | [code]\n    > Robot localization is an inverse problem of finding a robot's pose using a map and sensor measurements. In recent years, Invertible Neural Networks (INNs) have successfully solved ambiguous inverse problems in various fields. This paper proposes a framework that solves the localization problem with INN. 
We design an INN that provides implicit map representation in the forward path and localization in the inverse path. By sampling the latent space in evaluation, Local\\_INN outputs robot poses with covariance, which can be used to estimate the uncertainty. We show that the localization performance of Local\\_INN is on par with current methods with much lower latency. We show detailed 2D and 3D map reconstruction from Local\\_INN using poses exterior to the training set. We also provide a global localization algorithm using Local\\_INN to tackle the kidnapping problem.\n  - [NeRF-Loc: Transformer-Based Object Localization Within Neural Radiance Fields](https://arxiv.org/abs/2209.12068) | [code]\n    > Neural Radiance Fields (NeRFs) have been successfully used for scene representation. Recent works have also developed robotic navigation and manipulation systems using NeRF-based environment representations. As object localization is the foundation for many robotic applications, to further unleash the potential of NeRFs in robotic systems, we study object localization within a NeRF scene. We propose a transformer-based framework NeRF-Loc to extract 3D bounding boxes of objects in NeRF scenes. NeRF-Loc takes a pre-trained NeRF model and camera view as input, and produces labeled 3D bounding boxes of objects as output. Concretely, we design a pair of paralleled transformer encoder branches, namely the coarse stream and the fine stream, to encode both the context and details of target objects. The encoded features are then fused together with attention layers to alleviate ambiguities for accurate object localization. We have compared our method with the conventional transformer-based method and our method achieves better performance. 
In addition, we also present the first NeRF samples-based object localization benchmark NeRFLocBench.\n  - [SG-SRNs: Superpixel-Guided Scene Representation Networks, SignalProcessingLetters](https://ieeexplore.ieee.org/abstract/document/9900405) | [code]\n    > Recently, Scene Representation Networks (SRNs) have attracted increasing attention in computer vision, due to their continuous and light-weight scene representation ability. However, SRNs generally perform poorly on low-texture image regions. Addressing this problem, we propose superpixel-guided scene representation networks in this paper, called SG-SRNs, consisting of a backbone module (SRNs), a superpixel segmentation module, and a superpixel regularization module. In the proposed method, except for the novel view synthesis task, the task of representation-aware superpixel segmentation mask generation is realized by the proposed superpixel segmentation module. Then, the superpixel regularization module utilizes the superpixel segmentation mask to guide the backbone to be learned in a locally smooth way, and optimizes the scene representations of the local regions to indirectly alleviate the structure distortion of low-texture regions in a self-supervised manner. Extensive experimental results on both our constructed datasets and the public Synthetic-NeRF dataset demonstrated that the proposed SG-SRNs achieved a significantly better 3D structure representing performance.\n  - [PNeRF: Probabilistic Neural Scene Representations for Uncertain 3D Visual Mapping, ICRA2023](https://arxiv.org/abs/2209.11677) | [code]\n    > Recently neural scene representations have provided very impressive results for representing 3D scenes visually, however, their study and progress have mainly been limited to visualization of virtual models in computer graphics or scene reconstruction in computer vision without explicitly accounting for sensor and pose uncertainty. 
Using this novel scene representation in robotics applications, however, would require accounting for this uncertainty in the neural map. The aim of this paper is therefore to propose a novel method for training *probabilistic neural scene representations* with uncertain training data that could enable the inclusion of these representations in robotics applications. Acquiring images using cameras or depth sensors contains inherent uncertainty, and furthermore, the camera poses used for learning a 3D model are also imperfect. If these measurements are used for training without accounting for their uncertainty, then the resulting models are non-optimal, and the resulting scene representations are likely to contain artifacts such as blur and uneven geometry. In this work, the problem of uncertainty integration to the learning process is investigated by focusing on training with uncertain information in a probabilistic manner. The proposed method involves explicitly augmenting the training likelihood with an uncertainty term such that the learnt probability distribution of the network is minimized with respect to the training uncertainty. It will be shown that this leads to more accurate image rendering quality, in addition to more precise and consistent geometry. Validation has been carried out on both synthetic and real datasets showing that the proposed approach outperforms state-of-the-art methods. The results show notably that the proposed method is capable of rendering novel high-quality views even when the training data is limited.\n  - [How Does It Feel? Self-Supervised Costmap Learning for Off-Road Vehicle Traversability](https://arxiv.org/abs/2209.10788) | [code]\n    > Estimating terrain traversability in off-road environments requires reasoning about complex interaction dynamics between the robot and these terrains. 
However, it is challenging to build an accurate physics model, or create informative labels to learn a model in a supervised manner, for these interactions. We propose a method that learns to predict traversability costmaps by combining exteroceptive environmental information with proprioceptive terrain interaction feedback in a self-supervised manner. Additionally, we propose a novel way of incorporating robot velocity in the costmap prediction pipeline. We validate our method in multiple short and large-scale navigation tasks on a large, autonomous all-terrain vehicle (ATV) on challenging off-road terrains, and demonstrate ease of integration on a separate large ground robot. Our short-scale navigation results show that using our learned costmaps leads to overall smoother navigation, and provides the robot with a more fine-grained understanding of the interactions between the robot and different terrain types, such as grass and gravel. Our large-scale navigation trials show that we can reduce the number of interventions by up to 57% compared to an occupancy-based navigation baseline in challenging off-road courses ranging from 400 m to 3150 m.\n  - [Edge-oriented Implicit Neural Representation with Channel Tuning](https://arxiv.org/abs/2209.11697) | [code]\n    > Implicit neural representation, which expresses an image as a continuous function rather than a discrete grid form, is widely used for image processing. Despite its outperforming results, there are still remaining limitations on restoring clear shapes of a given signal such as the edges of an image. In this paper, we propose Gradient Magnitude Adjustment algorithm which calculates the gradient of an image for training the implicit representation. In addition, we propose Edge-oriented Representation Network (EoREN) that can reconstruct the image with clear edges by fitting gradient information (Edge-oriented module). 
Furthermore, we add Channel-tuning module to adjust the distribution of given signals so that it solves a chronic problem of fitting gradients. By separating backpropagation paths of the two modules, EoREN can learn true color of the image without hindering the role for gradients. We qualitatively show that our model can reconstruct complex signals and demonstrate general reconstruction ability of our model with quantitative results.\n  - [Fast Disparity Estimation from a Single Compressed Light Field Measurement](https://arxiv.org/abs/2209.11342) | [code]\n    > The abundant spatial and angular information from light fields has allowed the development of multiple disparity estimation approaches. However, the acquisition of light fields requires high storage and processing cost, limiting the use of this technology in practical applications. To overcome these drawbacks, the compressive sensing (CS) theory has allowed the development of optical architectures to acquire a single coded light field measurement. This measurement is decoded using an optimization algorithm or deep neural network that requires high computational costs. The traditional approach for disparity estimation from compressed light fields requires first recovering the entire light field and then a post-processing step, thus requiring long times. In contrast, this work proposes a fast disparity estimation from a single compressed measurement by omitting the recovery step required in traditional approaches. Specifically, we propose to jointly optimize an optical architecture for acquiring a single coded light field snapshot and a convolutional neural network (CNN) for estimating the disparity maps. Experimentally, the proposed method estimates disparity maps comparable with those obtained from light fields reconstructed using deep learning approaches. 
Furthermore, the proposed method is 20 times faster in training and inference than the best method that estimates the disparity from reconstructed light fields.\n  - [FNeVR: Neural Volume Rendering for Face Animation](https://arxiv.org/abs/2209.10340) | [code]\n    > Face animation, one of the hottest topics in computer vision, has achieved a promising performance with the help of generative models. However, it remains a critical challenge to generate identity preserving and photo-realistic images due to the sophisticated motion deformation and complex facial detail modeling. To address these problems, we propose a Face Neural Volume Rendering (FNeVR) network to fully explore the potential of 2D motion warping and 3D volume rendering in a unified framework. In FNeVR, we design a 3D Face Volume Rendering (FVR) module to enhance the facial details for image rendering. Specifically, we first extract 3D information with a well-designed architecture, and then introduce an orthogonal adaptive ray-sampling module for efficient rendering. We also design a lightweight pose editor, enabling FNeVR to edit the facial pose in a simple yet effective way. Extensive experiments show that our FNeVR obtains the best overall quality and performance on widely used talking-head benchmarks.\n  - [PREF: Predictability Regularized Neural Motion Fields, ECCV2022(oral)](https://arxiv.org/abs/2209.10691) | [code]\n    > Knowing the 3D motions in a dynamic scene is essential to many vision applications. Recent progress is mainly focused on estimating the activity of some specific elements like humans. In this paper, we leverage a neural motion field for estimating the motion of all points in a multiview setting. Modeling the motion from a dynamic scene with multiview data is challenging due to the ambiguities in points of similar color and points with time-varying color. We propose to regularize the estimated motion to be predictable. 
If the motion from previous frames is known, then the motion in the near future should be predictable. Therefore, we introduce a predictability regularization by first conditioning the estimated motion on latent embeddings, then by adopting a predictor network to enforce predictability on the embeddings. The proposed framework PREF (Predictability REgularized Fields) achieves on par or better results than state-of-the-art neural motion field-based dynamic scene representation methods, while requiring no prior knowledge of the scene.\n  - [wildNeRF: Complete view synthesis of in-the-wild dynamic scenes captured using sparse monocular data](https://arxiv.org/abs/2209.10399) | [code]\n    > We present a novel neural radiance model that is trainable in a self-supervised manner for novel-view synthesis of dynamic unstructured scenes. Our end-to-end trainable algorithm learns highly complex, real-world static scenes within seconds and dynamic scenes with both rigid and non-rigid motion within minutes. By differentiating between static and motion-centric pixels, we create high-quality representations from a sparse set of images. We perform extensive qualitative and quantitative evaluation on existing benchmarks and set the state-of-the-art on performance measures on the challenging NVIDIA Dynamic Scenes Dataset. Additionally, we evaluate our model performance on challenging real-world datasets such as Cholec80 and SurgicalActions160.\n  - [Loc-NeRF: Monte Carlo Localization using Neural Radiance Fields](https://arxiv.org/abs/2209.09050) | [***``[code]``***](https://github.com/MIT-SPARK/Loc-NeRF)\n    > We present Loc-NeRF, a real-time vision-based robot localization approach that combines Monte Carlo localization and Neural Radiance Fields (NeRF). Our system uses a pre-trained NeRF model as the map of an environment and can localize itself in real-time using an RGB camera as the only exteroceptive sensor onboard the robot. 
While neural radiance fields have seen significant applications for visual rendering in computer vision and graphics, they have found limited use in robotics. Existing approaches for NeRF-based localization require both a good initial pose guess and significant computation, making them impractical for real-time robotics applications. By using Monte Carlo localization as a workhorse to estimate poses using a NeRF map model, Loc-NeRF is able to perform localization faster than the state of the art and without relying on an initial pose estimate. In addition to testing on synthetic data, we also run our system using real data collected by a Clearpath Jackal UGV and demonstrate for the first time the ability to perform real-time global localization with neural radiance fields. We make our code publicly available at this https URL.\n  - [Density-aware NeRF Ensembles: Quantifying Predictive Uncertainty in Neural Radiance Fields](https://arxiv.org/abs/2209.08718) | [code]\n    > We show that ensembling effectively quantifies model uncertainty in Neural Radiance Fields (NeRFs) if a density-aware epistemic uncertainty term is considered. The naive ensembles investigated in prior work simply average rendered RGB images to quantify the model uncertainty caused by conflicting explanations of the observed scene. In contrast, we additionally consider the termination probabilities along individual rays to identify epistemic model uncertainty due to a lack of knowledge about the parts of a scene unobserved during training. We achieve new state-of-the-art performance across established uncertainty quantification benchmarks for NeRFs, outperforming methods that require complex changes to the NeRF architecture and training regime. 
We furthermore demonstrate that NeRF uncertainty can be utilised for next-best view selection and model refinement.\n  - [NeRF-SOS: Any-View Self-supervised Object Segmentation on Complex Scenes](https://zhiwenfan.github.io/NeRF-SOS/) | [***``[code]``***](https://github.com/VITA-Group/NeRF-SOS)\n    > Neural volumetric representations have shown the potential that Multi-layer Perceptrons (MLPs) can be optimized with multi-view calibrated images to represent scene geometry and appearance, without explicit 3D supervision. Object segmentation can enrich many downstream applications based on the learned radiance field. However, introducing hand-crafted segmentation to define regions of interest in a complex real-world scene is non-trivial and expensive as it acquires per view annotation. This paper carries out the exploration of self-supervised learning for object segmentation using NeRF for complex real-world scenes. Our framework, called NeRF with Self-supervised Object Segmentation NeRF-SOS, couples object segmentation and neural radiance field to segment objects in any view within a scene. By proposing a novel collaborative contrastive loss in both appearance and geometry levels, NeRF-SOS encourages NeRF models to distill compact geometry-aware segmentation clusters from their density fields and the self-supervised pre-trained 2D visual features. The self-supervised object segmentation framework can be applied to various NeRF models that both lead to photo-realistic rendering results and convincing segmentation maps for both indoor and outdoor scenarios. Extensive results on the LLFF, Tank & Temple, and BlendedMVS datasets validate the effectiveness of NeRF-SOS. It consistently surpasses other 2D-based self-supervised baselines and predicts finer semantics masks than existing supervised counterparts. 
Please refer to the video on our project page for more details: this https URL.\n  - [MeSLAM: Memory Efficient SLAM based on Neural Fields, SMC2022](https://arxiv.org/abs/2209.09357) | [code]\n    > Existing Simultaneous Localization and Mapping (SLAM) approaches are limited in their scalability due to growing map size in long-term robot operation. Moreover, processing such maps for localization and planning tasks leads to the increased computational resources required onboard. To address the problem of memory consumption in long-term operation, we develop a novel real-time SLAM algorithm, MeSLAM, that is based on neural field implicit map representation. It combines the proposed global mapping strategy, including neural networks distribution and region tracking, with an external odometry system. As a result, the algorithm is able to efficiently train multiple networks representing different map regions and track poses accurately in large-scale environments. Experimental results show that the accuracy of the proposed approach is comparable to the state-of-the-art methods (on average, 6.6 cm on TUM RGB-D sequences) and outperforms the baseline, iMAP∗. Moreover, the proposed SLAM approach provides the most compact-sized maps without details distortion (1.9 MB to store 57 m³) among the state-of-the-art SLAM approaches.\n  - [Human Performance Modeling and Rendering via Neural Animated Mesh](https://arxiv.org/abs/2209.08468) | [code]\n    > We have recently seen tremendous progress in the neural advances for photo-real human modeling and rendering. However, it's still challenging to integrate them into an existing mesh-based pipeline for downstream applications. In this paper, we present a comprehensive neural approach for high-quality reconstruction, compression, and rendering of human performances from dense multi-view videos. Our core intuition is to bridge the traditional animated mesh workflow with a new class of highly efficient neural techniques. 
We first introduce a neural surface reconstructor for high-quality surface generation in minutes. It marries the implicit volumetric rendering of the truncated signed distance field (TSDF) with multi-resolution hash encoding. We further propose a hybrid neural tracker to generate animated meshes, which combines explicit non-rigid tracking with implicit dynamic deformation in a self-supervised framework. The former provides the coarse warping back into the canonical space, while the latter implicit one further predicts the displacements using the 4D hash encoding as in our reconstructor. Then, we discuss the rendering schemes using the obtained animated meshes, ranging from dynamic texturing to lumigraph rendering under various bandwidth settings. To strike an intricate balance between quality and bandwidth, we propose a hierarchical solution by first rendering 6 virtual views covering the performer and then conducting occlusion-aware neural texture blending. We demonstrate the efficacy of our approach in a variety of mesh-based applications and photo-realistic free-view experiences on various platforms, i.e., inserting virtual human performances into real environments through mobile AR or immersively watching talent shows with VR headsets.\n  - [LATITUDE: Robotic Global Localization with Truncated Dynamic Low-pass Filter in City-scale NeRF, ICRA2023](https://arxiv.org/abs/2209.08498) | [***``[code]``***](https://github.com/jike5/LATITUDE)\n    > Neural Radiance Fields (NeRFs) have made great success in representing complex 3D scenes with high-resolution details and efficient memory. Nevertheless, current NeRF-based pose estimators have no initial pose prediction and are prone to local optima during optimization. In this paper, we present LATITUDE: Global Localization with Truncated Dynamic Low-pass Filter, which introduces a two-stage localization mechanism in city-scale NeRF. 
In place recognition stage, we train a regressor through images generated from trained NeRFs, which provides an initial value for global localization. In pose optimization stage, we minimize the residual between the observed image and rendered image by directly optimizing the pose on tangent plane. To avoid convergence to local optimum, we introduce a Truncated Dynamic Low-pass Filter (TDLF) for coarse-to-fine pose registration. We evaluate our method on both synthetic and real-world data and show its potential applications for high-precision navigation in large-scale city scenes. Codes and data will be publicly available at this https URL.\n  - [Neural Implicit Surface Reconstruction using Imaging Sonar](https://arxiv.org/abs/2209.08221) | [code]\n    > We present a technique for dense 3D reconstruction of objects using an imaging sonar, also known as forward-looking sonar (FLS). Compared to previous methods that model the scene geometry as point clouds or volumetric grids, we represent the geometry as a neural implicit function. Additionally, given such a representation, we use a differentiable volumetric renderer that models the propagation of acoustic waves to synthesize imaging sonar measurements. We perform experiments on real and synthetic datasets and show that our algorithm reconstructs high-fidelity surface geometry from multi-view FLS images at much higher quality than was possible with previous techniques and without suffering from their associated memory overhead.\n  - [Uncertainty Guided Policy for Active Robotic 3D Reconstruction using Neural Radiance Fields, RAL2022](https://arxiv.org/abs/2209.08409) | [code]\n    > In this paper, we tackle the problem of active robotic 3D reconstruction of an object. In particular, we study how a mobile robot with an arm-held camera can select a favorable number of views to recover an object's 3D shape efficiently. 
Contrary to the existing solution to this problem, we leverage the popular neural radiance fields-based object representation, which has recently shown impressive results for various computer vision tasks. However, it is not straightforward to directly reason about an object's explicit 3D geometric details using such a representation, making the next-best-view selection problem for dense 3D reconstruction challenging. This paper introduces a ray-based volumetric uncertainty estimator, which computes the entropy of the weight distribution of the color samples along each ray of the object's implicit neural representation. We show that it is possible to infer the uncertainty of the underlying 3D geometry given a novel view with the proposed estimator. We then present a next-best-view selection policy guided by the ray-based volumetric uncertainty in neural radiance fields-based representations. Encouraging experimental results on synthetic and real-world data suggest that the approach presented in this paper can enable a new research direction of using an implicit 3D object representation for the next-best-view problem in robot vision applications, distinguishing our approach from the existing approaches that rely on explicit 3D geometric modeling.\n  - [Implicit Neural Representations for Medical Imaging Segmentation, MICCAI2022](https://link.springer.com/chapter/10.1007/978-3-031-16443-9_42) | [code]\n    > 3D signals in medical imaging, such as CT scans, are usually parameterized as a discrete grid of voxels. For instance, existing state-of-the-art organ segmentation methods learn discrete segmentation maps. Unfortunately, the memory requirements of such methods grow cubically with increasing spatial resolution, which makes them unsuitable for processing high resolution scans. To overcome this, we design an Implicit Organ Segmentation Network (IOSNet) that utilizes continuous Implicit Neural Representations and has several useful properties. 
Firstly, the IOSNet decoder memory is roughly constant and independent of the spatial resolution since it parameterizes the segmentation map as a continuous function. Secondly, IOSNet converges much faster than discrete voxel based methods due to its ability to accurately segment organs irrespective of organ sizes, thereby alleviating size imbalance issues without requiring any auxiliary tricks. Thirdly, IOSNet naturally supports super-resolution (i.e. sampling at arbitrary resolutions during inference) due to its continuous learnt representations. Moreover, despite using a simple lightweight decoder, IOSNet consistently outperforms the discrete specialized segmentation architecture UNet. Hence, our approach demonstrates that Implicit Neural Representations are well-suited for medical imaging applications, especially for processing high-resolution 3D medical scans.\n  - [ActiveNeRF: Learning where to See with Uncertainty Estimation](https://arxiv.org/abs/2209.08546) | [***``[code]``***](https://github.com/LeapLabTHU/ActiveNeRF)\n    > Recently, Neural Radiance Fields (NeRF) has shown promising performances on reconstructing 3D scenes and synthesizing novel views from a sparse set of 2D images. Albeit effective, the performance of NeRF is highly influenced by the quality of training samples. With limited posed images from the scene, NeRF fails to generalize well to novel views and may collapse to trivial solutions in unobserved regions. This makes NeRF impractical under resource-constrained scenarios. In this paper, we present a novel learning framework, ActiveNeRF, aiming to model a 3D scene with a constrained input budget. Specifically, we first incorporate uncertainty estimation into a NeRF model, which ensures robustness under few observations and provides an interpretation of how NeRF understands the scene. On this basis, we propose to supplement the existing training set with newly captured samples based on an active learning scheme. 
The neural implicit mapper is trained on-the-fly, while the neural tracker is pretrained on the ScanNet dataset, it is also finetuned along with the training of the neural implicit mapper.
Moreover, the recent advances in neural radiance fields, are revolutionising novel-view synthesis of known scenes. In this work, we present a facial 3D Morphable Model, which exploits both of the above, and can accurately model a subject's identity, pose and expression and render it in arbitrary illumination. This is achieved by utilizing a powerful deep style-based generator to overcome two main weaknesses of neural radiance fields, their rigidity and rendering speed. We introduce a style-based generative network that synthesizes in one pass all and only the required rendering samples of a neural radiance field. We create a vast labelled synthetic dataset of facial renders, and train the network on these data, so that it can accurately model and generalize on facial identity, pose and appearance. Finally, we show that this model can accurately be fit to \"in-the-wild\" facial images of arbitrary pose and illumination, extract the facial characteristics, and be used to re-render the face in controllable conditions.\n  - [DevNet: Self-supervised Monocular Depth Learning via Density Volume Construction, ECCV2022](https://arxiv.org/abs/2209.06351) | [code]\n    > Self-supervised depth learning from monocular images normally relies on the 2D pixel-wise photometric relation between temporally adjacent image frames. However, they neither fully exploit the 3D point-wise geometric correspondences, nor effectively tackle the ambiguities in the photometric warping caused by occlusions or illumination inconsistency. To address these problems, this work proposes Density Volume Construction Network (DevNet), a novel self-supervised monocular depth learning framework, that can consider 3D spatial information, and exploit stronger geometric constraints among adjacent camera frustums. 
Instead of directly regressing the pixel value from a single image, our DevNet divides the camera frustum into multiple parallel planes and predicts the pointwise occlusion probability density on each plane. The final depth map is generated by integrating the density along corresponding rays. During the training process, novel regularization strategies and loss functions are introduced to mitigate photometric ambiguities and overfitting. Without obviously enlarging model parameters size or running time, DevNet outperforms several representative baselines on both the KITTI-2015 outdoor dataset and NYU-V2 indoor dataset. In particular, the root-mean-square-deviation is reduced by around 4% with DevNet on both KITTI-2015 and NYU-V2 in the task of depth estimation. Code is available at this https URL.\n  - [Explicitly Controllable 3D-Aware Portrait Generation](https://arxiv.org/abs/2209.05434) | [code]\n    > In contrast to the traditional avatar creation pipeline which is a costly process, contemporary generative approaches directly learn the data distribution from photographs. While plenty of works extend unconditional generative models and achieve some levels of controllability, it is still challenging to ensure multi-view consistency, especially in large poses. In this work, we propose a network that generates 3D-aware portraits while being controllable according to semantic parameters regarding pose, identity, expression and illumination. Our network uses neural scene representation to model 3D-aware portraits, whose generation is guided by a parametric face model that supports explicit control. While the latent disentanglement can be further enhanced by contrasting images with partially different attributes, there still exists noticeable inconsistency in non-face areas, e.g., hair and background, when animating expressions. 
We solve this by proposing a volume blending strategy in which we form a composite output by blending dynamic and static areas, with two parts segmented from the jointly learned semantic field.
Extensive experiments on several real-world datasets demonstrate that StructNeRF surpasses state-of-the-art methods for indoor scenes with sparse inputs both quantitatively and qualitatively.\n  - [Learning A Unified 3D Point Cloud for View Synthesis](https://arxiv.org/abs/2209.05013) | [code]\n    > 3D point cloud representation-based view synthesis methods have demonstrated effectiveness. However, existing methods usually synthesize novel views only from a single source view, and it is non-trivial to generalize them to handle multiple source views for pursuing higher reconstruction quality. In this paper, we propose a new deep learning-based view synthesis paradigm, which learns a unified 3D point cloud from different source views. Specifically, we first construct sub-point clouds by projecting source views to 3D space based on their depth maps. Then, we learn the unified 3D point cloud by adaptively fusing points at a local neighborhood defined on the union of the sub-point clouds. Besides, we also propose a 3D geometry-guided image restoration module to fill the holes and recover high-frequency details of the rendered novel views. Experimental results on three benchmark datasets demonstrate that our method outperforms state-of-the-art view synthesis methods to a large extent both quantitatively and visually.\n  - [Self-Supervised Coordinate Projection Network for Sparse-View Computed Tomography](https://arxiv.org/abs/2209.05483) | [code]\n    > In the present work, we propose a Self-supervised COordinate Projection nEtwork (SCOPE) to reconstruct the artifacts-free CT image from a single SV sinogram by solving the inverse tomography imaging problem. Compared with recent related works that solve similar problems using implicit neural representation network (INR), our essential contribution is an effective and simple re-projection strategy that pushes the tomography image reconstruction quality over supervised deep learning CT reconstruction works. 
The proposed strategy is inspired by the simple relationship between linear algebra and inverse problems. To solve the under-determined linear equation system, we first introduce INR to constrain the solution space via image continuity prior and achieve a rough solution. And secondly, we propose to generate a dense view sinogram that improves the rank of the linear equation system and produces a more stable CT image solution space. Our experiment results demonstrate that the re-projection strategy significantly improves the image reconstruction quality (+3 dB for PSNR at least). Besides, we integrate the recent hash encoding into our SCOPE model, which greatly accelerates the model training. Finally, we evaluate SCOPE in parallel and fan X-ray beam SVCT reconstruction tasks. Experimental results indicate that the proposed SCOPE model outperforms two latest INR-based methods and two well-popular supervised DL methods quantitatively and qualitatively.\n  - [CU-Net: Efficient Point Cloud Color Upsampling Network](https://arxiv.org/abs/2209.06112) | [code]\n    > Point cloud upsampling is necessary for Augmented Reality, Virtual Reality, and telepresence scenarios. Although the geometry upsampling is well studied to densify point cloud coordinates, the upsampling of colors has been largely overlooked. In this paper, we propose CU-Net, the first deep-learning point cloud color upsampling model. Leveraging a feature extractor based on sparse convolution and a color prediction module based on neural implicit function, CU-Net achieves linear time and space complexity. Therefore, CU-Net is theoretically guaranteed to be more efficient than most existing methods with quadratic complexity. Experimental results demonstrate that CU-Net can colorize a photo-realistic point cloud with nearly a million points in real time, while having better visual quality than baselines. Besides, CU-Net can adapt to an arbitrary upsampling ratio and unseen objects. 
Both theoretical and experimental analyses indicate that, given a random initial model, IFWI is able to converge to the global minimum and produce a high-resolution image of subsurface with fine structures.
Furthermore, IFWI has a certain degree of robustness and strong generalization ability that are exemplified in the experiments of various 2D geological models. With proper setup, IFWI can also be well suited for multi-scale joint geophysical inversion.\n  - [3D Textured Shape Recovery with Learned Geometric Priors](https://arxiv.org/abs/2209.03254) | [code]\n    > 3D textured shape recovery from partial scans is crucial for many real-world applications. Existing approaches have demonstrated the efficacy of implicit function representation, but they suffer from partial inputs with severe occlusions and varying object types, which greatly hinders their application value in the real world. This technical report presents our approach to address these limitations by incorporating learned geometric priors. To this end, we generate a SMPL model from learned pose prediction and fuse it into the partial input to add prior knowledge of human bodies. We also propose a novel completeness-aware bounding box adaptation for handling different levels of scales and partialness of partial scans.\n  - [SIRA: Relightable Avatars from a Single Image](https://arxiv.org/abs/2209.03027) | [code]\n    > Recovering the geometry of a human head from a single image, while factorizing the materials and illumination is a severely ill-posed problem that requires prior information to be solved. Methods based on 3D Morphable Models (3DMM), and their combination with differentiable renderers, have shown promising results. However, the expressiveness of 3DMMs is limited, and they typically yield over-smoothed and identity-agnostic 3D shapes limited to the face region. Highly accurate full head reconstructions have recently been obtained with neural fields that parameterize the geometry using multilayer perceptrons. The versatility of these representations has also proved effective for disentangling geometry, materials and lighting. However, these methods require several tens of input images. 
In this paper, we introduce SIRA, a method which, from a single image, reconstructs human head avatars with high fidelity geometry and factorized lights and surface materials. Our key ingredients are two data-driven statistical models based on neural fields that resolve the ambiguities of single-view 3D surface reconstruction and appearance factorization. Experiments show that SIRA obtains state of the art results in 3D head reconstruction while at the same time it successfully disentangles the global illumination, and the diffuse and specular albedos. Furthermore, our reconstructions are amenable to physically-based appearance editing and head model relighting.\n  - [Neural Feature Fusion Fields: 3D Distillation of Self-Supervised 2D Image Representations, 3DV2022(oral)](https://arxiv.org/abs/2209.03494) | [***``[code]``***](https://github.com/dichotomies/N3F)\n    > We present Neural Feature Fusion Fields (N3F), a method that improves dense 2D image feature extractors when the latter are applied to the analysis of multiple images reconstructible as a 3D scene. Given an image feature extractor, for example pre-trained using self-supervision, N3F uses it as a teacher to learn a student network defined in 3D space. The 3D student network is similar to a neural radiance field that distills said features and can be trained with the usual differentiable rendering machinery. As a consequence, N3F is readily applicable to most neural rendering formulations, including vanilla NeRF and its extensions to complex dynamic scenes. We show that our method not only enables semantic understanding in the context of scene-specific neural fields without the use of manual labels, but also consistently improves over the self-supervised 2D baselines. 
This is demonstrated by considering various tasks, such as 2D object retrieval, 3D segmentation, and scene editing, in diverse sequences, including long egocentric videos in the EPIC-KITCHENS benchmark.\n  - [MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model](https://arxiv.org/abs/2208.15001) | [***``[code]``***](https://github.com/mingyuan-zhang/MotionDiffuse)\n    > Human motion modeling is important for many modern graphics applications, which typically require professional skills. In order to remove the skill barriers for laymen, recent motion generation methods can directly generate human motions conditioned on natural languages. However, it remains challenging to achieve diverse and fine-grained motion generation with various text inputs. To address this problem, we propose MotionDiffuse, the first diffusion model-based text-driven motion generation framework, which demonstrates several desired properties over existing methods. 1) Probabilistic Mapping. Instead of a deterministic language-motion mapping, MotionDiffuse generates motions through a series of denoising steps in which variations are injected. 2) Realistic Synthesis. MotionDiffuse excels at modeling complicated data distribution and generating vivid motion sequences. 3) Multi-Level Manipulation. MotionDiffuse responds to fine-grained instructions on body parts, and arbitrary-length motion synthesis with time-varied text prompts. Our experiments show MotionDiffuse outperforms existing SoTA methods by convincing margins on text-driven motion generation and action-conditioned motion generation. A qualitative analysis further demonstrates MotionDiffuse's controllability for comprehensive motion generation. Homepage: this https URL\n## Aug28 - Sep3, 2022\n  - [Multi-View Reconstruction using Signed Ray Distance Functions (SRDF)](https://arxiv.org/abs/2209.00082) | [code]\n    > In this paper, we address the problem of multi-view 3D shape reconstruction. 
Our experiments over standard datasets show that it provides state-of-the-art results with respect to recent approaches with implicit shape representations as well as with respect to traditional multi-view stereo methods.
Previous methods alleviate the inconsistency of lighting by learning a per-frame embedding, but this operation does not generalize to unseen poses. Given that the lighting condition is static in the world space while the human body is consistent in the canonical space, we propose a dual-space NeRF that models the scene lighting and the human body with two MLPs in two separate spaces. To bridge these two spaces, previous methods mostly rely on the linear blend skinning (LBS) algorithm. However, the blending weights for LBS of a dynamic neural field are intractable and thus are usually memorized with another MLP, which does not generalize to novel poses. Although it is possible to borrow the blending weights of a parametric mesh such as SMPL, the interpolation operation introduces more artifacts. In this paper, we propose to use the barycentric mapping, which can directly generalize to unseen poses and surprisingly achieves superior results than LBS with neural blending weights. Quantitative and qualitative results on the Human3.6M and the ZJU-MoCap datasets show the effectiveness of our method.\n  - [FoV-NeRF: Foveated Neural Radiance Fields for Virtual Reality, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9872532) | [code]\n    > Virtual Reality (VR) is becoming ubiquitous with the rise of consumer displays and commercial VR platforms. Such displays require low latency and high quality rendering of synthetic imagery with reduced compute overheads. Recent advances in neural rendering showed promise of unlocking new possibilities in 3D computer graphics via image-based representations of virtual or physical environments. Specifically, the neural radiance fields (NeRF) demonstrated that photo-realistic quality and continuous view changes of 3D scenes can be achieved without loss of view-dependent effects. 
To tackle these problems toward six-degrees-of-freedom, egocentric, and stereo NeRF in VR, we present the first gaze-contingent 3D neural representation and view synthesis method.
Experiments on 16 forward-facing scenes, featuring color, multi-spectral and infrared images, confirm the effectiveness of X-NeRF at modeling Cross-Spectral scene representations.\n  - [CLONeR: Camera-Lidar Fusion for Occupancy Grid-aided Neural Representations](https://arxiv.org/abs/2209.01194) | [code]\n    > This paper proposes CLONeR, which significantly improves upon NeRF by allowing it to model large outdoor driving scenes that are observed from sparse input sensor views. This is achieved by decoupling occupancy and color learning within the NeRF framework into separate Multi-Layer Perceptrons (MLPs) trained using LiDAR and camera data, respectively. In addition, this paper proposes a novel method to build differentiable 3D Occupancy Grid Maps (OGM) alongside the NeRF model, and leverage this occupancy grid for improved sampling of points along a ray for volumetric rendering in metric space.\n  - [NerfCap: Human Performance Capture With Dynamic Neural Radiance Fields, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9870173) | [code]\n    > This paper addresses the challenge of human performance capture from sparse multi-view or monocular videos. Given a template mesh of the performer, previous methods capture the human motion by non-rigidly registering the template mesh to images with 2D silhouettes or dense photometric alignment. However, the detailed surface deformation cannot be recovered from the silhouettes, while the photometric alignment suffers from instability caused by appearance variation in the videos. To solve these problems, we propose NerfCap, a novel performance capture method based on the dynamic neural radiance field (NeRF) representation of the performer. Specifically, a canonical NeRF is initialized from the template geometry and registered to the video frames by optimizing the deformation field and the appearance model of the canonical NeRF. 
To capture both large body motion and detailed surface deformation, NerfCap combines linear blend skinning with embedded graph deformation. In contrast to the mesh-based methods that suffer from fixed topology and texture, NerfCap is able to flexibly capture complex geometry and appearance variation across the videos, and synthesize more photo-realistic images. In addition, NerfCap can be pre-trained end to end in a self-supervised manner by matching the synthesized videos with the input videos. Experimental results on various datasets show that NerfCap outperforms prior works in terms of both surface reconstruction accuracy and novel-view synthesis quality.\n## Aug21 - Aug27, 2022\n  - [Training and Tuning Generative Neural Radiance Fields for Attribute-Conditional 3D-Aware Face Generation](https://arxiv.org/abs/2208.12550) | [***``[code]``***](https://github.com/zhangqianhui/TT-GNeRF)\n    > 3D-aware GANs based on generative neural radiance fields (GNeRF) have achieved impressive high-quality image generation, while preserving strong 3D consistency. The most notable achievements are made in the face generation domain. However, most of these models focus on improving view consistency but neglect a disentanglement aspect, thus these models cannot provide high-quality semantic/attribute control over generation. To this end, we introduce a conditional GNeRF model that uses specific attribute labels as input in order to improve the controllabilities and disentangling abilities of 3D-aware generative models. We utilize the pre-trained 3D-aware model as the basis and integrate a dual-branches attribute-editing module (DAEM), that utilize attribute labels to provide control over generation. Moreover, we propose a TRIOT (TRaining as Init, and Optimizing for Tuning) method to optimize the latent vector to improve the precision of the attribute-editing further. 
Extensive experiments on the widely used FFHQ show that our model yields high-quality editing with better view consistency while preserving the non-target regions. The code is available at this https URL.\n  - [Voxurf: Voxel-based Efficient and Accurate Neural Surface Reconstruction](https://arxiv.org/abs/2208.12697) | [code]\n    > Neural surface reconstruction aims to reconstruct accurate 3D surfaces based on multi-view images. Previous methods based on neural volume rendering mostly train a fully implicit model, and they require hours of training for a single scene. Recent efforts explore the explicit volumetric representation, which substantially accelerates the optimization process by memorizing significant information in learnable voxel grids. However, these voxel-based methods often struggle in reconstructing fine-grained geometry. Through empirical studies, we found that high-quality surface reconstruction hinges on two key factors: the capability of constructing a coherent shape and the precise modeling of color-geometry dependency. In particular, the latter is the key to the accurate reconstruction of fine details. Inspired by these findings, we develop Voxurf, a voxel-based approach for efficient and accurate neural surface reconstruction, which consists of two stages: 1) leverage a learnable feature grid to construct the color field and obtain a coherent coarse shape, and 2) refine detailed geometry with a dual color network that captures precise color-geometry dependency. We further introduce a hierarchical geometry feature to enable information sharing across voxels. Our experiments show that Voxurf achieves high efficiency and high quality at the same time. 
On the DTU benchmark, Voxurf achieves higher reconstruction quality compared to state-of-the-art methods, with 20x speedup in training.\n  - [Neural Novel Actor: Learning a Generalized Animatable Neural Representation for Human Actors](https://arxiv.org/abs/2208.11905) | [code]\n    > We propose a new method for learning a generalized animatable neural human representation from a sparse set of multi-view imagery of multiple persons. The learned representation can be used to synthesize novel view images of an arbitrary person from a sparse set of cameras, and further animate them with the user's pose control. While existing methods can either generalize to new persons or synthesize animations with user control, none of them can achieve both at the same time. We attribute this accomplishment to the employment of a 3D proxy for a shared multi-person human model, and further the warping of the spaces of different poses to a shared canonical pose space, in which we learn a neural field and predict the person- and pose-dependent deformations, as well as appearance with the features extracted from input images. To cope with the complexity of the large variations in body shapes, poses, and clothing deformations, we design our neural human model with disentangled geometry and appearance. Furthermore, we utilize the image features both at the spatial point and on the surface points of the 3D proxy for predicting person- and pose-dependent properties. Experiments show that our method significantly outperforms the state-of-the-arts on both tasks. The video and code are available at this https URL.\n  - [DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation](https://dreambooth.github.io/) | [code]\n    > Large text-to-image models achieved a remarkable leap in the evolution of AI, enabling high-quality and diverse synthesis of images from a given text prompt. 
However, these models lack the ability to mimic the appearance of subjects in a given reference set and synthesize novel renditions of them in different contexts. In this work, we present a new approach for \"personalization\" of text-to-image diffusion models (specializing them to users' needs). Given as input just a few images of a subject, we fine-tune a pretrained text-to-image model (Imagen, although our method is not limited to a specific model) such that it learns to bind a unique identifier with that specific subject. Once the subject is embedded in the output domain of the model, the unique identifier can then be used to synthesize fully-novel photorealistic images of the subject contextualized in different scenes. By leveraging the semantic prior embedded in the model with a new autogenous class-specific prior preservation loss, our technique enables synthesizing the subject in diverse scenes, poses, views, and lighting conditions that do not appear in the reference images. We apply our technique to several previously-unassailable tasks, including subject recontextualization, text-guided view synthesis, appearance modification, and artistic rendering (all while preserving the subject's key features). Project page: this https URL\n  - [E-NeRF: Neural Radiance Fields from a Moving Event Camera](https://arxiv.org/abs/2208.11300) | [code]\n    > Estimating neural radiance fields (NeRFs) from ideal images has been extensively studied in the computer vision community. Most approaches assume optimal illumination and slow camera motion. These assumptions are often violated in robotic applications, where images contain motion blur and the scene may not have suitable illumination. This can cause significant problems for downstream tasks such as navigation, inspection or visualization of the scene. To alleviate these problems we present E-NeRF, the first method which estimates a volumetric scene representation in the form of a NeRF from a fast-moving event camera. 
Our method can recover NeRFs during very fast motion and in high dynamic range conditions, where frame-based approaches fail. We show that rendering high-quality frames is possible by only providing an event stream as input. Furthermore, by combining events and frames, we can estimate NeRFs of higher quality than state-of-the-art approaches under severe motion blur. We also show that combining events and frames can overcome failure cases of NeRF estimation in scenarios where only few input views are available, without requiring additional regularization.\n  - [FurryGAN: High Quality Foreground-aware Image Synthesis, ECCV2022](https://jeongminb.github.io/FurryGAN/) | [***``[code]``***](https://jeongminb.github.io/FurryGAN/)\n    > Foreground-aware image synthesis aims to generate images as well as their foreground masks. A common approach is to formulate an image as a masked blending of a foreground image and a background image. It is a challenging problem because it is prone to reach the trivial solution where either image overwhelms the other, i.e., the masks become completely full or empty, and the foreground and background are not meaningfully separated. We present FurryGAN with three key components: 1) imposing both the foreground image and the composite image to be realistic, 2) designing a mask as a combination of coarse and fine masks, and 3) guiding the generator by an auxiliary mask predictor in the discriminator. Our method produces realistic images with remarkably detailed alpha masks which cover hair, fur, and whiskers in a fully unsupervised manner.\n  - [SCONE: Surface Coverage Optimization in Unknown Environments by Volumetric Integration](https://arxiv.org/abs/2208.10449) | [code]\n    > Next Best View computation (NBV) is a long-standing problem in robotics, and consists in identifying the next most informative sensor position(s) for reconstructing a 3D object or scene efficiently and accurately. 
Like most current methods, we consider NBV prediction from a depth sensor. Learning-based methods relying on a volumetric representation of the scene are suitable for path planning, but do not scale well with the size of the scene and have lower accuracy than methods using a surface-based representation. However, the latter constrain the camera to a small number of poses. To obtain the advantages of both representations, we show that we can maximize surface metrics by Monte Carlo integration over a volumetric representation. Our method scales to large scenes and handles free camera motion: It takes as input an arbitrarily large point cloud gathered by a depth sensor like Lidar systems as well as camera poses to predict NBV. We demonstrate our approach on a novel dataset made of large and complex 3D scenes.\n## Aug14 - Aug20, 2022\n  - [Vox-Surf: Voxel-based Implicit Surface Representation](https://arxiv.org/abs/2208.10925) | [code]\n    > Virtual content creation and interaction play an important role in modern 3D applications such as AR and VR. Recovering detailed 3D models from real scenes can significantly expand the scope of its applications and has been studied for decades in the computer vision and computer graphics community. We propose Vox-Surf, a voxel-based implicit surface representation. Our Vox-Surf divides the space into finite bounded voxels. Each voxel stores geometry and appearance information in its corner vertices. Vox-Surf is suitable for almost any scenario thanks to sparsity inherited from voxel representation and can be easily trained from multiple view images. 
We leverage the progressive training procedure to extract important voxels gradually for further optimization so that only valid voxels are preserved, which greatly reduces the number of sampling points and increases rendering speed. The fine voxels can also be considered as the bounding volume for collision detection. The experiments show that Vox-Surf representation can learn delicate surface details and accurate color with less memory and faster rendering speed than other methods. We also show that Vox-Surf can be more practical in scene editing and AR applications.\n  - [Temporal View Synthesis of Dynamic Scenes through 3D Object Motion Estimation with Multi-Plane Images, ISMAR2022](https://arxiv.org/abs/2208.09463) | [***``[code]``***](https://github.com/NagabhushanSN95/DeCOMPnet)\n    > The challenge of graphically rendering high frame-rate videos on low compute devices can be addressed through periodic prediction of future frames to enhance the user experience in virtual reality applications. This is studied through the problem of temporal view synthesis (TVS), where the goal is to predict the next frames of a video given the previous frames and the head poses of the previous and the next frames. In this work, we consider the TVS of dynamic scenes in which both the user and objects are moving. We design a framework that decouples the motion into user and object motion to effectively use the available user motion while predicting the next frames. We predict the motion of objects by isolating and estimating the 3D object motion in the past frames and then extrapolating it. We employ multi-plane images (MPI) as a 3D representation of the scenes and model the object motion as the 3D displacement between the corresponding points in the MPI representation. In order to handle the sparsity in MPIs while estimating the motion, we incorporate partial convolutions and masked correlation layers to estimate corresponding points. 
The predicted object motion is then integrated with the given user or camera motion to generate the next frame. Using a disocclusion infilling module, we synthesize the regions uncovered due to the camera and object motion. We develop a new synthetic dataset for TVS of dynamic scenes consisting of 800 videos at full HD resolution. We show through experiments on our dataset and the MPI Sintel dataset that our model outperforms all the competing methods in the literature.\n  - [LoRD: Local 4D Implicit Representation for High-Fidelity Dynamic Human Modeling, ECCV2022](https://arxiv.org/abs/2208.08622) | [code]\n    > Recent progress in 4D implicit representation focuses on globally controlling the shape and motion with low dimensional latent vectors, which is prone to missing surface details and accumulating tracking error. While many deep local representations have shown promising results for 3D shape modeling, their 4D counterpart does not exist yet. In this paper, we fill this blank by proposing a novel Local 4D implicit Representation for Dynamic clothed human, named LoRD, which has the merits of both 4D human modeling and local representation, and enables high-fidelity reconstruction with detailed surface deformations, such as clothing wrinkles. Particularly, our key insight is to encourage the network to learn the latent codes of local part-level representation, capable of explaining the local geometry and temporal deformations. To make the inference at test-time, we first estimate the inner body skeleton motion to track local parts at each time step, and then optimize the latent codes for each part via auto-decoding based on different types of observed data. 
Extensive experiments demonstrate that the proposed method has strong capability for representing 4D human, and outperforms state-of-the-art methods on practical applications, including 4D reconstruction from sparse points, non-rigid depth fusion, both qualitatively and quantitatively.\n  - [Neural Capture of Animatable 3D Human from Monocular Video, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > We present a novel paradigm of building an animatable 3D human representation from a monocular video input, such that it can be rendered in any unseen poses and views. Our method is based on a dynamic Neural Radiance Field (NeRF) rigged by a mesh-based parametric 3D human model serving as a geometry proxy. Previous methods usually rely on multi-view videos or accurate 3D geometry information as additional inputs; besides, most methods suffer from degraded quality when generalized to unseen poses. We identify that the key to generalization is a good input embedding for querying dynamic NeRF: A good input embedding should define an injective mapping in the full volumetric space, guided by surface mesh deformation under pose variation. Based on this observation, we propose to embed the input query with its relationship to local surface regions spanned by a set of geodesic nearest neighbors on mesh vertices. By including both position and relative distance information, our embedding defines a distance-preserved deformation mapping and generalizes well to unseen poses. To reduce the dependency on additional inputs, we first initialize per-frame 3D meshes using off-the-shelf tools and then propose a pipeline to jointly optimize NeRF and refine the initial mesh. 
Extensive experiments show our method can synthesize plausible human rendering results under unseen poses and views.\n  - [The 8-Point Algorithm as an Inductive Bias for Relative Pose Prediction by ViTs, 3DV2022](https://arxiv.org/abs/2208.08988) | [***``[code]``***](https://github.com/crockwell/rel_pose)\n    > We present a simple baseline for directly estimating the relative pose (rotation and translation, including scale) between two images. Deep methods have recently shown strong progress but often require complex or multi-stage architectures. We show that a handful of modifications can be applied to a Vision Transformer (ViT) to bring its computations close to the Eight-Point Algorithm. This inductive bias enables a simple method to be competitive in multiple settings, often substantially improving over the state of the art with strong performance gains in limited data regimes.\n  - [PDRF: Progressively Deblurring Radiance Field for Fast and Robust Scene Reconstruction from Blurry Images](https://arxiv.org/abs/2208.08049) | [code]\n    > We present Progressively Deblurring Radiance Field (PDRF), a novel approach to efficiently reconstruct high quality radiance fields from blurry images. While current State-of-The-Art (SoTA) scene reconstruction methods achieve photo-realistic rendering results from clean source views, their performances suffer when the source views are affected by blur, which is commonly observed for images in the wild. Previous deblurring methods either do not account for 3D geometry, or are computationally intense. To address these issues, PDRF, a progressively deblurring scheme in radiance field modeling, accurately models blur by incorporating 3D scene context. PDRF further uses an efficient importance sampling scheme, which results in fast scene optimization. Specifically, PDRF proposes a Coarse Ray Renderer to quickly estimate voxel density and feature; a Fine Voxel Renderer is then used to achieve high quality ray tracing. 
We perform extensive experiments and show that PDRF is 15X faster than previous SoTA while achieving better performance on both synthetic and real scenes.\n  - [Text-to-Image Generation via Implicit Visual Guidance and Hypernetwork](https://arxiv.org/abs/2208.08493) | [code]\n    > We develop an approach for text-to-image generation that embraces additional retrieval images, driven by a combination of implicit visual guidance loss and generative objectives. Unlike most existing text-to-image generation methods which merely take the text as input, our method dynamically feeds cross-modal search results into a unified training stage, hence improving the quality, controllability and diversity of generation results. We propose a novel hypernetwork modulated visual-text encoding scheme to predict the weight update of the encoding layer, enabling effective transfer from visual information (e.g. layout, content) into the corresponding latent domain. Experimental results show that our model guided with additional retrieval visual data outperforms existing GAN-based models. On COCO dataset, we achieve better FID of 9.13 with up to 3.5× fewer generator parameters, compared with the state-of-the-art method.\n  - [Casual Indoor HDR Radiance Capture from Omnidirectional Images](https://arxiv.org/abs/2208.07903) | [code]\n    > We present PanoHDR-NeRF, a novel pipeline to casually capture a plausible full HDR radiance field of a large indoor scene without elaborate setups or complex capture protocols. First, a user captures a low dynamic range (LDR) omnidirectional video of the scene by freely waving an off-the-shelf camera around the scene. Then, an LDR2HDR network uplifts the captured LDR frames to HDR, subsequently used to train a tailored NeRF++ model. The resulting PanoHDR-NeRF pipeline can estimate full HDR panoramas from any location of the scene. 
Through experiments on a novel test dataset of a variety of real scenes with the ground truth HDR radiance captured at locations not seen during training, we show that PanoHDR-NeRF predicts plausible radiance from any scene point. We also show that the HDR images produced by PanoHDR-NeRF can synthesize correct lighting effects, enabling the augmentation of indoor scenes with synthetic objects that are lit correctly.\n  - [UPST-NeRF: Universal Photorealistic Style Transfer of Neural Radiance Fields for 3D Scene](https://arxiv.org/abs/2208.07059) | [***``[code]``***](https://github.com/semchan/UPST-NeRF)\n    > 3D scenes photorealistic stylization aims to generate photorealistic images from arbitrary novel views according to a given style image while ensuring consistency when rendering from different viewpoints. Some existing stylization methods with neural radiance fields can effectively predict stylized scenes by combining the features of the style image with multi-view images to train 3D scenes. However, these methods generate novel view images that contain objectionable artifacts. Besides, they cannot achieve universal photorealistic stylization for a 3D scene. Therefore, a styling image must retrain a 3D scene representation network based on a neural radiation field. We propose a novel 3D scene photorealistic style transfer framework to address these issues. It can realize photorealistic 3D scene style transfer with a 2D style image. We first pre-trained a 2D photorealistic style transfer network, which can meet the photorealistic style transfer between any given content image and style image. Then, we use voxel features to optimize a 3D scene and get the geometric representation of the scene. Finally, we jointly optimize a hyper network to realize the scene photorealistic style transfer of arbitrary style images. 
In the transfer stage, we use a pre-trained 2D photorealistic network to constrain the photorealistic style of different views and different style images in the 3D scene. The experimental results show that our method not only realizes the 3D photorealistic style transfer of arbitrary style images but also outperforms the existing methods in terms of visual quality and consistency. Project page: this https URL.\n  - [DM-NeRF: 3D Scene Geometry Decomposition and Manipulation from 2D Images](https://arxiv.org/abs/2208.07227) | [***``[code]``***](https://github.com/vLAR-group/DM-NeRF)\n    > In this paper, we study the problem of 3D scene geometry decomposition and manipulation from 2D views. By leveraging the recent implicit neural representation techniques, particularly the appealing neural radiance fields, we introduce an object field component to learn unique codes for all individual objects in 3D space only from 2D supervision. The key to this component is a series of carefully designed loss functions to enable every 3D point, especially in non-occupied space, to be effectively optimized even without 3D labels. In addition, we introduce an inverse query algorithm to freely manipulate any specified 3D object shape in the learned scene representation. Notably, our manipulation algorithm can explicitly tackle key issues such as object collisions and visual occlusions. Our method, called DM-NeRF, is among the first to simultaneously reconstruct, decompose, manipulate and render complex 3D scenes in a single pipeline. 
Extensive experiments on three datasets clearly show that our method can accurately decompose all 3D objects from 2D views, allowing any interested object to be freely manipulated in 3D space such as translation, rotation, size adjustment, and deformation.\n  - [HDR-Plenoxels: Self-Calibrating High Dynamic Range Radiance Fields, ECCV2022](https://arxiv.org/abs/2208.06787) | [code]\n    > We propose high dynamic range radiance (HDR) fields, HDR-Plenoxels, that learn a plenoptic function of 3D HDR radiance fields, geometry information, and varying camera settings inherent in 2D low dynamic range (LDR) images. Our voxel-based volume rendering pipeline reconstructs HDR radiance fields with only multi-view LDR images taken from varying camera settings in an end-to-end manner and has a fast convergence speed. To deal with various cameras in real-world scenarios, we introduce a tone mapping module that models the digital in-camera imaging pipeline (ISP) and disentangles radiometric settings. Our tone mapping module allows us to render by controlling the radiometric settings of each novel view. Finally, we build a multi-view dataset with varying camera conditions, which fits our problem setting. Our experiments show that HDR-Plenoxels can express detail and high-quality HDR novel views from only LDR images with various cameras.\n## Aug7 - Aug13, 2022\n  - [Progressive Multi-scale Light Field Networks, 3DV2022](https://arxiv.org/abs/2208.06710) | [code]\n    > Neural representations have shown great promise in their ability to represent radiance and light fields while being very compact compared to the image set representation. However, current representations are not well suited for streaming as decoding can only be done at a single level of detail and requires downloading the entire neural network model. Furthermore, high-resolution light field networks can exhibit flickering and aliasing as neural networks are sampled without appropriate filtering. 
To resolve these issues, we present a progressive multi-scale light field network that encodes a light field with multiple levels of detail. Lower levels of detail are encoded using fewer neural network weights enabling progressive streaming and reducing rendering time. Our progressive multi-scale light field network addresses aliasing by encoding smaller anti-aliased representations at its lower levels of detail. Additionally, per-pixel level of detail enables our representation to support dithered transitions and foveated rendering.\n  - [OmniVoxel: A Fast and Precise Reconstruction Method of Omnidirectional Neural Radiance Field, GCCE 2022](https://arxiv.org/abs/2208.06335) | [code]\n    > This paper proposes a method to reconstruct the neural radiance field with equirectangular omnidirectional images. Implicit neural scene representation with a radiance field can reconstruct the 3D shape of a scene continuously within a limited spatial area. However, training a fully implicit representation on commercial PC hardware requires a lot of time and computing resources (15 ∼ 20 hours per scene). Therefore, we propose a method to accelerate this process significantly (20 ∼ 40 minutes per scene). Instead of using a fully implicit representation of rays for radiance field reconstruction, we adopt feature voxels that contain density and color features in tensors. Considering omnidirectional equirectangular input and the camera layout, we use spherical voxelization for representation instead of cubic representation. Our voxelization method could balance the reconstruction quality of the inner scene and outer scene. In addition, we adopt the axis-aligned positional encoding method on the color features to increase the total image quality. Our method achieves satisfying empirical performance on synthetic datasets with random camera poses. Moreover, we test our method with real scenes which contain complex geometries and also achieve state-of-the-art performance. 
Our code and complete dataset will be released at the same time as the paper publication.\n  - [HyperTime: Implicit Neural Representation for Time Series](https://arxiv.org/abs/2208.05836) | [code]\n    > Implicit neural representations (INRs) have recently emerged as a powerful tool that provides an accurate and resolution-independent encoding of data. Their robustness as general approximators has been shown in a wide variety of data sources, with applications on image, sound, and 3D scene representation. However, little attention has been given to leveraging these architectures for the representation and analysis of time series data. In this paper, we analyze the representation of time series using INRs, comparing different activation functions in terms of reconstruction accuracy and training convergence speed. We show how these networks can be leveraged for the imputation of time series, with applications on both univariate and multivariate data. Finally, we propose a hypernetwork architecture that leverages INRs to learn a compressed latent representation of an entire time series dataset. We introduce an FFT-based loss to guide training so that all frequencies are preserved in the time series. We show that this network can be used to encode time series as INRs, and their embeddings can be interpolated to generate new time series from existing ones. We evaluate our generative method by using it for data augmentation, and show that it is competitive against current state-of-the-art approaches for augmentation of time series.\n  - [RelPose: Predicting Probabilistic Relative Rotation for Single Objects in the Wild, ECCV2022](https://jasonyzhang.com/relpose/) | [***``[code]``***](https://github.com/jasonyzhang/relpose)\n    > We describe a data-driven method for inferring the camera viewpoints given multiple images of an arbitrary object. 
This task is a core component of classic geometric pipelines such as SfM and SLAM, and also serves as a vital pre-processing requirement for contemporary neural approaches (e.g. NeRF) to object reconstruction and view synthesis. In contrast to existing correspondence-driven methods that do not perform well given sparse views, we propose a top-down prediction based approach for estimating camera viewpoints. Our key technical insight is the use of an energy-based formulation for representing distributions over relative camera rotations, thus allowing us to explicitly represent multiple camera modes arising from object symmetries or views. Leveraging these relative predictions, we jointly estimate a consistent set of camera rotations from multiple images. We show that our approach outperforms state-of-the-art SfM and SLAM methods given sparse images on both seen and unseen categories. Further, our probabilistic approach significantly outperforms directly regressing relative poses, suggesting that modeling multimodality is important for coherent joint reconstruction. We demonstrate that our system can be a stepping stone toward in-the-wild reconstruction from multi-view datasets. The project page with code and videos can be found at this https URL.\n  - [NIDN: Neural Inverse Design of Nanostructures](https://arxiv.org/abs/2208.05480) | [code]\n    > In the recent decade, computational tools have become central in material design, allowing rapid development cycles at reduced costs. Machine learning tools are especially on the rise in photonics. However, the inversion of the Maxwell equations needed for the design is particularly challenging from an optimization standpoint, requiring sophisticated software. We present an innovative, open-source software tool called Neural Inverse Design of Nanostructures (NIDN) that allows designing complex, stacked material nanostructures using a physics-based deep learning approach. 
Instead of a derivative-free or data-driven optimization or learning method, we perform a gradient-based neural network training where we directly optimize the material and its structure based on its spectral characteristics. NIDN supports two different solvers, rigorous coupled-wave analysis and a finite-difference time-domain method. The utility and validity of NIDN are demonstrated on several synthetic examples as well as the design of a 1550 nm filter and anti-reflection coating. Results match experimental baselines, other simulation tools, and the desired spectral characteristics. Given its full modularity in regard to network architectures and Maxwell solvers as well as open-source, permissive availability, NIDN will be able to support computational material design processes in a broad range of applications.\n  - [HRF-Net: Holistic Radiance Fields from Sparse Inputs](https://arxiv.org/abs/2208.04717) | [code]\n    > We present HRF-Net, a novel view synthesis method based on holistic radiance fields that renders novel views using a set of sparse inputs. Recent generalizing view synthesis methods also leverage the radiance fields but the rendering speed is not real-time. There are existing methods that can train and render novel views efficiently but they can not generalize to unseen scenes. Our approach addresses the problem of real-time rendering for generalizing view synthesis and consists of two main stages: a holistic radiance fields predictor and a convolutional-based neural renderer. This architecture infers not only consistent scene geometry based on the implicit neural fields but also renders new views efficiently using a single GPU. We first train HRF-Net on multiple 3D scenes of the DTU dataset and the network can produce plausible novel views on unseen real and synthetic data using only photometric losses. 
Moreover, our method can leverage a denser set of reference images of a single scene to produce accurate novel views without relying on additional explicit representations and still maintains the high-speed rendering of the pre-trained model. Experimental results show that HRF-Net outperforms state-of-the-art generalizable neural rendering methods on various synthetic and real datasets.\n  - [Monte Carlo Denoising Using Implicit Neural Representation](https://oaktrust.library.tamu.edu/handle/1969.1/196567) | [code]\n    > Monte Carlo path tracing is a popular 3D rendering technique in computer graphics, but it often requires a costly tradeoff between the amount of noise in the image and computation time. Therefore, it is useful to attempt to “smooth out” a noisy image, typically by constructing new data between the samples or applying filters to the image. In this work, we investigate the feasibility of training a neural network to implicitly represent the radiance of a fixed-viewpoint scene as a continuous function. We implement the neural network using a multilayer perceptron network and train it on a sparsely sampled image that is generated by an offline Monte Carlo renderer. This training data uses the (x, y) coordinate of each sample on the image plane as inputs and the RGB color of the sample as outputs. Additionally, we provide the network with the surface normal, depth, and albedo of the first ray intersection as extra inputs alongside the pixel coordinates. These extra input dimensions improve the quality of the implicit representation by helping the network account for changes in depth, normal, and diffuse color. Once the network is trained on the sparsely sampled scene, we can densely sample the network many times per pixel to create the final denoised image. 
We find that this network can quickly learn and denoise images in scenes with soft lighting and glossy reflections, and it can easily handle discontinuities in depth, normal, and diffuse color with just a small amount of training.\n  - [Fast Gradient Descent for Surface Capture Via Differentiable Rendering, 3DV2022](https://hal.inria.fr/hal-03748662/) | [code]\n    > Differential rendering has recently emerged as a powerful tool for image-based rendering or geometric reconstruction from multiple views, with very high quality. Up to now, such methods have been benchmarked on generic object databases and promisingly applied to some real data, but have yet to be applied to specific applications that may benefit. In this paper, we investigate how a differential rendering system can be crafted for raw multi-camera performance capture. We address several key issues in the way of practical usability and reproducibility, such as processing speed, explainability of the model, and general output model quality. This leads us to several contributions to the differential rendering framework. In particular we show that a unified view of differential rendering and classic optimization is possible, leading to a formulation and implementation where complete non-stochastic gradient steps can be analytically computed and the full per-frame data stored in video memory, yielding a straightforward and efficient implementation. We also use a sparse storage and coarse-to-fine scheme to achieve extremely high resolution with contained memory and computation time. We show experimentally that results rivaling in quality with state of the art multi-view human surface capture methods are achievable in a fraction of the time, typically around a minute per frame.\n  - [PlaneFormers: From Sparse View Planes to 3D Reconstruction, ECCV2022](https://arxiv.org/abs/2208.04307) | [code]\n    > We present an approach for the planar surface reconstruction of a scene from images with limited overlap. 
This reconstruction task is challenging since it requires jointly reasoning about single image 3D reconstruction, correspondence between images, and the relative camera pose between images. Past work has proposed optimization-based approaches. We introduce a simpler approach, the PlaneFormer, that uses a transformer applied to 3D-aware plane tokens to perform 3D reasoning. Our experiments show that our approach is substantially more effective than prior work, and that several 3D-specific design decisions are crucial for its success.\n  - [PS-NeRV: Patch-wise Stylized Neural Representations for Videos](https://arxiv.org/abs/2208.03742) | [code]\n    > We study how to represent a video with implicit neural representations (INRs). Classical INRs methods generally utilize MLPs to map input coordinates to output pixels. While some recent works have tried to directly reconstruct the whole image with CNNs. However, we argue that both the above pixel-wise and image-wise strategies are not favorable to video data. Instead, we propose a patch-wise solution, PS-NeRV, which represents videos as a function of patches and the corresponding patch coordinate. It naturally inherits the advantages of image-wise methods, and achieves excellent reconstruction performance with fast decoding speed. The whole method includes conventional modules, like positional embedding, MLPs and CNNs, while also introduces AdaIN to enhance intermediate features. These simple yet essential changes could help the network easily fit high-frequency details. 
Extensive experiments have demonstrated its effectiveness in several video-related tasks, such as video compression and video inpainting.\n## Jul31 - Aug6, 2022\n  - [NFOMP: Neural Field for Optimal Motion Planner of Differential Drive Robots With Nonholonomic Constraints, IEEE Robotics and Automation Letters](https://ieeexplore.ieee.org/abstract/document/9851532/) | [code]\n    > Optimal motion planning is one of the most critical problems in mobile robotics. On the one hand, classical sampling-based methods propose asymptotically optimal solutions to this problem. However, these planners cannot achieve smooth and short trajectories in reasonable calculation time. On the other hand, optimization-based methods are able to generate smooth and plain trajectories in a variety of scenarios, including a dense human crowd. However, modern optimization-based methods use the precomputed signed distance function for collision loss estimation, and it limits the application of these methods for general configuration spaces, including a differential drive non-circular robot with non-holonomic constraints. Moreover, optimization-based methods lack the ability to handle U-shaped or thin obstacles accurately. We propose to improve the optimization methods in two aspects. Firstly, we developed an obstacle neural field model to estimate collision loss; training this model together with trajectory optimization allows improving collision loss continuously, while achieving more feasible and smoother trajectories. Secondly, we forced the trajectory to consider non-holonomic constraints by adding Lagrange multipliers to the trajectory loss function. 
We applied our method for solving the optimal motion planning problem for differential drive robots with non-holonomic constraints, benchmarked our solution, and proved that the novel planner generates smooth, short, and plain trajectories perfectly suitable for a robot to follow, and outperforms the state-of-the-art approaches by 25% on normalized curvature and by 75% on the number of cusps in the MovingAI environment.\n  - [NeSF: Neural Semantic Fields for Generalizable Semantic Segmentation of 3D Scenes](https://research.google/pubs/pub51563/) | [code]\n    > We present NeSF, a method for producing 3D semantic fields from pre-trained density fields and sparse 2D semantic supervision. Our method side-steps traditional scene representations by leveraging neural representations where 3D information is stored within neural fields. In spite of being supervised by 2D signals alone, our method is able to generate 3D-consistent semantic maps from novel camera poses and can be queried at arbitrary 3D points. Notably, NeSF is compatible with any method producing a density field, and its accuracy improves as the quality of the pre-trained density fields improve. Our empirical analysis demonstrates comparable quality to competitive 2D and 3D semantic segmentation baselines on convincing synthetic scenes while also offering features unavailable to existing methods.\n  - [PRIF: Primary Ray-based Implicit Function](https://research.google/pubs/pub51556/) | [code]\n    > We introduce a new implicit shape representation called Primary Ray-based Implicit Function (PRIF). In contrast to most existing approaches based on the signed distance function (SDF) which handles spatial locations, our representation operates on oriented rays. Specifically, PRIF is formulated to directly produce the surface hit point of a given input ray, without the expensive sphere-tracing operations, hence enabling efficient shape extraction and differentiable rendering. 
We demonstrate that neural networks trained to encode PRIF achieve successes in various tasks including single shape representation, category-wise shape generation, shape completion from sparse or noisy observations, inverse rendering for camera pose estimation, and neural rendering with color.\n  - [Transformers as Meta-Learners for Implicit Neural Representations, ECCV2022](https://arxiv.org/abs/2208.02801) | [***``[code]``***](https://yinboc.github.io/trans-inr/)\n    > Implicit Neural Representations (INRs) have emerged and shown their benefits over discrete representations in recent years. However, fitting an INR to the given observations usually requires optimization with gradient descent from scratch, which is inefficient and does not generalize well with sparse observations. To address this problem, most of the prior works train a hypernetwork that generates a single vector to modulate the INR weights, where the single vector becomes an information bottleneck that limits the reconstruction precision of the output INR. Recent work shows that the whole set of weights in INR can be precisely inferred without the single-vector bottleneck by gradient-based meta-learning. Motivated by a generalized formulation of gradient-based meta-learning, we propose a formulation that uses Transformers as hypernetworks for INRs, where it can directly build the whole set of INR weights with Transformers specialized as set-to-set mapping. We demonstrate the effectiveness of our method for building INRs in different tasks and domains, including 2D image regression and view synthesis for 3D objects. 
Our work draws connections between the Transformer hypernetworks and gradient-based meta-learning algorithms and we provide further analysis for understanding the generated INRs.\n  - [End-to-end learning of 3D phase-only holograms for holographic display](https://www.nature.com/articles/s41377-022-00894-6) | [code]\n    > Computer-generated holography (CGH) provides volumetric control of coherent wavefront and is fundamental to applications such as volumetric 3D displays, lithography, neural photostimulation, and optical/acoustic trapping. Recently, deep learning-based methods emerged as promising computational paradigms for CGH synthesis that overcome the quality-runtime tradeoff in conventional simulation/optimization-based methods. Yet, the quality of the predicted hologram is intrinsically bounded by the dataset’s quality. Here we introduce a new hologram dataset, MIT-CGH-4K-V2, that uses a layered depth image as a data-efficient volumetric 3D input and a two-stage supervised+unsupervised training protocol for direct synthesis of high-quality 3D phase-only holograms. The proposed system also corrects vision aberration, allowing customization for end-users. We experimentally show photorealistic 3D holographic projections and discuss relevant spatial light modulator calibration procedures. Our method runs in real-time on a consumer GPU and 5 FPS on an iPhone 13 Pro, promising drastically enhanced performance for the applications above.\n  - [VolTeMorph: Realtime, Controllable and Generalisable Animation of Volumetric Representations](https://arxiv.org/pdf/2208.00949) | [code]\n    > The recent increase in popularity of volumetric representations for scene reconstruction and novel view synthesis has put renewed focus on animating volumetric content at high visual quality and in real-time. 
While implicit deformation methods based on learned functions can produce impressive results, they are 'black boxes' to artists and content creators, they require large amounts of training data to generalise meaningfully, and they do not produce realistic extrapolations outside the training data. In this work we solve these issues by introducing a volume deformation method which is real-time, easy to edit with off-the-shelf software and can extrapolate convincingly. To demonstrate the versatility of our method, we apply it in two scenarios: physics-based object deformation and telepresence where avatars are controlled using blendshapes. We also perform thorough experiments showing that our method compares favourably to both volumetric approaches combined with implicit deformation and methods based on mesh deformation.\n  - [Controllable Free Viewpoint Video Reconstruction Based on Neural Radiance Fields and Motion Graphs, IEEE Transactions on Visualization and Computer Graphics](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > In this paper, we propose a controllable high-quality free viewpoint video generation method based on the motion graph and neural radiance fields (NeRF). Different from existing pose-driven NeRF or time/structure conditioned NeRF works, we propose to first construct a directed motion graph of the captured sequence. Such a sequence-motion-parameterization strategy not only enables flexible pose control for free viewpoint video rendering but also avoids redundant calculation of similar poses and thus improves the overall reconstruction efficiency. Moreover, to support body shape control without losing the realistic free viewpoint rendering performance, we improve the vanilla NeRF by combining explicit surface deformation and implicit neural scene representations. 
Specifically, we train a local surface-guided NeRF for each valid frame on the motion graph, and the volumetric rendering was only performed in the local space around the real surface, thus enabling plausible shape control ability. As far as we know, our method is the first method that supports both realistic free viewpoint video reconstruction and motion graph-based user-guided motion traversal. The results and comparisons further demonstrate the effectiveness of the proposed method.\n  - [Robust Change Detection Based on Neural Descriptor Fields, IROS2022](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > The ability to reason about changes in the environment is crucial for robots operating over extended periods of time. Agents are expected to capture changes during operation so that actions can be followed to ensure a smooth progression of the working session. However, varying viewing angles and accumulated localization errors make it easy for robots to falsely detect changes in the surrounding world due to low observation overlap and drifted object associations. In this paper, based on the recently proposed category-level Neural Descriptor Fields (NDFs), we develop an object-level online change detection approach that is robust to partially overlapping observations and noisy localization results. Utilizing the shape completion capability and SE(3)-equivariance of NDFs, we represent objects with compact shape codes encoding full object shapes from partial observations. The objects are then organized in a spatial tree structure based on object centers recovered from NDFs for fast queries of object neighborhoods. By associating objects via shape code similarity and comparing local object-neighbor spatial layout, our proposed approach demonstrates robustness to low observation overlap and localization noises. 
We conduct experiments on both synthetic and real-world sequences and achieve improved change detection results compared to multiple baseline methods. Project webpage: this https URL\n## Jul24 - Jul30, 2022\n  - [DoF-NeRF: Depth-of-Field Meets Neural Radiance Fields, ACMMM2022](https://arxiv.org/pdf/2208.00945) | [***``[code]``***](https://github.com/zijinwuzijin/DoF-NeRF)\n    > Neural Radiance Field (NeRF) and its variants have exhibited great success on representing 3D scenes and synthesizing photo-realistic novel views. However, they are generally based on the pinhole camera model and assume all-in-focus inputs. This limits their applicability as images captured from the real world often have finite depth-of-field (DoF). To mitigate this issue, we introduce DoF-NeRF, a novel neural rendering approach that can deal with shallow DoF inputs and can simulate DoF effect. In particular, it extends NeRF to simulate the aperture of lens following the principles of geometric optics. Such a physical guarantee allows DoF-NeRF to operate views with different focus configurations. Benefiting from explicit aperture modeling, DoF-NeRF also enables direct manipulation of DoF effect by adjusting virtual aperture and focus parameters. It is plug-and-play and can be inserted into NeRF-based frameworks. Experiments on synthetic and real-world datasets show that, DoF-NeRF not only performs comparably with NeRF in the all-in-focus setting, but also can synthesize all-in-focus novel views conditioned on shallow DoF inputs. An interesting application of DoF-NeRF to DoF rendering is also demonstrated.\n  - [ZEPI-Net: Light Field Super Resolution via Internal Cross-Scale Epipolar Plane Image Zero-Shot Learning, Neural Processing Letters (2022)](https://link.springer.com/article/10.1007/s11063-022-10955-x) | [code]\n    > Many applications of light field (LF) imaging have been limited by the spatial-angular resolution problem, hence the need for efficient super-resolution techniques. 
Recently, learning-based solutions have achieved remarkably better performances than traditional super-resolution (SR) techniques. Unfortunately, the learning or training process relies heavily on the training dataset, which could be limited for most LF imaging applications. In this paper, we propose a novel LF spatial-angular SR algorithm based on zero-shot learning. We suggest learning cross-scale reusable features in the epipolar plane image (EPI) space, and avoiding explicitly modeling scene priors or implicitly learning that from a large number of LFs. Most importantly, without using any external LFs, the proposed algorithm can simultaneously super-resolve a LF in both spatial and angular domains. Moreover, the proposed solution is free of depth or disparity estimation, which is usually employed by existing LF spatial and angular SR. By using a simple 8-layers fully convolutional network, we show that the proposed algorithm can generate comparable results to the state-of-the-art spatial SR. Our algorithm outperforms the existing methods in terms of angular SR on multiple groups of public LF datasets. The experiment results indicate that the cross-scale features can be well learned and be reused for LF SR in the EPI space.\n  - [ObjectFusion: Accurate object-level SLAM with neural object priors, Graphical Models, Volume 123, September 2022](https://www.sciencedirect.com/science/article/pii/S1524070322000418) | [code]\n    > Previous object-level Simultaneous Localization and Mapping (SLAM) approaches still fail to create high quality object-oriented 3D map in an efficient way. The main challenges come from how to represent the object shape effectively and how to apply such object representation to accurate online camera tracking efficiently. In this paper, we provide ObjectFusion as a novel object-level SLAM in static scenes which efficiently creates object-oriented 3D map with high-quality object reconstruction, by leveraging neural object priors. 
We propose a neural object representation with only a single encoder–decoder network to effectively express the object shape across various categories, which benefits high quality reconstruction of object instance. More importantly, we propose to convert such neural object representation as precise measurements to jointly optimize the object shape, object pose and camera pose for the final accurate 3D object reconstruction. With extensive evaluations on synthetic and real-world RGB-D datasets, we show that our ObjectFusion outperforms previous approaches, with better object reconstruction quality, using much less memory footprint, and in a more efficient way, especially at the object level.\n  - [MobileNeRF: Exploiting the Polygon Rasterization Pipeline for Efficient Neural Field Rendering on Mobile Architectures](https://arxiv.org/abs/2208.00277) | [***``[code]``***](https://github.com/google-research/jax3d/tree/main/jax3d/projects/mobilenerf)\n    > Neural Radiance Fields (NeRFs) have demonstrated amazing ability to synthesize images of 3D scenes from novel views. However, they rely upon specialized volumetric rendering algorithms based on ray marching that are mismatched to the capabilities of widely deployed graphics hardware. This paper introduces a new NeRF representation based on textured polygons that can synthesize novel images efficiently with standard rendering pipelines. The NeRF is represented as a set of polygons with textures representing binary opacities and feature vectors. Traditional rendering of the polygons with a z-buffer yields an image with features at every pixel, which are interpreted by a small, view-dependent MLP running in a fragment shader to produce a final pixel color. 
This approach enables NeRFs to be rendered with the traditional polygon rasterization pipeline, which provides massive pixel-level parallelism, achieving interactive frame rates on a wide range of compute platforms, including mobile phones.\n  - [Neural Density-Distance Fields, ECCV2022](https://arxiv.org/abs/2207.14455) | [***``[code]``***](https://ueda0319.github.io/neddf/)\n    > The success of neural fields for 3D vision tasks is now indisputable. Following this trend, several methods aiming for visual localization (e.g., SLAM) have been proposed to estimate distance or density fields using neural fields. However, it is difficult to achieve high localization performance by only density fields-based methods such as Neural Radiance Field (NeRF) since they do not provide density gradient in most empty regions. On the other hand, distance field-based methods such as Neural Implicit Surface (NeuS) have limitations in objects' surface shapes. This paper proposes Neural Density-Distance Field (NeDDF), a novel 3D representation that reciprocally constrains the distance and density fields. We extend distance field formulation to shapes with no explicit boundary surface, such as fur or smoke, which enable explicit conversion from distance field to density field. Consistent distance and density fields realized by explicit conversion enable both robustness to initial values and high-quality registration. Furthermore, the consistency between fields allows fast convergence from sparse point clouds. Experiments show that NeDDF can achieve high localization performance while providing comparable results to NeRF on novel view synthesis. The code is available at this https URL.\n  - [End-to-end View Synthesis via NeRF Attention](https://arxiv.org/abs/2207.14741) | [code]\n    > In this paper, we present a simple seq2seq formulation for view synthesis where we take a set of ray points as input and output colors corresponding to the rays. 
Directly applying a standard transformer on this seq2seq formulation has two limitations. First, the standard attention cannot successfully fit the volumetric rendering procedure, and therefore high-frequency components are missing in the synthesized views. Second, applying global attention to all rays and pixels is extremely inefficient. Inspired by the neural radiance field (NeRF), we propose the NeRF attention (NeRFA) to address the above problems. On the one hand, NeRFA considers the volumetric rendering equation as a soft feature modulation procedure. In this way, the feature modulation enhances the transformers with the NeRF-like inductive bias. On the other hand, NeRFA performs multi-stage attention to reduce the computational overhead. Furthermore, the NeRFA model adopts the ray and pixel transformers to learn the interactions between rays and pixels. NeRFA demonstrates superior performance over NeRF and NerFormer on four datasets: DeepVoxels, Blender, LLFF, and CO3D. Besides, NeRFA establishes a new state-of-the-art under two settings: the single-scene view synthesis and the category-centric novel view synthesis. The code will be made publicly available.\n  - [Going Off-Grid: Continuous Implicit Neural Representations for 3D Vascular Modeling, MICCAI STACOM 2022](https://arxiv.org/abs/2207.14663) | [code]\n    > Personalised 3D vascular models are valuable for diagnosis, prognosis and treatment planning in patients with cardiovascular disease. Traditionally, such models have been constructed with explicit representations such as meshes and voxel masks, or implicit representations such as radial basis functions or atomic (tubular) shapes. Here, we propose to represent surfaces by the zero level set of their signed distance function (SDF) in a differentiable implicit neural representation (INR). 
This allows us to model complex vascular structures with a representation that is implicit, continuous, light-weight, and easy to integrate with deep learning algorithms. We here demonstrate the potential of this approach with three practical examples. First, we obtain an accurate and watertight surface for an abdominal aortic aneurysm (AAA) from CT images and show robust fitting from as little as 200 points on the surface. Second, we simultaneously fit nested vessel walls in a single INR without intersections. Third, we show how 3D models of individual arteries can be smoothly blended into a single watertight surface. Our results show that INRs are a flexible representation with potential for minimally interactive annotation and manipulation of complex vascular structures.\n  - [Neural Strands: Learning Hair Geometry and Appearance from Multi-View Images, ECCV2022](https://arxiv.org/pdf/2207.14067) | [***``[code]``***](https://radualexandru.github.io/neural_strands/)\n    > We present Neural Strands, a novel learning framework for modeling accurate hair geometry and appearance from multi-view image inputs. The learned hair model can be rendered in real-time from any viewpoint with high-fidelity view-dependent effects. Our model achieves intuitive shape and style control unlike volumetric counterparts. To enable these properties, we propose a novel hair representation based on a neural scalp texture that encodes the geometry and appearance of individual strands at each texel location. Furthermore, we introduce a novel neural rendering framework based on rasterization of the learned hair strands. Our neural rendering is strand-accurate and anti-aliased, making the rendering view-consistent and photorealistic. Combining appearance with a multi-view geometric prior, we enable, for the first time, the joint learning of appearance and explicit hair geometry from a multi-view setup. 
We demonstrate the efficacy of our approach in terms of fidelity and efficiency for various hairstyles.\n  - [Neural Radiance Transfer Fields for Relightable Novel-view Synthesis with Global Illumination](https://arxiv.org/abs/2207.13607) | [code]\n    > Given a set of images of a scene, the re-rendering of this scene from novel views and lighting conditions is an important and challenging problem in Computer Vision and Graphics. On the one hand, most existing works in Computer Vision usually impose many assumptions regarding the image formation process, e.g. direct illumination and predefined materials, to make scene parameter estimation tractable. On the other hand, mature Computer Graphics tools allow modeling of complex photo-realistic light transport given all the scene parameters. Combining these approaches, we propose a method for scene relighting under novel views by learning a neural precomputed radiance transfer function, which implicitly handles global illumination effects using novel environment maps. Our method can be solely supervised on a set of real images of the scene under a single unknown lighting condition. To disambiguate the task during training, we tightly integrate a differentiable path tracer in the training process and propose a combination of a synthesized OLAT and a real image loss. Results show that the recovered disentanglement of scene parameters improves significantly over the current state of the art and, thus, also our re-rendering results are more realistic and accurate.\n  - [ShAPO: Implicit Representations for Multi-Object Shape, Appearance, and Pose Optimization, ECCV2022](https://arxiv.org/abs/2207.13691) | [***``[code]``***](https://zubair-irshad.github.io/projects/ShAPO.html)\n    > Our method studies the complex task of object-centric 3D understanding from a single RGB-D observation. 
As it is an ill-posed problem, existing methods suffer from low performance for both 3D shape and 6D pose and size estimation in complex multi-object scenarios with occlusions. We present ShAPO, a method for joint multi-object detection, 3D textured reconstruction, 6D object pose and size estimation. Key to ShAPO is a single-shot pipeline to regress shape, appearance and pose latent codes along with the masks of each object instance, which is then further refined in a sparse-to-dense fashion. A novel disentangled shape and appearance database of priors is first learned to embed objects in their respective shape and appearance space. We also propose a novel, octree-based differentiable optimization step, allowing us to further improve object shape, pose and appearance simultaneously under the learned latent space, in an analysis-by-synthesis fashion. Our novel joint implicit textured object representation allows us to accurately identify and reconstruct novel unseen objects without having access to their 3D meshes. Through extensive experiments, we show that our method, trained on simulated indoor scenes, accurately regresses the shape, appearance and pose of novel objects in the real-world with minimal fine-tuning. Our method significantly out-performs all baselines on the NOCS dataset with an 8% absolute improvement in mAP for 6D pose estimation.\n  - [GAUDI: A Neural Architect for Immersive 3D Scene Generation](https://arxiv.org/abs/2207.13751) | [***``[code]``***](https://github.com/apple/ml-gaudi)\n    > We introduce GAUDI, a generative model capable of capturing the distribution of complex and realistic 3D scenes that can be rendered immersively from a moving camera. We tackle this challenging problem with a scalable yet powerful approach, where we first optimize a latent representation that disentangles radiance fields and camera poses. 
This latent representation is then used to learn a generative model that enables both unconditional and conditional generation of 3D scenes. Our model generalizes previous works that focus on single objects by removing the assumption that the camera pose distribution can be shared across samples. We show that GAUDI obtains state-of-the-art performance in the unconditional generative setting across multiple datasets and allows for conditional generation of 3D scenes given conditioning variables like sparse image observations or text that describes the scene.\n  - [AlignSDF: Pose-Aligned Signed Distance Fields for Hand-Object Reconstruction, ECCV2022](https://arxiv.org/abs/2207.12909) | [***``[code]``***](https://zerchen.github.io/projects/alignsdf.html)\n    > Recent work achieved impressive progress towards joint reconstruction of hands and manipulated objects from monocular color images. Existing methods focus on two alternative representations in terms of either parametric meshes or signed distance fields (SDFs). On one side, parametric models can benefit from prior knowledge at the cost of limited shape deformations and mesh resolutions. Mesh models, hence, may fail to precisely reconstruct details such as contact surfaces of hands and objects. SDF-based methods, on the other side, can represent arbitrary details but are lacking explicit priors. In this work we aim to improve SDF models using priors provided by parametric representations. In particular, we propose a joint learning framework that disentangles the pose and the shape. We obtain hand and object poses from parametric models and use them to align SDFs in 3D space. We show that such aligned SDFs better focus on reconstructing shape details and improve reconstruction accuracy both for hands and objects. 
We evaluate our method and demonstrate significant improvements over the state of the art on the challenging ObMan and DexYCB benchmarks.\n  - [Neural Green’s function for Laplacian systems, Computers & Graphics](https://www.sciencedirect.com/science/article/pii/S0097849322001406) | [code]\n    > Solving linear system of equations stemming from Laplacian operators is at the heart of a wide range of applications. Due to the sparsity of the linear systems, iterative solvers such as Conjugate Gradient and Multigrid are usually employed when the solution has a large number of degrees of freedom. These iterative solvers can be seen as sparse approximations of the Green’s function for the Laplacian operator. In this paper we propose a machine learning approach that regresses a Green’s function from boundary conditions. This is enabled by a Green’s function that can be effectively represented in a multi-scale fashion, drastically reducing the cost associated with a dense matrix representation. Additionally, since the Green’s function is solely dependent on boundary conditions, training the proposed neural network does not require sampling the right-hand side of the linear system. We show results that our method outperforms state of the art Conjugate Gradient and Multigrid methods.\n  - [Deforming Radiance Fields with Cages, ECCV2022](https://arxiv.org/abs/2207.12298) | [code]\n    > Recent advances in radiance fields enable photorealistic rendering of static or dynamic 3D scenes, but still do not support explicit deformation that is used for scene manipulation or animation. In this paper, we propose a method that enables a new type of deformation of the radiance field: free-form radiance field deformation. We use a triangular mesh that encloses the foreground object called cage as an interface, and by manipulating the cage vertices, our approach enables the free-form deformation of the radiance field. 
The core of our approach is cage-based deformation which is commonly used in mesh deformation. We propose a novel formulation to extend it to the radiance field, which maps the position and the view direction of the sampling points from the deformed space to the canonical space, thus enabling the rendering of the deformed scene. The deformation results of the synthetic datasets and the real-world datasets demonstrate the effectiveness of our approach.\n  - [NeuMesh: Learning Disentangled Neural Mesh-based Implicit Field for Geometry and Texture Editing, ECCV2022(oral)](https://arxiv.org/abs/2207.11911) | [code]\n    > Very recently neural implicit rendering techniques have been rapidly evolved and shown great advantages in novel view synthesis and 3D scene reconstruction. However, existing neural rendering methods for editing purposes offer limited functionality, e.g., rigid transformation, or not applicable for fine-grained editing for general objects from daily lives. In this paper, we present a novel mesh-based representation by encoding the neural implicit field with disentangled geometry and texture codes on mesh vertices, which facilitates a set of editing functionalities, including mesh-guided geometry editing, designated texture editing with texture swapping, filling and painting operations. To this end, we develop several techniques including learnable sign indicators to magnify spatial distinguishability of mesh-based representation, distillation and fine-tuning mechanism to make a steady convergence, and the spatial-aware optimization strategy to realize precise texture editing. Extensive experiments and editing examples on both real and synthetic data demonstrate the superiority of our method on representation quality and editing ability. 
Code is available on the project webpage: this https URL.\n  - [On the Learnability of Physical Concepts: Can a Neural Network Understand What's Real?](https://arxiv.org/abs/2207.12186) | [code]\n    > We revisit the classic signal-to-symbol barrier in light of the remarkable ability of deep neural networks to generate realistic synthetic data. DeepFakes and spoofing highlight the feebleness of the link between physical reality and its abstract representation, whether learned by a digital computer or a biological agent. Starting from a widely applicable definition of abstract concept, we show that standard feed-forward architectures cannot capture but trivial concepts, regardless of the number of weights and the amount of training data, despite being extremely effective classifiers. On the other hand, architectures that incorporate recursion can represent a significantly larger class of concepts, but may still be unable to learn them from a finite dataset. We qualitatively describe the class of concepts that can be \"understood\" by modern architectures trained with variants of stochastic gradient descent, using a (free energy) Lagrangian to measure information complexity. Even if a concept has been understood, however, a network has no means of communicating its understanding to an external agent, except through continuous interaction and validation. We then characterize physical objects as abstract concepts and use the previous analysis to show that physical objects can be encoded by finite architectures. However, to understand physical concepts, sensors must provide persistently exciting observations, for which the ability to control the data acquisition process is essential (active perception). The importance of control depends on the modality, benefiting visual more than acoustic or chemical perception. 
Finally, we conclude that binding physical entities to digital identities is possible in finite time with finite resources, solving in principle the signal-to-symbol barrier problem, but we highlight the need for continuous validation.\n## Previous weeks\n  - [Plenoxels: Radiance Fields without Neural Networks, CVPR2022(oral)](https://arxiv.org/abs/2112.05131) | [***``[code]``***](https://alexyu.net/plenoxels)\n    > We introduce Plenoxels (plenoptic voxels), a system for photorealistic view synthesis. Plenoxels represent a scene as a sparse 3D grid with spherical harmonics. This representation can be optimized from calibrated images via gradient methods and regularization without any neural components. On standard, benchmark tasks, Plenoxels are optimized two orders of magnitude faster than Neural Radiance Fields with no loss in visual quality.\n  - [Urban Radiance Fields, CVPR2022](https://urban-radiance-fields.github.io/) | [code]\n    > The goal of this work is to perform 3D reconstruction and novel view synthesis from data captured by scanning platforms commonly deployed for world mapping in urban outdoor environments (e.g., Street View). Given a sequence of posed RGB images and lidar sweeps acquired by cameras and scanners moving through an outdoor scene, we produce a model from which 3D surfaces can be extracted and novel RGB images can be synthesized. Our approach extends Neural Radiance Fields, which has been demonstrated to synthesize realistic novel images for small scenes in controlled settings, with new methods for leveraging asynchronously captured lidar data, for addressing exposure variation between captured images, and for leveraging predicted image segmentations to supervise densities on rays pointing at the sky. Each of these three extensions provides significant performance improvements in experiments on Street View data. 
Our system produces state-of-the-art 3D surface reconstructions and synthesizes higher quality novel views in comparison to both traditional methods (e.g.~COLMAP) and recent neural representations (e.g.~Mip-NeRF).\n  - [NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis, ECCV2020](https://arxiv.org/abs/2003.08934) | [***``[code]``***](http://tancik.com/nerf)\n    > We present a method that achieves state-of-the-art results for synthesizing novel views of complex scenes by optimizing an underlying continuous volumetric scene function using a sparse set of input views. Our algorithm represents a scene using a fully-connected (non-convolutional) deep network, whose input is a single continuous 5D coordinate (spatial location (x,y,z) and viewing direction (θ,ϕ)) and whose output is the volume density and view-dependent emitted radiance at that spatial location. We synthesize views by querying 5D coordinates along camera rays and use classic volume rendering techniques to project the output colors and densities into an image. Because volume rendering is naturally differentiable, the only input required to optimize our representation is a set of images with known camera poses. We describe how to effectively optimize neural radiance fields to render photorealistic novel views of scenes with complicated geometry and appearance, and demonstrate results that outperform prior work on neural rendering and view synthesis. View synthesis results are best viewed as videos, so we urge readers to view our supplementary video for convincing comparisons.\n  - [Neural Sparse Voxel Fields, NeurIPS2020](https://lingjie0206.github.io/papers/NSVF/) | [***``[code]``***](https://github.com/facebookresearch/NSVF)\n    > We introduce Neural Sparse Voxel Fields (NSVF), a new neural scene representation for fast and high-quality free-viewpoint rendering. 
NSVF defines a set of voxel-bounded implicit fields organized in a sparse voxel octree to model local properties in each cell. We progressively learn the underlying voxel structures with a differentiable ray-marching operation from only a set of posed RGB images. With the sparse voxel octree structure, rendering novel views can be accelerated by skipping the voxels containing no relevant scene content. Our method is over 10 times faster than the state-of-the-art (namely, NeRF (Mildenhall et al., 2020)) at inference time while achieving higher quality results. Furthermore, by utilizing an explicit sparse voxel representation, our method can easily be applied to scene editing and scene composition. We also demonstrate several challenging tasks, including multi-scene learning, free-viewpoint rendering of a moving human, and large-scale scene rendering.\n  - [AutoInt: Automatic Integration for Fast Neural Volume Rendering, CVPR2021](http://www.computationalimaging.org/publications/automatic-integration/) | [***``[code]``***](https://github.com/computational-imaging/automatic-integration)\n    > Numerical integration is a foundational technique in scientific computing and is at the core of many computer vision applications. Among these applications, implicit neural volume rendering has recently been proposed as a new paradigm for view synthesis, achieving photorealistic image quality. However, a fundamental obstacle to making these methods practical is the extreme computational and memory requirements caused by the required volume integrations along the rendered rays during training and inference. Millions of rays, each requiring hundreds of forward passes through a neural network are needed to approximate those integrations with Monte Carlo sampling. Here, we propose automatic integration, a new framework for learning efficient, closed-form solutions to integrals using implicit neural representation networks. 
For training, we instantiate the computational graph corresponding to the derivative of the implicit neural representation. The graph is fitted to the signal to integrate. After optimization, we reassemble the graph to obtain a network that represents the antiderivative. By the fundamental theorem of calculus, this enables the calculation of any definite integral in two evaluations of the network. Using this approach, we demonstrate a greater than 10× improvement in computation requirements, enabling fast neural volume rendering.\n  - [DeRF: Decomposed Radiance Fields](https://arxiv.org/abs/2011.12490) | [code]\n    > With the advent of Neural Radiance Fields (NeRF), neural networks can now render novel views of a 3D scene with quality that fools the human eye. Yet, generating these images is very computationally intensive, limiting their applicability in practical scenarios. In this paper, we propose a technique based on spatial decomposition capable of mitigating this issue. Our key observation is that there are diminishing returns in employing larger (deeper and/or wider) networks. Hence, we propose to spatially decompose a scene and dedicate smaller networks for each decomposed part. When working together, these networks can render the whole scene. This allows us near-constant inference time regardless of the number of decomposed parts. Moreover, we show that a Voronoi spatial decomposition is preferable for this purpose, as it is provably compatible with the Painter's Algorithm for efficient and GPU-friendly rendering. 
Our experiments show that for real-world scenes, our method provides up to 3x more efficient inference than NeRF (with the same rendering quality), or an improvement of up to 1.0~dB in PSNR (for the same inference cost).\n  - [DONeRF: Towards Real-Time Rendering of Compact Neural Radiance Fields using Depth Oracle Networks, CGF2021](https://depthoraclenerf.github.io/) | [***``[code]``***](https://github.com/facebookresearch/DONERF)\n    > The recent research explosion around Neural Radiance Fields (NeRFs) shows that there is immense potential for implicitly storing scene and lighting information in neural networks, e.g., for novel view generation. However, one major limitation preventing the widespread use of NeRFs is the prohibitive computational cost of excessive network evaluations along each view ray, requiring dozens of petaFLOPS when aiming for real-time rendering on current devices. We show that the number of samples required for each view ray can be significantly reduced when local samples are placed around surfaces in the scene. To this end, we propose a depth oracle network, which predicts ray sample locations for each view ray with a single network evaluation. We show that using a classification network around logarithmically discretized and spherically warped depth values is essential to encode surface locations rather than directly estimating depth. The combination of these techniques leads to DONeRF, a dual network design with a depth oracle network as a first step and a locally sampled shading network for ray accumulation. With our design, we reduce the inference costs by up to 48x compared to NeRF. Using an off-the-shelf inference API in combination with simple compute kernels, we are the first to render raymarching-based neural representations at interactive frame rates (15 frames per second at 800x800) on a single GPU. 
At the same time, since we focus on the important parts of the scene around surfaces, we achieve equal or better quality compared to NeRF.\n  - [FastNeRF: High-Fidelity Neural Rendering at 200FPS, ICCV2021](https://arxiv.org/abs/2103.10380) | [code]\n    > Recent work on Neural Radiance Fields (NeRF) showed how neural networks can be used to encode complex 3D environments that can be rendered photorealistically from novel viewpoints. Rendering these images is very computationally demanding and recent improvements are still a long way from enabling interactive rates, even on high-end hardware. Motivated by scenarios on mobile and mixed reality devices, we propose FastNeRF, the first NeRF-based system capable of rendering high fidelity photorealistic images at 200Hz on a high-end consumer GPU. The core of our method is a graphics-inspired factorization that allows for (i) compactly caching a deep radiance map at each position in space, (ii) efficiently querying that map using ray directions to estimate the pixel values in the rendered image. Extensive experiments show that the proposed method is 3000 times faster than the original NeRF algorithm and at least an order of magnitude faster than existing work on accelerating NeRF, while maintaining visual quality and extensibility.\n  - [KiloNeRF: Speeding up Neural Radiance Fields with Thousands of Tiny MLPs, ICCV2021](https://arxiv.org/abs/2103.13744) | [***``[code]``***](https://github.com/creiser/kilonerf/)\n    > NeRF synthesizes novel views of a scene with unprecedented quality by fitting a neural radiance field to RGB images. However, NeRF requires querying a deep Multi-Layer Perceptron (MLP) millions of times, leading to slow rendering times, even on modern GPUs. In this paper, we demonstrate that real-time rendering is possible by utilizing thousands of tiny MLPs instead of one single large MLP. 
In our setting, each individual MLP only needs to represent parts of the scene, thus smaller and faster-to-evaluate MLPs can be used. By combining this divide-and-conquer strategy with further optimizations, rendering is accelerated by three orders of magnitude compared to the original NeRF model without incurring high storage costs. Further, using teacher-student distillation for training, we show that this speed-up can be achieved without sacrificing visual quality.\n  - [PlenOctrees for Real-time Rendering of Neural Radiance Fields, ICCV2021(oral)](https://alexyu.net/plenoctrees/) | [***``[code]``***](https://github.com/sxyu/volrend)\n    > Real-time performance is achieved by pre-tabulating the NeRF into an octree-based radiance field that we call PlenOctrees. In order to preserve view-dependent effects such as specularities, we propose to encode appearances via closed-form spherical basis functions. Specifically, we show that it is possible to train NeRFs to predict a spherical harmonic representation of radiance, removing the viewing direction as input to the neural network. Furthermore, we show that our PlenOctrees can be directly optimized to further minimize the reconstruction loss, which leads to equal or better quality than competing methods. We further show that this octree optimization step can be used to accelerate the training time, as we no longer need to wait for the NeRF training to converge fully. Our real-time neural rendering approach may potentially enable new applications such as 6-DOF industrial and product visualizations, as well as next generation AR/VR systems.\n  - [Mixture of Volumetric Primitives for Efficient Neural Rendering, SIGGRAPH2021](https://arxiv.org/abs/2103.01954) | [code]\n    > Real-time rendering and animation of humans is a core function in games, movies, and telepresence applications. Existing methods have a number of drawbacks we aim to address with our work. 
Triangle meshes have difficulty modeling thin structures like hair, volumetric representations like Neural Volumes are too low-resolution given a reasonable memory budget, and high-resolution implicit representations like Neural Radiance Fields are too slow for use in real-time applications. We present Mixture of Volumetric Primitives (MVP), a representation for rendering dynamic 3D content that combines the completeness of volumetric representations with the efficiency of primitive-based rendering, e.g., point-based or mesh-based methods. Our approach achieves this by leveraging spatially shared computation with a deconvolutional architecture and by minimizing computation in empty regions of space with volumetric primitives that can move to cover only occupied regions. Our parameterization supports the integration of correspondence and tracking constraints, while being robust to areas where classical tracking fails, such as around thin or translucent structures and areas with large topological variability. MVP is a hybrid that generalizes both volumetric and primitive-based representations. Through a series of extensive experiments we demonstrate that it inherits the strengths of each, while avoiding many of their limitations. We also compare our approach to several state-of-the-art methods and demonstrate that MVP produces superior results in terms of quality and runtime performance.\n  - [Light Field Networks: Neural Scene Representations with Single-Evaluation Rendering, NeurIPS2021(spotlight)](https://www.vincentsitzmann.com/lfns/) | [***``[code]``***](https://github.com/vsitzmann/light-field-networks)\n    > Inferring representations of 3D scenes from 2D observations is a fundamental problem of computer graphics, computer vision, and artificial intelligence. Emerging 3D-structured neural scene representations are a promising approach to 3D scene understanding. 
In this work, we propose a novel neural scene representation, Light Field Networks or LFNs, which represent both geometry and appearance of the underlying 3D scene in a 360-degree, four-dimensional light field parameterized via a neural implicit representation. Rendering a ray from an LFN requires only a *single* network evaluation, as opposed to hundreds of evaluations per ray for ray-marching or volumetric based renderers in 3D-structured neural scene representations. In the setting of simple scenes, we leverage meta-learning to learn a prior over LFNs that enables multi-view consistent light field reconstruction from as little as a single image observation. This results in dramatic reductions in time and memory complexity, and enables real-time rendering. The cost of storing a 360-degree light field via an LFN is two orders of magnitude lower than conventional methods such as the Lumigraph. Utilizing the analytical differentiability of neural implicit representations and a novel parameterization of light space, we further demonstrate the extraction of sparse depth maps from LFNs.\n  - [Depth-supervised NeRF: Fewer Views and Faster Training for Free, CVPR2022](https://arxiv.org/abs/2107.02791) | [***``[code]``***](https://github.com/dunbar12138/DSNeRF)\n    > A commonly observed failure mode of Neural Radiance Field (NeRF) is fitting incorrect geometries when given an insufficient number of input views. One potential reason is that standard volumetric rendering does not enforce the constraint that most of a scene's geometry consist of empty space and opaque surfaces. We formalize the above assumption through DS-NeRF (Depth-supervised Neural Radiance Fields), a loss for learning radiance fields that takes advantage of readily-available depth supervision. We leverage the fact that current NeRF pipelines require images with known camera poses that are typically estimated by running structure-from-motion (SFM). 
Crucially, SFM also produces sparse 3D points that can be used as \"free\" depth supervision during training: we add a loss to encourage the distribution of a ray's terminating depth to match a given 3D keypoint, incorporating depth uncertainty. DS-NeRF can render better images given fewer training views while training 2-3x faster. Further, we show that our loss is compatible with other recently proposed NeRF methods, demonstrating that depth is a cheap and easily digestible supervisory signal. And finally, we find that DS-NeRF can support other types of depth supervision such as scanned depth sensors and RGB-D reconstruction outputs.\n  - [Direct Voxel Grid Optimization: Super-fast Convergence for Radiance Fields Reconstruction, CVPR2022(oral)](https://arxiv.org/abs/2111.11215) | [***``[code]``***](https://github.com/sunset1995/DirectVoxGO)\n    > We present a super-fast convergence approach to reconstructing the per-scene radiance field from a set of images that capture the scene with known poses. This task, which is often applied to novel view synthesis, is recently revolutionized by Neural Radiance Field (NeRF) for its state-of-the-art quality and flexibility. However, NeRF and its variants require a lengthy training time ranging from hours to days for a single scene. In contrast, our approach achieves NeRF-comparable quality and converges rapidly from scratch in less than 15 minutes with a single GPU. We adopt a representation consisting of a density voxel grid for scene geometry and a feature voxel grid with a shallow network for complex view-dependent appearance. Modeling with explicit and discretized volume representations is not new, but we propose two simple yet non-trivial techniques that contribute to fast convergence speed and high-quality output. First, we introduce the post-activation interpolation on voxel density, which is capable of producing sharp surfaces in lower grid resolution. 
Second, direct voxel density optimization is prone to suboptimal geometry solutions, so we robustify the optimization process by imposing several priors. Finally, evaluation on five inward-facing benchmarks shows that our method matches, if not surpasses, NeRF's quality, yet it only takes about 15 minutes to train from scratch for a new scene.\n  - [NeRF in the Wild: Neural Radiance Fields for Unconstrained Photo Collections, CVPR2021](https://arxiv.org/abs/2008.02268) | [code]\n    > We present a learning-based method for synthesizing novel views of complex scenes using only unstructured collections of in-the-wild photographs. We build on Neural Radiance Fields (NeRF), which uses the weights of a multilayer perceptron to model the density and color of a scene as a function of 3D coordinates. While NeRF works well on images of static subjects captured under controlled settings, it is incapable of modeling many ubiquitous, real-world phenomena in uncontrolled images, such as variable illumination or transient occluders. We introduce a series of extensions to NeRF to address these issues, thereby enabling accurate reconstructions from unstructured image collections taken from the internet. We apply our system, dubbed NeRF-W, to internet photo collections of famous landmarks, and demonstrate temporally consistent novel view renderings that are significantly closer to photorealism than the prior state of the art.\n  - [Ha-NeRF: Hallucinated Neural Radiance Fields in the Wild, CVPR2022](https://rover-xingyu.github.io/Ha-NeRF/) | [***``[code]``***](https://github.com/rover-xingyu/Ha-NeRF)\n    > Neural Radiance Fields (NeRF) has recently gained popularity for its impressive novel view synthesis ability. This paper studies the problem of hallucinated NeRF: i.e., recovering a realistic NeRF at a different time of day from a group of tourism images. 
Existing solutions adopt NeRF with a controllable appearance embedding to render novel views under various conditions, but they cannot render view-consistent images with an unseen appearance. To solve this problem, we present an end-to-end framework for constructing a hallucinated NeRF, dubbed as Ha-NeRF. Specifically, we propose an appearance hallucination module to handle time-varying appearances and transfer them to novel views. Considering the complex occlusions of tourism images, we introduce an anti-occlusion module to decompose the static subjects for visibility accurately. Experimental results on synthetic data and real tourism photo collections demonstrate that our method can hallucinate the desired appearances and render occlusion-free images from different views.\n  - [Nerfies: Deformable Neural Radiance Fields, ICCV2021](https://arxiv.org/abs/2011.12948) | [code]\n    > We present the first method capable of photorealistically reconstructing deformable scenes using photos/videos captured casually from mobile phones. Our approach augments neural radiance fields (NeRF) by optimizing an additional continuous volumetric deformation field that warps each observed point into a canonical 5D NeRF. We observe that these NeRF-like deformation fields are prone to local minima, and propose a coarse-to-fine optimization method for coordinate-based models that allows for more robust optimization. By adapting principles from geometry processing and physical simulation to NeRF-like models, we propose an elastic regularization of the deformation field that further improves robustness. We show that our method can turn casually captured selfie photos/videos into deformable NeRF models that allow for photorealistic renderings of the subject from arbitrary viewpoints, which we dub \"nerfies.\" We evaluate our method by collecting time-synchronized data using a rig with two mobile phones, yielding train/validation images of the same pose at different viewpoints. 
We show that our method faithfully reconstructs non-rigidly deforming scenes and reproduces unseen views with high fidelity.\n  - [D-NeRF: Neural Radiance Fields for Dynamic Scenes, CVPR2021](https://arxiv.org/abs/2011.13961) | [***``[code]``***](https://github.com/albertpumarola/D-NeRF)\n    > Neural rendering techniques combining machine learning with geometric reasoning have arisen as one of the most promising approaches for synthesizing novel views of a scene from a sparse set of images. Among these, stands out the Neural radiance fields (NeRF), which trains a deep network to map 5D input coordinates (representing spatial location and viewing direction) into a volume density and view-dependent emitted radiance. However, despite achieving an unprecedented level of photorealism on the generated images, NeRF is only applicable to static scenes, where the same spatial location can be queried from different images. In this paper we introduce D-NeRF, a method that extends neural radiance fields to a dynamic domain, allowing to reconstruct and render novel images of objects under rigid and non-rigid motions from a *single* camera moving around the scene. For this purpose we consider time as an additional input to the system, and split the learning process in two main stages: one that encodes the scene into a canonical space and another that maps this canonical representation into the deformed scene at a particular time. Both mappings are simultaneously learned using fully-connected networks. Once the networks are trained, D-NeRF can render novel images, controlling both the camera view and the time variable, and thus, the object movement. We demonstrate the effectiveness of our approach on scenes with objects under rigid, articulated and non-rigid motions. 
Code, model weights and the dynamic scenes dataset will be released.\n  - [Dynamic Neural Radiance Fields for Monocular 4D Facial Avatar Reconstruction, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > We present dynamic neural radiance fields for modeling the appearance and dynamics of a human face. Digitally modeling and reconstructing a talking human is a key building-block for a variety of applications. Especially, for telepresence applications in AR or VR, a faithful reproduction of the appearance including novel viewpoint or head-poses is required. In contrast to state-of-the-art approaches that model the geometry and material properties explicitly, or are purely image-based, we introduce an implicit representation of the head based on scene representation networks. To handle the dynamics of the face, we combine our scene representation network with a low-dimensional morphable model which provides explicit control over pose and expressions. We use volumetric rendering to generate images from this hybrid representation and demonstrate that such a dynamic neural scene representation can be learned from monocular input data only, without the need of a specialized capture setup. In our experiments, we show that this learned volumetric representation allows for photo-realistic image generation that surpasses the quality of state-of-the-art video-based reenactment methods.\n  - [Non-Rigid Neural Radiance Fields: Reconstruction and Novel View Synthesis of a Deforming Scene from Monocular Video, ICCV2021](https://vcai.mpi-inf.mpg.de/projects/nonrigid_nerf/) | [***``[code]``***](https://github.com/facebookresearch/nonrigid_nerf)\n    > We present Non-Rigid Neural Radiance Fields (NR-NeRF), a reconstruction and novel view synthesis approach for general non-rigid dynamic scenes. 
Our approach takes RGB images of a dynamic scene as input (e.g., from a monocular video recording), and creates a high-quality space-time geometry and appearance representation. We show that a single handheld consumer-grade camera is sufficient to synthesize sophisticated renderings of a dynamic scene from novel virtual camera views, e.g. a `bullet-time' video effect. NR-NeRF disentangles the dynamic scene into a canonical volume and its deformation. Scene deformation is implemented as ray bending, where straight rays are deformed non-rigidly. We also propose a novel rigidity network to better constrain rigid regions of the scene, leading to more stable results. The ray bending and rigidity network are trained without explicit supervision. Our formulation enables dense correspondence estimation across views and time, and compelling video editing applications such as motion exaggeration. Our code will be open sourced.\n  - [PVA: Pixel-aligned Volumetric Avatars, CVPR2021](https://volumetric-avatars.github.io/) | [code]\n    > Acquisition and rendering of photorealistic human heads is a highly challenging research problem of particular importance for virtual telepresence. Currently, the highest quality is achieved by volumetric approaches trained in a person-specific manner on multi-view data. These models better represent fine structure, such as hair, compared to simpler mesh-based models. Volumetric models typically employ a global code to represent facial expressions, such that they can be driven by a small set of animation parameters. While such architectures achieve impressive rendering quality, they can not easily be extended to the multi-identity setting. In this paper, we devise a novel approach for predicting volumetric avatars of the human head given just a small number of inputs. 
We enable generalization across identities by a novel parameterization that combines neural radiance fields with local, pixel-aligned features extracted directly from the inputs, thus side-stepping the need for very deep or complex networks. Our approach is trained in an end-to-end manner solely based on a photometric rerendering loss without requiring explicit 3D supervision. We demonstrate that our approach outperforms the existing state of the art in terms of quality and is able to generate faithful facial expressions in a multi-identity setting.\n  - [Neural Articulated Radiance Field, ICCV2021](https://arxiv.org/abs/2104.03110) | [***``[code]``***](https://github.com/nogu-atsu/NARF#code)\n    > We present Neural Articulated Radiance Field (NARF), a novel deformable 3D representation for articulated objects learned from images. While recent advances in 3D implicit representation have made it possible to learn models of complex objects, learning pose-controllable representations of articulated objects remains a challenge, as current methods require 3D shape supervision and are unable to render appearance. In formulating an implicit representation of 3D articulated objects, our method considers only the rigid transformation of the most relevant object part in solving for the radiance field at each 3D location. In this way, the proposed method represents pose-dependent changes without significantly increasing the computational complexity. NARF is fully differentiable and can be trained from images with pose annotations. Moreover, through the use of an autoencoder, it can learn appearance variations over multiple instances of an object class. 
Experiments show that the proposed method is efficient and can generalize well to novel poses.\n  - [CLA-NeRF: Category-Level Articulated Neural Radiance Field, ICRA2022](https://arxiv.org/abs/2202.00181) | [code]\n    > We propose CLA-NeRF -- a Category-Level Articulated Neural Radiance Field that can perform view synthesis, part segmentation, and articulated pose estimation. CLA-NeRF is trained at the object category level using no CAD models and no depth, but a set of RGB images with ground truth camera poses and part segments. During inference, it only takes a few RGB views (i.e., few-shot) of an unseen 3D object instance within the known category to infer the object part segmentation and the neural radiance field. Given an articulated pose as input, CLA-NeRF can perform articulation-aware volume rendering to generate the corresponding RGB image at any camera pose. Moreover, the articulated pose of an object can be estimated via inverse rendering. In our experiments, we evaluate the framework across five categories on both synthetic and real-world data. In all cases, our method shows realistic deformation results and accurate articulated pose estimation. We believe that both few-shot articulated object rendering and articulated pose estimation open doors for robots to perceive and interact with unseen articulated objects.\n  - [Animatable Neural Radiance Fields for Human Body Modeling, ICCV2021](https://zju3dv.github.io/animatable_nerf/) | [***``[code]``***](https://github.com/zju3dv/animatable_nerf)\n    > This paper addresses the challenge of reconstructing an animatable human model from a multi-view video. Some recent works have proposed to decompose a non-rigidly deforming scene into a canonical neural radiance field and a set of deformation fields that map observation-space points to the canonical space, thereby enabling them to learn the dynamic scene from images. 
However, they represent the deformation field as translational vector field or SE(3) field, which makes the optimization highly under-constrained. Moreover, these representations cannot be explicitly controlled by input motions. Instead, we introduce neural blend weight fields to produce the deformation fields. Based on the skeleton-driven deformation, blend weight fields are used with 3D human skeletons to generate observation-to-canonical and canonical-to-observation correspondences. Since 3D human skeletons are more observable, they can regularize the learning of deformation fields. Moreover, the learned blend weight fields can be combined with input skeletal motions to generate new deformation fields to animate the human model. Experiments show that our approach significantly outperforms recent human synthesis methods. The code will be available at https://zju3dv.github.io/animatable_nerf/.\n  - [Neural Actor: Neural Free-view Synthesis of Human Actors with Pose Control, SIGGRAPH Asia 2021](https://vcai.mpi-inf.mpg.de/projects/NeuralActor/) | [***``[code]``***](https://people.mpi-inf.mpg.de/~lliu/projects/NeuralActor/)\n    > We propose Neural Actor (NA), a new method for high-quality synthesis of humans from arbitrary viewpoints and under arbitrary controllable poses. Our method is built upon recent neural scene representation and rendering works which learn representations of geometry and appearance from only 2D images. While existing works demonstrated compelling rendering of static scenes and playback of dynamic scenes, photo-realistic reconstruction and rendering of humans with neural implicit methods, in particular under user-controlled novel poses, is still difficult. To address this problem, we utilize a coarse body model as the proxy to unwarp the surrounding 3D space into a canonical pose. 
A neural radiance field learns pose-dependent geometric deformations and pose- and view-dependent appearance effects in the canonical space from multi-view video input. To synthesize novel views of high fidelity dynamic geometry and appearance, we leverage 2D texture maps defined on the body model as latent variables for predicting residual deformations and the dynamic appearance. Experiments demonstrate that our method achieves better quality than the state-of-the-arts on playback as well as novel pose synthesis, and can even generalize well to new poses that starkly differ from the training poses. Furthermore, our method also supports body shape control of the synthesized results.\n  - [Neural Scene Flow Fields for Space-Time View Synthesis of Dynamic Scenes, CVPR2021](http://www.cs.cornell.edu/~zl548/NSFF/) | [***``[code]``***](https://github.com/zhengqili/Neural-Scene-Flow-Fields)\n    > We present a method to perform novel view and time synthesis of dynamic scenes, requiring only a monocular video with known camera poses as input. To do this, we introduce Neural Scene Flow Fields, a new representation that models the dynamic scene as a time-variant continuous function of appearance, geometry, and 3D scene motion. Our representation is optimized through a neural network to fit the observed input views. We show that our representation can be used for complex dynamic scenes, including thin structures, view-dependent effects, and natural degrees of motion. 
We conduct a number of experiments that demonstrate our approach significantly outperforms recent monocular view synthesis methods, and show qualitative results of space-time view synthesis on a variety of real-world videos.\n  - [Neural Body: Implicit Neural Representations with Structured Latent Codes for Novel View Synthesis of Dynamic Humans, CVPR2021](https://zju3dv.github.io/neuralbody/) | [***``[code]``***](https://github.com/zju3dv/neuralbody)\n    > This paper addresses the challenge of novel view synthesis for a human performer from a very sparse set of camera views. Some recent works have shown that learning implicit neural representations of 3D scenes achieves remarkable view synthesis quality given dense input views. However, the representation learning will be ill-posed if the views are highly sparse. To solve this ill-posed problem, our key idea is to integrate observations over video frames. To this end, we propose Neural Body, a new human body representation which assumes that the learned neural representations at different frames share the same set of latent codes anchored to a deformable mesh, so that the observations across frames can be naturally integrated. The deformable mesh also provides geometric guidance for the network to learn 3D representations more efficiently. To evaluate our approach, we create a multi-view dataset named ZJU-MoCap that captures performers with complex motions. Experiments on ZJU-MoCap show that our approach outperforms prior works by a large margin in terms of novel view synthesis quality. 
We also demonstrate the capability of our approach to reconstruct a moving person from a monocular video on the People-Snapshot dataset.\n  - [Neural 3D Video Synthesis from Multi-view Video, CVPR2022(oral)](https://neural-3d-video.github.io/) | [code]\n    > We propose a novel approach for 3D video synthesis that is able to represent multi-view video recordings of a dynamic real-world scene in a compact, yet expressive representation that enables high-quality view synthesis and motion interpolation. Our approach takes the high quality and compactness of static neural radiance fields in a new direction: to a model-free, dynamic setting. At the core of our approach is a novel time-conditioned neural radiance fields that represents scene dynamics using a set of compact latent codes. To exploit the fact that changes between adjacent frames of a video are typically small and locally consistent, we propose two novel strategies for efficient training of our neural network: 1) An efficient hierarchical training scheme, and 2) an importance sampling strategy that selects the next rays for training based on the temporal variation of the input videos. In combination, these two strategies significantly boost the training speed, lead to fast convergence of the training process, and enable high quality results. Our learned representation is highly compact and able to represent a 10 second 30 FPS multi-view video recording by 18 cameras with a model size of just 28MB. We demonstrate that our method can render high-fidelity wide-angle novel views at over 1K resolution, even for highly complex and dynamic scenes. We perform an extensive qualitative and quantitative evaluation that shows that our approach outperforms the current state of the art. 
Project website: https://neural-3d-video.github.io.\n  - [Dynamic View Synthesis from Dynamic Monocular Video, ICCV2021](https://free-view-video.github.io/) | [***``[code]``***](https://github.com/gaochen315/DynamicNeRF)\n    > We present an algorithm for generating novel views at arbitrary viewpoints and any input time step given a monocular video of a dynamic scene. Our work builds upon recent advances in neural implicit representation and uses continuous and differentiable functions for modeling the time-varying structure and the appearance of the scene. We jointly train a time-invariant static NeRF and a time-varying dynamic NeRF, and learn how to blend the results in an unsupervised manner. However, learning this implicit function from a single video is highly ill-posed (with infinitely many solutions that match the input video). To resolve the ambiguity, we introduce regularization losses to encourage a more physically plausible solution. We show extensive quantitative and qualitative results of dynamic view synthesis from casually captured videos.\n  - [GRAF: Generative Radiance Fields for 3D-Aware Image Synthesis, NeurIPS2020](https://avg.is.mpg.de/publications/schwarz2020NeurIPS) | [***``[code]``***](https://github.com/autonomousvision/graf)\n    > While 2D generative adversarial networks have enabled high-resolution image synthesis, they largely lack an understanding of the 3D world and the image formation process. Thus, they do not provide precise control over camera viewpoint or object pose. To address this problem, several recent approaches leverage intermediate voxel-based representations in combination with differentiable rendering. However, existing methods either produce low image resolution or fall short in disentangling camera and scene properties, eg, the object identity may vary with the viewpoint. In this paper, we propose a generative model for radiance fields which have recently proven successful for novel view synthesis of a single scene. 
In contrast to voxel-based representations, radiance fields are not confined to a coarse discretization of the 3D space, yet allow for disentangling camera and scene properties while degrading gracefully in the presence of reconstruction ambiguity. By introducing a multi-scale patch-based discriminator, we demonstrate synthesis of high-resolution images while training our model from unposed 2D images alone. We systematically analyze our approach on several challenging synthetic and real-world datasets. Our experiments reveal that radiance fields are a powerful representation for generative image synthesis, leading to 3D consistent models that render with high fidelity.\n  - [GRF: Learning a General Radiance Field for 3D Scene Representation and Rendering, ICCV2021(oral)](https://arxiv.org/abs/2010.04595) | [***``[code]``***](https://github.com/alextrevithick/GRF)\n    > We present a simple yet powerful neural network that implicitly represents and renders 3D objects and scenes only from 2D observations. The network models 3D geometries as a general radiance field, which takes a set of 2D images with camera poses and intrinsics as input, constructs an internal representation for each point of the 3D space, and then renders the corresponding appearance and geometry of that point viewed from an arbitrary position. The key to our approach is to learn local features for each pixel in 2D images and to then project these features to 3D points, thus yielding general and rich point representations. We additionally integrate an attention mechanism to aggregate pixel features from multiple 2D views, such that visual occlusions are implicitly taken into account. 
Extensive experiments demonstrate that our method can generate high-quality and realistic novel views for novel objects, unseen categories and challenging real-world scenes.\n  - [pixelNeRF: Neural Radiance Fields from One or Few Images, CVPR2021](https://arxiv.org/abs/2012.02190) | [***``[code]``***](https://github.com/sxyu/pixel-nerf)\n    > We propose pixelNeRF, a learning framework that predicts a continuous neural scene representation conditioned on one or few input images. The existing approach for constructing neural radiance fields involves optimizing the representation to every scene independently, requiring many calibrated views and significant compute time. We take a step towards resolving these shortcomings by introducing an architecture that conditions a NeRF on image inputs in a fully convolutional manner. This allows the network to be trained across multiple scenes to learn a scene prior, enabling it to perform novel view synthesis in a feed-forward manner from a sparse set of views (as few as one). Leveraging the volume rendering approach of NeRF, our model can be trained directly from images with no explicit 3D supervision. We conduct extensive experiments on ShapeNet benchmarks for single image novel view synthesis tasks with held-out objects as well as entire unseen categories. We further demonstrate the flexibility of pixelNeRF by demonstrating it on multi-object ShapeNet scenes and real scenes from the DTU dataset. In all cases, pixelNeRF outperforms current state-of-the-art baselines for novel view synthesis and single image 3D reconstruction. 
For the video and code, please visit the project website: this https URL\n  - [Learned Initializations for Optimizing Coordinate-Based Neural Representations, CVPR2021](https://www.matthewtancik.com/learnit) | [***``[code]``***](https://github.com/tancik/learnit)\n    > Coordinate-based neural representations have shown significant promise as an alternative to discrete, array-based representations for complex low dimensional signals. However, optimizing a coordinate-based network from randomly initialized weights for each new signal is inefficient. We propose applying standard meta-learning algorithms to learn the initial weight parameters for these fully-connected networks based on the underlying class of signals being represented (e.g., images of faces or 3D models of chairs). Despite requiring only a minor change in implementation, using these learned initial weights enables faster convergence during optimization and can serve as a strong prior over the signal class being modeled, resulting in better generalization when only partial observations of a given signal are available. We explore these benefits across a variety of tasks, including representing 2D images, reconstructing CT scans, and recovering 3D shapes and scenes from 2D image observations.\n  - [pi-GAN: Periodic Implicit Generative Adversarial Networks for 3D-Aware Image Synthesis, CVPR2021(oral)](https://marcoamonteiro.github.io/pi-GAN-website/) | [***``[code]``***](https://github.com/marcoamonteiro/pi-GAN)\n    > We have witnessed rapid progress on 3D-aware image synthesis, leveraging recent advances in generative visual models and neural rendering. Existing approaches however fall short in two ways: first, they may lack an underlying 3D representation or rely on view-inconsistent rendering, hence synthesizing images that are not multi-view consistent; second, they often depend upon representation network architectures that are not expressive enough, and their results thus lack in image quality. 
We propose a novel generative model, named Periodic Implicit Generative Adversarial Networks (π-GAN or pi-GAN), for high-quality 3D-aware image synthesis. π-GAN leverages neural representations with periodic activation functions and volumetric rendering to represent scenes as view-consistent 3D representations with fine detail. The proposed approach obtains state-of-the-art results for 3D-aware image synthesis with multiple real and synthetic datasets.\n  - [Portrait Neural Radiance Fields from a Single Image](https://portrait-nerf.github.io/) | [code]\n    > We present a method for estimating Neural Radiance Fields (NeRF) from a single headshot portrait. While NeRF has demonstrated high-quality view synthesis, it requires multiple images of static scenes and thus impractical for casual captures and moving subjects. In this work, we propose to pretrain the weights of a multilayer perceptron (MLP), which implicitly models the volumetric density and colors, with a meta-learning framework using a light stage portrait dataset. To improve the generalization to unseen faces, we train the MLP in the canonical coordinate space approximated by 3D face morphable models. We quantitatively evaluate the method using controlled captures and demonstrate the generalization to real portrait images, showing favorable results against state-of-the-arts.\n  - [ShaRF: Shape-conditioned Radiance Fields from a Single View, ICML2021](https://arxiv.org/abs/2102.08860) | [***``[code]``***](https://github.com/tensorflow/graphics/tree/master/tensorflow_graphics/projects/radiance_fields)\n    > We present a method for estimating neural scenes representations of objects given only a single image. The core of our method is the estimation of a geometric scaffold for the object and its use as a guide for the reconstruction of the underlying radiance field. 
Our formulation is based on a generative process that first maps a latent code to a voxelized shape, and then renders it to an image, with the object appearance being controlled by a second latent code. During inference, we optimize both the latent codes and the networks to fit a test image of a new object. The explicit disentanglement of shape and appearance allows our model to be fine-tuned given a single image. We can then render new views in a geometrically consistent manner and they represent faithfully the input object. Additionally, our method is able to generalize to images outside of the training domain (more realistic renderings and even real photographs). Finally, the inferred geometric scaffold is itself an accurate estimate of the object's 3D shape. We demonstrate in several experiments the effectiveness of our approach in both synthetic and real images.\n  - [IBRNet: Learning Multi-View Image-Based Rendering, CVPR2021](https://arxiv.org/abs/2102.13090) | [***``[code]``***](https://github.com/googleinterns/IBRNet)\n    > We present a method that synthesizes novel views of complex scenes by interpolating a sparse set of nearby views. The core of our method is a network architecture that includes a multilayer perceptron and a ray transformer that estimates radiance and volume density at continuous 5D locations (3D spatial locations and 2D viewing directions), drawing appearance information on the fly from multiple source views. By drawing on source views at render time, our method hearkens back to classic work on image-based rendering (IBR), and allows us to render high-resolution imagery. Unlike neural scene representation work that optimizes per-scene functions for rendering, we learn a generic view interpolation function that generalizes to novel scenes. We render images using classic volume rendering, which is fully differentiable and allows us to train using only multi-view posed images as supervision. 
Experiments show that our method outperforms recent novel view synthesis methods that also seek to generalize to novel scenes. Further, if fine-tuned on each scene, our method is competitive with state-of-the-art single-scene neural rendering methods. Project page: this https URL\n  - [CAMPARI: Camera-Aware Decomposed Generative Neural Radiance Fields](https://arxiv.org/pdf/2103.17269.pdf) | [code]\n    > Tremendous progress in deep generative models has led to photorealistic image synthesis. While achieving compelling results, most approaches operate in the two-dimensional image domain, ignoring the three-dimensional nature of our world. Several recent works therefore propose generative models which are 3D-aware, i.e., scenes are modeled in 3D and then rendered differentiably to the image plane. This leads to impressive 3D consistency, but incorporating such a bias comes at a price: the camera needs to be modeled as well. Current approaches assume fixed intrinsics and a predefined prior over camera pose ranges. As a result, parameter tuning is typically required for real-world data, and results degrade if the data distribution is not matched. Our key hypothesis is that learning a camera generator jointly with the image generator leads to a more principled approach to 3D-aware image synthesis. Further, we propose to decompose the scene into a background and foreground model, leading to more efficient and disentangled scene representations. While training from raw, unposed image collections, we learn a 3D- and camera-aware generative model which faithfully recovers not only the image but also the camera data distribution. 
At test time, our model generates images with explicit control over the camera as well as the shape and appearance of the scene.\n  - [NeRF-VAE: A Geometry Aware 3D Scene Generative Model](https://arxiv.org/abs/2104.00587) | [code]\n    > We propose NeRF-VAE, a 3D scene generative model that incorporates geometric structure via NeRF and differentiable volume rendering. In contrast to NeRF, our model takes into account shared structure across scenes, and is able to infer the structure of a novel scene -- without the need to re-train -- using amortized inference. NeRF-VAE's explicit 3D rendering process further contrasts previous generative models with convolution-based rendering which lacks geometric structure. Our model is a VAE that learns a distribution over radiance fields by conditioning them on a latent scene representation. We show that, once trained, NeRF-VAE is able to infer and render geometrically-consistent scenes from previously unseen 3D environments using very few input images. We further demonstrate that NeRF-VAE generalizes well to out-of-distribution cameras, while convolutional models do not. Finally, we introduce and study an attention-based conditioning mechanism of NeRF-VAE's decoder, which improves model performance.\n  - [Unconstrained Scene Generation with Locally Conditioned Radiance Fields, ICCV2021](https://apple.github.io/ml-gsn/) | [***``[code]``***](https://github.com/apple/ml-gsn)\n    > We follow an adversarial learning framework, where the generator models scenes via their radiance field, and the discriminator attempts to distinguish between images rendered from those radiance fields and images of real scenes. Conceptually, our model decomposes the radiance field of a scene into many small local radiance fields that result from conditioning on a 2D grid of latent codes W. 
W can be interpreted as a latent floorplan representing the scene.\n  - [MVSNeRF: Fast Generalizable Radiance Field Reconstruction from Multi-View Stereo, ICCV2021](https://apchenstu.github.io/mvsnerf/) | [***``[code]``***](https://github.com/apchenstu/mvsnerf)\n    > We present MVSNeRF, a novel neural rendering approach that can efficiently reconstruct neural radiance fields for view synthesis. Unlike prior works on neural radiance fields that consider per-scene optimization on densely captured images, we propose a generic deep neural network that can reconstruct radiance fields from only three nearby input views via fast network inference. Our approach leverages plane-swept cost volumes (widely used in multi-view stereo) for geometry-aware scene reasoning, and combines this with physically based volume rendering for neural radiance field reconstruction. We train our network on real objects in the DTU dataset, and test it on three different datasets to evaluate its effectiveness and generalizability. Our approach can generalize across scenes (even indoor scenes, completely different from our training scenes of objects) and generate realistic view synthesis results using only three input images, significantly outperforming concurrent works on generalizable radiance field reconstruction. Moreover, if dense images are captured, our estimated radiance field representation can be easily fine-tuned; this leads to fast per-scene reconstruction with higher rendering quality and substantially less optimization time than NeRF.\n  - [Stereo Radiance Fields (SRF): Learning View Synthesis from Sparse Views of Novel Scenes, CVPR2021](https://arxiv.org/abs/2104.06935) | [***``[code]``***](https://virtualhumans.mpi-inf.mpg.de/srf/)\n    > Recent neural view synthesis methods have achieved impressive quality and realism, surpassing classical pipelines which rely on multi-view reconstruction. 
State-of-the-Art methods, such as NeRF, are designed to learn a single scene with a neural network and require dense multi-view inputs. Testing on a new scene requires re-training from scratch, which takes 2-3 days. In this work, we introduce Stereo Radiance Fields (SRF), a neural view synthesis approach that is trained end-to-end, generalizes to new scenes, and requires only sparse views at test time. The core idea is a neural architecture inspired by classical multi-view stereo methods, which estimates surface points by finding similar image regions in stereo images. In SRF, we predict color and density for each 3D point given an encoding of its stereo correspondence in the input images. The encoding is implicitly learned by an ensemble of pair-wise similarities -- emulating classical stereo. Experiments show that SRF learns structure instead of overfitting on a scene. We train on multiple scenes of the DTU dataset and generalize to new ones without re-training, requiring only 10 sparse and spread-out views as input. We show that 10-15 minutes of fine-tuning further improve the results, achieving significantly sharper, more detailed results than scene-specific models. The code, model, and videos are available at this https URL.\n  - [Neural Rays for Occlusion-aware Image-based Rendering, CVPR2022](https://liuyuan-pal.github.io/NeuRay/) | [***``[code]``***](https://github.com/liuyuan-pal/NeuRay)\n    > We present a new neural representation, called Neural Ray (NeuRay), for the novel view synthesis task. Recent works construct radiance fields from image features of input views to render novel view images, which enables the generalization to new scenes. However, due to occlusions, a 3D point may be invisible to some input views. On such a 3D point, these generalization methods will include inconsistent image features from invisible views, which interfere with the radiance field construction. 
To solve this problem, we predict the visibility of 3D points to input views within our NeuRay representation. This visibility enables the radiance field construction to focus on visible image features, which significantly improves its rendering quality. Meanwhile, a novel consistency loss is proposed to refine the visibility in NeuRay when finetuning on a specific scene. Experiments demonstrate that our approach achieves state-of-the-art performance on the novel view synthesis task when generalizing to unseen scenes and outperforms per-scene optimization methods after finetuning.\n  - [Putting NeRF on a Diet: Semantically Consistent Few-Shot View Synthesis, ICCV2021](https://www.ajayj.com/dietnerf) | [***``[code]``***](https://github.com/ajayjain/DietNeRF)\n    > We present DietNeRF, a 3D neural scene representation estimated from a few images. Neural Radiance Fields (NeRF) learn a continuous volumetric representation of a scene through multi-view consistency, and can be rendered from novel viewpoints by ray casting. While NeRF has an impressive ability to reconstruct geometry and fine details given many images, up to 100 for challenging 360° scenes, it often finds a degenerate solution to its image reconstruction objective when only a few input views are available. To improve few-shot quality, we propose DietNeRF. We introduce an auxiliary semantic consistency loss that encourages realistic renderings at novel poses. DietNeRF is trained on individual scenes to (1) correctly render given input views from the same pose, and (2) match high-level semantic attributes across different, random poses. Our semantic loss allows us to supervise DietNeRF from arbitrary poses. We extract these semantics using a pre-trained visual encoder such as CLIP, a Vision Transformer trained on hundreds of millions of diverse single-view, 2D photographs mined from the web with natural language supervision. 
In experiments, DietNeRF improves the perceptual quality of few-shot view synthesis when learned from scratch, can render novel views with as few as one observed image when pre-trained on a multi-view dataset, and produces plausible completions of completely unobserved regions.\n  - [Towards Continuous Depth MPI with NeRF for Novel View Synthesis, ICCV2021](https://arxiv.org/abs/2103.14910) | [***``[code]``***](https://github.com/vincentfung13/MINE)\n    > In this paper, we propose MINE to perform novel view synthesis and depth estimation via dense 3D reconstruction from a single image. Our approach is a continuous depth generalization of the Multiplane Images (MPI) by introducing the NEural radiance fields (NeRF). Given a single image as input, MINE predicts a 4-channel image (RGB and volume density) at arbitrary depth values to jointly reconstruct the camera frustum and fill in occluded contents. The reconstructed and inpainted frustum can then be easily rendered into novel RGB or depth views using differentiable rendering. Extensive experiments on RealEstate10K, KITTI and Flowers Light Fields show that our MINE outperforms state-of-the-art by a large margin in novel view synthesis. We also achieve competitive results in depth estimation on iBims-1 and NYU-v2 without annotated depth supervision. Our source code is available at this https URL\n  - [TöRF: Time-of-Flight Radiance Fields for Dynamic Scene View Synthesis, NeurIPS2021](https://imaging.cs.cmu.edu/torf/) | [***``[code]``***](https://github.com/breuckelen/torf)\n    > Neural networks can represent and accurately reconstruct radiance fields for static 3D scenes (e.g., NeRF). Several works extend these to dynamic scenes captured with monocular video, with promising performance. However, the monocular setting is known to be an under-constrained problem, and so methods rely on data-driven priors for reconstructing dynamic content. 
We replace these priors with measurements from a time-of-flight (ToF) camera, and introduce a neural representation based on an image formation model for continuous-wave ToF cameras. Instead of working with processed depth maps, we model the raw ToF sensor measurements to improve reconstruction quality and avoid issues with low reflectance regions, multi-path interference, and a sensor's limited unambiguous depth range. We show that this approach improves robustness of dynamic scene reconstruction to erroneous calibration and large motions, and discuss the benefits and limitations of integrating RGB+ToF sensors that are now available on modern smartphones.\n  - [CodeNeRF: Disentangled Neural Radiance Fields for Object Categories, ICCV2021(oral)](https://www.google.com/url?q=https%3A%2F%2Farxiv.org%2Fpdf%2F2109.01750.pdf&sa=D&sntz=1&usg=AOvVaw1Fnir0e4aRa22Nt0HoXDWh) | [***``[code]``***](https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fwbjang%2Fcode-nerf&sa=D&sntz=1&usg=AOvVaw2eD5ZoRbk2aWFuwUSHlh5_)\n    > CodeNeRF is an implicit 3D neural representation that learns the variation of object shapes and textures across a category and can be trained, from a set of posed images, to synthesize novel views of unseen objects. Unlike the original NeRF, which is scene specific, CodeNeRF learns to disentangle shape and texture by learning separate embeddings. At test time, given a single unposed image of an unseen object, CodeNeRF jointly estimates camera viewpoint, and shape and appearance codes via optimization. Unseen objects can be reconstructed from a single image, and then rendered from new viewpoints or their shape and texture edited by varying the latent codes. We conduct experiments on the SRN benchmark, which show that CodeNeRF generalises well to unseen objects and achieves on-par performance with methods that require known camera pose at test time. Our results on real-world images demonstrate that CodeNeRF can bridge the sim-to-real gap. 
\n  - [StyleNeRF: A Style-based 3D-Aware Generator for High-resolution Image Synthesis, ICLR2022](https://jiataogu.me/style_nerf/) | [***``[code]``***](https://github.com/facebookresearch/StyleNeRF)\n    > We propose StyleNeRF, a 3D-aware generative model for photo-realistic high-resolution image synthesis with high multi-view consistency, which can be trained on unstructured 2D images. Existing approaches either cannot synthesize high-resolution images with fine details or yield noticeable 3D-inconsistent artifacts. In addition, many of them lack control over style attributes and explicit 3D camera poses. StyleNeRF integrates the neural radiance field (NeRF) into a style-based generator to tackle the aforementioned challenges, i.e., improving rendering efficiency and 3D consistency for high-resolution image generation. We perform volume rendering only to produce a low-resolution feature map and progressively apply upsampling in 2D to address the first issue. To mitigate the inconsistencies caused by 2D upsampling, we propose multiple designs, including a better upsampler and a new regularization loss. With these designs, StyleNeRF can synthesize high-resolution images at interactive rates while preserving 3D consistency at high quality. StyleNeRF also enables control of camera poses and different levels of styles, which can generalize to unseen views. It also supports challenging tasks, including zoom-in and-out, style mixing, inversion, and semantic editing.\n  - [NeRF in the Dark: High Dynamic Range View Synthesis from Noisy Raw Images, CVPR2022(oral)](https://bmild.github.io/rawnerf/) | [***``[code]``***](https://github.com/google-research/multinerf)\n    > Neural Radiance Fields (NeRF) is a technique for high quality novel view synthesis from a collection of posed input images. 
Like most view synthesis methods, NeRF uses tonemapped low dynamic range (LDR) as input; these images have been processed by a lossy camera pipeline that smooths detail, clips highlights, and distorts the simple noise distribution of raw sensor data. We modify NeRF to instead train directly on linear raw images, preserving the scene's full dynamic range. By rendering raw output images from the resulting NeRF, we can perform novel high dynamic range (HDR) view synthesis tasks. In addition to changing the camera viewpoint, we can manipulate focus, exposure, and tonemapping after the fact. Although a single raw image appears significantly more noisy than a postprocessed one, we show that NeRF is highly robust to the zero-mean distribution of raw noise. When optimized over many noisy raw inputs (25-200), NeRF produces a scene representation so accurate that its rendered novel views outperform dedicated single and multi-image deep raw denoisers run on the same wide baseline input images. As a result, our method, which we call RawNeRF, can reconstruct scenes from extremely noisy images captured in near-darkness.\n  - [iNeRF: Inverting Neural Radiance Fields for Pose Estimation, IROS2021](http://yenchenlin.me/inerf/) | [***``[code]``***](https://github.com/yenchenlin/iNeRF-public)\n    > We present iNeRF, a framework that performs pose estimation by “inverting” a trained Neural Radiance Field (NeRF). NeRFs have been shown to be remarkably effective for the task of view synthesis — synthesizing photorealistic novel views of real-world scenes or objects. In this work, we investigate whether we can apply analysis-by-synthesis with NeRF for 6DoF pose estimation – given an image, find the translation and rotation of a camera relative to a 3D model. Starting from an initial pose estimate, we use gradient descent to minimize the residual between pixels rendered from an already-trained NeRF and pixels in an observed image. 
In our experiments, we first study 1) how to sample rays during pose refinement for iNeRF to collect informative gradients and 2) how different batch sizes of rays affect iNeRF on a synthetic dataset. We then show that for complex real-world scenes from the LLFF dataset, iNeRF can improve NeRF by estimating the camera poses of novel images and using these images as additional training data for NeRF. Finally, we show iNeRF can be combined with feature-based pose initialization. The approach outperforms all other RGB-based methods relying on synthetic data on LineMOD.\n  - [A-NeRF: Surface-free Human 3D Pose Refinement via Neural Rendering, NeurIPS2021](https://arxiv.org/abs/2102.06199) | [***``[code]``***](https://github.com/LemonATsu/A-NeRF)\n    > While deep learning reshaped the classical motion capture pipeline with feed-forward networks, generative models are required to recover fine alignment via iterative refinement. Unfortunately, the existing models are usually hand-crafted or learned in controlled conditions, only applicable to limited domains. We propose a method to learn a generative neural body model from unlabelled monocular videos by extending Neural Radiance Fields (NeRFs). We equip them with a skeleton to apply to time-varying and articulated motion. A key insight is that implicit models require the inverse of the forward kinematics used in explicit surface models. Our reparameterization defines spatial latent variables relative to the pose of body parts and thereby overcomes ill-posed inverse operations with an overparameterization. This enables learning volumetric body shape and appearance from scratch while jointly refining the articulated pose; all without ground truth labels for appearance, pose, or 3D shape on the input videos. When used for novel-view-synthesis and motion capture, our neural model improves accuracy on diverse datasets. 
Project website: this https URL .\n  - [NeRF--: Neural Radiance Fields Without Known Camera Parameters](https://nerfmm.active.vision/) | [***``[code]``***](https://github.com/ActiveVisionLab/nerfmm)\n    > Considering the problem of novel view synthesis (NVS) from only a set of 2D images, we simplify the training process of Neural Radiance Field (NeRF) on forward-facing scenes by removing the requirement of known or pre-computed camera parameters, including both intrinsics and 6DoF poses. To this end, we propose NeRF−−, with three contributions: First, we show that the camera parameters can be jointly optimised as learnable parameters with NeRF training, through a photometric reconstruction; Second, to benchmark the camera parameter estimation and the quality of novel view renderings, we introduce a new dataset of path-traced synthetic scenes, termed as Blender Forward-Facing Dataset (BLEFF); Third, we conduct extensive analyses to understand the training behaviours under various camera motions, and show that in most scenarios, the joint optimisation pipeline can recover accurate camera parameters and achieve comparable novel view synthesis quality as those trained with COLMAP pre-computed camera parameters.\n  - [Implicit Mapping and Positioning in Real-Time, ICCV2021](https://arxiv.org/abs/2103.12352) | [code]\n    > We show for the first time that a multilayer perceptron (MLP) can serve as the only scene representation in a real-time SLAM system for a handheld RGB-D camera. 
Our network is trained in live operation without prior data, building a dense, scene-specific implicit 3D model of occupancy and colour which is also immediately used for tracking.\n  - [NICE-SLAM: Neural Implicit Scalable Encoding for SLAM, CVPR2022](https://arxiv.org/abs/2112.12130) | [***``[code]``***](https://github.com/cvg/nice-slam)\n    > Neural implicit representations have recently shown encouraging results in various domains, including promising progress in simultaneous localization and mapping (SLAM). Nevertheless, existing methods produce over-smoothed scene reconstructions and have difficulty scaling up to large scenes. These limitations are mainly due to their simple fully-connected network architecture that does not incorporate local information in the observations. In this paper, we present NICE-SLAM, a dense SLAM system that incorporates multi-level local information by introducing a hierarchical scene representation. Optimizing this representation with pre-trained geometric priors enables detailed reconstruction on large indoor scenes. Compared to recent neural implicit SLAM systems, our approach is more scalable, efficient, and robust. Experiments on five challenging datasets demonstrate competitive results of NICE-SLAM in both mapping and tracking quality.\n  - [GNeRF: GAN-based Neural Radiance Field without Posed Camera, ICCV2021(oral)](https://arxiv.org/abs/2103.15606) | [code]\n    > We introduce GNeRF, a framework to marry Generative Adversarial Networks (GAN) with Neural Radiance Field (NeRF) reconstruction for the complex scenarios with unknown and even randomly initialized camera poses. Recent NeRF-based advances have gained popularity for remarkable realistic novel view synthesis. However, most of them heavily rely on accurate camera poses estimation, while few recent methods can only optimize the unknown camera poses in roughly forward-facing scenes with relatively short camera trajectories and require rough camera poses initialization. 
Differently, our GNeRF only utilizes randomly initialized poses for complex outside-in scenarios. We propose a novel two-phases end-to-end framework. The first phase takes the use of GANs into the new realm for optimizing coarse camera poses and radiance fields jointly, while the second phase refines them with additional photometric loss. We overcome local minima using a hybrid and iterative optimization scheme. Extensive experiments on a variety of synthetic and natural scenes demonstrate the effectiveness of GNeRF. More impressively, our approach outperforms the baselines favorably in those scenes with repeated patterns or even low textures that are regarded as extremely challenging before.\n  - [BARF: Bundle-Adjusting Neural Radiance Fields, ICCV2021(oral)](https://chenhsuanlin.bitbucket.io/bundle-adjusting-NeRF/) | [***``[code]``***](https://github.com/chenhsuanlin/bundle-adjusting-NeRF)\n    > Neural Radiance Fields (NeRF) have recently gained a surge of interest within the computer vision community for its power to synthesize photorealistic novel views of real-world scenes. One limitation of NeRF, however, is its requirement of accurate camera poses to learn the scene representations. In this paper, we propose Bundle-Adjusting Neural Radiance Fields (BARF) for training NeRF from imperfect (or even unknown) camera poses — the joint problem of learning neural 3D representations and registering camera frames. We establish a theoretical connection to classical image alignment and show that coarse-to-fine registration is also applicable to NeRF. Furthermore, we show that naively applying positional encoding in NeRF has a negative impact on registration with a synthesis-based objective. Experiments on synthetic and real-world data show that BARF can effectively optimize the neural scene representations and resolve large camera pose misalignment at the same time. 
This enables view synthesis and localization of video sequences from unknown camera poses, opening up new avenues for visual localization systems (e.g. SLAM) and potential applications for dense 3D mapping and reconstruction.\n  - [Self-Calibrating Neural Radiance Fields, ICCV2021](https://postech-cvlab.github.io/SCNeRF/) | [***``[code]``***](https://github.com/POSTECH-CVLab/SCNeRF)\n    > In this work, we propose a camera self-calibration algorithm for generic cameras with arbitrary non-linear distortions. We jointly learn the geometry of the scene and the accurate camera parameters without any calibration objects. Our camera model consists of a pinhole model, radial distortion, and a generic noise model that can learn arbitrary non-linear camera distortions. While traditional self-calibration algorithms mostly rely on geometric constraints, we additionally incorporate photometric consistency. This requires learning the geometry of the scene and we use Neural Radiance Fields (NeRF). We also propose a new geometric loss function, viz., projected ray distance loss, to incorporate geometric consistency for complex non-linear camera models. We validate our approach on standard real image datasets and demonstrate our model can learn the camera intrinsics and extrinsics (pose) from scratch without COLMAP initialization. Also, we show that learning accurate camera models in a differentiable manner allows us to improve PSNR over NeRF. We experimentally demonstrate that our proposed method is applicable to variants of NeRF. In addition, we use a set of images captured with a fish-eye lens to demonstrate that learning camera model jointly improves the performance significantly over the COLMAP initialization.\n  - [NeRD: Neural Reflectance Decomposition from Image Collections, ICCV2021](https://markboss.me/publication/2021-nerd/#:~:text=NeRD%20is%20a%20novel%20method,can%20turn%20around%20the%20object.) 
| [***``[code]``***](https://github.com/cgtuebingen/NeRD-Neural-Reflectance-Decomposition)\n    > Decomposing a scene into its shape, reflectance, and illumination is a challenging but important problem in computer vision and graphics. This problem is inherently more challenging when the illumination is not a single light source under laboratory conditions but is instead an unconstrained environmental illumination. Though recent work has shown that implicit representations can be used to model the radiance field of an object, most of these techniques only enable view synthesis and not relighting. Additionally, evaluating these radiance fields is resource and time-intensive. We propose a neural reflectance decomposition (NeRD) technique that uses physically-based rendering to decompose the scene into spatially varying BRDF material properties. In contrast to existing techniques, our input images can be captured under different illumination conditions. In addition, we also propose techniques to convert the learned reflectance volume into a relightable textured mesh enabling fast real-time rendering with novel illuminations. We demonstrate the potential of the proposed approach with experiments on both synthetic and real datasets, where we are able to obtain high-quality relightable 3D assets from image collections.\n  - [NeRV: Neural Reflectance and Visibility Fields for Relighting and View Synthesis, CVPR2021](https://pratulsrinivasan.github.io/nerv/) | [code]\n    > We present a method that takes as input a set of images of a scene illuminated by unconstrained known lighting, and produces as output a 3D representation that can be rendered from novel viewpoints under arbitrary lighting conditions. 
Our method represents the scene as a continuous volumetric function parameterized as MLPs whose inputs are a 3D location and whose outputs are the following scene properties at that input location: volume density, surface normal, material parameters, distance to the first surface intersection in any direction, and visibility of the external environment in any direction. Together, these allow us to render novel views of the object under arbitrary lighting, including indirect illumination effects. The predicted visibility and surface intersection fields are critical to our model's ability to simulate direct and indirect illumination during training, because the brute-force techniques used by prior work are intractable for lighting conditions outside of controlled setups with a single light. Our method outperforms alternative approaches for recovering relightable 3D scene representations, and performs well in complex lighting settings that have posed a significant challenge to prior work.\n  - [NeX: Real-time View Synthesis with Neural Basis Expansion, CVPR2021(oral)](https://nex-mpi.github.io/) | [***``[code]``***](https://github.com/nex-mpi/nex-code/)\n    > We present NeX, a new approach to novel view synthesis based on enhancements of multiplane image (MPI) that can reproduce NeXt-level view-dependent effects---in real time. Unlike traditional MPI that uses a set of simple RGBα planes, our technique models view-dependent effects by instead parameterizing each pixel as a linear combination of basis functions learned from a neural network. Moreover, we propose a hybrid implicit-explicit modeling strategy that improves upon fine detail and produces state-of-the-art results. Our method is evaluated on benchmark forward-facing datasets as well as our newly-introduced dataset designed to test the limit of view-dependent modeling with significantly more challenging effects such as the rainbow reflections on a CD. 
Our method achieves the best overall scores across all major metrics on these datasets with more than 1000× faster rendering time than the state of the art.\n  - [NeRFactor: Neural Factorization of Shape and Reflectance Under an Unknown Illumination, TOG 2021 (Proc. SIGGRAPH Asia)](https://xiuming.info/projects/nerfactor/) | [code]\n    > We address the problem of recovering the shape and spatially-varying reflectance of an object from multi-view images (and their camera poses) of an object illuminated by one unknown lighting condition. This enables the rendering of novel views of the object under arbitrary environment lighting and editing of the object's material properties. The key to our approach, which we call Neural Radiance Factorization (NeRFactor), is to distill the volumetric geometry of a Neural Radiance Field (NeRF) [Mildenhall et al. 2020] representation of the object into a surface representation and then jointly refine the geometry while solving for the spatially-varying reflectance and environment lighting. Specifically, NeRFactor recovers 3D neural fields of surface normals, light visibility, albedo, and Bidirectional Reflectance Distribution Functions (BRDFs) without any supervision, using only a re-rendering loss, simple smoothness priors, and a data-driven BRDF prior learned from real-world BRDF measurements. By explicitly modeling light visibility, NeRFactor is able to separate shadows from albedo and synthesize realistic soft or hard shadows under arbitrary lighting conditions. NeRFactor is able to recover convincing 3D models for free-viewpoint relighting in this challenging and underconstrained capture setup for both synthetic and real scenes. Qualitative and quantitative experiments show that NeRFactor outperforms classic and deep learning-based state of the art across various tasks. 
Our videos, code, and data are available at people.csail.mit.edu/xiuming/projects/nerfactor/.\n  - [NeRF++: Analyzing and Improving Neural Radiance Fields](https://arxiv.org/abs/2010.07492) | [***``[code]``***](https://github.com/Kai-46/nerfplusplus)\n    > Neural Radiance Fields (NeRF) achieve impressive view synthesis results for a variety of capture settings, including 360 capture of bounded scenes and forward-facing capture of bounded and unbounded scenes. NeRF fits multi-layer perceptrons (MLPs) representing view-invariant opacity and view-dependent color volumes to a set of training images, and samples novel views based on volume rendering techniques. In this technical report, we first remark on radiance fields and their potential ambiguities, namely the shape-radiance ambiguity, and analyze NeRF's success in avoiding such ambiguities. Second, we address a parametrization issue involved in applying NeRF to 360 captures of objects within large-scale, unbounded 3D scenes. Our method improves view synthesis fidelity in this challenging scenario. Code is available at this https URL.\n  - [GIRAFFE: Representing Scenes as Compositional Generative Neural Feature Fields, CVPR2021(oral)](https://arxiv.org/abs/2011.12100) | [***``[code]``***](https://github.com/autonomousvision/giraffe)\n    > Deep generative models allow for photorealistic image synthesis at high resolutions. But for many applications, this is not enough: content creation also needs to be controllable. While several recent works investigate how to disentangle underlying factors of variation in the data, most of them operate in 2D and hence ignore that our world is three-dimensional. Further, only few works consider the compositional nature of scenes. Our key hypothesis is that incorporating a compositional 3D scene representation into the generative model leads to more controllable image synthesis. 
Representing scenes as compositional generative neural feature fields allows us to disentangle one or multiple objects from the background as well as individual objects' shapes and appearances while learning from unstructured and unposed image collections without any additional supervision. Combining this scene representation with a neural rendering pipeline yields a fast and realistic image synthesis model. As evidenced by our experiments, our model is able to disentangle individual objects and allows for translating and rotating them in the scene as well as changing the camera pose.\n  - [Object-Centric Neural Scene Rendering](https://shellguo.com/osf/) | [***``[code]``***](https://shellguo.com/osf/)\n    > We present a method for composing photorealistic scenes from captured images of objects. Our work builds upon neural radiance fields (NeRFs), which implicitly model the volumetric density and directionally-emitted radiance of a scene. While NeRFs synthesize realistic pictures, they only model static scenes and are closely tied to specific imaging conditions. This property makes NeRFs hard to generalize to new scenarios, including new lighting or new arrangements of objects. Instead of learning a scene radiance field as a NeRF does, we propose to learn object-centric neural scattering functions (OSFs), a representation that models per-object light transport implicitly using a lighting- and view-dependent neural network. This enables rendering scenes even when objects or lights move, without retraining. Combined with a volumetric path tracing procedure, our framework is capable of rendering both intra- and inter-object light transport effects including occlusions, specularities, shadows, and indirect illumination. 
We evaluate our approach on scene composition and show that it generalizes to novel illumination conditions, producing photorealistic, physically accurate renderings of multi-object scenes.\n  - [Learning Compositional Radiance Fields of Dynamic Human Heads, CVPR2021(oral)](https://ziyanw1.github.io/hybrid_nerf/) | [code]\n    > Photorealistic rendering of dynamic humans is an important ability for telepresence systems, virtual shopping, synthetic data generation, and more. Recently, neural rendering methods, which combine techniques from computer graphics and machine learning, have created high-fidelity models of humans and objects. Some of these methods do not produce results with high-enough fidelity for driveable human models (Neural Volumes) whereas others have extremely long rendering times (NeRF). We propose a novel compositional 3D representation that combines the best of previous methods to produce both higher-resolution and faster results. Our representation bridges the gap between discrete and continuous volumetric representations by combining a coarse 3D-structure-aware grid of animation codes with a continuous learned scene function that maps every position and its corresponding local animation code to its view-dependent emitted radiance and local volume density. Differentiable volume rendering is employed to compute photo-realistic novel views of the human head and upper body as well as to train our novel representation end-to-end using only 2D supervision. In addition, we show that the learned dynamic radiance field can be used to synthesize novel unseen expressions based on a global animation code. 
Our approach achieves state-of-the-art results for synthesizing novel views of dynamic human heads and the upper body.\n  - [Neural Scene Graphs for Dynamic Scenes, CVPR2021(oral)](https://arxiv.org/abs/2011.10379) | [***``[code]``***](https://github.com/princeton-computational-imaging/neural-scene-graphs)\n    > Recent implicit neural rendering methods have demonstrated that it is possible to learn accurate view synthesis for complex scenes by predicting their volumetric density and color supervised solely by a set of RGB images. However, existing methods are restricted to learning efficient representations of static scenes that encode all scene objects into a single neural network, and lack the ability to represent dynamic scenes and decompositions into individual scene objects. In this work, we present the first neural rendering method that decomposes dynamic scenes into scene graphs. We propose a learned scene graph representation, which encodes object transformation and radiance, to efficiently render novel arrangements and views of the scene. To this end, we learn implicitly encoded scenes, combined with a jointly learned latent representation to describe objects with a single implicit function. We assess the proposed method on synthetic and real automotive data, validating that our approach learns dynamic scenes -- only by observing a video of this scene -- and allows for rendering novel photo-realistic views of novel scene compositions with unseen sets of objects at unseen poses.\n  - [Unsupervised Discovery of Object Radiance Fields, ICLR2022](https://arxiv.org/abs/2107.07905) | [code]\n    > We study the problem of inferring an object-centric scene representation from a single image, aiming to derive a representation that explains the image formation process, captures the scene's 3D nature, and is learned without supervision. 
Most existing methods on scene decomposition lack one or more of these characteristics, due to the fundamental challenge in integrating the complex 3D-to-2D image formation process into powerful inference schemes like deep networks. In this paper, we propose unsupervised discovery of Object Radiance Fields (uORF), integrating recent progresses in neural 3D scene representations and rendering with deep inference networks for unsupervised 3D scene decomposition. Trained on multi-view RGB images without annotations, uORF learns to decompose complex scenes with diverse, textured background from a single image. We show that uORF performs well on unsupervised 3D scene segmentation, novel view synthesis, and scene editing on three datasets.\n  - [Learning Object-Compositional Neural Radiance Field for Editable Scene Rendering, ICCV2021](https://zju3dv.github.io/object_nerf/) | [***``[code]``***](https://github.com/zju3dv/object_nerf)\n    > Implicit neural rendering techniques have shown promising results for novel view synthesis. However, existing methods usually encode the entire scene as a whole, which is generally not aware of the object identity and limits the ability to the high-level editing tasks such as moving or adding furniture. In this paper, we present a novel neural scene rendering system, which learns an object-compositional neural radiance field and produces realistic rendering with editing capability for a clustered and real-world scene. Specifically, we design a novel two-pathway architecture, in which the scene branch encodes the scene geometry and appearance, and the object branch encodes each standalone object conditioned on learnable object activation codes. To survive the training in heavily cluttered scenes, we propose a scene-guided training strategy to solve the 3D space ambiguity in the occluded regions and learn sharp boundaries for each object. 
Extensive experiments demonstrate that our system not only achieves competitive performance for static scene novel-view synthesis, but also produces realistic rendering for object-level editing.\n  - [In-Place Scene Labelling and Understanding with Implicit Scene Representation, ICCV2021(oral)](https://shuaifengzhi.com/Semantic-NeRF/) | [***``[code]``***](https://github.com/Harry-Zhi/semantic_nerf/)\n    > Semantic labelling is highly correlated with geometry and radiance reconstruction, as scene entities with similar shape and appearance are more likely to come from similar classes. Recent implicit neural reconstruction techniques are appealing as they do not require prior training data, but the same fully self-supervised approach is not possible for semantics because labels are human-defined properties.\n  - [Editing Conditional Radiance Fields, ICCV2021](http://editnerf.csail.mit.edu/) | [***``[code]``***](https://github.com/stevliu/editnerf)\n    > A neural radiance field (NeRF) is a scene model supporting high-quality view synthesis, optimized per scene. In this paper, we explore enabling user editing of a category-level NeRF - also known as a conditional radiance field - trained on a shape category. Specifically, we introduce a method for propagating coarse 2D user scribbles to the 3D space, to modify the color or shape of a local region. First, we propose a conditional radiance field that incorporates new modular network components, including a shape branch that is shared across object instances. Observing multiple instances of the same category, our model learns underlying part semantics without any supervision, thereby allowing the propagation of coarse 2D user scribbles to the entire 3D region (e.g., chair seat). Next, we propose a hybrid network update strategy that targets specific network components, which balances efficiency and accuracy. 
During user interaction, we formulate an optimization problem that both satisfies the user's constraints and preserves the original object structure. We demonstrate our approach on various editing tasks over three shape datasets and show that it outperforms prior neural editing approaches. Finally, we edit the appearance and shape of a real photograph and show that the edit propagates to extrapolated novel views.\n  - [Editable Free-Viewpoint Video using a Layered Neural Representation, SIGGRAPH2021](https://jiakai-zhang.github.io/st-nerf/) | [***``[code]``***](https://jiakai-zhang.github.io/st-nerf/#code)\n    > Generating free-viewpoint videos is critical for immersive VR/AR experience but recent neural advances still lack the editing ability to manipulate the visual perception for large dynamic scenes. To fill this gap, in this paper we propose the first approach for editable photo-realistic free-viewpoint video generation for large-scale dynamic scenes using only sparse 16 cameras. The core of our approach is a new layered neural representation, where each dynamic entity including the environment itself is formulated into a space-time coherent neural layered radiance representation called ST-NeRF. Such layered representation supports fully perception and realistic manipulation of the dynamic scene whilst still supporting a free viewing experience in a wide range. In our ST-NeRF, the dynamic entity/layer is represented as continuous functions, which achieves the disentanglement of location, deformation as well as the appearance of the dynamic entity in a continuous and self-supervised manner. We propose a scene parsing 4D label map tracking to disentangle the spatial information explicitly, and a continuous deform module to disentangle the temporal motion implicitly. An object-aware volume rendering scheme is further introduced for the re-assembling of all the neural layers. 
We adopt a novel layered loss and motion-aware ray sampling strategy to enable efficient training for a large dynamic scene with multiple performers, Our framework further enables a variety of editing functions, i.e., manipulating the scale and location, duplicating or retiming individual neural layers to create numerous visual effects while preserving high realism. Extensive experiments demonstrate the effectiveness of our approach to achieve high-quality, photo-realistic, and editable free-viewpoint video generation for dynamic scenes.\n  - [FiG-NeRF: Figure Ground Neural Radiance Fields for 3D Object Category Modelling, 3DV2021](https://fig-nerf.github.io/) | [code]\n    > We investigate the use of Neural Radiance Fields (NeRF) to learn high quality 3D object category models from collections of input images. In contrast to previous work, we are able to do this whilst simultaneously separating foreground objects from their varying backgrounds. We achieve this via a 2-component NeRF model, FiG-NeRF, that prefers explanation of the scene as a geometrically constant background and a deformable foreground that represents the object category. We show that this method can learn accurate 3D object category models using only photometric supervision and casually captured images of the objects. Additionally, our 2-part decomposition allows the model to perform accurate and crisp amodal segmentation. We quantitatively evaluate our method with view synthesis and image fidelity metrics, using synthetic, lab-captured, and in-the-wild data. 
Our results demonstrate convincing 3D object category modelling that exceed the performance of existing methods.\n  - [NeRF-Tex: Neural Reflectance Field Textures, EGSR2021](https://developer.nvidia.com/blog/nvidia-research-nerf-tex-neural-reflectance-field-textures/) | [***``[code]``***](https://github.com/hbaatz/nerf-tex)\n    > We investigate the use of neural fields for modeling diverse mesoscale structures, such as fur, fabric, and grass. Instead of using classical graphics primitives to model the structure, we propose to employ a versatile volumetric primitive represented by a neural reflectance field (NeRF-Tex), which jointly models the geometry of the material and its response to lighting. The NeRF-Tex primitive can be instantiated over a base mesh to “texture” it with the desired meso and microscale appearance. We condition the reflectance field on user-defined parameters that control the appearance. A single NeRF texture thus captures an entire space of reflectance fields rather than one specific structure. This increases the gamut of appearances that can be modeled and provides a solution for combating repetitive texturing artifacts. We also demonstrate that NeRF textures naturally facilitate continuous level-of-detail rendering. Our approach unites the versatility and modeling power of neural networks with the artistic control needed for precise modeling of virtual scenes. 
While all our training data is currently synthetic, our work provides a recipe that can be further extended to extract complex, hard-to-model appearances from real images.\n  - [Mip-NeRF: A Multiscale Representation for Anti-Aliasing Neural Radiance Fields, ICCV2021(oral)](https://jonbarron.info/mipnerf/) | [***``[code]``***](https://github.com/google/mipnerf)\n    > The rendering procedure used by neural radiance fields (NeRF) samples a scene with a single ray per pixel and may therefore produce renderings that are excessively blurred or aliased when training or testing images observe scene content at different resolutions. The straightforward solution of supersampling by rendering with multiple rays per pixel is impractical for NeRF, because rendering each ray requires querying a multilayer perceptron hundreds of times. Our solution, which we call \"mip-NeRF\" (à la \"mipmap\"), extends NeRF to represent the scene at a continuously-valued scale. By efficiently rendering anti-aliased conical frustums instead of rays, mip-NeRF reduces objectionable aliasing artifacts and significantly improves NeRF's ability to represent fine details, while also being 7% faster than NeRF and half the size. Compared to NeRF, mip-NeRF reduces average error rates by 17% on the dataset presented with NeRF and by 60% on a challenging multiscale variant of that dataset that we present. mip-NeRF is also able to match the accuracy of a brute-force supersampled NeRF on our multiscale dataset while being 22x faster.\n  - [UNISURF: Unifying Neural Implicit Surfaces and Radiance Fields for Multi-View Reconstruction, ICCV2021(oral)](https://arxiv.org/abs/2104.10078) | [***``[code]``***](https://github.com/autonomousvision/unisurf)\n    > Neural implicit 3D representations have emerged as a powerful paradigm for reconstructing surfaces from multi-view images and synthesizing novel views. Unfortunately, existing methods such as DVR or IDR require accurate per-pixel object masks as supervision. 
At the same time, neural radiance fields have revolutionized novel view synthesis. However, NeRF's estimated volume density does not admit accurate surface reconstruction. Our key insight is that implicit surface models and radiance fields can be formulated in a unified way, enabling both surface and volume rendering using the same model. This unified perspective enables novel, more efficient sampling procedures and the ability to reconstruct accurate surfaces without input masks. We compare our method on the DTU, BlendedMVS, and a synthetic indoor dataset. Our experiments demonstrate that we outperform NeRF in terms of reconstruction quality while performing on par with IDR without requiring masks.\n  - [NeuS: Learning Neural Implicit Surfaces by Volume Rendering for Multi-view Reconstruction, NeurIPS2021](https://arxiv.org/abs/2106.10689) | [***``[code]``***](https://github.com/Totoro97/NeuS)\n    > We present a novel neural surface reconstruction method, called NeuS, for reconstructing objects and scenes with high fidelity from 2D image inputs. Existing neural surface reconstruction approaches, such as DVR and IDR, require foreground mask as supervision, easily get trapped in local minima, and therefore struggle with the reconstruction of objects with severe self-occlusion or thin structures. Meanwhile, recent neural methods for novel view synthesis, such as NeRF and its variants, use volume rendering to produce a neural scene representation with robustness of optimization, even for highly complex objects. However, extracting high-quality surfaces from this learned implicit representation is difficult because there are not sufficient surface constraints in the representation. In NeuS, we propose to represent a surface as the zero-level set of a signed distance function (SDF) and develop a new volume rendering method to train a neural SDF representation. We observe that the conventional volume rendering method causes inherent geometric errors (i.e. 
bias) for surface reconstruction, and therefore propose a new formulation that is free of bias in the first order of approximation, thus leading to more accurate surface reconstruction even without the mask supervision. Experiments on the DTU dataset and the BlendedMVS dataset show that NeuS outperforms the state-of-the-arts in high-quality surface reconstruction, especially for objects and scenes with complex structures and self-occlusion.\n  - [Volume Rendering of Neural Implicit Surfaces, NeurIPS2021](https://arxiv.org/abs/2106.12052) | [code]\n    > Neural volume rendering became increasingly popular recently due to its success in synthesizing novel views of a scene from a sparse set of input images. So far, the geometry learned by neural volume rendering techniques was modeled using a generic density function. Furthermore, the geometry itself was extracted using an arbitrary level set of the density function leading to a noisy, often low fidelity reconstruction. The goal of this paper is to improve geometry representation and reconstruction in neural volume rendering. We achieve that by modeling the volume density as a function of the geometry. This is in contrast to previous work modeling the geometry as a function of the volume density. In more detail, we define the volume density function as Laplace's cumulative distribution function (CDF) applied to a signed distance function (SDF) representation. This simple density representation has three benefits: (i) it provides a useful inductive bias to the geometry learned in the neural volume rendering process; (ii) it facilitates a bound on the opacity approximation error, leading to an accurate sampling of the viewing ray. Accurate sampling is important to provide a precise coupling of geometry and radiance; and (iii) it allows efficient unsupervised disentanglement of shape and appearance in volume rendering. 
Applying this new density representation to challenging scene multiview datasets produced high quality geometry reconstructions, outperforming relevant baselines. Furthermore, switching shape and appearance between scenes is possible due to the disentanglement of the two.\n  - [NerfingMVS: Guided Optimization of Neural Radiance Fields for Indoor Multi-view Stereo, ICCV2021(oral)](https://arxiv.org/abs/2109.01129) | [***``[code]``***](https://github.com/weiyithu/NerfingMVS)\n    > In this work, we present a new multi-view depth estimation method that utilizes both conventional SfM reconstruction and learning-based priors over the recently proposed neural radiance fields (NeRF). Unlike existing neural network based optimization method that relies on estimated correspondences, our method directly optimizes over implicit volumes, eliminating the challenging step of matching pixels in indoor scenes. The key to our approach is to utilize the learning-based priors to guide the optimization process of NeRF. Our system firstly adapts a monocular depth network over the target scene by finetuning on its sparse SfM reconstruction. Then, we show that the shape-radiance ambiguity of NeRF still exists in indoor environments and propose to address the issue by employing the adapted depth priors to monitor the sampling process of volume rendering. Finally, a per-pixel confidence map acquired by error computation on the rendered image can be used to further improve the depth quality. Experiments show that our proposed framework significantly outperforms state-of-the-art methods on indoor scenes, with surprising findings presented on the effectiveness of correspondence-based optimization and NeRF-based optimization over the adapted depth priors. 
In addition, we show that the guided optimization scheme does not sacrifice the original synthesis capability of neural radiance fields, improving the rendering quality on both seen and novel views.\n  - [3D Neural Scene Representations for Visuomotor Control, CoRL2021(oral)](https://3d-representation-learning.github.io/nerf-dy/) | [code]\n    > Humans have a strong intuitive understanding of the 3D environment around us. The mental model of the physics in our brain applies to objects of different materials and enables us to perform a wide range of manipulation tasks that are far beyond the reach of current robots. In this work, we desire to learn models for dynamic 3D scenes purely from 2D visual observations. Our model combines Neural Radiance Fields (NeRF) and time contrastive learning with an autoencoding framework, which learns viewpoint-invariant 3D-aware scene representations. We show that a dynamics model, constructed over the learned representation space, enables visuomotor control for challenging manipulation tasks involving both rigid bodies and fluids, where the target is specified in a viewpoint different from what the robot operates on. When coupled with an auto-decoding framework, it can even support goal specification from camera viewpoints that are outside the training distribution. We further demonstrate the richness of the learned 3D dynamics model by performing future prediction and novel view synthesis. Finally, we provide detailed ablation studies regarding different system designs and qualitative analysis of the learned representations.\n  - [Vision-Only Robot Navigation in a Neural Radiance World](https://arxiv.org/abs/2110.00168) | [code]\n    > Neural Radiance Fields (NeRFs) have recently emerged as a powerful paradigm for the representation of natural, complex 3D scenes. NeRFs represent continuous volumetric density and RGB values in a neural network, and generate photo-realistic images from unseen camera viewpoints through ray tracing. 
We propose an algorithm for navigating a robot through a 3D environment represented as a NeRF using only an on-board RGB camera for localization. We assume the NeRF for the scene has been pre-trained offline, and the robot's objective is to navigate through unoccupied space in the NeRF to reach a goal pose. We introduce a trajectory optimization algorithm that avoids collisions with high-density regions in the NeRF based on a discrete time version of differential flatness that is amenable to constraining the robot's full pose and control inputs. We also introduce an optimization based filtering method to estimate 6DoF pose and velocities for the robot in the NeRF given only an onboard RGB camera. We combine the trajectory planner with the pose filter in an online replanning loop to give a vision-based robot navigation pipeline. We present simulation results with a quadrotor robot navigating through a jungle gym environment, the inside of a church, and Stonehenge using only an RGB camera. We also demonstrate an omnidirectional ground robot navigating through the church, requiring it to reorient to fit through the narrow gap. Videos of this work can be found at this https URL .\n"
  },
  {
    "path": "docs/weekly_nerf_cn.md",
    "content": "\n每周分类神经辐射场 ![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)\n==========================================================================================================================\n\n NeRF研究QQ大群（300+成员）：706949479 \n## 按类别筛选: \n [全部](./weekly_nerf_cn.md) | [动态](./classified_weekly_nerf_cn/dynamic.md) | [编辑](./classified_weekly_nerf_cn/editing.md) | [快速](./classified_weekly_nerf_cn/fast.md) | [泛化](./classified_weekly_nerf_cn/generalization.md) | [人体](./classified_weekly_nerf_cn/human.md) | [视频](./classified_weekly_nerf_cn/video.md) | [光照](./classified_weekly_nerf_cn/lighting.md) | [重建](./classified_weekly_nerf_cn/reconstruction.md) | [纹理](./classified_weekly_nerf_cn/texture.md) | [语义](./classified_weekly_nerf_cn/semantic.md) | [姿态-SLAM](./classified_weekly_nerf_cn/pose-slam.md) | [其他](./classified_weekly_nerf_cn/others.md) \n## Dec27 - Jan3, 2023\n  - [使用基于体素的轨迹感知预训练增强无人机跟踪, RAL2022](https://ieeexplore.ieee.org/abstract/document/10015867) | [code]\n    > 基于 Siamese 网络的目标跟踪显着提升了高度机动无人机 (UAV) 的自动化能力。 然而，前沿的跟踪框架往往依赖于模板匹配，这使得它在面对连续帧中的多个对象视图时陷入困境。 此外，一般的图像级预训练主干可能会过度适应整体表示，导致在无人机跟踪中学习对象级属性时出现错位。 为了解决这些问题，这项工作提出了 TRTrack，这是一个全面的框架，可以充分利用无人机跟踪的立体表示。 具体来说，提出了一种新的预训练范式方法。 通过轨迹感知重建训练（TRT），在不增加任何参数的情况下，增强了主干提取立体结构特征的能力。 因此，提出了一种创新的分层自注意力 Transformer 来捕获局部细节信息和全局结构知识。 为了优化相关图，我们提出了一种新的空间相关细化（SCR）模块，它提高了对远程空间依赖性进行建模的能力。 三个具有挑战性的无人机基准测试的综合实验表明，所提出的 TRTrack 在精度和效率方面都实现了卓越的无人机跟踪性能。 现实环境中的定量测试充分证明了我们工作的有效性。\n## Dec25 - Dec31, 2022\n  - [用于高质量视图合成的稀疏 RGB-D 图像的神经辐射场, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9999509) | [code]\n    > 最近提出的神经辐射场 (NeRF) 使用作为多层感知器 (MLP) 制定的连续函数来模拟 3D 场景的外观和几何形状。 这使得新视图的逼真合成成为可能，即使对于具有视图依赖外观的场景也是如此。 此后，许多后续工作以不同方式扩展了 NeRF。 然而，该方法的一个基本限制仍然是它需要从密集放置的视点捕获大量图像以进行高质量合成，并且当捕获的视图数量不足时，结果的质量会迅速下降。 为了解决这个问题，我们提出了一种新的基于 NeRF 的框架，该框架能够仅使用一组稀疏的 RGB-D 图像进行高质量的视图合成，这些图像可以在当前的消费设备上使用相机和 LiDAR 传感器轻松捕获。 首先，从捕获的 RGB-D 图像重建场景的几何代理。 然后可以使用重建场景的渲染以及精确的相机参数来预训练网络。 
最后，使用少量真实捕获的图像对网络进行微调。 我们进一步引入了一个补丁鉴别器，以在微调期间在新颖的视图下监督网络，并在提高合成质量之前引入 3D 颜色。 我们证明了我们的方法可以从少至 6 个 RGB-D 图像生成 3D 场景的任意新颖视图。 大量实验表明，与现有的基于 NeRF 的方法相比，我们的方法有所改进，包括旨在减少输入图像数量的方法。\n## Dec18 - Dec24, 2022\n  - [从神经辐射场中移除对象](https://arxiv.org/abs/2212.11966) | [code]\n    > 神经辐射场 (NeRFs) 正在成为一种无处不在的场景表示，可实现新颖的视图合成。 NeRF 将越来越多地与其他人共享。 不过，在共享 NeRF 之前，可能需要删除个人信息或难看的物体。 使用当前的 NeRF 编辑框架不容易实现这种删除。 我们提出了一个框架，用于从 RGB-D 序列创建的 NeRF 表示中删除对象。 我们的 NeRF 修复方法利用了最近在 2D 图像修复方面的工作，并以用户提供的掩码为指导。 我们的算法以基于置信度的视图选择程序为基础。 它选择在创建 NeRF 时使用哪些单独的 2D 修复图像，以便生成的修复 NeRF 是 3D 一致的。 我们表明我们的 NeRF 编辑方法对于以多视图连贯方式合成合理的修复是有效的。 我们使用一个新的且仍然具有挑战性的数据集来验证我们的方法来完成 NeRF 修复任务。\n  - [iLabel：揭示神经领域中的对象, RAL2022](https://ieeexplore.ieee.org/abstract/document/9996585) | [code]\n    > 经过自我监督训练以有效表示 3D 场景的几何形状和颜色的神经场往往会自动将其分解为连贯且准确的类似物体的区域，这些区域可以通过稀疏标记交互来揭示以产生 3D 语义场景分割。 我们的实时 iLabel 系统从手持式 RGB-D 相机获取输入，需要零先验训练数据，并以“开放集”方式工作，语义类别由用户即时定义。 iLabel 的底层模型是一个简单的多层感知器 (MLP)，从头开始训练以学习单个 3D 场景的神经表示。 该模型不断更新并实时可视化，使用户能够专注于交互以实现极其高效的语义分割。 一个房间规模的场景可以准确地标记为 10 多个语义类别，只需大约 100 次点击，耗时不到 5 分钟。 定量标记的准确性随着点击次数的增加而显着增加，并迅速超越标准的预训练语义分割方法。 我们还展示了 iLabel 的分层标签变体和“免提”模式，用户只需为自动生成的位置提供标签名称。\n  - [紧凑型神经辐射场的掩蔽小波表示](https://arxiv.org/abs/2212.09069) | [***``[code]``***](https://github.com/daniel03c1/masked_wavelet_nerf)\n    > 神经辐射场 (NeRF) 已经证明了神经渲染中基于坐标的神经表示（神经场或隐式神经表示）的潜力。 然而，使用多层感知器 (MLP) 来表示 3D 场景或对象需要大量的计算资源和时间。 最近有关于如何通过使用额外的数据结构（例如网格或树）来减少这些计算效率低下的研究。 尽管性能很有前途，但显式数据结构需要大量内存。 在这项工作中，我们提出了一种在不损害具有附加数据结构的优势的情况下减小大小的方法。 详细地说，我们建议在基于网格的神经场上使用小波变换。 基于网格的神经场是为了快速收敛，而其效率已经在高性能标准编解码器中得到证明的小波变换是为了提高网格的参数效率。 此外，为了在保持重建质量的同时实现更高的网格系数稀疏性，我们提出了一种新颖的可训练掩蔽方法。 实验结果表明，非空间网格系数，例如小波系数，能够获得比空间网格系数更高的稀疏度，从而产生更紧凑的表示。 通过我们提出的掩码和压缩管道，我们在 2 MB 的内存预算内实现了最先进的性能。 我们的代码可通过此 https 网址获得。\n## Dec11 - Dec17, 2022\n  - [NeRF-Art：文本驱动的神经辐射场程式化](https://arxiv.org/abs/2212.08070) | [***``[code]``***](https://cassiepython.github.io/nerfart/)\n    > 作为 3D 场景的强大表示，神经辐射场 (NeRF) 可以从多视图图像中合成高质量的新视图。 然而，对 NeRF 进行样式化仍然具有挑战性，尤其是在模拟外观和几何形状同时发生变化的文本引导样式时。 在本文中，我们介绍了 NeRF-Art，这是一种文本引导的 NeRF 
风格化方法，它通过简单的文本提示来操纵预训练的 NeRF 模型的风格。 与以前缺乏足够的几何变形和纹理细节或需要网格来指导风格化的方法不同，我们的方法可以将 3D 场景转换为以所需几何形状和外观变化为特征的目标样式，而无需任何网格引导。 这是通过引入一种新颖的全局-局部对比学习策略，结合方向约束来同时控制目标风格的轨迹和强度来实现的。 此外，我们采用权重正则化方法来有效抑制在几何样式化过程中转换密度场时容易出现的混浊伪影和几何噪声。 通过对各种风格的广泛实验，我们证明了我们的方法在单视图风格化质量和跨视图一致性方面是有效且稳健的。 代码和更多结果可以在我们的项目页面中找到：这个 https URL。\n## Dec4 - Dec10, 2022\n  - [4K-NeRF：超高分辨率下的高保真神经辐射场](https://arxiv.org/abs/2212.04701) | [***``[code]``***](https://github.com/frozoul/4K-NeRF)\n    > 在本文中，我们提出了一个新颖而有效的框架，名为 4K-NeRF，以神经辐射场 (NeRF) 的方法为基础，在超高分辨率的具有挑战性的场景中追求高保真视图合成。 基于 NeRF 的方法的渲染过程通常依赖于像素方式，在这种方式中，射线（或像素）在训练和推理阶段都被独立处理，限制了其描述细微细节的表现能力，尤其是在提升到极高的分辨率时。 我们通过更好地探索光线相关性来解决这个问题，以增强受益于使用几何感知局部上下文的高频细节。 特别是，我们使用视图一致编码器在较低分辨率空间中有效地建模几何信息，并通过视图一致解码器恢复精细细节，条件是编码器估计的光线特征和深度。 联合训练与基于补丁的采样进一步促进了我们的方法，将来自面向感知的正则化的监督纳入像素明智的损失之外。 与现代 NeRF 方法的定量和定性比较表明，我们的方法可以显着提高渲染质量以保留高频细节，在 4K 超高分辨率场景下实现最先进的视觉质量。 代码可在 \\url{this https URL}\n  - [图像生成器的扩散引导域自适应](https://arxiv.org/abs/2212.04473) | [code]\n    > 能否将文本到图像扩散模型用作训练目标，让 GAN 生成器适应另一个领域？ 在本文中，我们展示了无分类器指导可以用作评论家，并使生成器能够从大规模文本到图像扩散模型中提取知识。 生成器可以有效地转移到文本提示指示的新域中，而无需访问目标域中的真实样本。 我们通过大量实验证明了我们方法的有效性和可控性。 尽管没有经过训练来最小化 CLIP 损失，但我们的模型在短提示上获得了同样高的 CLIP 分数和显着降低的 FID，并且在长而复杂的提示上在定性和定量上都优于基线。 据我们所知，所提出的方法是首次尝试将大规模预训练扩散模型和蒸馏采样结合起来用于文本驱动的图像生成器域自适应，并提供了以前无法实现的质量。 此外，我们将我们的工作扩展到基于 3D 风格的生成器和 DreamBooth 指南。\n  - [Ref-NPR：基于参考的非真实感辐射场](https://arxiv.org/abs/2212.02766) | [code]\n    > 现有的 3D 场景风格化方法采用任意风格参考来将纹理和颜色作为风格进行传输，而无需建立有意义的语义对应关系。 我们提出了基于参考的非真实感辐射场，即 Ref-NPR。 它是一种可控的场景风格化方法，利用辐射场对 3D 场景进行风格化，并以单个风格化的 2D 视图作为参考。 为了获得不错的结果，我们提出了一种基于程式化参考视图的光线配准过程，以在新颖的视图中获得伪光线监督，并利用内容图像中的语义对应来填充具有感知相似风格的遮挡区域。 结合这些操作，Ref-NPR 使用单个参考生成非真实感和连续的新颖视图序列，同时在遮挡区域获得合理的程式化。 实验表明，Ref-NPR 在视觉质量和语义对应方面明显优于其他场景和视频风格化方法。 代码和数据将公开。\n  - [NeRDi：以语言引导扩散作为一般图像先验的单视图 NeRF 合成](https://arxiv.org/abs/2212.03267) | [code]\n    > 2D 到 3D 重建是一个病态问题，但由于人类多年来积累的 3D 世界先验知识，因此擅长解决这个问题。 受此观察的驱动，我们提出了 NeRDi，这是一种单视图 NeRF 合成框架，具有来自 2D 扩散模型的一般图像先验。 将单视图重建制定为图像条件 3D 生成问题，我们通过在输入视图约束下使用预训练图像扩散模型最小化其任意视图渲染上的扩散损失来优化 NeRF 表示。 
我们利用现成的视觉语言模型，并引入两部分语言指导作为扩散模型的条件输入。 这本质上有助于提高多视图内容的一致性，因为它缩小了以单视图输入图像的语义和视觉特征为条件的一般图像先验范围。 此外，我们引入了基于估计深度图的几何损失，以正则化 NeRF 的底层 3D 几何。 DTU MVS 数据集上的实验结果表明，与在此数据集上训练的现有方法相比，我们的方法可以合成更高质量的新视图。 我们还展示了我们在野外图像的零样本 NeRF 合成中的普遍性。\n  - [GARF：几何感知广义神经辐射场](https://arxiv.org/abs/2212.02280) | [code]\n    > 神经辐射场 (NeRF) 彻底改变了自由视点渲染任务，并取得了令人瞩目的成果。 然而，效率和准确性问题阻碍了其广泛应用。 为了解决这些问题，我们提出了几何感知广义神经辐射场 (GARF) 和几何感知动态采样 (GADS) 策略，以在不进行逐场景优化的情况下对未见场景执行实时新颖视图渲染和无监督深度估计。 与大多数现有的广义 NeRF 不同，我们的框架仅使用少量输入图像就可以在像素尺度和几何尺度上推断出看不见的场景。 更具体地说，我们的方法通过编码器-解码器结构和有助于避免遮挡的点级可学习多视图特征融合模块来学习新视图合成的共同属性。 为了在广义模型中保留场景特征，我们引入了一个无监督深度估计模块来推导粗几何，将光线采样间隔缩小到估计表面的邻近空间，并在期望最大位置采样，构成几何感知动态采样策略（ GADS）。 此外，我们引入了多级语义一致性损失 (MSC) 来帮助提供更多信息的表示学习。 对室内和室外数据集的大量实验表明，与最先进的广义 NeRF 方法相比，GARF 将样本减少了 25% 以上，同时提高了渲染质量和 3D 几何估计。\n  - [用于相机重定位的快速轻量级场景回归器](https://arxiv.org/abs/2212.01830) | [***``[code]``***](https://github.com/aislab/feat2map)\n    > 涉及先前 3D 重建的相机重定位在许多混合现实和机器人应用中起着至关重要的作用。 对于一些存储和/或通信带宽有限的应用程序，直接根据预建 3D 模型估计相机姿势可能非常昂贵。 尽管最近的场景和绝对姿态回归方法在有效的相机定位方面变得流行，但它们中的大多数都是计算资源密集型的，并且难以获得具有高精度约束的实时推理。 本研究提出了一种简单的场景回归方法，只需要一个多层感知器网络来映射场景坐标，即可实现准确的相机姿态估计。 所提出的方法使用稀疏描述符来回归场景坐标，而不是密集的 RGB 图像。 使用稀疏特征有几个优点。 首先，拟议的回归网络比以前的研究报告的要小得多。 这使我们的系统高效且可扩展。 其次，预建的 3D 模型提供了最可靠和稳健的 2D-3D 匹配。 因此，向它们学习可以导致对等效特征的认识并显着提高泛化性能。 提供了对我们的方法的详细分析和使用现有数据集的广泛评估，以支持所提出的方法。 可在此 https URL 获取实施细节\n## Nov27 - Dec3, 2022\n  - [StegaNeRF：在神经辐射场中嵌入不可见信息](https://arxiv.org/abs/2212.01602) | [***``[code]``***](https://github.com/XGGNet/StegaNeRF)\n    > 神经渲染的最新进展意味着通过共享 NeRF 模型权重广泛分布视觉数据的未来。 然而，虽然常见的视觉数据（图像和视频）具有明确或巧妙地嵌入所有权或版权信息的标准方法，但对于新兴的 NeRF 格式，该问题仍未得到探索。 我们介绍了 StegaNeRF，这是一种在 NeRF 渲染中嵌入隐写信息的方法。 我们设计了一个优化框架，允许从 NeRF 渲染的图像中准确提取隐藏信息，同时保留其原始视觉质量。 我们在几个潜在的部署场景下对我们的方法进行了实验评估，并进一步讨论了通过我们的分析发现的见解。 StegaNeRF 标志着对将可定制、不可察觉和可恢复的信息灌输到 NeRF 渲染的新问题的初步探索，同时对渲染图像的影响最小。 项目页面：此 https 网址。\n  - [QFF：神经场表示的量化傅立叶特征](https://arxiv.org/abs/2212.00914) | [code]\n    > 多层感知器 (MLP) 学习高频的速度很慢。 最近的方法对空间箱中的特征进行编码以提高学习细节的速度，但是以更大的模型尺寸和连续性损失为代价。 相反，我们建议在通常用于位置编码的傅里叶特征的容器中对特征进行编码。 我们称这些为量化傅立叶特征 
(QFF)。 作为一种自然的多分辨率和周期性表示，我们的实验表明，使用 QFF 可以为多种应用带来更小的模型尺寸、更快的训练和更高质量的输出，包括神经图像表示 (NIR)、神经辐射场 (NeRF) 和符号距离函数 (SDF) 建模。 QFF 易于编码，计算速度快，并且可以作为许多神经场表示之外的简单补充。\n  - [3D-TOGO：走向文本引导的跨类别 3D 对象生成, AAAI2023](https://arxiv.org/abs/2212.01103) | [code]\n    > 文本引导的 3D 对象生成旨在生成由用户定义的标题描述的 3D 对象，这为可视化我们想象的内容铺平了道路。 尽管一些工作致力于解决这一具有挑战性的任务，但这些工作要么使用一些明确的 3D 表示（例如，网格），这些表示缺乏纹理并且需要后期处理来渲染照片般逼真的视图； 或者需要对每个案例进行单独耗时的优化。 在这里，我们首次尝试通过新的 3D-TOGO 模型实现通用文本引导的跨类别 3D 对象生成，该模型集成了文本到视图生成模块和视图到 3D 生成模块。 文本到视图生成模块旨在生成给定输入字幕的目标 3D 对象的不同视图。 提出了先验指导、标题指导和视图对比学习，以实现更好的视图一致性和标题相似性。 同时，views-to-3D 生成模块采用 pixelNeRF 模型，以从先前生成的视图中获取隐式 3D 神经表示。 我们的 3D-TOGO 模型以具有良好纹理的神经辐射场形式生成 3D 对象，并且不需要对每个单独的字幕进行时间成本优化。 此外，3D-TOGO可以通过输入的字幕控制生成的3D对象的类别、颜色和形状。 在最大的 3D 对象数据集（即 ABO）上进行了大量实验，以验证 3D-TOGO 可以根据 PSNR、SSIM、LPIPS 和 CLIP 等 98 个不同类别的输入字幕更好地生成高质量的 3D 对象。 得分，与文本 NeRF 和 Dreamfields 相比。\n  - [LatentSwap3D：3D 图像 GAN 的语义编辑](https://arxiv.org/abs/2212.01381) | [***``[code]``***](https://github.com/enisimsar/latentswap3d)\n    > 最近的 3D 感知 GAN 依靠体积渲染技术来解开物体的姿势和外观，事实上生成整个 3D 体积而不是从潜在代码生成单视图 2D 图像。 复杂的图像编辑任务可以在基于标准 2D 的 GAN（例如，StyleGAN 模型）中作为对潜在维度的操作来执行。 然而，据我们所知，对于 3D 感知 GAN 模型，仅部分探索了类似的属性。 这项工作旨在通过展示现有方法的局限性并提出 LatentSwap3D 来填补这一空白，LatentSwap3D 是一种与模型无关的方法，旨在在预训练的 3D 感知 GAN 的潜在空间中启用属性编辑。 我们首先根据随机森林分类器的特征重要性排名，确定控制目标属性的模型的潜在空间中最相关的维度。 然后，为了应用转换，我们将正在编辑的图像的前 K 个最相关的潜在维度与显示所需属性的图像交换。 尽管它很简单，但 LatentSwap3D 以一种分离的方式提供了卓越的语义编辑，并且在质量和数量上都优于其他方法。 我们在各种 3D 感知生成模型（如 pi-GAN、GIRAFFE、StyleSDF、MVCGAN、EG3D 和 VolumeGAN）以及各种数据集（如 FFHQ、AFHQ、Cats、MetFaces 和 CompCars）上展示了我们的语义编辑方法。 可以找到项目页面：\\url{this https URL}。\n  - [DiffRF：渲染引导的 3D 辐射场扩散](https://arxiv.org/abs/2212.01206) | [code]\n    > 我们介绍了 DiffRF，这是一种基于去噪扩散概率模型的 3D 辐射场合成新方法。 虽然现有的基于扩散的方法对图像、潜在代码或点云数据进行操作，但我们是第一个直接生成体积辐射场的方法。 为此，我们提出了一种直接在显式体素网格表示上运行的 3D 去噪模型。 然而，由于从一组姿势图像生成的辐射场可能不明确且包含伪影，因此获取地面真实辐射场样本并非易事。 我们通过将去噪公式与渲染损失配对来解决这一挑战，使我们的模型能够学习有利于良好图像质量的偏差先验，而不是试图复制像浮动伪影这样的拟合错误。 与 2D 扩散模型相比，我们的模型学习多视图一致先验，支持自由视图合成和准确的形状生成。 与 3D GAN 相比，我们基于扩散的方法自然可以在推理时启用条件生成，例如掩蔽完成或单视图 3D 合成。\n  - 
[NeuWigs：用于体积头发捕捉和动画的神经动态模型](https://arxiv.org/abs/2212.00613) | [code]\n    > 人发的捕捉和动画是为虚拟现实创建逼真化身的两个主要挑战。 这两个问题都非常具有挑战性，因为头发具有复杂的几何形状和外观，并且表现出具有挑战性的运动。 在本文中，我们提出了一种两阶段方法，该方法独立于头部对头发进行建模，以数据驱动的方式应对这些挑战。 第一阶段，状态压缩，通过一种新颖的自动编码器作为跟踪器策略，学习包含运动和外观的 3D 头发状态的低维潜在空间。 为了在外观学习中更好地分离头发和头部，我们结合使用多视图头发分割蒙版和可区分的体积渲染器。 第二阶段学习一种新颖的毛发动力学模型，该模型根据发现的潜在代码执行时间毛发转移。 为了在驱动我们的动力学模型时加强稳定性，我们在压缩阶段使用 3D 点云自动编码器来对头发状态进行去噪。 我们的模型在新颖的视图合成方面优于现有技术，并且能够创建新颖的头发动画，而无需依赖头发观察作为驱动信号。 项目页面在此 https URL。\n  - [SparseFusion：蒸馏 View-conditioned Diffusion 用于 3D 重建](https://arxiv.org/abs/2212.00792) | [code]\n    > 我们提出了 SparseFusion，这是一种稀疏视图 3D 重建方法，它统一了神经渲染和概率图像生成方面的最新进展。 现有方法通常建立在具有重新投影特征的神经渲染上，但无法生成看不见的区域或处理大视点变化下的不确定性。 替代方法将其视为（概率）2D 合成任务，虽然它们可以生成似是而非的 2D 图像，但它们无法推断出一致的底层 3D。 然而，我们发现 3D 一致性和概率图像生成之间的这种权衡并不需要存在。 事实上，我们表明几何一致性和生成推理可以在模式搜索行为中互补。 通过从视图条件潜在扩散模型中提取 3D 一致场景表示，我们能够恢复一个合理的 3D 表示，其渲染既准确又逼真。 我们评估了 CO3D 数据集中 51 个类别的方法，并表明它在失真和感知指标方面优于现有方法，用于稀疏视图新视图合成。\n  - [用于快速多视图视频合成的混合神经体素](https://arxiv.org/abs/2212.00190) | [code]\n    > 由于现实世界环境的复杂性和高度动态的运动，从现实世界的多视图输入合成高保真视频具有挑战性。 以前基于神经辐射场的作品已经展示了动态场景的高质量重建。 但是，在真实场景中训练此类模型非常耗时，通常需要数天或数周。 在本文中，我们提出了一种名为 MixVoxels 的新方法，以更好地表示具有快速训练速度和有竞争力的渲染质量的动态场景。 拟议的 MixVoxels 将 4D 动态场景表示为静态和动态体素的混合，并使用不同的网络对其进行处理。 这样，静态体素所需模态的计算可以由轻量级模型处理，这从本质上减少了计算量，特别是对于许多以静态背景为主的日常动态场景。 为了分离这两种体素，我们提出了一个新的变化场来估计每个体素的时间方差。 对于动态体素，我们设计了一种内积时间查询方法来有效地查询多个时间步长，这对于恢复高动态运动至关重要。 因此，通过对输入 300 帧视频的动态场景进行 15 分钟的训练，MixVoxels 实现了比以前的方法更好的 PSNR。 此 https 网址提供代码和训练模型\n  - [Score Jacobian Chaining：为 3D 生成提升预训练的 2D 扩散模型](https://arxiv.org/abs/2212.00774) | [code]\n    > 扩散模型学习预测梯度矢量场。 我们建议对学习到的梯度应用链式法则，并通过可微分渲染器的雅可比矩阵反向传播扩散模型的分数，我们将其实例化为体素辐射场。 此设置将多个摄像机视点的 2D 分数聚合为 3D 分数，并将预训练的 2D 模型重新用于 3D 数据生成。 我们确定了此应用程序中出现的分布不匹配的技术挑战，并提出了一种新颖的估计机制来解决它。 我们在几个现成的扩散图像生成模型上运行我们的算法，包括最近发布的在大规模 LAION 数据集上训练的稳定扩散。\n  - [光场的神经子空间, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9968104) | [code]\n    > 我们引入了一个框架，用于用神经子空间的新概念来紧凑地表示光场内容。 虽然最近提出的神经光场表示通过将光场编码到单个神经网络中实现了很好的压缩结果，但统一设计并未针对光场中展示的复合结构进行优化。 
此外，将光场的每一部分编码到一个网络中对于需要快速传输和解码的应用来说并不理想。 我们认识到这个问题与子空间学习的联系。 我们提出了一种使用几个小型神经网络的方法，专门研究特定光场段的神经子空间。 此外，我们在这些小型网络中提出了一种自适应权重共享策略，提高了参数效率。 实际上，该策略通过利用神经网络的分层结构，能够以协调一致的方式跟踪附近神经子空间之间的相似性。 此外，我们开发了一种软分类技术来提高神经表征的颜色预测准确性。 我们的实验结果表明，我们的方法在各种光场场景上比以前的方法更好地重建了光场。 我们进一步展示了其在具有不规则视点布局和动态场景内容的编码光场上的成功部署。\n  - [3D-LDM：使用潜在扩散模型生成神经隐式 3D 形状](https://arxiv.org/abs/2212.00842) | [code]\n    > 扩散模型在图像生成方面显示出巨大的潜力，在生成多样性方面击败了 GAN，具有可比的图像质量。 然而，它们在 3D 形状上的应用仅限于点或体素表示，这些表示在实践中不能准确地表示 3D 表面。 我们提出了一种用于在自动解码器的潜在空间中运行的 3D 形状的神经隐式表示的扩散模型。 这使我们能够生成多样化和高质量的 3D 表面。 我们还表明，我们可以根据图像或文本调节我们的模型，以使用 CLIP 嵌入实现图像到 3D 生成和文本到 3D 生成。 此外，将噪声添加到现有形状的潜在代码中可以让我们探索形状变化。\n  - [SinGRAF：学习单个场景的 3D 生成辐射场](https://arxiv.org/abs/2211.17260) | [code]\n    > 生成模型在合成逼真的 3D 对象方面显示出巨大的潜力，但它们需要大量的训练数据。 我们介绍了 SinGRAF，这是一种 3D 感知生成模型，使用单个场景的一些输入图像进行训练。 经过训练后，SinGRAF 会生成此 3D 场景的不同实现，在改变场景布局的同时保留输入的外观。 为此，我们以 3D GAN 架构的最新进展为基础，并在训练期间引入了一种新颖的渐进式补丁辨别方法。 通过几个实验，我们证明了 SinGRAF 产生的结果在质量和多样性方面都大大优于最接近的相关作品。\n  - [NeAF：学习用于点法线估计的神经角度场, AAAI2023](https://arxiv.org/abs/2211.16869) | [***``[code]``***](https://github.com/lisj575/NeAF)\n    > 非结构化点云的法线估计是 3D 计算机视觉中的一项重要任务。 当前的方法通过将局部补丁映射到法向量或使用神经网络学习局部表面拟合来取得令人鼓舞的结果。 然而，这些方法不能很好地推广到看不见的场景，并且对参数设置很敏感。 为了解决这些问题，我们提出了一个隐式函数来学习球坐标系中每个点法线周围的角度场，称为神经角度场（NeAF）。 我们不是直接预测输入点的法线，而是预测地面实况法线和随机采样的查询法线之间的角度偏移。 这种策略推动网络观察更多不同的样本，从而以更稳健的方式获得更高的预测精度。 为了在推理时从学习的角度场预测法线，我们在单位球形空间中随机采样查询向量，并将具有最小角度值的向量作为预测法线。 为了进一步利用 NeAF 学到的先验知识，我们建议通过最小化角度偏移来细化预测的法向量。 合成数据和真实扫描的实验结果显示，在广泛使用的基准下，与最先进的技术相比有了显着改进。\n  - [SNAF：具有神经衰减场的稀疏视图 CBCT 重建](https://arxiv.org/abs/2211.17048) | [code]\n    > 锥形束计算机断层扫描（CBCT）已广泛应用于临床实践，尤其是牙科诊所，而捕获时X射线的辐射剂量一直是CBCT成像中长期关注的问题。 已经提出了几项研究工作来从稀疏视图 2D 投影重建高质量的 CBCT 图像，但目前最先进的技术存在伪影和缺乏精细细节的问题。 在本文中，我们提出了通过学习神经衰减场来进行稀疏视图 CBCT 重建的 SNAF，我们发明了一种新颖的视图增强策略来克服稀疏输入视图数据不足带来的挑战。 我们的方法在高重建质量（30+ PSNR）方面实现了卓越的性能，只有 20 个输入视图（比临床收集少 25 倍），优于最先进的技术。 我们进一步进行了综合实验和消融分析，以验证我们方法的有效性。\n  - [NeRFInvertor：用于单次真实图像动画的高保真 NeRF-GAN 反演](https://arxiv.org/abs/2211.17235) | [code]\n    > 基于 Nerf 的生成模型在生成具有一致 3D 
几何形状的高质量图像方面表现出了令人印象深刻的能力。 尽管成功合成了从潜在空间随机采样的假身份图像，但由于所谓的反转问题，采用这些模型生成真实主体的面部图像仍然是一项具有挑战性的任务。 在本文中，我们提出了一种通用方法来对这些 NeRF-GAN 模型进行微调，以便仅通过单个图像实现真实对象的高保真动画。 给定域外真实图像的优化潜代码，我们在渲染图像上使用 2D 损失函数来减少身份差距。 此外，我们的方法利用显式和隐式 3D 正则化，使用优化潜在代码周围的域内邻域样本来消除几何和视觉伪影。 我们的实验证实了我们的方法在跨不同数据集的多个 NeRF-GAN 模型上真实、高保真和 3D 一致的真实面孔动画的有效性。\n  - [使用 RGBXY 导数和最佳传输的可微分渲染, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555479) | [code]\n    > 传统的可微分渲染方法通常很难在逆渲染优化中收敛，尤其是当初始对象和目标对象位置不太接近时。 受拉格朗日流体模拟的启发，我们提出了一种新颖的可微分渲染方法来解决这个问题。 我们将每个屏幕空间像素与像素中心覆盖的可见 3D 几何点相关联，并计算几何点而不是像素的导数。 我们将关联的几何点称为像素的点代理。 对于每个点代理，我们计算其 5D RGBXY 导数，测量其 3D RGB 颜色和 2D 投影屏幕空间位置如何相对于场景参数发生变化。 此外，为了捕获全局和远程对象运动，我们利用基于最佳传输的像素匹配来设计更复杂的损失函数。 我们已经进行了实验来评估我们提出的方法在各种逆向渲染应用程序中的有效性，并证明了与最先进的基线相比更优越的收敛行为。\n  - [用于实时全局照明的高效光探测器, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555452) | [code]\n    > 再现基于物理的全局照明 (GI) 效果一直是许多实时图形应用程序的长期需求。 为了实现这一目标，许多最近的引擎采用了在预计算阶段烘焙的某种形式的光探测器。 不幸的是，由于探针存储、表示或查询的限制，预计算探针产生的 GI 效果相当有限。 在本文中，我们提出了一种基于探针的 GI 渲染的新方法，该方法可以在复杂场景中生成广泛的 GI 效果，包括具有多次反弹的光泽反射。 我们工作背后的关键贡献包括基于梯度的搜索算法和神经图像重建方法。 搜索算法旨在将探针的内容重新投影到任何查询视点，而不会引入视差误差，并快速收敛到最优解。 基于专用神经网络和多个 G 缓冲区的神经图像重建方法试图从由于分辨率有限或（潜在的）探头采样率低而导致的低质量输入中恢复高质量图像。 这种神经方法使光探针的生成变得高效。 此外，采用时间重投影策略和时间损失来提高动画序列的时间稳定性。 由于基于梯度的搜索算法的快速收敛速度和神经网络的轻量级设计，即使对于高分辨率 (1920×1080) 输出，整个流水线也实时运行（>30 帧/秒）。 已经对多个复杂场景进行了广泛的实验，以证明我们的方法优于最先进的方法。\n  - [LaplacianFusion：详细的 3D 衣服人体重建, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555511) | [code]\n    > 我们提出了 LaplacianFusion，这是一种从输入深度或 3D 点云序列重建详细且可控的 3D 穿衣人体形状的新颖方法。 我们方法的关键思想是使用拉普拉斯坐标，即已用于网格编辑的众所周知的微分坐标，来表示输入扫描中包含的局部结构，而不是之前使用的隐式 3D 函数或顶点位移。 我们的方法使用 SMPL 重建一个可控的基础网格，并学习一个表面函数来预测表示基础网格表面细节的拉普拉斯坐标。 对于给定的姿势，我们首先构建并细分一个基础网格，这是一个变形的 SMPL 模板，然后使用表面函数估计网格顶点的拉普拉斯坐标。 姿势的最终重建是通过将估计的拉普拉斯坐标作为一个整体进行整合而获得的。 实验结果表明，我们基于拉普拉斯坐标的方法比以前的方法成功地重建了视觉上更令人愉悦的形状细节。 该方法还支持各种表面细节操作，例如细节传输和增强。\n  - [QuadStream：一种用于新视点重建的基于 Quad 的场景流架构, ToG2022](https://dl.acm.org/doi/abs/10.1145/3550454.3555524) | [code]\n    > 通过网络将渲染的 3D 内容流式传输到手机或 VR/AR 
耳机等瘦客户端设备，将高保真图形带到通常由于热量、功率或成本限制而无法实现的平台。 流式 3D 内容必须以对延迟和潜在网络丢失都具有鲁棒性的表示形式进行传输。 在存在遮挡事件的情况下，传输视频流并重新投影以纠正不断变化的视点失败； 在功率有限的移动 GPU 上无法在客户端流式传输场景几何体和执行高质量渲染。 为了平衡消除遮挡稳健性和最小客户端工作量这两个相互竞争的目标，我们引入了 QuadStream，这是一种新的流媒体内容表示，它通过允许客户端有效地渲染新颖的视图而没有由消除遮挡事件引起的伪影来减少运动到光子的延迟。 受视频编解码器设计的传统宏块方法的启发，我们将从视图单元中的位置看到的场景分解为一系列四边形代理，或来自多个视图的视图对齐四边形。 通过在光栅化 G-Buffer 上操作，我们的方法独立于场景本身的表示； 生成的 QuadStream 是场景的近似几何表示，可以由瘦客户端重建以呈现当前视图和附近的相邻视图。 我们的技术贡献是一种有效的并行四边形生成、合并和打包策略，用于覆盖场景中潜在客户移动的代理视图； 一种打包和编码策略，允许将具有深度信息的掩码四边形作为帧相干流传输； 以及一种高效的渲染方法，用于将我们的 QuadStream 表示渲染为瘦客户端上的全新视图。 我们表明，与视频数据流方法和基于几何的流媒体相比，我们的方法实现了卓越的质量。\n  - [DINER：基于深度感知图像的神经辐射场](https://arxiv.org/abs/2211.16630) | [code]\n    > 我们提出了基于深度感知图像的神经辐射场 (DINER)。 给定一组稀疏的 RGB 输入视图，我们预测深度和特征图以指导重建体积场景表示，使我们能够在新视图下渲染 3D 对象。 具体来说，我们提出了将深度信息纳入特征融合和高效场景采样的新技术。 与之前的最先进技术相比，DINER 实现了更高的合成质量，并且可以处理具有更大视差的输入视图。 这使我们能够在不改变捕获硬件要求的情况下更完整地捕获场景，并最终在新视图合成过程中实现更大的视点变化。 我们通过合成人头和一般物体的新视图来评估我们的方法，并观察到与以前的现有技术相比，定性结果有了显着改善，感知指标也有所增加。 该代码将公开用于研究目的。\n  - [从单目视频重建手持物体, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555401) | [code]\n    > 本文提出了一种从单目视频中重建手持物体的方法。 与许多最近通过训练有素的网络直接预测对象几何形状的方法相比，所提出的方法不需要任何关于对象的先验知识，并且能够恢复更准确和详细的对象几何形状。 关键思想是手部运动自然地提供了对象的多个视图，并且可以通过手部姿势跟踪器可靠地估计该运动。 然后，可以通过解决多视图重建问题来恢复对象几何形状。 我们设计了一种基于隐式神经表示的方法来解决重建问题，并解决手部姿势估计不精确、手部相对运动和小物体的几何优化不足等问题。 我们还提供了一个新收集的具有 3D ground truth 的数据集来验证所提出的方法。 数据集和代码将发布在 https://dihuangdh.github.io/hhor。\n  - [Dr.3D：将 3D GAN 应用于艺术绘画, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555422) | [code]\n    > 虽然 3D GAN 最近展示了多视图一致图像和 3D 形状的高质量合成，但它们主要限于照片般逼真的人像。 本文旨在将 3D GAN 扩展到一种不同但有意义的视觉形式：艺术肖像画。 然而，由于绘图中存在不可避免的几何歧义，将现有的 3D GAN 扩展到绘图具有挑战性。 为了解决这个问题，我们提出了 Dr.3D，这是一种新颖的适应方法，可以将现有的 3D GAN 适应艺术绘画。 Dr.3D 配备了三个新组件来处理几何模糊：变形感知 3D 合成网络、姿势估计和图像合成的交替适应以及几何先验。 实验表明，我们的方法可以成功地将 3D GAN 应用于绘图，并实现多视图一致的绘图语义编辑。\n  - [用于交互式自由视点视频的高效神经辐射场, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555376) | [code]\n    > 本文旨在解决高效制作交互式自由视点视频的挑战。 最近的一些工作为神经辐射场配备了图像编码器，使它们能够跨场景进行泛化。 
在处理动态场景时，他们可以简单地将每个视频帧视为一个单独的场景，并进行新颖的视图合成以生成自由视点视频。 但是，它们的渲染过程很慢，不能支持交互式应用程序。 一个主要因素是他们在推断辐射场时在空白空间中采样大量点。 我们提出了一种称为 ENeRF 的新颖场景表示，用于快速创建交互式自由视点视频。 具体来说，给定一帧的多视图图像，我们首先构建级联成本量来预测场景的粗略几何形状。 粗糙的几何体允许我们在场景表面附近采样几个点，从而显着提高渲染速度。 这个过程是完全可微的，使我们能够从 RGB 图像中共同学习深度预测和辐射场网络。 对多个基准的实验表明，我们的方法表现出有竞争力的性能，同时比以前的可推广辐射场方法至少快 60 倍。\n  - [NeuLighting：使用不受约束的照片集重新照明的自由视点户外场景的神经照明, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555384) | [code]\n    > 我们提出了 NeuLighting，这是一个新的框架，用于从一组稀疏的、不受约束的野外照片集中重新照明自由视点户外场景。 我们的框架将所有场景组件表示为由 MLP 参数化的连续函数，这些函数将 3D 位置和照明条件作为输入和输出反射率以及必要的室外照明属性。 与通常利用具有可控且一致的室内照明的训练图像的对象级重新照明方法不同，我们专注于更具挑战性的室外情况，其中所有图像都是在任意未知照明下捕获的。 我们方法的关键包括将每幅图像的光照压缩为解缠结的潜在向量的神经光照表示，以及一种新的自由视点重新光照方案，该方案对图像间的任意光照变化具有鲁棒性。 光照表示具有压缩性，可以解释各种光照，并且可以很容易地输入到基于查询的 NeuLighting 框架中，从而能够在任何一种新型光照下进行高效的阴影效果评估。 此外，为了产生高质量的投射阴影，我们根据场景几何形状和太阳方向估计太阳能见度图以指示阴影区域。 由于灵活且可解释的神经照明表示，我们的系统支持使用许多不同的照明源进行户外重新照明，包括自然图像、环境地图和延时视频。 新视角和照明下的高保真渲染证明了我们的方法相对于最先进的重新照明解决方案的优越性。\n  - [用于全频着色的轻量级神经基函数, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555386) | [code]\n    > 基函数既提供了紧凑表示的能力，又提供了高效计算的特性。 因此，它们普遍用于渲染以执行全频着色。 然而，包括球谐函数 (SH)、小波和球面高斯函数 (SG) 在内的常用基函数都有其自身的局限性，例如 SH 的低频、小波的旋转不变性以及 SG 不支持多乘积等。 在本文中，我们提出了神经基函数，这是一组隐式和数据驱动的基函数，它规避了所有所需属性的限制。 我们首先引入了一个表示神经网络，它将任何一般的 2D 球面函数（例如环境光照、BRDF 和可见性）作为输入并将其投影到潜在空间上作为我们的神经基函数的系数。 然后，我们设计了几个执行不同类型计算的轻量级神经网络，为我们的基函数提供了不同的计算属性，例如双/三乘积积分和旋转。 我们通过将神经基函数集成到全频着色应用程序中来展示我们的神经基函数的实用性，表明我们的方法不仅在同等质量下实现了比小波高 10 × -40 × 的压缩率，而且还渲染了全频 实时照明效果，没有上述经典基础功能的限制。\n  - [DeepMVSHair：来自稀疏视图的深层头发建模, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550469.3555385) | [code]\n    > 我们提出了 DeepMVSHair，这是第一个基于深度学习的多视图发束重建方法。 我们管道的关键组件是 HairMVSNet，这是一种可区分的神经架构，它将空间头发结构隐含地表示为连续的 3D 头发生长方向场。 具体来说，给定一个 3D 查询点，我们根据观察到的 2D 结构特征确定其占用值和方向。 利用来自每个输入视图的查询点的像素对齐特征，我们利用视图感知转换器编码器将各向异性结构特征聚合为集成表示，该表示被解码以在查询点产生 3D 占用和方向。 HairMVSNet 有效地收集多视图头发结构特征并基于这种隐式表示保留高频细节。 在 HairMVSNet 的指导下，我们的头发生长算法产生的结果忠实于输入的多视图图像。 我们提出了一种新颖的图像引导多视图链变形算法，以进一步丰富建模细节。 
大量实验表明，我们的稀疏视图方法的结果与最先进的密集多视图方法的结果相当，并且明显优于单视图和稀疏视图方法的结果。 此外，我们的方法比以前的多视图头发建模方法快一个数量级。\n  - [一种轻松教授变形金刚多视图几何的方法](https://arxiv.org/abs/2211.15107) | [code]\n    > 变形金刚是强大的视觉学习者，这在很大程度上是因为它们明显缺乏手动指定的先验。 由于 3D 形状和视点的近乎无限可能的变化（需要灵活性）以及射影几何的精确性质（遵守刚性法则），这种灵活性在涉及多视图几何的任务中可能会出现问题。 为了解决这个难题，我们提出了一种“轻触”方法，引导视觉变形金刚学习多视图几何，但允许它们在需要时摆脱束缚。 我们通过使用极线来引导 Transformer 的交叉注意力图来实现这一点，惩罚极线外的注意力值并鼓励沿着这些线的更高注意力，因为它们包含几何上合理的匹配。 与以前的方法不同，我们的建议在测试时不需要任何相机姿势信息。 我们专注于姿势不变的对象实例检索，由于查询和检索图像之间的视点存在巨大差异，因此标准 Transformer 网络在这方面存在困难。 在实验上，我们的方法在对象检索方面优于最先进的方法，而且在测试时不需要姿势信息。\n  - [Fast-SNARF：一种用于关节神经场的快速变形器](https://arxiv.org/abs/2211.15601) | [code]\n    > 神经场彻底改变了刚性场景的 3D 重建和新颖视图合成领域。 使这种方法适用于关节物体（例如人体）的一个关键挑战是对静止姿势（规范空间）和变形空间之间的 3D 位置的变形进行建模。 我们提出了一种新的神经场连接模块 Fast-SNARF，它通过迭代求根找到规范空间和姿势空间之间的准确对应关系。 Fast-SNARF 是我们之前工作 SNARF 功能的直接替代品，同时显着提高了其计算效率。 我们对 SNARF 进行了多项算法和实现改进，产生了 150 倍的加速。 这些改进包括基于体素的对应搜索、预计算线性混合蒙皮函数以及使用 CUDA 内核的高效软件实现。 Fast-SNARF 可以在没有对应的变形观察（例如 3D 网格）的情况下，高效地同时优化形状和蒙皮权重。 由于变形图的学习是许多 3D 人体化身方法中的重要组成部分，并且由于 Fast-SNARF 提供了一种计算高效的解决方案，我们相信这项工作代表了向实际创建 3D 虚拟人迈出的重要一步。\n  - [NeRF 在 360° 图像上的非均匀采样策略, BMVC2022](https://arxiv.org/abs/2212.03635) | [code]\n    > 近年来，随着神经辐射场 (NeRF) 的出现，使用透视图像进行新视图合成的性能得到了显着提高。 本研究提出了两种有效构建 360{\\textdegree} 全向图像 NeRF 的新技术。 由于ERP格式的360{\\textdegree}图像在高纬度地区存在空间畸变和360{\\textdegree}广视角的特点，NeRF的一般光线采样策略是无效的。 因此，NeRF 的视图合成精度有限，学习效率不高。 我们为 NeRF 提出了两种非均匀光线采样方案以适应 360{\\textdegree} 图像——失真感知光线采样和内容感知光线采样。 我们分别使用室内和室外场景的 Replica 和 SceneCity 模型创建了评估数据集 Synth360。 在实验中，我们表明我们的提议在准确性和效率方面都成功地构建了 360{\\textdegree} 图像 NeRF。 该提案广泛适用于 NeRF 的高级变体。 DietNeRF、AugNeRF 和 NeRF++ 结合所提出的技术进一步提高了性能。 此外，我们展示了我们提出的方法提高了 360{\\textdegree} 图像中真实世界场景的质量。 Synth360：这个 https 网址。\n  - [通过伪多视图优化的高保真 3D GAN 反演](https://arxiv.org/abs/2211.15662) | [***``[code]``***](https://github.com/jiaxinxie97/HFGI3D)\n    > 我们提出了一个高保真 3D 生成对抗网络 (GAN) 反演框架，可以在保留输入图像的特定细节的同时合成逼真的新视图。 由于高保真 3D 反演中的几何纹理权衡，高保真 3D GAN 反演本质上具有挑战性，其中对单个视图输入图像的过度拟合通常会在潜在优化期间损坏估计的几何形状。 为了解决这一挑战，我们提出了一种新的管道，它建立在具有可见性分析的伪多视图估计之上。 
我们保留可见部分的原始纹理，并对被遮挡的部分使用生成先验。 广泛的实验表明，我们的方法比最先进的方法实现了有利的重建和新颖的视图合成质量，即使对于具有分布外纹理的图像也是如此。 拟议的管道还支持使用反向潜代码和 3D 感知纹理修改进行图像属性编辑。 我们的方法可以从单个图像进行高保真 3D 渲染，这有望用于 AI 生成的 3D 内容的各种应用。\n## Nov20 - Nov26, 2022\n  - [ResNeRF：用于室内场景新视图合成的几何引导残余神经辐射场](https://arxiv.org/abs/2211.16211) | [code]\n    > 我们代表 ResNeRF，这是一种用于室内场景新颖视图合成的新颖几何引导两阶段框架。请注意，良好的几何形状将极大地提高新视图合成的性能，并且为了避免几何模糊问题，我们建议基于从场景几何形状估计的基本密度和参数化的残差密度来表征场景的密度分布几何。在第一阶段，我们专注于基于 SDF 表示的几何重建，这将导致场景的良好几何表面和清晰的密度。在第二阶段，残差密度是基于第一阶段学习的SDF来学习的，用于编码更多关于外观的细节。通过这种方式，我们的方法可以更好地学习具有几何先验的密度分布，用于高保真新视图合成，同时保留 3D 结构。在具有许多观察较少和无纹理区域的大型室内场景上进行的实验表明，凭借良好的 3D 表面，我们的方法实现了新视图合成的最先进性能。\n  - [RUST：来自未定图像的潜在神经场景表示](https://arxiv.org/abs/2211.14306) | [code]\n    > 从 2D 观察中推断 3D 场景的结构是计算机视觉中的一项基本挑战。最近流行的基于神经场景表示的方法已经取得了巨大的影响，并已应用于各种应用程序。这个领域剩下的主要挑战之一是训练一个单一的模型，它可以提供潜在的表示，有效地泛化到单个场景之外。 Scene Representation Transformer (SRT) 在这个方向上显示出希望，但将其扩展到更大的不同场景集是具有挑战性的，并且需要准确定位的地面实况数据。为了解决这个问题，我们提出了 RUST（Really Unposed Scene representation Transformer），这是一种仅在 RGB 图像上训练的新颖视图合成的无姿势方法。我们的主要见解是，可以训练一个姿势编码器，它可以窥视目标图像并学习潜在姿势嵌入，解码器将其用于视图合成。我们对学习到的潜在姿势结构进行了实证研究，并表明它允许有意义的测试时间相机转换和准确的显式姿势读出。或许令人惊讶的是，RUST 实现了与获得完美相机姿势的方法相似的质量，从而释放了大规模训练摊销神经场景表示的潜力。\n  - [通过神经渲染的无监督连续语义适应](https://arxiv.org/abs/2211.13969) | [code]\n    > 越来越多的应用程序依赖于数据驱动模型，这些模型被部署用于跨一系列场景的感知任务。由于训练和部署数据之间的不匹配，在新场景上调整模型对于获得良好性能通常至关重要。在这项工作中，我们研究了语义分割任务的持续多场景适应，假设在部署期间没有可用的地面实况标签，并且应该保持先前场景的性能。我们建议通过融合分割模型的预测，然后使用视图一致的渲染语义标签作为伪标签来调整模型，为每个场景训练一个语义 NeRF 网络。通过与分割模型的联合训练，Semantic-NeRF 模型有效地实现了 2D-3D 知识迁移。此外，由于其紧凑的尺寸，它可以存储在长期记忆中，随后用于从任意角度渲染数据以减少遗忘。我们在 ScanNet 上评估了我们的方法，我们的方法优于基于体素的基线和最先进的无监督域适应方法。\n  - [ShadowNeuS：Shadow Ray 监督的神经 SDF 重建](https://arxiv.org/abs/2211.14086) | [code]\n    > 通过监督场景和多视图图像平面之间的相机光线，NeRF 为新视图合成任务重建神经场景表示。另一方面，光源和场景之间的阴影光线还有待考虑。因此，我们提出了一种新颖的阴影射线监督方案，可以优化沿射线的样本和射线位置。通过监督阴影光线，我们在多种光照条件下成功地从单视图纯阴影或 RGB 图像重建场景的神经 SDF。给定单视图二进制阴影，我们训练神经网络重建不受相机视线限制的完整场景。通过进一步模拟图像颜色和阴影光线之间的相关性，我们的技术还可以有效地扩展到 RGB 输入。我们将我们的方法与之前关于从单视图二值阴影或 RGB 图像重建形状的挑战性任务的工作进行比较，并观察到显着的改进。代码和数据将被发布。\n  - [动态神经肖像, 
WACV2023](https://arxiv.org/abs/2211.13994) | [code]\n    > 我们提出了动态神经肖像，这是一种解决全头重现问题的新方法。我们的方法通过明确控制头部姿势、面部表情和眼睛注视来生成逼真的视频肖像。我们提出的架构不同于现有方法，后者依赖基于 GAN 的图像到图像转换网络将 3D 人脸渲染转换为逼真的图像。相反，我们在具有可控动力学的基于 2D 坐标的 MLP 上构建我们的系统。我们采用基于 2D 的表示而不是最近的 3D 类 NeRF 系统的直觉源于这样一个事实，即视频肖像是由单目固定摄像机拍摄的，因此，只有一个场景的视点可用。首先，我们将我们的生成模型设置为表达式混合形状，尽管如此，我们表明我们的系统也可以成功地由音频功能驱动。我们的实验表明，所提出的方法比最近基于 NeRF 的重演方法快 270 倍，我们的网络在分辨率高达 1024 x 1024 时达到 24 fps 的速度，同时在视觉质量方面优于之前的工作。\n  - [ScanNeRF：神经辐射场的可扩展基准, WACV2023](https://arxiv.org/abs/2211.13762) | [code]\n    > 在本文中，我们提出了第一个用于评估神经辐射场 (NeRF) 和一般情况下的神经渲染 (NR) 框架的真实基准思想。我们设计并实施了一个有效的管道，可以毫不费力地扫描大量真实物体。我们的扫描站的硬件预算不到 500 美元，仅需 5 分钟即可收集大约 4000 张扫描对象的图像。这样的平台用于构建 ScanNeRF，这是一个以多个训练/验证/测试拆分为特征的数据集，旨在对现代 NeRF 方法在不同条件下的性能进行基准测试。因此，我们评估了三个尖端的 NeRF 变体，以突出它们的优点和缺点。该数据集可在我们的项目页面上找到，还有一个在线基准，以促进开发越来越好的 NeRF。\n  - [DiffusionSDF：有符号距离函数的条件生成模型](https://arxiv.org/abs/2211.13757) | [code]\n    > 概率扩散模型在图像合成、修复和文本到图像任务方面取得了最先进的结果。然而，它们仍处于生成复杂 3D 形状的早期阶段。这项工作提出了 DiffusionSDF，一种用于形状补全、单视图重建和真实扫描点云重建的生成模型。我们使用神经符号距离函数 (SDF) 作为我们的 3D 表示，通过神经网络参数化各种信号（例如，点云、2D 图像）的几何形状。神经 SDF 是隐式函数，扩散它们相当于学习它们的神经网络权重的反转，我们使用自定义调制模块解决了这个问题。广泛的实验表明，我们的方法能够从部分输入进行现实的无条件生成和条件生成。这项工作将扩散模型的领域从学习 2D 显式表示扩展到 3D 隐式表示。\n  - [沉浸式神经图形基元](https://arxiv.org/abs/2211.13494) | [code]\n    > 神经辐射场 (NeRF)，特别是它通过即时神经图形基元的扩展，是一种用于视图合成的新型渲染方法，它使用真实世界的图像来构建照片般逼真的沉浸式虚拟场景。尽管有潜力，但关于 NeRF 和虚拟现实 (VR) 结合的研究仍然很少。目前，还没有集成到可用的典型 VR 系统中，并且尚未评估 NeRF 实现的 VR 性能和适用性，例如，针对不同的场景复杂性或屏幕分辨率。在本文中，我们提出并评估了一个基于 NeRF 的框架，该框架能够在沉浸式 VR 中渲染场景，允许用户自由移动头部来探索复杂的现实世界场景。我们通过对三个不同的 NeRF 场景进行基准测试来评估我们的框架，这些场景涉及它们在不同场景复杂性和分辨率下的渲染性能。利用超分辨率，我们的方法可以产生每秒 30 帧的帧速率，每只眼睛的分辨率为 1280x720 像素。我们讨论了我们框架的潜在应用，并在线提供了一个开源实现。\n  - [BAD-NeRF：束调整的去模糊神经辐射场](https://arxiv.org/abs/2211.12853) | [code]\n    > 神经辐射场 (NeRF) 最近受到了相当大的关注，因为它在给定一组姿势相机图像的情况下，在逼真的 3D 重建和新颖的视图合成方面具有令人印象深刻的能力。早期的工作通常假设输入图像质量很好。然而，图像退化（例如低光条件下的图像运动模糊）在现实场景中很容易发生，这将进一步影响 NeRF 的渲染质量。在本文中，我们提出了一种新颖的束调整去模糊神经辐射场 (BAD-NeRF)，它可以对严重的运动模糊图像和不准确的相机姿势具有鲁棒性。我们的方法对运动模糊图像的物理图像形成过程进行建模，并联合学习 NeRF 
的参数并恢复曝光时间内的相机运动轨迹。在实验中，我们表明，通过直接对真实物理图像形成过程进行建模，BAD-NeRF 在合成数据集和真实数据集上都实现了优于先前工作的性能。\n  - [Peekaboo：文本到图像扩散模型是零样本分割器](https://arxiv.org/abs/2211.13224) | [code]\n    > 最近基于扩散的生成模型与视觉语言模型相结合，能够根据自然语言提示创建逼真的图像。虽然这些模型是在大型互联网规模的数据集上训练的，但这种预训练模型并没有直接引入任何语义定位或基础。大多数当前的定位或接地方法都依赖于边界框或分割掩码形式的人工注释定位信息。例外是一些无监督方法，它们利用面向本地化的体系结构或损失函数，但它们需要单独训练。在这项工作中，我们探索了现成的扩散模型，在没有接触此类定位信息的情况下进行训练，如何能够在没有特定于分段的重新训练的情况下建立各种语义短语。引入了推理时间优化过程，能够生成以自然语言为条件的分割掩码。我们评估了我们在 Pascal VOC 数据集上进行无监督语义分割的提案 Peekaboo。此外，我们评估了 RefCOCO 数据集上的引用分割。总之，我们提出了第一个零样本、开放词汇、无监督（无定位信息）、语义基础技术，利用基于扩散的生成模型，无需重新训练。我们的代码将公开发布。\n  - [OReX：使用神经场从 Planner 横截面重建对象](https://arxiv.org/abs/2211.12886) | [code]\n    > 从平面横截面重建 3D 形状是一项受到医学成像和地理信息学等下游应用启发的挑战。输入是在空间平面的稀疏集合上完全定义的输入/输出指示函数，输出是指示函数对整个体积的插值。以前解决这个稀疏和病态问题的工作要么产生低质量的结果，要么依赖于额外的先验，例如目标拓扑、外观信息或输入法线方向。在本文中，我们介绍了 OReX，一种仅从切片重建 3D 形状的方法，以神经场作为插值先验。在输入平面上训练一个简单的神经网络以接收 3D 坐标并返回查询点的内部/外部估计。这个先验在诱导平滑性和自相似性方面很有用。这种方法的主要挑战是高频细节，因为神经先验过度平滑。为了缓解这种情况，我们提供了一种迭代估计架构和一种分层输入采样方案，鼓励从粗到精的训练，允许在后期阶段关注高频。此外，我们识别并分析了源自网格提取步骤的常见波纹状效果。我们通过调整输入输入/输出边界周围指示函数的空间梯度来缓解它，从根本上解决问题。\n  - [PANeRF：基于少样本输入的改进神经辐射场的伪视图增强](https://arxiv.org/abs/2211.12758) | [code]\n    > 近年来开发了神经辐射场 (NeRF) 方法，该技术在合成复杂场景的新视图方面具有广阔的应用前景。然而，NeRF 需要密集的输入视图，通常有数百个，以生成高质量图像。随着输入视图数量的减少，NeRF 对未见视点的渲染质量趋于急剧下降。为了克服这一挑战，我们提出了 NeRF 的伪视图增强，该方案通过考虑少镜头输入的几何形状来扩展足够数量的数据。我们首先通过利用扩展的伪视图来初始化 NeRF 网络，这可以有效地减少渲染看不见的视图时的不确定性。随后，我们通过使用包含精确几何和颜色信息的稀疏视图输入来微调网络。通过各种设置下的实验，我们验证了我们的模型忠实地合成了高质量的新视图图像，并且优于现有的多视图数据集方法。\n  - [ActiveRMAP：用于主动映射和规划的辐射场](https://arxiv.org/abs/2211.12656) | [code]\n    > 通过离线/在线映射方法，可以从一组 2D 图像中对场景进行高质量的 3D 重建。在本文中，我们从隐式表示的角度探索主动映射，最近在各种应用中产生了令人信服的结果。最流行的隐式表示之一——神经辐射场 (NeRF)，首先展示了使用多层感知器的照片级真实感渲染结果，并将有前途的离线 3D 重建作为辐射场的副产品。最近，研究人员还将这种隐式表示应用于在线重建和定位（即隐式 SLAM 系统）。然而，将隐式表示用于主动视觉任务的研究仍然非常有限。在本文中，我们对将神经辐射场应用于主动映射和规划问题特别感兴趣，这些问题是主动系统中紧密耦合的任务。我们首次提出了一个仅使用 RGB 的主动视觉框架，该框架使用辐射场表示以在线方式进行主动 3D 重建和规划。具体来说，我们将此联合任务制定为迭代双阶段优化问题，我们交替优化辐射场表示和路径规划。实验结果表明，与其他离线方法相比，所提出的方法取得了有竞争力的结果，并且优于使用 NeRF 的主动重建方法。\n  - [零 
NeRF：零重叠注册](https://arxiv.org/abs/2211.12544) | [code]\n    > 我们提出了零 NeRF，这是一种投影表面配准方法，据我们所知，它提供了第一个能够在具有最小或零视觉对应的场景表示之间对齐的通用解决方案。为此，我们加强了部分和完整重建的可见表面之间的一致性，这使我们能够约束被遮挡的几何体。我们使用 NeRF 作为我们的表面表示和 NeRF 渲染管道来执行此对齐。为了证明我们方法的有效性，我们从对面的现实世界场景中注册了无法使用现有方法准确注册的无限小重叠，并将这些结果与广泛使用的注册方法进行了比较。\n  - [FLNeRF：神经辐射场中的 3D 面部地标估计](https://arxiv.org/abs/2211.11202) | [code]\n    > 本文介绍了在不使用 2D 图像、深度图或点云等中间表示的情况下直接预测神经辐射场 (NeRF) 上的 3D 面部地标的第一项重要工作。我们的 3D 从粗到细的人脸地标 NeRF (FLNeRF) 模型有效地从整个面部的 NeRF 中采样，并具有个人面部特征以获得准确的地标。为了缓解可用数据中面部表情的有限数量，局部和非线性 NeRF 扭曲被应用于精细的面部特征以模拟大范围的情绪，包括夸张的面部表情（例如，吹脸颊、张大嘴巴、眨眼），用于训练 FLNeRF。通过这种表达增强，我们的模型可以预测 3D 地标，而不仅限于数据中给出的 20 个离散表达。强大的 3D NeRF 面部标志有助于许多下游任务。例如，我们修改 MoFaNeRF 以在 NeRF 上使用面部特征点启用高质量的面部编辑和交换，从而允许更直接的控制和更广泛的复杂表情。实验表明，使用地标的改进模型取得了相当好的结果。\n  - [SPARF：来自稀疏和嘈杂姿势的神经辐射场](https://arxiv.org/abs/2211.11738) | [code]\n    > 神经辐射场 (NeRF) 最近已成为合成逼真新颖视图的有力代表。虽然表现出令人印象深刻的性能，但它依赖于具有高精度相机姿势的密集输入视图的可用性，从而限制了其在现实场景中的应用。在这项工作中，我们引入了稀疏姿态调整辐射场 (SPARF)，以应对仅在少量宽基线输入图像（低至 3 张）且相机姿态嘈杂的情况下进行新视图合成的挑战。我们的方法利用多视图几何约束来共同学习 NeRF 并改进相机姿势。通过依赖于输入视图之间提取的像素匹配，我们的多视图对应目标强制优化场景和相机姿势以收敛到全局和几何精确的解决方案。我们的深度一致性损失进一步鼓励重建的场景从任何角度来看都是一致的。我们的方法在多个具有挑战性的数据集的稀疏视图机制中设置了一个新的技术状态。\n  - [Tensor4D：用于高保真动态重建和渲染的高效神经 4D 分解](https://arxiv.org/abs/2211.11610) | [code]\n    > 我们介绍了 Tensor4D，这是一种高效而有效的动态场景建模方法。我们解决方案的关键是一种高效的 4D 张量分解方法，使动态场景可以直接表示为 4D 时空张量。为了解决伴随的内存问题，我们首先将 4D 张量投影到三个时间感知体积，然后是九个紧凑的特征平面，从而分层分解 4D 张量。通过这种方式，可以以紧凑且高效的方式同时捕获随时间变化的空间信息。当应用 Tensor4D 进行动态场景重建和渲染时，我们进一步将 4D 场分解为不同的尺度，以便从粗到细学习结构运动和动态细节变化。我们的方法的有效性在合成场景和真实场景中都得到了验证。大量实验表明，我们的方法能够从稀疏视图摄像机装置甚至单目摄像机实现高质量的动态重建和渲染。代码和数据集将在此 https URL 上发布。\n  - [NeRF-RPN：NeRF 中对象检测的通用框架](https://arxiv.org/abs/2211.11646) | [code]\n    > 本文介绍了第一个重要的目标检测框架 NeRF-RPN，它直接在 NeRF 上运行。给定预训练的 NeRF 模型，NeRF-RPN 旨在检测场景中对象的所有边界框。通过利用包含多尺度 3D 神经体积特征的新型体素表示，我们证明可以直接回归 NeRF 中对象的 3D 边界框，而无需在任何视点渲染 NeRF。 NeRF-RPN 是一个通用框架，可用于检测没有类标签的对象。我们用各种骨干架构、RPN 头部设计和损失函数对 NeRF-RPN 进行了实验。所有这些都可以以端到端的方式进行训练，以估计高质量的 3D 边界框。为了促进 NeRF 对象检测的未来研究，我们构建了一个新的基准数据集，其中包含经过仔细标记和清理的合成数据和真实数据。请单击此 https 
URL 以可视化我们的 NeRF-RPN 的 3D 区域提案。代码和数据集将可用。\n  - [束调整神经辐射场的局部到全局配准](https://arxiv.org/abs/2211.11505) | [***``[code]``***](https://github.com/rover-xingyu/L2G-NeRF)\n    > Neural Radiance Fields (NeRF) 实现了逼真的新视图合成；然而，精确相机位姿的要求限制了它的应用。尽管存在用于联合学习神经 3D 表示和注册相机帧的分析综合扩展，但如果初始化不当，它们很容易受到次优解决方案的影响。我们提出了 L2G-NeRF，这是一种用于束调整神经辐射场的局部到全局配准方法：首先，逐像素灵活对齐，然后逐帧约束参数对齐。通过优化光度重建误差的深度网络以无监督的方式学习逐像素局部对齐。使用可微分参数估计求解器对逐像素对应执行逐帧全局对齐以找到全局变换。对合成数据和真实世界数据的实验表明，我们的方法在高保真重建和解决大型相机姿态失调方面优于当前最先进的方法。我们的模块是一个易于使用的插件，可以应用于 NeRF 变体和其他神经领域应用程序。此 https URL 提供了代码和补充材料。\n  - [SegNeRF：具有神经辐射场的 3D 部分分割](https://arxiv.org/abs/2211.11215) | [code]\n    > 神经辐射场 (NeRF) 的最新进展在生成任务（如新视图合成和 3D 重建）方面表现出色。基于神经辐射场的方法能够通过完全依赖姿势图像隐含地表示 3D 世界。然而，它们很少在 3D 零件分割等判别任务领域进行探索。在这项工作中，我们试图通过提出 SegNeRF 来弥合这一差距：一种将语义场与通常的辐射场集成在一起的神经场表示。 SegNeRF 继承了之前作品执行新视图合成和 3D 重建的能力，并能够从少量图像中进行 3D 部分分割。我们在 PartNet 上进行的广泛实验表明，SegNeRF 能够同时预测来自摆姿势图像的几何形状、外观和语义信息，即使对于看不见的物体也是如此。预测的语义场允许 SegNeRF 实现 2D 新视图分割的平均 mIoU 为 30.30%，3D 部分分割的平均 mIoU 为 37.46%，与基于点的方法相比，仅使用少量姿势图像具有竞争力的性能。此外，SegNeRF 能够从野外拍摄的物体的单个图像及其相应的部分分割生成显式 3D 模型。\n  - [通过 Bootstrapped Radiance Field Inversion 从单个图像中获取形状、姿势和外观](https://arxiv.org/abs/2211.11674) | [code]\n    > 神经辐射场 (NeRF) 与 GAN 相结合代表了从单一视图进行 3D 重建领域的一个有前途的方向，因为它们能够有效地对任意拓扑进行建模。然而，该领域最近的工作主要集中在已知确切地面真实姿势的合成数据集上，而忽略了姿势估计，这对于某些下游应用程序（例如增强现实 (AR) 和机器人技术）很重要。我们为自然图像引入了一个有原则的端到端重建框架，其中没有准确的地面真实姿势。我们的方法从对象的单个图像中恢复 SDF 参数化的 3D 形状、姿势和外观，而无需在训练期间利用多个视图。更具体地说，我们利用无条件 3D 感知生成器，我们对其应用混合反演方案，在该方案中，模型会产生对解决方案的初步猜测，然后通过优化对其进行细化。我们的框架可以在短短 10 步内对图像进行反渲染，使其能够在实际场景中使用。我们在各种真实和综合基准测试中展示了最先进的结果。\n  - [恢复神经隐式表面重建的精细细节](https://arxiv.org/abs/2211.11320) | [code]\n    > 最近关于隐式神经表征的工作取得了重大进展。使用体绘制学习隐式神经表面在没有 3D 监督的多视图重建中得到了普及。然而，由于几何和外观表示的潜在模糊性，准确地恢复精细细节仍然具有挑战性。在本文中，我们提出了 D-NeuS，一种能够恢复精细几何细节的基于体积渲染的神经隐式表面重建方法，它通过两个额外的损失函数扩展了 NeuS，旨在提高重建质量。首先，我们鼓励来自 alpha 合成的渲染表面点具有零符号距离值，从而减轻将 SDF 转换为体积渲染密度所产生的几何偏差。其次，我们在表面点上施加多视图特征一致性，这是通过沿射线从采样点插值 SDF 零交叉得出的。广泛的定量和定性结果表明，我们的方法重建了具有细节的高精度表面，并且优于现有技术。\n  - [Neural Puppeteer：基于关键点的动态形状神经渲染, 
ACCV2022](https://openaccess.thecvf.com/content/ACCV2022/html/Giebenhain_Neural_Puppeteer_Keypoint-Based_Neural_Rendering_of_Dynamic_Shapes_ACCV_2022_paper.html) | [***``[code]``***](https://github.com/urs-waldmann/NePu/)\n    > 我们介绍了 Neural Puppeteer，这是一种用于铰接形状的高效神经渲染管道。通过逆向渲染，我们可以单独从多视图 2D 轮廓预测 3D 关键点，而不需要纹理信息。此外，我们可以使用一个相同的训练模型轻松预测同一类形状的 3D 关键点，并更容易地从合成数据的训练中进行概括，我们通过成功地将零样本合成应用于现实世界的实验来证明这一点。我们通过将模型拟合到不同动物和人类的合成视频来展示我们方法的灵活性，并获得优于我们基线的定量结果。我们的方法将 3D 关键点与各个局部特征向量和全局潜在代码结合使用，以有效表示时变和铰接的形状，例如人类和动物。与之前的工作相比，我们不在 3D 域中进行重建，而是将 3D 特征投影到 2D 相机中，并根据这些投影特征对 2D RGB-D 图像进行重建，这比体积渲染要快得多。我们的合成数据集将公开可用，以进一步发展不断发展的动物姿势和形状重建领域。\n  - [DynIBaR：基于神经动态图像的渲染, -](https://arxiv.org/abs/2211.11082) | [code]\n    > 我们解决了从描述复杂动态场景的单目视频中合成新视图的问题。基于随时间变化的神经辐射场（又名动态 NeRF）的最先进方法已在该任务上显示出令人印象深刻的结果。然而，对于具有复杂物体运动和不受控制的摄像机轨迹的长视频，这些方法可能会产生模糊或不准确的渲染，从而阻碍它们在现实世界中的应用。我们提出了一种新方法来解决这些限制，而不是在 MLP 的权重内对整个动态场景进行编码，方法是采用基于体积图像的渲染框架，该框架通过以场景运动感知方式聚合附近视图的特征来合成新视点.我们的系统保留了先前方法在建模复杂场景和视图相关效果方面的优势，而且还能够从具有复杂场景动态和不受约束的相机轨迹的长视频中合成照片般逼真的新颖视图。我们展示了对动态场景数据集的最先进方法的显着改进，并将我们的方法应用于具有挑战性相机和物体运动的野外视频，在这些视频中，先前的方法无法产生高质量的渲染。我们的项目网页位于此 http URL。\n  - [折射物体的神经辐射场采样, SIGGRAPH-Asia2022](https://arxiv.org/abs/2211.14799) | [***``[code]``***](https://github.com/alexkeroro86/SampleNeRFRO)\n    > 最近，神经辐射场 (NeRF) 中的可微分体绘制得到了广泛的关注，其变体取得了许多令人印象深刻的结果。然而，现有的方法通常假设场景是一个均匀的体积，因此光线沿着直线路径投射。在这项工作中，场景是一个具有分段恒定折射率的异质体积，如果它与不同的折射率相交，路径将弯曲。对于折射物体的新视图合成，我们基于 NeRF 的框架旨在从具有折射物体轮廓的多视图姿势图像中优化有界体积和边界的辐射场。为了解决这个具有挑战性的问题，场景的折射率是从轮廓中重建的。给定折射率，我们扩展了 NeRF 中的分层和分层采样技术，以允许沿着由 Eikonal 方程跟踪的弯曲路径绘制样本。结果表明，我们的框架在数量和质量上都优于最先进的方法，在感知相似性度量上表现出更好的性能，并且在几个合成和真实场景的渲染质量上有明显改善。\n## Nov13 - Nov19, 2022\n  - [大尺度室内场景实时全向漫游, SIGGRAPH-Asia2022](https://dl.acm.org/doi/abs/10.1145/3550340.3564222) | [code]\n    > 神经辐射场 (NeRF) 最近在新视图合成方面取得了令人瞩目的成果。然而，之前关于 NeRF 的工作主要集中在以对象为中心的场景。由于位置编码容量有限，它们在面向外的和大规模场景中会遭受明显的性能下降。为了缩小差距，我们以几何感知的方式探索辐射场。我们从从多个 360° 图像中学习的全向神经辐射场估计显式几何。依靠恢复的几何形状，我们使用自适应分而治之的策略来缩小和微调辐射场，进一步提高渲染速度和质量。基线之间的定量和定性比较说明了我们在大型室内场景中的主要性能，并且我们的系统支持实时 VR 漫游。\n  - 
[Magic3D：高分辨率文本到 3D 内容创建](https://arxiv.org/abs/2211.10440) | [code]\n    > DreamFusion 最近展示了预训练的文本到图像扩散模型在优化神经辐射场 (NeRF) 方面的实用性，实现了卓越的文本到 3D 合成结果。然而，该方法有两个固有的局限性：(a) NeRF 的优化极其缓慢和 (b) NeRF 上的低分辨率图像空间监督，导致处理时间长的低质量 3D 模型。在本文中，我们通过使用两阶段优化框架来解决这些限制。首先，我们使用低分辨率扩散先验获得粗糙模型，并使用稀疏 3D 哈希网格结构进行加速。使用粗略表示作为初始化，我们进一步优化了带纹理的 3D 网格模型，该模型具有与高分辨率潜在扩散模型交互的高效可微分渲染器。我们的方法被称为 Magic3D，可以在 40 分钟内创建高质量的 3D 网格模型，比 DreamFusion 快 2 倍（据报道平均需要 1.5 小时），同时还实现了更高的分辨率。用户研究表明 61.7% 的评分者更喜欢我们的方法而不是 DreamFusion。连同图像调节生成功能，我们为用户提供了控制 3D 合成的新方法，为各种创意应用开辟了新途径。\n  - [AligNeRF：通过对齐感知训练的高保真神经辐射场](https://arxiv.org/abs/2211.09682) | [code]\n    > 神经辐射场 (NeRF) 是将 3D 场景建模为连续函数的强大表示。尽管 NeRF 能够渲染具有视图相关效果的复杂 3D 场景，但很少有人致力于探索其在高分辨率设置中的局限性。具体来说，现有的基于 NeRF 的方法在重建高分辨率真实场景时面临着一些限制，包括大量的参数、未对齐的输入数据和过度平滑的细节。在这项工作中，我们对使用高分辨率数据训练 NeRF 进行了首次试点研究，并提出了相应的解决方案：1）将多层感知器（MLP）与卷积层结合，可以编码更多的邻域信息，同时减少参数总数； 2) 一种新的训练策略来解决由移动物体或小相机校准误差引起的未对准问题； 3）高频感知损失。我们的方法几乎是免费的，没有引入明显的训练/测试成本，而在不同数据集上的实验表明，与当前最先进的 NeRF 模型相比，它可以恢复更多的高频细节。项目页面：\\url{此 https URL。}\n  - [3DLatNav：导航用于语义感知 3D 对象操作的生成潜在空间](https://arxiv.org/abs/2211.09770) | [code]\n    > 3D 生成模型最近成功地以点云的形式生成逼真的 3D 对象。然而，大多数模型在没有广泛的语义属性标签或其他参考点云的情况下不提供操纵组件对象部分的形状语义的可控性。此外，除了执行简单的潜在向量运算或插值的能力之外，还缺乏对 3D 形状的部分级语义如何在其相应的生成潜在空间中进行编码的理解。在本文中，我们提出了 3DLatNav；一种导航预训练生成潜在空间以实现 3D 对象的受控部分级语义操作的新方法。首先，我们提出了一种使用 3D 形状的潜在表示的部分级弱监督形状语义识别机制。然后，我们将该知识转移到预训练的 3D 对象生成潜在空间，以解开纠缠的嵌入，以线性子空间的形式表示对象组成部分的不同形状语义，尽管在训练期间部分级标签不可用。最后，我们利用那些已识别的子空间来表明，通过将所提出的框架应用于任何预训练的 3D 生成模型，可以实现可控的 3D 对象部分操作。通过两个新的定量指标来评估部分级操作的一致性和定位准确性，我们表明 3DLatNav 在识别编码 3D 对象的部分级形状语义的潜在方向方面优于现有的无监督潜在解缠结方法。通过对最先进的生成模型进行多项消融研究和测试，我们表明 3DLatNav 可以在输入点云上实现受控的部分级语义操作，同时保留对象的其他特征和真实性。\n  - [RenderDiffusion：用于 3D 重建、修复和生成的图像扩散](https://arxiv.org/abs/2211.09869) | [code]\n    > 扩散模型目前在条件和无条件图像生成方面都达到了最先进的性能。然而，到目前为止，图像扩散模型不支持 3D 理解所需的任务，例如视图一致的 3D 生成或单视图对象重建。在本文中，我们将 RenderDiffusion 作为第一个用于 3D 生成和推理的扩散模型，可以仅使用单眼 2D 监督进行训练。我们方法的核心是一种新颖的图像去噪架构，它在每个去噪步骤中生成并渲染场景的中间三维表示。这在扩散过程中强制实施了一个强大的归纳结构，为我们提供了一个 3D 一致的表示，同时只需要 2D 监督。可以从任何视点渲染生成的 3D 表示。我们在 
ShapeNet 和 Clevr 数据集上评估 RenderDiffusion，并展示了在生成 3D 场景和从 2D 图像推断 3D 场景方面的竞争性能。此外，我们基于扩散的方法允许我们使用 2D 修复来编辑 3D 场景。我们相信，我们的工作有望在对大量图像集进行训练时实现大规模的完整 3D 生成，从而避免对大型 3D 模型集进行监督的需要。\n  - [DINER：无序不变的隐式神经表征](https://arxiv.org/abs/2211.07871) | [code]\n    > 隐式神经表示 (INR) 将信号的属性表征为相应坐标的函数，它成为解决逆问题的利器。然而，INR 的容量受到网络训练中频谱偏差的限制。在本文中，我们发现通过重新排列输入信号的坐标可以在很大程度上解决这种与频率相关的问题，为此我们提出了无序不变的隐式神经表示 (DINER)，方法是将哈希表扩充为传统的 INR 骨架。鉴于离散信号共享相同的属性直方图和不同的排列顺序，哈希表可以将坐标投影到相同的分布中，映射信号可以使用后续的 INR 网络更好地建模，从而显着减轻频谱偏差。实验不仅揭示了 DINER 对不同 INR 主干（MLP 与 SIREN）和各种任务（图像/视频表示、相位检索和折射率恢复）的泛化，而且还显示了优于最先进技术的优势算法的质量和速度。\n  - [用于形状引导生成 3D 形状和纹理的 Latent-NeRF](https://arxiv.org/abs/2211.07600) | [code]\n    > 近年来，文本引导图像生成发展迅速，激发了文本引导形状生成方面的重大突破。最近，已经表明，使用分数蒸馏，可以成功地通过文本引导 NeRF 模型生成 3D 对象。我们将分数蒸馏调整为公开可用且计算效率高的潜在扩散模型，该模型将整个扩散过程应用于预训练自动编码器的紧凑潜在空间中。由于 NeRF 在图像空间中运行，因此通过潜在分数蒸馏来引导它们的简单解决方案需要在每个引导步骤中编码到潜在空间。相反，我们建议将 NeRF 带入潜在空间，从而产生 Latent-NeRF。分析我们的 Latent-NeRF，我们表明虽然文本到 3D 模型可以产生令人印象深刻的结果，但它们本质上是不受约束的，并且可能缺乏引导或执行特定 3D 结构的能力。为了协助和指导 3D 生成，我们建议使用 Sketch-Shape 来指导我们的 Latent-NeRF：一种定义所需对象的粗略结构的抽象几何体。然后，我们提出了将这种约束直接集成到 Latent-NeRF 中的方法。这种文本和形状指导的独特组合可以增强对生成过程的控制。我们还表明，潜在分数蒸馏可以成功地直接应用于 3D 网格。这允许在给定的几何体上生成高质量的纹理。我们的实验验证了我们不同形式的指导的力量和使用潜在渲染的效率。可通过此 https 网址实现\n  - [AsyncNeRF：从具有时间姿态函数的异步 RGB-D 序列中学习大规模辐射场](https://arxiv.org/abs/2211.07459) | [code]\n    > 大规模辐射场是用于智能交通应用（如自动驾驶或无人机送货）的有前途的测绘工具。但对于大型场景，由于感测范围有限，紧凑型同步 RGB-D 相机并不适用，使用单独的 RGB 和深度传感器不可避免地导致序列不同步。受最近不需要已知内在或外在参数的自校准辐射场训练方法的成功启发，我们提出了第一个自校准 RGB 和深度帧之间的不匹配的解决方案。我们利用重要的特定领域事实，即 RGB 和深度帧实际上是从同一轨迹采样的，并开发了一种称为时间-姿势函数的新型隐式网络。将它与大规模辐射场相结合会产生一种级联两个隐式表示网络的架构。为了验证其有效性，我们构建了一个多样化且逼真的数据集，涵盖各种 RGB-D 不匹配场景。通过对该数据集进行全面的基准测试，我们展示了我们的方法在不同场景中的灵活性以及优于适用的先前对应方法的卓越性能。代码、数据和模型将公开提供。\n## Nov6 - Nov12, 2022\n  - [NeXT：通过 Multi-skip Transformer 实现高质量的神经辐射场, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19824-3_5) | [***``[code]``***](https://github.com/Crishawy/NeXT)\n    > 神经辐射场 (NeRF) 方法通过神经网络表示场景，在新颖的视图合成方面表现出令人印象深刻的性能。然而，大多数现有的基于 NeRF 
的方法（包括其变体）将每个样本点单独视为输入，同时忽略了来自相应射线的相邻样本点之间的内在关系，从而阻碍了重建性能。为了解决这个问题，我们探索了一种全新的方案，即 NeXT，引入了一个多跳跃变换器来捕获射线级查询中各个样本点之间的丰富关系。具体来说，提出了射线标记化以将每条射线表示为一系列点嵌入，并将其作为我们提出的 NeXT 的输入。这样，通过内置的自注意力机制捕获样本点之间的关系，以促进重建。此外，我们提出的 NeXT 可以很容易地与其他基于 NeRF 的方法结合，以提高它们的渲染质量。在三个数据集上进行的大量实验表明，NeXT 大大优于所有以前的最先进的工作。特别是，拟议的 NeXT 在 Blender 数据集上的 PSNR 超过了强大的 NeRF 基线 2.74 dB。该代码可在 https://github.com/Crishawy/NeXT 获得。\n  - [用于 3D 场景重建的定向射线距离函数, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-20086-1_12) | [code]\n    > 我们提出了一种从单个看不见的图像重建全 3D 场景的方法。我们训练了真实的非水密场景扫描数据集。我们的方法使用预测距离函数，因为这些函数在处理复杂拓扑和大空间方面显示出前景。我们确定并分析了预测此类图像条件距离函数的两个关键挑战，这些距离函数阻碍了它们在真实 3D 场景数据上的成功。首先，我们展示了从图像预测传统场景距离需要对大感受野进行推理。其次，我们分析表明，经过训练以预测这些距离函数的网络的最佳输出不符合所有距离函数属性。我们提出了一种替代距离函数，即定向射线距离函数 (DRDF)，它可以解决这两个挑战。我们表明，在 Matterport3D、3DFront 和 ScanNet 上从单个图像进行 3D 重建时，经过训练以预测 DRDF 的深度网络在数量和质量上优于所有其他方法。 （项目页面：https://nileshkulkarni.github.io/scene_drdf）\n  - [ParticleNeRF：动态场景中在线神经辐射场的基于粒子的编码](https://arxiv.org/abs/2211.04041) | [code]\n    > 神经辐射场 (NeRFs) 从图像中学习隐式表示（通常是静态的）环境。我们的论文扩展了 NeRFs 以在线方式处理动态场景。我们建议 ParticleNeRF 适应环境几何形状的变化，每 350 毫秒学习一个新的最新表示。与其他 NeRF 框架相比，ParticleNeRF 可以以更高的保真度表示动态环境的当前状态。为实现这一目标，我们引入了一种新的基于粒子的参数编码，它允许中间 NeRF 特征——现在耦合到空间中的粒子——随动态几何移动。这可以通过将光度重建损失反向传播到粒子的位置来实现。位置梯度被解释为粒子速度，并使用基于位置的动力学 (PBS) 物理系统集成到位置中。将 PBS 引入 NeRF 公式使我们能够为粒子运动添加碰撞约束，并创造未来机会将其他运动先验添加到系统中，例如刚体和可变形体\n  - [基于时间相干性的大规模场景分布式光线追踪, ToG2022](https://ieeexplore.ieee.org/abstract/document/9940545) | [code]\n    > 分布式光线追踪算法在渲染海量场景时被广泛使用，其中数据利用率和负载均衡是提高性能的关键。一项基本观察是射线在时间上是相干的，这表明时间信息可用于提高计算效率。在本文中，我们使用时间相干性来优化分布式光线追踪的性能。首先，我们提出了一种基于时间一致性的调度算法来指导任务/数据分配和调度。然后，我们提出了一个虚拟门户结构来预测基于前一帧的光线辐射率，并将辐射率低的光线发送到预先计算的简化模型进行进一步追踪，这可以大大降低遍历复杂度和网络数据传输的开销.该方法在大小高达 355 GB 的场景中得到验证。与以前的算法相比，我们的算法可以实现高达 81% 的加速，并且均方误差非常小。\n  - [QRF：具有量子辐射场的隐式神经表示](https://arxiv.org/abs/2211.03418) | [code]\n    > 现实世界场景的逼真渲染对于包括混合现实 (MR) 和虚拟现实 (VR) 在内的广泛应用来说是一项巨大的挑战。神经网络长期以来一直在求解微分方程的背景下进行研究，之前已被引入作为照片级渲染的隐式表示。然而，使用经典计算的逼真渲染具有挑战性，因为它需要耗时的光线行进，并且由于维数灾难而遭受计算瓶颈。在本文中，我们提出了量子辐射场 
(QRF)，它集成了量子电路、量子激活函数和量子体积渲染，用于隐式场景表示。结果表明，QRF不仅发挥了量子计算速度快、收敛快、并行度高等优势，而且保证了体绘制的高质量。\n  - [3D常见宠物：现实生活中可变形类别的动态新视角合成](https://arxiv.org/abs/2211.03889) | [code]\n    > 从稀疏视图中获得对象的逼真重建本质上是模棱两可的，只能通过学习合适的重建先验来实现。早期关于稀疏刚性对象重建的工作成功地从大型数据集（如 CO3D）中学习了这样的先验。在本文中，我们将这种方法扩展到动态对象。我们以猫和狗作为代表性示例，并介绍 Common Pets in 3D (CoP3D)，这是一组众包视频，展示了大约 4,200 种不同的宠物。 CoP3D 是首批用于“野外”非刚性 3D 重建基准测试的大型数据集之一。我们还提出了 Tracker-NeRF，这是一种从我们的数据集中学习 4D 重建的方法。在测试时，给定一个看不见的物体的少量视频帧，Tracker-NeRF 预测其 3D 点的轨迹并生成新视图、插值视点和时间。 CoP3D 的结果揭示了比现有基线更好的非刚性新视图合成性能。\n  - [基于学习的复杂室内场景逆渲染与可微蒙特卡洛光线追踪, SIGGRAPH-Asia2022](https://jingsenzhu.github.io/invrend/) | [code]\n    > 我们提出了一种基于学习的方法，用于使用可区分的蒙特卡洛光线追踪对复杂的室内场景进行逆向渲染。我们的方法将单个室内场景 RGB 图像作为输入，并自动推断其底层表面反射率、几何形状和空间变化的照明。这使我们能够对场景进行逼真的编辑，例如插入多个复杂的虚拟对象并使用全局照明忠实地编辑表面材质。\n## Oct30 - Nov5, 2022\n  - [深度外观预过滤, ToG2022](https://dl.acm.org/doi/abs/10.1145/3570327) | [code]\n    > 复杂场景的基于物理的渲染可能成本高得令人望而却步，并且复杂性在渲染图像上的分布可能是无限且不均匀的。理想的细节层次 (LoD) 方法的目标是使渲染成本独立于 3D 场景的复杂性，同时保持场景的外观。然而，由于依赖近似模型和其他启发式方法，当前的预过滤 LoD 方法在它们可以支持的外观方面受到限制。我们提出了第一个全面的多尺度 LoD 框架，用于预过滤具有复杂几何形状和材料（例如 Disney BRDF）的 3D 环境，同时保持与光线追踪参考相关的外观。使用场景的多尺度层次结构，我们执行数据驱动的预过滤步骤以获得每个尺度的外观相位函数和方向覆盖掩码。我们方法的核心是一种新颖的神经表示，它将这些信息编码成一种紧凑的潜在形式，这种形式很容易在基于物理的渲染器中解码。一旦场景被烘焙出来，我们的方法在渲染时不需要原始几何体、材质或纹理。我们证明我们的方法与最先进的预过滤方法相比具有优势，并且可以为复杂场景节省大量内存。\n  - [用于机器人操纵的神经抓取距离场](https://arxiv.org/abs/2211.02647) | [code]\n    > 我们将抓取学习制定为一个神经场，并提出神经抓取距离场 (NGDF)。这里，输入是机器人末端执行器的 6D 姿态，输出是到物体有效抓握的连续流形的距离。与预测一组离散候选抓握的当前方法相比，基于距离的 NGDF 表示很容易被解释为成本，并且最小化该成本会产生成功的抓握姿势。这种抓取距离成本可以直接合并到轨迹优化器中，与其他成本（如轨迹平滑度和碰撞避免）进行联合优化。在优化过程中，随着各种成本的平衡和最小化，抓取目标可以平滑变化，因为学习到的抓取域是连续的。在使用 Franka 手臂的模拟基准测试中，我们发现使用 NGDF 的联合抓取和规划比基线执行成功率高出 63%，同时泛化到看不见的查询姿势和看不见的物体形状。项目页面：此 https 网址。\n  - [nerf2nerf：神经辐射场的成对配准](https://arxiv.org/abs/2211.01600) | [code]\n    > 我们引入了一种神经场成对配准技术，该技术扩展了经典的基于优化的局部配准（即 ICP）以在神经辐射场 (NeRF) 上运行——从校准图像集合训练的神经 3D 场景表示。 NeRF 不分解照明和颜色，因此为了使配准不受照明影响，我们引入了“表面场”的概念——从预训练的 NeRF 模型中提取的场，该模型测量点在表面上的可能性物体的表面。然后，我们将 nerf2nerf 注册作为一种稳健的优化，迭代地寻求对齐两个场景的表面场的刚性转换。我们通过引入预训练的 NeRF 
场景数据集来评估我们的技术的有效性——我们的合成场景可以对经典配准技术进行定量评估和比较，而我们的真实场景则证明了我们的技术在现实场景中的有效性。其他结果位于：此 https 网址\n  - [HyperSound：使用超网络生成音频信号的隐式神经表示](https://arxiv.org/abs/2211.01839) | [code]\n    > 隐式神经表征 (INR) 是一个快速发展的研究领域，它提供了表示多媒体信号的替代方法。 INR 最近的应用包括图像超分辨率、高维信号压缩或 3D 渲染。然而，这些解决方案通常侧重于视觉数据，将它们适应音频领域并非易事。此外，它需要为每个数据样本单独训练模型。为了解决这个限制，我们提出了 HyperSound，这是一种利用超网络为训练时看不见的音频信号生成 INR 的元学习方法。我们表明，我们的方法可以重建声波，其质量可与其他最先进的模型相媲美。\n  - [基于注意力的神经元胞自动机, NeurIPS2022](https://arxiv.org/abs/2211.01233) | [code]\n    > 元胞自动机 (CA) 最近的扩展结合了现代深度学习的关键思想，极大地扩展了它们的能力并催生了一个新的神经元元自动机 (NCA) 技术家族。受基于 Transformer 的架构的启发，我们的工作提出了一类新的基于注意力的 NCA，使用空间局部化但全局组织的自注意力方案形成。我们介绍了此类的一个实例，名为 Vision Transformer Cellular Automata (ViTCA)。我们展示了跨六个基准数据集的去噪自动编码的定量和定性结果，将 ViTCA 与 U-Net、基于 U-Net 的 CA 基线 (UNetCA) 和 Vision Transformer (ViT) 进行了比较。在比较配置为类似参数复杂性的架构时，ViTCA 架构在所有基准测试和几乎每个评估指标上都产生了卓越的性能。我们对 ViTCA 的各种架构配置进行了消融研究，分析了它对细胞状态的影响，并调查了它的归纳偏差。最后，我们通过线性探针在其聚合细胞状态隐藏表示上检查其学习表示，与我们的 U-Net、ViT 和 UNetCA 基线相比，平均产生更好的结果。\n  - [GARF：用于高保真重建和姿态估计的高斯激活辐射场, ECCV2022](https://arxiv.org/abs/2204.05735) | [code]\n    > 尽管神经辐射场 (NeRF) 在现实世界场景的逼真新颖视图合成中显示出令人信服的结果，但大多数现有方法都需要准确的先验相机姿势。尽管存在联合恢复辐射场和相机姿态的方法 (BARF)，但它们依赖于繁琐的从粗到细的辅助位置嵌入来确保良好的性能。我们提出了高斯激活神经辐射场 (GARF)，这是一种新的无位置嵌入神经辐射场架构 - 采用高斯激活 - 在高保真重建和姿态估计方面优于当前最先进的技术。\n  - [使用表面信号参数化学习神经隐式表示](https://arxiv.org/abs/2211.00519) | [code]\n    > 神经隐式表面表示最近已成为显式 3D 对象编码的流行替代方法，例如多边形网格、列表点或体素。虽然重要的工作已经提高了这些表示的几何保真度，但很少有人关注它们的最终外观。传统的显式对象表示通常将 3D 形状数据与辅助表面映射图像数据耦合，例如漫反射颜色纹理和法线贴图中的精细几何细节，通常需要将 3D 表面映射到平面上，即表面参数化;另一方面，由于缺乏可配置的表面参数化，隐式表示不能轻易地进行纹理化。受这种数字内容创作方法的启发，我们设计了一种神经网络架构，该架构隐式编码适合外观数据的底层表面参数化。因此，我们的模型与现有的具有外观数据的基于网格的数字内容保持兼容。受到最近将紧凑网络过度拟合到单个 3D 对象的工作的启发，我们提出了一种新的权重编码神经隐式表示，它扩展了神经隐式表面的能力，以实现纹理映射的各种常见和重要应用。我们的方法优于合理的基线和最先进的替代方法。\n  - [gCoRF：生成合成辐射场, 3DV2022](https://vcai.mpi-inf.mpg.de/projects/gCoRF/) | [code]\n    > 对象的 3D 生成模型可通过 3D 控制实现逼真的图像合成。现有方法将场景建模为全局场景表示，忽略了场景的组成方面。除了支持可概括的 3D 推理之外，组合推理还可以支持各种编辑应用程序。在本文中，我们提出了一个组合生成模型，其中对象的每个语义部分都表示为仅从野外 2D 数据中学习的独立 3D 表示。我们从全局生成模型 (GAN) 开始，学习使用 2D 
分割掩码的监督将其分解为不同的语义部分。然后，我们学习合成独立采样的部分，以创建连贯的全局场景。不同的部分可以独立采样，同时保持物体的其余部分固定。我们在各种对象和部件上评估我们的方法，并演示编辑应用程序。\n  - [深入研究 Radiance Grid 以进行实时视图合成并保留细节, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19784-0_42) | [code]\n    > 神经辐射场 (NeRF) [31] 系列在表示场景和合成高质量新颖视图方面令人印象深刻。然而，大多数以前的作品都无法保留纹理细节并且训练速度慢。最近的一种方法 SNeRG [11] 表明，将经过训练的 NeRF 烘焙为稀疏神经辐射网格可以实现实时视图合成，同时略微降低渲染质量。在本文中，我们深入研究了 Radiance Grid 表示并提出了一系列改进，这些改进共同提高了速度和质量方面的性能。首先，我们提出了一种分层稀疏辐射网格 (HrSRG) 表示，它对信息空间具有更高的体素分辨率，对其他空间具有更少的体素。 HrSRG 利用受 [30, 55] 启发的分层体素网格构建过程，并且可以在不占用过多内存的情况下以高分辨率描述场景。此外，我们表明直接优化体素网格会在渲染图像中产生出奇的好纹理细节。这种直接优化是内存友好的，并且需要比传统 NeRF 少多个数量级的时间，因为它只涉及一个微型 MLP。最后，我们发现阻止精细细节恢复的一个关键因素是由相机姿势错误引起的图像中未对齐的 2D 像素。我们建议使用感知损失来增加对错位的容忍度，从而提高渲染图像的视觉质量。\n## Oct23 - Oct29, 2022\n  - [NeX360：基于神经基础扩展的实时全方位视图合成, TPAMI2022](https://ieeexplore.ieee.org/abstract/document/9931981) | [code]\n    > 我们介绍了 NeX，这是一种基于多平面图像 (MPI) 增强的新颖视图合成的新方法，可以实时再现视图相关的效果。与传统的 MPI 不同，我们的技术将每个像素参数化为从神经网络学习的球形基函数的线性组合，以对视图相关的效果进行建模，并使用混合隐式-显式建模策略来改进精细细节。此外，我们还展示了 NeX 的扩展，它利用知识蒸馏来为无限 360 ∘ 场景训练多个 MPI。我们的方法在几个基准数据集上进行了评估：NeRF-Synthetic 数据集、Light Field 数据集、Real Forward-Facing 数据集、Space 数据集以及 Shiny，我们的新数据集包含更具挑战性的视图相关效果，例如彩虹反射在 CD 上。我们的方法在 PSNR、SSIM 和 LPIPS 上优于其他实时渲染方法，可以实时渲染无界 360 ∘ 场景。\n  - [NeRFPlayer：具有分解神经辐射场的可流式动态场景表示](https://arxiv.org/abs/2210.15947) | [code]\n    > 在 VR 中自由地在真实世界的 4D 时空空间中进行视觉探索一直是一项长期的追求。当仅使用几个甚至单个 RGB 相机来捕捉动态场景时，这项任务特别有吸引力。为此，我们提出了一个能够快速重建、紧凑建模和流式渲染的高效框架。首先，我们建议根据时间特征分解 4D 时空空间。 4D 空间中的点与属于三个类别的概率相关联：静态区域、变形区域和新区域。每个区域都由一个单独的神经场表示和规范化。其次，我们提出了一种基于混合表示的特征流方案，用于有效地对神经场进行建模。我们的方法，创造了 NeRFPlayer，在单手持相机和多相机阵列捕获的动态场景上进行评估，在质量和速度方面实现与最近最先进的方法相当或更优的渲染性能，实现重建每帧 10 秒，实时渲染。\n  - [Vox-Fusion：基于体素的神经隐式表示的密集跟踪和映射](https://arxiv.org/abs/2210.15858) | [***``[code]``***](https://github.com/zju3dv/Vox-Fusion)\n    > 在这项工作中，我们提出了一个名为 Vox-Fusion 
的密集跟踪和映射系统，它将神经隐式表示与传统的体积融合方法无缝融合。我们的方法受到最近开发的隐式映射和定位系统的启发，并进一步扩展了这一思想，使其可以自由应用于实际场景。具体来说，我们利用基于体素的神经隐式表面表示来编码和优化每个体素内的场景。此外，我们采用基于八叉树的结构来划分场景并支持动态扩展，使我们的系统能够像以前的作品一样在不知道环境的情况下跟踪和映射任意场景。此外，我们提出了一个高性能的多进程框架来加速该方法，从而支持一些需要实时性能的应用程序。评估结果表明，我们的方法可以实现比以前的方法更好的准确性和完整性。我们还展示了我们的 Vox-Fusion 可用于增强现实和虚拟现实应用程序。我们的源代码可通过此 https 网址公开获得。\n  - [通过辐射贴图提升点云渲染](https://arxiv.org/abs/2210.15107) | [code]\n    > 近年来，由于其高质量，我们见证了基于 NeRF 的图像渲染的快速发展。然而，点云渲染在某种程度上较少被探索。与遭受密集空间采样的基于 NeRF 的渲染相比，点云渲染自然计算密集度较低，这使其能够部署在移动计算设备中。在这项工作中，我们专注于通过紧凑的模型设计提高点云渲染的图像质量。我们首先分析体绘制公式在点云上的适应性。基于分析，我们将 NeRF 表示简化为空间映射函数，每个像素只需要一次评估。此外，受光线行进的启发，我们将嘈杂的原始点云校正为光线与表面之间的估计交点作为查询坐标，这可以避免空间频率崩溃和邻点干扰。由光栅化、空间映射和细化阶段组成，我们的方法在点云渲染上实现了最先进的性能，以显着的优势优于之前的工作，模型尺寸更小。我们在 NeRF-Synthetic 上获得了 31.74 的 PSNR，在 ScanNet 上获得了 25.88，在 DTU 上获得了 30.81。代码和数据将很快发布。\n  - [用于 3D 视频合成的流式辐射场, NeurIPS2022](https://arxiv.org/abs/2210.14831) | [code]\n    > 我们提出了一种基于显式网格的方法，用于有效地重建流辐射场，用于真实世界动态场景的新视图合成。我们不是训练一个结合所有帧的单一模型，而是用增量学习范式来制定动态建模问题，其中训练每帧模型差异以补充当前帧上基础模型的适应性。通过利用简单而有效的窄带调整策略，所提出的方法实现了一个可行的框架，用于处理高训练效率的动态视频序列。通过使用基于模型差异的压缩，可以显着减少使用显式网格表示引起的存储开销。我们还引入了一种有效的策略来进一步加速每一帧的模型优化。对具有挑战性的视频序列的实验表明，我们的方法能够以具有竞争力的渲染质量实现每帧 15 秒的训练速度，比最先进的隐式方法实现 1000 倍的加速。此 https 网址提供了代码。\n  - [EpipolarNVS：利用对极几何进行单图像新视图合成, BMVC2022](https://arxiv.org/abs/2210.13077) | [code]\n    > 新视图合成 (NVS) 可以通过不同的方法来解决，具体取决于一般设置：单个源图像到短视频序列、精确或嘈杂的相机姿势信息、基于 3D 的信息（如点云等）。最具挑战性的场景，我们在这项工作中所处的场景，只考虑一个独特的源图像来从另一个角度生成一个新颖的图像。然而，在这种棘手的情况下，最新的基于学习的解决方案往往难以集成相机视点转换。事实上，外部信息通常通过低维向量按原样传递。甚至可能会发生这样的情况，当参数化为欧拉角时，这样的相机姿势会通过单热表示进行量化。这种普通的编码选择阻止了学习的架构在连续的基础上（从相机姿势的角度）推断新的视图。我们声称它存在一种优雅的方式来通过利用 3D 相关概念（例如对极约束）更好地编码相对相机姿势。因此，我们引入了一种创新方法，将视点变换编码为 2D 特征图像。这种相机编码策略为网络提供了关于相机如何在两个视图之间的空间中移动的有意义的见解。通过将相机姿势信息编码为有限数量的彩色对极线，我们通过实验证明我们的策略优于普通编码。\n  - [NeRF-SLAM：具有神经辐射场的实时密集单目 SLAM](https://arxiv.org/abs/2210.13641) | [code]\n    > 我们提出了一种新颖的几何和光度学 3D 映射管道，用于从单目图像进行准确和实时的场景重建。为实现这一目标，我们利用了密集单眼 SLAM 和实时分层体积神经辐射场的最新进展。我们的见解是，密集的单目 SLAM 
通过提供准确的姿态估计和具有相关不确定性的深度图，提供正确的信息以实时拟合场景的神经辐射场。通过我们提出的基于不确定性的深度损失，我们不仅实现了良好的光度精度，而且还实现了很高的几何精度。事实上，我们提出的管道实现了比竞争方法更好的几何和光度精度（PSNR 提高了 179%，L1 深度提高了 86%），同时实时工作并且仅使用单目图像。\n  - [Compressing Explicit Voxel Grid Representations：快速的 NeRFs 也变小了](https://arxiv.org/abs/2210.12782) | [code]\n    > 由于其固有的紧凑性，NeRF 彻底改变了逐场景辐射场重建的世界。 NeRF 的主要限制之一是它们在训练和推理时的渲染速度都很慢。最近的研究重点是优化表示场景的显式体素网格 (EVG)，它可以与神经网络配对以学习辐射场。这种方法显着提高了训练和推理时间的速度，但代价是占用大量内存。在这项工作中，我们提出了 Re:NeRF，这是一种专门针对 EVG-NeRF 可压缩性的方法，旨在减少 NeRF 模型的内存存储，同时保持相当的性能。我们在四种流行的基准测试中使用三种不同的 EVG-NeRF 架构对我们的方法进行了基准测试，展示了 Re:NeRF 广泛的可用性和有效性。\n## Oct16 - Oct22, 2022\n  - [将多维天气和气候数据压缩到神经网络中](https://arxiv.org/abs/2210.12538) | [code]\n    > 天气和气候模拟会产生数 PB 的高分辨率数据，研究人员随后会对这些数据进行分析，以了解气候变化或恶劣天气。我们提出了一种压缩这种多维天气和气候数据的新方法：训练基于坐标的神经网络以过度拟合数据，并将生成的参数作为原始基于网格的数据的紧凑表示。虽然压缩比范围从 300 倍到超过 3,000 倍，但我们的方法在加权 RMSE、MAE 方面优于最先进的压缩器 SZ3。它可以忠实地保存重要的大型大气结构，并且不引入人工制品。当使用生成的神经网络作为 790 倍压缩数据加载器来训练 WeatherBench 预测模型时，其 RMSE 增加不到 2%。三个数量级的压缩使高分辨率气候数据的访问民主化，并实现了许多新的研究方向。\n  - [NeARportation：远程实时神经渲染框架, VRST22](https://arxiv.org/abs/2210.12398) | [code]\n    > 虽然逼真外观的呈现在沉浸在增强虚拟环境中起着重要作用，但显示真实物体的逼真外观仍然是一个具有挑战性的问题。摄影测量学的最新发展促进了将真实物体纳入虚拟空间。然而，照片般逼真的摄影测量需要专用的测量环境，并且需要在测量成本和质量之间进行权衡。此外，即使使用逼真的外观测量，渲染质量和帧速率之间也存在权衡。没有任何框架可以解决这些权衡问题并轻松地实时提供照片般逼真的外观。我们的 NeARportation 框架结合了服务器-客户端双向通信和神经渲染来解决这些权衡问题。服务器上的神经渲染接收客户端的头部姿势并生成具有逼真外观再现的新视图图像，并将其流式传输到客户端的显示器上。通过将我们的框架应用于立体显示器，我们确认它可以根据用户的头部运动以每秒 35-40 帧 (fps) 的速度在全高清立体视频上显示高保真外观。\n  - [具有超分辨声向的神经声场分解](https://arxiv.org/abs/2210.12345) | [code]\n    > 声场分解使用来自有限数量麦克风的信号作为输入来预测任意方向的波形。声场分解是下游任务的基础，包括源定位、源分离和空间音频再现。传统的声场分解方法（例如 Ambisonics）具有有限的空间分解分辨率。本文提出了一种基于学习的神经声场分解 (NeSD) 框架，允许使用来自任意位置的几个麦克风的麦克风胶囊的录音进行具有精细空间方向分辨率的声场分解。 NeSD 系统的输入包括麦克风信号、麦克风位置和查询的方向。 NeSD 的输出包括波形和查询位置的存在概率。我们分别用不同的神经网络对 NeSD 系统进行建模，包括全连接、时间延迟和循环神经网络。我们表明，NeSD 系统在语音、音乐和声音事件数据集的声场分解和源定位方面优于传统的 Ambisonics 和 DOANet 方法。此 https URL 提供了演示。\n  - [神经辐射场场景重建探索：合成、真实世界和动态场景](https://arxiv.org/abs/2210.12268) | [code]\n    > 该项目介绍了使用神经辐射场 (NeRF) 方法对合成和真实世界场景进行 3D 
场景重建的探索。我们主要利用神经图形基元多分辨率哈希编码的训练和渲染时间的减少，来重建静态视频游戏场景和现实世界场景，比较和观察重建细节和局限性。此外，我们使用动态场景的神经辐射场 (D-NeRF) 探索动态场景重建。最后，我们扩展了 D-NeRF 的实现，最初仅限于处理合成场景，也可以处理真实世界的动态场景。\n  - [用于学习 3D LiDAR 数据场景先验的生成范围成像, WACV2023](https://arxiv.org/abs/2210.11750) | [code]\n    > 3D LiDAR 传感器对于自主移动机器人的强大视觉是必不可少的。然而，部署基于 LiDAR 的感知算法通常会由于与训练环境的域差距而失败，例如角度分辨率不一致和属性缺失。现有的研究通过学习域间映射解决了这个问题，而可迁移性受到训练配置的限制，并且训练容易受到称为光线下降的特殊有损噪声的影响。为了解决这个问题，本文提出了一种适用于数据级域迁移的 LiDAR 距离图像生成模型。受 LiDAR 测量基于逐点距离成像这一事实的启发，我们训练了一个基于隐式图像表示的生成对抗网络以及可微的射线下落效应。与基于点和基于图像的最先进的生成模型相比，我们展示了我们模型的保真度和多样性。我们还展示了上采样和恢复应用程序。此外，我们介绍了用于 LiDAR 语义分割的 Sim2Real 应用程序。我们证明了我们的方法作为一个逼真的光线滴模拟器是有效的，并且优于最先进的方法。\n  - [HDHumans：高保真数字人类的混合方法](https://arxiv.org/abs/2210.12003) | [code]\n    > 逼真的数字人类头像在图形中非常重要，因为它们可以在全球范围内实现沉浸式通信，改善游戏和娱乐体验，并且对 AR 和 VR 设置特别有益。然而，当前的头像生成方法要么在高保真新视图合成、对新动作的泛化、宽松衣服的再现方面存在不足，要么无法以现代显示器提供的高分辨率渲染角色。为此，我们提出了 HDHumans，这是第一个用于 HD 人物角色合成的方法，它共同产生准确且时间连贯的 3D 变形表面和任意新颖视图和训练时未看到的运动的高度逼真的图像。在技术核心，我们的方法将经典的变形字符模板与神经辐射场 (NeRF) 紧密集成。我们的方法经过精心设计，以实现经典表面变形和 NeRF 之间的协同作用。首先，模板引导 NeRF，它允许合成高度动态和清晰的角色的新视图，甚至可以合成新的动作。其次，我们还利用 NeRF 产生的密集点云通过 3D 到 3D 监督进一步改善变形表面。在合成质量和分辨率以及 3D 表面重建的质量方面，我们在数量和质量上都优于最先进的技术。\n  - [通过多视图未校准光度立体和渐变 SDF 进行高质量 RGB-D 重建, WACV2023](https://arxiv.org/abs/2210.12202) | [code]\n    > 在许多应用中，对精细重建的需求很高。然而，大多数现有的 RGB-D 重建方法依赖于预先计算的准确相机位姿来恢复详细的表面几何形状，其中在优化不同数量时需要调整表面的表示。在本文中，我们提出了一种新颖的基于多视图 RGB-D 的重建方法，该方法通过利用梯度符号距离场 (gradient-SDF) 来处理相机位姿、光照、反照率和表面法线估计。所提出的方法使用特定的基于物理的模型来制定图像渲染过程，并使用其体积表示来优化实际表面上的表面数量，而不是仅估计实际表面附近的表面数量的其他工作。为了验证我们的方法，我们研究了两个用于自然光和点光源应用的基于物理的图像形成模型。在合成数据集和真实世界数据集上的实验结果表明，所提出的方法可以比现有技术更忠实地恢复表面的高质量几何形状，并进一步提高估计相机位姿的准确性。\n  - [从单个图像进行机器人对象操作的神经场, ICRA2023](https://arxiv.org/abs/2210.12126) | [code]\n    > 我们为对象渲染、3D 重建和抓取姿势预测提供了一个统一且紧凑的表示，可以在几秒钟内从单个图像中推断出来。我们通过利用神经辐射场 (NeRF) 文献的最新进展来实现这一点，这些文献学习类别级先验并以最少的数据和时间对新对象进行微调。我们的见解是，我们可以学习紧凑的形状表示并从中提取有意义的附加信息，例如抓取姿势。我们相信这是第一个使用单个视点（仅 RGB）直接从基于 NeRF 的表示中检索抓取姿势的工作，而不是通过辅助网络和/或表示。与现有技术相比，我们的方法小两到三个数量级，同时在视图重建和抓取方面实现了相当的性能。伴随我们的方法，我们还提出了一个新的渲染鞋数据集，用于训练 sim-2-real NeRF 
方法，该方法具有不同宽度的抓手的抓取姿势。\n  - [TANGO：通过光照分解实现文本驱动的真实感和强大的 3D 风格化, NeurIPS2022](https://arxiv.org/abs/2210.11277) | [***``[code]``***](https://cyw-3d.github.io/tango/)\n    > 通过程式化创建 3D 内容是计算机视觉和图形研究中一个有前途但具有挑战性的问题。在这项工作中，我们专注于对任意拓扑的给定表面网格的逼真外观渲染进行风格化。受最近对比语言-图像预训练 (CLIP) 模型的跨模态监督激增的启发，我们提出了 TANGO，它根据文本提示以逼真的方式转移给定 3D 形状的外观风格。从技术上讲，我们建议将外观风格分解为空间变化的双向反射率分布函数、局部几何变化和照明条件，通过基于球形高斯的可微分渲染器通过监督 CLIP 损失来共同优化它们。因此，TANGO 通过自动预测反射效果来实现逼真的 3D 风格转换，即使是对于裸露的、低质量的网格，也无需对特定任务的数据集进行培训。大量实验表明，TANGO 在逼真的质量、3D 几何的一致性和对低质量网格进行样式化时的鲁棒性方面优于现有的文本驱动 3D 样式转换方法。我们的代码和结果可在我们的项目网页 https URL 上找到。\n  - [坐标并不孤单——码本先验有助于隐式神经 3D 表示, NeurIPS2022](https://arxiv.org/abs/2210.11170) | [code]\n    > 隐式神经 3D 表示在表面或场景重建和新颖的视图合成中取得了令人印象深刻的结果，这通常使用基于坐标的多层感知器 (MLP) 来学习连续的场景表示。然而，现有的方法，例如神经辐射场 (NeRF) 及其变体，通常需要密集的输入视图（即 50-150）才能获得不错的结果。为了重温对大量校准图像的过度依赖并丰富基于坐标的特征表示，我们探索将先验信息注入基于坐标的网络，并引入一种新颖的基于坐标的模型 CoCo-INR，用于隐式神经 3D 表示。我们方法的核心是两个注意力模块：码本注意力和坐标注意力。前者从先验码本中提取包含丰富几何和外观信息的有用原型，后者将这些先验信息传播到每个坐标中，并丰富其对场景或物体表面的特征表示。在先验信息的帮助下，与使用较少可用校准图像的当前方法相比，我们的方法可以渲染具有更逼真外观和几何形状的 3D 视图。在包括 DTU 和 BlendedMVS 在内的各种场景重建数据集以及完整的 3D 头部重建数据集 H3DS 上的实验证明了我们提出的方法在较少输入视图下的鲁棒性和精细的细节保留能力。\n  - [用于鲁棒姿态估计的神经辐射场的并行反演, ICRA2023](https://arxiv.org/abs/2210.10108) | [code]\n    > 我们提出了一种基于快速神经辐射场 (NeRF) 的并行优化方法，用于估计 6-DoF 目标姿势。给定单个观察到的目标 RGB 图像，我们可以通过最小化从快速 NeRF 模型渲染的像素与观察图像中的像素之间的残差来预测相机的平移和旋转。我们将基于动量的相机外部优化程序集成到 Instant Neural Graphics Primitives 中，这是最近异常快速的 NeRF 实现。通过在姿态估计任务中引入并行蒙特卡罗采样，我们的方法克服了局部最小值并在更广泛的搜索空间中提高了效率。我们还展示了采用更强大的基于像素的损失函数来减少错误的重要性。实验表明，我们的方法可以在合成和真实世界的基准测试中实现改进的泛化性和鲁棒性。\n  - [神经接触场：使用触觉感应跟踪外部接触](https://arxiv.org/abs/2210.09297) | [code]\n    > 我们提出了神经接触场，一种将神经场和触觉传感结合在一起的方法，以解决跟踪对象与环境之间的外部接触的问题。了解外部接触发生在哪里是迈向可以主动控制它以促进下游操作任务的方法的第一步。用于定位环境接触的先前工作通常假定接触类型（例如点或线），不捕获接触/非接触过渡，并且仅适用于基本几何形状的对象。神经接触场是第一种无需对接触类型做出任何假设即可跟踪任意多模态外部接触的方法。我们的主要见解是估计物体形状潜在空间中任何 3D 点的接触概率，给定基于视觉的触觉输入，该输入感知外部接触引起的局部运动。在实验中，我们发现神经接触场能够定位多个接触块，而无需对接触的几何形状做出任何假设，并在看不见的环境配置中捕获具有看不见的形状的已知类别对象的接触/非接触转换。除了神经接触场之外，我们还发布了模拟外部接触交互的 YCB-Extrinsic-Contact 
数据集，以便在该领域进行进一步研究。项目存储库：此 https 网址\n  - [S3-NeRF：单一视点下阴影和阴影的神经反射场, NeurIPS2022](https://arxiv.org/abs/2210.08936) | [***``[code]``***](https://github.com/ywq/s3nerf)\n    > 在本文中，我们解决了多视图场景重建的“双重问题”，其中我们利用在不同点光源下捕获的单视图图像来学习神经场景表示。与只能恢复 2.5D 场景表示（即可见表面的法线/深度图）的现有单视图方法不同，我们的方法学习神经反射场来表示场景的 3D 几何和 BRDF。我们的方法不依赖于多视图照片一致性，而是利用两个信息丰富的单目线索，即阴影和阴影来推断场景几何。对多个具有挑战性的数据集的实验表明，我们的方法能够从单视图图像中恢复场景的 3D 几何图形，包括可见和不可见部分。由于神经反射场表示，我们的方法对深度不连续性具有鲁棒性。它支持新视图合成和重新照明等应用程序。我们的代码和模型可以在这个 https URL 上找到。\n  - [动力学增强神经对象的微分物理模拟](https://arxiv.org/abs/2210.09420) | [code]\n    > 我们提出了一种可微分管道，用于模拟将其几何形状表示为参数化为深度网络的连续密度场的对象的运动。这包括神经辐射场 (NeRFs) 和其他相关模型。从密度场，我们估计物体的动力学特性，包括它的质量、质心和惯性矩阵。然后，我们引入了一种基于密度场的可微接触模型，用于计算碰撞产生的法向力和摩擦力。这允许机器人从运动物体的静止图像和视频中自主构建视觉和动态准确的物体模型。生成的动态增强神经对象 (DANO) 使用现有的可微分模拟引擎 Dojo 进行模拟，并与其他标准模拟对象（例如指定为 URDF 的球体、平面和机器人）交互。机器人可以使用这种模拟来优化神经物体的抓取和操纵轨迹，或者通过基于梯度的真实到模拟传输来改进神经物体模型。我们演示了从肥皂在桌子上滑动的真实视频中学习一块肥皂的摩擦系数的管道。我们还通过从合成数据中与熊猫机器人手臂的交互来了解斯坦福兔子的摩擦系数和质量，并在模拟中优化熊猫手臂的轨迹，以将兔子推到目标位置。\n## Oct9 - Oct15, 2022\n  - [LB-NERF：用于透明介质的光弯曲神经辐射场, ICIP2022](https://ieeexplore.ieee.org/abstract/document/9897642) | [code]\n    > 神经辐射场 (NeRFs) 已被提出作为新颖的视图合成方法，并且由于其多功能性已被用于解决各种问题。 NeRF 可以使用假设直线光路的神经渲染来表示 3D 空间中的颜色和密度。但是，场景中具有不同折射率的介质，例如透明介质，会引起光的折射，打破了光路直线的假设。因此，不能在多视图图像中一致地学习 NeRF。为了解决这个问题，本研究提出了一种方法，通过引入光折射效应作为与源自相机中心的直线的偏移量来学习跨多个视点的一致辐射场。实验结果定量和定性地验证了在考虑透明物体的折射时，我们的方法可以比传统的 NeRF 方法更好地插入视点。\n  - [IBL-NeRF：基于图像的神经辐射场照明公式](https://arxiv.org/abs/2210.08202) | [code]\n    > 我们提出了 IBL-NeRF，它将大规模室内场景的神经辐射场 (NeRF) 分解为内在成分。以前的 NeRF 逆向渲染方法转换隐式体积以适应显式几何的渲染管道，并使用环境照明近似分割、孤立对象的视图。相比之下，我们的逆渲染扩展了原始的 NeRF 公式，以捕捉场景体积内照明的空间变化，以及表面属性。具体来说，将不同材质的场景分解为基于图像的渲染的内在组件，即反照率、粗糙度、表面法线、辐照度和预过滤辐射度。所有组件都被推断为来自 MLP 的神经图像，可以对大规模的一般场景进行建模。通过采用基于图像的 NeRF 公式，我们的方法继承了合成图像的卓越视觉质量和多视图一致性。我们展示了在具有复杂对象布局和灯光配置的场景上的性能，这些在以前的任何作品中都无法处理。\n  - [ExAug：通过几何经验增强的机器人条件导航策略](https://arxiv.org/abs/2210.07450) | [code]\n    > 机器学习技术依赖于庞大而多样的数据集进行泛化。计算机视觉、自然语言处理和其他应用程序通常可以重用公共数据集来训练许多不同的模型。然而，由于物理配置的差异，利用公共数据集在新机器人平台上训练机器人控制策略或执行新任务具有挑战性。在这项工作中，我们提出了一个新颖的框架 
ExAug，以从不同环境中的多个数据集中增强不同机器人平台的体验。 ExAug 利用了一个简单的原理：通过以点云的形式提取 3D 信息，我们可以创建更复杂和结构化的增强，利用生成合成图像和几何感知惩罚，这在相同情况下适用于不同的机器人，具有不同的尺寸、转弯半径和摄像头位置。在有障碍物的室内和室外环境中，在两个带有三个不同摄像头的新机器人平台上评估训练后的策略。\n  - [基于显着性感知动态路由策略的遥感图像轻量级无级超分辨率](https://arxiv.org/abs/2210.07598) | [***``[code]``***](https://github.com/hanlinwu/SalDRN)\n    > 基于深度学习的算法极大地提高了遥感图像（RSI）超分辨率（SR）的性能。然而，增加网络深度和参数会导致计算和存储的巨大负担。直接减少现有模型的深度或宽度会导致性能大幅下降。我们观察到，一个 RSI 中不同区域的 SR 难度差异很大，现有方法使用相同的深度网络处理图像中的所有区域，造成计算资源的浪费。此外，现有的 SR 方法通常预先定义整数尺度因子，不能进行无级 SR，即单个模型可以处理任何潜在的尺度因子。在每个比例因子上重新训练模型会浪费大量的计算资源和模型存储空间。为了解决上述问题，我们提出了一种显着性感知动态路由网络（SalDRN），用于 RSI 的轻量级和无级 SR。首先，我们引入视觉显着性作为区域级 SR 难度的指标，并将轻量级显着性检测器集成到 SalDRN 中以捕获像素级视觉特征。然后，我们设计了一种显着性感知动态路由策略，该策略采用路径选择开关根据子图像块的 SR 难度自适应地选择适当深度的特征提取路径。最后，我们提出了一种新颖的轻量级无级上采样模块，其核心是隐式特征函数，用于实现从低分辨率特征空间到高分辨率特征空间的映射。综合实验验证，SalDRN 可以在性能和复杂性之间取得良好的折衷。代码位于 \\url{this https URL}。\n  - [NOCaL：里程计和相机内在学的免校准半监督学习](https://arxiv.org/abs/2210.07435) | [code]\n    > 有许多新兴的成像技术可以使机器人技术受益。然而，对定制模型、校准和低级处理的需求是它们采用的主要障碍。在这项工作中，我们展示了 NOCaL、神经里程计和使用光场的校准，这是一种半监督学习架构，能够在没有校准的情况下解释以前看不见的相机。 NOCaL 学习估计相机参数、相对姿势和场景外观。它采用在大量现有摄像机和场景上预训练的场景渲染超网络，并使用小型监督训练集来适应以前看不见的摄像机来强制度量尺度。我们使用传统相机在渲染和捕获的图像上演示 NOCaL，演示免校准里程计和新颖的视图合成。这项工作是朝着自动解释一般相机几何形状和新兴成像技术迈出的关键一步。\n  - [重新审视多视图光度立体, WACV2023](https://arxiv.org/abs/2210.07670) | [code]\n    > 多视图光度立体 (MVPS) 是从图像中详细和精确地 3D 采集对象的首选方法。尽管 MVPS 的流行方法可以提供出色的结果，但它们通常执行起来很复杂，并且仅限于各向同性的材料对象。为了解决这些限制，我们提出了一种简单实用的 MVPS 方法，该方法适用于各向同性以及其他对象材料类型，例如各向异性和光泽。本文提出的方法利用深度神经网络中不确定性建模的优势，实现光度立体 (PS) 和多视图立体 (MVS) 网络预测的可靠融合。然而，与最近提出的最先进技术相反，我们引入了神经体积渲染方法，用于可靠地融合 MVS 和 PS 测量。引入神经体绘制的优势在于它有助于对具有不同材料类型的对象进行可靠建模，而现有的 MVS 方法、PS 方法或两者都可能失败。此外，它允许我们处理神经 3D 形状表示，最近在许多几何处理任务中显示出出色的结果。我们建议的新损失函数旨在使用最确定的 MVS 和 PS 网络预测以及加权神经体积渲染成本来拟合隐式神经函数的零水平集。当在几个基准数据集上进行广泛测试时，所提出的方法显示了最先进的结果。\n  - [通过隐式神经表示的测试时间训练实现可控风格迁移](https://arxiv.org/abs/2210.07762) | [code]\n    > 我们提出了一个基于隐式神经表示的可控风格迁移框架，该框架通过测试时训练以像素方式控制风格化输出。与传统的图像优化方法经常遇到不稳定的收敛和需要密集训练且泛化能力有限的基于学习的方法不同，我们提出了一个模型优化框架，该框架在测试时通过显式损失函数来优化神经网络以进行风格迁移。在经过一次测试时间训练后，由于基于 INR 
的模型的灵活性，我们的框架可以以像素方式精确控制风格化图像，并自由调整图像分辨率，无需进一步优化或训练。我们演示了几个应用程序。\n  - [具有可学习位置特征的可扩展神经视频表示, NeurIPS2022](https://arxiv.org/abs/2210.06823) | [***``[code]``***](https://github.com/subin-kim-cv/NVP)\n    > 使用基于坐标的神经表示 (CNR) 的复杂信号的简洁表示已经取得了很大进展，最近的几项工作集中在扩展它们以处理视频。在这里，主要挑战是如何（a）减轻训练 CNR 时的计算效率低下，以（b）实现高质量的视频编码，同时（c）保持参数效率。为了同时满足 (a)、(b) 和 (c) 的所有要求，我们提出了具有可学习位置特征 (NVP) 的神经视频表示，这是一种新颖的 CNR，通过引入“可学习位置特征”可以有效地将视频摊销为潜在代码。具体来说，我们首先提出了一种基于设计 2D 潜在关键帧的 CNR 架构，以学习每个时空轴上的常见视频内容，这极大地改善了所有这三个要求。然后，我们建议利用现有强大的图像和视频编解码器作为潜在代码的计算/内存高效压缩过程。我们展示了 NVP 在流行的 UVG 基准上的优越性；与现有技术相比，NVP 不仅训练速度快 2 倍（不到 5 分钟），而且编码质量也超过了 34.07→34.57（用 PSNR 指标衡量），即使使用的参数减少了 8 倍以上。我们还展示了 NVP 的有趣属性，例如视频修复、视频帧插值等。\n  - [NeuralRoom：用于室内场景重建的几何约束神经隐式表面](https://arxiv.org/abs/2210.06853) | [code]\n    > 我们提出了一种称为 NeuralRoom 的新型神经表面重建方法，用于直接从一组 2D 图像重建房间大小的室内场景。最近，由于其高质量的结果和简单性，隐式神经表示已成为从多视图图像重建表面的有前途的方法。然而，隐式神经表示通常不能很好地重建室内场景，因为它们存在严重的形状-辐射度模糊性。我们假设室内场景由纹理丰富和平坦的无纹理区域组成。在纹理丰富的区域，多视图立体可以获得准确的结果。在平坦区域，正态估计网络通常能获得较好的正态估计。基于上述观察，我们通过可靠的几何先验来减少隐式神经表面可能的空间变化范围，以减轻形状-辐射度的模糊性。具体来说，我们使用多视图立体结果来限制 NeuralRoom 优化空间，然后使用可靠的几何先验来指导 NeuralRoom 训练。然后，NeuralRoom 将生成一个神经场景表示，该表示可以渲染与输入训练图像一致的图像。此外，我们提出了一种称为扰动残差限制的平滑方法来提高平坦区域的准确性和完整性，该方法假设局部表面中的采样点应该与观测中心具有相同的法线和相似的距离。在 ScanNet 数据集上的实验表明，我们的方法可以重建室内场景的无纹理区域，同时保持细节的准确性。我们还将 NeuralRoom 应用于更高级的多视图重建算法，并显着提高了它们的重建质量。\n  - [CUF：连续上采样滤波器](https://arxiv.org/abs/2210.06965) | [code]\n    > 神经领域已迅速被用于表示 3D 信号，但它们在更经典的 2D 图像处理中的应用相对有限。在本文中，我们考虑了图像处理中最重要的操作之一：上采样。在深度学习中，可学习的上采样层已广泛用于单图像超分辨率。我们建议将上采样内核参数化为神经域。这种参数化导致了一个紧凑的架构，与竞争的任意尺度超分辨率架构相比，参数数量减少了 40 倍。当对大小为 256x256 的图像进行上采样时，我们表明我们的架构比竞争的任意尺度超分辨率架构效率高 2x-10 倍，并且在实例化为单尺度模型时比亚像素卷积更有效。在一般情况下，这些增益随目标规模的平方呈多项式增长。我们在标准基准上验证了我们的方法，表明可以在不牺牲超分辨率性能的情况下实现这种效率提升。\n  - [GeoAug：具有几何约束的 Few-Shot NeRF 的数据增强, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19790-1_20) | [code]\n    > 神经辐射场 (NeRF) 通过学习仅具有姿势 RGB 图像的隐式体积表示，显示出渲染特定场景新视图的非凡能力。尽管 NeRF 令人印象深刻且简单，但在训练图像很少的情况下，它通常会收敛到几何不正确的次优解决方案。我们在此提出 GeoAug：一种用于 NeRF 的数据增强方法，它丰富了基于多视图几何约束的训练数据。 GeoAug 
提供用于训练的随机人工（新姿势、RGB 图像）对，其中 RGB 图像来自附近的训练视图。新姿势的渲染被扭曲到具有深度图和相对姿势的附近训练视图，以匹配 RGB 图像监督。我们的方法通过在训练期间引入更多数据来降低过度拟合的风险，同时还为深度图提供了额外的隐式监督。在实验中，我们的方法显着提高了以少量训练视图为条件的神经辐射场的性能。\n  - [逼真的神经域随机化, ECCV2022](https://link.springer.com/chapter/10.1007/978-3-031-19806-9_18) | [code]\n    > 合成数据是人工监督的可扩展替代方案，但它需要克服模拟到真实领域的差距。虚拟世界和现实世界之间的这种差异可以通过两种看似相反的方法来解决：提高模拟的真实性或完全通过域随机化来超越真实性。在本文中，我们展示了神经渲染方面的最新进展实现了一种新的统一方法，我们称之为逼真的神经域随机化 (PNDR)。我们建议学习神经网络的组合，它充当基于物理的光线追踪器，仅从场景几何中生成高质量的渲染。我们的方法是模块化的，由用于材料、照明和渲染的不同神经网络组成，因此可以在可微的管道中随机化不同的关键图像生成组件。一旦经过训练，我们的方法可以与其他方法相结合，用于在线生成照片般逼真的图像增强，并且比通过传统的光线追踪更有效。我们通过两个下游任务证明了 PNDR 的有用性：6D 对象检测和单目深度估计。我们的实验表明，使用 PNDR 进行训练可以泛化到新场景，并且在现实世界传输方面明显优于现有技术。\n  - [AniFaceGAN：用于视频头像的动画 3D 感知人脸图像生成, NeurIPS2022](https://arxiv.org/abs/2210.06465) | [***``[code]``***](https://yuewuhkust.github.io/AniFaceGAN/files/github_icon.jpeg)\n    > 尽管 2D 生成模型在人脸图像生成和动画方面取得了长足进步，但它们在从不同相机视点渲染图像时经常会遇到不希望的伪影，例如 3D 不一致。这可以防止他们合成与真实动画无法区分的视频动画。最近，3D 感知 GAN 扩展了 2D GAN，通过利用 3D 场景表示来明确解开相机姿势。这些方法可以很好地保持生成图像在不同视图中的 3D 一致性，但它们无法实现对其他属性的细粒度控制，其中面部表情控制可以说是面部动画最有用和最理想的方法。在本文中，我们提出了一种可动画的 3D 感知 GAN，用于多视图一致的人脸动画生成。关键思想是将 3D-aware GAN 的 3D 表示分解为模板字段和变形字段，其中前者用规范表达式表示不同的身份，后者表征每个身份的表达变化。为了通过变形实现对面部表情的有意义的控制，我们在 3D 感知 GAN 的对抗训练期间提出了生成器和参数 3D 面部模型之间的 3D 级模仿学习方案。这有助于我们的方法实现具有强烈视觉 3D 一致性的高质量动画人脸图像生成，即使仅使用非结构化 2D 图像进行训练。广泛的实验证明了我们优于以前的工作的性能。项目页面：此 https 网址\n  - [从单目视频中重建个性化语义面部 NeRF 模型, SIGGRAPH-Asia2022](https://arxiv.org/abs/2210.06108) | [***``[code]``***](https://github.com/USTC3DV/NeRFBlendShape-code)\n    > 我们提出了一种用神经辐射场定义的人头语义模型。 3D 一致的头部模型由一组解耦和可解释的基础组成，并且可以由低维表达系数驱动。由于神经辐射场强大的表示能力，所构建的模型可以表示复杂的面部属性，包括头发、着装等，这些属性是传统网格混合形状无法表示的。为了构建个性化的语义面部模型，我们建议将基础定义为几个多级体素字段。以短的单目 RGB 视频作为输入，我们的方法可以在 10 到 20 分钟内构建主体的语义面部 NeRF 模型，并且可以在给定的表情系数和视图方向下在数十毫秒内渲染出照片般逼真的人头图像。通过这种新颖的表示，我们将其应用于面部重定向和表情编辑等许多任务。实验结果证明了其强大的表示能力和训练/推理速度。我们的项目页面中提供了演示视频和发布的代码：此 https 网址\n  - [LION：用于 3D 形状生成的潜在点扩散模型, NeurIPS2022](https://arxiv.org/abs/2210.06978) | [***``[code]``***](https://nv-tlabs.github.io/LION)\n    > 去噪扩散模型 (DDM) 在 
3D 点云合成中显示出可喜的结果。为了推进 3D DDM 并使它们对数字艺术家有用，我们需要 (i) 高生成质量，(ii) 操作和应用的灵活性，例如条件合成和形状插值，以及 (iii) 输出平滑表面或网格的能力。为此，我们介绍了用于 3D 形状生成的分层潜在点扩散模型 (LION)。 LION 被设置为具有分层潜在空间的变分自动编码器 (VAE)，该分层潜在空间将全局形状潜在表示与点结构潜在空间相结合。对于生成，我们在这些潜在空间中训练两个分层 DDM。与直接在点云上运行的 DDM 相比，分层 VAE 方法提高了性能，而点结构的潜在模型仍然非常适合基于 DDM 的建模。在实验上，LION 在多个 ShapeNet 基准上实现了最先进的生成性能。此外，我们的 VAE 框架使我们能够轻松地将 LION 用于不同的相关任务：LION 在多模态形状去噪和体素条件合成方面表现出色，并且可以适应文本和图像驱动的 3D 生成。我们还演示了形状自动编码和潜在形状插值，并使用现代表面重建技术增强了 LION 以生成平滑的 3D 网格。我们希望 LION 凭借其高质量的生成、灵活性和表面重建功能，为处理 3D 形状的艺术家提供强大的工具。项目页面和代码：此 https 网址。\n  - [GraspNeRF：使用通用 NeRF 对透明和镜面物体进行基于多视图的 6-DoF 抓取检测](https://arxiv.org/abs/2210.06575) | [code]\n    > 在这项工作中，我们解决了透明和镜面物体的 6 自由度抓取检测问题，这是基于视觉的机器人系统中一个重要但具有挑战性的问题，因为深度相机无法感知其几何形状。我们首次提出了一种基于多视图 RGB 的 6 自由度抓取检测网络 GraspNeRF，该网络利用可泛化的神经辐射场 (NeRF) 在杂乱中实现与材料无关的物体抓取。与现有的基于 NeRF 的 3-DoF 抓取检测方法依赖于密集捕获的输入图像和耗时的每个场景优化相比，我们的系统可以使用稀疏 RGB 输入执行零样本 NeRF 构建并可靠地检测 6-DoF 抓取，两者都是实时的。所提出的框架以端到端的方式联合学习可泛化的 NeRF 和抓取检测，优化抓取的场景表示结构。对于训练数据，我们生成了一个大规模逼真的域随机合成数据集，用于在杂乱的桌面场景中抓取，从而可以直接转移到现实世界。我们在合成和现实世界环境中的广泛实验表明，我们的方法在所有实验中显着优于所有基线，同时保持实时。\n  - [X-NeRF：多场景 360 的显式神经辐射场∘ RGB-D 视图不足, WACV2023](https://arxiv.org/abs/2210.05135) | [***``[code]``***](https://github.com/HaoyiZhu/XNeRF)\n    > 神经辐射场 (NeRFs) 尽管在新颖的视图合成方面表现出色，但通常需要密集的输入视图。许多论文分别为每个场景训练一个模型，很少有人探索将多模态数据纳入这个问题。在本文中，我们关注一个很少讨论但很重要的设置：我们能否训练一个模型来表示多个场景、360∘ 视图和 RGB-D 图像不足？我们将不足的视图称为少数极其稀疏且几乎不重叠的视图。为了解决这个问题，提出了一种完全显式的方法 X-NeRF，它学习一般的场景完成过程而不是基于坐标的映射。给定一些不足的 RGB-D 输入视图，X-NeRF 首先将它们转换为稀疏点云张量，然后应用 3D 稀疏生成卷积神经网络 (CNN) 将其完成到可以快速进行体积渲染的显式辐射场在推理期间不运行网络。为了避免过度拟合，除了常见的渲染损失之外，我们还应用了感知损失以及通过点云上的随机旋转来增强视图。在我们的环境中，所提出的方法显着优于以前的隐式方法，表明所提出的问题和方法的巨大潜力。此 https 网址提供了代码和数据。\n  - [具有动态学习神经隐式表示的多对象导航](https://arxiv.org/abs/2210.05129) | [code]\n    > 理解和映射新环境是任何自主导航代理的核心能力。虽然经典机器人通常使用 SLAM 变体以独立的方式估计地图，这些变体保持拓扑或度量表示，但导航的端到端学习在神经网络中保留了某种形式的记忆。网络通常充满归纳偏差，其范围从矢量表示到鸟瞰度量张量或拓扑结构。在这项工作中，我们建议构建具有两个神经隐式表示的神经网络，它们在每一集期间动态学习并映射场景的内容：（i）语义查找器预测先前看到的查询对象的位置； (ii) Occupancy and Exploration Implicit Representation 
封装了有关探索区域和障碍物的信息，并使用一种新颖的全局读取机制进行查询，该机制直接从函数空间映射到可用的嵌入空间。这两种表示都由经过强化学习 (RL) 训练的代理利用，并在每一集期间在线学习。我们评估了多对象导航上的代理，并展示了使用神经隐式表示作为记忆源的巨大影响。\n  - [CLIP-Fields：机器人记忆的弱监督语义场](https://mahis.life/clip-fields/) | [code]\n    > 我们提出了 CLIP-Fields，这是一种隐式场景模型，可以在没有直接人工监督的情况下进行训练。该模型学习从空间位置到语义嵌入向量的映射。然后，该映射可用于各种任务，例如分割、实例识别、空间语义搜索和视图定位。最重要的是，映射可以通过仅来自网络图像和网络文本训练模型（如 CLIP、Detic 和 Sentence-BERT）的监督进行训练。与 Mask-RCNN 之类的基线相比，我们的方法在 HM3D 数据集上的少量实例识别或语义分割方面表现优于仅一小部分示例。最后，我们展示了使用 CLIP-Fields 作为场景记忆，机器人可以在现实环境中执行语义导航。我们的代码和演示可在此处获得：https://mahis.life/clip-fields/\n  - [神经形状变形先验, NeurIPS2022](https://arxiv.org/abs/2210.05616) | [code]\n    > 我们提出了神经形状变形先验，这是一种新的形状操作方法，可以根据用户提供的手柄运动来预测非刚性物体的网格变形。最先进的方法将此问题视为优化任务，其中输入源网格被迭代变形以根据手工制作的正则化器（如 ARAP）最小化目标函数。在这项工作中，我们基于形状的基本几何特性来学习变形行为，同时利用包含各种非刚性变形的大规模数据集。具体来说，给定源网格和描述部分表面变形的手柄的所需目标位置，我们预测在 3D 空间中定义的连续变形场以描述空间变形。为此，我们引入了基于变压器的变形网络，将形状变形表示为局部表面变形的组合。它学习一组锚定在 3D 空间中的局部潜在代码，从中我们可以学习一组局部表面的连续变形函数。我们的方法可以应用于具有挑战性的变形，并且可以很好地推广到看不见的变形。我们使用 DeformingThing4D 数据集在实验中验证了我们的方法，并与经典的基于优化的方法和最近的基于神经网络的方法进行了比较。\n  - [动态人脸合成的可控辐射场, 3DV2022](https://arxiv.org/abs/2210.05825) | [code]\n    > 最近关于 3D 感知图像合成的工作利用神经渲染的进步取得了令人瞩目的成果。然而，面部动态的 3D 感知合成并没有受到太多关注。在这里，我们研究如何明确控制表现出非刚性运动（例如，面部表情变化）的面部动力学的生成模型合成，同时确保 3D 感知。为此，我们提出了一种可控辐射场（CoRF）：1）通过在基于样式的生成器的分层潜在运动空间中嵌入运动特征来实现运动控制； 2）为了确保背景、运动特征和特定主题属性（如光照、纹理、形状、反照率和身份）的一致性，结合了人脸解析网络、头部回归器和身份编码器。在头部图像/视频数据上，我们表明 CoRF 具有 3D 感知能力，同时能够编辑身份、查看方向和运动。\n  - [神经过程的连续条件视频合成](https://arxiv.org/abs/2210.05810) | [***``[code]``***](https://github.com/NPVS/NPVS)\n    > 我们为多个条件视频合成任务提出了一个统一模型，包括视频预测和视频帧插值。我们表明，条件视频合成可以表述为一个神经过程，它将输入时空坐标映射到给定上下文时空坐标和像素值的目标像素值。具体来说，我们将坐标的隐式神经表示馈送到基于 Transformer 的非自回归条件视频合成模型中。我们的任务特定模型优于以前在多个数据集上进行视频插值的工作，并与最先进的视频预测模型具有竞争力的性能。重要的是，该模型能够以任意高帧速率进行插值或预测，即连续合成。我们的源代码可在此 https 网址上找到。\n  - [SiNeRF：用于联合姿势估计和场景重建的正弦神经辐射场, BMVC2022](https://arxiv.org/abs/2210.04553) | [***``[code]``***](https://github.com/yitongx/sinerf)\n    > NeRFmm 是处理联合优化任务的神经辐射场 (NeRF)，即同时重建真实场景和注册相机参数。尽管 NeRFmm 
产生了精确的场景合成和姿势估计，但它仍然难以在具有挑战性的场景中超越全注释基线。在这项工作中，我们发现联合优化中存在系统的次优性，并进一步确定了它的多个潜在来源。为了减少潜在源的影响，我们提出了利用正弦激活进行辐射映射的正弦神经辐射场 (SiNeRF) 和用于有效选择射线批次的新型混合区域采样 (MRS)。定量和定性的结果表明，与NeRFmm相比，SiNeRF在图像合成质量和姿态估计精度方面实现了全面的显着提升。此 https 网址提供了代码。\n  - [NerfAcc：一个通用的 NeRF 加速工具箱](https://arxiv.org/abs/2210.04847) | [***``[code]``***](https://github.com/KAIR-BAIR/nerfacc)\n    > 我们提出了 NerfAcc，一个用于高效体积渲染辐射场的工具箱。我们以 Instant-NGP 中提出的技术为基础，并将这些技术扩展为不仅支持有界静态场景，还支持动态场景和无界场景。 NerfAcc 带有一个用户友好的 Python API，并为大多数 NeRF 的即插即用加速做好了准备。提供了各种示例来展示如何使用此工具箱。可在此处找到代码：此 https 网址。\n  - [通过神经渲染在静态视频中进行自我监督的 3D 人体姿态估计](https://arxiv.org/abs/2210.04514) | [code]\n    > 从 2D 图像推断 3D 人体姿势是计算机视觉领域中一个具有挑战性且长期存在的问题，具有许多应用，包括运动和医学的运动捕捉、虚拟现实、监视或步态分析。我们提供了一种从包含单个人和静态背景的 2D 视频中估计 3D 姿势的方法的初步结果，而无需任何手动地标注释。我们通过制定一个简单而有效的自我监督任务来实现这一点：我们的模型需要重建视频的随机帧，给定来自另一个时间点的帧和变换后的人体形状模板的渲染图像。对于优化至关重要，我们基于光线投射的渲染管道是完全可区分的，能够仅基于重建任务进行端到端训练。\n  - [MVSPlenOctree：从多视图立体中快速和通用地重建 PlenOctree 中的辐射场, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547795) | [code]\n    > 我们提出了 MVSPlenOctree，这是一种新方法，可以有效地重建辐射场以进行视图合成。与以前特定场景的辐射场重建方法不同，我们提出了一个通用管道，可以通过从数十个稀疏展开的图像中进行多视图立体 (MVS) 推断来有效地重建 360 度可渲染的辐射场。我们的方法利用基于方差的统计特征进行 MVS 推理，并将其与基于图像的渲染和体积渲染相结合以进行辐射场重建。我们首先训练一个 MVS 机器来推理场景的密度和外观。然后，基于 PlenOctree 的空间层次结构和从粗到细的密集采样机制，我们设计了一种鲁棒高效的 PlenOctree 重建采样策略，可以鲁棒地处理遮挡。一个 360 度可渲染的辐射场可以在 MVS Machine 的 PlenOctree 中以有效的单次前向传递进行重建。我们在真实世界的 DTU、LLFF 数据集和合成数据集上训练了我们的方法。我们通过评估在训练中看不到的 DTU 数据集的测试集来验证其普遍性。总之，我们的辐射场重建方法既高效又通用，可以在几秒钟内重建一个粗略的 360 度可渲染辐射场，在几分钟内重建一个密集的辐射场。更多详情请访问项目页面：https://derry-xing.github.io/projects/MVSPlenOctree。\n  - [ParseMVS：学习用于稀疏多视图立体视觉的原始感知表面表示, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547920) | [code]\n    > 多视图立体视觉 (MVS) 通过从密集采样的图像中找到密集的照片一致对应关系来恢复 3D 表面。在本文中，我们从稀疏采样的视图（最多减少一个数量级的图像）解决具有挑战性的 MVS 任务，这在应用程序中更实用且更具成本效益。主要挑战来自严重遮挡和高度倾斜的补丁引入的显着对应模糊性。另一方面，这种模糊性可以通过结合来自全局结构的几何线索来解决。有鉴于此，我们提出 ParseMVS，通过学习原始感知表面表示（Primitive-AwaRe Surface rEpresentation）来提升稀疏 
MVS。特别是，除了了解全局结构之外，我们的新颖表示还允许保留精细细节，包括几何、纹理和可见性。更具体地说，整个场景被解析为多个几何图元。在它们中的每一个上，几何定义为沿基元法线方向的位移，以及沿每个视图方向的纹理和可见性。一个无监督的神经网络被训练来通过逐渐增加所有输入图像之间的照片一致性和渲染一致性来学习这些因素。由于表面属性在每个图元的 2D 空间中局部更改，ParseMVS 可以在优化局部细节的同时保留全局图元结构，处理“不完整”和“不准确”问题。我们通过实验证明，在不同的采样稀疏度下，尤其是在极端稀疏的 MVS 设置下，ParseMVS 在完整性和总体得分方面始终优于最先进的表面重建方法。除此之外，ParseMVS 在压缩、鲁棒性和效率方面也显示出巨大的潜力。\n  - [通过相邻几何引导体积完成的自监督多视图立体, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547926) | [code]\n    > 现有的自我监督多视图立体（MVS）方法在很大程度上依赖于几何推断的光度一致性，因此受到低纹理或非朗伯外观的影响。在本文中，我们观察到相邻几何具有某些共性，可以帮助推断具有挑战性或低置信度区域的正确几何。然而，由于缺乏训练数据和确保视图之间一致性的必要性，在非监督 MVS 方法中利用此类属性仍然具有挑战性。为了解决这些问题，我们提出了一种新颖的几何推理训练方案，通过选择性地掩盖具有丰富纹理的区域，其中几何可以很好地恢复并用于监督信号，然后引导一个精心设计的成本体积完成网络来学习如何恢复几何被屏蔽的区域。在推理过程中，我们然后屏蔽低置信区域，并使用成本体积完成网络进行几何校正。为了处理成本体积金字塔的不同深度假设，我们为完成网络设计了一个三分支体积推理结构。此外，通过将平面视为一种特殊的几何形状，我们首先从伪标签中识别平面区域，然后通过平面法线一致性通过高置信度标签校正低置信度像素。在 DTU 和 Tanks & Temples 上进行的大量实验证明了所提出框架的有效性和最先进的性能。\n  - [面向 DIBR 的视图合成的几何翘曲误差感知 CNN, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3547946) | [code]\n    > 基于深度图像渲染（DIBR）的面向视图合成是一种重要的虚拟视图生成技术。它根据深度图将参考视图图像扭曲到目标视点，而不需要许多可用的视点。然而，在 3D 翘曲过程中，像素被翘曲到分数像素位置，然后四舍五入（或插值）到整数像素，导致几何翘曲错误并降低图像质量。这在某种程度上类似于图像超分辨率问题，但具有不固定的小数像素位置。为了解决这个问题，我们提出了一个几何翘曲误差感知 CNN (GWEA) 框架来增强面向 DIBR 的视图合成。首先，利用 DIBR 模块中保留的几何翘曲误差，开发了一种基于可变形卷积的几何翘曲误差感知对齐 (GWEA-DCA) 模块。在可变形卷积中学习的偏移量可以解释几何翘曲误差，以促进从小数像素到整数像素的映射。此外，鉴于翘曲图像中的像素由于翘曲误差的强度不同而具有不同的质量，进一步开发了注意力增强视图混合（GWEA-AttVB）模块，以自适应地融合来自不同翘曲图像的像素。最后，基于部分卷积的空洞填充和细化模块填充剩余的空洞并提高整体图像的质量。实验表明，我们的模型可以合成比现有方法更高质量的图像，并且还进行了消融研究，验证了每个提出的模块的有效性。\n  - [ReFu：细化和融合未观察到的视图以保留细节的单图像 3D 人体重建](https://dl.acm.org/doi/abs/10.1145/3503161.3547971) | [code]\n    > 单图像 3D 人体重建旨在在给定单个图像的情况下重建人体的 3D 纹理表面。虽然基于隐式函数的方法最近实现了合理的重建性能，但它们仍然存在局限性，从未观察的角度显示表面几何形状和纹理质量下降。作为回应，为了生成逼真的纹理表面，我们提出了 ReFu，这是一种从粗到细的方法，可以细化投影的背面视图图像并融合细化的图像以预测最终的人体。为了抑制在投影图像和重建网格中引起噪声的扩散占用，我们建议通过同时利用 2D 和 3D 监督和基于占用的体渲染来训练占用概率。我们还引入了一种细化架构，该架构可以生成具有前后扭曲的保留细节的背面视图图像。大量实验表明，我们的方法从单个图像中实现了 3D 人体重建的最先进性能，从未观察到的视图中显示出增强的几何和纹理质量。\n  - [NeRF2Real：使用神经辐射场的视觉引导双足运动技能的 Sim2real 
转移](https://arxiv.org/abs/2210.04932) | [code]\n    > 我们提出了一个系统，用于将 sim2real 方法应用于具有逼真视觉效果的“野外”场景，以及依赖于使用 RGB 相机的主动感知的策略。给定一个使用通用电话收集的静态场景的短视频，我们学习场景的接触几何和使用神经辐射场 (NeRF) 进行新视图合成的功能。我们通过叠加其他动态对象（例如机器人自己的身体、球）的渲染来增强静态场景的 NeRF 渲染。然后使用物理模拟器中的渲染引擎创建模拟，该模拟从静态场景几何（根据 NeRF 体积密度估计）和动态对象的几何和物理属性（假设已知）计算接触动力学。我们证明我们可以使用这个模拟来学习基于视觉的全身导航和推球策略，用于具有驱动头戴式 RGB 摄像头的 20 自由度类人机器人，并且我们成功地将这些策略转移到真实机器人。此 https 网址提供项目视频\n  - [从单幅图像中进行 3D 人脸绑定的不确定性感知半监督学习, ACMMM2022](https://dl.acm.org/doi/abs/10.1145/3503161.3548285) | [code]\n    > 我们提出了一种通过动作单元 (AU)、视点和光线方向从单个输入图像中装配 3D 面的方法。现有的人脸合成和动画 3D 方法严重依赖 3D 可变形模型（3DMM），该模型建立在 3D 数据之上，无法提供直观的表情参数，而 AU 驱动的 2D 方法无法处理头部姿势和光照效果。我们通过以半监督方式将最近的 3D 重建方法与 2D AU 驱动方法相结合来弥补差距。建立在自动编码 3D 人脸重建模型的基础上，该模型在没有任何监督的情况下将深度、反照率、视点和光线解耦，我们进一步将表达式与深度和反照率的身份解耦，并使用新的条件特征转换模块和预训练的批评家进行 AU 强度估计和图像分类.新颖的目标函数是使用未标记的野外图像和带有 AU 标签的室内图像设计的。我们还利用不确定性损失将可能变化的图像 AU 区域建模为合成的输入噪声，并对有噪声的 AU 强度标签进行建模以估计 AU 评论家的强度。在四个数据集上进行的人脸编辑和动画实验表明，与六种最先进的方法相比，我们提出的方法在表情一致性、身份相似性和姿势相似性方面具有优越性和有效性。\n  - [强化神经辐射场的多尺度表示, BMVC2022](https://arxiv.org/abs/2210.04233) | [code]\n    > 神经辐射场 (NeRF) 最近成为从多视图 (MV) 图像中表示对象的新范例。然而，它无法处理多尺度 (MS) 图像和相机姿态估计错误，这通常是从日常商品相机捕获的多视图图像的情况。虽然最近提出的 Mip-NeRF 可以处理 NeRF 的多尺度成像问题，但它不能处理相机姿态估计误差。另一方面，新提出的 BARF 可以解决 NeRF 的相机位姿问题，但如果图像本质上是多尺度的，则会失败。本文提出了一种强大的多尺度神经辐射场表示方法，以同时克服两个现实世界的成像问题。我们的方法通过利用场景刚性的基本原理，使用受 NeRF 启发的方法来处理多尺度成像效果和相机姿态估计问题。为了减少由于光线空间中的多尺度图像造成的令人不快的混叠伪影，我们利用了 Mip-NeRF 多尺度表示。对于鲁棒相机位姿的联合估计，我们在神经体绘制框架中提出了基于图神经网络的多重运动平均。我们通过示例证明，为了从日常获取的多视图图像中准确地表示对象，拥有精确的相机姿态估计是至关重要的。如果不考虑相机姿态估计中的鲁棒性度量，通过圆锥截头体对多尺度混叠伪影进行建模可能会适得其反。我们在基准数据集上进行了广泛的实验，以证明我们的方法比最近的 NeRF 启发的方法在这种现实设置中提供了更好的结果。\n  - [使用树结构从辐射场估计神经反射场](https://arxiv.org/abs/2210.04217) | [code]\n    > 我们提出了一种新方法，用于在未知光照下从一组姿势多视图图像中估计对象的神经反射场 (NReF)。 NReF 以分离的方式表示对象的 3D 几何和外观，并且很难仅从图像中估计。我们的方法通过利用神经辐射场（NeRF）作为代理表示来解决这个问题，我们从中进行进一步的分解。高质量的 NeRF 分解依赖于良好的几何信息提取以及良好的先验项来正确解决不同组件之间的歧义。为了从辐射场中提取高质量的几何信息，我们重新设计了一种新的基于射线投射的表面点提取方法。为了有效地计算和应用先验项，我们将不同的先验项转换为从辐射场提取的表面上的不同类型的滤波操作。然后，我们采用两种类型的辅助数据结构，即高斯 KD-tree 
和八叉树，以支持在训练期间快速查询表面点和高效计算表面过滤器。基于此，我们设计了一个多级分解优化流程，用于从神经辐射场估计神经反射场。大量实验表明，我们的方法在不同数据上优于其他最先进的方法，并且能够实现高质量的自由视图重新照明以及材料编辑任务。\n  - [通过学习一致性场实现高效的神经场景图, BMVC2022](https://arxiv.org/abs/2210.04127) | [***``[code]``***](https://github.com/ldynx/CF-NSG)\n    > 神经辐射场 (NeRF) 从新颖的视图实现照片般逼真的图像渲染，神经场景图 (NSG) \\cite{ost2021neural} 将其扩展到具有多个对象的动态场景（视频）。然而，为每个图像帧计算繁重的光线行进成为一个巨大的负担。在本文中，利用视频中相邻帧之间的显着冗余，我们提出了一个特征重用框架。然而，从天真地重用 NSG 特征的第一次尝试中，我们了解到，将跨帧一致的对象内在属性与瞬态属性分开是至关重要的。我们提出的方法，\\textit{基于一致性场的 NSG (CF-NSG)}，重新定义了神经辐射场以额外考虑 \\textit{一致性场}。通过解开表示，CF-NSG 充分利用了特征重用方案，并以更可控的方式执行扩展程度的场景操作。我们凭经验验证，CF-NSG 通过使用比 NSG 少 85% 的查询大大提高了推理效率，而渲染质量没有显着下降。代码将在以下位置提供：此 https 网址\n## Oct2 - Oct8, 2022\n  - [ViewFool：评估视觉识别对对抗性观点的鲁棒性, NeurIPS2022](https://arxiv.org/abs/2210.03895) | [code]\n    > 最近的研究表明，视觉识别模型对分布变化缺乏鲁棒性。然而，目前的工作主要考虑模型对 2D 图像转换的鲁棒性，而较少探索 3D 世界中的视点变化。一般来说，视点变化在各种实际应用（例如自动驾驶）中很普遍，因此评估视点鲁棒性势在必行。在本文中，我们提出了一种称为 ViewFool 的新方法来寻找误导视觉识别模型的对抗性视点。通过将现实世界中的物体编码为神经辐射场 (NeRF)，ViewFool 在熵正则化器下表征了不同对抗视点的分布，这有助于处理真实相机姿态的波动并减轻真实物体与其神经之间的现实差距申述。实验验证了常见的图像分类器极易受到生成的对抗性视点的影响，这也表现出很高的跨模型可迁移性。基于 ViewFool，我们引入了 ImageNet-V，这是一种新的分布外数据集，用于对图像分类器的视点鲁棒性进行基准测试。对具有不同架构、目标函数和数据增强的 40 个分类器的评估结果显示，在 ImageNet-V 上进行测试时模型性能显着下降，这为利用 ViewFool 作为一种有效的数据增强策略来提高视点鲁棒性提供了可能性。\n  - [用于手术记录的新视图合成](https://link.springer.com/chapter/10.1007/978-3-031-18576-2_7) | [code]\n    > 在手术室记录手术是医疗教育和评估的基本任务之一。然而，由于目标在手术过程中被医生或护士的头部或手严重遮挡，因此难以记录描绘手术的区域。我们使用了一个记录系统，该系统在手术灯中嵌入了多个摄像头，假设至少有一个摄像头正在无遮挡地记录目标。在本文中，我们提出 Conditional-BARF (C-BARF) 通过合成来自相机的新颖视图图像来生成无遮挡图像，旨在生成具有平滑相机姿态转换的视频。据我们所知，这是第一个解决从手术场景的多个图像合成新颖视图图像的问题的工作。我们使用三种不同类型手术的原始数据集进行实验。我们的实验表明，我们可以成功地从嵌入在手术灯中的多个摄像头记录的图像中合成新的视图。\n  - [一种基于关键点的音频驱动自由视角说话头合成增强方法](https://arxiv.org/abs/2210.03335) | [code]\n    > 音频驱动的说话头合成是一项具有挑战性的任务，近年来越来越受到关注。虽然现有的基于 2D 标志或 3D 
人脸模型的方法可以为任意身份合成准确的嘴唇同步和有节奏的头部姿势，但它们仍然存在局限性，例如嘴部映射中的切割感和缺乏皮肤高光。与周围的人脸相比，变形区域是模糊的。提出了一种基于关键点的增强（KPBE）方法用于音频驱动的自由视图说话头合成，以提高生成视频的自然度。首先，使用现有方法作为后端来合成中间结果。然后我们使用关键点分解从后端输出和源图像中提取视频合成控制参数。之后，将控制参数合成为源关键点和驱动关键点。使用基于运动场的方法从关键点表示生成最终图像。通过关键点表示，我们克服了嘴巴映射中的切割感和缺乏皮肤高光的问题。实验表明，我们提出的增强方法在平均意见得分方面提高了谈话头视频的质量。\n  - [用于将图像转换为任意比例的简单插件](https://arxiv.org/abs/2210.03417) | [code]\n    > 现有的超分辨率模型通常专门针对一个尺度，从根本上限制了它们在实际场景中的使用。在本文中，我们的目标是开发一个通用插件，可以插入到现有的超分辨率模型中，方便地增强它们对任意分辨率图像缩放的能力，因此被称为 ARIS。我们做出以下贡献：（i）我们提出了一个基于transformer的插件模块，它使用空间坐标作为查询，通过交叉注意迭代地关注低分辨率图像特征，并为查询的空间位置输出视觉特征，类似于图像的隐式表示； (ii) 我们引入了一种新颖的自我监督训练方案，该方案利用一致性约束来有效地增强模型将图像上采样到看不见的尺度的能力，即不提供真实的高分辨率图像； (iii) 在不失一般性的情况下，我们将提出的 ARIS 插件模块注入到多个现有模型中，即 IPT、SwinIR 和 HAT，表明生成的模型不仅可以在固定比例因子上保持其原始性能，而且可以外推到看不见的模型尺度，在标准基准上大大优于现有的任何尺度超分辨率模型，例如Urban100、DIV2K等\n  - [用于实时、开放集场景理解的特征真实神经融合](https://arxiv.org/abs/2210.03043) | [code]\n    > 机器人的一般场景理解需要灵活的语义表示，以便可以识别、分割和分组训练时可能不知道的新物体和结构。我们提出了一种算法，该算法在实时 SLAM 期间将来自标准预训练网络的一般学习特征融合到高效的 3D 几何神经场表示中。融合的 3D 特征图继承了神经域几何表示的连贯性。这意味着在运行时交互的少量人类标签使对象甚至对象的一部分能够以开放集的方式稳健而准确地分割。\n  - [XDGAN：2D 空间中的多模态 3D 形状生成](https://arxiv.org/abs/2210.03007) | [code]\n    > 由于二维卷积架构的效率，二维图像的生成模型最近在质量、分辨率和速度方面取得了巨大进步。然而，由于大多数当前的 3D 表示依赖于自定义网络组件，因此很难将此进展扩展到 3D 领域。本文解决了一个核心问题：是否可以直接利用 2D 图像生成模型来生成 3D 形状？为了回答这个问题，我们提出了 XDGAN，这是一种有效且快速的方法，用于将 2D 图像 GAN 架构应用于 3D 对象几何图形的生成，并结合附加的表面属性，如颜色纹理和法线。具体来说，我们提出了一种将 3D 形状转换为紧凑的 1 通道几何图像并利用 StyleGAN3 和图像到图像转换网络在 2D 空间中生成 3D 对象的新方法。生成的几何图像可以快速转换为 3D 网格，实现实时 3D 对象合成、可视化和交互式编辑。此外，使用标准 2D 架构有助于将更多 2D 进步带入 3D 领域。我们定量和定性地表明，我们的方法在各种任务中非常有效，例如 3D 形状生成、单视图重建和形状操作，同时与最近的 3D 生成模型相比明显更快、更灵活。\n  - [一种基于神经表面重建的鲁棒对象抓取的 Real2Sim2Real 方法](https://arxiv.org/abs/2210.02685) | [code]\n    > 最近基于 3D 的操作方法要么使用 3D 神经网络直接预测抓取姿势，要么使用从形状数据库中检索到的类似对象来解决抓取姿势。然而，前者在使用新的机器人手臂或看不见的物体进行测试时面临着普遍性挑战；后者假设数据库中存在类似的对象。我们假设最近的 3D 建模方法为构建评估场景的数字副本提供了途径，该评估场景提供物理模拟并支持稳健的操作算法学习。我们建议使用最先进的神经表面重建方法（Real2Sim 步骤）从现实世界的点云中重建高质量的网格。由于大多数模拟器采用网格进行快速模拟，因此重建的网格无需人工即可生成抓取姿势标签。生成的标签可以训练在真实评估场景中表现稳健的抓取网络（Sim2Real 步骤）。在合成和真实实验中，我们表明 
Real2Sim2Real 管道的性能优于使用大型数据集训练的基线抓取网络和基于检索的重建的抓取采样方法。 Real2Sim2Real 管道的好处来自 1) 将场景建模和抓取采样解耦为子问题，以及 2) 可以使用最新的 3D 学习算法和基于网格的物理模拟技术以足够高的质量解决这两个子问题。\n  - [用于实时、开放集场景理解的特征真实神经融合](https://arxiv.org/abs/2210.03043) | [code]\n    > 机器人的一般场景理解需要灵活的语义表示，以便可以识别、分割和分组训练时可能不知道的新物体和结构。我们提出了一种算法，该算法在实时 SLAM 期间将来自标准预训练网络的一般学习特征融合到高效的 3D 几何神经场表示中。融合的 3D 特征图继承了神经域几何表示的连贯性。这意味着在运行时交互的少量人类标签使对象甚至对象的一部分能够以开放集的方式稳健而准确地分割。\n  - [神经匹配字段：视觉对应匹配字段的隐式表示, NeurIPS2022](https://arxiv.org/abs/2210.02689) | [***``[code]``***](https://ku-cvlab.github.io/NeMF/)\n    > 现有的语义对应管道通常包括提取高级语义特征以保持对类内变化和背景杂波的不变性。然而，这种架构不可避免地会导致低分辨率匹配字段，该字段还需要临时插值过程作为将其转换为高分辨率的后处理，这肯定会限制匹配结果的整体性能。为了克服这个问题，受隐式神经表示最近成功的启发，我们提出了一种新的语义对应方法，称为神经匹配场 (NeMF)。然而，4D 匹配场的复杂性和高维性是主要障碍，我们提出了一种成本嵌入网络来处理粗略的成本量，以作为通过以下全连接网络建立高精度匹配场的指导。然而，学习高维匹配字段仍然具有挑战性，主要是由于计算复杂性，因为简单的穷举推理需要从 4D 空间中的所有像素中查询以推断像素级对应关系。为了克服这个问题，我们提出了充分的训练和推理程序，在训练阶段，我们随机抽取匹配的候选者，在推理阶段，我们在测试时迭代地执行基于 PatchMatch 的推理和坐标优化。通过这些结合，在语义对应的几个标准基准上获得了具有竞争力的结果。此 https URL 提供了代码和预训练的权重。\n  - [IR-MCL：基于隐式表示的在线全球本地化](https://arxiv.org/abs/2210.03113) | [***``[code]``***](https://github.com/PRBonn/ir-mcl)\n    > 确定移动机器人的状态是机器人导航系统的重要组成部分。在本文中，我们解决了使用 2D LiDAR 数据估计机器人在室内环境中的姿势的问题，并研究了现代环境模型如何改进黄金标准 Monte-Carlo 定位 (MCL) 系统。我们提出了一个神经占用场（NOF）来使用神经网络隐式表示场景。借助预训练网络，我们可以通过体绘制合成 2D LiDAR 扫描以获取任意机器人姿势。基于隐式表示，我们可以获得合成扫描与实际扫描之间的相似度作为观察模型，并将其集成到 MCL 系统中以执行准确的定位。我们在五个自记录数据集和三个公开可用数据集的序列上评估我们的方法。我们表明，我们可以使用我们的方法准确有效地定位机器人，超过最先进方法的定位性能。实验表明，所呈现的隐式表示能够预测更准确的 2D LiDAR 扫描，从而为我们的基于粒子滤波器的定位提供改进的观察模型。我们方法的代码发布在：this https URL。\n  - [SelfNeRF：来自单目自旋转视频的人类快速训练 NeRF](https://arxiv.org/abs/2210.01651) | [code]\n    > 在本文中，我们提出了 SelfNeRF，一种有效的基于神经辐射场的新型视图合成方法，用于人类表现。给定人类表演者的单目自旋转视频，SelfNeRF 可以从头开始训练并在大约 20 分钟内获得高保真结果。最近的一些工作利用神经辐射场进行动态人体重建。然而，这些方法中的大多数都需要多视图输入并且需要数小时的训练，因此仍然难以实际使用。为了解决这个具有挑战性的问题，我们引入了一种基于多分辨率哈希编码的表面相对表示，可以大大提高训练速度并聚合帧间信息。在几个不同数据集上的广泛实验结果证明了 SelfNeRF 对具有挑战性的单目视频的有效性和效率。\n  - [从单目视频中捕捉和动画身体和服装](https://arxiv.org/abs/2210.01868) | [code]\n    > 虽然最近的工作已经显示出从单个图像、视频或一组 3D 扫描中提取穿衣服的 3D 
人体化身的进展，但仍然存在一些限制。大多数方法使用整体表示来对身体和服装进行联合建模，这意味着对于虚拟试穿等应用，服装和身体不能分开。其他方法分别对身体和衣服进行建模，但它们需要从从 3D/4D 扫描仪或物理模拟获得的大量 3D 衣服人体网格中进行训练。我们的洞察是身体和服装有不同的造型要求。虽然基于网格的参数 3D 模型可以很好地表示身体，但隐式表示和神经辐射场更适合捕捉服装中存在的各种形状和外观。基于这一见解，我们提出了 SCARF（分段穿衣化身辐射场），这是一种将基于网格的身体与神经辐射场相结合的混合模型。将网格与可微分光栅器相结合将网格集成到体积渲染中，使我们能够直接从单目视频优化 SCARF，而无需任何 3D 监督。混合建模使 SCARF 能够（i）通过改变身体姿势（包括手部关节和面部表情）为穿着衣服的身体化身制作动画，（ii）合成化身的新视图，以及（iii）在虚拟试穿中在化身之间转移衣服应用程序。我们证明了 SCARF 重建的服装比现有方法具有更高的视觉质量，服装随着身体姿势和体形的变化而变形，并且服装可以在不同主体的化身之间成功转移。代码和模型可在此 https 网址获得。\n  - [在杂乱的环境中学习感知感知敏捷飞行](https://arxiv.org/abs/2210.01841) | [code]\n    > 最近，神经控制策略的性能优于现有的基于模型的规划和控制方法，可在最短的时间内通过杂乱的环境自主导航四旋翼飞行器。然而，它们没有感知意识，这是基于视觉的导航的关键要求，因为相机的视野有限和四旋翼的驱动不足。我们提出了一种学习神经网络策略的方法，该策略可在杂乱的环境中实现感知感知、最短时间飞行。我们的方法通过利用特权学习作弊框架结合了模仿学习和强化学习 (RL)。使用 RL，我们首先训练具有全状态信息的感知感知教师策略，以便在最短时间内通过杂乱的环境。然后，我们使用模仿学习将其知识提炼成基于视觉的学生策略，该策略仅通过相机感知环境。我们的方法将感知和控制紧密结合，在计算速度（快 10 倍）和成功率方面显示出显着优势。我们使用物理四旋翼和硬件在环仿真以高达 50 公里/小时的速度展示了闭环控制性能。\n  - [用于自监督入住预测的可区分光线投射, ECCV2022](https://arxiv.org/abs/2210.01917) | [***``[code]``***](https://github.com/tarashakhurana/emergent-occ-forecasting)\n    > 安全自动驾驶的运动规划需要了解自我车辆周围的环境如何随时间演变。场景中可驱动区域的以自我为中心的感知不仅随着环境中演员的运动而变化，而且随着自我车辆本身的运动而变化。为大规模规划（例如以自我为中心的自由空间）提出的自我监督表示混淆了这两种运动，使得该表示难以用于下游运动规划器。在本文中，我们使用几何占用作为自由空间等依赖于视图的表示的自然替代方案。占用图自然地将环境的运动与自我车辆的运动分开。然而，人们无法直接观察场景的完整 3D 占用情况（由于遮挡），因此难以用作学习信号。我们的主要见解是使用可微分光线投射将未来占用预测“渲染”到未来的 LiDAR 扫描预测中，这可以与自监督学习的地面实况扫描进行比较。可微光线投射的使用允许占用率作为预测网络中的内部表示出现。在没有地面实况占用的情况下，我们定量评估了光线投射 LiDAR 扫描的预测，并显示了多达 15 个 F1 点的改进。对于下游运动规划器，紧急占用可以直接用于引导不可驱动区域，与以自由空间为中心的运动规划器相比，这种表示相对减少了高达 17% 的物体碰撞次数。\n  - [用于新视图合成的自我改进多平面到层图像, WACV2023](https://samsunglabs.github.io/MLI/) | [***``[code]``***](https://github.com/SamsungLabs/MLI)\n    > 我们提出了一种用于轻量级小说视图合成的新方法，该方法可以推广到任意前向场景。最近的方法在计算上很昂贵，需要逐场景优化，或者产生内存昂贵的表示。我们首先用一组正面平行的半透明平面来表示场景，然后以端到端的方式将它们转换为可变形层。此外，我们采用前馈细化程序，通过聚合来自输入视图的信息来纠正估计的表示。我们的方法在处理新场景时不需要微调，并且可以不受限制地处理任意数量的视图。实验结果表明，我们的方法在常用指标和人工评估方面超过了最近的模型，在推理速度和推断分层几何的紧凑性方面具有显着优势，请参阅此 https URL\n  - 
[用于隐式场景重建的不确定性驱动的主动视觉](https://arxiv.org/abs/2210.00978) | [code]\n    > 多视图隐式场景重建方法由于能够表示复杂的场景细节而变得越来越流行。最近的努力致力于改进输入信息的表示并减少获得高质量重建所需的视图数量。然而，也许令人惊讶的是，关于选择哪些视图以最大限度地提高场景理解的研究在很大程度上仍未得到探索。我们提出了一种用于隐式场景重建的不确定性驱动的主动视觉方法，该方法利用体积渲染在场景中累积的占用不确定性来选择下一个要获取的视图。为此，我们开发了一种基于占用的重建方法，该方法使用 2D 或 3D 监督准确地表示场景。我们在 ABC 数据集和野外 CO3D 数据集上评估了我们提出的方法，并表明：（1）我们能够获得高质量的最先进的占用重建； (2) 我们的视角条件不确定性定义有效地推动了下一个最佳视图选择的改进，并且优于强大的基线方法； (3) 我们可以通过对视图选择候选执行基于梯度的搜索来进一步提高形状理解。总体而言，我们的结果突出了视图选择对于隐式场景重建的重要性，使其成为进一步探索的有希望的途径。\n  - [NARF22：用于配置感知渲染的神经铰接辐射场, IROS2022](https://progress.eecs.umich.edu/projects/narf/) | [code]\n    > 铰接物体对机器人的感知和操作提出了独特的挑战。它们增加的自由度数量使得定位等任务在计算上变得困难，同时也使得现实世界数据集收集的过程无法扩展。为了解决这些可扩展性问题，我们提出了神经铰接辐射场 (NARF22)，这是一个使用完全可微分、配置参数化神经辐射场 (NeRF) 作为提供铰接对象高质量渲染的方法的管道。 NARF22 在推理时不需要明确了解对象结构。我们提出了一种两阶段的基于部件的训练机制，即使底层训练数据只有一个配置表示，它也允许对象渲染模型在配置空间中很好地泛化。我们通过在通过 Fetch 移动操作机器人收集的真实关节工具数据集上训练可配置渲染器来展示 NARF22 的功效。我们通过配置估计和 6 自由度姿态细化任务展示了该模型对基于梯度的推理方法的适用性。项目网页位于：此 https URL。\n  - [密集单目 SLAM 的概率体积融合](https://arxiv.org/abs/2210.01276) | [code]\n    > 我们提出了一种利用深度密集单目 SLAM 和快速不确定性传播从图像中重建 3D 场景的新方法。所提出的方法能够密集、准确、实时地对场景进行 3D 重建，同时对来自密集单目 SLAM 的极其嘈杂的深度估计具有鲁棒性。与以前的方法不同，要么使用 ad-hoc 深度滤波器，要么从 RGB-D 相机的传感器模型估计深度不确定性，我们的概率深度不确定性直接来自 SLAM 中底层束调整问题的信息矩阵。我们表明，由此产生的深度不确定性提供了一个很好的信号来加权深度图以进行体积融合。如果没有我们的深度不确定性，生成的网格会很嘈杂并带有伪影，而我们的方法会生成准确的 3D 网格，并且伪影要少得多。我们提供了具有挑战性的 Euroc 数据集的结果，并表明我们的方法比直接融合来自单目 SLAM 的深度的准确度提高了 92%，与最佳竞争方法相比提高了 90%。\n  - [SinGRAV：从单个自然场景中学习生成辐射量](https://arxiv.org/abs/2210.01202) | [code]\n    > 我们提出了一个用于一般自然场景的 3D 生成模型。由于缺乏表征目标场景的必要 3D 数据量，我们建议从单个场景中学习。我们的关键见解是，一个自然场景通常包含多个组成部分，其几何、纹理和空间排列遵循一些清晰的模式，但在同一场景中的不同区域仍然表现出丰富的变化。这表明将生成模型的学习本地化在大量局部区域上。因此，我们利用具有空间局部性偏差的多尺度卷积网络来学习单个场景中多个尺度的局部区域的统计信息。与现有方法相比，我们的学习设置绕过了从许多同质 3D 场景中收集数据以学习共同特征的需要。我们创造了我们的方法 SinGRAV，用于从单个自然场景中学习生成辐射体积。我们展示了 SinGRAV 从单个场景生成合理多样的变化的能力，SinGRAV 相对于最先进的生成神经场景方法的优点，以及 SinGRAV 在各种应用中的多功能性，涵盖 3D 场景编辑、合成和动画。代码和数据将被发布以促进进一步的研究。\n  - [IntrinsicNeRF：学习用于可编辑新视图合成的内在神经辐射场](https://arxiv.org/abs/2210.00647) | 
[***``[code]``***](https://github.com/zju3dv/IntrinsicNeRF)\n    > 我们提出了被称为 IntrinsicNeRF 的内在神经辐射场，它将内在分解引入到基于 NeRF 的~\\cite{mildenhall2020nerf} 神经渲染方法中，并且可以在现有的逆向渲染结合神经渲染方法的同时在房间规模的场景中执行可编辑的新视图合成~ \\cite{zhang2021physg, zhang2022modeling} 只能用于特定对象的场景。鉴于内在分解本质上是一个模棱两可且约束不足的逆问题，我们提出了一种新颖的距离感知点采样和自适应反射率迭代聚类优化方法，该方法使具有传统内在分解约束的 IntrinsicNeRF 能够以无监督的方式进行训练，从而在时间上一致的内在分解结果。为了解决场景中相似反射率的不同相邻实例被错误地聚集在一起的问题，我们进一步提出了一种从粗到细优化的层次聚类方法，以获得快速的层次索引表示。它支持引人注目的实时增强现实应用，例如场景重新着色、材质编辑和照明变化。 Blender 对象和副本场景的大量实验表明，即使对于具有挑战性的序列，我们也可以获得高质量、一致的内在分解结果和高保真新视图合成。项目网页上提供了代码和数据：此 https 网址。\n  - [使用辐射场传播的无监督多视图对象分割, NeurIPS2022](https://arxiv.org/abs/2210.00489) | [code]\n    > 我们提出了辐射场传播 (RFP)，这是一种在重建过程中分割 3D 对象的新方法，仅给出场景的未标记多视图图像。 RFP 源自新兴的基于神经辐射场的技术，该技术将语义与外观和几何形状联合编码。我们方法的核心是一种新颖的传播策略，用于具有双向光度损失的单个对象的辐射场，能够将场景无监督地划分为对应于不同对象实例的显着或有意义的区域。为了更好地处理具有多个对象和遮挡的复杂场景，我们进一步提出了一种迭代期望最大化算法来细化对象掩码。据我们所知，RFP 是第一个在没有任何监督、注释或其他线索（如 3D 边界框和对象类别的先验知识）的情况下处理神经辐射场 (NeRF) 的 3D 场景对象分割的无监督方法。实验表明，RFP 实现了可行的分割结果，比以前的无监督图像/场景分割方法更准确，并且可与现有的基于 NeRF 监督的方法相媲美。分段对象表示支持单独的 3D 对象编辑操作。\n  - [MonoNHR：单眼神经人类渲染器](https://arxiv.org/abs/2210.00627) | [code]\n    > 由于不可见区域中缺乏信息以及可见区域中像素的深度模糊性，现有的神经人类渲染方法难以处理单个图像输入。在这方面，我们提出了单目神经人类渲染器 (MonoNHR)，这是一种新颖的方法，可以仅在给定单个图像的情况下渲染任意人的鲁棒自由视点图像。 MonoNHR 是第一个（i）在单目设置中呈现在训练期间从未见过的人类受试者，以及（ii）在没有几何监督的情况下以弱监督方式训练的方法。首先，我们建议解开 3D 几何和纹理特征，并根据 3D 几何特征调整纹理推断。其次，我们引入了一个 Mesh Inpainter 模块，该模块利用人类结构先验（例如对称性）来修复被遮挡的部分。在 ZJU-MoCap、AIST 和 HUMBI 数据集上的实验表明，我们的方法明显优于最近适应单目情况的方法。\n  - [NeRF：3D 视觉中的神经辐射场，综合评论](https://arxiv.org/abs/2210.00379) | [code]\n    > 神经辐射场 (NeRF) 是一种具有隐式场景表示的新型视图合成，已经席卷了计算机视觉领域。作为一种新颖的视图合成和 3D 重建方法，NeRF 模型在机器人技术、城市测绘、自主导航、虚拟现实/增强现实等领域都有应用。自 Mildenhall 等人的原始论文以来，已发表了 250 多份预印本，其中 100 多份最终被一级计算机​​视觉会议接受。鉴于 NeRF 的受欢迎程度和当前对该研究领域的兴趣，我们认为有必要对过去两年的 NeRF 论文进行全面调查，我们将其组织成基于架构和基于应用程序的分类法。我们还介绍了基于 NeRF 的新颖视图合成理论，以及关键 NeRF 模型的性能和速度的基准比较。通过创建这项调查，我们希望向 NeRF 介绍新的研究人员，为该领域有影响力的工作提供有益的参考，并通过我们的讨论部分激发未来的研究方向。\n## Sep25 - Oct1, 2022\n  - [通过对极约束不带姿势相机的结构感知 NeRF](https://arxiv.org/abs/2210.00183) | 
[***``[code]``***](https://github.com/XTU-PR-LAB/SaNerf)\n    > 用于逼真的新视图合成的神经辐射场 (NeRF) 需要通过运动结构 (SfM) 方法预先获取相机姿势。这种两阶段策略使用不方便并且会降低性能，因为姿势提取中的错误会传播到视图合成。我们将姿势提取和视图合成集成到一个端到端的过程中，这样它们就可以相互受益。为了训练 NeRF 模型，只给出了 RGB 图像，没有预先知道的相机姿势。相机位姿是通过极线约束获得的，其中不同视图中的相同特征具有根据提取的位姿从本地相机坐标转换而来的相同世界坐标。对极约束与像素颜色约束联合优化。姿势由基于 CNN 的深度网络表示，其输入是相关帧。这种联合优化使 NeRF 能够感知场景的结构，从而提高泛化性能。在各种场景上进行的大量实验证明了所提出方法的有效性。此 https 网址提供了代码。\n  - [SCI：用于生物医学数据的频谱集中隐式神经压缩](https://arxiv.org/abs/2209.15180) | [code]\n    > 海量医疗数据的海量采集和爆炸式增长，需要有效压缩以实现高效存储、传输和共享。现成的视觉数据压缩技术已被广泛研究，但针对自然图像/视频量身定制，因此在具有不同特征的医学数据上表现出有限的性能。新兴的隐式神经表示 (INR) 正在获得动力，并展示了以特定于目标数据的方式拟合各种视觉数据的高前景，但迄今为止还没有涵盖各种医疗数据的通用压缩方案。为了解决这个问题，我们首先对 INR 的频谱集中特性进行了数学解释，并对面向压缩的 INR 架构的设计进行了分析洞察。此外，我们设计了一个漏斗形神经网络，能够覆盖广泛的复杂医疗数据并实现高压缩比。在此设计的基础上，我们在给定预算下通过优化进行压缩，并提出了一种自适应压缩方法SCI，该方法将目标数据自适应地划分为与所采用的INR的集中频谱包络匹配的块，并在给定压缩比下分配具有高表示精度的参数.实验表明 SCI 优于传统技术的性能以及在各种医学数据中的广泛适用性。\n  - [使用几何感知鉴别器改进 3D 感知图像合成, NeurIPS2022](https://arxiv.org/abs/2209.15637) | [***``[code]``***](https://github.com/vivianszf/geod)\n    > 3D 感知图像合成旨在学习一个生成模型，该模型可以渲染逼真的 2D 图像，同时捕捉体面的底层 3D 形状。一种流行的解决方案是采用生成对抗网络 (GAN)，并用 3D 渲染器替换生成器，其中通常使用带有神经辐射场 (NeRF) 的体积渲染。尽管合成质量有所提高，但现有方法无法获得适度的 3D 形状。我们认为，考虑到 GAN 公式中的两人游戏，仅使生成器具有 3D 感知能力是不够的。换句话说，取代生成机制只能提供生成 3D 感知图像的能力，但不能保证，因为生成器的监督主要来自鉴别器。为了解决这个问题，我们提出 GeoD 通过学习几何感知鉴别器来改进 3D 感知 GAN。具体来说，除了从 2D 图像空间中区分真假样本外，还要求鉴别器从输入中获取几何信息，然后将其用作生成器的指导。这种简单而有效的设计有助于学习更准确的 3D 形状。对各种生成器架构和训练数据集的广泛实验验证了 GeoD 优于最先进的替代方案。此外，我们的方法被注册为一个通用框架，这样一个更有能力的鉴别器（即，除了域分类和几何提取之外，还有第三个新的视图合成任务）可以进一步帮助生成器获得更好的多视图一致性。\n  - [了解体素网格 NeRF 模型的纯 CLIP 指导](https://arxiv.org/abs/2209.15172) | [code]\n    > 我们使用 CLIP 探索文本到 3D 对象生成的任务。具体来说，我们在不访问任何数据集的情况下使用 CLIP 进行指导，我们将这种设置称为纯 CLIP 指导。虽然之前的工作采用了这种设置，但没有系统研究防止 CLIP 中产生对抗性生成的机制。我们说明了不同的基于图像的增强如何防止对抗性生成问题，以及生成的结果如何受到影响。我们测试了不同的 CLIP 模型架构，并表明集成不同的模型进行指导可以防止更大模型中的对抗性生成并产生更清晰的结果。此外，我们实现了一个隐式体素网格模型，以展示神经网络如何提供额外的正则化层，从而产生更好的几何结构和生成对象的连贯性。与之前的工作相比，我们以更高的记忆效率和更快的训练速度获得了更连贯的结果。\n  - [从图像对中提取样式以进行全局正向和反向色调映射, CVMP2022](https://arxiv.org/abs/2209.15165) | [code]\n   
 > 许多图像增强或编辑操作，例如正向和反向色调映射或颜色分级，没有唯一的解决方案，而是有一系列解决方案，每个解决方案代表不同的风格。尽管如此，现有的基于学习的方法试图学习一个独特的映射，而忽略了这种风格。在这项工作中，我们展示了有关风格的信息可以从图像对的集合中提取并编码为 2 维或 3 维向量。这不仅为我们提供了有效的表示，而且为编辑图像样式提供了可解释的潜在空间。我们将一对图像之间的全局颜色映射表示为自定义归一化流，以像素颜色的多项式为条件。我们表明，这样的网络在低维空间中编码图像风格方面比 PCA 或 VAE 更有效，并且让我们获得接近 40 dB 的准确度，这比现有技术提高了大约 7-10 dB方法。\n  - [神经隐式曲面的球面引导训练](https://arxiv.org/abs/2209.15511) | [code]\n    > 近年来，通过神经隐函数进行表面建模已成为多视图 3D 重建的主要技术之一。然而，最先进的方法依赖于隐式函数来模拟整个场景体积，导致在具有薄物体或高频细节的区域中降低重建保真度。为了解决这个问题，我们提出了一种与辅助显式形状表示一起联合训练神经隐式表面的方法，该辅助显式形状表示充当表面引导。在我们的方法中，这种表示封装了场景的表面区域，使我们能够通过仅对该区域的体积进行建模来提高隐式函数训练的效率。我们建议使用一组可学习的球形基元作为可学习的表面指导，因为它们可以使用其梯度与神经表面函数一起有效地训练。我们的训练管道包括使用隐函数的梯度对球体中心的迭代更新，然后将后者微调到场景的更新表面区域。我们表明，对训练过程的这种修改可以插入到几种流行的隐式重建方法中，从而提高多个 3D 重建基准的结果质量。\n  - [迈向多时空尺度广义 PDE 建模](https://arxiv.org/abs/2209.15616) | [code]\n    > 偏微分方程 (PDE) 是描述复杂物理系统模拟的核心。他们昂贵的解决方案技术引起了人们对基于深度神经网络的代理的兴趣增加。然而，训练这些代理人的实际效用取决于他们模拟复杂的多尺度时空现象的能力。已经提出了各种神经网络架构来针对此类现象，最着名的是傅里叶神经算子（FNO），它通过不同傅里叶模式的参数化对局部\\和全局空间信息进行自然处理，以及通过以下方式处理局部和全局信息的 U-Nets下采样和上采样路径。然而，跨不同方程参数或不同时间尺度的泛化仍然是一个挑战。在这项工作中，我们对涡流和速度函数形式的流体力学问题的各种 FNO 和 U-Net 方法进行了全面比较。对于 U-Net，我们从计算机视觉中转移了最近的架构改进，最显着的是来自对象分割和生成建模。我们进一步分析了使用 FNO 层来提高 U-Net 架构的性能而不显着降低计算性能的设计考虑因素。最后，我们展示了使用单个代理模型泛化到不同 PDE 参数和时间尺度的有希望的结果。\n  - [MonoNeuralFusion：具有几何先验的在线单目神经 3D 重建](https://arxiv.org/abs/2209.15153) | [code]\n    > 从单目视频重建高保真 3D 场景仍然具有挑战性，特别是对于完整和细粒度的几何重建。先前具有神经隐式表示的 3D 重建方法已显示出完整场景重建的有希望的能力，但它们的结果通常过于平滑且缺乏足够的几何细节。本文介绍了一种新颖的神经隐式场景表示法，用于从单目视频中进行高保真在线 3D 场景重建的体积渲染。对于细粒度重建，我们的关键见解是将几何先验纳入神经隐式场景表示和神经体绘制，从而产生基于体绘制优化的有效几何学习机制。受益于此，我们提出了 MonoNeuralFusion 来从单目视频执行在线神经 3D 重建，从而在动态 3D 单目扫描期间有效地生成和优化 3D 场景几何图形。与最先进方法的广泛比较表明，我们的 MonoNeuralFusion 在数量和质量上始终生成更好的完整和细粒度的重建结果。\n  - [时间相关 PDE 的隐式神经空间表示](https://arxiv.org/abs/2210.00124) | [code]\n    > 数值求解偏微分方程 (PDE) 通常需要空间和时间离散化。传统方法（例如，有限差分、有限元、平滑粒子流体动力学）经常采用显式空间离散化，例如网格、网格和点云，其中每个自由度对应于空间中的一个位置。虽然这些明确的空间对应对于建模和理解来说是直观的，但这些表示对于准确性、内存使用或适应性而言不一定是最佳的。在这项工作中，我们探索隐式神经表示作为替代空间离散化，其中空间信息隐式存储在神经网络权重中。通过隐式神经空间表示，受 PDE 
约束的时间步长转化为更新神经网络权重，它自然地与常用的优化时间积分器集成。我们通过涉及大弹性变形、湍流流体和多尺度现象的示例验证了我们在各种经典 PDE 上的方法。虽然计算速度比传统表示慢，但我们的方法表现出更高的准确性、更低的内存消耗和动态自适应分配的自由度，而无需复杂的重新划分网格。\n  - [具有隐式神经表示的连续 PDE 动态预测](https://arxiv.org/abs/2209.14855) | [code]\n    > 有效的数据驱动 PDE 预测方法通常依赖于固定的空间和/或时间离散化。这增加了现实世界应用的限制，例如需要在任意时空位置进行灵活外推的天气预报。我们通过引入一种新的数据驱动方法 DINo 来解决这个问题，该方法使用空间连续函数的连续时间动态对 PDE 的流进行建模。这是通过在由学习的 ODE 时间驱动的小潜在空间中通过隐式神经表示独立于其离散化嵌入空间观察来实现的。这种对时间和空间的分离和灵活处理使 DINo 成为第一个结合以下优点的数据驱动模型。它在任意空间和时间位置外推；它可以从稀疏的不规则网格或流形中学习；在测试时，它会推广到新的网格或分辨率。在代表性 PDE 系统的各种具有挑战性的泛化场景中，DINo 的表现优于替代神经 PDE 预测器。\n  - [SymmNeRF：学习探索单视图视图合成的对称先验, ACCV2022](https://arxiv.org/abs/2209.14819) | [***``[code]``***](https://github.com/xingyi-li/SymmNeRF)\n    > 我们研究了从单个图像中对对象进行新视图合成的问题。现有方法已经证明了单视图视图合成的潜力。但是，它们仍然无法恢复精细的外观细节，尤其是在自闭区域。这是因为单个视图仅提供有限的信息。我们观察到人造物体通常表现出对称的外观，这会引入额外的先验知识。受此启发，我们研究了将对称性显式嵌入场景表示的潜在性能增益。在本文中，我们提出了 SymmNeRF，这是一种基于神经辐射场 (NeRF) 的框架，在引入对称先验的情况下结合了局部和全局条件。特别是，SymmNeRF 将像素对齐的图像特征和相应的对称特征作为 NeRF 的额外输入，其参数由超网络生成。由于参数以图像编码的潜在代码为条件，因此 SymmNeRF 与场景无关，可以推广到新场景。对合成数据集和真实世界数据集的实验表明，SymmNeRF 可以合成具有更多细节的新颖视图，而不管姿势变换如何，并且在应用于看不见的对象时表现出良好的泛化性。代码位于：此 https URL。\n  - [面向多边形几何的通用表示学习, GeoInformatica](https://arxiv.org/abs/2209.15458) | [code]\n    > 空间数据的神经网络表示学习是地理人工智能 (GeoAI) 问题的普遍需求。近年来，在点、折线和网络的表示学习方面取得了许多进展，而在多边形，尤其是复杂的多边形几何形状方面进展甚微。在这项工作中，我们专注于开发一种通用的多边形编码模型，该模型可以将多边形几何体（有或没有孔，单面或多面体）编码到嵌入空间中。结果嵌入可以直接用于（或微调）下游任务，例如形状分类、空间关系预测等。为了实现模型的泛化性保证，我们确定了一些理想的属性：循环原点不变性、平凡顶点不变性、部分置换不变性和拓扑感知。我们探索了两种不同的编码器设计：一种是在空间域中派生所有表示；另一个利用谱域表示。对于空间域方法，我们提出了 ResNet1D，这是一种基于 CNN 的 1D 多边形编码器，它使用圆形填充来实现简单多边形上的循环原点不变性。对于谱域方法，我们开发了基于非均匀傅里叶变换 (NUFT) 的 NUFTspec，它自然地满足了所有所需的属性。我们对两个任务进行了实验：1）基于MNIST的形状分类； 2）基于两个新数据集——DBSR-46K和DBSR-cplx46K的空间关系预测。我们的结果表明，NUFTspec 和 ResNet1D 的性能优于多个现有的基线，具有显着的优势。虽然 ResNet1D 在形状不变几何修改后模型性能下降，但由于 NUFT 的性质，NUFTspec 对这些修改非常稳健。\n  - [具有三层采样和全景表示的城市级增量神经映射](https://arxiv.org/abs/2209.14072) | [code]\n    > 神经隐式表示最近引起了机器人界的广泛关注，因为它们具有表现力、连续性和紧凑性。然而，基于稀疏 LiDAR 
输入的城市规模增量隐式密集映射仍然是一个未充分探索的挑战。为此，我们成功构建了第一个具有全景表示的城市规模增量神经映射系统，该系统由环境级和实例级建模组成。给定一个稀疏的 LiDAR 点云流，它维护一个动态生成模型，将 3D 坐标映射到有符号距离场 (SDF) 值。为了解决在城市尺度空间中表示不同层次几何信息的困难，我们提出了一种定制的三层采样策略来动态采样全局、局部和近地表域。同时，为了实现高保真映射，引入了特定类别的先验以更好地对几何细节进行建模，从而实现全景表示。我们评估了公共 SemanticKITTI 数据集，并使用定量和定性结果证明了新提出的三层采样策略和全景表示的重要性。代码和数据将公开。\n  - [360FusionNeRF：具有联合引导的全景神经辐射场](https://arxiv.org/abs/2209.14265) | [code]\n    > 我们提出了一种基于神经辐射场 (NeRF) 从单个 360 度全景图像合成新视图的方法。类似设置中的先前研究依赖于多层感知的邻域插值能力来完成由遮挡引起的缺失区域，这导致其预测中的伪影。我们提出了 360FusionNeRF，这是一个半监督学习框架，我们在其中引入几何监督和语义一致性来指导渐进式训练过程。首先，将输入图像重新投影到 360 度图像，并在其他相机位置提取辅助深度图。除了 NeRF 颜色指导之外，深度监督还改进了合成视图的几何形状。此外，我们引入了语义一致性损失，鼓励对新视图进行逼真的渲染。我们使用预训练的视觉编码器（例如 CLIP）提取这些语义特征，CLIP 是一种视觉转换器，通过自然语言监督从网络挖掘出的数亿张不同的 2D 照片进行训练。实验表明，我们提出的方法可以在保留场景特征的同时产生未观察到的区域的合理完成。在跨各种场景进行训练时，360FusionNeRF 在转移到合成 Structured3D 数据集（PSNR~5%，SSIM~3% LPIPS~13%）、真实世界的 Matterport3D 数据集（PSNR~3%）时始终保持最先进的性能, SSIM~3% LPIPS~9%) 和 Replica360 数据集 (PSNR~8%, SSIM~2% LPIPS~18%)。\n  - [Orbeez-SLAM：具有 ORB 特征和 NeRF 实现映射的实时单目视觉 SLAM](https://arxiv.org/abs/2209.13274) | [code]\n    > 一种可以通过视觉信号执行复杂任务并与人类合作的空间人工智能备受期待。为了实现这一点，我们需要一个无需预训练即可轻松适应新场景并实时为下游任务生成密集地图的视觉 SLAM。由于其组件的内在限制，以前的基于学习和非基于学习的视觉 SLAM 都不能满足所有需求。在这项工作中，我们开发了一个名为 Orbeez-SLAM 的视觉 SLAM，它成功地与隐式神经表示 (NeRF) 和视觉里程计合作来实现我们的目标。此外，Orbeez-SLAM 可以与单目相机配合使用，因为它只需要 RGB 输入，使其广泛适用于现实世界。我们在各种具有挑战性的基准上验证了它的有效性。结果表明，我们的 SLAM 比强基线快 800 倍，并具有出色的渲染结果。\n  - [通过控制屏障功能和神经辐射场增强基于视觉的控制器的安全性](https://arxiv.org/abs/2209.12266) | [code]\n    > 为了在复杂的环境中导航，机器人必须越来越多地使用高维视觉反馈（例如图像）进行控制。然而，依靠高维图像数据做出控制决策会引发重要问题；特别是，我们如何证明视觉反馈控制器的安全性？控制障碍函数 (CBF) 是在状态反馈设置中验证反馈控制器安全性的强大工具，但由于需要预测未来的观察结果以评估障碍函数，CBF 传统上不太适合视觉反馈控制.在这项工作中，我们利用神经辐射场 (NeRFs) 的最新进展来解决这个问题，神经辐射场 (NeRFs) 学习 3D 场景的隐式表示并可以从以前看不见的相机视角渲染图像，为基于 CBF 的单步视觉预测提供控制器。这种新颖的组合能够过滤掉不安全的行为并进行干预以保护安全。我们在实时模拟实验中展示了我们的控制器的效果，它成功地防止了机器人采取危险行动。\n  - [自主隐式重建的高效视图路径规划](https://arxiv.org/abs/2209.13159) | [code]\n    > 隐式神经表示已显示出用于 3D 场景重建的巨大潜力。最近的工作通过学习用于视图路径规划的信息增益，将其应用于自主 3D 重建。虽然有效，但信息增益的计算成本很高，并且与使用体积表示的计算相比，使用 3D 点的隐式表示的碰撞检查要慢得多。在本文中，我们建议 
1）利用神经网络作为信息增益场的隐式函数逼近器，以及 2）将隐式细粒度表示与粗略的体积表示相结合以提高效率。随着效率的进一步提高，我们提出了一种基于基于图的规划器的新颖的信息路径规划。与具有隐式和显式表示的自主重建相比，我们的方法证明了重建质量和规划效率的显着提高。我们将该方法部署在真实的无人机上，结果表明我们的方法可以规划信息丰富的视图并重建高质量的场景。\n  - [WaterNeRF：水下场景的神经辐射场](https://arxiv.org/abs/2209.13091) | [code]\n    > 水下成像是海洋机器人执行的一项关键任务，其应用范围广泛，包括水产养殖、海洋基础设施检查和环境监测。然而，水柱效应，例如衰减和反向散射，会极大地改变水下捕获图像的颜色和质量。由于不同的水条件和这些影响的范围依赖性，恢复水下图像是一个具有挑战性的问题。这会影响下游感知任务，包括深度估计和 3D 重建。在本文中，我们推进了神经辐射场 (NeRF) 的最新技术，以实现基于物理的密集深度估计和颜色校正。我们提出的方法 WaterNeRF 估计了基于物理的水下图像形成模型的参数，从而产生了混合数据驱动和基于模型的解决方案。在确定场景结构和辐射场后，我们可以生成退化和校正的水下图像的新视图，以及场景的密集深度。我们在真实的水下数据集上定性和定量地评估所提出的方法。\n  - [神经全局照明：动态区域光下的交互式间接照明预测, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9904431) | [code]\n    > 我们提出了神经全局照明，这是一种在具有动态视点和区域照明的静态场景中快速渲染全全局照明的新方法。我们方法的关键思想是利用深度渲染网络来模拟从每个着色点到全局照明的复杂映射。为了有效地学习映射，我们提出了一种对神经网络友好的输入表示，包括每个着色点的属性、视点信息和组合照明表示，该表示能够与紧凑的神经网络进行高质量的拟合。为了合成高频全局光照效果，我们通过位置编码将低维输入转换为高维空间，并将渲染网络建模为深度全连接网络。此外，我们将屏幕空间神经缓冲区提供给我们的渲染网络，以将屏幕空间中的对象之间的全局信息共享到每个着色点。我们已经证明了我们的神经全局照明方法可以渲染各种场景，这些场景表现出复杂的全频全局照明效果，例如多次反射光泽互反射、渗色和焦散。\n  - [烘焙特征：通过渲染特征图加速体积分割](https://arxiv.org/abs/2209.12744) | [code]\n    > 最近提出了一些方法，即仅使用彩色图像和专家监督以稀疏语义注释像素的形式将 3D 体积密集分割成类。虽然令人印象深刻，但这些方法仍然需要相对大量的监督，并且在实践中分割对象可能需要几分钟。这样的系统通常只优化它们在它们适合的特定场景上的表示，而不利用来自先前看到的图像的任何先验信息。在本文中，我们建议使用在现有大型数据集上训练的模型提取的特征来提高分割性能。我们通过体积渲染特征图并监督从每个输入图像中提取的特征，将这种特征表示烘焙到神经辐射场 (NeRF) 中。我们表明，通过将这种表示烘焙到 NeRF 中，我们使后续的分类任务变得更加容易。我们的实验表明，与现有方法相比，我们的方法在广泛的场景中以更少的语义注释实现了更高的分割精度。\n## Sep18 - Sep24, 2022\n  - [Local_INN：使用可逆神经网络的隐式地图表示和定位](https://arxiv.org/abs/2209.11925) | [code]\n    > 机器人定位是使用地图和传感器测量找到机器人姿势的逆问题。近年来，可逆神经网络（INNs）成功地解决了各个领域的模糊逆问题。本文提出了一个用 INN 解决本地化问题的框架。我们设计了一个 INN，它在正向路径中提供隐式地图表示并在反向路径中提供定位。通过在评估中对潜在空间进行采样，Local\\_INN 输出具有协方差的机器人位姿，可用于估计不确定性。我们表明 Local\\_INN 的本地化性能与当前的方法相当，但延迟要低得多。我们使用训练集外部的姿势从 Local\\_INN 显示详细的 2D 和 3D 地图重建。我们还提供了一个使用 Local\\_INN 的全局定位算法来解决绑架问题。\n  - [NeRF-Loc：神经辐射场内基于变换器的对象定位](https://arxiv.org/abs/2209.12068) | [code]\n    > 神经辐射场 (NeRFs) 已成功用于场景表示。最近的工作还开发了使用基于 NeRF 
的环境表示的机器人导航和操纵系统。由于对象定位是许多机器人应用的基础，为了进一步释放 NeRF 在机器人系统中的潜力，我们研究了 NeRF 场景中的对象定位。我们提出了一个基于转换器的框架 NeRF-Loc 来提取 NeRF 场景中对象的 3D 边界框。 NeRF-Loc 将预先训练的 NeRF 模型和相机视图作为输入，并生成标记的 3D 对象边界框作为输出。具体来说，我们设计了一对并行的转换器编码器分支，即粗流和细流，对目标对象的上下文和细节进行编码。然后将编码特征与注意力层融合在一起，以减轻模糊性，从而实现准确的对象定位。我们将我们的方法与传统的基于变压器的方法进行了比较，我们的方法取得了更好的性能。此外，我们还展示了第一个基于 NeRF 样本的对象定位基准 NeRFLocBench。\n  - [SG-SRNs：超像素引导的场景表示网络, SignalProcessingLetters](https://ieeexplore.ieee.org/abstract/document/9900405) | [code]\n    > 最近，场景表示网络（SRNs）由于其连续且轻量级的场景表示能力，在计算机视觉领域引起了越来越多的关注。然而，SRN 通常在低纹理图像区域上表现不佳。为了解决这个问题，我们在本文中提出了超像素引导的场景表示网络，称为 SG-SRN，由主干模块 (SRN)、超像素分割模块和超像素正则化模块组成。在所提出的方法中，除了新颖的视图合成任务外，表示感知的超像素分割掩码生成任务由所提出的超像素分割模块实现。然后，超像素正则化模块利用超像素分割掩码以局部平滑的方式引导要学习的主干，并优化局部区域的场景表示，以自监督的方式间接缓解低纹理区域的结构失真.在我们构建的数据集和公共 Synthetic-NeRF 数据集上的广泛实验结果表明，所提出的 SG-SRN 实现了显着更好的 3D 结构表示性能。\n  - [PNeRF：用于不确定 3D 视觉映射的概率神经场景表示, ICRA2023](https://arxiv.org/abs/2209.11677) | [code]\n    > 最近，神经场景表示在视觉上表示 3D 场景提供了非常令人印象深刻的结果，但是，它们的研究和进展主要局限于计算机图形中虚拟模型的可视化或计算机视觉中的场景重建，而没有明确考虑传感器和姿势的不确定性。然而，在机器人应用中使用这种新颖的场景表示需要考虑神经图中的这种不确定性。因此，本文的目的是提出一种用不确定的训练数据训练 {\\em 概率神经场景表示} 的新方法，该方法可以将这些表示包含在机器人应用程序中。使用相机或深度传感器获取图像包含固有的不确定性，此外，用于学习 3D 模型的相机姿势也不完善。如果将这些测量值用于训练而不考虑其不确定性，则生成的模型不是最优的，并且生成的场景表示可能包含诸如模糊和几何不均匀等伪影。在这项工作中，通过关注以概率方式使用不确定信息进行训练，研究了将不确定性整合到学习过程中的问题。所提出的方法涉及使用不确定性项显式增加训练似然性，使得网络的学习概率分布相对于训练不确定性最小化。将会显示，除了更精确和一致的几何形状之外，这会导致更准确的图像渲染质量。已经对合成数据集和真实数据集进行了验证，表明所提出的方法优于最先进的方法。结果表明，即使在训练数据有限的情况下，所提出的方法也能够呈现新颖的高质量视图。\n  - [感觉怎么样？ 用于越野车辆可穿越性的自我监督成本图学习](https://arxiv.org/abs/2209.10788) | [code]\n    > 估计越野环境中的地形可穿越性需要推理机器人与这些地形之间的复杂交互动力学。然而，对于这些交互，构建准确的物理模型或创建信息标签以有监督的方式学习模型具有挑战性。我们提出了一种方法，该方法通过以自我监督的方式将外部感知环境信息与本体感知地形交互反馈相结合来学习预测可遍历性成本图。此外，我们提出了一种将机器人速度纳入成本图预测管道的新方法。我们在具有挑战性的越野地形的大型自主全地形车 (ATV) 上的多个短距离和大规模导航任务中验证了我们的方法，并证明了在单独的大型地面机器人上易于集成。我们的短尺度导航结果表明，使用我们学习的成本图可以使导航整体更顺畅，并为机器人提供对机器人与不同地形类型（如草地和砾石）之间相互作用的更细粒度的理解。我们的大规模导航试验表明，在 400 米到 3150 米的具有挑战性的越野路线中，与基于占用的导航基线相比，我们可以将干预次数减少多达 57%。\n  - [具有通道调谐的面向边缘的隐式神经表示](https://arxiv.org/abs/2209.11697) | [code]\n    > 
隐式神经表示，将图像表示为连续函数而不是离散网格形式，广泛用于图像处理。尽管其表现出色，但在恢复给定信号的清晰形状（例如图像边缘）方面仍然存在限制。在本文中，我们提出了梯度幅度调整算法，该算法计算图像的梯度以训练隐式表示。此外，我们提出了面向边缘的表示网络（EoREN），它可以通过拟合梯度信息（面向边缘的模块）来重建具有清晰边缘的图像。此外，我们添加了 Channel-tuning 模块来调整给定信号的分布，从而解决了拟合梯度的长期问题。通过分离两个模块的反向传播路径，EoREN 可以在不妨碍梯度作用的情况下学习图像的真实颜色。我们定性地证明了我们的模型可以重建复杂的信号，并通过定量结果证明了我们模型的一般重建能力。\n  - [来自单个压缩光场测量的快速视差估计](https://arxiv.org/abs/2209.11342) | [code]\n    > 来自光场的丰富空间和角度信息允许开发多种视差估计方法。然而，光场的获取需要较高的存储和处理成本，限制了该技术在实际应用中的使用。为了克服这些缺点，压缩传感 (CS) 理论允许开发光学架构来获取单个编码光场测量。该测量使用需要高计算成本的优化算法或深度神经网络进行解码。从压缩光场进行视差估计的传统方法需要首先恢复整个光场，然后进行后处理步骤，因此需要很长时间。相比之下，这项工作通过省略传统方法中所需的恢复步骤，从单个压缩测量中提出了一种快速的视差估计。具体来说，我们建议联合优化用于获取单个编码光场快照的光学架构和用于估计视差图的卷积神经网络 (CNN)。在实验上，所提出的方法估计的视差图与使用深度学习方法重建的光场获得的视差图相当。此外，所提出的方法在训练和推理方面比从重建光场估计视差的最佳方法快 20 倍。\n  - [FNeVR：面部动画的神经体积渲染](https://arxiv.org/abs/2209.10340) | [code]\n    > 人脸动画是计算机视觉中最热门的话题之一，在生成模型的帮助下取得了可喜的成绩。然而，由于复杂的运动变形和复杂的面部细节建模，生成身份保持和照片般逼真的图像仍然是一个关键挑战。为了解决这些问题，我们提出了一个人脸神经体绘制 (FNeVR) 网络，以在一个统一的框架中充分挖掘 2D 运动扭曲和 3D 体绘制的潜力。在 FNeVR 中，我们设计了一个 3D 面部体积渲染 (FVR) 模块来增强图像渲染的面部细节。具体来说，我们首先使用精心设计的架构提取 3D 信息，然后引入正交自适应光线采样模块以实现高效渲染。我们还设计了一个轻量级的姿势编辑器，使 FNeVR 能够以简单而有效的方式编辑面部姿势。大量实验表明，我们的 FNeVR 在广泛使用的 Talking Head 基准测试中获得了最佳的整体质量和性能。\n  - [PREF：可预测性正则化神经运动场, ECCV2022(oral)](https://arxiv.org/abs/2209.10691) | [code]\n    > 了解动态场景中的 3D 运动对于许多视觉应用至关重要。最近的进展主要集中在估计一些特定元素的活动，如人类。在本文中，我们利用神经运动场来估计多视图设置中所有点的运动。由于颜色相似的点和颜色随时间变化的点的模糊性，使用多视图数据对动态场景的运动进行建模具有挑战性。我们建议将估计的运动规范化为可预测的。如果先前帧的运动是已知的，那么不久的将来的运动应该是可预测的。因此，我们通过首先调节潜在嵌入的估计运动，然后通过采用预测器网络来强制嵌入的可预测性来引入可预测性正则化。与最先进的基于神经运动场的动态场景表示方法相比，所提出的框架 PREF（Predictability REgularized Fields）实现了同等或更好的结果，同时不需要场景的先验知识。\n  - [wildNeRF：使用稀疏单目数据捕获的野外动态场景的完整视图合成](https://arxiv.org/abs/2209.10399) | [code]\n    > 我们提出了一种新的神经辐射模型，该模型可以以自我监督的方式进行训练，用于动态非结构化场景的新视图合成。我们的端到端可训练算法可在几秒钟内学习高度复杂的真实静态场景，并在几分钟内学习具有刚性和非刚性运动的动态场景。通过区分静态像素和以运动为中心的像素，我们从一组稀疏的图像中创建高质量的表示。我们对现有基准进行了广泛的定性和定量评估，并在具有挑战性的 NVIDIA 动态场景数据集上设置了最先进的性能指标。此外，我们在具有挑战性的现实世界数据集（例如 Cholec80 和 SurgicalActions160）上评估我们的模型性能。\n  - 
[Loc-NeRF：使用神经辐射场进行蒙特卡罗定位](https://arxiv.org/abs/2209.09050) | [***``[code]``***](https://github.com/MIT-SPARK/Loc-NeRF)\n    > 我们提出了 Loc-NeRF，这是一种基于实时视觉的机器人定位方法，它结合了蒙特卡洛定位和神经辐射场 (NeRF)。我们的系统使用预训练的 NeRF 模型作为环境地图，并且可以使用 RGB 相机作为机器人上唯一的外部感受器实时定位自身。虽然神经辐射场已经在计算机视觉和图形中看到了视觉渲染的重要应用，但它们在机器人技术中的用途有限。现有的基于 NeRF 的定位方法需要良好的初始姿势猜测和大量计算，这使得它们对于实时机器人应用不切实际。通过使用 Monte Carlo 定位作为使用 NeRF 地图模型估计姿态的主力，Loc-NeRF 能够比现有技术更快地执行定位，并且不依赖于初始姿态估计。除了对合成数据进行测试外，我们还使用 Clearpath Jackal UGV 收集的真实数据运行我们的系统，并首次展示了使用神经辐射场执行实时全局定位的能力。我们通过此 https 网址公开我们的代码。\n  - [密度感知 NeRF 集成：量化神经辐射场中的预测不确定性](https://arxiv.org/abs/2209.08718) | [code]\n    > 我们表明，如果考虑到密度感知认知不确定性项，则集成有效地量化了神经辐射场 (NeRFs) 中的模型不确定性。在先前的工作中研究的朴素集成只是简单地平均渲染的 RGB 图像，以量化由观察到的场景的相互矛盾的解释引起的模型不确定性。相比之下，由于缺乏关于训练期间未观察到的场景部分的知识，我们还考虑了沿单个射线的终止概率来识别认知模型的不确定性。我们在已建立的 NeRF 不确定性量化基准中实现了新的最先进的性能，优于需要对 NeRF 架构和训练机制进行复杂更改的方法。我们进一步证明了 NeRF 不确定性可用于次佳视图选择和模型细化。\n  - [NeRF-SOS：复杂场景上的任意视图自监督对象分割](https://zhiwenfan.github.io/NeRF-SOS/) | [***``[code]``***](https://github.com/VITA-Group/NeRF-SOS)\n    > 神经体积表示已经显示了多层感知器 (MLP) 可以使用多视图校准图像进行优化以表示场景几何和外观的潜力，而无需明确的 3D 监督。对象分割可以基于学习到的辐射场丰富许多下游应用。然而，引入手工分割来定义复杂现实世界场景中的感兴趣区域并非易事且成本高昂，因为它需要每个视图注释。本文针对复杂的现实世界场景使用 NeRF 进行对象分割的自监督学习探索。我们的框架称为带有自监督对象分割 NeRF-SOS 的 NeRF，它结合了对象分割和神经辐射场来分割场景中任何视图中的对象。通过在外观和几何级别上提出一种新颖的协作对比损失，NeRF-SOS 鼓励 NeRF 模型从其密度场和自我监督的预训练 2D 视觉特征中提取紧凑的几何感知分割簇。自监督对象分割框架可以应用于各种 NeRF 模型，这些模型既可以产生逼真的渲染结果，又可以在室内和室外场景中提供令人信服的分割图。 LLFF、Tank & Temple 和 BlendedMVS 数据集的广泛结果验证了 NeRF-SOS 的有效性。它始终超越其他基于 2D 的自我监督基线，并预测比现有监督对应物更精细的语义掩码。请参阅我们项目页面上的视频以获取更多详细信息：此 https URL。\n  - [MeSLAM：基于神经域的内存高效 SLAM, SMC2022](https://arxiv.org/abs/2209.09357) | [code]\n    > 由于长期机器人操作中地图大小的增加，现有的同时定位和映射 (SLAM) 方法的可扩展性有限。此外，为定位和规划任务处理此类地图会导致车载所需的计算资源增加。为了解决长期操作中的内存消耗问题，我们开发了一种新颖的实时 SLAM 算法 MeSLAM，它基于神经场隐式地图表示。它将提议的全局映射策略（包括神经网络分布和区域跟踪）与外部里程计系统相结合。因此，该算法能够有效地训练代表不同地图区域的多个网络，并在大规模环境中准确地跟踪姿势。实验结果表明，所提出的方法的准确性与最先进的方法相当（在 TUM RGB-D 序列上平均为 6.6 cm），并且优于基线 iMAP*。此外，所提出的 SLAM 方法在最先进的 SLAM 方法中提供了最紧凑的地图，没有细节失真（1.9 MB 可存储 57 m3）。\n  - 
[通过神经动画网格进行人体性能建模和渲染](https://arxiv.org/abs/2209.08468) | [code]\n    > 我们最近看到了照片真实人体建模和渲染的神经进步的巨大进步。但是，将它们集成到现有的基于网格的管道中以用于下游应用程序仍然具有挑战性。在本文中，我们提出了一种综合神经方法，用于从密集的多视图视频中对人类表演进行高质量的重建、压缩和渲染。我们的核心直觉是将传统的动画网格工作流程与新型高效神经技术联系起来。我们首先介绍了一种用于在几分钟内生成高质量表面的神经表面重建器。它将截断有符号距离场 (TSDF) 的隐式体积渲染与多分辨率哈希编码结合在一起。我们进一步提出了一种混合神经跟踪器来生成动画网格，它将显式非刚性跟踪与自监督框架中的隐式动态变形相结合。前者将粗略的变形提供回规范空间，而后者隐含的进一步使用我们的重构器中的 4D 哈希编码来预测位移。然后，我们讨论使用获得的动画网格的渲染方案，范围从动态纹理到各种带宽设置下的流明图渲染。为了在质量和带宽之间取得复杂的平衡，我们提出了一种分层解决方案，首先渲染覆盖表演者的 6 个虚拟视图，然后进行遮挡感知神经纹理混合。我们展示了我们的方法在各种基于网格的应用程序和各种平台上逼真的自由视图体验中的有效性，即通过移动 AR 将虚拟人类表演插入真实环境或使用 VR 耳机沉浸式观看才艺表演。\n  - [LATITUDE：在城市规模的 NeRF 中使用截断动态低通滤波器进行机器人全局定位, ICRA2023](https://arxiv.org/abs/2209.08498) | [***``[code]``***](https://github.com/jike5/LATITUDE)\n    > 神经辐射场 (NeRFs) 在表示具有高分辨率细节和高效内存的复杂 3D 场景方面取得了巨大成功。然而，当前基于 NeRF 的姿态估计器没有初始姿态预测，并且在优化过程中容易出现局部最优。在本文中，我们提出了 LATITUDE：使用截断动态低通滤波器进行全局定位，它在城市规模的 NeRF 中引入了两阶段定位机制。在位置识别阶段，我们通过训练后的 NeRF 生成的图像训练回归器，为全局定位提供初始值。在姿态优化阶段，我们通过直接优化切平面上的姿态来最小化观察图像和渲染图像之间的残差。为了避免收敛到局部最优，我们引入了截断动态低通滤波器 (TDLF) 用于从粗到细的姿态配准。我们在合成数据和真实世界数据上评估我们的方法，并展示其在大规模城市场景中高精度导航的潜在应用。代码和数据将在此 https 网址上公开提供。\n  - [使用成像声纳的神经隐式表面重建](https://arxiv.org/abs/2209.08221) | [code]\n    > 我们提出了一种使用成像声纳（也称为前视声纳（FLS））对物体进行密集 3D 重建的技术。与以前将场景几何建模为点云或体积网格的方法相比，我们将几何表示为神经隐函数。此外，给定这样的表示，我们使用可微分体积渲染器来模拟声波的传播以合成成像声纳测量。我们在真实和合成数据集上进行实验，并表明我们的算法从多视图 FLS 图像中重建高保真表面几何图形的质量比以前的技术高得多，并且不会受到相关的内存开销的影响。\n  - [使用神经辐射场进行主动机器人 3D 重建的不确定性引导策略, RAL2022](https://arxiv.org/abs/2209.08409) | [code]\n    > 在本文中，我们解决了物体的主动机器人 3D 重建问题。特别是，我们研究了带有手持摄像头的移动机器人如何选择有利数量的视图来有效地恢复对象的 3D 形状。与该问题的现有解决方案相反，我们利用流行的基于神经辐射场的对象表示，最近在各种计算机视觉任务中显示出令人印象深刻的结果。然而，使用这种表示直接推断对象的显式 3D 几何细节并不简单，这使得密集 3D 重建的次佳视图选择问题具有挑战性。本文介绍了一种基于光线的体积不确定性估计器，它计算颜色样本沿物体隐式神经表示的每条光线的权重分布的熵。我们表明，使用所提出的估计器给出一个新颖的视图，可以推断出底层 3D 几何的不确定性。然后，我们提出了一个下一个最佳视图选择策略，该策略由基于神经辐射场的表示中基于射线的体积不确定性指导。令人鼓舞的合成数据和真实世界数据的实验结果表明，本文提出的方法可以启用一个新的研究方向，即使用隐式 3D 对象表示来解决机器人视觉应用中的下一个最佳视图问题，将我们的方法与现有的方法区分开来依赖于显式 3D 几何建模的方法。\n  - [医学影像分割的隐式神经表示, 
MICCAI2022](https://link.springer.com/chapter/10.1007/978-3-031-16443-9_42) | [code]\n    > 医学成像中的 3D 信号（例如 CT 扫描）通常被参数化为体素的离散网格。例如，现有的最先进的器官分割方法学习离散的分割图。不幸的是，这些方法的内存需求随着空间分辨率的增加而呈立方增长，这使得它们不适合处理高分辨率扫描。为了克服这个问题，我们设计了一个隐式器官分割网络 (IOSNet)，它利用连续的隐式神经表示并具有几个有用的属性。首先，IOSNet 解码器内存大致恒定且独立于空间分辨率，因为它将分割图参数化为连续函数。其次，IOSNet 的收敛速度比基于离散体素的方法快得多，因为它能够准确地分割器官而不受器官大小的影响，从而在不需要任何辅助技巧的情况下缓解大小不平衡问题。第三，由于其连续学习表示，IOSNet 自然支持超分辨率（即在推理过程中以任意分辨率采样）。此外，尽管使用了一个简单的轻量级解码器，IOSNet 始终优于离散专业分割架构 UNet。因此，我们的方法表明隐式神经表示非常适合医学成像应用，尤其是处理高分辨率 3D 医学扫描。\n  - [ActiveNeRF：通过不确定性估计学习在哪里看](https://arxiv.org/abs/2209.08546) | [***``[code]``***](https://github.com/LeapLabTHU/ActiveNeRF)\n    > 最近，神经辐射场 (NeRF) 在重建 3D 场景和从一组稀疏的 2D 图像合成新视图方面显示出令人鼓舞的性能。尽管有效，但 NeRF 的性能很大程度上受训练样本质量的影响。由于场景中的姿势图像有限，NeRF 无法很好地泛化到新颖的视图，并且可能会在未观察到的区域中崩溃为琐碎的解决方案。这使得 NeRF 在资源受限的情况下变得不切实际。在本文中，我们提出了一种新颖的学习框架 ActiveNeRF，旨在对输入预算受限的 3D 场景进行建模。具体来说，我们首先将不确定性估计纳入 NeRF 模型，以确保在少量观察下的稳健性，并提供对 NeRF 如何理解场景的解释。在此基础上，我们建议使用基于主动学习方案的新捕获样本来补充现有的训练集。通过评估给定新输入的不确定性减少情况，我们选择带来最多信息增益的样本。通过这种方式，可以用最少的额外资源提高新视图合成的质量。大量实验验证了我们的模型在真实场景和合成场景上的性能，尤其是在训练数据较少的情况下。代码将在 \\url{this https URL} 发布。\n## Sep11 - Sep17, 2022\n  - [iDF-SLAM：具有神经隐式映射和深度特征跟踪的端到端 RGB-D SLAM](https://arxiv.org/abs/2209.07919) | [code]\n    > 我们提出了一种新颖的端到端 RGB-D SLAM iDF-SLAM，它采用基于特征的深度神经跟踪器作为前端，采用 NeRF 风格的神经隐式映射器作为后端。神经隐式映射器是即时训练的，虽然神经跟踪器是在 ScanNet 数据集上进行预训练的，但它也会随着神经隐式映射器的训练进行微调。在这样的设计下，我们的 iDF-SLAM 能够学习使用特定场景的特征进行相机跟踪，从而实现 SLAM 系统的终身学习。跟踪器和映射器的训练都是自我监督的，没有引入地面真实姿势。我们在 Replica 和 ScanNet 数据集上测试了 iDF-SLAM 的性能，并将结果与​​最近的两个基于 NeRF 的神经 SLAM 系统进行了比较。所提出的 iDF-SLAM 在场景重建和相机跟踪的竞争性能方面展示了最先进的结果。\n  - [3DMM-RF：用于 3D 人脸建模的卷积辐射场](https://arxiv.org/abs/2209.07366) | [code]\n    > 面部 3D 可变形模型是具有无数应用的主要计算机视觉主题，并且在过去二十年中得到了高度优化。深度生成网络的巨大改进为改进此类模型创造了各种可能性，并引起了广泛的兴趣。此外，神经辐射领域的最新进展正在彻底改变已知场景的新视图合成。在这项工作中，我们提出了一个面部 3D 
可变形模型，它利用了上述两者，并且可以准确地建模对象的身份、姿势和表情，并在任意光照下渲染它。这是通过利用强大的基于深度样式的生成器来克服神经辐射场的两个主要弱点，即它们的刚性和渲染速度来实现的。我们引入了一种基于样式的生成网络，它一次性合成所有且仅合成神经辐射场所需的渲染样本。我们创建了一个巨大的面部渲染标记合成数据集，并在这些数据上训练网络，以便它可以准确地建模和概括面部身份、姿势和外观。最后，我们证明该模型可以准确地拟合任意姿势和光照的“in-the-wild”人脸图像，提取人脸特征，并用于在可控条件下重新渲染人脸。\n  - [DevNet：通过密度体积构建的自监督单目深度学习, ECCV2022](https://arxiv.org/abs/2209.06351) | [code]\n    > 单目图像的自监督深度学习通常依赖于时间相邻图像帧之间的 2D 像素级光度关系。然而，它们既没有充分利用 3D 逐点几何对应，也没有有效地解决由遮挡或照明不一致引起的光度翘曲的模糊性。为了解决这些问题，这项工作提出了密度体积构建网络 (DevNet)，这是一种新颖的自我监督单目深度学习框架，可以考虑 3D 空间信息，并利用相邻相机平截头体之间更强的几何约束。我们的 DevNet 不是直接从单个图像中回归像素值，而是将相机平截头体划分为多个平行平面，并预测每个平面上的逐点遮挡概率密度。最终的深度图是通过沿相应光线对密度进行积分来生成的。在训练过程中，引入了新的正则化策略和损失函数来减轻光度模糊和过拟合。在没有明显扩大模型参数大小或运行时间的情况下，DevNet 在 KITTI-2015 室外数据集和 NYU-V2 室内数据集上都优于几个具有代表性的基线。特别是，在深度估计任务中，KITTI-2015 和 NYU-V2 上的 DevNet 的均方根偏差降低了约 4%。此 https 网址提供了代码。\n  - [明确可控的 3D 感知肖像生成](https://arxiv.org/abs/2209.05434) | [code]\n    > 与成本高昂的传统头像创建流程相比，当代生成方法直接从照片中学习数据分布。虽然大量工作扩展了无条件生成模型并实现了一定程度的可控性，但确保多视图一致性仍然具有挑战性，尤其是在大姿势中。在这项工作中，我们提出了一个生成 3D 感知肖像的网络，同时可以根据有关姿势、身份、表情和照明的语义参数进行控制。我们的网络使用神经场景表示来建模 3D 感知肖像，其生成由支持显式控制的参数化面部模型引导。虽然通过对比具有部分不同属性的图像可以进一步增强潜在的解缠结，但在为表情制作动画时，非面部区域（例如头发和背景）仍然存在明显的不一致。我们通过提出一种体积混合策略来解决这个问题，在该策略中，我们通过混合动态和静态区域来形成复合输出，其中两部分从联合学习的语义场中分割出来。我们的方法在广泛的实验中优于现有技术，当从自由视角观看时，可以在自然光下生成逼真的肖像，并具有生动的表达。它还展示了对真实图像和域外数据的泛化能力，在实际应用中显示出巨大的前景。\n  - [StructNeRF：具有结构提示的室内场景的神经辐射场](https://arxiv.org/abs/2209.05277) | [code]\n    > 神经辐射场 (NeRF) 使用密集捕获的输入图像实现照片般逼真的视图合成。然而，在给定稀疏视图的情况下，NeRF 的几何形状受到极大限制，导致新视图合成质量显着下降。受自监督深度估计方法的启发，我们提出了 StructNeRF，这是一种针对具有稀疏输入的室内场景的新颖视图合成的解决方案。 StructNeRF 利用自然嵌入在多视图输入中的结构提示来处理 NeRF 中的无约束几何问题。具体来说，它分别处理纹理和非纹理区域：提出了一种基于块的多视图一致光度损失来约束纹理区域的几何形状；对于非纹理平面，我们明确将它们限制为 3D 一致平面。通过密集的自监督深度约束，我们的方法提高了 NeRF 的几何和视图合成性能，而无需对外部数据进行任何额外的训练。对几个真实世界数据集的广泛实验表明，StructNeRF 在数量和质量上都超过了用于室内场景的最先进的方法。\n  - [学习用于视图合成的统一 3D 点云](https://arxiv.org/abs/2209.05013) | [code]\n    > 基于 3D 点云表示的视图合成方法已证明是有效的。然而，现有方法通常仅从单个源视图合成新视图，并且将它们泛化以处理多个源视图以追求更高的重建质量并非易事。在本文中，我们提出了一种新的基于深度学习的视图合成范式，它从不同的源视图中学习统一的 3D 点云。具体来说，我们首先通过根据深度图将源视图投影到 3D 
空间来构建子点云。然后，我们通过自适应融合子点云联合上定义的局部邻域中的点来学习统一的 3D 点云。此外，我们还提出了一个 3D 几何引导图像恢复模块来填充孔洞并恢复渲染新视图的高频细节。三个基准数据集的实验结果表明，我们的方法在数量上和视觉上都在很大程度上优于最先进的视图合成方法。\n  - [用于稀疏视图计算机断层扫描的自监督坐标投影网络](https://arxiv.org/abs/2209.05483) | [code]\n    > 在目前的工作中，我们提出了一种自监督坐标投影网络（SCOPE），通过解决逆断层扫描成像问题，从单个 SV 正弦图重建无伪影的 CT 图像。与最近使用隐式神经表示网络 (INR) 解决类似问题的相关工作相比，我们的重要贡献是一种有效且简单的重投影策略，该策略将断层扫描图像重建质量提高到有监督的深度学习 CT 重建工作之上。所提出的策略受到线性代数和逆问题之间简单关系的启发。为了求解欠定线性方程组，我们首先引入INR，通过图像连续性先验来约束解空间并获得粗解。其次，我们建议生成密集视图正弦图，提高线性方程组的秩并产生更稳定的 CT 图像解空间。我们的实验结果表明，重投影策略显着提高了图像重建质量（PSNR 至少 +3 dB）。此外，我们将最近的哈希编码集成到我们的 SCOPE 模型中，这极大地加速了模型训练。最后，我们在并行和扇形 X 射线束 SVCT 重建任务中评估 SCOPE。实验结果表明，所提出的 SCOPE 模型在数量和质量上都优于两种最新的基于 INR 的方法和两种流行的监督 DL 方法。\n  - [CU-Net：高效的点云颜色上采样网络](https://arxiv.org/abs/2209.06112) | [code]\n    > 增强现实、虚拟现实和远程呈现场景需要点云上采样。尽管几何上采样被很好地研究以致密点云坐标，但颜色的上采样在很大程度上被忽略了。在本文中，我们提出了第一个深度学习点云颜色上采样模型 CU-Net。利用基于稀疏卷积的特征提取器和基于神经隐函数的颜色预测模块，CU-Net 实现了线性时间和空间复杂度。因此，理论上保证 CU-Net 比大多数具有二次复杂度的现有方法更有效。实验结果表明，CU-Net 可以实时为具有近百万个点的照片般逼真的点云着色，同时具有比基线更好的视觉质量。此外，CU-Net 可以适应任意的上采样率和看不见的对象。我们的源代码将很快向公众发布。\n## Sep4 - Sep10, 2022\n  - [PixTrack：使用 NeRF 模板和特征度量对齐的精确 6DoF 对象姿势跟踪](https://arxiv.org/abs/2209.03910) | [code]\n    > 我们提出了 PixTrack，这是一个基于视觉的对象姿态跟踪框架，使用新颖的视图合成和深度特征度量对齐。我们的评估表明，我们的方法可以对 RGB 图像中的对象进行高度准确、稳健且无抖动的 6DoF 姿态估计，而无需任何数据注释或轨迹平滑。我们的方法在计算上也很高效，可以轻松进行多对象跟踪，而无需更改我们的方法，并且只使用 CPU 多处理。\n  - [具有深度神经表示的隐式全波形反演](https://arxiv.org/abs/2209.03525) | [code]\n    > 全波形反演（FWI）通常代表最先进的地下结构和物理参数成像方法，然而，其实施通常面临巨大挑战，例如建立一个良好的初始模型以摆脱局部最小值，以及评估反演结果的不确定性。在本文中，我们提出了使用连续和隐式定义的深度神经表示的隐式全波形反演（IFWI）算法。与对初始模型敏感的 FWI 相比，IFWI 受益于深度学习优化增加的自由度，从而允许从随机初始化开始，这大大降低了非唯一性和陷入局部最小值的风险。理论和实验分析均表明，在给定随机初始模型的情况下，IFWI 能够收敛到全局最小值，并生成具有精细结构的地下高分辨率图像。此外，IFWI 的不确定性分析可以很容易地通过使用各种深度学习方法近似贝叶斯推理来执行，本文通过添加 dropout 神经元对其进行分析。此外，IFWI具有一定的鲁棒性和较强的泛化能力，在各种二维地质模型的实验中得到了体现。通过适当的设置，IFWI也可以很好地适用于多尺度联合地球物理反演。\n  - [具有学习几何先验的 3D 纹理形状恢复](https://arxiv.org/abs/2209.03254) | [code]\n    > 从部分扫描中恢复 3D 
纹理形状对于许多实际应用至关重要。现有方法已经证明了隐式函数表示的有效性，但它们存在严重遮挡和不同对象类型的部分输入，这极大地阻碍了它们在现实世界中的应用价值。本技术报告介绍了我们通过结合学习几何先验来解决这些限制的方法。为此，我们从学习的姿势预测中生成一个 SMPL 模型，并将其融合到部分输入中，以添加人体的先验知识。我们还提出了一种新颖的完整性感知边界框自适应，用于处理不同级别的尺度和部分扫描的局部性。\n  - [SIRA：来自单个图像的可重新点亮的头像](https://arxiv.org/abs/2209.03027) | [code]\n    > 从单个图像中恢复人头的几何形状，同时分解材料和照明是一个严重不适定的问题，需要解决先验信息。基于 3D 可变形模型 (3DMM) 的方法，以及它们与可微渲染器的组合，已显示出可喜的结果。然而，3DMM 的表现力是有限的，它们通常会产生过度平滑且与身份无关的 3D 形状，仅限于面部区域。最近已经通过使用多层感知器参数化几何形状的神经场获得了高度准确的全头重建。这些表示的多功能性也被证明对于解开几何、材料和照明是有效的。然而，这些方法需要几十个输入图像。在本文中，我们介绍了 SIRA，这是一种从单个图像重建具有高保真几何形状和分解光和表面材料的人头头像的方法。我们的关键成分是两个基于神经场的数据驱动统计模型，可解决单视图 3D 表面重建和外观分解的模糊性。实验表明，SIRA 在 3D 头部重建中获得了最先进的结果，同时它成功地解开了全局照明、漫反射和镜面反射率。此外，我们的重建适用于基于物理的外观编辑和头部模型重新照明。\n  - [神经特征融合领域：自监督 2D 图像表示的 3D 蒸馏, 3DV2022(oral)](https://arxiv.org/abs/2209.03494) | [***``[code]``***](https://github.com/dichotomies/N3F)\n    > 我们提出了神经特征融合场 (N3F)，这是一种在将密集 2D 图像特征提取器应用于可重构为 3D 场景的多张图像分析时改进密集 2D 图像特征提取器的方法。给定一个图像特征提取器，例如使用自我监督进行预训练，N3F 使用它作为教师来学习在 3D 空间中定义的学生网络。 3D 学生网络类似于提取所述特征的神经辐射场，并且可以使用通常的可微渲染机器进行训练。因此，N3F 很容易适用于大多数神经渲染公式，包括 vanilla NeRF 及其对复杂动态场景的扩展。我们表明，我们的方法不仅能够在不使用手动标签的情况下在特定场景的神经领域的上下文中实现语义理解，而且在自我监督的 2D 基线上持续改进。这通过考虑不同序列中的各种任务（例如 2D 对象检索、3D 分割和场景编辑）来证明，包括 EPIC-KITCHENS 基准测试中的以自我为中心的长视频。\n  - [MotionDiffuse：使用扩散模型的文本驱动人体运动生成](https://arxiv.org/abs/2208.15001) | [***``[code]``***](https://github.com/mingyuan-zhang/MotionDiffuse)\n    > 人体运动建模对于许多现代图形应用程序很重要，这些应用程序通常需要专业技能。为了消除外行的技能障碍，最近的动作生成方法可以直接生成以自然语言为条件的人体动作。然而，通过各种文本输入实现多样化和细粒度的运动生成仍然具有挑战性。为了解决这个问题，我们提出了 MotionDiffuse，这是第一个基于扩散模型的文本驱动的运动生成框架，它展示了现有方法的几个所需属性。 1）概率映射。 MotionDiffuse 不是确定性的语言-运动映射，而是通过一系列注入变化的去噪步骤生成运动。 2）现实综合。 MotionDiffuse 擅长对复杂的数据分布进行建模并生成生动的运动序列。 3) 多级操作。 MotionDiffuse 响应身体部位的细粒度指令，以及带有时变文本提示的任意长度运动合成。我们的实验表明，MotionDiffuse 在文本驱动的运动生成和动作条件的运动生成方面具有令人信服的优势，从而优于现有的 SoTA 方法。定性分析进一步证明了 MotionDiffuse 对综合运动生成的可控性。主页：此 https 网址\n## Aug28 - Sep3, 2022\n  - [使用有符号射线距离函数 (SRDF) 的多视图重建](https://arxiv.org/abs/2209.00082) | [code]\n    > 在本文中，我们解决了多视图 3D 
形状重建的问题。尽管最近与隐式形状表示相关的可微渲染方法提供了突破性的性能，但它们的计算量仍然很大，并且通常在估计的几何形状上缺乏精度。为了克服这些限制，我们研究了一种新的计算方法，它建立在一种新的体积形状表示上，就像最近的可微渲染方法一样，但用深度图参数化以更好地实现形状表面。与此表示相关的形状能量评估给定彩色图像的 3D 几何形状，不需要外观预测，但在优化时仍然受益于体积积分。在实践中，我们提出了一种隐式形状表示，SRDF，它基于我们通过沿相机光线的深度参数化的有符号距离。相关的形状能量考虑了深度预测一致性和光度一致性之间的一致性，这在体积表示中的 3D 位置。可以考虑各种照片一致性先验，例如基于中值的基线，或更详细的标准，如学习函数。该方法保留了深度图的像素精度，并且是可并行化的。我们在标准数据集上的实验表明，它提供了关于最近使用隐式形状表示的方法以及传统的多视图立体方法的最先进的结果。\n  - [Dual-Space NeRF：在不同空间中学习动画化身和场景照明, 3DV2022](https://arxiv.org/abs/2208.14851) | [code]\n    > 在规范空间中对人体进行建模是捕捉和动画的常见做法。但是当涉及到神经辐射场 (NeRF) 时，仅仅在标准空间中学习一个静态的 NeRF 是不够的，因为即使场景照明是恒定的，当人移动时身体的照明也会发生变化。以前的方法通过学习每帧嵌入来缓解光照的不一致性，但这种操作并不能推广到看不见的姿势。鉴于光照条件在世界空间中是静态的，而人体在规范空间中是一致的，我们提出了一种双空间 NeRF，它在两个独立的空间中使用两个 MLP 对场景光照和人体进行建模。为了弥合这两个空间，以前的方法主要依赖于线性混合蒙皮 (LBS) 算法。然而，动态神经领域的 LBS 的混合权重是难以处理的，因此通常用另一个 MLP 来记忆，这不能推广到新的姿势。尽管可以借用 SMPL 等参数网格的混合权重，但插值操作会引入更多伪影。在本文中，我们建议使用重心映射，它可以直接泛化到看不见的姿势，并且出人意料地取得了比具有神经混合权重的 LBS 更好的结果。 Human3.6M 和 ZJU-MoCap 数据集的定量和定性结果显示了我们方法的有效性。\n  - [FoV-NeRF：虚拟现实的中心凹神经辐射场, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9872532) | [code]\n    > 随着消费者显示器和商业 VR 平台的兴起，虚拟现实 (VR) 正变得无处不在。这种显示需要低延迟和高质量的合成图像渲染，同时减少计算开销。神经渲染的最新进展表明，有望通过基于图像的虚拟或物理环境表示来解锁 3D 计算机图形的新可能性。具体来说，神经辐射场 (NeRF) 表明，可以在不损失与视图相关的效果的情况下实现 3D 场景的照片般逼真的质量和连续视图变化。虽然 NeRF 可以显着受益于 VR 应用的渲染，但它面临着由高视场、高分辨率和立体/以自我为中心的观看带来的独特挑战，通常会导致渲染图像的低质量和高延迟。在 VR 中，这不仅会损害交互体验，还可能导致疾病。为了解决 VR 中的六自由度、以自我为中心和立体 NeRF 的这些问题，我们提出了第一个注视条件 3D 神经表示和视图合成方法。我们将视觉和立体敏锐度的人类心理物理学纳入 3D 风景的以自我为中心的神经表示中。然后，我们共同优化延迟/性能和视觉质量，同时相互桥接人类感知和神经场景合成，以实现感知上高质量的沉浸式交互。我们进行了客观分析和主观研究，以评估我们方法的有效性。我们发现我们的方法显着减少了延迟（与 NeRF 相比减少了高达 99% 的时间），而不会损失高保真渲染（在感知上与全分辨率地面实况相同）。所提出的方法可能是迈向未来实时捕捉、传送和可视化远程环境的 VR/AR 系统的第一步。\n  - [跨光谱神经辐射场, 3DV2022](https://arxiv.org/abs/2209.00648) | [code]\n    > 我们提出了 X-NeRF，这是一种基于神经辐射场公式的学习交叉光谱场景表示的新方法，该方法给定从具有不同光谱灵敏度的相机捕获的图像。 X-NeRF 在训练期间优化跨光谱的相机姿势，并利用归一化跨设备坐标 (NXDC) 从任意视点呈现不同模态的图像，这些图像对齐并具有相同的分辨率。对 16 个具有彩色、多光谱和红外图像的前向场景进行的实验证实了 X-NeRF 在建模交叉光谱场景表示方面的有效性。\n  - [克隆：用于占用网格辅助神经表示的相机-激光雷达融合](https://arxiv.org/abs/2209.01194) | 
[code]\n    > 本文提出了 CLONeR，它通过允许对从稀疏输入传感器视图观察到的大型户外驾驶场景进行建模，显着改进了 NeRF。这是通过将 NeRF 框架内的占用和颜色学习解耦为分别使用 LiDAR 和相机数据训练的单独的多层感知器 (MLP) 来实现的。此外，本文提出了一种在 NeRF 模型旁边构建可微分 3D 占用网格图 (OGM) 的新方法，并利用此占用网格改进沿射线的点采样，以在度量空间中进行体积渲染。\n  - [NerfCap：使用动态神经辐射场捕获人类表现, TVCG2022](https://ieeexplore.ieee.org/abstract/document/9870173) | [code]\n    > 本文解决了从稀疏的多视图或单目视频中捕捉人类表演的挑战。给定表演者的模板网格，以前的方法通过将模板网格非刚性地注册到具有 2D 轮廓或密集光度对齐的图像来捕获人体运动。然而，详细的表面变形无法从轮廓中恢复，而光度对齐则受到视频外观变化引起的不稳定性的影响。为了解决这些问题，我们提出了 NerfCap，这是一种基于表演者动态神经辐射场 (NeRF) 表示的新型表演捕捉方法。具体来说，通过优化变形场和规范 NeRF 的外观模型，从模板几何初始化规范 NeRF 并注册到视频帧。为了捕捉大型身体运动和详细的表面变形，NerfCap 将线性混合蒙皮与嵌入式图形变形相结合。与受限于固定拓扑和纹理的基于网格的方法相比，NerfCap 能够灵活地捕捉视频中复杂的几何形状和外观变化，并合成更逼真的图像。此外，NerfCap 可以通过将合成视频与输入视频进行匹配，以自我监督的方式进行端到端的预训练。各种数据集的实验结果表明，NerfCap 在表面重建精度和新视图合成质量方面都优于先前的工作。\n## Aug21 - Aug27, 2022\n  - [训练和调整生成神经辐射场以进行属性条件 3D 感知人脸生成](https://arxiv.org/abs/2208.12550) | [***``[code]``***](https://github.com/zhangqianhui/TT-GNeRF)\n    > 基于生成神经辐射场 (GNeRF) 的 3D 感知 GAN 已经实现了令人印象深刻的高质量图像生成，同时保持了强大的 3D 一致性。最显着的成就是在人脸生成领域。然而，这些模型中的大多数都专注于提高视图一致性而忽略了解耦方面，因此这些模型无法提供对生成的高质量语义/属性控制。为此，我们引入了一个使用特定属性标签作为输入的条件 GNeRF 模型，以提高 3D 感知生成模型的可控性和解开能力。我们利用预训练的 3D 感知模型作为基础，并集成了一个双分支属性编辑模块 (DAEM)，该模块利用属性标签来提供对生成的控制。此外，我们提出了一种 TRIOT (TRAining as Init, and Optimizing for Tuning) 方法来优化潜在向量，以进一步提高属性编辑的精度。在广泛使用的 FFHQ 上进行的大量实验表明，我们的模型在保留非目标区域的同时，可以产生具有更好视图一致性的高质量编辑。该代码可在此 https 网址上找到。\n  - [Voxurf：基于体素的高效准确的神经表面重建](https://arxiv.org/abs/2208.12697) | [code]\n    > 神经表面重建旨在基于多视图图像重建准确的 3D 表面。以前基于神经体绘制的方法大多训练完全隐式模型，并且它们需要对单个场景进行数小时的训练。最近的努力探索了显式体积表示，它通过在可学习的体素网格中记忆重要信息来大大加速优化过程。然而，这些基于体素的方法通常难以重建细粒度几何。通过实证研究，我们发现高质量的表面重建取决于两个关键因素：构建连贯形状的能力和颜色几何依赖性的精确建模。特别是后者是精细细节准确重建的关键。受这些发现的启发，我们开发了 Voxurf，这是一种基于体素的高效和准确的神经表面重建方法，它包括两个阶段：1）利用可学习的特征网格来构建色场并获得连贯的粗略形状，以及 2）使用捕获精确的颜色几何依赖性的双色网络优化详细的几何图形。我们进一步引入了分层几何特征，以实现跨体素的信息共享。我们的实验表明，Voxurf 同时实现了高效率和高质量。在 DTU 基准上，与最先进的方法相比，Voxurf 实现了更高的重建质量，训练速度提高了 20 倍。\n  - [神经小说演员：学习人类演员的广义动画神经表示](https://arxiv.org/abs/2208.11905) | [code]\n    > 
我们提出了一种新方法，用于从一组稀疏的多人多视图图像中学习广义的可动画神经人类表示。学习到的表示可用于从一组稀疏的相机中合成任意人的新颖视图图像，并使用用户的姿势控制进一步对它们进行动画处理。虽然现有方法可以推广到新人或使用用户控制合成动画，但它们都不能同时实现这两者。我们将这一成就归功于为共享的多人人体模型使用 3D 代理，并进一步将不同姿势的空间扭曲到共享的规范姿势空间，在该空间中，我们学习了一个神经领域并预测了人和与姿势相关的变形，以及从输入图像中提取的特征的外观。为了应对身体形状、姿势和服装变形的巨大变化的复杂性，我们设计了具有解开几何和外观的神经人体模型。此外，我们利用 3D 代理的空间点和表面点的图像特征来预测与人和姿势相关的属性。实验表明，我们的方法在这两项任务上都显着优于现有技术。视频和代码可在此 https 网址上找到。\n  - [DreamBooth：为主题驱动生成微调文本到图像的扩散模型](https://dreambooth.github.io/) | [code]\n    > 大型文本到图像模型在人工智能的演进中实现了显着的飞跃，能够从给定的文本提示中对图像进行高质量和多样化的合成。然而，这些模型缺乏模仿给定参考集中对象的外观并在不同上下文中合成它们的新颖再现的能力。在这项工作中，我们提出了一种“个性化”文本到图像扩散模型的新方法（专门针对用户的需求）。给定主题的几张图像作为输入，我们微调预训练的文本到图像模型（Imagen，尽管我们的方法不限于特定模型），以便它学会将唯一标识符与该特定主题绑定.一旦对象被嵌入模型的输出域中，唯一标识符就可以用于合成在不同场景中情境化的对象的完全新颖的真实感图像。通过利用嵌入在模型中的语义先验和新的自生类特定先验保存损失，我们的技术能够在参考图像中没有出现的不同场景、姿势、视图和照明条件下合成主体。我们将我们的技术应用于几个以前无懈可击的任务，包括主题重新上下文化、文本引导视图合成、外观修改和艺术渲染（同时保留主题的关键特征）。项目页面：此 https 网址\n  - [E-NeRF：来自移动事件相机的神经辐射场](https://arxiv.org/abs/2208.11300) | [code]\n    > 从理想图像估计神经辐射场 (NeRFs) 已在计算机视觉领域得到广泛研究。大多数方法假设最佳照明和缓慢的相机运动。这些假设在机器人应用中经常被违反，其中图像包含运动模糊并且场景可能没有合适的照明。这可能会导致下游任务（例如场景的导航、检查或可视化）出现重大问题。为了缓解这些问题，我们提出了 E-NeRF，这是第一种从快速移动的事件摄像机中以 NeRF 形式估计体积场景表示的方法。我们的方法可以在非常快速的运动和高动态范围条件下恢复 NeRF，在这种情况下，基于帧的方法会失败。我们展示了仅通过提供事件流作为输入来渲染高质量帧是可能的。此外，通过结合事件和帧，我们可以估计在严重运动模糊下比最先进的方法质量更高的 NeRF。我们还表明，在只有很少的输入视图可用的情况下，结合事件和帧可以克服 NeRF 估计的失败情况，而无需额外的正则化。\n  - [FurryGAN：高质量的前景感知图像合成, ECCV2022](https://jeongminb.github.io/FurryGAN/) | [***``[code]``***](https://jeongminb.github.io/FurryGAN/)\n    > 前景感知图像合成旨在生成图像及其前景蒙版。一种常见的方法是将图像公式化为前景图像和背景图像的蒙版混合。这是一个具有挑战性的问题，因为它很容易达到一个简单的解决方案，即任一图像压倒另一个图像，即蒙版完全满或空，前景和背景没有有意义地分离。我们展示了 FurryGAN 的三个关键组件：1）将前景图像和合成图像都强加为逼真，2）将掩码设计为粗略和精细掩码的组合，以及 3）通过辅助掩码预测器引导生成器鉴别器。我们的方法使用非常详细的 alpha 蒙版生成逼真的图像，这些蒙版以完全无人监督的方式覆盖头发、毛皮和胡须。\n  - [SCONE：通过体积积分优化未知环境中的表面覆盖率](https://arxiv.org/abs/2208.10449) | [code]\n    > 下一个最佳视图计算 (NBV) 是机器人技术中长期存在的问题，包括识别下一个信息量最大的传感器位置，以有效且准确地重建 3D 对象或场景。像大多数当前方法一样，我们考虑来自深度传感器的 NBV 
预测。依赖于场景体积表示的基于学习的方法适用于路径规划，但不能很好地适应场景的大小，并且精度低于使用基于表面的表示的方法。然而，后者将相机限制在少数姿势。为了获得这两种表示的优点，我们表明我们可以通过蒙特卡罗积分在体积表示上最大化表面度量。我们的方法可扩展到大型场景并处理自由相机运动：它将由深度传感器（如激光雷达系统）收集的任意大点云以及相机姿势作为输入来预测 NBV。我们在由大型复杂 3D 场景组成的新数据集上展示了我们的方法。\n## Aug14 - Aug20, 2022\n  - [Vox-Surf：基于体素的隐式表面表示](https://arxiv.org/abs/2208.10925) | [code]\n    > 虚拟内容创建和交互在 AR 和 VR 等现代 3D 应用中发挥着重要作用。从真实场景中恢复详细的 3D 模型可以显着扩展其应用范围，并且已经在计算机视觉和计算机图形学界进行了数十年的研究。我们提出了 Vox-Surf，一种基于体素的隐式表面表示。我们的 Vox-Surf 将空间划分为有限的有界体素。每个体素在其角顶点中存储几何和外观信息。由于从体素表示继承而来的稀疏性，Vox-Surf 几乎适用于任何场景，并且可以从多个视图图像中轻松训练。我们利用渐进式训练过程逐步提取重要体素进行进一步优化，从而只保留有效体素，这大大减少了采样点的数量并提高了渲染速度。精细体素也可以视为碰撞检测的边界体积。实验表明，与其他方法相比，Vox-Surf 表示可以以更少的内存和更快的渲染速度学习精细的表面细节和准确的颜色。我们还表明，Vox-Surf 在场景编辑和 AR 应用中可以更实用。\n  - [通过多平面图像的 3D 对象运动估计动态场景的时间视图合成, ISMAR2022](https://arxiv.org/abs/2208.09463) | [***``[code]``***](https://github.com/NagabhushanSN95/DeCOMPnet)\n    > 在低计算设备上以图形方式渲染高帧率视频的挑战可以通过对未来帧的定期预测来解决，以增强虚拟现实应用程序中的用户体验。这是通过时间视图合成 (TVS) 的问题来研究的，其目标是在给定前一帧以及前一帧和下一帧的头部姿势的情况下预测视频的下一帧。在这项工作中，我们考虑了用户和对象都在移动的动态场景的 TVS。我们设计了一个框架，将运动解耦为用户和对象运动，以在预测下一帧的同时有效地使用可用的用户运动。我们通过隔离和估计过去帧中的 3D 对象运动然后外推来预测对象的运动。我们使用多平面图像 (MPI) 作为场景的 3D 表示，并将对象运动建模为 MPI 表示中对应点之间的 3D 位移。为了在估计运动时处理 MPI 中的稀疏性，我们结合了部分卷积和掩蔽相关层来估计对应点。然后将预测的对象运动与给定的用户或相机运动集成以生成下一帧。使用遮蔽填充模块，我们合成由于相机和物体运动而未覆盖的区域。我们为包含 800 个全高清分辨率视频的动态场景 TVS 开发了一个新的合成数据集。我们通过对我们的数据集和 MPI Sintel 数据集的实验表明，我们的模型优于文献中的所有竞争方法。\n  - [LoRD：用于高保真动态人体建模的局部 4D 隐式表示, ECCV2022](https://arxiv.org/abs/2208.08622) | [code]\n    > 4D 隐式表示的最新进展集中在使用低维潜在向量全局控制形状和运动，这容易丢失表面细节和累积跟踪误差。尽管许多深度局部表示已显示出可用于 3D 形状建模的有希望的结果，但它们的 4D 对应物尚不存在。在本文中，我们提出了一种新颖的用于动态服装人体的局部 4D 隐式表示，名为 LoRD，以填补这一空白，它兼具 4D 人体建模和局部表示的优点，并能够通过详细的表面变形进行高保真重建，例如衣服褶皱。特别是，我们的关键见解是鼓励网络学习局部部分级表示的潜在代码，能够解释局部几何和时间变形。为了在测试时进行推断，我们首先在每个时间步估计体内骨骼运动以跟踪局部部位，然后根据不同类型的观察数据通过自动解码优化每个部位的潜在代码。大量实验表明，该方法具有很强的表示 4D 人体的能力，并且在实际应用中优于最先进的方法，包括从稀疏点进行 4D 重建、非刚性深度融合，无论是定性还是定量。\n  - [从单目视频中对动画 3D 人体进行神经捕获, ECCV2022](https://arxiv.org/abs/2208.08728) | [code]\n    > 我们提出了一种从单目视频输入构建可动画 3D 人体表示的新颖范例，这样它就可以以任何看不见的姿势和视图进行渲染。我们的方法基于动态神经辐射场 
(NeRF)，该动态神经辐射场 (NeRF) 由作为几何代理的基于网格的参数化 3D 人体模型装配。以前的方法通常依赖多视图视频或准确的 3D 几何信息作为附加输入；此外，大多数方法在推广到看不见的姿势时质量会下降。我们认为，泛化的关键是用于查询动态 NeRF 的良好输入嵌入：良好的输入嵌入应该定义全体积空间中的单射映射，由姿态变化下的表面网格变形引导。基于这一观察，我们建议嵌入输入查询及其与网格顶点上一组测地最近邻所跨越的局部表面区域的关系。通过包含位置和相对距离信息，我们的嵌入定义了距离保留的变形映射，并很好地推广到看不见的姿势。为了减少对额外输入的依赖，我们首先使用现成的工具初始化每帧 3D 网格，然后提出一个管道来联合优化 NeRF 并细化初始网格。大量实验表明，我们的方法可以在看不见的姿势和视图下合成合理的人类渲染结果。\n  - [8 点算法作为 ViTs 相对姿势预测的归纳偏差, 3DV2022](https://arxiv.org/abs/2208.08988) | [***``[code]``***](https://github.com/crockwell/rel_pose)\n    > 我们提出了一个简单的基线，用于直接估计两个图像之间的相对姿势（旋转和平移，包括比例）。深度方法最近显示出强劲的进展，但通常需要复杂或多阶段的架构。我们展示了一些修改可以应用于视觉转换器 (ViT)，以使其计算接近八点算法。这种归纳偏差使一种简单的方法在多种环境中具有竞争力，通常在有限的数据机制中显着提高现有技术水平，并具有强大的性能提升。\n  - [PDRF：渐进式去模糊辐射场，用于从模糊图像中快速、稳健地重建场景](https://arxiv.org/abs/2208.08049) | [code]\n    > 我们提出了渐进式去模糊辐射场 (PDRF)，这是一种从模糊图像中有效重建高质量辐射场的新方法。虽然当前最先进的 (SoTA) 场景重建方法从干净的源视图实现照片般逼真的渲染结果，但当源视图受到模糊影响时，它们的性能会受到影响，这在野外图像中很常见。以前的去模糊方法要么不考虑 3D 几何，要么计算量很大。为了解决这些问题，PDRF 是辐射场建模中的一种渐进式去模糊方案，它通过结合 3D 场景上下文准确地模拟模糊。 PDRF 进一步使用有效的重要性采样方案，从而实现快速的场景优化。具体来说，PDRF 提出了一种 Coarse Ray Renderer 来快速估计体素密度和特征；然后使用 Fine Voxel Renderer 来实现高质量的光线追踪。我们进行了广泛的实验，结果表明 PDRF 比以前的 SoTA 快 15 倍，同时在合成场景和真实场景上都取得了更好的性能。\n  - [通过隐式视觉引导和超网络生成文本到图像](https://arxiv.org/abs/2208.08493) | [code]\n    > 我们开发了一种文本到图像生成的方法，该方法包含额外的检索图像，由隐式视觉引导损失和生成目标的组合驱动。与大多数现有的仅以文本为输入的文本到图像生成方法不同，我们的方法将跨模态搜索结果动态地馈送到统一的训练阶段，从而提高了生成结果的质量、可控性和多样性。我们提出了一种新的超网络调制的视觉文本编码方案来预测编码层的权重更新，从而实现从视觉信息（例如布局、内容）到相应的潜在域的有效传输。实验结果表明，我们的模型以额外的检索视觉数据为指导，优于现有的基于 GAN 的模型。在 COCO 数据集上，与最先进的方法相比，我们实现了更好的 FID 为 9.13，生成器参数减少了 3.5 倍。\n  - [从全向图像中捕捉休闲室内 HDR 辐射](https://arxiv.org/abs/2208.07903) | [code]\n    > 我们提出了 PanoHDR-NeRF，这是一种新颖的管道，可以随意捕获大型室内场景的合理全 HDR 辐射场，而无需精心设置或复杂的捕获协议。首先，用户通过在场景周围自由挥动现成的相机来捕捉场景的低动态范围 (LDR) 全向视频。 然后，LDR2HDR 网络将捕获的 LDR 帧提升为 HDR，随后用于训练定制的 NeRF++ 模型。 由此产生的 PanoHDR-NeRF 管道可以从场景的任何位置估计完整的 HDR 全景图。 通过对各种真实场景的新测试数据集进行实验，在训练期间未看到的位置捕获地面实况 HDR 辐射，我们表明 PanoHDR-NeRF 可以预测来自任何场景点的合理辐射。我们还表明，由 PanoHDR-NeRF 生成的 HDR 图像可以合成正确的照明效果，从而能够使用正确照明的合成对象来增强室内场景。\n  - [UPST-NeRF：用于 3D 
场景的神经辐射场的通用逼真风格转移](https://arxiv.org/abs/2208.07059) | [***``[code]``***](https://github.com/semchan/UPST-NeRF)\n    > 3D 场景逼真风格化旨在根据给定的风格图像从任意新颖的视图生成逼真的图像，同时确保从不同视点渲染时的一致性。现有的一些具有神经辐射场的风格化方法可以通过将风格图像的特征与多视图图像相结合来训练3D场景，从而有效地预测风格化场景。然而，这些方法会生成包含令人反感的伪影的新颖视图图像。此外，它们无法为 3D 场景实现通用的逼真风格化。因此，造型图像必须重新训练基于神经辐射场的 3D 场景表示网络。我们提出了一种新颖的 3D 场景逼真风格迁移框架来解决这些问题。它可以用 2D 风格的图像实现逼真的 3D 场景风格转换。我们首先预训练了一个 2D 真实感风格迁移网络，可以满足任何给定内容图像和风格图像之间的真实感风格迁移。然后，我们使用体素特征来优化 3D 场景并获得场景的几何表示。最后，我们共同优化了一个超网络，以实现任意风格图像的场景逼真风格迁移。在迁移阶段，我们使用预训练的 2D 真实感网络来约束 3D 场景中不同视图和不同风格图像的真实感风格。实验结果表明，我们的方法不仅实现了任意风格图像的 3D 逼真风格转换，而且在视觉质量和一致性方面优于现有方法。项目页面：此 https URL。\n  - [DM-NeRF：2D 图像的 3D 场景几何分解和操作](https://arxiv.org/abs/2208.07227) | [***``[code]``***](https://github.com/vLAR-group/DM-NeRF)\n    > 在本文中，我们从 2D 视图研究 3D 场景几何分解和操纵问题。通过利用最近的隐式神经表示技术，特别是吸引人的神经辐射场，我们引入了一个对象场组件，仅从 2D 监督中学习 3D 空间中所有单个对象的唯一代码。该组件的关键是一系列精心设计的损失函数，以使每个 3D 点，尤其是在非占用空间中，即使没有 3D 标签也能得到有效优化。此外，我们引入了一种逆查询算法，可以在学习的场景表示中自由操作任何指定的 3D 对象形状。值得注意的是，我们的操作算法可以明确地解决关键问题，例如对象碰撞和视觉遮挡。我们的方法称为 DM-NeRF，是最早在单个管道中同时重建、分解、操作和渲染复杂 3D 场景的方法之一。在三个数据集上的大量实验清楚地表明，我们的方法可以准确地从 2D 视图中分解所有 3D 对象，允许在 3D 空间中自由操作任何感兴趣的对象，例如平移、旋转、大小调整和变形。\n  - [HDR-Plenoxels：自校准高动态范围辐射场, ECCV2022](https://arxiv.org/abs/2208.06787) | [code]\n    > 我们提出了高动态范围辐射 (HDR) 场 HDR-Plenoxels，它学习 3D HDR 辐射场、几何信息和 2D 低动态范围 (LDR) 图像中固有的不同相机设置的全光函数。我们基于体素的体素渲染管道仅使用从不同相机设置中以端到端方式拍摄的多视图 LDR 图像来重建 HDR 辐射场，并且具有快速的收敛速度。为了处理现实世界场景中的各种相机，我们引入了一个色调映射模块，该模块对相机内的数字成像管道 (ISP) 进行建模并解开辐射设置。我们的色调映射模块允许我们通过控制每个新视图的辐射设置来进行渲染。最后，我们构建了一个具有不同相机条件的多视图数据集，这符合我们的问题设置。我们的实验表明，HDR-Plenoxels 可以仅从带有各种相机的 LDR 图像中表达细节和高质量的 HDR 新颖视图。\n## Aug7 - Aug13, 2022\n  - [渐进式多尺度光场网络, 3DV2022](https://arxiv.org/abs/2208.06710) | [code]\n    > 与图像集表示相比，神经表示在表示辐射和光场的能力方面显示出了巨大的希望，同时非常紧凑。然而，当前的表示不太适合流式传输，因为解码只能在单个细节级别上完成，并且需要下载整个神经网络模型。此外，高分辨率光场网络可能会出现闪烁和混叠，因为在没有适当过滤的情况下对神经网络进行采样。为了解决这些问题，我们提出了一个渐进式多尺度光场网络，它对具有多层次细节的光场进行编码。使用较少的神经网络权重对较低级别的细节进行编码，从而实现渐进式流传输并减少渲染时间。我们的渐进式多尺度光场网络通过在较低细节级别编码较小的抗锯齿表示来解决锯齿问题。此外，每个像素级别的细节使我们的表示能够支持抖动过渡和中心点渲染。\n  - 
[OmniVoxel：一种快速精确的全向神经辐射场重建方法, GCCE 2022](https://arxiv.org/abs/2208.06335) | [code]\n    > 本文提出了一种利用等矩形全向图像重建神经辐射场的方法。具有辐射场的隐式神经场景表示可以在有限的空间区域内连续重建场景的 3D 形状。然而，在商用 PC 硬件上训练完全隐式表示需要大量时间和计算资源（每个场景 15 ~ 20 小时）。因此，我们提出了一种显着加速这一过程的方法（每个场景 20 ∼ 40 分钟）。我们没有使用完全隐式的光线表示来重建辐射场，而是采用包含张量中的密度和颜色特征的特征体素。考虑到全向 equirectangular 输入和相机布局，我们使用球面体素化来表示，而不是三次表示。我们的体素化方法可以平衡内景和外景的重建质量。此外，我们对颜色特征采用轴对齐位置编码方法来提高整体图像质量。我们的方法在具有随机相机姿势的合成数据集上实现了令人满意的经验性能。此外，我们在包含复杂几何形状的真实场景中测试了我们的方法，并实现了最先进的性能。我们的代码和完整的数据集将与论文发表的同时发布。\n  - [HyperTime：时间序列的隐式神经表示](https://arxiv.org/abs/2208.05836) | [code]\n    > 隐式神经表示 (INR) 最近已成为一种强大的工具，可提供准确且与分辨率无关的数据编码。它们作为通用逼近器的鲁棒性已在各种数据源中得到证明，并应用于图像、声音和 3D 场景表示。然而，很少有人关注利用这些架构来表示和分析时间序列数据。在本文中，我们使用 INR 分析时间序列的表示，比较不同的激活函数在重建精度和训练收敛速度方面。我们展示了如何利用这些网络对时间序列进行插补，以及在单变量和多变量数据上的应用。最后，我们提出了一种利用 INR 来学习整个时间序列数据集的压缩潜在表示的超网络架构。我们引入了基于 FFT 的损失来指导训练，以便在时间序列中保留所有频率。我们展示了该网络可用于将时间序列编码为 INR，并且可以对它们的嵌入进行插值以从现有的时间序列中生成新的时间序列。我们通过将其用于数据增强来评估我们的生成方法，并表明它与当前最先进的时间序列增强方法具有竞争力。\n  - [RelPose：预测野外单个物体的概率相对旋转, ECCV2022](https://jasonyzhang.com/relpose/) | [***``[code]``***](https://github.com/jasonyzhang/relpose)\n    > 我们描述了一种数据驱动的方法，用于在给定任意对象的多个图像的情况下推断相机视点。该任务是经典几何流水线（如 SfM 和 SLAM）的核心组成部分，也是当代神经方法（例如 NeRF）对对象重建和视图合成的重要预处理要求。与现有的在稀疏视图中表现不佳的对应驱动方法相比，我们提出了一种基于自上而下预测的方法来估计相机视点。我们的关键技术见解是使用基于能量的公式来表示相对相机旋转的分布，从而使我们能够明确表示由对象对称性或视图产生的多个相机模式。利用这些相对预测，我们从多张图像中共同估计一组一致的相机旋转。我们表明，在给定可见和不可见类别的稀疏图像的情况下，我们的方法优于最先进的 SfM 和 SLAM 方法。此外，我们的概率方法明显优于直接回归相对姿势，这表明建模多模态对于连贯的关节重建很重要。我们证明我们的系统可以成为从多视图数据集进行野外重建的垫脚石。包含代码和视频的项目页面可以在这个 https URL 找到。\n  - [NIDN：纳米结构的神经逆向设计](https://arxiv.org/abs/2208.05480) | [code]\n    > 近十年来，计算工具已成为材料设计的核心，以降低成本实现快速开发周期。机器学习工具在光子学领域尤其兴起。然而，从优化的角度来看，设计所需的麦克斯韦方程的反演特别具有挑战性，需要复杂的软件。我们提出了一种创新的开源软件工具，称为纳米结构的神经逆向设计 (NIDN)，它允许使用基于物理的深度学习方法设计复杂的堆叠材料纳米结构。我们执行基于梯度的神经网络训练，而不是无导数或数据驱动的优化或学习方法，在这种训练中，我们根据其光谱特性直接优化材料及其结构。 NIDN 支持两种不同的求解器，严格的耦合波分析和有限差分时域方法。 NIDN 的实用性和有效性在几个合成示例以及 1550 nm 滤光片和抗反射涂层的设计中得到了证明。结果与实验基线、其他模拟工具和所需的光谱特性相匹配。鉴于其在网络架构和 Maxwell 求解器方面的完全模块化以及开源、许可的可用性，NIDN 将能够支持广泛应用中的计算材料设计过程。\n  
- [HRF-Net：来自稀疏输入的整体辐射场](https://arxiv.org/abs/2208.04717) | [code]\n    > 我们提出了 HRF-Net，这是一种基于整体辐射场的新型视图合成方法，它使用一组稀疏输入来渲染新颖的视图。最近的泛化视图合成方法也利用了辐射场，但渲染速度不是实时的。现有的方法可以有效地训练和渲染新颖的视图，但它们不能推广到看不见的场景。我们的方法解决了用于泛化视图合成的实时渲染问题，包括两个主要阶段：整体辐射场预测器和基于卷积的神经渲染器。这种架构不仅可以基于隐式神经场推断出一致的场景几何，还可以使用单个 GPU 有效地渲染新视图。我们首先在 DTU 数据集的多个 3D 场景上训练 HRF-Net，并且该网络可以仅使用光度损失对看不见的真实和合成数据产生似是而非的新颖视图。此外，我们的方法可以利用单个场景的一组更密集的参考图像来生成准确的新颖视图，而无需依赖额外的显式表示，并且仍然保持预训练模型的高速渲染。实验结果表明，HRF-Net 在各种合成和真实数据集上优于最先进的可泛化神经渲染方法。\n  - [使用隐式神经表示的蒙特卡罗去噪](https://oaktrust.library.tamu.edu/handle/1969.1/196567) | [code]\n    > Monte Carlo 路径追踪是计算机图形学中流行的 3D 渲染技术，但它通常需要在图像中的噪声量和计算时间之间进行代价高昂的权衡。因此，尝试“平滑”噪声图像是有用的，通常通过在样本之间构建新数据或对图像应用过滤器。在这项工作中，我们研究了训练神经网络以将固定视点场景的亮度隐式表示为连续函数的可行性。我们使用多层感知器网络实现神经网络，并在由离线 Monte Carlo 渲染器生成的稀疏采样图像上对其进行训练。该训练数据使用图像平面上每个样本的 (x, y) 坐标作为输入，并将样本的 RGB 颜色作为输出。此外，我们为网络提供第一条光线交点的表面法线、深度和反照率，作为像素坐标旁边的额外输入。这些额外的输入维度通过帮助网络考虑深度、法线和漫反射颜色的变化来提高隐式表示的质量。一旦网络在稀疏采样的场景上得到训练，我们就可以对每个像素的网络进行多次密集采样，以创建最终的去噪图像。我们发现该网络可以在具有柔和照明和光泽反射的场景中快速学习和去噪图像，并且只需少量训练即可轻松处理深度、正常和漫反射颜色的不连续性。\n  - [通过可微分渲染进行表面捕获的快速梯度下降, 3DV2022](https://hal.inria.fr/hal-03748662/) | [code]\n    > 差分渲染最近已成为一种强大的工具，用于从多个视图进行基于图像的渲染或几何重建，具有非常高的质量。到目前为止，此类方法已在通用对象数据库上进行了基准测试，并有望应用于一些真实数据，但尚未应用于可能受益的特定应用程序。在本文中，我们研究了如何为原始多相机性能捕获制作差分渲染系统。我们以实际可用性和可重复性的方式解决了几个关键问题，例如处理速度、模型的可解释性和一般输出模型质量。这导致我们对差分渲染框架做出了一些贡献。特别是，我们展示了差分渲染和经典优化的统一视图是可能的，从而导致可以分析计算完整的非随机梯度步骤并将完整的每帧数据存储在视频内存中的公式和实现，从而产生简单有效的实现.我们还使用稀疏存储和从粗到细的方案来实现极高的分辨率，同时包含内存和计算时间。我们通过实验表明，在质量上与最先进的多视图人体表面捕获方法相媲美的结果可以在很短的时间内实现，通常每帧大约一分钟。\n  - [PlaneFormers：从稀疏视图平面到 3D 重建, ECCV2022](https://arxiv.org/abs/2208.04307) | [code]\n    > 我们提出了一种从具有有限重叠的图像中对场景进行平面表面重建的方法。这种重建任务具有挑战性，因为它需要联合推理单图像 3D 重建、图像之间的对应关系以及图像之间的相对相机位姿。过去的工作提出了基于优化的方法。我们介绍了一种更简单的方法，PlaneFormer，它使用一个应用于 3D 感知平面令牌的转换器来执行 3D 推理。我们的实验表明，我们的方法比以前的工作要有效得多，并且几个特定于 3D 的设计决策对其成功至关重要。\n  - [PS-NeRV：视频的补丁风格化神经表示](https://arxiv.org/abs/2208.03742) | [code]\n    > 我们研究如何使用隐式神经表示 (INR) 来表示视频。经典的 INR 方法通常利用 MLP 将输入坐标映射到输出像素。虽然最近的一些作品试图用 CNN 
直接重建整个图像。然而，我们认为上述像素级和图像级策略都不利于视频数据。相反，我们提出了一种补丁解决方案 PS-NeRV，它将视频表示为补丁和相应补丁坐标的函数。它自然继承了image-wise方法的优点，并以快速的解码速度实现了出色的重建性能。整个方法包括传统的模块，如位置嵌入、MLPs 和 CNNs，同时还引入了 AdaIN 来增强中间特征。这些简单而重要的变化可以帮助网络轻松适应高频细节。大量实验证明了它在视频压缩和视频修复等视频相关任务中的有效性。\n## Jul31 - Aug6, 2022\n  - [NFOMP：具有非完整约束的差动驱动机器人最优运动规划器的神经场, IEEE Robotics and Automation Letters](https://ieeexplore.ieee.org/abstract/document/9851532/) | [code]\n    > 摘要：最优运动规划是移动机器人中最关键的问题之一。一方面，经典的基于采样的方法为这个问题提出了渐近最优的解决方案。然而，这些规划器无法在合理的计算时间内实现平滑和短的轨迹。另一方面，基于优化的方法能够在各种场景中生成平滑而平坦的轨迹，包括密集的人群。然而，现代基于优化的方法使用预先计算的有符号距离函数进行碰撞损失估计，它限制了这些方法在一般配置空间中的应用，包括具有非完整约束的差分驱动非圆形机器人。此外，基于优化的方法缺乏准确处理 U 形或薄障碍物的能力。我们建议从两个方面改进优化方法。首先，我们开发了一个障碍物神经场模型来估计碰撞损失；将此模型与轨迹优化一起训练可以持续改善碰撞损失，同时实现更可行和更平滑的轨迹。其次，我们通过将拉格朗日乘数添加到轨迹损失函数中来强制轨迹考虑非完整约束。我们应用我们的方法解决了具有非完整约束的差动驱动机器人的最优运动规划问题，对我们的解决方案进行了基准测试，并证明了新的规划器生成了非常适合机器人跟随的平滑、短而平坦的轨迹，并且优于最先进的方法在归一化曲率上提高了 25%，在 MovingAI 环境中的尖点数量上提高了 75%。\n  - [NeSF: 用于 3D 场景的可概括语义分割的神经语义场](https://research.google/pubs/pub51563/) | [code]\n    > 我们提出了 NeSF，一种从预训练的密度场和稀疏的 2D 语义监督产生 3D 语义场的方法。我们的方法通过利用将 3D 信息存储在神经域中的神经表示来避开传统的场景表示。尽管仅由 2D 信号监督，我们的方法能够从新颖的相机姿势生成 3D 一致的语义图，并且可以在任意 3D 点进行查询。值得注意的是，NeSF 与任何产生密度场的方法兼容，并且随着预训练密度场质量的提高，其准确性也会提高。我们的实证分析证明了在令人信服的合成场景上与竞争性 2D 和 3D 语义分割基线相当的质量，同时还提供了现有方法无法提供的功能。\n  - [PRIF: Primary Ray-based Implicit Function](https://research.google/pubs/pub51556/) | [code]\n    > 我们引入了一种新的隐式形状表示，称为基于初级光线的隐式函数 (PRIF)。与大多数基于符号距离函数 (SDF) 处理空间位置的现有方法相比，我们的表示在定向射线上运行。具体来说，PRIF 被制定为直接生成给定输入射线的表面命中点，而无需昂贵的球体跟踪操作，从而实现高效的形状提取和可微渲染。我们证明了经过训练以编码 PRIF 的神经网络在各种任务中取得了成功，包括单一形状表示、类别形状生成、稀疏或嘈杂观察的形状补全、相机姿态估计的逆渲染以及颜色的神经渲染。\n  - [Transformers as Meta-Learners for Implicit Neural Representations, ECCV2022](https://arxiv.org/abs/2208.02801) | [***``[code]``***](https://yinboc.github.io/trans-inr/)\n    > 近年来，隐式神经表示 (INR) 已经出现并显示出其优于离散表示的优势。然而，将 INR 拟合到给定的观测值通常需要从头开始使用梯度下降进行优化，这是低效的，并且不能很好地泛化稀疏的观测值。为了解决这个问题，大多数先前的工作都训练了一个超网络，该超网络生成单个向量来调制 INR 权重，其中单个向量成为限制输出 INR 重建精度的信息瓶颈。最近的工作表明，通过基于梯度的元学习，可以在没有单向量瓶颈的情况下精确推断 INR 
中的整个权重集。受基于梯度的元学习的广义公式的启发，我们提出了一个公式，该公式使用 Transformer 作为 INR 的超网络，它可以使用专门作为集合到集合映射的 Transformer 直接构建整个 INR 权重集。我们展示了我们的方法在不同任务和领域中构建 INR 的有效性，包括 2D 图像回归和 3D 对象的视图合成。我们的工作在 Transformer 超网络和基于梯度的元学习算法之间建立了联系，我们为理解生成的 INR 提供了进一步的分析。\n  - [全息显示3D相位全息图的端到端学习](https://www.nature.com/articles/s41377-022-00894-6) | [code]\n    > 计算机生成的全息术 (CGH) 提供相干波前的体积控制，是体积 3D 显示器、光刻、神经光刺激和光/声捕获等应用的基础。最近，基于深度学习的方法作为 CGH 合成的有前途的计算范式出现，克服了传统基于模拟/优化的方法中的质量-运行时权衡。然而，预测全息图的质量本质上受数据集质量的限制。在这里，我们介绍了一个新的全息图数据集 MIT-CGH-4K-V2，它使用分层深度图像作为数据高效的体积 3D 输入和用于直接合成高质量 3D 相位的两阶段监督+无监督训练协议-只有全息图。所提出的系统还可以校正视觉像差，从而允许为最终用户定制。我们通过实验展示了逼真的 3D 全息投影并讨论了相关的空间光调制器校准程序。我们的方法在消费级 GPU 上实时运行，在 iPhone 13 Pro 上以 5 FPS 运行，有望显着提高上述应用程序的性能。\n  - [VolTeMorph：体积表示的实时、可控和可泛化动画](https://arxiv.org/pdf/2208.00949) | [code]\n    > 最近，用于场景重建和新颖视图合成的体积表示越来越受欢迎，这使人们重新关注以高视觉质量和实时性对体积内容进行动画处理。虽然基于学习函数的隐式变形方法可以产生令人印象深刻的结果，但它们对于艺术家和内容创作者来说是“黑匣子”，它们需要大量的训练数据才能进行有意义的概括，而且它们不会在训练数据之外产生现实的外推。在这项工作中，我们通过引入一种实时、易于使用现成软件进行编辑并且可以令人信服地推断的体积变形方法来解决这些问题。为了展示我们方法的多功能性，我们将其应用于两个场景：基于物理的对象变形和远程呈现，其中化身使用混合形状进行控制。我们还进行了彻底的实验，表明我们的方法优于结合隐式变形的体积方法和基于网格变形的方法。\n  - [基于神经辐射场和运动图的可控自由视点视频重建, IEEE Transactions on Visualization and Computer Graphics](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > 在本文中，我们提出了一种基于运动图和神经辐射场（NeRF）的可控高质量自由视点视频生成方法。与现有的姿势驱动 NeRF 或时间/结构条件的 NeRF 工作不同，我们建议首先构建捕获序列的有向运动图。这种序列-运动-参数化策略不仅能够灵活地控制自由视点视频渲染的姿态，而且避免了相似姿态的冗余计算，从而提高了整体重建效率。此外，为了支持身体形状控制而不损失逼真的自由视点渲染性能，我们通过结合显式表面变形和隐式神经场景表示来改进 vanilla NeRF。具体来说，我们为运动图上的每个有效帧训练一个局部表面引导的 NeRF，并且体积渲染仅在真实表面周围的局部空间中执行，从而实现了合理的形状控制能力。据我们所知，我们的方法是第一个同时支持逼真的自由视点视频重建和基于运动图的用户引导运动遍历的方法。结果和比较进一步证明了所提出方法的有效性。\n  - [基于神经描述符字段的鲁棒变化检测, IROS2022](https://ieeexplore.ieee.org/abstract/document/9845414) | [code]\n    > 推理环境变化的能力对于长时间运行的机器人至关重要。代理应在操作期间捕获更改，以便可以遵循操作以确保工作会话的顺利进行。然而，不同的视角和累积的定位误差使得机器人很容易由于低观察重叠和漂移的对象关联而错误地检测到周围世界的变化。在本文中，基于最近提出的类别级神经描述符字段 (NDF)，我们开发了一种对象级在线变化检测方法，该方法对部分重叠的观察和嘈杂的定位结果具有鲁棒性。利用 NDF 的形状补全能力和 SE(3) 等效性，我们表示具有紧凑形状代码的对象，该代码编码来自部分观察的完整对象形状。然后基于从 NeRF 
恢复的对象中心将对象组织在空间树结构中，以便快速查询对象邻域。通过形状代码相似性关联对象并比较局部对象-邻居空间布局，我们提出的方法证明了对低观测重叠和定位噪声的鲁棒性。我们对合成序列和真实世界序列进行了实验，与多种基线方法相比，实现了改进的变化检测结果。\n## Jul24 - Jul30, 2022\n  - [DoF-NeRF：景深与神经辐射场相遇, ACMMM2022](https://arxiv.org/pdf/2208.00945) | [***``[code]``***](https://github.com/zijinwuzijin/DoF-NeRF)\n    > 神经辐射场 (NeRF) 及其变体在表示 3D 场景和合成逼真的新颖视图方面取得了巨大成功。但是，它们通常基于针孔相机模型并假设全焦点输入。这限制了它们的适用性，因为从现实世界捕获的图像通常具有有限的景深 (DoF)。为了缓解这个问题，我们引入了 DoF-NeRF，一种新颖的神经渲染方法，可以处理浅自由度输入并可以模拟自由度效果。特别是，它根据几何光学原理扩展了 NeRF 以模拟镜头的孔径。这样的物理保证允许 DoF-NeRF 操作具有不同焦点配置的视图。得益于显式光圈建模，DoF-NeRF 还可以通过调整虚拟光圈和焦点参数来直接操纵 DoF 效果。它是即插即用的，可以插入到基于 NeRF 的框架中。在合成数据集和真实世界数据集上的实验表明，DoF-NeRF 不仅在全焦点设置中的性能与 NeRF 相当，而且还可以合成以浅自由度输入为条件的全焦点新视图。还演示了 DoF-NeRF 在 DoF 渲染中的一个有趣应用。\n  - [ZEPI-Net：通过内部跨尺度对极平面图像零样本学习的光场超分辨率, Neural Processing Letters (2022)](https://link.springer.com/article/10.1007/s11063-022-10955-x) | [code]\n    > 光场 (LF) 成像的许多应用都受到空间角分辨率问题的限制，因此需要高效的超分辨率技术。最近，基于学习的解决方案比传统的超分辨率（SR）技术取得了显着更好的性能。不幸的是，学习或训练过程在很大程度上依赖于训练数据集，这对于大多数 LF 成像应用程序来说可能是有限的。在本文中，我们提出了一种基于零样本学习的新型 LF 空间角 SR 算法。我们建议在核平面图像 (EPI) 空间中学习跨尺度可重用特征，并避免显式建模场景先验或从大量 LF 中隐式学习。最重要的是，在不使用任何外部 LF 的情况下，所提出的算法可以同时在空间域和角域中超分辨 LF。此外，所提出的解决方案没有深度或视差估计，这通常由现有的 LF 空间和角度 SR 采用。通过使用一个简单的 8 层全卷积网络，我们表明所提出的算法可以产生与最先进的空间 SR 相当的结果。我们的算法在多组公共 LF 数据集上的角度 SR 方面优于现有方法。实验结果表明，跨尺度特征可以很好地学习并在 EPI 空间中用于 LF SR。\n  - [ObjectFusion：具有神经对象先验的准确对象级 SLAM, Graphical Models, Volume 123, September 2022](https://www.sciencedirect.com/science/article/pii/S1524070322000418) | [code]\n    > 以前的对象级同步定位和映射 (SLAM) 方法仍然无法以有效的方式创建高质量的面向对象的 3D 地图。主要挑战来自如何有效地表示对象形状以及如何将这种对象表示有效地应用于准确的在线相机跟踪。在本文中，我们提供 ObjectFusion 作为静态场景中的一种新颖的对象级 SLAM，它通过利用神经对象先验，有效地创建具有高质量对象重建的面向对象的 3D 地图。我们提出了一种仅具有单个编码器-解码器网络的神经对象表示，以有效地表达各种类别的对象形状，这有利于对象实例的高质量重建。更重要的是，我们建议将这种神经对象表示转换为精确测量，以共同优化对象形状、对象姿态和相机姿态，以实现最终准确的 3D 对象重建。通过对合成和真实世界 RGB-D 数据集的广泛评估，我们表明我们的 ObjectFusion 优于以前的方法，具有更好的对象重建质量，使用更少的内存占用，并且以更有效的方式，尤其是在对象级别。\n  - [MobileNeRF：利用多边形光栅化管道在移动架构上进行高效的神经场渲染](https://arxiv.org/abs/2208.00277) | 
[***``[code]``***](https://github.com/google-research/jax3d/tree/main/jax3d/projects/mobilenerf)\n    > 神经辐射场 (NeRFs) 展示了从新颖视图合成 3D 场景图像的惊人能力。但是，它们依赖于基于光线行进的专用体积渲染算法，这些算法与广泛部署的图形硬件的功能不匹配。本文介绍了一种基于纹理多边形的新 NeRF 表示，它可以使用标准渲染管道有效地合成新图像。 NeRF 表示为一组多边形，其纹理表示二进制不透明度和特征向量。使用 z 缓冲区对多边形进行传统渲染会生成每个像素都有特征的图像，这些图像由在片段着色器中运行的小型、依赖于视图的 MLP 进行解释，以产生最终的像素颜色。这种方法使 NeRF 能够使用传统的多边形光栅化管道进行渲染，该管道提供大规模的像素级并行性，在包括手机在内的各种计算平台上实现交互式帧速率。\n  - [神经密度-距离场, ECCV2022](https://arxiv.org/abs/2207.14455) | [***``[code]``***](https://ueda0319.github.io/neddf/)\n    > 神经领域在 3D 视觉任务中的成功现在是无可争辩的。遵循这一趋势，已经提出了几种针对视觉定位的方法（例如，SLAM）来使用神经场估计距离或密度场。然而，仅通过基于密度场的方法（例如神经辐射场 (NeRF)）很难实现高定位性能，因为它们在大多数空白区域中不提供密度梯度。另一方面，基于距离场的方法，例如神经隐式表面 (NeuS)，在对象的表面形状方面存在局限性。本文提出了神经密度-距离场 (NeDDF)，这是一种新的 3D 表示，它相互约束距离和密度场。我们将距离场公式扩展到没有明确边界表面的形状，例如毛皮或烟雾，这使得从距离场到密度场的显式转换成为可能。通过显式转换实现的一致距离和密度场既能保证初始值的鲁棒性，又能实现高质量的配准。此外，场之间的一致性允许从稀疏点云快速收敛。实验表明，NeDDF 可以实现高定位性能，同时在新颖的视图合成上提供与 NeRF 相当的结果。该代码可在此 https URL 获得。\n  - [通过 NeRF Attention 进行端到端视图合成](https://arxiv.org/abs/2207.14741) | [code]\n    > 在本文中，我们提出了一个用于视图合成的简单 seq2seq 公式，其中我们将一组光线点作为输入和输出与光线相对应的颜色。在这个 seq2seq 公式上直接应用标准转换器有两个限制。首先，标准注意力不能成功地适应体积渲染过程，因此合成视图中缺少高频分量。其次，将全局注意力应用于所有光线和像素是非常低效的。受神经辐射场 (NeRF) 的启发，我们提出了 NeRF 注意力 (NeRFA) 来解决上述问题。一方面，NeRFA 将体积渲染方程视为软特征调制过程。通过这种方式，特征调制增强了具有类似 NeRF 电感偏置的变压器。另一方面，NeRFA 执行多阶段注意力以减少计算开销。此外，NeRFA 模型采用光线和像素转换器来学习光线和像素之间的相互作用。 NeRFA 在四个数据集上展示了优于 NeRF 和 NerFormer 的性能：DeepVoxels、Blender、LLFF 和 CO3D。此外，NeRFA 在两种设置下建立了新的 state-of-the-art：单场景视图合成和以类别为中心的新颖视图合成。该代码将公开发布。\n  - [脱离网格：用于 3D 血管建模的连续隐式神经表示, MICCAI STACOM 2022](https://arxiv.org/abs/2207.14663) | [code]\n    > 个性化 3D 血管模型对于心血管疾病患者的诊断、预后和治疗计划非常有价值。传统上，此类模型是用网格和体素掩码等显式表示或径向基函数或原子（管状）形状等隐式表示构建的。在这里，我们建议在可微的隐式神经表示 (INR) 中通过其有符号距离函数 (SDF) 的零水平集来表示表面。这使我们能够用隐式、连续、轻量级且易于与深度学习算法集成的表示来对复杂的血管结构进行建模。我们在这里通过三个实际示例展示了这种方法的潜力。首先，我们从 CT 图像中获得了腹主动脉瘤 (AAA) 的准确且防水的表面，并从表面上的 200 个点显示出稳健的拟合。其次，我们同时将嵌套的血管壁安装在单个 INR 中，没有交叉点。第三，我们展示了如何将单个动脉的 3D 模型平滑地融合到单个防水表面中。我们的结果表明，INR 是一种灵活的表示形式，具有以最少的交互注释研究和操作复杂血管结构的潜力。\n  - 
[神经链：从多视图图像中学习头发的几何形状和外观, ECCV2022](https://arxiv.org/pdf/2207.14067) | [***``[code]``***](https://radualexandru.github.io/neural_strands/)\n    > 我们提出了 Neural Strands，这是一种新颖的学习框架，用于从多视图图像输入中对精确的头发几何形状和外观进行建模。学习的头发模型可以从具有高保真视图相关效果的任何视点实时渲染。与体积模型不同，我们的模型实现了直观的形状和样式控制。为了实现这些特性，我们提出了一种基于神经头皮纹理的新型头发表示，该神经头皮纹理对每个纹素位置的单个股线的几何形状和外观进行编码。此外，我们引入了一种基于学习发束光栅化的新型神经渲染框架。我们的神经渲染是精确的和抗锯齿的，使渲染视图一致且逼真。将外观与多视图几何先验相结合，我们首次实现了从多视图设置中联合学习外观和显式头发几何形状。我们展示了我们的方法在各种发型的保真度和效率方面的有效性。\n  - [具有全局照明的可重新照明的新视图合成的神经辐射转移场](https://arxiv.org/abs/2207.13607) | [code]\n    > 给定场景的一组图像，从新颖的视图和光照条件重新渲染该场景是计算机视觉和图形学中一个重要且具有挑战性的问题。一方面，计算机视觉中的大多数现有作品通常对图像形成过程施加许多假设，例如直接照明和预定义的材料，使场景参数估计易于处理。另一方面，成熟的计算机图形学工具允许在给定所有场景参数的情况下对复杂的照片般逼真的光传输进行建模。结合这些方法，我们提出了一种通过学习神经预计算辐射传递函数来在新视图下重新点亮场景的方法，该函数使用新的环境图隐式处理全局光照效果。我们的方法可以在单一未知照明条件下对一组场景的真实图像进行单独监督。为了在训练期间消除任务的歧义，我们在训练过程中紧密集成了一个可微的路径跟踪器，并提出了合成 OLAT 和真实图像损失的组合。结果表明，与当前技术水平相比，场景参数的恢复解缠结得到了显着改善，因此，我们的重新渲染结果也更加真实和准确。\n  - [ShAPO：多对象形状、外观和姿势优化的隐式表示, ECCV2022](https://arxiv.org/abs/2207.13691) | [***``[code]``***](https://zubair-irshad.github.io/projects/ShAPO.html)\n    > 我们的方法从单个 RGB-D 观察中研究以对象为中心的 3D 理解的复杂任务。由于这是一个不适定问题，现有方法在具有遮挡的复杂多对象场景中的 3D 形状和 6D 姿势和尺寸估计性能低下。我们提出了 ShAPO，一种用于联合多对象检测、3D 纹理重建、6D 对象姿态和大小估计的方法。 ShAPO 的关键是一个单次管道，用于回归形状、外观和姿势潜在代码以及每个对象实例的掩码，然后以稀疏到密集的方式进一步细化。首先学习了一种新的解开的先验形状和外观数据库，以将对象嵌入到它们各自的形状和外观空间中。我们还提出了一种新颖的、基于八叉树的可微优化步骤，使我们能够以综合分析的方式在学习的潜在空间下同时进一步改进对象形状、姿势和外观。我们新颖的联合隐式纹理对象表示使我们能够准确地识别和重建新的看不见的对象，而无需访问它们的 3D 网格。通过广泛的实验，我们证明了我们的方法在模拟室内场景上进行训练，能够以最少的微调准确地回归现实世界中新物体的形状、外观和姿势。我们的方法显着优于 NOCS 数据集上的所有基线，6D 姿态估计的 mAP 绝对提高了 8%。\n  - [GAUDI：沉浸式 3D 场景生成的神经架构师](https://arxiv.org/abs/2207.13751) | [***``[code]``***](https://github.com/apple/ml-gaudi)\n    > 我们介绍了 GAUDI，这是一种生成模型，能够捕捉复杂而逼真的 3D 场景的分布，可以从移动的相机中沉浸式地渲染。我们用一种可扩展但功能强大的方法来解决这个具有挑战性的问题，我们首先优化一个潜在的表示，以解开辐射场和相机姿势。然后使用这种潜在表示来学习生成模型，该模型可以无条件和有条件地生成 3D 场景.我们的模型通过消除相机姿态分布可以跨样本共享的假设来概括以前专注于单个对象的工作。我们展示了 GAUDI 在跨多个数据集的无条件生成设置中获得了最先进的性能，并允许在给定条件变量（如稀疏图像观察或描述场景的文本）的情况下有条件地生成 3D 场景。\n  - [AlignSDF：用于手对象重建的姿势对齐有符号距离场, 
ECCV2022](https://arxiv.org/abs/2207.12909) | [***``[code]``***](https://zerchen.github.io/projects/alignsdf.html)\n    > 最近的工作在从单目彩色图像联合重建手和操纵对象方面取得了令人瞩目的进展。现有方法侧重于参数网格或符号距离场 (SDF) 方面的两种替代表示。一方面，参数模型可以从先验知识中受益，但代价是有限的形状变形和网格分辨率。因此，网格模型可能无法精确重建细节，例如手和物体的接触面。另一方面，基于 SDF 的方法可以表示任意细节，但缺乏明确的先验。在这项工作中，我们的目标是使用参数表示提供的先验改进 SDF 模型。特别是，我们提出了一个联合学习框架，可以解开姿势和形状。我们从参数模型中获取手和物体的姿势，并使用它们在 3D 空间中对齐 SDF。我们表明，这种对齐的 SDF 更好地专注于重建形状细节并提高手和物体的重建精度。我们评估了我们的方法，并在具有挑战性的 ObMan 和 DexYCB 基准上展示了对现有技术的显着改进。\n  - [拉普拉斯系统的神经格林函数, Computer & Graphics](https://www.sciencedirect.com/science/article/pii/S0097849322001406) | [code]\n    > 求解源自拉普拉斯算子的线性方程组是广泛应用的核心。由于线性系统的稀疏性，当解具有大量自由度时，通常采用迭代求解器，例如共轭梯度和多重网格。这些迭代求解器可以看作是拉普拉斯算子格林函数的稀疏近似。在本文中，我们提出了一种机器学习方法，该方法从边界条件中回归格林函数。这是通过格林函数实现的，该函数可以以多尺度方式有效地表示，从而大大降低了与密集矩阵表示相关的成本。此外，由于格林函数完全依赖于边界条件，因此训练所提出的神经网络不需要对线性系统的右侧进行采样。结果表明，我们的方法优于最先进的共轭梯度和多重网格方法。\n  - [用笼子变形辐射场, ECCV2022](https://arxiv.org/abs/2207.12298) | [code]\n    > 辐射场的最新进展可以实现静态或动态 3D 场景的逼真渲染，但仍不支持用于场景操作或动画的显式变形。在本文中，我们提出了一种新的辐射场变形方法：自由形式的辐射场变形。我们使用一个三角形网格来包围称为笼子的前景对象作为界面，通过操纵笼子顶点，我们的方法可以实现辐射场的自由变形。我们方法的核心是网格变形中常用的基于笼的变形。我们提出了一种将其扩展到辐射场的新公式，该公式将采样点的位置和视图方向从变形空间映射到规范空间，从而实现变形场景的渲染。合成数据集和真实世界数据集的变形结果证明了我们方法的有效性。\n  - [NeuMesh：学习基于解缠结神经网格的隐式场，用于几何和纹理编辑, ECCV2022(oral)](https://arxiv.org/abs/2207.11911) | [code]\n    > 最近，神经隐式渲染技术得到了迅速发展，并在新颖的视图合成和 3D 场景重建中显示出巨大的优势。然而，现有的用于编辑目的的神经渲染方法提供的功能有限，例如，刚性变换，或者不适用于日常生活中一般对象的细粒度编辑。在本文中，我们提出了一种新颖的基于网格的表示，通过在网格顶点上使用解开几何和纹理代码对神经隐场进行编码，这促进了一组编辑功能，包括网格引导的几何编辑、带有纹理交换的指定纹理编辑、填充和绘画操作。为此，我们开发了几种技术包括可学习的符号指标以放大基于网格的表示的空间可区分性，蒸馏和微调机制以实现稳定收敛，以及空间感知优化策略以实现精确的纹理编辑。对真实数据和合成数据的大量实验和编辑示例证明了我们的方法在表示质量和编辑能力方面的优越性。代码可在项目网页上找到：此 https URL。\n  - [关于物理概念的可学习性：神经网络能理解什么是真](https://arxiv.org/abs/2207.12186) | [code]\n    > 鉴于深度神经网络生成逼真的合成数据的卓越能力，我们重新审视了经典的信号到符号障碍。 DeepFakes 
和欺骗突出了物理现实与其抽象表示之间联系的脆弱性，无论是由数字计算机还是生物代理学习。从一个广泛适用的抽象概念定义开始，我们表明标准的前馈架构只能捕获微不足道的概念，无论权重的数量和训练数据的数量如何，尽管它们是非常有效的分类器。另一方面，包含递归的架构可以代表更大的概念类别，但可能仍然无法从有限的数据集中学习它们。我们定性地描述了可以被用随机梯度下降变体训练的现代架构“理解”的概念类别，使用（自由能）拉格朗日来测量信息复杂性。然而，即使一个概念已经被理解，网络也无法将其理解传达给外部代理，除非通过持续的交互和验证。然后，我们将物理对象表征为抽象概念，并使用前面的分析来表明物理对象可以由有限架构编码。然而，为了理解物理概念，传感器必须提供持续令人兴奋的观察，而控制数据采集过程的能力是必不可少的（主动感知）。控制的重要性取决于形式，比听觉或化学感知更有益于视觉。最后，我们得出结论，可以在有限的时间内用有限的资源将物理实体绑定到数字身份，原则上解决了信号到符号的障碍问题，但我们强调了持续验证的必要性。\n## Previous weeks\n  - [﻿Plenoxels：没有神经网络的辐射场, CVPR2022(oral)](https://arxiv.org/abs/2112.05131) | [***``[code]``***](https://alexyu.net/plenoxels)\n    > 我们介绍了 Plenoxels（全光体素），一种用于照片级真实视图合成的系统。 Plenoxels 将场景表示为具有球谐函数的稀疏 3D 网格。这种表示可以通过梯度方法和正则化从校准图像中优化，而无需任何神经组件。在标准的基准任务中，Plenoxels 的优化速度比神经辐射场快两个数量级，而视觉质量没有损失。\n  - [城市辐射场, CVPR2022](https://urban-radiance-fields.github.io/) | [code]\n    > 这项工作的目标是从扫描平台捕获的数据中执行 3D 重建和新颖的视图合成，这些平台通常用于城市户外环境（例如街景）中的世界地图绘制。给定一系列由相机和扫描仪在户外场景中移动获得的 RGB 图像序列和激光雷达扫描，我们生成了一个模型，可以从中提取 3D 表面并合成新的 RGB 图像。我们的方法扩展了神经辐射场，该方法已被证明可以在受控环境中为小场景合成逼真的新颖图像，以及利用异步捕获的激光雷达数据、解决捕获图像之间的曝光变化以及利用预测的图像分割来监督密度的新方法在指向天空的光线上。这三个扩展中的每一个都在街景数据的实验中提供了显着的性能改进。与传统方法（例如~COLMAP）和最近的神经表示（例如~Mip-NeRF）相比，我们的系统产生最先进的 3D 表面重建并合成更高质量的新视图。\n  - [NeRF：将场景表示为用于视图合成的神经辐射场, ECCV2020](https://arxiv.org/abs/2003.08934) | [***``[code]``***](http://tancik.com/nerf)\n    > 我们提出了一种方法，该方法通过使用稀疏输入视图集优化底层连续体积场景函数，实现了合成复杂场景的新视图的最新结果。我们的算法使用全连接（非卷积）深度网络表示场景，其输入是单个连续 5D 坐标（空间位置（x,y,z）和观察方向（θ,φ）），其输出是该空间位置的体积密度和与视图相关的发射辐射。我们通过沿相机光线查询 5D 坐标来合成视图，并使用经典的体渲染技术将输出颜色和密度投影到图像中。因为体积渲染是自然可微的，所以优化我们的表示所需的唯一输入是一组具有已知相机姿势的图像。我们描述了如何有效地优化神经辐射场以渲染具有复杂几何形状和外观的场景的逼真的新颖视图，并展示了优于先前在神经渲染和视图合成方面的工作的结果。查看合成结果最好以视频形式观看，因此我们敦促读者观看我们的补充视频以进行令人信服的比较。\n  - [神经稀疏体素场, NeurIPS2020](https://lingjie0206.github.io/papers/NSVF/) | [***``[code]``***](https://github.com/facebookresearch/NSVF)\n    > 我们介绍了神经稀疏体素场 (NSVF)，这是一种用于快速和高质量自由视点渲染的新神经场景表示。 NSVF 定义了一组以稀疏体素八叉树组织的体素有界隐式字段，以对每个单元中的局部属性进行建模。 我们仅从一组姿势的 RGB 图像中通过可区分的光线行进操作逐步学习底层体素结构。 使用稀疏体素八叉树结构，可以通过跳过不包含相关场景内容的体素来加速渲染新颖的视图。 
我们的方法在推理时比最先进的方法（即 NeRF (Mildenhall et al., 2020)）快 10 倍以上，同时获得更高质量的结果。 此外，通过利用显式稀疏体素表示，我们的方法可以很容易地应用于场景编辑和场景合成。 我们还展示了几个具有挑战性的任务，包括多场景学习、移动人体的自由视点渲染和大规模场景渲染。\n  - [AutoInt：快速神经体积渲染的自动集成, CVPR2021](http://www.computationalimaging.org/publications/automatic-integration/) | [***``[code]``***](https://github.com/computational-imaging/automatic-integration)\n    > 数值积分是科学计算的基础技术，是许多计算机视觉应用的核心。在这些应用中，隐式神经体绘制最近被提出作为视图合成的新范式，实现逼真的图像质量。然而，使这些方法实用的一个基本障碍是在训练和推理期间沿渲染光线所需的体积积分导致的极端计算和内存要求。需要数百万条光线，每条光线都需要数百次通过神经网络的前向传播，才能通过蒙特卡罗采样来近似这些集成。在这里，我们提出了自动积分，这是一种使用隐式神经表示网络来学习有效的、封闭形式的积分解决方案的新框架。对于训练，我们实例化对应于隐式神经表示的导数的计算图。该图适合要积分的信号。优化后，我们重新组装图以获得代表反导数的网络。根据微积分的基本定理，这可以在网络的两次评估中计算任何定积分。使用这种方法，我们展示了超过 10 倍的计算要求改进，从而实现了快速的神经体绘制。\n  - [DeRF：分解的辐射场](https://arxiv.org/abs/2011.12490) | [code]\n    > 随着神经辐射场 (NeRF) 的出现，神经网络现在可以渲染 3D 场景的新颖视图，其质量足以愚弄人眼。然而，生成这些图像的计算量非常大，限制了它们在实际场景中的适用性。在本文中，我们提出了一种基于空间分解的技术，能够缓解这个问题。我们的主要观察结果是，使用更大（更深和/或更宽）的网络会带来收益递减。因此，我们建议对场景进行空间分解，并为每个分解部分分配更小的网络。当一起工作时，这些网络可以渲染整个场景。这使我们无论分解部分的数量如何，都能获得近乎恒定的推理时间。此外，我们表明，Voronoi 空间分解更适合此目的，因为它可证明与 Painter 算法兼容，可实现高效且 GPU 友好的渲染。我们的实验表明，对于现实世界的场景，我们的方法提供的推理效率比 NeRF 高出 3 倍（具有相同的渲染质量），或者 PSNR 提高了 1.0~dB（对于相同的推理成本）。\n  - [DONeRF：使用 Depth Oracle Networks 实现紧凑神经辐射场的实时渲染, CGF2021](https://depthoraclenerf.github.io/) | [***``[code]``***](https://github.com/facebookresearch/DONERF)\n    > 最近围绕神经辐射场 (NeRFs) 的研究爆炸表明，在神经网络中隐式存储场景和照明信息具有巨大的潜力，例如，用于生成新的视图。然而，阻止 NeRF 广泛使用的一个主要限制是沿每个视图射线进行过多网络评估的计算成本过高，当针对当前设备上的实时渲染时需要数十 petaFLOPS。我们表明，当将局部样本放置在场景中的表面周围时，可以显着减少每个视图光线所需的样本数量。为此，我们提出了一个深度预言网络，它通过单个网络评估来预测每个视图光线的光线样本位置。我们表明，使用围绕对数离散和球面扭曲深度值的分类网络对于编码表面位置而不是直接估计深度至关重要。这些技术的结合产生了 DONeRF，这是一种双网络设计，第一步是深度预言网络，以及用于光线累积的局部采样着色网络。通过我们的设计，与 NeRF 相比，我们将推理成本降低了 48 倍。使用现成的推理 API 与简单的计算内核相结合，我们率先在单个 GPU 上以交互式帧速率（每秒 15 帧，800x800）渲染基于光线追踪的神经表示。同时，由于我们专注于表面周围场景的重要部分，与 NeRF 相比，我们获得了相同或更好的质量。\n  - [FastNeRF：200FPS 的高保真神经渲染, ICCV2021](https://arxiv.org/abs/2103.10380) | [code]\n    > 最近关于神经辐射场 (NeRF) 的工作展示了如何使用神经网络对复杂的 3D 
环境进行编码，这些环境可以从新颖的视角进行逼真的渲染。渲染这些图像对计算的要求非常高，最近的改进距离实现交互速率还有很长的路要走，即使在高端硬件上也是如此。受移动和混合现实设备场景的启发，我们提出了 FastNeRF，这是第一个基于 NeRF 的系统，能够在高端消费 GPU 上以 200Hz 渲染高保真逼真图像。我们方法的核心是受图形启发的分解，它允许 (i) 在空间中的每个位置紧凑地缓存深度辐射图，(ii) 使用光线方向有效地查询该图以估计渲染图像中的像素值。大量实验表明，所提出的方法比原始的 NeRF 算法快 3000 倍，并且比现有的加速 NeRF 的工作至少快一个数量级，同时保持视觉质量和可扩展性。\n  - [KiloNeRF：使用数千个微型 MLP 加速神经辐射场, ICCV2021](https://arxiv.org/abs/2103.13744) | [***``[code]``***](https://github.com/creiser/kilonerf/)\n    > NeRF 通过将神经辐射场拟合到 RGB 图像，以前所未有的质量合成场景的新视图。然而，NeRF 需要数百万次查询深度多层感知器 (MLP)，导致渲染时间变慢，即使在现代 GPU 上也是如此。在本文中，我们证明了通过使用数千个微型 MLP 而不是一个大型 MLP，实时渲染是可能的。在我们的设置中，每个单独的 MLP 只需要表示场景的一部分，因此可以使用更小、更快评估的 MLP。通过将这种分而治之的策略与进一步的优化相结合，与原始 NeRF 模型相比，渲染速度提高了三个数量级，而不会产生高昂的存储成本。此外，使用师生蒸馏进行培训，我们表明可以在不牺牲视觉质量的情况下实现这种加速。\n  - [用于实时渲染神经辐射场的 PlenOctrees, ICCV2021(oral)](https://alexyu.net/plenoctrees/) | [***``[code]``***](https://github.com/sxyu/volrend)\n    > 实时性能是通过将 NeRF 预先制成基于八叉树的辐射场（我们称为 PlenOctrees）来实现的。为了保留与视图相关的效果，例如镜面反射，我们建议通过封闭形式的球面基函数对外观进行编码。具体来说，我们表明可以训练 NeRFs 来预测辐射的球谐表示，将观察方向作为神经网络的输入。此外，我们表明我们的 PlenOctrees 可以直接优化以进一步最小化重建损失，这导致与竞争方法相同或更好的质量。我们进一步表明，这个八叉树优化步骤可用于加快训练时间，因为我们不再需要等待 NeRF 训练完全收敛。我们的实时神经渲染方法可能会支持新的应用，例如 6 自由度工业和产品可视化，以及下一代 AR/VR 系统。\n  - [用于高效神经渲染的体积基元混合, SIGGRAPH2021](https://arxiv.org/abs/2103.01954) | [code]\n    > 人类的实时渲染和动画是游戏、电影和远程呈现应用中的核心功能。现有方法有许多我们的工作旨在解决的缺点。三角形网格难以建模像头发这样的细结构，像神经体积这样的体积表示在合理的内存预算下分辨率太低，而像神经辐射场这样的高分辨率隐式表示在实时应用中使用太慢。我们提出了体积基元混合（MVP），一种用于渲染动态 3D 内容的表示，它结合了体积表示的完整性和基于基元的渲染的效率，例如，基于点或基于网格的方法。我们的方法通过利用具有反卷积架构的空间共享计算以及通过使用可以移动以仅覆盖被占用区域的体积基元来最小化空间空白区域中的计算来实现这一点。我们的参数化支持对应和跟踪约束的集成，同时对经典跟踪失败的区域具有鲁棒性，例如薄或半透明结构周围以及具有大拓扑可变性的区域。 MVP 是一种混合体，它概括了基于体积和基元的表示。通过一系列广泛的实验，我们证明它继承了每种方法的优点，同时避免了它们的许多局限性。我们还将我们的方法与几种最先进的方法进行比较，并证明 MVP 在质量和运行时性能方面产生了卓越的结果。\n  - [光场网络：具有单次评估渲染的神经场景表示, NeurIPS2021(spotlight)](https://www.vincentsitzmann.com/lfns/) | [***``[code]``***](https://github.com/vsitzmann/light-field-networks)\n    > 从 2D 观察推断 3D 场景的表示是计算机图形学、计算机视觉和人工智能的基本问题。新兴的 3D 结构神经场景表示是一种有前途的 3D 
场景理解方法。在这项工作中，我们提出了一种新的神经场景表示，光场网络或 LFN，它通过神经隐式表示在 360 度、四维光场中表示底层 3D 场景的几何形状和外观。渲染来自 LFN 的光线只需要*单个*网络评估，而 3D 结构化神经场景表示中的光线行进或基于体积的渲染器每条光线需要数百次评估。在简单场景的设置中，我们利用元学习来学习 LFN 的先验，从而能够从单个图像观察中进行多视图一致的光场重建。这导致时间和内存复杂性的显着降低，并实现了实时渲染。通过 LFN 存储 360 度光场的成本比 Lumigraph 等传统方法低两个数量级。利用神经隐式表示的分析可微性和光空间的新参数化，我们进一步证明了从 LFN 中提取稀疏深度图。\n  - [深度监督的 NeRF：更少的视图和更快的免费训练, CVPR2022](https://arxiv.org/abs/2107.02791) | [***``[code]``***](https://github.com/dunbar12138/DSNeRF)\n    > 当输入视图数量不足时，通常观察到的神经辐射场 (NeRF) 故障模式会拟合不正确的几何形状。一个潜在的原因是标准体积渲染不会强制执行大多数场景几何体由空白空间和不透明表面组成的约束。我们通过 DS-NeRF（深度监督神经辐射场）将上述假设形式化，这是一种利用现成的深度监督学习辐射场的损失。我们利用当前的 NeRF 管道需要具有已知相机姿势的图像这一事实，这些图像通常通过运行从运动结构 (SFM) 来估计。至关重要的是，SFM 还产生稀疏 3D 点，可在训练期间用作“免费”深度监督：我们添加损失以鼓励光线的终止深度分布匹配给定的 3D 关键点，并结合深度不确定性。 DS-NeRF 可以在训练视图更少的情况下渲染更好的图像，同时训练速度提高 2-3 倍。此外，我们表明我们的损失与最近提出的其他 NeRF 方法兼容，证明深度是一种廉价且易于消化的监督信号。最后，我们发现 DS-NeRF 可以支持其他类型的深度监督，例如扫描深度传感器和 RGB-D 重建输出。\n  - [直接体素网格优化：辐射场重建的超快速收敛, CVPR2022(oral)](https://arxiv.org/abs/2111.11215) | [***``[code]``***](https://github.com/sunset1995/DirectVoxGO)\n    > 我们提出了一种超快速收敛方法，用于从一组捕获具有已知姿势的场景的图像中重建每个场景的辐射场。这项任务通常应用于新颖的视图合成，最近因其最先进的质量和灵活性而被神经辐射场 (NeRF) 彻底改变。然而，对于单个场景，NeRF 及其变体需要很长的训练时间，从数小时到数天不等。相比之下，我们的方法实现了与 NeRF 相当的质量，并在不到 15 分钟的时间内使用单个 GPU 从头开始​​快速收敛。我们采用由用于场景几何的密度体素网格和具有浅层网络的特征体素网格组成的表示，用于复杂的依赖于视图的外观。使用显式和离散化的体积表示进行建模并不新鲜，但我们提出了两种简单但非平凡的技术，有助于快速收敛和高质量输出。首先，我们介绍了体素密度的激活后插值，它能够以较低的网格分辨率产生锐利的表面。其次，直接体素密度优化容易出现次优几何解决方案，因此我们通过强加几个先验来加强优化过程。最后，对五个内向基准的评估表明，我们的方法与 NeRF 的质量相匹配，甚至超过，但从头开始训练新场景只需要大约 15 分钟。\n  - [野外的 NeRF：无约束照片集的神经辐射场, CVPR2021](https://arxiv.org/abs/2008.02268) | [code]\n    > 我们提出了一种基于学习的方法，用于仅使用野外照片的非结构化集合来合成复杂场景的新视图。我们建立在神经辐射场 (NeRF) 的基础上，它使用多层感知器的权重将场景的密度和颜色建模为 3D 坐标的函数。虽然 NeRF 在受控设置下捕获的静态对象的图像上效果很好，但它无法在不受控的图像中模拟许多普遍存在的真实世界现象，例如可变照明或瞬态遮挡物。我们为 NeRF 引入了一系列扩展来解决这些问题，从而能够从互联网上获取的非结构化图像集合中进行准确的重建。我们将我们的系统（称为 NeRF-W）应用于著名地标的互联网照片集，并展示时间一致的新颖视图渲染，这些渲染比现有技术更接近真实感。\n  - [Ha-NeRF：野外的幻觉神经辐射场, CVPR2022](https://rover-xingyu.github.io/Ha-NeRF/) | 
[***``[code]``***](https://github.com/rover-xingyu/Ha-NeRF)\n    > 神经辐射场 (NeRF) 最近因其令人印象深刻的新颖视图合成能力而广受欢迎。本文研究了幻觉 NeRF 的问题：即在一天中的不同时间从一组旅游图像中恢复一个真实的 NeRF。现有的解决方案采用具有可控外观嵌入的 NeRF 在各种条件下渲染新颖的视图，但它们无法渲染具有看不见的外观的视图一致图像。为了解决这个问题，我们提出了一个用于构建幻觉 NeRF 的端到端框架，称为 Ha-NeRF。具体来说，我们提出了一个外观幻觉模块来处理随时间变化的外观并将它们转移到新的视图中。考虑到旅游图像的复杂遮挡，我们引入了一个反遮挡模块来准确地分解静态主体以获得可见性。合成数据和真实旅游照片集的实验结果表明，我们的方法可以产生幻觉，并从不同的视图呈现无遮挡的图像。\n  - [Nerfies：可变形的神经辐射场, ICCV2021](https://arxiv.org/abs/2011.12948) | [code]\n    > 我们提出了第一种能够使用从手机随便捕获的照片/视频来逼真地重建可变形场景的方法。我们的方法通过优化一个额外的连续体积变形场来增强神经辐射场 (NeRF)，该场将每个观察点扭曲成一个规范的 5D NeRF。我们观察到这些类似 NeRF 的变形场容易出现局部最小值，并为基于坐标的模型提出了一种从粗到细的优化方法，可以实现更稳健的优化。通过将几何处理和物理模拟的原理应用于类似 NeRF 的模型，我们提出了变形场的弹性正则化，进一步提高了鲁棒性。我们表明，我们的方法可以将随意捕获的自拍照片/视频转换为可变形的 NeRF 模型，允许从任意视角对主体进行逼真的渲染，我们称之为“nerfies”。我们通过使用带有两部手机的装备收集时间同步数据来评估我们的方法，从而在不同视点产生相同姿势的训练/验证图像。我们表明，我们的方法忠实地重建了非刚性变形的场景，并以高保真度再现了看不见的视图。\n  - [D-NeRF：动态场景的神经辐射场, CVPR2021](https://arxiv.org/abs/2011.13961) | [***``[code]``***](https://github.com/albertpumarola/D-NeRF)\n    > 将机器学习与几何推理相结合的神经渲染技术已成为从一组稀疏图像中合成场景新视图的最有前途的方法之一。其中，神经辐射场 (NeRF) 尤为突出，它训练深度网络将 5D 输入坐标（表示空间位置和观察方向）映射为体积密度和与视图相关的发射辐射。然而，尽管在生成的图像上实现了前所未有的真实感水平，但 NeRF 仅适用于静态场景，其中可以从不同的图像中查询相同的空间位置。在本文中，我们介绍了 D-NeRF，这是一种将神经辐射场扩展到动态域的方法，允许在场景中移动的 \\emph{single} 相机的刚性和非刚性运动下重建和渲染物体的新图像。为此，我们将时间视为系统的附加输入，并将学习过程分为两个主要阶段：一个将场景编码为规范空间，另一个将这个规范表示映射到特定时间的变形场景。两种映射都是使用全连接网络同时学习的。一旦网络经过训练，D-NeRF 就可以渲染新颖的图像，同时控制相机视图和时间变量，从而控制对象的移动。我们展示了我们的方法在物体处​​于刚性、关节和非刚性运动的场景中的有效性。代码、模型权重和动态场景数据集将发布。\n  - [用于单目 4D 面部头像重建的动态神经辐射场, CVPR2021](https://gafniguy.github.io/4D-Facial-Avatars/) | [***``[code]``***](https://github.com/gafniguy/4D-Facial-Avatars)\n    > 我们提出了用于模拟人脸外观和动态的动态神经辐射场。对说话的人进行数字建模和重建是各种应用程序的关键组成部分。特别是对于 AR 或 VR 中的远程呈现应用，需要忠实再现外观，包括新颖的视点或头部姿势。与显式建模几何和材料属性或纯粹基于图像的最先进方法相比，我们引入了基于场景表示网络的头部隐式表示。为了处理面部的动态，我们将场景表示网络与低维可变形模型相结合，该模型提供对姿势和表情的显式控制。我们使用体积渲染从这种混合表示中生成图像，并证明这种动态神经场景表示只能从单目输入数据中学习，而不需要专门的捕获设置。在我们的实验中，我们表明这种学习的体积表示允许生成照片般逼真的图像，其质量超过了基于视频的最先进的重演方法的质量。\n  - [非刚性神经辐射场：单目视频变形场景的重建和新视图合成，, 
ICCV2021](https://vcai.mpi-inf.mpg.de/projects/nonrigid_nerf/) | [***``[code]``***](https://github.com/facebookresearch/nonrigid_nerf)\n    > 我们提出了非刚性神经辐射场 (NR-NeRF)，这是一种用于一般非刚性动态场景的重建和新颖的视图合成方法。我们的方法将动态场景的 RGB 图像作为输入（例如，来自单目视频记录），并创建高质量的时空几何和外观表示。我们表明，单个手持消费级相机足以从新颖的虚拟相机视图合成动态场景的复杂渲染，例如一个“子弹时间”的视频效果。 NR-NeRF 将动态场景分解为规范体积及其变形。场景变形被实现为光线弯曲，其中直线光线被非刚性变形。我们还提出了一种新的刚性网络来更好地约束场景的刚性区域，从而获得更稳定的结果。射线弯曲和刚性网络在没有明确监督的情况下进行训练。我们的公式可以实现跨视图和时间的密集对应估计，以及引人注目的视频编辑应用程序，例如运动夸张。我们的代码将是开源的。\n  - [PVA：像素对齐的体积化身, CVPR2021](https://volumetric-avatars.github.io/) | [code]\n    > 逼真的人头的采集和渲染是一个极具挑战性的研究问题，对于虚拟远程呈现特别重要。目前，最高质量是通过在多视图数据上以个人特定方式训练的体积方法实现的。与更简单的基于网格的模型相比，这些模型更好地表示精细结构，例如头发。体积模型通常使用全局代码来表示面部表情，以便它们可以由一小组动画参数驱动。虽然这样的架构实现了令人印象深刻的渲染质量，但它们不能轻易地扩展到多身份设置。在本文中，我们设计了一种新颖的方法，用于在仅给定少量输入的情况下预测人头的体积化身。我们通过一种新颖的参数化实现跨身份的泛化，该参数化将神经辐射场与直接从输入中提取的局部像素对齐特征相结合，从而避免了对非常深或复杂网络的需求。我们的方法仅基于光度重新渲染损失以端到端的方式进行训练，无需明确的 3D 监督。我们证明我们的方法在质量方面优于现有的现有技术，并且能够生成忠实的面部表情多身份设置。\n  - [神经关节辐射场, ICCV2021](https://arxiv.org/abs/2104.03110) | [***``[code]``***](https://github.com/nogu-atsu/NARF#code)\n    > 我们提出了神经关节辐射场 (NARF)，这是一种新颖的可变形 3D 表示，用于从图像中学习到的关节对象。虽然 3D 隐式表示的最新进展使得学习复杂对象的模型成为可能，但学习关节对象的姿势可控表示仍然是一个挑战，因为当前的方法需要 3D 形状监督并且无法呈现外观。在制定 3D 关节对象的隐式表示时，我们的方法在求解每个 3D 位置的辐射场时仅考虑最相关对象部分的刚性变换。通过这种方式，所提出的方法可以表示与姿势相关的变化，而不会显着增加计算复杂度。 NARF 是完全可微的，可以从带有姿势注释的图像中训练出来。此外，通过使用自动编码器，它可以学习对象类的多个实例的外观变化。实验表明，所提出的方法是有效的，并且可以很好地推广到新的姿势。\n  - [CLA-NeRF：类别级关节神经辐射场, ICRA2022](https://arxiv.org/abs/2202.00181) | [code]\n    > 我们提出了 CLA-NeRF——一种类别级的关节神经辐射场，可以执行视图合成、部分分割和关节姿态估计。 CLA-NeRF 在对象类别级别进行训练，不使用 CAD 模型和深度，而是使用一组具有地面实况相机姿势和部分片段的 RGB 图像。在推理过程中，只需对已知类别中未见过的 3D 对象实例进行少量 RGB 视图（即少镜头）即可推断对象部分分割和神经辐射场。给定一个关节姿态作为输入，CLA-NeRF 可以执行关节感知体积渲染，以在任何相机姿态下生成相应的 RGB 图像。此外，可以通过逆向渲染来估计对象的关节姿势。在我们的实验中，我们对合成数据和真实数据的五个类别的框架进行了评估。在所有情况下，我们的方法都显示了真实的变形结果和准确的关节姿态估计。我们相信，少量的关节对象渲染和关节姿势估计都为机器人感知和与看不见的关节对象交互打开了大门。\n  - [用于人体建模的动画神经辐射场, ICCV2021](https://zju3dv.github.io/animatable_nerf/) | [***``[code]``***](https://github.com/zju3dv/animatable_nerf)\n    > 
本文解决了从多视图视频中重建可动画人体模型的挑战。最近的一些工作提出将非刚性变形场景分解为规范神经辐射场和一组将观察空间点映射到规范空间的变形场，从而使他们能够从图像中学习动态场景。然而，它们将变形场表示为平移矢量场或 SE(3) 场，这使得优化受到高度约束。此外，这些表示不能由输入运动明确控制。相反，我们引入了神经混合权重场来产生变形场。基于骨架驱动的变形，混合权重场与 3D 人体骨骼一起使用，以生成观察到规范和规范到观察的对应关系。由于 3D 人体骨骼更易观察，它们可以规范变形场的学习。此外，学习到的混合权重场可以与输入的骨骼运动相结合，以生成新的变形场来为人体模型设置动画。实验表明，我们的方法明显优于最近的人类合成方法。该代码将在 https://zju3dv.github.io/animatable_nerf/ 上提供。\n  - [神经演员：具有姿势控制的人类演员的神经自由视图合成, SIGGRAPH Asia 2021](https://vcai.mpi-inf.mpg.de/projects/NeuralActor/) | [***``[code]``***](https://people.mpi-inf.mpg.de/~lliu/projects/NeuralActor/)\n    > 我们提出了神经演员 (NA)，这是一种从任意视角和任意可控姿势下高质量合成人类的新方法。我们的方法建立在最近的神经场景表示和渲染工作之上，这些工作仅从 2D 图像中学习几何和外观的表示。虽然现有作品展示了令人信服的静态场景渲染和动态场景回放，但使用神经隐式方法对人类进行逼真的重建和渲染，特别是在用户控制的新姿势下，仍然很困难。为了解决这个问题，我们利用粗体模型作为代理将周围的 3D 空间展开为规范姿势。神经辐射场从多视图视频输入中学习规范空间中与姿势相关的几何变形以及与姿势和视图相关的外观效果。为了合成高保真动态几何和外观的新视图，我们利用在身体模型上定义的 2D 纹理图作为潜在变量来预测残余变形和动态外观。实验表明，我们的方法在回放和新颖的姿势合成方面取得了比现有技术更好的质量，甚至可以很好地推广到与训练姿势截然不同的新姿势。此外，我们的方法还支持合成结果的体形控制。\n  - [用于动态场景时空视图合成的神经场景流场, CVPR2021](http://www.cs.cornell.edu/~zl548/NSFF/) | [***``[code]``***](https://github.com/zhengqili/Neural-Scene-Flow-Fields)\n    > 我们提出了一种方法来执行动态场景的新颖视图和时间合成，只需要具有已知相机姿势的单目视频作为输入。为此，我们引入了神经场景流场，这是一种将动态场景建模为外观、几何和 3D 场景运动的时变连续函数的新表示。我们的表示通过神经网络进行优化，以适应观察到的输入视图。我们表明，我们的表示可用于复杂的动态场景，包括薄结构、视图相关效果和自然运动度。我们进行了许多实验，证明我们的方法明显优于最近的单目视图合成方法，并展示了各种真实世界视频的时空视图合成的定性结果。\n  - [神经体：具有结构化潜在代码的隐式神经表示，用于动态人类的新视图合成, CVPR2021](https://zju3dv.github.io/neuralbody/) | [***``[code]``***](https://github.com/zju3dv/neuralbody)\n    > 本文解决了人类表演者从一组非常稀疏的摄像机视图中合成新颖视图的挑战。最近的一些工作表明，在给定密集输入视图的情况下，学习 3D 场景的隐式神经表示可以实现显着的视图合成质量。但是，如果视图高度稀疏，则表示学习将是不适定的。为了解决这个不适定问题，我们的关键思想是整合对视频帧的观察。为此，我们提出了神经体，这是一种新的人体表示，它假设在不同帧上学习到的神经表示共享同一组锚定到可变形网格的潜在代码，以便可以自然地整合跨帧的观察结果。可变形网格还为网络提供几何指导，以更有效地学习 3D 表示。为了评估我们的方法，我们创建了一个名为 ZJU-MoCap 的多视图数据集，用于捕捉具有复杂动作的表演者。 ZJU-MoCap 的实验表明，我们的方法在新颖的视图合成质量方面大大优于先前的工作。我们还展示了我们的方法从 People-Snapshot 数据集上的单目视频重建移动人物的能力。\n  - [来自多视图视频的神经 3D 视频合成, CVPR2022(oral)](https://neural-3d-video.github.io/) | [code]\n    > 我们提出了一种新颖的 3D 
视频合成方法，能够以紧凑但富有表现力的表示形式表示动态真实世界场景的多视图视频记录，从而实现高质量的视图合成和运动插值。我们的方法将静态神经辐射场的高质量和紧凑性带到了一个新的方向：无模型的动态设置。我们方法的核心是一种新颖的时间条件神经辐射场，它使用一组紧凑的潜在代码来表示场景动态。为了利用视频相邻帧之间的变化通常很小且局部一致的事实，我们提出了两种有效训练神经网络的新策略：1）有效的分层训练方案，以及 2）选择根据输入视频的时间变化进行训练的下一条光线。结合起来，这两种策略显着提高了训练速度，导致训练过程快速收敛，并获得高质量的结果。我们学习的表示非常紧凑，能够表示由 18 个摄像机录制的 10 秒 30 FPS 多视图视频，模型大小仅为 28MB。我们证明了我们的方法可以以超过 1K 的分辨率渲染高保真广角新颖视图，即使对于高度复杂和动态的场景也是如此。我们进行了广泛的定性和定量评估，表明我们的方法优于当前的技术水平。项目网站：https://neural-3d-video.github.io。\n  - [动态单目视频的动态视图合成, ICCV2021](https://free-view-video.github.io/) | [***``[code]``***](https://github.com/gaochen315/DynamicNeRF)\n    > 我们提出了一种算法，用于在给定动态场景的单目视频的任意视点和任何输入时间步长处生成新视图。我们的工作建立在神经隐式表示的最新进展的基础上，并使用连续和可微的函数来建模时变结构和场景的外观。我们联合训练一个时不变的静态 NeRF 和一个时变的动态 NeRF，并学习如何以无监督的方式混合结果。然而，从单个视频中学习这个隐式函数是非常不适定的（与输入视频匹配的解决方案有无限多）。为了解决歧义，我们引入了正则化损失以鼓励更合理的解决方案。我们展示了从随意捕获的视频中进行动态视图合成的广泛定量和定性结果。\n  - [GRAF：用于 3D 感知图像合成的生成辐射场, NeurIPS2020](https://avg.is.mpg.de/publications/schwarz2020NeurIPS) | [***``[code]``***](https://github.com/autonomousvision/graf)\n    > 虽然 2D 生成对抗网络已经实现了高分辨率图像合成，但它们在很大程度上缺乏对 3D 世界和图像形成过程的理解。因此，它们不提供对相机视点或物体姿势的精确控制。为了解决这个问题，最近的几种方法将基于中间体素的表示与可微渲染相结合。然而，现有方法要么产生低图像分辨率，要么在解开相机和场景属性方面存在不足，例如，对象身份可能随视点而变化。在本文中，我们提出了一种辐射场的生成模型，该模型最近被证明在单个场景的新颖视图合成方面是成功的。与基于体素的表示相比，辐射场并不局限于 3D 空间的粗略离散化，还允许解开相机和场景属性，同时在存在重建模糊性的情况下优雅地退化。通过引入基于多尺度补丁的鉴别器，我们展示了高分辨率图像的合成，同时仅从未定位的 2D 图像训练我们的模型。我们系统地分析了我们在几个具有挑战性的合成和现实世界数据集上的方法。我们的实验表明，辐射场是生成图像合成的强大表示，可生成以高保真度渲染的 3D 一致模型。\n  - [GRF：学习用于 3D 场景表示和渲染的一般辐射场, ICCV2021(oral)](https://arxiv.org/abs/2010.04595) | [***``[code]``***](https://github.com/alextrevithick/GRF)\n    > 我们提出了一个简单而强大的神经网络，它仅从 2D 观察中隐式表示和渲染 3D 对象和场景。该网络将 3D 几何建模为一般辐射场，它以一组具有相机位姿和内在函数的 2D 图像作为输入，为 3D 空间的每个点构建内部表示，然后渲染该点的相应外观和几何观察从任意位置。我们方法的关键是学习 2D 图像中每个像素的局部特征，然后将这些特征投影到 3D 点，从而产生一般和丰富的点表示。我们还集成了一种注意力机制来聚合来自多个 2D 视图的像素特征，从而隐式考虑视觉遮挡。大量实验表明，我们的方法可以为新物体、看不见的类别和具有挑战性的现实世界场景生成高质量和逼真的新视图。\n  - [pixelNeRF：来自一个或几个图像的神经辐射场, CVPR2021](https://arxiv.org/abs/2012.02190) | [***``[code]``***](https://github.com/sxyu/pixel-nerf)\n    
> 我们提出了 pixelNeRF，这是一种学习框架，可以预测以一个或几个输入图像为条件的连续神经场景表示。构建神经辐射场的现有方法涉及独立优化每个场景的表示，需要许多校准视图和大量计算时间。我们通过引入一种以完全卷积方式在图像输入上调节 NeRF 的架构，朝着解决这些缺点迈出了一步。这允许网络在多个场景中进行训练，以先学习一个场景，使其能够从一组稀疏的视图（少至一个）以前馈方式执行新颖的视图合成。利用 NeRF 的体积渲染方法，我们的模型可以直接从图像中训练，无需明确的 3D 监督。我们在 ShapeNet 基准上进行了广泛的实验，用于具有保留对象以及整个未见类别的单图像新颖视图合成任务。我们通过在多对象 ShapeNet 场景和来自 DTU 数据集的真实场景上展示 pixelNeRF 的灵活性，进一步展示了它的灵活性。在所有情况下，对于新颖的视图合成和单图像 3D 重建，pixelNeRF 都优于当前最先进的基线。有关视频和代码，请访问项目网站：此 https 网址\n  - [用于优化基于坐标的神经表示的学习初始化, CVPR2021](https://www.matthewtancik.com/learnit) | [***``[code]``***](https://github.com/tancik/learnit)\n    > 基于坐标的神经表示已显示出作为复杂低维信号的离散、基于数组的表示的替代方案的重要前景。然而，从每个新信号的随机初始化权重优化基于坐标的网络是低效的。我们建议应用标准的元学习算法来学习这些全连接网络的初始权重参数，这些参数基于所表示的底层信号类别（例如，面部图像或椅子的 3D 模型）。尽管只需要在实现中进行微小的更改，但使用这些学习到的初始权重可以在优化过程中实现更快的收敛，并且可以作为所建模信号类的强先验，从而在只有给定信号的部分观察可用时产生更好的泛化。我们在各种任务中探索这些好处，包括表示 2D 图像、重建 CT 扫描以及从 2D 图像观察中恢复 3D 形状和场景。\n  - [pi-GAN：用于 3D 感知图像合成的周期性隐式生成对抗网络, CVPR2021(oral)](https://marcoamonteiro.github.io/pi-GAN-website/) | [***``[code]``***](https://github.com/marcoamonteiro/pi-GAN)\n    > 我们见证了 3D 感知图像合成的快速进展，利用了生成视觉模型和神经渲染的最新进展。然而，现有方法在两个方面存在不足：首先，它们可能缺乏底层 3D 表示或依赖于视图不一致的渲染，因此合成的图像不是多视图一致的；其次，它们通常依赖于表达能力不足的表示网络架构，因此它们的结果缺乏图像质量。我们提出了一种新颖的生成模型，称为周期性隐式生成对抗网络（π-GAN 或 pi-GAN），用于高质量的 3D 感知图像合成。 π-GAN 利用具有周期性激活函数和体积渲染的神经表示将场景表示为具有精细细节的视图一致的 3D 表示。所提出的方法获得了具有多个真实和合成数据集的 3D 感知图像合成的最新结果。\n  - [单张图像的人像神经辐射场](https://portrait-nerf.github.io/) | [code]\n    > 我们提出了一种从单个爆头肖像估计神经辐射场 (NeRF) 的方法。虽然 NeRF 已经展示了高质量的视图合成，但它需要静态场景的多个图像，因此对于随意捕捉和移动主体是不切实际的。在这项工作中，我们建议使用使用灯光舞台肖像数据集的元学习框架来预训练多层感知器 (MLP) 的权重，该多层感知器隐含地对体积密度和颜色进行建模。为了提高对看不见的人脸的泛化能力，我们在由 3D 人脸可变形模型近似的规范坐标空间中训练 MLP。我们使用受控捕获对方法进行定量评估，并展示了对真实肖像图像的泛化性，显示出对最先进技术的有利结果。\n  - [ShaRF：单一视图的形状条件辐射场, ICML2021](https://arxiv.org/abs/2102.08860) | [***``[code]``***](https://github.com/tensorflow/graphics/tree/master/tensorflow_graphics/projects/radiance_fields)\n    > 
我们提出了一种方法，用于估计仅给定单个图像的对象的神经场景表示。我们方法的核心是估计物体的几何支架，并将其用作重建底层辐射场的指导。我们的公式基于一个生成过程，该过程首先将潜在代码映射到体素化形状，然后将其渲染为图像，对象外观由第二个潜在代码控制。在推理过程中，我们优化了潜在代码和网络以适应新对象的测试图像。形状和外观的明确解开允许我们的模型在给定单个图像的情况下进行微调。然后，我们可以以几何一致的方式渲染新视图，它们忠实地表示输入对象。此外，我们的方法能够推广到训练域之外的图像（更逼真的渲染甚至真实照片）。最后，推断的几何支架本身就是对物体 3D 形状的准确估计。我们在几个实验中证明了我们的方法在合成图像和真实图像中的有效性。\n  - [IBRNet：学习基于图像的多视图渲染, CVPR2021](https://arxiv.org/abs/2102.13090) | [***``[code]``***](https://github.com/googleinterns/IBRNet)\n    > 我们提出了一种通过插入一组稀疏的附近视图来合成复杂场景的新视图的方法。我们方法的核心是一个网络架构，其中包括一个多层感知器和一个光线转换器，用于估计连续 5D 位置（3D 空间位置和 2D 观察方向）的辐射和体积密度，从多个源视图动态绘制外观信息。通过在渲染时绘制源视图，我们的方法回归了基于图像的渲染 (IBR) 的经典工作，并允许我们渲染高分辨率图像。与优化每个场景函数以进行渲染的神经场景表示工作不同，我们学习了一种通用视图插值函数，该函数可以推广到新场景。我们使用经典的体渲染来渲染图像，这是完全可微的，并且允许我们仅使用多视图姿势图像作为监督进行训练。实验表明，我们的方法优于最近的新视图合成方法，这些方法也试图推广到新场景。此外，如果在每个场景上进行微调，我们的方法与最先进的单场景神经渲染方法具有竞争力。项目页面：此 https 网址\n  - [CAMPARI：相机感知分解生成神经辐射场](https://arxiv.org/pdf/2103.17269.pdf) | [code]\n    > 深度生成模型的巨大进步导致了逼真的图像合成。在取得令人信服的结果的同时，大多数方法都在二维图像域中运行，而忽略了我们世界的三维性质。因此，最近的几项工作提出了具有 3D 感知能力的生成模型，即场景以 3D 建模，然后可微分地渲染到图像平面。这导致了令人印象深刻的 3D 一致性，但纳入这种偏差是有代价的：相机也需要建模。当前的方法假定固定的内在函数和预先定义的相机姿势范围。因此，实际数据通常需要参数调整，如果数据分布不匹配，结果会下降。我们的关键假设是，与图像生成器一起学习相机生成器会导致更原则性的 3D 感知图像合成方法。此外，我们建议将场景分解为背景和前景模型，从而实现更有效和更清晰的场景表示。在从原始的、未定型的图像集合中进行训练时，我们学习了一个 3D 和相机感知的生成模型，它不仅忠实地恢复了图像，而且还忠实地恢复了相机数据分布。在测试时，我们的模型生成的图像可以显式控制相机以及场景的形状和外观。\n  - [NeRF-VAE：几何感知 3D 场景生成模型](https://arxiv.org/abs/2104.00587) | [code]\n    > 我们提出了 NeRF-VAE，这是一种 3D 场景生成模型，它通过 NeRF 和可微体渲染结合了几何结构。与 NeRF 相比，我们的模型考虑了跨场景的共享结构，并且能够使用摊销推理推断新场景的结构——无需重新训练。 NeRF-VAE 的显式 3D 渲染过程进一步将先前的生成模型与缺乏几何结构的基于卷积的渲染进行对比。我们的模型是一个 VAE，它通过在潜在场景表示上调节辐射场来学习辐射场的分布。我们表明，经过训练，NeRF-VAE 能够使用很少的输入图像从以前看不见的 3D 环境中推断和渲染几何一致的场景。我们进一步证明了 NeRF-VAE 可以很好地推广到分布式相机，而卷积模型则不能。最后，我们介绍并研究了 NeRF-VAE 解码器的一种基于注意力的调节机制，该机制提高了模型性能。\n  - [具有局部条件辐射场的无约束场景生成, ICCV2021](https://apple.github.io/ml-gsn/) | [***``[code]``***](https://github.com/apple/ml-gsn)\n    > 我们遵循对抗性学习框架，其中生成器通过其辐射场对场景进行建模，鉴别器尝试区分从这些辐射场渲染的图像和真实场景的图像。从概念上讲，我们的模型将场景的辐射场分解为许多小的局部辐射场，这些辐射场是由二维潜在代码 W 网格上的条件产生的。W 
可以解释为表示场景的潜在平面图。\n  - [MVSNeRF：从多视图立体快速概括辐射场重建, ICCV2021](https://apchenstu.github.io/mvsnerf/) | [***``[code]``***](https://github.com/apchenstu/mvsnerf)\n    > 我们提出了 MVSNeRF，一种新颖的神经渲染方法，可以有效地重建神经辐射场以进行视图合成。与先前的神经辐射场工作考虑对密集捕获的图像进行逐场景优化不同，我们提出了一个通用的深度神经网络，它可以通过快速网络推理仅从三个附近的输入视图重建辐射场。我们的方法利用平面扫描成本体积（广泛用于多视图立体）进行几何感知场景推理，并将其与基于物理的体积渲染相结合用于神经辐射场重建。我们在 DTU 数据集中的真实对象上训练我们的网络，并在三个不同的数据集上对其进行测试，以评估其有效性和普遍性。我们的方法可以跨场景（甚至是室内场景，与我们的对象训练场景完全不同）进行泛化，并仅使用三个输入图像生成逼真的视图合成结果，显着优于可泛化辐射场重建的并行工作。此外，如果捕捉到密集的图像，我们估计的辐射场表示可以很容易地进行微调；与 NeRF 相比，这导致具有更高渲染质量和更短优化时间的快速每场景重建。\n  - [立体辐射场 (SRF)：从新场景的稀疏视图中学习视图合成, CVPR2021](https://arxiv.org/abs/2104.06935) | [***``[code]``***](https://virtualhumans.mpi-inf.mpg.de/srf/)\n    > 最近的神经视图合成方法取得了令人印象深刻的质量和真实性，超越了依赖多视图重建的经典管道。最先进的方法，例如 NeRF，旨在使用神经网络学习单个场景，并且需要密集的多视图输入。在新场景上进行测试需要从头开始重新训练，这需要 2-3 天。在这项工作中，我们介绍了立体辐射场 (SRF)，这是一种端到端训练的神经视图合成方法，可以推广到新场景，并且在测试时只需要稀疏视图。核心思想是一种受经典多视图立体方法启发的神经架构，它通过在立体图像中找到相似的图像区域来估计表面点。在 SRF 中，我们预测每个 3D 点的颜色和密度，给定输入图像中立体对应的编码。编码是通过成对相似性的集合隐式学习的——模拟经典立体声。实验表明，SRF 在场景上学习结构而不是过度拟合。我们在 DTU 数据集的多个场景上进行训练，并在不重新训练的情况下推广到新场景，只需要 10 个稀疏和展开的视图作为输入。我们展示了 10-15 分钟的微调进一步改善了结果，与特定场景的模型相比，获得了更清晰、更详细的结果。代码、模型和视频可在此 https 网址上找到。\n  - [用于遮挡感知的基于图像的渲染的神经射线, CVPR2022](https://liuyuan-pal.github.io/NeuRay/) | [***``[code]``***](https://github.com/liuyuan-pal/NeuRay)\n    > 我们提出了一种新的神经表示，称为神经射线 (NeuRay)，用于新的视图合成任务。最近的工作从输入视图的图像特征构建辐射场以渲染新颖的视图图像，从而能够泛化到新场景。但是，由于遮挡，3D 点可能对某些输入视图不可见。在这样的 3D 点上，这些泛化方法将包括来自不可见视图的不一致图像特征，这会干扰辐射场的构建。为了解决这个问题，我们在 NeuRay 表示中预测 3D 点对输入视图的可见性。这种可见性使辐射场构建能够专注于可见图像特征，从而显着提高其渲染质量。同时，提出了一种新颖的一致性损失，以在对特定场景进行微调时改进 NeuRay 中的可见性。实验表明，我们的方法在推广到看不见的场景时在新颖的视图合成任务上实现了最先进的性能，并且在微调后优于每个场景的优化方法。\n  - [节食 NeRF：语义一致的 Few-Shot 视图合成, ICCV2021](https://www.ajayj.com/dietnerf) | [***``[code]``***](https://github.com/ajayjain/DietNeRF)\n    > 我们提出了 DietNeRF，一种从几张图像估计的 3D 神经场景表示。神经辐射场 (NeRF) 通过多视图一致性学习场景的连续体积表示，并且可以通过光线投射从新颖的视点进行渲染。虽然 NeRF 在给定许多图像的情况下具有令人印象深刻的重建几何和精细细节的能力，对于具有挑战性的 360° 场景最多可重建 100 
个，但当只有少数输入视图可用时，它通常会为其图像重建目标找到退化的解决方案。为了提高few-shot质量，我们提出了DietNeRF。我们引入了一种辅助语义一致性损失，它鼓励以新颖的姿势进行逼真的渲染。 DietNeRF 在单个场景上进行训练，以 (1) 从相同的姿势正确渲染给定的输入视图，以及 (2) 在不同的随机姿势中匹配高级语义属性。我们的语义损失使我们能够从任意姿势监督 DietNeRF。我们使用预训练的视觉编码器提取这些语义，例如 CLIP，这是一种视觉转换器，通过自然语言监督从网络挖掘出的数亿张不同的单视图 2D 照片进行训练。在实验中，DietNeRF 在从头开始学习时提高了少镜头视图合成的感知质量，在多视图数据集上进行预训练时，可以用少至一张观察到的图像渲染新视图，并生成完全未观察到的区域的合理完成。\n  - [使用 NeRF 实现新视图合成的连续深度 MPI, ICCV2021](https://arxiv.org/abs/2103.14910) | [***``[code]``***](https://github.com/vincentfung13/MINE)\n    > 在本文中，我们建议 MINE 通过从单个图像进行密集 3D 重建来执行新颖的视图合成和深度估计。我们的方法是通过引入神经辐射场 (NeRF) 对多平面图像 (MPI) 进行连续深度泛化。给定单个图像作为输入，MINE 预测任意深度值的 4 通道图像（RGB 和体积密度）以联合重建相机平截头体并填充被遮挡的内容。然后可以使用可微分渲染轻松地将重建和修复的截锥体渲染为新颖的 RGB 或深度视图。在 RealEstate10K、KITTI 和 Flowers Light Fields 上进行的大量实验表明，我们的 MINE 在新颖的视图合成中大大优于最先进的技术。我们还在 iBims-1 和 NYU-v2 的深度估计方面取得了具有竞争力的结果，而无需注释深度监督。我们的源代码可在此 https 网址获得\n  - [TöRF：动态场景视图合成的飞行时间辐射场, NeurIPS2021](https://imaging.cs.cmu.edu/torf/) | [***``[code]``***](https://github.com/breuckelen/torf)\n    > 神经网络可以表示和准确重建静态 3D 场景（例如 NeRF）的辐射场。一些作品将这些扩展到用单目视频捕获的动态场景，并具有可观的性能。然而，众所周知，单眼设置是一个约束不足的问题，因此方法依赖于数据驱动的先验来重建动态内容。我们用飞行时间 (ToF) 相机的测量值替换这些先验，并引入基于连续波 ToF 相机图像形成模型的神经表示。我们不使用处理过的深度图，而是对原始 ToF 传感器测量进行建模，以提高重建质量并避免低反射率区域、多路径干扰和传感器有限的明确深度范围等问题。我们展示了这种方法提高了动态场景重建对错误校准和大运动的鲁棒性，并讨论了集成现代智能手机上现在可用的 RGB+ToF 传感器的好处和局限性。\n  - [CodeNeRF：对象类别的解开神经辐射场, ICCV2021(oral)](https://www.google.com/url?q=https%3A%2F%2Farxiv.org%2Fpdf%2F2109.01750.pdf&sa=D&sntz=1&usg=AOvVaw1Fnir0e4aRa22Nt0HoXDWh) | [***``[code]``***](https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fwbjang%2Fcode-nerf&sa=D&sntz=1&usg=AOvVaw2eD5ZoRbk2aWFuwUSHlh5_)\n    > CodeNeRF 是一种隐式 3D 神经表示，它学习对象形状和纹理在一个类别中的变化，并且可以从一组姿势图像中进行训练，以合成看不见的对象的新视图。与特定场景的原始 NeRF 不同，CodeNeRF 通过学习单独的嵌入来学习解开形状和纹理。在测试时，给定一个看不见的物体的单个未定位图像，CodeNeRF 通过优化联合估计相机视点、形状和外观代码。看不见的物体可以从单个图像中重建，然后从新的视点渲染，或者通过改变潜在代码编辑它们的形状和纹理。我们在 SRN 基准上进行了实验，结果表明 CodeNeRF 可以很好地泛化到看不见的对象，并且在测试时需要已知相机姿态的方法达到同等性能。我们在真实世界图像上的结果表明，CodeNeRF 可以弥合模拟到真实的差距。\n  - [StyleNeRF：用于高分辨率图像合成的基于样式的 3D 感知生成器, 
ICLR2022](https://jiataogu.me/style_nerf/) | [***``[code]``***](https://github.com/facebookresearch/StyleNeRF)\n    > 我们提出了 StyleNeRF，这是一种 3D 感知生成模型，用于具有高多视图一致性的逼真的高分辨率图像合成，可以在非结构化 2D 图像上进行训练。现有方法要么无法合成具有精细细节的高分辨率图像，要么产生明显的 3D 不一致伪影。此外，他们中的许多人缺乏对风格属性和明确的 3D 相机姿势的控制。 StyleNeRF 将神经辐射场 (NeRF) 集成到基于样式的生成器中，以应对上述挑战，即提高渲染效率和 3D 一致性以生成高分辨率图像。我们执行体积渲染只是为了生成一个低分辨率的特征图，并在 2D 中逐步应用上采样来解决第一个问题。为了减轻 2D 上采样引起的不一致性，我们提出了多种设计，包括更好的上采样器和新的正则化损失。通过这些设计，StyleNeRF 可以以交互速率合成高分辨率图像，同时保持高质量的 3D 一致性。 StyleNeRF 还可以控制相机姿势和不同级别的样式，可以推广到看不见的视图。它还支持具有挑战性的任务，包括放大和缩小、样式混合、反转和语义编辑。\n  - [黑暗中的 NeRF：来自嘈杂原始图像的高动态范围视图合成, CVPR2022(oral)](https://bmild.github.io/rawnerf/) | [***``[code]``***](https://github.com/google-research/multinerf)\n    > 神经辐射场 (NeRF) 是一种从姿势输入图像的集合中合成高质量新颖视图的技术。与大多数视图合成方法一样，NeRF 使用色调映射低动态范围（LDR）作为输入；这些图像已由有损相机管道处理，该管道可以平滑细节、剪辑高光并扭曲原始传感器数据的简单噪声分布。我们将 NeRF 修改为直接在线性原始图像上进行训练，保留场景的完整动态范围。通过从生成的 NeRF 渲染原始输出图像，我们可以执行新颖的高动态范围 (HDR) 视图合成任务。除了改变相机视角之外，我们还可以在事后操纵焦点、曝光和色调映射。尽管单个原始图像看起来比后处理的图像噪声大得多，但我们表明 NeRF 对原始噪声的零均值分布具有高度鲁棒性。当针对许多嘈杂的原始输入 (25-200) 进行优化时，NeRF 生成的场景表示非常准确，以至于其渲染的新颖视图优于在相同宽基线输入图像上运行的专用单图像和多图像深度原始降噪器。因此，我们的方法（我们称为 RawNeRF）可以从在近黑暗中捕获的极其嘈杂的图像中重建场景。\n  - [iNeRF：用于姿势估计的反转神经辐射场, IROS2021](http://yenchenlin.me/inerf/) | [***``[code]``***](https://github.com/yenchenlin/iNeRF-public)\n    > 我们提出了 iNeRF，这是一个通过“反转”经过训练的神经辐射场 (NeRF) 来执行姿态估计的框架。 NeRF 已被证明对视图合成任务非常有效——合成真实世界场景或对象的逼真的新视图。在这项工作中，我们研究是否可以使用 NeRF 进行综合分析来进行 6DoF 姿势估计——给定图像，找到相机相对于 3D 模型的平移和旋转。从初始姿态估计开始，我们使用梯度下降来最小化从已经训练的 NeRF 渲染的像素和观察图像中的像素之间的残差。在我们的实验中，我们首先研究 1）如何在 iNeRF 的姿势细化过程中对光线进行采样以收集信息梯度，以及 2）不同批次大小的光线如何影响合成数据集上的 iNeRF。然后，我们展示了对于来自 LLFF 数据集的复杂现实世界场景，iNeRF 可以通过估计新图像的相机位姿并将这些图像用作 NeRF 的额外训练数据来改进 NeRF。最后，我们展示了 iNeRF 可以与基于特征的姿势初始化相结合。该方法优于所有其他依赖 LineMOD 上的合成数据的基于 RGB 的方法。\n  - [A-NeRF：通过神经渲染进行无表面人体 3D 姿势细化, NeurIPS2021](https://arxiv.org/abs/2102.06199) | [***``[code]``***](https://github.com/LemonATsu/A-NeRF)\n    > 虽然深度学习使用前馈网络重塑了经典的运动捕捉管道，但需要生成模型通过迭代细化来恢复精细对齐。不幸的是，现有模型通常是在受控条件下手工制作或学习的，仅适用于有限的领域。我们提出了一种通过扩展神经辐射场 (NeRFs) 
从未标记的单目视频中学习生成神经体模型的方法。我们为它们配备了骨架，以适用于时变和关节运动。一个关键的见解是，隐式模型需要与显式曲面模型中使用的正向运动学相反。我们的重新参数化定义了相对于身体部位姿势的空间潜在变量，从而克服了过度参数化的不适定逆运算。这使得从头开始学习体积身体形状和外观，同时共同改进关节姿势；输入视频上的所有外观、姿势或 3D 形状都没有地面实况标签。当用于新视图合成和动作捕捉时，我们的神经模型提高了不同数据集的准确性。项目网站：此 https 网址。\n  - [NeRF--：没有已知相机参数的神经辐射场](https://nerfmm.active.vision/) | [***``[code]``***](https://github.com/ActiveVisionLab/nerfmm)\n    > 考虑到仅来自一组 2D 图像的新视图合成 (NVS) 问题，我们通过消除已知或预先计算的相机参数的要求，简化了前向场景中神经辐射场 (NeRF) 的训练过程，包括内在函数和 6DoF 姿势。为此，我们提出了 NeRF−−，具有三个贡献：首先，我们表明相机参数可以通过光度重建作为可学习参数与 NeRF 训练联合优化；其次，为了对相机参数估计和新颖视图渲染的质量进行基准测试，我们引入了一个新的路径跟踪合成场景数据集，称为 Blender Forward-Facing Dataset (BLEFF)；第三，我们进行了广泛的分析以了解各种相机运动下的训练行为，并表明在大多数情况下，联合优化管道可以恢复准确的相机参数并实现与使用 COLMAP 预计算相机参数训练的方法相当的新视图合成质量。\n  - [实时隐式映射和定位, ICCV2021](https://arxiv.org/abs/2103.12352) | [code]\n    > 我们首次展示了多层感知器 (MLP) 可以作为手持 RGB-D 相机的实时 SLAM 系统中唯一的场景表示。我们的网络在没有先验数据的情况下进行实时操作训练，构建了一个密集的、特定于场景的隐式 3D 占用率和颜色模型，该模型也可立即用于跟踪。\n  - [用于 SLAM 的 NICE-SLAM 神经隐​​式可扩展编码, CVPR2022](https://arxiv.org/abs/2112.12130) | [***``[code]``***](https://github.com/cvg/nice-slam)\n    > 神经隐式表示最近在各个领域都显示出令人鼓舞的结果，包括在同时定位和映射 (SLAM) 方面取得的可喜进展。然而，现有方法会产生过度平滑的场景重建，并且难以扩展到大场景。这些限制主要是由于它们简单的全连接网络架构没有在观察中包含本地信息。在本文中，我们提出了 NICE-SLAM，这是一种密集的 SLAM 系统，它通过引入分层场景表示来结合多级局部信息。使用预先训练的几何先验优化这种表示可以在大型室内场景中进行详细的重建。与最近的神经隐式 SLAM 系统相比，我们的方法更具可扩展性、高效性和鲁棒性。在五个具有挑战性的数据集上的实验证明了 NICE-SLAM 在映射和跟踪质量方面的竞争结果。\n  - [GNeRF：基于 GAN 的无姿势相机的神经辐射场, ICCV2021(oral)](https://arxiv.org/abs/2103.15606) | [code]\n    > 我们介绍了 GNeRF，这是一个将生成对抗网络 (GAN) 与神经辐射场 (NeRF) 重建相结合的框架，用于具有未知甚至随机初始化相机姿势的复杂场景。最近基于 NeRF 的进展因显着的逼真的新视图合成而受到欢迎。然而，它们中的大多数严重依赖于准确的相机位姿估计，而最近的一些方法只能在相机轨迹相对较短的大致前向场景中优化未知相机位姿，并且需要粗略的相机位姿初始化。不同的是，我们的 GNeRF 仅将随机初始化的姿势用于复杂的由外而内的场景。我们提出了一种新颖的两阶段端到端框架。第一阶段将 GAN 的使用带入新领域，以联合优化粗略的相机姿势和辐射场，而第二阶段通过额外的光度损失对它们进行细化。我们使用混合迭代优化方案克服了局部最小值。对各种合成和自然场景的广泛实验证明了 GNeRF 的有效性。更令人印象深刻的是，我们的方法在那些以前被认为极具挑战性的重复模式甚至低纹理的场景中优于基线。\n  - [BARF：捆绑调整神经辐射场, ICCV2021(oral)](https://chenhsuanlin.bitbucket.io/bundle-adjusting-NeRF/) | 
[***``[code]``***](https://github.com/chenhsuanlin/bundle-adjusting-NeRF)\n    > 神经辐射场 (NeRF) 最近在计算机视觉界引起了极大的兴趣，因为它具有合成真实世界场景的逼真的新颖视图的能力。然而，NeRF 的一个限制是它需要准确的相机姿势来学习场景表示。在本文中，我们提出了 Bundle-Adjusting Neural Radiance Fields (BARF)，用于从不完美（甚至未知）的相机姿势训练 NeRF——学习神经 3D 表示和注册相机帧的联合问题。我们建立了与经典图像对齐的理论联系，并表明从粗到细的配准也适用于 NeRF。此外，我们表明，在 NeRF 中天真地应用位置编码会对基于合成的目标的注册产生负面影响。合成数据和真实世界数据的实验表明，BARF 可以有效地优化神经场景表示并同时解决大的相机位姿错位问题。这使得来自未知相机位姿的视频序列的视图合成和定位成为可能，为视觉定位系统（例如 SLAM）和密集 3D 映射和重建的潜在应用开辟了新途径。\n  - [自校准神经辐射场, ICCV2021](https://postech-cvlab.github.io/SCNeRF/) | [***``[code]``***](https://github.com/POSTECH-CVLab/SCNeRF)\n    > 在这项工作中，我们提出了一种用于具有任意非线性失真的通用相机的相机自校准算法。我们共同学习场景的几何形状和准确的相机参数，无需任何校准对象。我们的相机模型包括针孔模型、径向失真和可以学习任意非线性相机失真的通用噪声模型。虽然传统的自校准算法主要依赖于几何约束，但我们还结合了光度一致性。这需要学习场景的几何形状，我们使用神经辐射场 (NeRF)。我们还提出了一种新的几何损失函数，即投影射线距离损失，以结合复杂非线性相机模型的几何一致性。我们在标准真实图像数据集上验证了我们的方法，并证明我们的模型可以从头开始学习相机的内在和外在（姿势），而无需 COLMAP 初始化。此外，我们表明，以可微分的方式学习准确的相机模型可以让我们在 NeRF 上提高 PSNR。我们通过实验证明我们提出的方法适用于 NeRF 的变体。此外，我们使用一组用鱼眼镜头拍摄的图像来证明学习相机模型与 COLMAP 初始化相比，共同提高了性能。\n  - [NeRD：来自图像集合的神经反射分解, ICCV2021](https://markboss.me/publication/2021-nerd/#:~:text=NeRD%20is%20a%20novel%20method,can%20turn%20around%20the%20object.) 
| [***``[code]``***](https://github.com/cgtuebingen/NeRD-Neural-Reflectance-Decomposition)\n    > 将场景分解为其形状、反射率和照明度是计算机视觉和图形学中一个具有挑战性但重要的问题。当照明不是实验室条件下的单一光源而是不受约束的环境照明时，这个问题本质上更具挑战性。尽管最近的工作表明可以使用隐式表示来模拟物体的辐射场，但这些技术中的大多数只能实现视图合成而不是重新照明。此外，评估这些辐射场是资源和时间密集型的。我们提出了一种神经反射分解 (NeRD) 技术，该技术使用基于物理的渲染将场景分解为空间变化的 BRDF 材料属性。与现有技术相比，我们的输入图像可以在不同的照明条件下捕获。此外，我们还提出了将学习到的反射体积转换为可重新照明的纹理网格的技术，从而能够使用新颖的照明进行快速实时渲染。我们通过在合成数据集和真实数据集上的实验证明了所提出方法的潜力，我们能够从图像集合中获得高质量的可重新点亮的 3D 资产。\n  - [NeRV：用于重新照明和视图合成的神经反射率和可见性场, CVPR2021](https://pratulsrinivasan.github.io/nerv/) | [code]\n    > 我们提出了一种方法，该方法将由不受约束的已知照明照明的场景的一组图像作为输入，并生成可以在任意照明条件下从新视点渲染的 3D 表示作为输出。我们的方法将场景表示为参数化为 MLP 的连续体积函数，其输入是 3D 位置，其输出是该输入位置的以下场景属性：体积密度、表面法线、材料参数、到任何方向上第一个表面交点的距离，以及任何方向的外部环境的可见性。总之，这些允许我们在任意照明下渲染物体的新视图，包括间接照明效果。预测的能见度和表面相交场对于我们的模型在训练期间模拟直接和间接照明的能力至关重要，因为先前工作使用的蛮力技术对于具有单灯的受控设置之外的照明条件是难以处理的。我们的方法在恢复可重新照明的 3D 场景表示方面优于替代方法，并且在对先前工作构成重大挑战的复杂照明设置中表现良好。\n  - [NeX：具有神经基础扩展的实时视图合成, CVPR2021(oral)](https://nex-mpi.github.io/) | [***``[code]``***](https://github.com/nex-mpi/nex-code/)\n    > 我们提出了 NeX，这是一种基于多平面图像 (MPI) 增强的新型视图合成的新方法，可以实时再现 NeXt 级别的视图相关效果。与使用一组简单 RGBα 平面的传统 MPI 不同，我们的技术通过将每个像素参数化为从神经网络学习的基函数的线性组合来模拟视图相关的效果。此外，我们提出了一种混合隐式-显式建模策略，该策略改进了精细细节并产生了最先进的结果。我们的方法在基准前向数据集以及我们新引入的数据集上进行了评估，该数据集旨在测试与视图相关的建模的极限，具有明显更具挑战性的效果，例如 CD 上的彩虹反射。我们的方法在这些数据集的所有主要指标上都取得了最好的总体得分，渲染时间比现有技术快 1000 倍以上。\n  - [NeRFactor：未知光照下形状和反射率的神经分解, TOG 2021 (Proc. 
SIGGRAPH Asia)](https://xiuming.info/projects/nerfactor/) | [code]\n    > 我们解决了从由一种未知光照条件照射的物体的多视图图像（及其相机姿势）中恢复物体的形状和空间变化反射率的问题。这使得能够在任意环境照明下渲染对象的新颖视图并编辑对象的材质属性。我们方法的关键，我们称之为神经辐射分解（NeRFactor），是提取神经辐射场（NeRF）的体积几何[Mildenhall et al。 2020] 将对象表示为表面表示，然后在解决空间变化的反射率和环境照明的同时联合细化几何。具体来说，NeRFactor 在没有任何监督的情况下恢复表面法线、光能见度、反照率和双向反射分布函数 (BRDF) 的 3D 神经场，仅使用重新渲染损失、简单的平滑先验和从真实数据中学习的数据驱动的 BRDF 先验-世界BRDF测量。通过显式建模光可见性，NeRFactor 能够从反照率中分离出阴影，并在任意光照条件下合成逼真的软阴影或硬阴影。 NeRFactor 能够恢复令人信服的 3D 模型，用于在合成场景和真实场景的这种具有挑战性且约束不足的捕获设置中进行自由视点重新照明。定性和定量实验表明，NeRFactor 在各种任务中都优于经典和基于深度学习的最新技术。我们的视频、代码和数据可在 people.csail.mit.edu/xiuming/projects/nerfactor/ 上找到。\n  - [NeRF++：分析和改进神经辐射场](https://arxiv.org/abs/2010.07492) | [***``[code]``***](https://github.com/Kai-46/nerfplusplus;)\n    > 神经辐射场 (NeRF) 为各种捕捉设置实现了令人印象深刻的视图合成结果，包括有界场景的 360 度捕捉以及有界和无界场景的前向捕捉。 NeRF 将表示视图不变不透明度和视图相关颜色体积的多层感知器 (MLP) 拟合到一组训练图像，并基于体积渲染技术对新视图进行采样。在这份技术报告中，我们首先评论了辐射场及其潜在的模糊性，即形状-辐射模糊度，并分析了 NeRF 在避免这种模糊性方面的成功。其次，我们解决了将 NeRF 应用于大规模、无界 3D 场景中对象的 360 度捕获所涉及的参数化问题。我们的方法在这种具有挑战性的场景中提高了视图合成保真度。此 https 网址提供了代码。\n  - [GIRAFFE：将场景表示为合成生成神经特征场, CVPR2021(oral)](https://arxiv.org/abs/2011.12100) | [***``[code]``***](https://github.com/autonomousvision/giraffe)\n    > 深度生成模型允许以高分辨率进行逼真的图像合成。但对于许多应用程序来说，这还不够：内容创建还需要可控。虽然最近的几项工作研究了如何解开数据变化的潜在因素，但它们中的大多数都在 2D 中运行，因此忽略了我们的世界是 3D 的。此外，只有少数作品考虑场景的构图性质。我们的关键假设是，将合成 3D 场景表示合并到生成模型中会导致更可控的图像合成。将场景表示为合成生成神经特征场使我们能够从背景中解开一个或多个对象以及单个对象的形状和外观，同时从非结构化和未定型的图像集合中学习，而无需任何额外的监督。将这种场景表示与神经渲染管道相结合，可以生成快速且逼真的图像合成模型。正如我们的实验所证明的那样，我们的模型能够解开单个对象，并允许在场景中平移和旋转它们以及改变相机姿势。\n  - [以对象为中心的神经场景渲染](https://shellguo.com/osf/) | [***``[code]``***](https://shellguo.com/osf/)\n    > 我们提出了一种从捕获的对象图像中合成逼真场景的方法。我们的工作建立在神经辐射场 (NeRFs) 之上，它隐含地模拟了场景的体积密度和定向发射的辐射。虽然 NeRF 可以合成逼真的图片，但它们只对静态场景进行建模，并且与特定的成像条件密切相关。这个属性使得 NeRFs 难以泛化到新场景，包括新的光照或对象的新排列。我们建议学习以对象为中心的神经散射函数 (OSF)，而不是像 NeRF 
那样学习场景辐射场，这是一种使用与光照和视图相关的神经网络隐式模拟每个对象的光传输的表示。即使物体或灯光移动，这也可以渲染场景，而无需重新训练。结合体积路径跟踪程序，我们的框架能够渲染对象内和对象间的光传输效果，包括遮挡、镜面反射、阴影和间接照明。我们评估了我们的场景合成方法，并表明它可以推广到新的照明条件，产生逼真的、物理上精确的多对象场景渲染。\n  - [学习动态人头的组成辐射场, CVPR2021(oral)](https://ziyanw1.github.io/hybrid_nerf/) | [code]\n    > 动态人体的逼真渲染是远程呈现系统、虚拟购物、合成数据生成等的重要能力。最近，结合计算机图形学和机器学习技术的神经渲染方法已经创建了人类和物体的高保真模型。其中一些方法不会为可驱动的人体模型（神经体积）产生足够高保真度的结果，而其他方法则具有极长的渲染时间（NeRF）。我们提出了一种新颖的组合 3D 表示，它结合了以前最好的方法来产生更高分辨率和更快的结果。我们的表示通过将粗略的 3D 结构感知动画代码网格与连续学习的场景函数相结合，弥合了离散和连续体积表示之间的差距，该函数将每个位置及其相应的局部动画代码映射到其与视图相关的发射辐射和局部体积密度。可微分体渲染用于计算人头和上身的照片般逼真的新颖视图，并仅使用 2D 监督来端到端训练我们的新颖表示。此外，我们表明，学习到的动态辐射场可用于基于全局动画代码合成新的看不见的表情。我们的方法在合成动态人头和上半身的新视图方面取得了最先进的结果。\n  - [动态场景的神经场景图, CVPR2021(oral)](https://arxiv.org/abs/2011.10379) | [***``[code]``***](https://github.com/princeton-computational-imaging/neural-scene-graphs)\n    > 最近的隐式神经渲染方法表明，可以通过仅由一组 RGB 图像监督的预测其体积密度和颜色来学习复杂场景的准确视图合成。然而，现有方法仅限于学习将所有场景对象编码为单个神经网络的静态场景的有效表示，并且缺乏将动态场景表示和分解为单个场景对象的能力。在这项工作中，我们提出了第一个将动态场景分解为场景图的神经渲染方法。我们提出了一种学习的场景图表示，它对对象变换和辐射进行编码，以有效地渲染场景的新颖排列和视图。为此，我们学习隐式编码的场景，并结合联合学习的潜在表示来描述具有单个隐式函数的对象。我们在合成和真实汽车数据上评估所提出的方法，验证我们的方法学习动态场景 - 仅通过观察该场景的视频 - 并允许渲染具有看不见的对象集的新颖场景组合的新颖照片般逼真的视图看不见的姿势。\n  - [物体辐射场的无监督发现, ICLR2022](https://arxiv.org/abs/2107.07905) | [code]\n    > 我们研究从单个图像推断以对象为中心的场景表示的问题，旨在推导出解释图像形成过程的表示，捕捉场景的 3D 性质，并且在没有监督的情况下学习。由于将复杂的 3D 到 2D 图像形成过程集成到强大的推理方案（如深度网络）中存在根本性挑战，大多数现有的场景分解方法都缺乏这些特征中的一个或多个。在本文中，我们提出了对象辐射场 (uORF) 的无监督发现，将神经 3D 场景表示和渲染的最新进展与深度推理网络相结合，用于无监督 3D 场景分解。在没有注释的多视图 RGB 图像上进行训练，uORF 学习从单个图像分解具有不同纹理背景的复杂场景。我们展示了 uORF 在无监督 3D 场景分割、新视图合成和三个数据集上的场景编辑方面表现良好。\n  - [学习用于可编辑场景渲染的对象组合神经辐射场, ICCV2021](https://zju3dv.github.io/object_nerf/) | [***``[code]``***](https://github.com/zju3dv/object_nerf)\n    > 隐式神经渲染技术已经显示出用于新视图合成的有希望的结果。然而，现有方法通常将整个场景编码为一个整体，这通常不知道对象身份，并且限制了移动或添加家具等高级编辑任务的能力。在本文中，我们提出了一种新颖的神经场景渲染系统，该系统学习对象组成的神经辐射场，并为集群和真实世界场景生成具有编辑能力的逼真渲染。具体来说，我们设计了一种新颖的双路径架构，其中场景分支对场景几何和外观进行编码，对象分支根据可学习的对象激活码对每个独立对象进行编码。为了在严重混乱的场景中进行训练，我们提出了一种场景引导的训练策略来解决遮挡区域中的 3D 
空间模糊性并学习每个对象的清晰边界。大量实验表明，我们的系统不仅在静态场景新视图合成方面取得了有竞争力的性能，而且为对象级编辑产生了逼真的渲染。\n  - [使用隐式场景表示进行就地场景标记和理解, ICCV2021(oral)](https://shuaifengzhi.com/Semantic-NeRF/) | [***``[code]``***](https://github.com/Harry-Zhi/semantic_nerf/)\n    > 语义标签与几何和辐射重建高度相关，因为具有相似形状和外观的场景实体更有可能来自相似的类别。最近的隐式神经重建技术很有吸引力，因为它们不需要事先的训练数据，但同样的完全自我监督的方法对于语义来说是不可能的，因为标签是人类定义的属性。\n  - [编辑条件辐射场, ICCV2021](http://editnerf.csail.mit.edu/) | [***``[code]``***](https://github.com/stevliu/editnerf)\n    > 神经辐射场 (NeRF) 是支持高质量视图合成的场景模型，针对每个场景进行了优化。在本文中，我们探索启用用户编辑类别级 NeRF - 也称为条件辐射场 - 在形状类别上训练。具体来说，我们介绍了一种将粗略的 2D 用户涂鸦传播到 3D 空间的方法，以修改局部区域的颜色或形状。首先，我们提出了一个条件辐射场，它结合了新的模块化网络组件，包括一个跨对象实例共享的形状分支。观察同一类别的多个实例，我们的模型在没有任何监督的情况下学习底层部分语义，从而允许将粗略的 2D 用户涂鸦传播到整个 3D 区域（例如，椅子座位）。接下来，我们提出了一种针对特定网络组件的混合网络更新策略，该策略平衡了效率和准确性。在用户交互过程中，我们制定了一个既满足用户约束又保留原始对象结构的优化问题。我们在三个形状数据集上展示了我们在各种编辑任务上的方法，并表明它优于以前的神经编辑方法。最后，我们编辑真实照片的外观和形状，并显示编辑传播到外推的新视图。\n  - [使用分层神经表示的可编辑自由视点视频, SIGGRAPH2021](https://jiakai-zhang.github.io/st-nerf/) | [***``[code]``***](https://jiakai-zhang.github.io/st-nerf/#code)\n    > 生成自由视点视频对于沉浸式 VR/AR 体验至关重要，但最近的神经学进展仍然缺乏编辑能力来操纵大型动态场景的视觉感知。为了填补这一空白，在本文中，我们提出了第一种仅使用稀疏的 16 个摄像头为大规模动态场景生成可编辑照片般逼真的自由视点视频的方法。我们方法的核心是一种新的分层神经表示，其中包括环境本身的每个动态实体都被制定为称为 ST-NeRF 的时空相干神经分层辐射表示。这种分层表示支持对动态场景的完全感知和真实操作，同时仍支持大范围的自由观看体验。在我们的 ST-NeRF 中，动态实体/层被表示为连续函数，以连续和自监督的方式实现动态实体的位置、变形以及外观的解耦。我们提出了一个场景解析 4D 标签映射跟踪来显式地解开空间信息，以及一个连续变形模块来隐式地解开时间运动。进一步引入了一种对象感知体绘制方案，用于重新组装所有神经层。我们采用了一种新颖的分层损失和运动感知光线采样策略，以实现对具有多个表演者的大型动态场景的有效训练，我们的框架进一步实现了各种编辑功能，即操纵规模和位置，复制或重新定时单个神经层在保持高度真实感的同时创造众多视觉效果。大量实验证明了我们的方法在为动态场景生成高质量、照片般逼真和可编辑的自由视点视频方面的有效性。\n  - [Fig-NeRF：用于 3D 对象类别建模的图地面神经辐射场, 3DV2021](https://fig-nerf.github.io/) | [code]\n    > 我们研究使用神经辐射场 (NeRF) 从输入图像的集合中学习高质量的 3D 对象类别模型。与以前的工作相比，我们能够做到这一点，同时将前景对象与不同的背景分开。我们通过 2 分量 NeRF 模型 FiG-NeRF 实现了这一点，该模型更喜欢将场景解释为几何恒定的背景和代表对象类别的可变形前景。我们表明，这种方法可以仅使用光度监督和随意捕获的对象图像来学习准确的 3D 对象类别模型。此外，我们的两部分分解允许模型执行准确和清晰的模态分割。我们使用合成的、实验室捕获的和野外数据，通过视图合成和图像保真度指标对我们的方法进行定量评估。我们的结果证明了令人信服的 3D 对象类别建模，其性能超过了现有方法的性能。\n  - [NeRF-Tex：神经反射场纹理, 
EGSR2021](https://developer.nvidia.com/blog/nvidia-research-nerf-tex-neural-reflectance-field-textures/) | [***``[code]``***](https://github.com/hbaatz/nerf-tex)\n    > 我们研究使用神经场来模拟不同的中尺度结构，例如毛皮、织物和草。我们建议使用由神经反射场 (NeRF-Tex) 表示的多功能体积基元，而不是使用经典的图形基元来建模结构，它联合建模材料的几何形状及其对照明的响应。 NeRF-Tex 原语可以在基础网格上实例化，以使用所需的细观和微尺度外观对其进行“纹理化”。我们根据控制外观的用户定义参数来调节反射率场。因此，单个 NeRF 纹理捕获了反射场的整个空间，而不是一个特定的结构。这增加了可以建模的外观范围，并提供了一种解决重复纹理伪影的解决方案。我们还证明了 NeRF 纹理自然地促进了连续的细节层次渲染。我们的方法将神经网络的多功能性和建模能力与虚拟场景精确建模所需的艺术控制相结合。虽然我们所有的训练数据目前都是合成的，但我们的工作提供了一个方法，可以进一步扩展以从真实图像中提取复杂、难以建模的外观。\n  - [Mip-NeRF：抗锯齿神经辐射场的多尺度表示, ICCV2021(oral)](https://jonbarron.info/mipnerf/) | [***``[code]``***](https://github.com/google/mipnerf)\n    > 神经辐射场 (NeRF) 使用的渲染过程对每个像素单条射线进行采样，因此在训练或测试图像以不同分辨率观察场景内容时，可能会产生过度模糊或混叠的渲染。对于 NeRF 来说，通过每个像素渲染多条光线来进行超级采样的直接解决方案是不切实际的，因为渲染每条光线需要查询多层感知器数百次。我们的解决方案，我们称之为“mip-NeRF”（à la“mipmap”），扩展了 NeRF 以在连续值的尺度上表示场景。通过有效地渲染抗锯齿圆锥截头体而不是射线，mip-NeRF 减少了令人反感的锯齿伪影并显着提高了 NeRF 表示精细细节的能力，同时也比 NeRF 快 7% 和一半的大小。与 NeRF 相比，mip-NeRF 在使用 NeRF 呈现的数据集上将平均错误率降低了 17%，在我们呈现的该数据集的具有挑战性的多尺度变体上降低了 60%。 mip-NeRF 还能够在我们的多尺度数据集上匹配蛮力超采样 NeRF 的准确性，同时速度提高 22 倍。\n  - [UNISURF：统一神经隐式表面和辐射场以进行多视图重建, ICCV2021(oral)](https://arxiv.org/abs/2104.10078) | [***``[code]``***](https://github.com/autonomousvision/unisurf)\n    > 神经隐式 3D 表示已成为从多视图图像重建表面和合成新视图的强大范例。不幸的是，DVR 或 IDR 等现有方法需要精确的每像素对象掩码作为监督。同时，神经辐射场已经彻底改变了新的视图合成。然而，NeRF 的估计体积密度不允许精确的表面重建。我们的主要见解是隐式表面模型和辐射场可以以统一的方式制定，从而使用相同的模型实现表面和体积渲染。这种统一的视角实现了新颖、更有效的采样程序，并能够在没有输入掩码的情况下重建准确的表面。我们在 DTU、BlendedMVS 和合成室内数据集上比较我们的方法。我们的实验表明，我们在重建质量方面优于 NeRF，同时在不需要掩码的情况下与 IDR 相当。\n  - [NeuS：通过体渲染学习神经隐式表面以进行多视图重建, NeurIPS2021](https://arxiv.org/abs/2106.10689) | [***``[code]``***](https://github.com/Totoro97/NeuS)\n    > 我们提出了一种新的神经表面重建方法，称为 NeuS，用于从 2D 图像输入中重建具有高保真度的对象和场景。现有的神经表面重建方法，如 DVR 和 IDR，需要前景掩码作为监督，容易陷入局部最小值，因此难以重建具有严重自遮挡或薄结构的物体。同时，最近用于新视图合成的神经方法，例如 NeRF 及其变体，使用体积渲染来生成具有优化鲁棒性的神经场景表示，即使对于高度复杂的对象也是如此。然而，从这种学习到的隐式表示中提取高质量的表面是很困难的，因为表示中没有足够的表面约束。在 NeuS 中，我们建议将表面表示为有符号距离函数 (SDF) 的零级集，并开发一种新的体绘制方法来训练神经 
SDF 表示。我们观察到传统的体绘制方法会导致表面重建的固有几何误差（即偏差），因此提出了一种新的公式，该公式在一阶近似中没有偏差，从而即使没有掩模监督也能实现更准确的表面重建.在 DTU 数据集和 BlendedMVS 数据集上的实验表明，NeuS 在高质量表面重建方面优于最先进的技术，特别是对于具有复杂结构和自遮挡的物体和场景。\n  - [神经隐式表面的体积渲染, NeurIPS2021](https://arxiv.org/abs/2106.12052) | [code]\n    > 神经体绘制最近变得越来越流行，因为它成功地从一组稀疏的输入图像中合成了场景的新视图。到目前为止，通过神经体绘制技术学习的几何图形是使用通用密度函数建模的。此外，几何本身是使用密度函数的任意水平集提取的，这会导致嘈杂的、通常是低保真度的重建。本文的目标是改进神经体绘制中的几何表示和重建。我们通过将体积密度建模为几何形状的函数来实现这一点。这与之前将几何建模为体积密度函数的工作形成对比。更详细地说，我们将体积密度函数定义为应用于有符号距离函数 (SDF) 表示的拉普拉斯累积分布函数 (CDF)。这种简单的密度表示具有三个好处：（i）它为在神经体绘制过程中学习的几何图形提供了有用的归纳偏差； (ii) 它有助于限制不透明度近似误差，从而实现对视线的准确采样。准确的采样对于提供几何和辐射的精确耦合很重要； (iii) 它允许在体积渲染中对形状和外观进行有效的无监督解开。将这种新的密度表示应用于具有挑战性的场景多视图数据集产生了高质量的几何重建，优于相关的基线。此外，由于两者的分离，可以在场景之间切换形状和外观。\n  - [NerfingMVS：室内多视角立体神经辐射场的引导优化, ICCV2021(oral)](https://arxiv.org/abs/2109.01129) | [***``[code]``***](https://github.com/weiyithu/NerfingMVS)\n    > 在这项工作中，我们提出了一种新的多视图深度估计方法，该方法在最近提出的神经辐射场 (NeRF) 上利用了传统的 SfM 重建和基于学习的先验。与现有的依赖于估计对应的基于神经网络的优化方法不同，我们的方法直接优化隐式体积，消除了在室内场景中匹配像素的挑战性步骤。我们方法的关键是利用基于学习的先验来指导 NeRF 的优化过程。我们的系统首先通过微调其稀疏 SfM 重建来适应目标场景上的单目深度网络。然后，我们证明了 NeRF 的形状-辐射模糊性仍然存在于室内环境中，并建议通过采用适应的深度先验来监控体绘制的采样过程来解决这个问题。最后，通过对渲染图像进行误差计算获得的每像素置信度图可用于进一步提高深度质量。实验表明，我们提出的框架在室内场景中显着优于最先进的方法，在基于对应的优化和基于 NeRF 的优化对适应深度先验的有效性方面提出了令人惊讶的发现。此外，我们表明引导优化方案不会牺牲神经辐射场的原始合成能力，提高了可见视图和新视图的渲染质量。\n  - [用于视觉运动控制的 3D 神经场景表示, CoRL2021(oral)](https://3d-representation-learning.github.io/nerf-dy/) | [code]\n    > 人类对我们周围的 3D 环境有着强烈的直觉理解。我们大脑中的物理心智模型适用于不同材料的物体，使我们能够执行远远超出当前机器人范围的广泛操作任务。在这项工作中，我们希望纯粹从 2D 视觉观察中学习动态 3D 场景的模型。我们的模型结合了神经弧度\n  - [神经辐射世界中的仅视觉机器人导航](https://arxiv.org/abs/2110.00168) | [code]\n    > 神经辐射场 (NeRFs) 最近已成为表示自然、复杂 3D 场景的强大范例。 NeRF 表示神经网络中的连续体积密度和 RGB 值，并通过光线追踪从看不见的相机视点生成照片般逼真的图像。我们提出了一种算法，用于在表示为 NeRF 的 3D 环境中导航机器人，仅使用板载 RGB 相机进行定位。我们假设场景的 NeRF 已经离线预训练，机器人的目标是在 NeRF 中的未占用空间中导航以达到目标姿势。我们引入了一种轨迹优化算法，该算法基于离散时间版本的差分平坦度避免与 NeRF 中的高密度区域发生碰撞，该版本可以约束机器人的完整姿势和控制输入。我们还引入了一种基于优化的过滤方法来估计 NeRF 中机器人的 6DoF 姿势和速度，仅给定一个板载 RGB 相机。我们将轨迹规划器与位姿过滤器结合在一个在线重新规划循环中，以提供基于视觉的机器人导航管道。我们展示了一个四旋翼机器人仅使用 RGB 
相机在丛林健身房环境、教堂内部和巨石阵中导航的模拟结果。我们还演示了一个在教堂中导航的全向地面机器人，要求它重新定向以适应狭窄的缝隙。可以在此 https 网址上找到这项工作的视频。\n"
  },
  {
    "path": "eval_block_nerf.py",
    "content": "import os\nimport cv2\nimport pdb\nimport json\nimport torch\nimport numpy as np\nfrom tqdm import tqdm\nfrom collections import defaultdict\nfrom argparse import ArgumentParser\nfrom block_nerf.rendering import *\nfrom block_nerf.block_nerf_model import *\nfrom block_nerf.block_nerf_lightning import *\nfrom block_nerf.waymo_dataset import *\nimport imageio\n\n\ndef get_hparams():\n    parser = ArgumentParser()\n    parser.add_argument('--save_path', type=str,\n                        default='data/result_pytorch_waymo',\n                        help='result directory of dataset')\n\n    parser.add_argument('--root_dir', type=str,\n                        default='data/pytorch_waymo_dataset',\n                        help='root directory of dataset')\n\n    parser.add_argument('--ckpt_dir', type=str, default='data/ckpts',\n                        help='path to load the trianed block checkpoints (e.g., block_1.ckpt).'\n                        )\n\n    parser.add_argument('--IDW_Power', type=int, default=1,\n                        help='the value of the IDW power')\n\n    parser.add_argument('--chunk', type=int, default=1024 * 2,\n                        help='number of chunks')\n\n    parser.add_argument('--cam_idx', type=list, default=[0],\n                        help='the index of the camera you want to inference,0~11, total 12 cameras'\n                        )\n\n    return vars(parser.parse_args())\n\n\n@torch.no_grad()\ndef batched_inference(\n    model,\n    embeddings,\n    rays,\n    ts,\n    N_samples=128,\n    N_importance=128,\n    chunk=1024,\n    use_disp=False,\n    ):\n\n    B = rays.shape[0]\n    results = defaultdict(list)\n    for i in range(0, B, chunk):\n        result_chunk = render_rays(\n            model,\n            embeddings,\n            rays[i:i + chunk],\n            ts[i:i + chunk],\n            N_samples=N_samples,\n            N_importance=N_importance,\n            chunk=chunk,\n            type='test',\n        
    use_disp=use_disp,\n            )\n        for (k, v) in result_chunk.items():\n            results[k] += [v.cpu()]\n    for (k, v) in results.items():\n        results[k] = torch.cat(v, 0)\n\n    return results\n\n\ndef filter_cam_info_by_index(index, cam_infos):\n    for (i, cam_info) in enumerate(cam_infos):\n        if i == index:\n            print('Now is inferencing the {cam_info} camera..')\n            return cam_infos[cam_info]\n    return None\n\n\ndef filter_Block(begin, blocks):\n    block_filter = []\n    for block in blocks:\n        for element in blocks[block]['elements']:\n            if element[0] == begin:\n                block_filter.append(block)\n    return block_filter\n\n\ndef DistanceWeight(point, centroid, p=4):\n    point = point.numpy()\n    centroid = np.array(centroid)\n    return np.linalg.norm(point - centroid) ** -p\n\n\ndef Inverse_Interpolation(model_result, W_H):\n    weights = []\n\n    img_RGB = {}\n    img_DEPTH = {}\n    for block in model_result:\n        block_RGB = np.clip(model_result[block]['rgb_fine'].view(H, W,\n                            3).detach().numpy(), 0, 1)\n        block_RGB = (block_RGB * 255).astype(np.uint8)\n        img_RGB[block] = block_RGB\n\n        block_depth = model_result[block]['depth_fine'].view(H,\n                W).numpy()\n        block_depth = np.nan_to_num(block_depth)  # change nan to 0\n        mi = np.min(block_depth)  # get minimum depth\n        ma = np.max(block_depth)\n        block_depth = (block_depth - mi) / max(ma - mi, 1e-8)  # normalize to 0~1\n        block_depth = (255 * block_depth).astype(np.uint8)\n        img_DEPTH[block] = block_depth\n\n        weights.append(model_result[block]['distance_weight'])\n\n    weights = [weight / sum(weights) for weight in weights]\n    print('The weight of each block is:', weights)\n    img_pred = sum(weight * rgb for (weight, rgb) in zip(weights,\n                   img_RGB.values())).astype(np.uint8)\n    img_depth = sum(weight * 
depth for (weight, depth) in zip(weights,\n                    img_DEPTH.values())).astype(np.uint8)\n\n    img_RGB['compose'] = img_pred\n    img_DEPTH['compose'] = img_depth\n\n    return (img_RGB, img_DEPTH)\n\n\nif __name__ == '__main__':\n    print(\"Warning, this old implementation of BlockNeRF will be deprecated in the next version!\")\n    torch.cuda.empty_cache()\n    hparams = get_hparams()\n    os.makedirs(hparams['save_path'], exist_ok=True)\n\n    block_split_info = None\n    with open(os.path.join(hparams['root_dir'], 'train',\n              'split_block_train.json'), 'r') as fp:\n        block_split_info = json.load(fp)\n\n    centroids = []\n    for block in block_split_info:\n        centroids.append(block_split_info[block]['centroid'])\n\n    block_model = ['block_1', 'block_2']  # only render these models\n\n    # block_model = [\"block_6\", \"block_7\"]\n\n    with open(os.path.join(hparams['root_dir'], 'cam_info.json'), 'r'\n              ) as fp:\n        cam_infos = json.load(fp)\n\n    (rgb_video_writer, depth_video_writer) = (None, None)\n\n    for cam_idx in hparams['cam_idx']:\n        print('Now is inferencing the {cam_idx} camera!')\n        cam_infos = filter_cam_info_by_index(cam_idx, cam_infos)\n        cam_info_begin = cam_infos[:-1]\n        cam_info_end = cam_infos[1:]\n        os.makedirs(os.path.join(hparams['save_path'], str(cam_idx)),\n                    exist_ok=True)\n        rgb_save_p = os.path.join(hparams['save_path'], str(cam_idx),\n                                  'rgb_images')\n        depth_save_p = os.path.join(hparams['save_path'], str(cam_idx),\n                                    'depth_images')\n        os.makedirs(rgb_save_p, exist_ok=True)\n        os.makedirs(depth_save_p, exist_ok=True)\n\n        # imgs = []\n        # imgs_depth = []\n\n        for i in tqdm(range(len(cam_info_begin))):\n            (begin, end) = (cam_info_begin[i], cam_info_end[i])\n            dataset = 
WaymoDataset(root_dir=hparams['root_dir'],\n                                   split='compose', cam_begin=begin,\n                                   cam_end=end)\n            for j in tqdm(range(len(dataset))):\n                batch = dataset[j]\n                (rays, ts) = (batch['rays'], batch['ts'])\n                (W, H) = batch['w_h']\n                origin = rays[0, 0:3]\n                blocks = filter_Block(begin, block_split_info)\n                print('The current view belongs to the block of {blocks}.')\n                model_result = {}\n                for block in blocks:\n                    if block in block_model:\n                        ts[:] = \\\n                            find_idx_name(block_split_info[block]['elements'\n                                ], begin)\n                        print('Loading model ...')\n                        model = \\\n                            Block_NeRF_System.load_from_checkpoint(os.path.join(hparams['ckpt_dir'\n                                ], str(block) + '.ckpt')).cuda().eval()\n                        models = {'block_model': model.Block_NeRF,\n                                  'visibility_model': model.Visibility}\n                        print(\"Model loaded. 
Now is inferring the {0}'s model.\".format(block))\n                        results = batched_inference(\n                            models,\n                            model.Embedding,\n                            rays.cuda(),\n                            ts.cuda(),\n                            use_disp=model.hparams['use_disp'],\n                            N_samples=model.hparams['N_samples'] * 2,\n                            N_importance=model.hparams['N_importance']\n                                * 2,\n                            chunk=hparams['chunk'],\n                            )\n                        print(\"Finished inferring the {0}'s model.\".format(block))\n                        if results['transmittance_fine_vis'].mean() \\\n                            > 0.05:\n                            results['distance_weight'] = \\\n                                DistanceWeight(point=origin,\n                                    centroid=block_split_info[block]['centroid'\n                                    ][1], p=hparams['IDW_Power'])\n                            model_result[block] = results\n                if not len(model_result):\n                    continue\n                (RGB_compose, Depth_compose) = \\\n                    Inverse_Interpolation(model_result, [W, H])\n                if rgb_video_writer is None:\n                    rgb_video_path = os.path.join(hparams['save_path'],\n                            str(cam_idx), 'rgb_video.mp4')\n                    depth_video_path = os.path.join(hparams['save_path'\n                            ], str(cam_idx), 'depth_video.mp4')\n                    (height, width) = RGB_compose['compose'].shape[:2]\n                    rgb_video_writer = cv2.VideoWriter(rgb_video_path,\n                            cv2.VideoWriter_fourcc(*'mp4v'), 15,\n                            (width, height))\n                    depth_video_writer = \\\n                        cv2.VideoWriter(depth_video_path,\n        
                    cv2.VideoWriter_fourcc(*'mp4v'), 15,\n                            (width, height))\n\n                # imgs.append(RGB_compose['compose'])\n                # imgs_depth.append(Depth_compose['compose'])\n\n                rgb_video_writer.write(RGB_compose['compose'])\n                depth_video_writer.write(Depth_compose['compose'])\n\n                # save each rendered image\n\n                for (RGB, Depth) in zip(RGB_compose, Depth_compose):\n                    imageio.imwrite(os.path.join(rgb_save_p,\n                                    '{0}_{1}_{2}_{3}.png'.format(i,\n                                    begin, end, RGB)), RGB_compose[RGB])\n                    imageio.imwrite(os.path.join(depth_save_p,\n                                    '{0}_{1}_{2}_{3}_depth.png'.format(i,\n                                    begin, end, Depth)),\n                                    Depth_compose[Depth])\n        if rgb_video_writer is not None:\n            rgb_video_writer.release()\n            depth_video_writer.release()\n    print('All done.')\n"
  },
  {
    "path": "requirements.txt",
    "content": "numpy\nscipy\ntqdm\nlpips\npandas\nmmcv\nimageio\nimageio-ffmpeg\nopencv-python\ntorch_efficient_distloss\nscikit-image\nninja\neinops\nparscript\nopen3d # for visualization\nmatplotlib \nconfigargparse\nmdutils\nopenpyxl\nkornia\ntorch_optimizer\ntest-tube\ngdown\npytorch_lightning\ntorch_scatter\nopenmim"
  },
  {
    "path": "run_FourierGrid.py",
    "content": "import os, sys, copy, glob, json, time, random, argparse\nimport mmengine\nimport numpy as np\nimport pdb\nimport torch\nfrom FourierGrid.load_everything import load_everything\nfrom FourierGrid.run_export_bbox import *\nfrom FourierGrid.run_export_coarse import run_export_coarse\nfrom FourierGrid.run_train import run_train\nfrom FourierGrid.run_render import run_render\nfrom FourierGrid.run_gen_cam_paths import run_gen_cam_paths\nfrom FourierGrid.FourierGrid_ckpt_manager import FourierGridCheckpointManager\n\n\ndef config_parser():\n    '''Define command line arguments\n    '''\n    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)\n    parser.add_argument('--program', required=True, type=str, \n                        help='choose one program to run', choices=['export_bbox', 'export_coarse', \n                                                                   'render', 'train', 'gen_trace', 'sfm', 'tune_pose']\n                        )\n    parser.add_argument('--exp_id', required=True, type=str, \n                        help='append exp_id to exp names', default=\"\"\n                        )\n    parser.add_argument('--config', required=True,\n                        help='config file path')\n    parser.add_argument(\"--seed\", type=int, default=777,\n                        help='Random seed')\n    parser.add_argument(\"--sample_num\", type=int, default=-1,\n                        help='Sample number of data points in the dataset, used for debugging.')\n    parser.add_argument(\"--num_per_block\", type=int, default=-1,\n                        help='Number of images per block. 
Set to -1 to forbid block training.')\n    parser.add_argument(\"--no_reload\", action='store_true',\n                        help='do not reload weights from saved ckpt')\n    parser.add_argument(\"--no_reload_optimizer\", action='store_true',\n                        help='do not reload optimizer state from saved ckpt')\n    parser.add_argument(\"--ft_path\", type=str, default='',\n                        help='specific weights npy file to reload for coarse network')\n    parser.add_argument(\"--export_bbox_and_cams_only\", type=str, default='',\n                        help='export scene bbox and camera poses for debugging and 3d visualization')\n    parser.add_argument(\"--export_coarse_only\", type=str, default='')\n\n    # render and eval options\n    parser.add_argument(\"--render_only\", action='store_true',\n                        help='do not optimize, reload weights and render out render_poses path')\n    parser.add_argument(\"--render_test\", action='store_true')\n    parser.add_argument(\"--render_train\", action='store_true')\n    parser.add_argument(\"--render_video\", action='store_true')\n    parser.add_argument(\"--render_video_flipy\", action='store_true')\n    parser.add_argument(\"--render_video_rot90\", default=0, type=int)\n    parser.add_argument(\"--render_video_factor\", type=float, default=0,\n                        help='downsampling factor to speed up rendering, set 4 or 8 for fast preview')\n    parser.add_argument(\"--dump_images\", action='store_true')\n    parser.add_argument(\"--eval_ssim\", action='store_true')\n    parser.add_argument(\"--eval_lpips_alex\", action='store_true')\n    parser.add_argument(\"--eval_lpips_vgg\", action='store_true')\n    parser.add_argument(\"--save_train_imgs\", action='store_true', help=\"save training images to the exp folder\")\n    parser.add_argument(\"--diffuse\", action='store_true', help=\"use diffused images\")\n\n    # logging/saving options\n    parser.add_argument(\"--i_print\",   
type=int, default=500,\n                        help='frequency of console printout and metric loggin')\n    parser.add_argument(\"--i_weights\", type=int, default=1000000,\n                        help='frequency of weight ckpt saving, by default not save ckpts during training.')\n    return parser\n\n\ndef seed_everything():\n    '''Seed everything for better reproducibility.\n    (some pytorch operation is non-deterministic like the backprop of grid_samples)\n    '''\n    torch.manual_seed(args.seed)\n    np.random.seed(args.seed)\n    random.seed(args.seed)\n\n\nif __name__=='__main__':\n    # load setup\n    parser = config_parser()\n    args = parser.parse_args()\n    cfg = mmengine.Config.fromfile(args.config)\n    # create exp name with exp_id\n    cfg.expname = cfg.expname + args.exp_id\n    # init enviroment\n    if torch.cuda.is_available():\n        torch.set_default_tensor_type('torch.cuda.FloatTensor')\n        device = torch.device('cuda')\n    else:\n        device = torch.device('cpu')\n    seed_everything()\n\n    # load images / poses / camera settings / data split\n    data_dict, args = load_everything(args=args, cfg=cfg)\n    args.block_num = -1\n    args.running_block_id = -1\n    program = args.program\n    FourierGrid_datasets = [\"waymo\", \"mega\", \"nerfpp\", \"llff\", \"free\"]\n    if cfg.data.dataset_type in FourierGrid_datasets or cfg.model == 'FourierGrid':\n        args.ckpt_manager = FourierGridCheckpointManager(args, cfg)\n        if args.num_per_block > 0:\n            args.block_num = int(len(data_dict['i_train']) // args.num_per_block)\n            print(f\"Running in {args.block_num} blocks where each block contains {args.num_per_block} number of images.\")\n    else:\n        args.ckpt_manager = None\n        args.num_per_block = -1\n\n    # launch the corresponding program\n    if program == \"export_bbox\":\n        run_export_bbox_cams(args=args, cfg=cfg, data_dict=data_dict)\n    elif program == \"export_coarse\":\n       
 run_export_coarse(args=args, cfg=cfg, device=device)\n    elif program == \"train\":\n        args.running_block_id = -1\n        run_train(args, cfg, data_dict, export_cam=True, export_geometry=True)\n        print(\"Training finished. Run rendering.\")\n        run_render(args=args, cfg=cfg, data_dict=data_dict, device=device)\n    elif program == 'render':\n        run_render(args=args, cfg=cfg, data_dict=data_dict, device=device)\n    elif program == 'gen_trace':\n        run_gen_cam_paths(args=args, cfg=cfg, data_dict=data_dict)\n    else:\n        raise NotImplementedError(f\"Program {program} is not supported!\")\n    \n    render_notes = \"\"\n    if args.render_train:\n        render_notes += \"Rendered train. \"\n    elif args.render_test:\n        render_notes += \"Rendered test.\"\n    print(f\"Finished running program {program}.\" + render_notes)\n"
  },
  {
    "path": "scripts/block_nerf_eval.sh",
    "content": "python eval_block_nerf.py --chunk 8192 # 3090ti\npython eval_block_nerf.py --chunk 2048"
  },
  {
    "path": "scripts/block_nerf_train.sh",
    "content": "#!/bin/bash\npython train_block_nerf.py --block_index $1"
  },
  {
    "path": "scripts/create_cluster_mask.sh",
    "content": "# arguments\nexport DATASET_NAME=building\nexport MASK_PATH=data/mega/${DATASET_NAME}/pixsfm-grid-8  # output would be put at this folder\nexport DATASET_PATH=./data/mega/${DATASET_NAME}/${DATASET_NAME}-pixsfm  # raw image folder with poses\nexport NUM_GPUS=4 # number of GPUs\n# for debugging only, this is slow.\n# python create_cluster_masks.py --config mega_nerf/configs/${DATASET_NAME}.yaml --dataset_path ${DATASET_PATH} --output ${MASK_PATH}-debug --grid_dim 2 4\n# for a standard run, comment the following line when debugging\npython -m torch.distributed.run --standalone --nnodes=1 --nproc_per_node ${NUM_GPUS} --max_restarts 0 create_cluster_masks.py --config mega_nerf/configs/${DATASET_NAME}.yaml --dataset_path ${DATASET_PATH} --output ${MASK_PATH} --grid_dim 2 4\n"
  },
  {
    "path": "scripts/download_nerfstudio.sh",
    "content": "cd data\nwget https://data.nerf.studio/nerfstudio/Giannini-Hall.zip\nwget https://data.nerf.studio/nerfstudio/sculpture.zip\nwget https://data.nerf.studio/nerfstudio/stump.zip\nwget https://data.nerf.studio/nerfstudio/aspen.zip\nwget https://data.nerf.studio/nerfstudio/floating-tree.zip\nwget https://data.nerf.studio/nerfstudio/dozer.zip\nwget https://data.nerf.studio/nerfstudio/plane.zip\nwget https://data.nerf.studio/nerfstudio/kitchen.zip\nwget https://data.nerf.studio/nerfstudio/person.zip\nwget https://data.nerf.studio/nerfstudio/Egypt.zip\n\nunzip Giannini-Hall.zip\nunzip sculpture.zip\nunzip stump.zip\nunzip aspen.zip\nunzip floating-tree.zip\nunzip dozer.zip\nunzip plane.zip\nunzip kitchen.zip\nunzip person.zip\nunzip Egypt.zip\ncd ../"
  },
  {
    "path": "scripts/gen_path_FourierGrid.sh",
    "content": "export CONFIG=FourierGrid/configs/waymo/waymo_full.py\npython run_FourierGrid.py --program gen_trace --config ${CONFIG}\n"
  },
  {
    "path": "scripts/merge_sub_modules.sh",
    "content": "# arguments\nexport DATASET_NAME=building\nexport EXP_FOLDER=data/mega/${DATASET_NAME}/train_exp_logs/  # load checkpoints from this folder\nexport MERGED_OUTPUT=./data/mega/${DATASET_NAME}/${DATASET_NAME}-pixsfm-8.pt # merge trained models and put to this path\nexport MASK_PATH=data/mega/${DATASET_NAME}/building-pixsfm-grid-8  # load mask from this path\n# for debugging and standard running\npython merge_submodules.py --config_file mega_nerf/configs/${DATASET_NAME}.yaml --ckpt_prefix ${EXP_FOLDER}/ --centroid_path ${MASK_PATH}/params.pt --output $MERGED_OUTPUT"
  },
  {
    "path": "scripts/one_block_train.sh",
    "content": "# python run_FourierGrid.py --config configs/waymo/block_0.py\npython run_FourierGrid.py --config configs/waymo/block_0_online.py"
  },
  {
    "path": "scripts/render_FourierGrid.sh",
    "content": "# render testing sequences\npython run_FourierGrid.py --program render --config FourierGrid/configs/waymo/waymo_tank.py --sample_num 100 --render_test --exp_id 87\n# render training sequences\npython run_FourierGrid.py --program render --config FourierGrid/configs/waymo/waymo_tank.py --sample_num 5 --render_train --exp_id 73\npython run_FourierGrid.py --program render --config FourierGrid/configs/tankstemple_unbounded/Playground.py --render_test --render_only --render_video_factor 1\n"
  },
  {
    "path": "scripts/sfm_FourierGrid.sh",
    "content": "export CONFIG=FourierGrid/configs/waymo/waymo_no_block.py\n# python run_FourierGrid.py --program sfm --config ${CONFIG} --sample_num 100 --exp_id 7\npython run_FourierGrid.py --program tune_pose --config FourierGrid/configs/waymo/waymo_no_block.py --sample_num 300 --render_video --exp_id 45\n# on the mega building dataset\npython run_FourierGrid.py --program tune_pose --config FourierGrid/configs/mega/building_no_block.py --sample_num 300 --render_video --exp_id 45\n"
  },
  {
    "path": "scripts/train_FourierGrid.sh",
    "content": "# Unbounded tanks and temples\npython run_FourierGrid.py --program train --config FourierGrid/configs/tankstemple_unbounded/playground_single.py --num_per_block -1 --render_train --render_test --render_video --exp_id 57\npython run_FourierGrid.py --program train --config FourierGrid/configs/tankstemple_unbounded/train_single.py --num_per_block -1 --render_train --render_test --render_video --exp_id 13\npython run_FourierGrid.py --program train --config FourierGrid/configs/tankstemple_unbounded/truck_single.py --num_per_block -1 --render_train --render_test --render_video --exp_id 9\npython run_FourierGrid.py --program train --config FourierGrid/configs/tankstemple_unbounded/m60_single.py --num_per_block -1 --render_train --render_test --render_video --eval_ssim --eval_lpips_vgg --exp_id 8\n\n# 360 degree dataset\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/room_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 9\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/stump_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 1\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/bicycle_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 12\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/bonsai_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 1\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/garden_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 6\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/kitchen_single.py --num_per_block -1 
--eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 2\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf_unbounded/counter_single.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 9\n\n# Bounded scenes\npython run_FourierGrid.py --program train --config FourierGrid/configs/tankstemple/Family_lg.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 11\npython run_FourierGrid.py --program train --config FourierGrid/configs/llff/leaves.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 12\npython run_FourierGrid.py --program train --config FourierGrid/configs/llff/horns.py --num_per_block -1 --eval_ssim --eval_lpips_vgg --render_train --render_test --render_video --exp_id 4\npython FourierGrid/run_colmap2standard.py --data_dir ./data/nerf_llff_data/horns/\npython FourierGrid/run_colmap2standard.py --data_dir ./data/free_dataset/grass/\n\n# Free camera dataset\npython run_FourierGrid.py --program train --config FourierGrid/configs/free_dataset/grass.py --num_per_block -1 --render_train --render_test --render_video --eval_ssim --eval_lpips_vgg --exp_id 7\n\n# nerf-studio dataset\npython run_FourierGrid.py  --program train --config FourierGrid/configs/nerf_studio/Giannini_Hall.py --num_per_block -1 --eval_ssim --render_train --render_test --render_video --exp_id 1\n\n# original DVGOv2 training\n# python run_FourierGrid.py --program train --config FourierGrid/configs/waymo/block_0_tt.py\n# -----------------------------------------------------------------------\n# building\npython run_FourierGrid.py --program train --config FourierGrid/configs/mega/building_no_block.py --sample_num 300 --render_train --render_video --exp_id 50\npython run_FourierGrid.py --program render --config FourierGrid/configs/mega/building.py --sample_num 100 --render_train --exp_id 12 
--num_per_block 5  # for render\n# rubble\npython run_FourierGrid.py --program train --config FourierGrid/configs/mega/rubble.py --sample_num 100 --render_train --num_per_block 5 --exp_id 4  # for train\npython run_FourierGrid.py --program train --config FourierGrid/configs/mega/rubble.py --sample_num 10 --render_train --num_per_block 5 --exp_id 4  # for debug\npython run_FourierGrid.py --program render --config FourierGrid/configs/mega/rubble.py --sample_num 100 --render_train --num_per_block 5 --exp_id 4  # for render\n# quad\npython run_FourierGrid.py --program train --config FourierGrid/configs/mega/quad.py --sample_num 100 --render_train --num_per_block 5 --exp_id 1\n# lego\npython run_FourierGrid.py --program train --config FourierGrid/configs/nerf/lego.py --render_test --eval_ssim --eval_lpips_vgg --exp_id 8\n"
  },
  {
    "path": "scripts/visualize_FourierGrid.sh",
    "content": "# export CONFIG=FourierGrid/configs/tankstemple_unbounded/Playground.py #FourierGrid/configs/waymo/block_0_tt.py\nexport CONFIG=FourierGrid/configs/waymo/waymo_full.py\n# visualize cameras\nCUDA_VISIBLE_DEVICES=8 python run_FourierGrid.py --program export_bbox --config ${CONFIG} --export_bbox_and_cams_only data/sep26_2/cam.npz --sample_num 100\n# visualize geometry\nCUDA_VISIBLE_DEVICES=3 python run_FourierGrid.py --program export_coarse --config ${CONFIG} --export_coarse_only data/sep26_2/cam_coarse.npz --sample_num 100\n# the following commands require a local desktop\npython data_preprocess/visualize_cameras.py --data_path data/samples/block_0 --multi_scale\npython FourierGrid/tools/vis_train.py data/sep13_block0/cam.npz\npython FourierGrid/tools/vis_volume.py coarse_mic.npz 0.001 --cam data/sep13_block0/cam_coarse.npz\n"
  },
  {
    "path": "setup.py",
    "content": "import setuptools\n\nwith open(\"README.md\", \"r\", encoding=\"utf-8\") as fh:\n  long_description = fh.read()\n\nsetuptools.setup(\n    name=\"large_scale_nerf\",\n    version=\"0.0.2\",\n    author=\"Zelin Zhao\",\n    author_email=\"sjtuytc@gmail.com\",\n    description=\"Code for large-scale neural radiance fields.\",\n    long_description=long_description,\n    long_description_content_type=\"text/markdown\",\n    url=\"https://github.com/dvlab-research/BlockNeRFPytorch\",\n    packages=setuptools.find_packages(),\n    classifiers=[\n        \"Programming Language :: Python :: 3\",\n        \"License :: OSI Approved :: Apache License 2.0\",\n        \"Operating System :: OS Independent\",\n    ],\n    python_requires='>=3.7',\n)"
  },
  {
    "path": "train_block_nerf.py",
    "content": "import os\nimport torch\nfrom torch.utils.data import DataLoader\nfrom collections import defaultdict\nfrom block_nerf.waymo_dataset import *\nfrom block_nerf.block_nerf_model import *\nfrom block_nerf.block_nerf_lightning import *\nfrom block_nerf.rendering import *\nfrom block_nerf.metrics import *\n\nfrom pytorch_lightning import LightningModule, Trainer\nfrom pytorch_lightning.plugins import DDPPlugin\nfrom pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar\nfrom pytorch_lightning.loggers import TensorBoardLogger\n\nimport argparse\n\n\ndef get_opts():\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--root_dir', type=str,\n                        default='data/pytorch_waymo_dataset',\n                        help='root directory of dataset')\n    parser.add_argument('--block_index', type=int, default='0',\n                        help='index of the blocks')\n    parser.add_argument('--img_downscale', type=int, default=4,\n                        help='number of xyz embedding frequencies')\n    parser.add_argument('--near', type=float, default=1e-2,\n                        help='the range to sample along the ray')\n    parser.add_argument('--far', type=float, default=15,\n                        help='the range to sample along the ray')\n    parser.add_argument('--N_IPE_xyz', type=int, default=16,\n                        help='number of xyz embedding frequencies')\n    parser.add_argument('--N_PE_dir_exposure', type=int, default=4,\n                        help='number of direction embedding frequencies'\n                        )\n    parser.add_argument('--N_samples', type=int, default=128,\n                        help='number of coarse samples')\n    parser.add_argument('--N_importance', type=int, default=128,\n                        help='number of additional fine samples')\n\n    # NeRF-W\n    parser.add_argument('--N_vocab', type=int, default=1500,\n                        help='''number of vocabulary 
(number of images) \n                                        in the dataset for nn.Embedding'''\n                        )\n    parser.add_argument('--N_appearance', type=int, default=32,\n                        help='number of embeddings for appearance')\n\n    parser.add_argument('--Visi_loss', type=float, default=1e-2,\n                        help='number of embeddings for appearance')\n\n    parser.add_argument('--use_disp', type=bool, default=True,\n                        help='use disparity depth sampling')\n\n    parser.add_argument('--chunk', type=int, default=1024 * 16,\n                        help='chunk to avoid OOM')\n    parser.add_argument('--batch_size', type=int, default=1024,\n                        help='batch size')\n    parser.add_argument('--num_epochs', type=int, default=10,\n                        help='number of training epochs')\n    parser.add_argument('--num_gpus', type=int, default=1,\n                        help='number of gpus')\n\n    parser.add_argument('--ckpt_path', type=str, default=None,\n                        help='pretrained checkpoint path to load')\n\n    parser.add_argument('--optimizer', type=str, default='adam',\n                        help='optimizer type', choices=['sgd', 'adam',\n                        'radam', 'ranger'])\n    parser.add_argument('--lr', type=float, default=5e-4,\n                        help='learning rate')\n    parser.add_argument('--momentum', type=float, default=0.9,\n                        help='learning rate momentum')\n    parser.add_argument('--weight_decay', type=float, default=0,\n                        help='weight decay')\n    parser.add_argument('--lr_scheduler', type=str, default='steplr',\n                        help='scheduler type', choices=['steplr',\n                        'cosine', 'poly'])\n    parser.add_argument('--warmup_multiplier', type=float, default=1.0,\n                        help='lr is multiplied by this factor after --warmup_epochs'\n                     
   )\n    parser.add_argument('--warmup_epochs', type=int, default=0,\n                        help='Gradually warm-up(increasing) learning rate in optimizer'\n                        )\n    parser.add_argument('--decay_step', nargs='+', type=int,\n                        default=[20], help='scheduler decay step')\n    parser.add_argument('--decay_gamma', type=float, default=0.1,\n                        help='learning rate decay amount')\n    parser.add_argument('--poly_exp', type=float, default=0.9,\n                        help='exponent for polynomial learning rate decay'\n                        )\n    parser.add_argument('--exp_name', type=str, default='exp',\n                        help='experiment name')\n    parser.add_argument('--refresh_every', type=int, default=1,\n                        help='print the progress bar every X steps')\n\n    return vars(parser.parse_args())\n\n\ndef main(hparams):\n    print(\"Warning, this old implementation of BlockNeRF will be deprecated in the next version!\")\n    hparams['block_index'] = 'block_' + str(hparams['block_index'])\n    system = Block_NeRF_System(hparams)\n    checkpoint_callback = \\\n        ModelCheckpoint(dirpath=os.path.join('data/ckpts/{0}'.format(hparams['exp_name'\n                        ]), str(hparams['block_index']) + '_{epoch:d}'\n                        ), monitor='val/loss', mode='min', save_top_k=5)\n    callbacks = [checkpoint_callback]\n\n    logger = TensorBoardLogger(save_dir='logs',\n                               name=hparams['block_index'],\n                               default_hp_metric=False)\n\n    trainer = Trainer(\n        max_epochs=hparams['num_epochs'],\n        precision=16,\n        callbacks=callbacks,\n        resume_from_checkpoint=hparams['ckpt_path'],\n        logger=logger,\n        enable_model_summary=True,\n        gpus=hparams['num_gpus'],\n        accelerator='auto',\n        num_sanity_val_steps=1,\n        benchmark=True,\n        profiler=('simple' if 
hparams['num_gpus'] == 1 else None),\n        strategy=(DDPPlugin(find_unused_parameters=False) if hparams['num_gpus'\n                  ] > 1 else None),\n        )\n\n    trainer.fit(system)\n    print('The best model is saved in', checkpoint_callback.best_model_path)\n\n\nif __name__ == '__main__':\n    hparams = get_opts()\n    torch.cuda.empty_cache()\n    main(hparams)\n"
  }
]