Getting to build a NeRF! Exciting! :-)
Here are .gifs after my 1000, 2000, 3000, and 4000 iteration checkpoints on my best model so far:
```python
for epoch in range(EPOCHS):
    # TRAINING
    model.train()
    print_memory_usage(f"Prior to Epoch {epoch+1}")

    r_o, r_d, pixels = train_dataset.sample_rays(BATCH_SIZE)
    x = sample_along_rays(r_o, r_d, perturb=True, n_samples=N_SAMPLES)
    r_d_expanded = np.repeat(r_d[:, np.newaxis, :], N_SAMPLES, axis=1)
    x = x.astype(np.float32); r_d_expanded = r_d_expanded.astype(np.float32)
    pixels = pixels.astype(np.float32)  # pixels is a great name
    X = torch.from_numpy(x); D = torch.from_numpy(r_d_expanded); P = torch.from_numpy(pixels)
    print_memory_usage("Right after torch.from_numpy")
    X = X.to(device); D = D.to(device); P = P.to(device)
    print_memory_usage("Right after .to(device)")

    density, rgb = model(X, D)
    print_memory_usage("Right after calling model")
    P_pred = volrend(density, rgb, N_SAMPLES)
    print_memory_usage("Right after volume render")
    l = loss(P_pred, P)
    print_memory_usage("Right after evaluating loss")

    optimizer.zero_grad()
    print_memory_usage("Right after zero_grad")
    l.backward()
    print_memory_usage("Right after backward")
    optimizer.step()
    print_memory_usage("Right after optimizer.step()")
    optimizer.zero_grad()
    print_memory_usage("Right after zero grad")

    t_losses.append(l.item())
    t_psnrs.append(psnr(l).item())

    if epoch % 5 == 4:
        # VALIDATION
        model.eval()
        i = random.randint(0, 9)  # pick a random validation image (each image is 200x200 = 40000 rays)
        r_o = val_dataset.rays_o[i*40000:(i+1)*40000]
        r_d = val_dataset.rays_d[i*40000:(i+1)*40000]
        pixels = val_dataset.pixels[i*40000:(i+1)*40000]
        x = sample_along_rays(r_o, r_d, perturb=True, n_samples=N_SAMPLES)
        r_d_expanded = np.repeat(r_d[:, np.newaxis, :], N_SAMPLES, axis=1)
        x = x.astype(np.float32); r_d_expanded = r_d_expanded.astype(np.float32)
        pixels = pixels.astype(np.float32)
        X = torch.from_numpy(x); D = torch.from_numpy(r_d_expanded); P = torch.from_numpy(pixels)
        X = X.to(device); D = D.to(device); P = P.to(device)

        density, rgb = model(X, D)
        P_pred = volrend(density, rgb, N_SAMPLES)
        val_l = loss(P_pred, P)

        P = P.detach()
        P_pred = P_pred.detach()
        v_losses.append(val_l.item())
        v_psnrs.append(psnr(val_l).item())
        optimizer.zero_grad()

        print(f"Epoch {epoch+1}, Loss: {t_losses[-1]}, psnr: {t_psnrs[-1]}, "
              f"Validation Loss: {v_losses[-1]}, psnr: {v_psnrs[-1]}",
              "Threads used:", torch.get_num_threads())

        if epoch % 50 == 49:
            # save predicted and ground-truth pixels side by side as one image
            image = torch.cat((P_pred, P)).reshape((200, 400, 3)).detach().cpu().numpy()
            cv2.imwrite(f'lego-truck-reconstruction-epoch-{epoch+1}.png', 255*image)
```
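For reference, here's roughly what the `print_memory_usage` and `psnr` helpers above do. This is a sketch reconstructed from the log format and the standard PSNR formula, not necessarily my exact code:

```python
def print_memory_usage(tag):
    # torch.cuda.memory_allocated / memory_reserved report bytes on the current device
    alloc = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    print(f"Memory at {tag}: Allocated = {alloc:.3f} GB, Reserved = {reserved:.3f} GB")

def psnr(mse_loss):
    # assuming pixel values in [0, 1], PSNR = 10 * log10(1 / MSE)
    return 10 * torch.log10(1.0 / mse_loss)
```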
Running this, the output was (sorry for how verbose this is):

```
Memory at Prior to Epoch 1: Allocated = 0.029 GB, Reserved = 2.403 GB
Memory at Right after torch.from_numpy: Allocated = 0.029 GB, Reserved = 2.403 GB
Memory at Right after .to(device): Allocated = 0.033 GB, Reserved = 2.403 GB
Memory at Right after calling model: Allocated = 1.690 GB, Reserved = 2.403 GB
Memory at Right after volume render: Allocated = 1.693 GB, Reserved = 2.405 GB
Memory at Right after evaluating loss: Allocated = 1.693 GB, Reserved = 2.405 GB
Memory at Right after zero_grad: Allocated = 1.693 GB, Reserved = 2.405 GB
Memory at Right after backward: Allocated = 0.038 GB, Reserved = 2.405 GB
Memory at Right after optimizer.step(): Allocated = 0.038 GB, Reserved = 2.405 GB
Memory at Right after zero grad: Allocated = 0.036 GB, Reserved = 2.405 GB
Memory at Prior to Epoch 2: Allocated = 0.029 GB, Reserved = 2.405 GB
Memory at Right after torch.from_numpy: Allocated = 0.029 GB, Reserved = 2.405 GB
Memory at Right after .to(device): Allocated = 0.034 GB, Reserved = 2.405 GB
Memory at Right after calling model: Allocated = 1.691 GB, Reserved = 2.405 GB
Memory at Right after volume render: Allocated = 1.694 GB, Reserved = 2.405 GB
Memory at Right after evaluating loss: Allocated = 1.694 GB, Reserved = 2.405 GB
Memory at Right after zero_grad: Allocated = 1.694 GB, Reserved = 2.405 GB
Memory at Right after backward: Allocated = 0.039 GB, Reserved = 2.405 GB
Memory at Right after optimizer.step(): Allocated = 0.039 GB, Reserved = 2.405 GB
Memory at Right after zero grad: Allocated = 0.037 GB, Reserved = 2.405 GB
Memory at Prior to Epoch 3: Allocated = 0.030 GB, Reserved = 2.405 GB
Memory at Right after torch.from_numpy: Allocated = 0.030 GB, Reserved = 2.405 GB
Memory at Right after .to(device): Allocated = 0.034 GB, Reserved = 2.405 GB
Memory at Right after calling model: Allocated = 1.691 GB, Reserved = 2.405 GB
Memory at Right after volume render: Allocated = 1.694 GB, Reserved = 2.405 GB
Memory at Right after evaluating loss: Allocated = 1.694 GB, Reserved = 2.405 GB
Memory at Right after zero_grad: Allocated = 1.694 GB, Reserved = 2.405 GB
Memory at Right after backward: Allocated = 0.039 GB, Reserved = 2.405 GB
Memory at Right after optimizer.step(): Allocated = 0.039 GB, Reserved = 2.405 GB
Memory at Right after zero grad: Allocated = 0.037 GB, Reserved = 2.405 GB
Memory at Prior to Epoch 4: Allocated = 0.037 GB, Reserved = 2.405 GB
Memory at Right after torch.from_numpy: Allocated = 0.033 GB, Reserved = 2.405 GB
Memory at Right after .to(device): Allocated = 0.037 GB, Reserved = 2.405 GB
Memory at Right after calling model: Allocated = 1.691 GB, Reserved = 2.405 GB
Memory at Right after volume render: Allocated = 1.694 GB, Reserved = 2.405 GB
Memory at Right after evaluating loss: Allocated = 1.694 GB, Reserved = 2.405 GB
Memory at Right after zero_grad: Allocated = 1.694 GB, Reserved = 2.405 GB
Memory at Right after backward: Allocated = 0.039 GB, Reserved = 2.405 GB
Memory at Right after optimizer.step(): Allocated = 0.039 GB, Reserved = 2.405 GB
Memory at Right after zero grad: Allocated = 0.037 GB, Reserved = 2.405 GB
Memory at Prior to Epoch 5: Allocated = 0.029 GB, Reserved = 2.405 GB
Memory at Right after torch.from_numpy: Allocated = 0.029 GB, Reserved = 2.405 GB
Memory at Right after .to(device): Allocated = 0.033 GB, Reserved = 2.405 GB
Memory at Right after calling model: Allocated = 1.690 GB, Reserved = 2.405 GB
Memory at Right after volume render: Allocated = 1.693 GB, Reserved = 2.405 GB
Memory at Right after evaluating loss: Allocated = 1.693 GB, Reserved = 2.405 GB
Memory at Right after zero_grad: Allocated = 1.693 GB, Reserved = 2.405 GB
Memory at Right after backward: Allocated = 0.038 GB, Reserved = 2.405 GB
Memory at Right after optimizer.step(): Allocated = 0.038 GB, Reserved = 2.405 GB
Memory at Right after zero grad: Allocated = 0.036 GB, Reserved = 2.405 GB
Epoch 5, Loss: 0.17976847290992737, psnr: 7.452865123748779,
Validation Loss: 0.1619608849287033, psnr: 7.9058990478515625 Threads used: 1
Memory at Prior to Epoch 6: Allocated = 13.341 GB, Reserved = 14.152 GB
Memory at Right after torch.from_numpy: Allocated = 13.341 GB, Reserved = 14.152 GB
Memory at Right after .to(device): Allocated = 13.346 GB, Reserved = 14.152 GB
Memory at Right after calling model: Allocated = 15.003 GB, Reserved = 15.370 GB
Memory at Right after volume render: Allocated = 15.006 GB, Reserved = 15.372 GB
Memory at Right after evaluating loss: Allocated = 15.006 GB, Reserved = 15.372 GB
Memory at Right after zero_grad: Allocated = 15.006 GB, Reserved = 15.372 GB
Memory at Right after backward: Allocated = 13.350 GB, Reserved = 14.896 GB
Memory at Right after optimizer.step(): Allocated = 13.350 GB, Reserved = 14.898 GB
Memory at Right after zero grad: Allocated = 13.348 GB, Reserved = 14.898 GB
Memory at Prior to Epoch 7: Allocated = 13.315 GB, Reserved = 14.898 GB
Memory at Right after torch.from_numpy: Allocated = 13.315 GB, Reserved = 14.898 GB
Memory at Right after .to(device): Allocated = 13.319 GB, Reserved = 14.898 GB
Memory at Right after calling model: Allocated = 14.976 GB, Reserved = 15.412 GB
Memory at Right after volume render: Allocated = 14.979 GB, Reserved = 15.412 GB
Memory at Right after evaluating loss: Allocated = 14.979 GB, Reserved = 15.412 GB
Memory at Right after zero_grad: Allocated = 14.979 GB, Reserved = 15.412 GB
Memory at Right after backward: Allocated = 13.324 GB, Reserved = 15.594 GB
Memory at Right after optimizer.step(): Allocated = 13.324 GB, Reserved = 15.594 GB
Memory at Right after zero grad: Allocated = 13.322 GB, Reserved = 15.594 GB
Memory at Prior to Epoch 8: Allocated = 13.322 GB, Reserved = 15.594 GB
Memory at Right after torch.from_numpy: Allocated = 13.318 GB, Reserved = 15.594 GB
Memory at Right after .to(device): Allocated = 13.322 GB, Reserved = 15.594 GB
Memory at Right after calling model: Allocated = 14.976 GB, Reserved = 15.594 GB
Memory at Right after volume render: Allocated = 14.979 GB, Reserved = 15.594 GB
Memory at Right after evaluating loss: Allocated = 14.979 GB, Reserved = 15.594 GB
Memory at Right after zero_grad: Allocated = 14.979 GB, Reserved = 15.594 GB
Memory at Right after backward: Allocated = 13.324 GB, Reserved = 15.594 GB
Memory at Right after optimizer.step(): Allocated = 13.324 GB, Reserved = 15.594 GB
Memory at Right after zero grad: Allocated = 13.322 GB, Reserved = 15.594 GB
Memory at Prior to Epoch 9: Allocated = 13.314 GB, Reserved = 15.594 GB
Memory at Right after torch.from_numpy: Allocated = 13.314 GB, Reserved = 15.594 GB
Memory at Right after .to(device): Allocated = 13.318 GB, Reserved = 15.594 GB
Memory at Right after calling model: Allocated = 14.975 GB, Reserved = 15.594 GB
Memory at Right after volume render: Allocated = 14.978 GB, Reserved = 15.594 GB
Memory at Right after evaluating loss: Allocated = 14.978 GB, Reserved = 15.594 GB
Memory at Right after zero_grad: Allocated = 14.978 GB, Reserved = 15.594 GB
Memory at Right after backward: Allocated = 13.323 GB, Reserved = 15.594 GB
Memory at Right after optimizer.step(): Allocated = 13.323 GB, Reserved = 15.594 GB
Memory at Right after zero grad: Allocated = 13.321 GB, Reserved = 15.594 GB
Memory at Prior to Epoch 10: Allocated = 13.314 GB, Reserved = 15.594 GB
Memory at Right after torch.from_numpy: Allocated = 13.314 GB, Reserved = 15.594 GB
Memory at Right after .to(device): Allocated = 13.319 GB, Reserved = 15.594 GB
Memory at Right after calling model: Allocated = 14.976 GB, Reserved = 15.594 GB
Memory at Right after volume render: Allocated = 14.979 GB, Reserved = 15.594 GB
Memory at Right after evaluating loss: Allocated = 14.979 GB, Reserved = 15.594 GB
Memory at Right after zero_grad: Allocated = 14.979 GB, Reserved = 15.594 GB
Memory at Right after backward: Allocated = 13.324 GB, Reserved = 15.594 GB
Memory at Right after optimizer.step(): Allocated = 13.324 GB, Reserved = 15.594 GB
Memory at Right after zero grad: Allocated = 13.322 GB, Reserved = 15.594 GB
---------------------------------------------------------------------------
OutOfMemoryError Traceback (most recent call last)
in ()
52 X = X.to(device); D = D.to(device); P = P.to(device)
53
---> 54 density, rgb = model(X, D)
55 P_pred = volrend(density, rgb, N_SAMPLES)
56
9 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in relu(input, inplace)
1702 result = torch.relu_(input)
1703 else:
-> 1704 result = torch.relu(input)
1705 return result
1706
OutOfMemoryError: CUDA out of memory. Tried to allocate 1.22 GiB...
```
Looking through the above, I'm actually doing fine on memory until validation time! At that moment, we suddenly throw a ton of stuff onto the GPU, and it never gets freed (perhaps because I'm never doing a `.backward()` call on the validation loss). To resolve this, I tried using `with torch.no_grad():` around the validation pass to see if this would prevent the loss from storing a bunch of extra information relating to gradients and such.
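Concretely, the change is just to run the validation forward pass inside a `no_grad` block. A minimal sketch using the same names as the loop above:

```python
# With no_grad, no autograd graph is built for the validation forward pass, so the
# large intermediate activations can be freed immediately instead of sticking
# around waiting for a backward() call that never comes.
model.eval()
with torch.no_grad():
    density, rgb = model(X, D)
    P_pred = volrend(density, rgb, N_SAMPLES)
    val_l = loss(P_pred, P)
v_losses.append(val_l.item())
v_psnrs.append(psnr(val_l).item())
```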
I got the `volrend` function working without too many pains; the main issue was how I shifted the `torch.exp` term. Seeing tests passing feels awesome:
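For context, `volrend` implements the usual discrete volume rendering quadrature: each sample's weight is its transmittance times `1 - exp(-sigma * step)`, where the transmittance is the exponential of the negative cumulative sum of `sigma * step` over the *previous* samples only (hence the shift). A sketch of that computation, with shapes and step size as illustrative assumptions:

```python
import torch

def volrend_sketch(density, rgb, n_samples, step_size=4.0 / 64):
    """Volume rendering quadrature sketch.
    Assumed shapes: density (batch, n_samples, 1), rgb (batch, n_samples, 3)."""
    sigma_delta = density.squeeze(-1) * step_size                      # (batch, n_samples)
    alpha = 1.0 - torch.exp(-sigma_delta)                              # chance of terminating in each bin
    # transmittance depends on the *previous* samples only, hence the shift by one
    shifted = torch.cat([torch.zeros_like(sigma_delta[:, :1]), sigma_delta[:, :-1]], dim=1)
    T = torch.exp(-torch.cumsum(shifted, dim=1))
    weights = (T * alpha).unsqueeze(-1)                                # (batch, n_samples, 1)
    return (weights * rgb).sum(dim=1)                                  # (batch, 3) rendered color
```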
Found a bug in `camera_to_world`! I was a bit silly and actually used `w2c` in this method instead of `c2w`. My tests weren't able to pick up on this because, regardless of which matrix you pass in, my identity-matrix test gives the same answer. I had also been building the intrinsics as `K = np.diag([focal, focal, 1])` instead of the whole matrix including the principal point. After that, I got to this delightful image:
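For reference, the full intrinsic matrix looks like this; a sketch assuming the principal point sits at the image center:

```python
import numpy as np

def intrinsic_matrix(focal, width, height):
    # focal lengths on the diagonal plus the principal point (o_x, o_y),
    # rather than just np.diag([focal, focal, 1])
    return np.array([[focal, 0.0,   width / 2.0],
                     [0.0,   focal, height / 2.0],
                     [0.0,   0.0,   1.0]])
```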
Writing the `camera_to_world` function was fairly easy with `numpy`, and I simply tested it on the identity matrix to make sure that it would work for all vectors. In checking `x == camera_to_world(c2w.inv(), camera_to_world(c2w, x))`, it made more sense to use `np.isclose` or manually check for floating point errors than to test exact equality. I wrote similar tests for `pixel_to_camera`, but didn't for `pixel_to_ray`, to save time.
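A minimal sketch of that function and the round-trip check, assuming a 4x4 homogeneous `c2w` and an `(N, 3)` batch of points:

```python
import numpy as np

def camera_to_world(c2w, x_c):
    # lift (N, 3) camera-space points to homogeneous coordinates, apply the
    # 4x4 transform, then drop the homogeneous coordinate again
    x_h = np.concatenate([x_c, np.ones((x_c.shape[0], 1))], axis=1)   # (N, 4)
    return (c2w @ x_h.T).T[:, :3]                                     # (N, 3)

# Round-trip check: camera -> world -> camera should give x back, up to float error.
c2w = np.eye(4)
x = np.random.rand(5, 3)
assert np.isclose(camera_to_world(np.linalg.inv(c2w), camera_to_world(c2w, x)), x).all()
```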
The line `dataloader = DataLoader(img_data, batch_size=N_SAMPLE, shuffle=True)` has been creating segfault after segfault... and debugging hasn't helped. To avoid spending too much time on it, I decided to just load the data myself:
```python
for epoch in range(EPOCHS):
    random_indices = random.sample(list(range(len(img_data))), BATCH_SIZE)
    X, Y = img_data[random_indices]
    print(X.shape)
    X = torch.from_numpy(X)
    print(X.shape, X.device)
    break
```
which outputted:

```
(10000, 2)
torch.Size([10000, 2]) cpu
```
as one would expect. I think based on this output, there should be no need to unsqueeze or do any funny business like that.

Meanwhile, my positional encoding `forward` (with all of my debug prints still in) looked like this:

```python
def forward(self, x: torch.Tensor) -> torch.Tensor:
    print("here")
    batch_size, seq_len = x.shape
    print(batch_size, seq_len)
    temp = 2 * self.L + 1  # identity plus L sin/cos pairs per input coordinate
    print("here7", [batch_size, seq_len * temp])
    y = torch.zeros([batch_size, seq_len * temp], device="cpu")
    print("here7")
    for i in range(batch_size):
        for j in range(seq_len):
            idx = j * temp
            y[i, idx] = x[i, j]
            print(idx)
            for k in range(self.L):
                factor = 2**k * math.pi * x[i, j]
                y[i, idx + 2*k + 1] = torch.sin(factor)
                y[i, idx + 2*k + 2] = torch.cos(factor)
    print("here5")
    return y
```
which didn't really work (it still caused a segfault), but changing the `torch.zeros` to `np.zeros` and then returning `torch.from_numpy(y)` magically worked... I guess because of memory allocation issues. I still don't know exactly why, or how to initialize with `torch` from the start, but this seems like a more memory-stable solution for now, so I went with it.
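As an aside, here's a vectorized sketch of the same encoding (the raw coordinate plus sin/cos at frequencies `2^k * pi`) that avoids the Python loops entirely; the exact ordering of the sin/cos terms is a free choice as long as it's consistent:

```python
import math
import torch

def positional_encoding(x: torch.Tensor, L: int) -> torch.Tensor:
    # x: (batch, seq_len) -> (batch, seq_len * (2L + 1))
    batch_size, seq_len = x.shape
    freqs = (2.0 ** torch.arange(L)) * math.pi                         # (L,)
    angles = x.unsqueeze(-1) * freqs                                   # (batch, seq_len, L)
    enc = torch.stack([torch.sin(angles), torch.cos(angles)], dim=-1)  # (batch, seq_len, L, 2)
    enc = enc.reshape(batch_size, seq_len, 2 * L)                      # sin/cos pair per frequency
    out = torch.cat([x.unsqueeze(-1), enc], dim=-1)                    # prepend the raw coordinate
    return out.reshape(batch_size, seq_len * (2 * L + 1))
```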
print("hi", param)
print("grad", param.grad)
if param.grad is not None:
if torch.isnan(param.grad).any() or torch.isinf(param.grad).any():
print("NaN or Inf in gradients!")
break
else:
print(param.grad)
which also segfaulted, on the third "hi" and "grad". By visual inspection, I didn't actually notice anything funky about the parameters or their grads. With some more print debugging, I realized the segfault was coming from the `torch.isnan(...).any()` and `torch.isinf(...).any()` calls. At this point, I also thought about it and realized... my positional encodings don't involve any model parameters, so they definitely shouldn't be the reason for the segfault, considering the `forward` call worked.

Digging into torch's `adam.py` file, I found that the issue was actually the line `state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)`. I should've known... more memory allocation issues. At that point, I decided I should probably figure out what was up. I temporarily replaced that with numpy computations, but it still wasn't working. Me, thinking I could debug this with print statements: (look at the terminal lol)
Running the positional encoding on the example input `[[1.0, 1.0]]`, I got:
```
Example Positional Encoding:
tensor([[ 1.0000e+00, 1.0000e+00, -8.7423e-08, 1.7485e-07, 3.4969e-07,
          6.9938e-07, 1.3988e-06, 2.7975e-06, 5.5951e-06, 1.1190e-05,
          2.2380e-05, 4.4760e-05, -1.0000e+00, 1.0000e+00, 1.0000e+00,
          1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
          1.0000e+00, 1.0000e+00, -8.7423e-08, 1.7485e-07, 3.4969e-07,
          6.9938e-07, 1.3988e-06, 2.7975e-06, 5.5951e-06, 1.1190e-05,
          2.2380e-05, 4.4760e-05, -1.0000e+00, 1.0000e+00, 1.0000e+00,
          1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
          1.0000e+00, 1.0000e+00]])
```
After I looked at this output and thought about it, I realized something sus... it makes no sense to do positional encodings this way given the inputs I was actually feeding in. Switching from the `dataloader` to doing:
```python
random_indices = random.sample(list(range(len(img_data))), BATCH_SIZE)
X, Y = img_data[random_indices]
X = torch.from_numpy(X); Y = torch.from_numpy(Y)
```
completely changed my results!
Experiment | L | LR | Other Parameters | Training Time (s) * | Loss | PSNR | Notes |
---|---|---|---|---|---|---|---|
6 | 5 | 0.01 | 3000 Epochs, 10k Batch, 3 linear layers, Hidden Dimension 256 | 231.2 | 0.00296 | 25.293 | |
1 | 10 | " | " | 236.5 | 0.00163 | 27.867 | Spec Params |
2 | 15 | " | " | 244.7 | 0.00167 | 27.776 | |
3 | 25 | " | " | 278.9 | 0.00152 | 28.189 | |
4 | 50 | " | " | 331.6 | 0.00129 | 28.907 | |
5 | 100 | " | " | 384.8 | 0.00162 | 27.903 | |
13 | 50 | 0.001 | " | 307.6 | 0.00144 | 28.405 | |
12 | " | 0.002 | " | 296.5 | 0.00131 | 28.820 | |
11 | " | 0.005 | " | 310.3 | 0.00117 | 29.323 | |
17 | " | 0.006 | " | 344.3 | 0.00120 | 29.191 | |
14 | " | 0.007 | " | 289.1 | 0.00120 | 29.192 | |
15 | " | 0.008 | " | 339.1 | 0.00132 | 28.798 | |
16 | " | 0.009 | " | 333.7 | 0.00121 | 29.187 | |
7 | " | 0.02 | " | 302.6 | 0.00156 | 28.056 | |
8 | " | 0.05 | " | 298.9 | 0.00242 | 26.166 | |
9 | " | 0.1 | " | 298.8 | 0.00453 | 23.444 | |
10 | " | 1 | " | 290.7 | 0.218 | 6.621 | |
17 | " | 0.005 | 3000 Epochs, 10k Batch, 7 linear layers, Hidden Dimension 256 | 503.2 | 0.000903 | 30.444 | |
18 | " | 0.01 | 3000 Epochs, 10k Batch, 7 linear layers, Hidden Dimension 1024 | 1004.2 | 0.000726 | 31.390 | |
21 | " | 0.0001 | 3000 Epochs, 10k Batch, 7 linear layers, Hidden Dimension 512, Only one BatchNorm after first liner layer | 800.2 | 0.00153 | 28.144 | |
19 | " | 0.005 | 3000 Epochs, 10k Batch, 7 linear layers, Hidden Dimension 1024, Only one BatchNorm after first liner layer | 788.2 | 0.000696 | 31.574 | |
20 | " | 0.01 | " | 896.1 | 0.213 | 6.713 | |
22 | " | 0.001 | 3000 Epochs, 10k Batch, 12 linear layers, Hidden Dimension 512, NO BatchNorm | 1699.7 | 0.000591 | 32.283 | TRIUMPH! |