taichi 库探索 wu-kan

我们的计算机图形学小组 Project2 选题项目~

Taichi是一种用于计算机图形应用程序的高性能编程语言。

  • 性能
  • 生产率
  • 稀疏计算
  • 可微编程
  • 元编程

开发环境

硬件

所用机器型号为 VAIO Z Flip 2016

  • Intel(R) Core(TM) i7-6567U CPU @3.30GHZ 3.31GHz
  • 8.00GB RAM

软件

  • Windows 10, 64-bit (Build 17763) 10.0.17763
  • Visual Studio Code 1.39.2
    • Python 2019.10.41019:九月底发布的 VSCode Python 插件支持在编辑器窗口内原生运行 juyter nootbook 了,非常赞!
    • Remote - WSL 0.39.9:配合 WSL,在 Windows 上获得 Linux 接近原生环境的体验。
  • Windows Subsystem for Linux [Ubuntu 18.04.2 LTS]:WSL 是以软件的形式运行在 Windows 下的 Linux 子系统,是近些年微软推出来的新工具,可以在 Windows 系统上原生运行 Linux。
    • Python 3.7.4 64-bit (‘anaconda3’:virtualenv):安装在 WSL 中。

taichi 库环境配置

出于效率的考虑,太極本身是由 C++ 构建的,但用 Python 包装了接口。

# 安装openCV
python3 -m pip install opencv-python

# CPU only. No GPU/CUDA needed. (Linux, OS X and Windows)
python3 -m pip install taichi-nightly==0.3.16

# With GPU (CUDA 10.0) support (Linux only)
# python3 -m pip install taichi-nightly-cuda-10-0

# With GPU (CUDA 10.1) support (Linux only)
# python3 -m pip install taichi-nightly-cuda-10-1

这里我本地机器所用的显卡是 Intel(R) Iris(TM) 550,本身不具备 CUDA 环境,因此安装了 CPU ONLY 版本。同时由于 taichi 每天都在更新,这里仅保证代码在我们所安装的版本(这篇文章完成于 2019-12-31,此时的最新版本)上能够跑通。

运行官方示例文件

可以执行下述代码获取官方 repo 中的一些示例文件。

svn checkout https://github.com/yuanming-hu/taichi/trunk/examples

taichi_logo.py

下面这份代码来自于https://github.com/yuanming-hu/taichi/blob/master/examples/taichi_logo.py

# TODO: this program crashes clang-7 when compiled with ti.x86_64. Why?
import taichi as ti
import cv2
import numpy as np


def Vector2(x, y):
    return ti.Vector([x, y])


@ti.func
def inside(p, c, r):
    return (p - c).norm_sqr() <= r * r


@ti.func
def inside_taichi(p_):
    p = p_
    p = Vector2(0.5, 0.5) + (p - Vector2(0.5, 0.5)) * 1.11
    ret = -1
    if not inside(p, Vector2(0.50, 0.50), 0.55):
        if ret == -1:
            ret = 0
    if not inside(p, Vector2(0.50, 0.50), 0.50):
        if ret == -1:
            ret = 1
    if inside(p, Vector2(0.50, 0.25), 0.09):
        if ret == -1:
            ret = 1
    if inside(p, Vector2(0.50, 0.75), 0.09):
        if ret == -1:
            ret = 0
    if inside(p, Vector2(0.50, 0.25), 0.25):
        if ret == -1:
            ret = 0
    if inside(p, Vector2(0.50, 0.75), 0.25):
        if ret == -1:
            ret = 1
    if p[0] < 0.5:
        if ret == -1:
            ret = 1
    else:
        if ret == -1:
            ret = 0
    return ret


x = ti.var(ti.f32)

n = 512
ti.cfg.use_llvm = True


@ti.layout
def layout():
    ti.root.dense(ti.ij, n).place(x)


@ti.kernel
def paint():
    for i in range(n * 4):
        for j in range(n * 4):
            ret = 1.0 - \
                inside_taichi(Vector2(1.0 * i / n / 4, 1.0 * j / n / 4))
            x[i // 4, j // 4] += ret / 16


paint()

img = np.empty((n, n), dtype=np.float32)
for i in range(n):
    for j in range(n):
        img[i, j] = x[j, n - 1 - i]

cv2.imshow('Taichi', img)
cv2.waitKey(0)

运行上述代码,结果如下:

taichi_logo

可以看到,这里成功画出了一个太极图,说明本地配置没有问题。

mpm88.py

上面这个太极图是静态的,我们再来看一下动态画面实时渲染的效果。

下面这份代码来自于https://github.com/yuanming-hu/taichi/blob/master/examples/mpm88.py

import taichi as ti
import random

dim = 2
n_particles = 8192
n_grid = 128
dx = 1 / n_grid
inv_dx = 1 / dx
dt = 2.0e-4
p_vol = (dx * 0.5) ** 2
p_rho = 1
p_mass = p_vol * p_rho
E = 400

x = ti.Vector(dim, dt=ti.f32, shape=n_particles)
v = ti.Vector(dim, dt=ti.f32, shape=n_particles)
C = ti.Matrix(dim, dim, dt=ti.f32, shape=n_particles)
J = ti.var(dt=ti.f32, shape=n_particles)
grid_v = ti.Vector(dim, dt=ti.f32, shape=(n_grid, n_grid))
grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid))

ti.cfg.arch = ti.cuda


@ti.kernel
def substep():
    for p in x:
        base = (x[p] * inv_dx - 0.5).cast(int)
        fx = x[p] * inv_dx - base.cast(float)
        w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1),
             0.5 * ti.sqr(fx - 0.5)]
        stress = -dt * p_vol * (J[p] - 1) * 4 * inv_dx * inv_dx * E
        affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p]
        for i in ti.static(range(3)):
            for j in ti.static(range(3)):
                offset = ti.Vector([i, j])
                dpos = (offset.cast(float) - fx) * dx
                weight = w[i][0] * w[j][1]
                grid_v[base +
                       offset].atomic_add(weight * (p_mass * v[p] + affine @ dpos))
                grid_m[base + offset].atomic_add(weight * p_mass)

    for i, j in grid_m:
        if grid_m[i, j] > 0:
            bound = 3
            inv_m = 1 / grid_m[i, j]
            grid_v[i, j] = inv_m * grid_v[i, j]
            grid_v[i, j][1] -= dt * 9.8
            if i < bound and grid_v[i, j][0] < 0:
                grid_v[i, j][0] = 0
            if i > n_grid - bound and grid_v[i, j][0] > 0:
                grid_v[i, j][0] = 0
            if j < bound and grid_v[i, j][1] < 0:
                grid_v[i, j][1] = 0
            if j > n_grid - bound and grid_v[i, j][1] > 0:
                grid_v[i, j][1] = 0

    for p in x:
        base = (x[p] * inv_dx - 0.5).cast(int)
        fx = x[p] * inv_dx - base.cast(float)
        w = [0.5 * ti.sqr(1.5 - fx), 0.75 - ti.sqr(fx - 1.0),
             0.5 * ti.sqr(fx - 0.5)]
        new_v = ti.Vector.zero(ti.f32, 2)
        new_C = ti.Matrix.zero(ti.f32, 2, 2)
        for i in ti.static(range(3)):
            for j in ti.static(range(3)):
                dpos = ti.Vector([i, j]).cast(float) - fx
                g_v = grid_v[base + ti.Vector([i, j])]
                weight = w[i][0] * w[j][1]
                new_v += weight * g_v
                new_C += 4 * weight * ti.outer_product(g_v, dpos) * inv_dx
        v[p] = new_v
        x[p] += dt * v[p]
        J[p] *= 1 + dt * new_C.trace()
        C[p] = new_C


gui = ti.core.GUI("MPM99", ti.veci(512, 512))
canvas = gui.get_canvas()

for i in range(n_particles):
    x[i] = [random.random() * 0.4 + 0.2, random.random() * 0.4 + 0.2]
    v[i] = [0, -1]
    J[i] = 1

for frame in range(200):
    for s in range(50):
        grid_v.fill([0, 0])
        grid_m.fill(0)
        substep()

    canvas.clear(0x112F41)
    pos = x.to_numpy(as_vector=True)
    for i in range(n_particles):
        canvas.circle(ti.vec(pos[i, 0], pos[i, 1])).radius(
            1.5).color(0x068587).finish()
    gui.update()

运行上述代码,效果如下:

mpm88

模拟了一杯「粒子」水晃动的情况,由于代码是运行在 CPU 上的,而对大量粒子(8192 个)的模拟仿真非常吃多线程算力,因此这里我跑出来的帧率比较低。

wave.py

最后来看一个比较炫酷的例子。

wave

在原作者的这个 Repo里面找到了这段代码

import taichi as ti
import math
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt

real = ti.f32
ti.set_default_fp(real)
# ti.runtime.print_preprocessed = True

n_grid = 256
dx = 1 / n_grid
inv_dx = 1 / dx
dt = 3e-4
max_steps = 512
vis_interval = 32
output_vis_interval = 2
steps = 256
assert steps * 2 <= max_steps
amplify = 1

scalar = lambda: ti.var(dt=real)
vec = lambda: ti.Vector(2, dt=real)

p = scalar()
target = scalar()
initial = scalar()
loss = scalar()

ti.cfg.arch = ti.cuda

@ti.layout
def place():
  ti.root.dense(ti.l, max_steps).dense(ti.ij, n_grid).place(p)
  ti.root.dense(ti.l, max_steps).dense(ti.ij, n_grid).place(p.grad)
  ti.root.dense(ti.ij, n_grid).place(target)
  ti.root.dense(ti.ij, n_grid).place(target.grad)
  ti.root.dense(ti.ij, n_grid).place(initial)
  ti.root.dense(ti.ij, n_grid).place(initial.grad)
  ti.root.place(loss)
  ti.root.place(loss.grad)


c = 340
# damping
alpha = 0.00000
inv_dx2 = inv_dx * inv_dx
dt = (math.sqrt(alpha * alpha + dx * dx / 3) - alpha) / c
learning_rate = 1


# TODO: there may by out-of-bound accesses here
@ti.func
def laplacian(t, i, j):
  return inv_dx2 * (
      -4 * p[t, i, j] + p[t, i, j - 1] + p[t, i, j + 1] + p[t, i + 1, j] +
      p[t, i - 1, j])

@ti.kernel
def initialize():
  for i in range(n_grid):
    for j in range(n_grid):
      p[0, i, j] = initial[i, j]


@ti.kernel
def fdtd(t: ti.i32):
  for i in range(n_grid): # Parallelized over GPU threads
    for j in range(n_grid):
      laplacian_p = laplacian(t - 2, i, j)
      laplacian_q = laplacian(t - 1, i, j)
      p[t, i, j] = 2 * p[t - 1, i, j] + (
          c * c * dt * dt + c * alpha * dt) * laplacian_q - p[
                     t - 2, i, j] - c * alpha * dt * laplacian_p

@ti.kernel
def compute_loss(t: ti.i32):
  for i in range(n_grid):
    for j in range(n_grid):
      ti.atomic_add(loss, dx * dx * ti.sqr(target[i, j] - p[t, i, j]))

@ti.kernel
def apply_grad():
  # gradient descent
  for i, j in initial.grad:
    initial[i, j] -= learning_rate * initial.grad[i, j]

def forward(output=None):
  steps_mul = 1
  interval = vis_interval
  if output:
    os.makedirs(output, exist_ok=True)
    steps_mul = 2
    interval = output_vis_interval
  initialize()
  for t in range(2, steps * steps_mul):
    fdtd(t)
    if (t + 1) % interval == 0:
      img = np.zeros(shape=(n_grid, n_grid), dtype=np.float32)
      for i in range(n_grid):
        for j in range(n_grid): img[i, j] = p[t, i, j] * amplify + 0.5
      img = cv2.resize(img, fx=4, fy=4, dsize=None)
      cv2.imshow('img', img)
      cv2.waitKey(1)
      if output:
        img = np.clip(img, 0, 255)
        cv2.imwrite(output + "/{:04d}.png".format(t), img * 255)
  compute_loss(steps - 1)

def main():
  # initialization
  target_img = cv2.imread('taichi.png')[:,:,0] / 255.0
  target_img -= target_img.mean()
  target_img = cv2.resize(target_img, (n_grid, n_grid))
  cv2.imshow('target', target_img * amplify + 0.5)
  # print(target_img.min(), target_img.max())
  for i in range(n_grid):
    for j in range(n_grid):
      target[i, j] = float(target_img[i, j])

  if False:
    # this is not too exciting...
    initial[n_grid // 2, n_grid // 2] = -2
    forward('center')
    initial[n_grid // 2, n_grid // 2] = 0

  for opt in range(200):
    with ti.Tape(loss):
      output = None
      if opt % 20 == 19:
        output = 'wave/iter{:03d}/'.format(opt)
      forward(output)

    print('Iter', opt, ' Loss =', loss[None])

    apply_grad()

  forward('optimized')

if __name__ == '__main__':
  main()

算法一共迭代了 200 步,每 20 步本地保存一次。执行下述代码wave_rander.py,将渲染出的图片合成为视频。

import os

if __name__ == '__main__':
    for i in range(19, 200, 20):
        name = "iter%03d" % (i)
        for j in range(3, 512, 2):
            inputfile = name+"/%04d.png" % (j)
            outputfile = name+"/%04d.png" % (j//2)
            os.system("mv %s %s" % (inputfile, outputfile))
        os.system("ffmpeg %s.mp4 -i '%s/%%04d.png' -start_number 0001 -vf fps=5 -r 5 -pattern_type glob" % (name, name))

结果保存在wave/目录下,iter019.mp4~iter199.mp4分别保存了 19 次迭代后的结果。

iter019iter059iter199