From: Ben Skeggs Date: Tue, 8 May 2018 20:39:46 +1000 Subject: drm/nouveau/gr/gf100-: virtualise alpha_beta_tables + improve algorithms Git-commit: 43952c6f43106c88b4dcdc99285d92172d8c57cd Patch-mainline: v4.18-rc1 References: FATE#326289 FATE#326079 FATE#326049 FATE#322398 FATE#326166 I haven't yet been able to find a fully programatic way of calculating the same mapping as NVIDIA for GF100-GF119, so the algorithm partially depends on data tables for specific configurations. I couldn't find traces for every possibility, so the algorithm will switch to a mapping similar to what GK104-GM10x use if it encounters one. We did the wrong thing before anyway, so shouldn't matter too much. The algorithm used in the GK104 implementation was ported from NVGPU. Signed-off-by: Ben Skeggs Acked-by: Petr Tesarik --- drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c | 158 +++++++++++++++++---- drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h | 5 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf104.c | 1 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c | 1 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf110.c | 1 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c | 2 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf119.c | 1 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c | 49 ++++++ drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110.c | 1 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110b.c | 1 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk208.c | 1 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk20a.c | 2 drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c | 2 drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c | 10 - drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h | 1 15 files changed, 201 insertions(+), 35 deletions(-) --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c @@ -1163,37 +1163,140 @@ gf100_grctx_generate_rop_mapping(struct nvkm_wr32(device, 0x40780c + (i * 4), data[i]); } +static const u32 +gf100_grctx_alpha_beta_map[17][32] = { + [1] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + }, + [2] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + }, + //XXX: 3 + [4] = { + 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + }, + //XXX: 5 + //XXX: 6 + [7] = { + 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, + }, + [8] = { + 1, 1, 1, + 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, + 7, 7, 7, + }, + //XXX: 9 + //XXX: 10 + [11] = { + 1, 1, + 2, 2, 2, 2, + 3, 3, 3, + 4, 4, 4, 4, + 5, 5, 5, + 6, 6, 6, + 7, 7, 7, 7, + 8, 8, 8, + 9, 9, 9, 9, + 10, 10, + }, + //XXX: 12 + //XXX: 13 + [14] = { + 1, 1, + 2, 2, + 3, 3, 3, + 4, 4, 4, + 5, 5, + 6, 6, 6, + 7, 7, + 8, 8, 8, + 9, 9, + 10, 10, 10, + 11, 11, 11, + 12, 12, + 13, 13, + }, + [15] = { + 1, 1, + 2, 2, + 3, 3, + 4, 4, 4, + 5, 5, + 6, 6, 6, + 7, 7, + 8, 8, + 9, 9, 9, + 10, 10, + 11, 11, 11, + 12, 12, + 13, 13, + 14, 14, + }, + [16] = { + 1, 1, + 2, 2, + 3, 3, + 4, 4, + 5, 5, + 6, 6, 6, + 7, 7, + 8, 8, + 9, 9, + 10, 10, 10, + 11, 11, + 12, 12, + 13, 13, + 14, 14, + 15, 15, + }, +}; + void -gf100_grctx_generate_r406800(struct gf100_gr *gr) +gf100_grctx_generate_alpha_beta_tables(struct gf100_gr *gr) { - struct nvkm_device *device = gr->base.engine.subdev.device; - u64 tpc_mask = 0, tpc_set = 0; - u8 tpcnr[GPC_MAX]; - int gpc, tpc; - int i, a, b; - - memcpy(tpcnr, gr->tpc_nr, sizeof(gr->tpc_nr)); - for (gpc = 0; gpc < gr->gpc_nr; gpc++) - tpc_mask |= ((1ULL << gr->tpc_nr[gpc]) - 1) << (gpc * 8); - - for (i = 0, gpc = -1, b = -1; i < 32; i++) { - a = (i * (gr->tpc_total - 1)) / 32; - if (a != b) { - b = a; - do { - gpc = (gpc + 1) % gr->gpc_nr; - } while (!tpcnr[gpc]); - tpc = gr->tpc_nr[gpc] - tpcnr[gpc]--; + struct nvkm_subdev *subdev = &gr->base.engine.subdev; + struct nvkm_device *device = subdev->device; + int i, gpc; - tpc_set |= 1ULL << ((gpc * 8) + tpc); + for (i = 0; i < 32; i++) { + u32 atarget = gf100_grctx_alpha_beta_map[gr->tpc_total][i]; + u32 abits[GPC_MAX] = {}, amask = 0, bmask = 0; + + if (!atarget) { + nvkm_warn(subdev, "missing alpha/beta mapping table\n"); + atarget = max_t(u32, gr->tpc_total * i / 32, 1); } - nvkm_wr32(device, 0x406800 + (i * 0x20), lower_32_bits(tpc_set)); - nvkm_wr32(device, 0x406c00 + (i * 0x20), lower_32_bits(tpc_set ^ tpc_mask)); - if (gr->gpc_nr > 4) { - nvkm_wr32(device, 0x406804 + (i * 0x20), upper_32_bits(tpc_set)); - nvkm_wr32(device, 0x406c04 + (i * 0x20), upper_32_bits(tpc_set ^ tpc_mask)); + while (atarget) { + for (gpc = 0; atarget && gpc < gr->gpc_nr; gpc++) { + if (abits[gpc] < gr->tpc_nr[gpc]) { + abits[gpc]++; + atarget--; + } + } } + + for (gpc = 0; gpc < gr->gpc_nr; gpc++) { + u32 bbits = gr->tpc_nr[gpc] - abits[gpc]; + amask |= ((1 << abits[gpc]) - 1) << (gpc * 8); + bmask |= ((1 << bbits) - 1) << abits[gpc] << (gpc * 8); + } + + nvkm_wr32(device, 0x406800 + (i * 0x20), amask); + nvkm_wr32(device, 0x406c00 + (i * 0x20), bmask); } } @@ -1243,6 +1346,9 @@ gf100_grctx_generate_floorsweep(struct g func->r4060a8(gr); func->rop_mapping(gr); + + if (func->alpha_beta_tables) + func->alpha_beta_tables(gr); } void @@ -1274,7 +1380,6 @@ gf100_grctx_generate_main(struct gf100_g grctx->unkn(gr); gf100_grctx_generate_floorsweep(gr); - gf100_grctx_generate_r406800(gr); gf100_gr_icmd(gr, grctx->icmd); nvkm_wr32(device, 0x404154, idle_timeout); @@ -1426,4 +1531,5 @@ gf100_grctx = { .tpc_nr = gf100_grctx_generate_tpc_nr, .r4060a8 = gf100_grctx_generate_r4060a8, .rop_mapping = gf100_grctx_generate_rop_mapping, + .alpha_beta_tables = gf100_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h @@ -54,6 +54,7 @@ struct gf100_grctx_func { void (*tpc_nr)(struct gf100_gr *, int gpc); void (*r4060a8)(struct gf100_gr *); void (*rop_mapping)(struct gf100_gr *); + void (*alpha_beta_tables)(struct gf100_gr *); }; extern const struct gf100_grctx_func gf100_grctx; @@ -64,11 +65,11 @@ void gf100_grctx_generate_pagepool(struc void gf100_grctx_generate_attrib(struct gf100_grctx *); void gf100_grctx_generate_unkn(struct gf100_gr *); void gf100_grctx_generate_floorsweep(struct gf100_gr *); -void gf100_grctx_generate_r406800(struct gf100_gr *); void gf100_grctx_generate_sm_id(struct gf100_gr *, int, int, int); void gf100_grctx_generate_tpc_nr(struct gf100_gr *, int); void gf100_grctx_generate_r4060a8(struct gf100_gr *); void gf100_grctx_generate_rop_mapping(struct gf100_gr *); +void gf100_grctx_generate_alpha_beta_tables(struct gf100_gr *); extern const struct gf100_grctx_func gf108_grctx; void gf108_grctx_generate_attrib(struct gf100_grctx *); @@ -84,6 +85,8 @@ void gf117_grctx_generate_rop_mapping(st extern const struct gf100_grctx_func gf119_grctx; extern const struct gf100_grctx_func gk104_grctx; +void gk104_grctx_generate_alpha_beta_tables(struct gf100_gr *); + extern const struct gf100_grctx_func gk20a_grctx; void gk104_grctx_generate_main(struct gf100_gr *, struct gf100_grctx *); void gk104_grctx_generate_bundle(struct gf100_grctx *); --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf104.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf104.c @@ -100,4 +100,5 @@ gf104_grctx = { .tpc_nr = gf100_grctx_generate_tpc_nr, .r4060a8 = gf100_grctx_generate_r4060a8, .rop_mapping = gf100_grctx_generate_rop_mapping, + .alpha_beta_tables = gf100_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c @@ -798,4 +798,5 @@ gf108_grctx = { .tpc_nr = gf100_grctx_generate_tpc_nr, .r4060a8 = gf100_grctx_generate_r4060a8, .rop_mapping = gf100_grctx_generate_rop_mapping, + .alpha_beta_tables = gf100_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf110.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf110.c @@ -351,4 +351,5 @@ gf110_grctx = { .tpc_nr = gf100_grctx_generate_tpc_nr, .r4060a8 = gf100_grctx_generate_r4060a8, .rop_mapping = gf100_grctx_generate_rop_mapping, + .alpha_beta_tables = gf100_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c @@ -300,7 +300,6 @@ gf117_grctx_generate_main(struct gf100_g grctx->unkn(gr); gf100_grctx_generate_floorsweep(gr); - gf100_grctx_generate_r406800(gr); for (i = 0; i < 8; i++) nvkm_wr32(device, 0x4064d0 + (i * 0x04), 0x00000000); @@ -335,4 +334,5 @@ gf117_grctx = { .tpc_nr = gf100_grctx_generate_tpc_nr, .r4060a8 = gf100_grctx_generate_r4060a8, .rop_mapping = gf117_grctx_generate_rop_mapping, + .alpha_beta_tables = gf100_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf119.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf119.c @@ -521,4 +521,5 @@ gf119_grctx = { .tpc_nr = gf100_grctx_generate_tpc_nr, .r4060a8 = gf100_grctx_generate_r4060a8, .rop_mapping = gf100_grctx_generate_rop_mapping, + .alpha_beta_tables = gf100_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c @@ -916,7 +916,6 @@ gk104_grctx_generate_main(struct gf100_g grctx->unkn(gr); gf100_grctx_generate_floorsweep(gr); - gf100_grctx_generate_r406800(gr); for (i = 0; i < 8; i++) nvkm_wr32(device, 0x4064d0 + (i * 0x04), 0x00000000); @@ -933,6 +932,53 @@ gk104_grctx_generate_main(struct gf100_g nvkm_mask(device, 0x41be10, 0x00800000, 0x00800000); } +void +gk104_grctx_generate_alpha_beta_tables(struct gf100_gr *gr) +{ + struct nvkm_device *device = gr->base.engine.subdev.device; + int i, j, gpc, ppc; + + for (i = 0; i < 32; i++) { + u32 atarget = max_t(u32, gr->tpc_total * i / 32, 1); + u32 btarget = gr->tpc_total - atarget; + bool alpha = atarget < btarget; + u64 amask = 0, bmask = 0; + + for (gpc = 0; gpc < gr->gpc_nr; gpc++) { + for (ppc = 0; ppc < gr->func->ppc_nr; ppc++) { + u32 ppc_tpcs = gr->ppc_tpc_nr[gpc][ppc]; + u32 abits, bbits, pmask; + + if (alpha) { + abits = atarget ? ppc_tpcs : 0; + bbits = ppc_tpcs - abits; + } else { + bbits = btarget ? ppc_tpcs : 0; + abits = ppc_tpcs - bbits; + } + + pmask = gr->ppc_tpc_mask[gpc][ppc]; + while (ppc_tpcs-- > abits) + pmask &= pmask - 1; + amask |= (u64)pmask << (gpc * 8); + + pmask ^= gr->ppc_tpc_mask[gpc][ppc]; + bmask |= (u64)pmask << (gpc * 8); + + atarget -= min(abits, atarget); + btarget -= min(bbits, btarget); + if ((abits > 0) || (bbits > 0)) + alpha = !alpha; + } + } + + for (j = 0; j < gr->gpc_nr; j += 4, amask >>= 32, bmask >>= 32) { + nvkm_wr32(device, 0x406800 + (i * 0x20) + j, amask); + nvkm_wr32(device, 0x406c00 + (i * 0x20) + j, bmask); + } + } +} + const struct gf100_grctx_func gk104_grctx = { .main = gk104_grctx_generate_main, @@ -959,4 +1005,5 @@ gk104_grctx = { .sm_id = gf100_grctx_generate_sm_id, .tpc_nr = gf100_grctx_generate_tpc_nr, .rop_mapping = gf117_grctx_generate_rop_mapping, + .alpha_beta_tables = gk104_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110.c @@ -834,4 +834,5 @@ gk110_grctx = { .sm_id = gf100_grctx_generate_sm_id, .tpc_nr = gf100_grctx_generate_tpc_nr, .rop_mapping = gf117_grctx_generate_rop_mapping, + .alpha_beta_tables = gk104_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110b.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110b.c @@ -95,4 +95,5 @@ gk110b_grctx = { .sm_id = gf100_grctx_generate_sm_id, .tpc_nr = gf100_grctx_generate_tpc_nr, .rop_mapping = gf117_grctx_generate_rop_mapping, + .alpha_beta_tables = gk104_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk208.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk208.c @@ -556,4 +556,5 @@ gk208_grctx = { .sm_id = gf100_grctx_generate_sm_id, .tpc_nr = gf100_grctx_generate_tpc_nr, .rop_mapping = gf117_grctx_generate_rop_mapping, + .alpha_beta_tables = gk104_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk20a.c @@ -43,7 +43,6 @@ gk20a_grctx_generate_main(struct gf100_g grctx->unkn(gr); gf100_grctx_generate_floorsweep(gr); - gf100_grctx_generate_r406800(gr); for (i = 0; i < 8; i++) nvkm_wr32(device, 0x4064d0 + (i * 0x04), 0x00000000); @@ -83,4 +82,5 @@ gk20a_grctx = { .sm_id = gf100_grctx_generate_sm_id, .tpc_nr = gf100_grctx_generate_tpc_nr, .rop_mapping = gf117_grctx_generate_rop_mapping, + .alpha_beta_tables = gk104_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c @@ -961,7 +961,6 @@ gm107_grctx_generate_main(struct gf100_g grctx->unkn(gr); gf100_grctx_generate_floorsweep(gr); - gf100_grctx_generate_r406800(gr); nvkm_wr32(device, 0x4064d0, 0x00000001); for (i = 1; i < 8; i++) @@ -1005,4 +1004,5 @@ gm107_grctx = { .sm_id = gm107_grctx_generate_sm_id, .tpc_nr = gf100_grctx_generate_tpc_nr, .rop_mapping = gf117_grctx_generate_rop_mapping, + .alpha_beta_tables = gk104_grctx_generate_alpha_beta_tables, }; --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c @@ -1679,10 +1679,12 @@ gf100_gr_oneinit(struct nvkm_gr *base) gr->tpc_total += gr->tpc_nr[i]; gr->ppc_nr[i] = gr->func->ppc_nr; for (j = 0; j < gr->ppc_nr[i]; j++) { - u8 mask = nvkm_rd32(device, GPC_UNIT(i, 0x0c30 + (j * 4))); - if (mask) - gr->ppc_mask[i] |= (1 << j); - gr->ppc_tpc_nr[i][j] = hweight8(mask); + gr->ppc_tpc_mask[i][j] = + nvkm_rd32(device, GPC_UNIT(i, 0x0c30 + (j * 4))); + if (gr->ppc_tpc_mask[i][j] == 0) + continue; + gr->ppc_mask[i] |= (1 << j); + gr->ppc_tpc_nr[i][j] = hweight8(gr->ppc_tpc_mask[i][j]); } } --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h @@ -103,6 +103,7 @@ struct gf100_gr { u8 tpc_total; u8 ppc_nr[GPC_MAX]; u8 ppc_mask[GPC_MAX]; + u8 ppc_tpc_mask[GPC_MAX][4]; u8 ppc_tpc_nr[GPC_MAX][4]; struct gf100_gr_data mmio_data[4];