7a8f2d3892fb9488d2f41d12ce4017e9dd5b6c80
[openwrt/staging/stintel.git] /
1 From 0962d9e4a57c3faa29000c63c1cb28ad39a6c80c Mon Sep 17 00:00:00 2001
2 From: John Cox <jc@kynesim.co.uk>
3 Date: Thu, 21 May 2020 11:49:37 +0100
4 Subject: [PATCH] media: rpivid: Remove the need to have
5 num_entry_points set
6
7 VAAPI H265 has num entry points but never sets it. Allow a VAAPI
8 shim to work without requiring rewriting the VAAPI driver.
9 num_entry_points can be calculated from the slice_segment_addr
10 of the next slice so delay processing until we have that.
11
12 Also includes some minor cosmetics.
13
14 Signed-off-by: John Cox <jc@kynesim.co.uk>
15 ---
16 drivers/staging/media/rpivid/rpivid_h265.c | 699 +++++++++++----------
17 1 file changed, 365 insertions(+), 334 deletions(-)
18
19 --- a/drivers/staging/media/rpivid/rpivid_h265.c
20 +++ b/drivers/staging/media/rpivid/rpivid_h265.c
21 @@ -202,8 +202,17 @@ struct rpivid_dec_env {
22 unsigned int dpbno_col;
23 u32 reg_slicestart;
24 int collocated_from_l0_flag;
25 - unsigned int wpp_entry_x;
26 - unsigned int wpp_entry_y;
27 + /*
28 + * Last CTB/Tile X,Y processed by (wpp_)entry_point
29 + * Could be in _state as P0 only but needs updating where _state
30 + * is const
31 + */
32 + unsigned int entry_ctb_x;
33 + unsigned int entry_ctb_y;
34 + unsigned int entry_tile_x;
35 + unsigned int entry_tile_y;
36 + unsigned int entry_qp;
37 + u32 entry_slice;
38
39 u32 rpi_config2;
40 u32 rpi_framesize;
41 @@ -239,22 +248,17 @@ struct rpivid_dec_state {
42 struct v4l2_ctrl_hevc_pps pps;
43
44 // Helper vars & tables derived from sps/pps
45 - unsigned int log2_ctb_size; /* log2 width of a CTB */
46 - unsigned int ctb_width; /* Width in CTBs */
47 - unsigned int ctb_height; /* Height in CTBs */
48 - unsigned int ctb_size; /* Pic area in CTBs */
49 - unsigned int num_tile_columns;
50 - unsigned int num_tile_rows;
51 - u8 column_width[member_size(struct v4l2_ctrl_hevc_pps,
52 - column_width_minus1)];
53 - u8 row_height[member_size(struct v4l2_ctrl_hevc_pps,
54 - row_height_minus1)];
55 + unsigned int log2_ctb_size; /* log2 width of a CTB */
56 + unsigned int ctb_width; /* Width in CTBs */
57 + unsigned int ctb_height; /* Height in CTBs */
58 + unsigned int ctb_size; /* Pic area in CTBs */
59 + unsigned int tile_width; /* Width in tiles */
60 + unsigned int tile_height; /* Height in tiles */
61
62 int *col_bd;
63 int *row_bd;
64 int *ctb_addr_rs_to_ts;
65 int *ctb_addr_ts_to_rs;
66 - int *tile_id;
67
68 // Aux starage for DPB
69 // Hold refs
70 @@ -274,6 +278,12 @@ struct rpivid_dec_state {
71 unsigned int slice_qp;
72 unsigned int max_num_merge_cand; // 0 if I-slice
73 bool dependent_slice_segment_flag;
74 +
75 + unsigned int start_ts; /* slice_segment_addr -> ts */
76 + unsigned int start_ctb_x; /* CTB X,Y of start_ts */
77 + unsigned int start_ctb_y;
78 + unsigned int prev_ctb_x; /* CTB X,Y of start_ts - 1 */
79 + unsigned int prev_ctb_y;
80 };
81
82 static inline int clip_int(const int x, const int lo, const int hi)
83 @@ -319,15 +329,16 @@ static int ctb_to_tile(unsigned int ctb,
84 return i - 1;
85 }
86
87 -static int ctb_to_slice_w_h(unsigned int ctb, int ctb_size, int width,
88 - unsigned int *bd, int num)
89 +static unsigned int ctb_to_tile_x(const struct rpivid_dec_state *const s,
90 + const unsigned int ctb_x)
91 {
92 - if (ctb < bd[num - 1])
93 - return ctb_size;
94 - else if (width % ctb_size)
95 - return width % ctb_size;
96 - else
97 - return ctb_size;
98 + return ctb_to_tile(ctb_x, s->col_bd, s->tile_width);
99 +}
100 +
101 +static unsigned int ctb_to_tile_y(const struct rpivid_dec_state *const s,
102 + const unsigned int ctb_y)
103 +{
104 + return ctb_to_tile(ctb_y, s->row_bd, s->tile_height);
105 }
106
107 static void aux_q_free(struct rpivid_ctx *const ctx,
108 @@ -532,6 +543,15 @@ static void write_prob(struct rpivid_dec
109 p1_apb_write(de, 0x1000 + i,
110 dst[i] + (dst[i + 1] << 8) + (dst[i + 2] << 16) +
111 (dst[i + 3] << 24));
112 +
113 + /*
114 + * Having written the prob array, back it up
115 + * This is not always needed but is a small overhead that simplifies
116 + * (and speeds up) some multi-tile & WPP scenarios
117 + * There are no scenarios where having written a prob we ever want
118 + * a previous (non-initial) state back
119 + */
120 + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
121 }
122
123 static void write_scaling_factors(struct rpivid_dec_env *const de)
124 @@ -552,8 +572,8 @@ static inline __u32 dma_to_axi_addr(dma_
125 static void write_bitstream(struct rpivid_dec_env *const de,
126 const struct rpivid_dec_state *const s)
127 {
128 - // Note that FFmpeg removes emulation prevention bytes, so this is
129 - // matched in the configuration here.
130 + // Note that FFmpeg V4L2 does not remove emulation prevention bytes,
131 + // so this is matched in the configuration here.
132 // Whether that is the correct behaviour or not is not clear in the
133 // spec.
134 const int rpi_use_emu = 1;
135 @@ -579,78 +599,26 @@ static void write_bitstream(struct rpivi
136
137 //////////////////////////////////////////////////////////////////////////////
138
139 -static void write_slice(struct rpivid_dec_env *const de,
140 - const struct rpivid_dec_state *const s,
141 - const unsigned int slice_w,
142 - const unsigned int slice_h)
143 -{
144 - u32 u32 = (s->sh->slice_type << 12) +
145 - (((s->sh->flags &
146 - V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA) != 0)
147 - << 14) +
148 - (((s->sh->flags &
149 - V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA) != 0)
150 - << 15) +
151 - (slice_w << 17) + (slice_h << 24);
152 -
153 - u32 |= (s->max_num_merge_cand << 0) + (s->nb_refs[L0] << 4) +
154 - (s->nb_refs[L1] << 8);
155 -
156 - if (s->sh->slice_type == HEVC_SLICE_B)
157 - u32 |= ((s->sh->flags &
158 - V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO) != 0)
159 - << 16;
160 - p1_apb_write(de, RPI_SLICE, u32);
161 -}
162 -
163 -//////////////////////////////////////////////////////////////////////////////
164 -// Tiles mode
165 -
166 -static void new_entry_point(struct rpivid_dec_env *const de,
167 - const struct rpivid_dec_state *const s,
168 - const int do_bte,
169 - const int reset_qp_y, const int ctb_addr_ts)
170 +/*
171 + * The slice constant part of the slice register - width and height need to
172 + * be ORed in later as they are per-tile / WPP-row
173 + */
174 +static u32 slice_reg_const(const struct rpivid_dec_state *const s)
175 {
176 - int ctb_col = s->ctb_addr_ts_to_rs[ctb_addr_ts] %
177 - de->pic_width_in_ctbs_y;
178 - int ctb_row = s->ctb_addr_ts_to_rs[ctb_addr_ts] /
179 - de->pic_width_in_ctbs_y;
180 -
181 - int tile_x = ctb_to_tile(ctb_col, s->col_bd, s->num_tile_columns);
182 - int tile_y = ctb_to_tile(ctb_row, s->row_bd, s->num_tile_rows);
183 -
184 - int endx = s->col_bd[tile_x + 1] - 1;
185 - int endy = s->row_bd[tile_y + 1] - 1;
186 -
187 - u8 slice_w = ctb_to_slice_w_h(ctb_col, 1 << s->log2_ctb_size,
188 - s->sps.pic_width_in_luma_samples,
189 - s->col_bd, s->num_tile_columns);
190 - u8 slice_h = ctb_to_slice_w_h(ctb_row, 1 << s->log2_ctb_size,
191 - s->sps.pic_height_in_luma_samples,
192 - s->row_bd, s->num_tile_rows);
193 -
194 - p1_apb_write(de, RPI_TILESTART,
195 - s->col_bd[tile_x] + (s->row_bd[tile_y] << 16));
196 - p1_apb_write(de, RPI_TILEEND, endx + (endy << 16));
197 -
198 - if (do_bte)
199 - p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy << 16));
200 + u32 x = (s->max_num_merge_cand << 0) |
201 + (s->nb_refs[L0] << 4) |
202 + (s->nb_refs[L1] << 8) |
203 + (s->sh->slice_type << 12);
204 +
205 + if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA)
206 + x |= BIT(14);
207 + if (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA)
208 + x |= BIT(15);
209 + if (s->sh->slice_type == HEVC_SLICE_B &&
210 + (s->sh->flags & V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO))
211 + x |= BIT(16);
212
213 - write_slice(de, s, slice_w, slice_h);
214 -
215 - if (reset_qp_y) {
216 - unsigned int sps_qp_bd_offset =
217 - 6 * s->sps.bit_depth_luma_minus8;
218 -
219 - p1_apb_write(de, RPI_QP, sps_qp_bd_offset + s->slice_qp);
220 - }
221 -
222 - p1_apb_write(de, RPI_MODE,
223 - (0xFFFF << 0) + (0x0 << 16) +
224 - ((tile_x == s->num_tile_columns - 1) << 17) +
225 - ((tile_y == s->num_tile_rows - 1) << 18));
226 -
227 - p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) + (ctb_row << 16));
228 + return x;
229 }
230
231 //////////////////////////////////////////////////////////////////////////////
232 @@ -934,197 +902,256 @@ static void pre_slice_decode(struct rpiv
233 (sh->slice_cb_qp_offset & 31)); // CMD_QPOFF
234 }
235
236 -//////////////////////////////////////////////////////////////////////////////
237 -// Write STATUS register with expected end CTU address of previous slice
238 -
239 -static void end_previous_slice(struct rpivid_dec_env *const de,
240 - const struct rpivid_dec_state *const s,
241 - const int ctb_addr_ts)
242 -{
243 - int last_x =
244 - s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] % de->pic_width_in_ctbs_y;
245 - int last_y =
246 - s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] / de->pic_width_in_ctbs_y;
247 -
248 - p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18));
249 -}
250 -
251 -static void wpp_pause(struct rpivid_dec_env *const de, int ctb_row)
252 -{
253 - p1_apb_write(de, RPI_STATUS, (ctb_row << 18) + 0x25);
254 - p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
255 - p1_apb_write(de, RPI_MODE,
256 - ctb_row == de->pic_height_in_ctbs_y - 1 ?
257 - 0x70000 : 0x30000);
258 - p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2);
259 -}
260 -
261 -static void wpp_end_previous_slice(struct rpivid_dec_env *const de,
262 - const struct rpivid_dec_state *const s,
263 - int ctb_addr_ts)
264 -{
265 - int new_x = s->sh->slice_segment_addr % de->pic_width_in_ctbs_y;
266 - int new_y = s->sh->slice_segment_addr / de->pic_width_in_ctbs_y;
267 - int last_x =
268 - s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] % de->pic_width_in_ctbs_y;
269 - int last_y =
270 - s->ctb_addr_ts_to_rs[ctb_addr_ts - 1] / de->pic_width_in_ctbs_y;
271 -
272 - if (de->wpp_entry_x < 2 && (de->wpp_entry_y < new_y || new_x > 2) &&
273 - de->pic_width_in_ctbs_y > 2)
274 - wpp_pause(de, last_y);
275 - p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18));
276 - if (new_x == 2 || (de->pic_width_in_ctbs_y == 2 &&
277 - de->wpp_entry_y < new_y))
278 - p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
279 +static void write_slice(struct rpivid_dec_env *const de,
280 + const struct rpivid_dec_state *const s,
281 + const u32 slice_const,
282 + const unsigned int ctb_col,
283 + const unsigned int ctb_row)
284 +{
285 + const unsigned int cs = (1 << s->log2_ctb_size);
286 + const unsigned int w_last = s->sps.pic_width_in_luma_samples & (cs - 1);
287 + const unsigned int h_last = s->sps.pic_height_in_luma_samples & (cs - 1);
288 +
289 + p1_apb_write(de, RPI_SLICE,
290 + slice_const |
291 + ((ctb_col + 1 < s->ctb_width || !w_last ?
292 + cs : w_last) << 17) |
293 + ((ctb_row + 1 < s->ctb_height || !h_last ?
294 + cs : h_last) << 24));
295 }
296
297 -//////////////////////////////////////////////////////////////////////////////
298 -// Wavefront mode
299 +#define PAUSE_MODE_WPP 1
300 +#define PAUSE_MODE_TILE 0xffff
301
302 -static void wpp_entry_point(struct rpivid_dec_env *const de,
303 +/*
304 + * N.B. This can be called to fill in data from the previous slice so must not
305 + * use any state data that may change from slice to slice (e.g. qp)
306 + */
307 +static void new_entry_point(struct rpivid_dec_env *const de,
308 const struct rpivid_dec_state *const s,
309 - const int do_bte,
310 - const int reset_qp_y, const int ctb_addr_ts)
311 -{
312 - int ctb_size = 1 << s->log2_ctb_size;
313 - int ctb_addr_rs = s->ctb_addr_ts_to_rs[ctb_addr_ts];
314 -
315 - int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->pic_width_in_ctbs_y;
316 - int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->pic_width_in_ctbs_y;
317 + const bool do_bte,
318 + const bool reset_qp_y,
319 + const u32 pause_mode,
320 + const unsigned int tile_x,
321 + const unsigned int tile_y,
322 + const unsigned int ctb_col,
323 + const unsigned int ctb_row,
324 + const unsigned int slice_qp,
325 + const u32 slice_const)
326 +{
327 + const unsigned int endx = s->col_bd[tile_x + 1] - 1;
328 + const unsigned int endy = (pause_mode == PAUSE_MODE_WPP) ?
329 + ctb_row : s->row_bd[tile_y + 1] - 1;
330
331 - int endx = de->pic_width_in_ctbs_y - 1;
332 - int endy = ctb_row;
333 -
334 - u8 slice_w = ctb_to_slice_w_h(ctb_col, ctb_size,
335 - s->sps.pic_width_in_luma_samples,
336 - s->col_bd, s->num_tile_columns);
337 - u8 slice_h = ctb_to_slice_w_h(ctb_row, ctb_size,
338 - s->sps.pic_height_in_luma_samples,
339 - s->row_bd, s->num_tile_rows);
340 -
341 - p1_apb_write(de, RPI_TILESTART, 0);
342 - p1_apb_write(de, RPI_TILEEND, endx + (endy << 16));
343 + p1_apb_write(de, RPI_TILESTART,
344 + s->col_bd[tile_x] | (s->row_bd[tile_y] << 16));
345 + p1_apb_write(de, RPI_TILEEND, endx | (endy << 16));
346
347 if (do_bte)
348 - p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy << 16));
349 + p1_apb_write(de, RPI_BEGINTILEEND, endx | (endy << 16));
350
351 - write_slice(de, s, slice_w,
352 - ctb_row == de->pic_height_in_ctbs_y - 1 ?
353 - slice_h : ctb_size);
354 + write_slice(de, s, slice_const, endx, endy);
355
356 if (reset_qp_y) {
357 unsigned int sps_qp_bd_offset =
358 6 * s->sps.bit_depth_luma_minus8;
359
360 - p1_apb_write(de, RPI_QP, sps_qp_bd_offset + s->slice_qp);
361 + p1_apb_write(de, RPI_QP, sps_qp_bd_offset + slice_qp);
362 }
363
364 p1_apb_write(de, RPI_MODE,
365 - ctb_row == de->pic_height_in_ctbs_y - 1 ?
366 - 0x60001 : 0x20001);
367 - p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) + (ctb_row << 16));
368 + pause_mode |
369 + ((endx == s->ctb_width - 1) << 17) |
370 + ((endy == s->ctb_height - 1) << 18));
371 +
372 + p1_apb_write(de, RPI_CONTROL, (ctb_col << 0) | (ctb_row << 16));
373 +
374 + de->entry_tile_x = tile_x;
375 + de->entry_tile_y = tile_y;
376 + de->entry_ctb_x = ctb_col;
377 + de->entry_ctb_y = ctb_row;
378 + de->entry_qp = slice_qp;
379 + de->entry_slice = slice_const;
380 }
381
382 //////////////////////////////////////////////////////////////////////////////
383 // Wavefront mode
384
385 +static void wpp_pause(struct rpivid_dec_env *const de, int ctb_row)
386 +{
387 + p1_apb_write(de, RPI_STATUS, (ctb_row << 18) | 0x25);
388 + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
389 + p1_apb_write(de, RPI_MODE,
390 + ctb_row == de->pic_height_in_ctbs_y - 1 ?
391 + 0x70000 : 0x30000);
392 + p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2);
393 +}
394 +
395 +static void wpp_entry_fill(struct rpivid_dec_env *const de,
396 + const struct rpivid_dec_state *const s,
397 + const unsigned int last_y)
398 +{
399 + const unsigned int last_x = s->ctb_width - 1;
400 +
401 + while (de->entry_ctb_y < last_y) {
402 + /* entry_ctb_x/y set by new_entry_point */
403 + if (s->ctb_width > 2)
404 + wpp_pause(de, de->entry_ctb_y);
405 + p1_apb_write(de, RPI_STATUS,
406 + (de->entry_ctb_y << 18) | (last_x << 5) | 2);
407 +
408 + /* if width == 1 then the saved state is the init one */
409 + if (s->ctb_width == 2)
410 + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
411 + else
412 + p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
413 +
414 + new_entry_point(de, s, false, true, PAUSE_MODE_WPP,
415 + 0, 0, 0, de->entry_ctb_y + 1,
416 + de->entry_qp, de->entry_slice);
417 + }
418 +}
419 +
420 +static void wpp_end_previous_slice(struct rpivid_dec_env *const de,
421 + const struct rpivid_dec_state *const s)
422 +{
423 + wpp_entry_fill(de, s, s->prev_ctb_y);
424 +
425 + if (de->entry_ctb_x < 2 &&
426 + (de->entry_ctb_y < s->start_ctb_y || s->start_ctb_x > 2) &&
427 + s->ctb_width > 2)
428 + wpp_pause(de, s->prev_ctb_y);
429 + p1_apb_write(de, RPI_STATUS,
430 + 1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18));
431 + if (s->start_ctb_x == 2 ||
432 + (s->ctb_width == 2 && de->entry_ctb_y < s->start_ctb_y))
433 + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
434 +}
435 +
436 +/* Only main profile supported so WPP => !Tiles which makes some of the
437 + * next chunk code simpler
438 + */
439 static void wpp_decode_slice(struct rpivid_dec_env *const de,
440 - const struct rpivid_dec_state *const s,
441 - const struct v4l2_ctrl_hevc_slice_params *sh,
442 - int ctb_addr_ts)
443 -{
444 - int i, reset_qp_y = 1;
445 - int indep = !s->dependent_slice_segment_flag;
446 - int ctb_col = s->sh->slice_segment_addr % de->pic_width_in_ctbs_y;
447 + const struct rpivid_dec_state *const s)
448 +{
449 + bool reset_qp_y = true;
450 + const bool indep = !s->dependent_slice_segment_flag;
451
452 - if (ctb_addr_ts)
453 - wpp_end_previous_slice(de, s, ctb_addr_ts);
454 + if (s->start_ts)
455 + wpp_end_previous_slice(de, s);
456 pre_slice_decode(de, s);
457 write_bitstream(de, s);
458 - if (ctb_addr_ts == 0 || indep || de->pic_width_in_ctbs_y == 1)
459 +
460 + if (!s->start_ts || indep || s->ctb_width == 1)
461 write_prob(de, s);
462 - else if (ctb_col == 0)
463 + else if (!s->start_ctb_x)
464 p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
465 else
466 - reset_qp_y = 0;
467 + reset_qp_y = false;
468 +
469 program_slicecmds(de, s->slice_idx);
470 new_slice_segment(de, s);
471 - wpp_entry_point(de, s, indep, reset_qp_y, ctb_addr_ts);
472 + new_entry_point(de, s, indep, reset_qp_y, PAUSE_MODE_WPP,
473 + 0, 0, s->start_ctb_x, s->start_ctb_y,
474 + s->slice_qp, slice_reg_const(s));
475
476 - for (i = 0; i < s->sh->num_entry_point_offsets; i++) {
477 - int ctb_addr_rs = s->ctb_addr_ts_to_rs[ctb_addr_ts];
478 - int ctb_row = ctb_addr_rs / de->pic_width_in_ctbs_y;
479 - int last_x = de->pic_width_in_ctbs_y - 1;
480 + if (s->frame_end) {
481 + wpp_entry_fill(de, s, s->ctb_height - 1);
482 +
483 + if (de->entry_ctb_x < 2 && s->ctb_width > 2)
484 + wpp_pause(de, s->ctb_height - 1);
485
486 - if (de->pic_width_in_ctbs_y > 2)
487 - wpp_pause(de, ctb_row);
488 p1_apb_write(de, RPI_STATUS,
489 - (ctb_row << 18) + (last_x << 5) + 2);
490 - if (de->pic_width_in_ctbs_y == 2)
491 - p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
492 - if (de->pic_width_in_ctbs_y == 1)
493 - write_prob(de, s);
494 - else
495 - p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
496 - ctb_addr_ts += s->column_width[0];
497 - wpp_entry_point(de, s, 0, 1, ctb_addr_ts);
498 + 1 | ((s->ctb_width - 1) << 5) |
499 + ((s->ctb_height - 1) << 18));
500 }
501 +
502 }
503
504 //////////////////////////////////////////////////////////////////////////////
505 // Tiles mode
506
507 +static void tile_entry_fill(struct rpivid_dec_env *const de,
508 + const struct rpivid_dec_state *const s,
509 + const unsigned int last_tile_x,
510 + const unsigned int last_tile_y)
511 +{
512 + while (de->entry_tile_y < last_tile_y ||
513 + (de->entry_tile_y == last_tile_y &&
514 + de->entry_tile_x < last_tile_x)) {
515 + unsigned int t_x = de->entry_tile_x;
516 + unsigned int t_y = de->entry_tile_y;
517 + const unsigned int last_x = s->col_bd[t_x + 1] - 1;
518 + const unsigned int last_y = s->row_bd[t_y + 1] - 1;
519 +
520 + p1_apb_write(de, RPI_STATUS,
521 + 2 | (last_x << 5) | (last_y << 18));
522 + p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
523 +
524 + // Inc tile
525 + if (++t_x >= s->tile_width) {
526 + t_x = 0;
527 + ++t_y;
528 + }
529 +
530 + new_entry_point(de, s, false, true, PAUSE_MODE_TILE,
531 + t_x, t_y, s->col_bd[t_x], s->row_bd[t_y],
532 + de->entry_qp, de->entry_slice);
533 + }
534 +}
535 +
536 +/*
537 + * Write STATUS register with expected end CTU address of previous slice
538 + */
539 +static void end_previous_slice(struct rpivid_dec_env *const de,
540 + const struct rpivid_dec_state *const s)
541 +{
542 + tile_entry_fill(de, s,
543 + ctb_to_tile_x(s, s->prev_ctb_x),
544 + ctb_to_tile_y(s, s->prev_ctb_y));
545 + p1_apb_write(de, RPI_STATUS,
546 + 1 | (s->prev_ctb_x << 5) | (s->prev_ctb_y << 18));
547 +}
548 +
549 static void decode_slice(struct rpivid_dec_env *const de,
550 - const struct rpivid_dec_state *const s,
551 - const struct v4l2_ctrl_hevc_slice_params *const sh,
552 - int ctb_addr_ts)
553 + const struct rpivid_dec_state *const s)
554 {
555 - int i, reset_qp_y;
556 + bool reset_qp_y;
557 + unsigned int tile_x = ctb_to_tile_x(s, s->start_ctb_x);
558 + unsigned int tile_y = ctb_to_tile_y(s, s->start_ctb_y);
559
560 - if (ctb_addr_ts)
561 - end_previous_slice(de, s, ctb_addr_ts);
562 + if (s->start_ts)
563 + end_previous_slice(de, s);
564
565 pre_slice_decode(de, s);
566 write_bitstream(de, s);
567
568 -#if DEBUG_TRACE_P1_CMD
569 - if (p1_z < 256) {
570 - v4l2_info(&de->ctx->dev->v4l2_dev,
571 - "TS=%d, tile=%d/%d, dss=%d, flags=%#llx\n",
572 - ctb_addr_ts, s->tile_id[ctb_addr_ts],
573 - s->tile_id[ctb_addr_ts - 1],
574 - s->dependent_slice_segment_flag, sh->flags);
575 - }
576 -#endif
577 -
578 - reset_qp_y = ctb_addr_ts == 0 ||
579 - s->tile_id[ctb_addr_ts] != s->tile_id[ctb_addr_ts - 1] ||
580 - !s->dependent_slice_segment_flag;
581 + reset_qp_y = !s->start_ts ||
582 + !s->dependent_slice_segment_flag ||
583 + tile_x != ctb_to_tile_x(s, s->prev_ctb_x) ||
584 + tile_y != ctb_to_tile_y(s, s->prev_ctb_y);
585 if (reset_qp_y)
586 write_prob(de, s);
587
588 program_slicecmds(de, s->slice_idx);
589 new_slice_segment(de, s);
590 new_entry_point(de, s, !s->dependent_slice_segment_flag, reset_qp_y,
591 - ctb_addr_ts);
592 -
593 - for (i = 0; i < s->sh->num_entry_point_offsets; i++) {
594 - int ctb_addr_rs = s->ctb_addr_ts_to_rs[ctb_addr_ts];
595 - int ctb_col = ctb_addr_rs % de->pic_width_in_ctbs_y;
596 - int ctb_row = ctb_addr_rs / de->pic_width_in_ctbs_y;
597 - int tile_x = ctb_to_tile(ctb_col, s->col_bd,
598 - s->num_tile_columns - 1);
599 - int tile_y =
600 - ctb_to_tile(ctb_row, s->row_bd, s->num_tile_rows - 1);
601 - int last_x = s->col_bd[tile_x + 1] - 1;
602 - int last_y = s->row_bd[tile_y + 1] - 1;
603 + PAUSE_MODE_TILE,
604 + tile_x, tile_y, s->start_ctb_x, s->start_ctb_y,
605 + s->slice_qp, slice_reg_const(s));
606
607 + /*
608 + * If this is the last slice then fill in the other tile entries
609 + * now, otherwise this will be done at the start of the next slice
610 + * when it will be known where this slice finishes
611 + */
612 + if (s->frame_end) {
613 + tile_entry_fill(de, s,
614 + s->tile_width - 1,
615 + s->tile_height - 1);
616 p1_apb_write(de, RPI_STATUS,
617 - 2 + (last_x << 5) + (last_y << 18));
618 - write_prob(de, s);
619 - ctb_addr_ts += s->column_width[tile_x] * s->row_height[tile_y];
620 - new_entry_point(de, s, 0, 1, ctb_addr_ts);
621 + 1 | ((s->ctb_width - 1) << 5) |
622 + ((s->ctb_height - 1) << 18));
623 }
624 }
625
626 @@ -1132,13 +1159,12 @@ static void decode_slice(struct rpivid_d
627 // Scaling factors
628
629 static void expand_scaling_list(const unsigned int size_id,
630 - const unsigned int matrix_id, u8 *const dst0,
631 + u8 *const dst0,
632 const u8 *const src0, uint8_t dc)
633 {
634 u8 *d;
635 unsigned int x, y;
636
637 - // FIXME: matrix_id is unused ?
638 switch (size_id) {
639 case 0:
640 memcpy(dst0, src0, 16);
641 @@ -1199,24 +1225,20 @@ static void populate_scaling_factors(con
642 unsigned int mid;
643
644 for (mid = 0; mid < 6; mid++)
645 - expand_scaling_list(0, mid,
646 - de->scaling_factors +
647 + expand_scaling_list(0, de->scaling_factors +
648 scaling_factor_offsets[0][mid],
649 sl->scaling_list_4x4[mid], 0);
650 for (mid = 0; mid < 6; mid++)
651 - expand_scaling_list(1, mid,
652 - de->scaling_factors +
653 + expand_scaling_list(1, de->scaling_factors +
654 scaling_factor_offsets[1][mid],
655 sl->scaling_list_8x8[mid], 0);
656 for (mid = 0; mid < 6; mid++)
657 - expand_scaling_list(2, mid,
658 - de->scaling_factors +
659 + expand_scaling_list(2, de->scaling_factors +
660 scaling_factor_offsets[2][mid],
661 sl->scaling_list_16x16[mid],
662 sl->scaling_list_dc_coef_16x16[mid]);
663 - for (mid = 0; mid < 2; mid += 1)
664 - expand_scaling_list(3, mid,
665 - de->scaling_factors +
666 + for (mid = 0; mid < 2; mid++)
667 + expand_scaling_list(3, de->scaling_factors +
668 scaling_factor_offsets[3][mid],
669 sl->scaling_list_32x32[mid],
670 sl->scaling_list_dc_coef_32x32[mid]);
671 @@ -1228,8 +1250,6 @@ static void free_ps_info(struct rpivid_d
672 s->ctb_addr_rs_to_ts = NULL;
673 kfree(s->ctb_addr_ts_to_rs);
674 s->ctb_addr_ts_to_rs = NULL;
675 - kfree(s->tile_id);
676 - s->tile_id = NULL;
677
678 kfree(s->col_bd);
679 s->col_bd = NULL;
680 @@ -1237,10 +1257,52 @@ static void free_ps_info(struct rpivid_d
681 s->row_bd = NULL;
682 }
683
684 +static unsigned int tile_width(const struct rpivid_dec_state *const s,
685 + const unsigned int t_x)
686 +{
687 + return s->col_bd[t_x + 1] - s->col_bd[t_x];
688 +}
689 +
690 +static unsigned int tile_height(const struct rpivid_dec_state *const s,
691 + const unsigned int t_y)
692 +{
693 + return s->row_bd[t_y + 1] - s->row_bd[t_y];
694 +}
695 +
696 +static void fill_rs_to_ts(struct rpivid_dec_state *const s)
697 +{
698 + unsigned int ts = 0;
699 + unsigned int t_y;
700 + unsigned int tr_rs = 0;
701 +
702 + for (t_y = 0; t_y != s->tile_height; ++t_y) {
703 + const unsigned int t_h = tile_height(s, t_y);
704 + unsigned int t_x;
705 + unsigned int tc_rs = tr_rs;
706 +
707 + for (t_x = 0; t_x != s->tile_width; ++t_x) {
708 + const unsigned int t_w = tile_width(s, t_x);
709 + unsigned int y;
710 + unsigned int rs = tc_rs;
711 +
712 + for (y = 0; y != t_h; ++y) {
713 + unsigned int x;
714 +
715 + for (x = 0; x != t_w; ++x) {
716 + s->ctb_addr_rs_to_ts[rs + x] = ts;
717 + s->ctb_addr_ts_to_rs[ts] = rs + x;
718 + ++ts;
719 + }
720 + rs += s->ctb_width;
721 + }
722 + tc_rs += t_w;
723 + }
724 + tr_rs += t_h * s->ctb_width;
725 + }
726 +}
727 +
728 static int updated_ps(struct rpivid_dec_state *const s)
729 {
730 - unsigned int ctb_addr_rs;
731 - int j, x, y, tile_id;
732 unsigned int i;
733
734 free_ps_info(s);
735 @@ -1259,104 +1321,49 @@ static int updated_ps(struct rpivid_dec_
736
737 // Inferred parameters
738
739 + s->ctb_addr_rs_to_ts = kmalloc_array(s->ctb_size,
740 + sizeof(*s->ctb_addr_rs_to_ts),
741 + GFP_KERNEL);
742 + s->ctb_addr_ts_to_rs = kmalloc_array(s->ctb_size,
743 + sizeof(*s->ctb_addr_ts_to_rs),
744 + GFP_KERNEL);
745 +
746 if (!(s->pps.flags & V4L2_HEVC_PPS_FLAG_TILES_ENABLED)) {
747 - s->num_tile_columns = 1;
748 - s->num_tile_rows = 1;
749 - s->column_width[0] = s->ctb_width;
750 - s->row_height[0] = s->ctb_height;
751 + s->tile_width = 1;
752 + s->tile_height = 1;
753 } else {
754 - s->num_tile_columns = s->pps.num_tile_columns_minus1 + 1;
755 - s->num_tile_rows = s->pps.num_tile_rows_minus1 + 1;
756 - for (i = 0; i < s->num_tile_columns; ++i)
757 - s->column_width[i] = s->pps.column_width_minus1[i] + 1;
758 - for (i = 0; i < s->num_tile_rows; ++i)
759 - s->row_height[i] = s->pps.row_height_minus1[i] + 1;
760 + s->tile_width = s->pps.num_tile_columns_minus1 + 1;
761 + s->tile_height = s->pps.num_tile_rows_minus1 + 1;
762 }
763
764 - s->col_bd = kmalloc((s->num_tile_columns + 1) * sizeof(*s->col_bd),
765 + s->col_bd = kmalloc((s->tile_width + 1) * sizeof(*s->col_bd),
766 GFP_KERNEL);
767 - s->row_bd = kmalloc((s->num_tile_rows + 1) * sizeof(*s->row_bd),
768 + s->row_bd = kmalloc((s->tile_height + 1) * sizeof(*s->row_bd),
769 GFP_KERNEL);
770
771 s->col_bd[0] = 0;
772 - for (i = 0; i < s->num_tile_columns; i++)
773 - s->col_bd[i + 1] = s->col_bd[i] + s->column_width[i];
774 + for (i = 1; i < s->tile_width; i++)
775 + s->col_bd[i] = s->col_bd[i - 1] +
776 + s->pps.column_width_minus1[i - 1] + 1;
777 + s->col_bd[s->tile_width] = s->ctb_width;
778
779 s->row_bd[0] = 0;
780 - for (i = 0; i < s->num_tile_rows; i++)
781 - s->row_bd[i + 1] = s->row_bd[i] + s->row_height[i];
782 + for (i = 1; i < s->tile_height; i++)
783 + s->row_bd[i] = s->row_bd[i - 1] +
784 + s->pps.row_height_minus1[i - 1] + 1;
785 + s->row_bd[s->tile_height] = s->ctb_height;
786
787 - s->ctb_addr_rs_to_ts = kmalloc_array(s->ctb_size,
788 - sizeof(*s->ctb_addr_rs_to_ts),
789 - GFP_KERNEL);
790 - s->ctb_addr_ts_to_rs = kmalloc_array(s->ctb_size,
791 - sizeof(*s->ctb_addr_ts_to_rs),
792 - GFP_KERNEL);
793 - s->tile_id = kmalloc_array(s->ctb_size, sizeof(*s->tile_id),
794 - GFP_KERNEL);
795 -
796 - for (ctb_addr_rs = 0; ctb_addr_rs < s->ctb_size; ctb_addr_rs++) {
797 - int tb_x = ctb_addr_rs % s->ctb_width;
798 - int tb_y = ctb_addr_rs / s->ctb_width;
799 - int tile_x = 0;
800 - int tile_y = 0;
801 - int val = 0;
802 -
803 - for (i = 0; i < s->num_tile_columns; i++) {
804 - if (tb_x < s->col_bd[i + 1]) {
805 - tile_x = i;
806 - break;
807 - }
808 - }
809 -
810 - for (i = 0; i < s->num_tile_rows; i++) {
811 - if (tb_y < s->row_bd[i + 1]) {
812 - tile_y = i;
813 - break;
814 - }
815 - }
816 -
817 - for (i = 0; i < tile_x; i++)
818 - val += s->row_height[tile_y] * s->column_width[i];
819 - for (i = 0; i < tile_y; i++)
820 - val += s->ctb_width * s->row_height[i];
821 -
822 - val += (tb_y - s->row_bd[tile_y]) * s->column_width[tile_x] +
823 - tb_x - s->col_bd[tile_x];
824 -
825 - s->ctb_addr_rs_to_ts[ctb_addr_rs] = val;
826 - s->ctb_addr_ts_to_rs[val] = ctb_addr_rs;
827 - }
828 -
829 - for (j = 0, tile_id = 0; j < s->num_tile_rows; j++)
830 - for (i = 0; i < s->num_tile_columns; i++, tile_id++)
831 - for (y = s->row_bd[j]; y < s->row_bd[j + 1]; y++)
832 - for (x = s->col_bd[i];
833 - x < s->col_bd[i + 1];
834 - x++)
835 - s->tile_id[s->ctb_addr_rs_to_ts
836 - [y * s->ctb_width +
837 - x]] = tile_id;
838 + fill_rs_to_ts(s);
839
840 return 0;
841 }
842
843 -static int frame_end(struct rpivid_dev *const dev,
844 - struct rpivid_dec_env *const de,
845 - const struct rpivid_dec_state *const s)
846 -{
847 - const unsigned int last_x = s->col_bd[s->num_tile_columns] - 1;
848 - const unsigned int last_y = s->row_bd[s->num_tile_rows] - 1;
849 - size_t cmd_size;
850 -
851 - if (s->pps.flags & V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED) {
852 - if (de->wpp_entry_x < 2 && de->pic_width_in_ctbs_y > 2)
853 - wpp_pause(de, last_y);
854 - }
855 - p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18));
856 -
857 +static int write_cmd_buffer(struct rpivid_dev *const dev,
858 + struct rpivid_dec_env *const de,
859 + const struct rpivid_dec_state *const s)
860 +{
861 // Copy commands out to dma buf
862 - cmd_size = de->cmd_len * sizeof(de->cmd_fifo[0]);
863 + const size_t cmd_size = de->cmd_len * sizeof(de->cmd_fifo[0]);
864
865 if (!de->cmd_copy_gptr->ptr || cmd_size > de->cmd_copy_gptr->size) {
866 size_t cmd_alloc = round_up_size(cmd_size);
867 @@ -1521,18 +1528,19 @@ static void rpivid_h265_setup(struct rpi
868 struct rpivid_q_aux *dpb_q_aux[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
869 struct rpivid_dec_state *const s = ctx->state;
870 struct vb2_queue *vq;
871 - struct rpivid_dec_env *de;
872 - int ctb_addr_ts;
873 + struct rpivid_dec_env *de = ctx->dec0;
874 + unsigned int prev_rs;
875 unsigned int i;
876 int use_aux;
877 bool slice_temporal_mvp;
878
879 + xtrace_in(dev, de);
880 +
881 pred_weight_table = &sh->pred_weight_table;
882
883 s->frame_end =
884 ((run->src->flags & V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF) == 0);
885
886 - de = ctx->dec0;
887 slice_temporal_mvp = (sh->flags &
888 V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED);
889
890 @@ -1662,6 +1670,13 @@ static void rpivid_h265_setup(struct rpi
891 s->sps.pic_height_in_luma_samples);
892 goto fail;
893 }
894 + if ((s->tile_width != 1 || s->tile_height != 1) &&
895 + (s->pps.flags &
896 + V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED)) {
897 + v4l2_warn(&dev->v4l2_dev,
898 + "Tiles + WPP not supported\n");
899 + goto fail;
900 + }
901
902 // Fill in ref planes with our address s.t. if we mess
903 // up refs somehow then we still have a valid address
904 @@ -1760,15 +1775,24 @@ static void rpivid_h265_setup(struct rpi
905 if (s->sps.flags & V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED)
906 populate_scaling_factors(run, de, s);
907
908 - ctb_addr_ts = s->ctb_addr_rs_to_ts[sh->slice_segment_addr];
909 + // Calc all the random coord info to avoid repeated conversion in/out
910 + s->start_ts = s->ctb_addr_rs_to_ts[sh->slice_segment_addr];
911 + s->start_ctb_x = sh->slice_segment_addr % de->pic_width_in_ctbs_y;
912 + s->start_ctb_y = sh->slice_segment_addr / de->pic_width_in_ctbs_y;
913 + // Last CTB of previous slice
914 + prev_rs = !s->start_ts ? 0 : s->ctb_addr_ts_to_rs[s->start_ts - 1];
915 + s->prev_ctb_x = prev_rs % de->pic_width_in_ctbs_y;
916 + s->prev_ctb_y = prev_rs / de->pic_width_in_ctbs_y;
917
918 if ((s->pps.flags & V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED))
919 - wpp_decode_slice(de, s, sh, ctb_addr_ts);
920 + wpp_decode_slice(de, s);
921 else
922 - decode_slice(de, s, sh, ctb_addr_ts);
923 + decode_slice(de, s);
924
925 - if (!s->frame_end)
926 + if (!s->frame_end) {
927 + xtrace_ok(dev, de);
928 return;
929 + }
930
931 // Frame end
932 memset(dpb_q_aux, 0,
933 @@ -1776,8 +1800,9 @@ static void rpivid_h265_setup(struct rpi
934 /*
935 * Need Aux ents for all (ref) DPB ents if temporal MV could
936 * be enabled for any pic
937 - * ** At the moment we have aux ents for all pics whether or not
938 - * they are ref
939 + * ** At the moment we create aux ents for all pics whether or not
940 + * they are ref - they should then be discarded by the DPB-aux
941 + * garbage collection code
942 */
943 use_aux = ((s->sps.flags &
944 V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED) != 0);
945 @@ -1795,7 +1820,7 @@ static void rpivid_h265_setup(struct rpi
946 }
947
948 // v4l2_info(&dev->v4l2_dev, "rpivid_h265_end of frame\n");
949 - if (frame_end(dev, de, s))
950 + if (write_cmd_buffer(dev, de, s))
951 goto fail;
952
953 for (i = 0; i < sh->num_active_dpb_entries; ++i) {
954 @@ -1876,6 +1901,7 @@ static void rpivid_h265_setup(struct rpi
955 }
956
957 de->state = RPIVID_DECODE_PHASE1;
958 + xtrace_ok(dev, de);
959 return;
960
961 fail:
962 @@ -1883,6 +1909,7 @@ fail:
963 // Actual error reporting happens in Trigger
964 de->state = s->frame_end ? RPIVID_DECODE_ERROR_DONE :
965 RPIVID_DECODE_ERROR_CONTINUE;
966 + xtrace_fail(dev, de);
967 }
968
969 //////////////////////////////////////////////////////////////////////////////
970 @@ -2210,6 +2237,10 @@ static int rpivid_h265_start(struct rpiv
971 size_t pu_alloc;
972 size_t coeff_alloc;
973
974 +#if DEBUG_TRACE_P1_CMD
975 + p1_z = 0;
976 +#endif
977 +
978 // Generate a sanitised WxH for memory alloc
979 // Assume HD if unset
980 if (w == 0)