/* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */
-#ifdef __bfin__
-static inline void lms_adapt_bg(struct oslec_state *ec, int clean, int shift)
-{
- int i;
- int offset1;
- int offset2;
- int factor;
- int exp;
- int16_t *phist;
- int n;
-
- if (shift > 0)
- factor = clean << shift;
- else
- factor = clean >> -shift;
-
- /* Update the FIR taps */
-
- offset2 = ec->curr_pos;
- offset1 = ec->taps - offset2;
- phist = &ec->fir_state_bg.history[offset2];
-
- /* st: and en: help us locate the assembler in echo.s */
-
- /* asm("st:"); */
- n = ec->taps;
- for (i = 0; i < n; i++) {
- exp = *phist++ * factor;
- ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
- }
- /* asm("en:"); */
-
- /* Note the asm for the inner loop above generated by Blackfin gcc
- 4.1.1 is pretty good (note even parallel instructions used):
-
- R0 = W [P0++] (X);
- R0 *= R2;
- R0 = R0 + R3 (NS) ||
- R1 = W [P1] (X) ||
- nop;
- R0 >>>= 15;
- R0 = R0 + R1;
- W [P1++] = R0;
-
- A block based update algorithm would be much faster but the
- above can't be improved on much. Every instruction saved in
- the loop above is 2 MIPs/ch! The for loop above is where the
- Blackfin spends most of it's time - about 17 MIPs/ch measured
- with speedtest.c with 256 taps (32ms). Write-back and
- Write-through cache gave about the same performance.
- */
-}
-
-/*
- IDEAS for further optimisation of lms_adapt_bg():
-
- 1/ The rounding is quite costly. Could we keep as 32 bit coeffs
- then make filter pluck the MS 16-bits of the coeffs when filtering?
- However this would lower potential optimisation of filter, as I
- think the dual-MAC architecture requires packed 16 bit coeffs.
-
- 2/ Block based update would be more efficient, as per comments above,
- could use dual MAC architecture.
-
- 3/ Look for same sample Blackfin LMS code, see if we can get dual-MAC
- packing.
-
- 4/ Execute the whole e/c in a block of say 20ms rather than sample
- by sample. Processing a few samples every ms is inefficient.
-*/
-
-#else
static inline void lms_adapt_bg(struct oslec_state *ec, int clean, int shift)
{
int i;
ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
}
}
-#endif
static inline int top_bit(unsigned int bits)
{
#define _FIR_H_
/*
- Blackfin NOTES & IDEAS:
-
- A simple dot product function is used to implement the filter. This performs
- just one MAC/cycle which is inefficient but was easy to implement as a first
- pass. The current Blackfin code also uses an unrolled form of the filter
- history to avoid 0 length hardware loop issues. This is wasteful of
- memory.
-
Ideas for improvement:
1/ Rewrite filter for dual MAC inner loop. The issue here is handling
fir->taps = taps;
fir->curr_pos = taps - 1;
fir->coeffs = coeffs;
-#if defined(__bfin__)
- fir->history = kcalloc(2 * taps, sizeof(int16_t), GFP_KERNEL);
-#else
fir->history = kcalloc(taps, sizeof(int16_t), GFP_KERNEL);
-#endif
return fir->history;
}
static inline void fir16_flush(struct fir16_state_t *fir)
{
-#if defined(__bfin__)
- memset(fir->history, 0, 2 * fir->taps * sizeof(int16_t));
-#else
memset(fir->history, 0, fir->taps * sizeof(int16_t));
-#endif
}
static inline void fir16_free(struct fir16_state_t *fir)
kfree(fir->history);
}
-#ifdef __bfin__
-static inline int32_t dot_asm(short *x, short *y, int len)
-{
- int dot;
-
- len--;
-
- __asm__("I0 = %1;\n\t"
- "I1 = %2;\n\t"
- "A0 = 0;\n\t"
- "R0.L = W[I0++] || R1.L = W[I1++];\n\t"
- "LOOP dot%= LC0 = %3;\n\t"
- "LOOP_BEGIN dot%=;\n\t"
- "A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t"
- "LOOP_END dot%=;\n\t"
- "A0 += R0.L*R1.L (IS);\n\t"
- "R0 = A0;\n\t"
- "%0 = R0;\n\t"
- : "=&d"(dot)
- : "a"(x), "a"(y), "a"(len)
- : "I0", "I1", "A1", "A0", "R0", "R1"
- );
-
- return dot;
-}
-#endif
-
static inline int16_t fir16(struct fir16_state_t *fir, int16_t sample)
{
int32_t y;
-#if defined(__bfin__)
- fir->history[fir->curr_pos] = sample;
- fir->history[fir->curr_pos + fir->taps] = sample;
- y = dot_asm((int16_t *) fir->coeffs, &fir->history[fir->curr_pos],
- fir->taps);
-#else
int i;
int offset1;
int offset2;
y += fir->coeffs[i] * fir->history[i - offset1];
for (; i >= 0; i--)
y += fir->coeffs[i] * fir->history[i + offset2];
-#endif
if (fir->curr_pos <= 0)
fir->curr_pos = fir->taps;
fir->curr_pos--;