29 #include "libavutil/attributes.h"
/* Choose the prefetch and byte-average mnemonics for the instruction set
 * this template instance is compiled for. The macros are string literals so
 * they can be pasted directly in front of operand text in an asm template. */
37 #if COMPILE_TEMPLATE_AMD3DNOW
38 #define PREFETCH "prefetch"
39 #define PAVGB "pavgusb"
40 #elif COMPILE_TEMPLATE_MMXEXT
41 #define PREFETCH "prefetchnta"
/* NOTE(review): the preprocessor branch this definition belongs to is not
 * visible here. The text looks like an assembler comment, i.e. a placeholder
 * that emits no instruction -- confirm. */
44 #define PREFETCH " # nop"
47 #if COMPILE_TEMPLATE_AMD3DNOW
/* MOVNTQ is the store mnemonic placed in front of the mm-register stores in
 * the asm loops; SFENCE is emitted as a separate asm statement after them. */
54 #if COMPILE_TEMPLATE_MMXEXT
55 #define MOVNTQ "movntq"
56 #define SFENCE "sfence"
/* NOTE(review): fallback text, presumably for builds without the fence
 * instruction; its enclosing branch is not visible -- confirm. */
59 #define SFENCE " # nop"
62 #if !COMPILE_TEMPLATE_SSE2
64 #if !COMPILE_TEMPLATE_AMD3DNOW
73 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
75 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask32a):
"memory");
79 "movd (%1), %%mm0 \n\t"
80 "punpckldq 3(%1), %%mm0 \n\t"
81 "movd 6(%1), %%mm1 \n\t"
82 "punpckldq 9(%1), %%mm1 \n\t"
83 "movd 12(%1), %%mm2 \n\t"
84 "punpckldq 15(%1), %%mm2 \n\t"
85 "movd 18(%1), %%mm3 \n\t"
86 "punpckldq 21(%1), %%mm3 \n\t"
87 "por %%mm7, %%mm0 \n\t"
88 "por %%mm7, %%mm1 \n\t"
89 "por %%mm7, %%mm2 \n\t"
90 "por %%mm7, %%mm3 \n\t"
93 MOVNTQ" %%mm2, 16(%0) \n\t"
100 __asm__
volatile(
SFENCE:::
"memory");
101 __asm__
volatile(
EMMS:::
"memory");
/* Asm template fragment that repacks four quadwords into three and stores
 * them at offsets 0, 8 and 16 from operand %0 using MOVNTQ.
 *
 * On entry mm0, mm1, mm4, mm5 hold the input quadwords and mm2, mm3, mm6, mm7
 * hold copies of them (in that pairing).
 *   1. The copies are shifted right by 8 bits.
 *   2. The originals are ANDed with mask24l and the shifted copies with
 *      mask24h, then each pair is ORed back into mm0, mm1, mm4, mm5.
 *   3. Copies of mm1 and mm4 are shifted left (by 48 and 32 bits) and ORed
 *      into the quadword before them, while mm1, mm4 are shifted right (by 16
 *      and 32 bits) and mm5 is shifted left by 16 bits, so the remaining data
 *      ends up in mm0, mm1 and mm4.
 * All eight mm registers are overwritten.
 * NOTE(review): the values of mask24l and mask24h are not visible here;
 * presumably they select the three colour bytes of alternating 32-bit pixels
 * so that 32 input bytes become 24 output bytes -- confirm. */
110 #define STORE_BGR24_MMX \
111 "psrlq $8, %%mm2 \n\t" \
112 "psrlq $8, %%mm3 \n\t" \
113 "psrlq $8, %%mm6 \n\t" \
114 "psrlq $8, %%mm7 \n\t" \
115 "pand "MANGLE(mask24l)", %%mm0\n\t" \
116 "pand "MANGLE(mask24l)", %%mm1\n\t" \
117 "pand "MANGLE(mask24l)", %%mm4\n\t" \
118 "pand "MANGLE(mask24l)", %%mm5\n\t" \
119 "pand "MANGLE(mask24h)", %%mm2\n\t" \
120 "pand "MANGLE(mask24h)", %%mm3\n\t" \
121 "pand "MANGLE(mask24h)", %%mm6\n\t" \
122 "pand "MANGLE(mask24h)", %%mm7\n\t" \
123 "por %%mm2, %%mm0 \n\t" \
124 "por %%mm3, %%mm1 \n\t" \
125 "por %%mm6, %%mm4 \n\t" \
126 "por %%mm7, %%mm5 \n\t" \
128 "movq %%mm1, %%mm2 \n\t" \
129 "movq %%mm4, %%mm3 \n\t" \
130 "psllq $48, %%mm2 \n\t" \
131 "psllq $32, %%mm3 \n\t" \
132 "por %%mm2, %%mm0 \n\t" \
133 "psrlq $16, %%mm1 \n\t" \
134 "psrlq $32, %%mm4 \n\t" \
135 "psllq $16, %%mm5 \n\t" \
136 "por %%mm3, %%mm1 \n\t" \
137 "por %%mm5, %%mm4 \n\t" \
139 MOVNTQ" %%mm0, (%0) \n\t" \
140 MOVNTQ" %%mm1, 8(%0) \n\t" \
141 MOVNTQ" %%mm4, 16(%0)"
151 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
156 "movq (%1), %%mm0 \n\t"
157 "movq 8(%1), %%mm1 \n\t"
158 "movq 16(%1), %%mm4 \n\t"
159 "movq 24(%1), %%mm5 \n\t"
160 "movq %%mm0, %%mm2 \n\t"
161 "movq %%mm1, %%mm3 \n\t"
162 "movq %%mm4, %%mm6 \n\t"
163 "movq %%mm5, %%mm7 \n\t"
170 __asm__
volatile(
SFENCE:::
"memory");
171 __asm__
volatile(
EMMS:::
"memory");
193 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
194 __asm__
volatile(
"movq %0, %%mm4"::
"m"(mask15s));
199 "movq (%1), %%mm0 \n\t"
200 "movq 8(%1), %%mm2 \n\t"
201 "movq %%mm0, %%mm1 \n\t"
202 "movq %%mm2, %%mm3 \n\t"
203 "pand %%mm4, %%mm0 \n\t"
204 "pand %%mm4, %%mm2 \n\t"
205 "paddw %%mm1, %%mm0 \n\t"
206 "paddw %%mm3, %%mm2 \n\t"
214 __asm__
volatile(
SFENCE:::
"memory");
215 __asm__
volatile(
EMMS:::
"memory");
/* Two 16-bit values per 32-bit word: in each half, bits 0..4 are kept and
 * bits 5..14 are added to themselves, which doubles that field and so moves
 * it up by one bit (to bits 6..15). Input bit 15 is discarded, result bit 5
 * is always zero, and the sum cannot carry from the low half into the high
 * half. */
218 register unsigned x= *((
const uint32_t *)s);
219 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/* Same transform applied to a single 16-bit value. */
224 register unsigned short x= *((
const uint16_t *)s);
225 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
236 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
237 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask15rg));
238 __asm__
volatile(
"movq %0, %%mm6"::
"m"(mask15b));
243 "movq (%1), %%mm0 \n\t"
244 "movq 8(%1), %%mm2 \n\t"
245 "movq %%mm0, %%mm1 \n\t"
246 "movq %%mm2, %%mm3 \n\t"
247 "psrlq $1, %%mm0 \n\t"
248 "psrlq $1, %%mm2 \n\t"
249 "pand %%mm7, %%mm0 \n\t"
250 "pand %%mm7, %%mm2 \n\t"
251 "pand %%mm6, %%mm1 \n\t"
252 "pand %%mm6, %%mm3 \n\t"
253 "por %%mm1, %%mm0 \n\t"
254 "por %%mm3, %%mm2 \n\t"
262 __asm__
volatile(
SFENCE:::
"memory");
263 __asm__
volatile(
EMMS:::
"memory");
/* Two 16-bit values per 32-bit word: in each half, bits 6..15 are moved down
 * by one bit (to bits 5..14) and bits 0..4 are kept unchanged. Input bit 5 of
 * each half is dropped and result bit 15 is always zero. */
266 register uint32_t x= *((
const uint32_t*)s);
267 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
/* Same transform applied to a single 16-bit value. */
272 register uint16_t x= *((
const uint16_t*)s);
273 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
282 uint16_t *d = (uint16_t *)dst;
286 "movq %3, %%mm5 \n\t"
287 "movq %4, %%mm6 \n\t"
288 "movq %5, %%mm7 \n\t"
293 "movd (%1), %%mm0 \n\t"
294 "movd 4(%1), %%mm3 \n\t"
295 "punpckldq 8(%1), %%mm0 \n\t"
296 "punpckldq 12(%1), %%mm3 \n\t"
297 "movq %%mm0, %%mm1 \n\t"
298 "movq %%mm3, %%mm4 \n\t"
299 "pand %%mm6, %%mm0 \n\t"
300 "pand %%mm6, %%mm3 \n\t"
301 "pmaddwd %%mm7, %%mm0 \n\t"
302 "pmaddwd %%mm7, %%mm3 \n\t"
303 "pand %%mm5, %%mm1 \n\t"
304 "pand %%mm5, %%mm4 \n\t"
305 "por %%mm1, %%mm0 \n\t"
306 "por %%mm4, %%mm3 \n\t"
307 "psrld $5, %%mm0 \n\t"
308 "pslld $11, %%mm3 \n\t"
309 "por %%mm3, %%mm0 \n\t"
317 :
"r" (mm_end),
"m" (mask3216g),
"m" (mask3216br),
"m" (mul3216)
319 __asm__
volatile(
SFENCE:::
"memory");
320 __asm__
volatile(
EMMS:::
"memory");
/* Read one 32-bit value, advance the source by 4 bytes, and pack it into a
 * 16-bit 5-6-5 value:
 *   top 5 bits of bits 0..7   -> result bits 0..4
 *   top 6 bits of bits 8..15  -> result bits 5..10
 *   top 5 bits of bits 16..23 -> result bits 11..15
 * Bits 24..31 of the input are ignored. */
322 register int rgb = *(
const uint32_t*)s; s += 4;
323 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
332 uint16_t *d = (uint16_t *)dst;
334 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
336 "movq %0, %%mm7 \n\t"
337 "movq %1, %%mm6 \n\t"
338 ::
"m"(red_16mask),
"m"(green_16mask));
343 "movd (%1), %%mm0 \n\t"
344 "movd 4(%1), %%mm3 \n\t"
345 "punpckldq 8(%1), %%mm0 \n\t"
346 "punpckldq 12(%1), %%mm3 \n\t"
347 "movq %%mm0, %%mm1 \n\t"
348 "movq %%mm0, %%mm2 \n\t"
349 "movq %%mm3, %%mm4 \n\t"
350 "movq %%mm3, %%mm5 \n\t"
351 "psllq $8, %%mm0 \n\t"
352 "psllq $8, %%mm3 \n\t"
353 "pand %%mm7, %%mm0 \n\t"
354 "pand %%mm7, %%mm3 \n\t"
355 "psrlq $5, %%mm1 \n\t"
356 "psrlq $5, %%mm4 \n\t"
357 "pand %%mm6, %%mm1 \n\t"
358 "pand %%mm6, %%mm4 \n\t"
359 "psrlq $19, %%mm2 \n\t"
360 "psrlq $19, %%mm5 \n\t"
361 "pand %2, %%mm2 \n\t"
362 "pand %2, %%mm5 \n\t"
363 "por %%mm1, %%mm0 \n\t"
364 "por %%mm4, %%mm3 \n\t"
365 "por %%mm2, %%mm0 \n\t"
366 "por %%mm5, %%mm3 \n\t"
367 "psllq $16, %%mm3 \n\t"
368 "por %%mm3, %%mm0 \n\t"
370 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
374 __asm__
volatile(
SFENCE:::
"memory");
375 __asm__
volatile(
EMMS:::
"memory");
/* Read one 32-bit value, advance the source by 4 bytes, and pack it into a
 * 16-bit 5-6-5 value with the two outer fields exchanged:
 *   top 5 bits of bits 0..7   -> result bits 11..15
 *   top 6 bits of bits 8..15  -> result bits 5..10
 *   top 5 bits of bits 16..23 -> result bits 0..4
 * Bits 24..31 of the input are ignored. */
377 register int rgb = *(
const uint32_t*)s; s += 4;
378 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
387 uint16_t *d = (uint16_t *)dst;
391 "movq %3, %%mm5 \n\t"
392 "movq %4, %%mm6 \n\t"
393 "movq %5, %%mm7 \n\t"
398 "movd (%1), %%mm0 \n\t"
399 "movd 4(%1), %%mm3 \n\t"
400 "punpckldq 8(%1), %%mm0 \n\t"
401 "punpckldq 12(%1), %%mm3 \n\t"
402 "movq %%mm0, %%mm1 \n\t"
403 "movq %%mm3, %%mm4 \n\t"
404 "pand %%mm6, %%mm0 \n\t"
405 "pand %%mm6, %%mm3 \n\t"
406 "pmaddwd %%mm7, %%mm0 \n\t"
407 "pmaddwd %%mm7, %%mm3 \n\t"
408 "pand %%mm5, %%mm1 \n\t"
409 "pand %%mm5, %%mm4 \n\t"
410 "por %%mm1, %%mm0 \n\t"
411 "por %%mm4, %%mm3 \n\t"
412 "psrld $6, %%mm0 \n\t"
413 "pslld $10, %%mm3 \n\t"
414 "por %%mm3, %%mm0 \n\t"
422 :
"r" (mm_end),
"m" (mask3215g),
"m" (mask3216br),
"m" (mul3215)
424 __asm__
volatile(
SFENCE:::
"memory");
425 __asm__
volatile(
EMMS:::
"memory");
/* Read one 32-bit value, advance the source by 4 bytes, and pack it into a
 * 5-5-5 value (result bit 15 stays zero):
 *   top 5 bits of bits 0..7   -> result bits 0..4
 *   top 5 bits of bits 8..15  -> result bits 5..9
 *   top 5 bits of bits 16..23 -> result bits 10..14
 * Bits 24..31 of the input are ignored. */
427 register int rgb = *(
const uint32_t*)s; s += 4;
428 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
437 uint16_t *d = (uint16_t *)dst;
439 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
441 "movq %0, %%mm7 \n\t"
442 "movq %1, %%mm6 \n\t"
443 ::
"m"(red_15mask),
"m"(green_15mask));
448 "movd (%1), %%mm0 \n\t"
449 "movd 4(%1), %%mm3 \n\t"
450 "punpckldq 8(%1), %%mm0 \n\t"
451 "punpckldq 12(%1), %%mm3 \n\t"
452 "movq %%mm0, %%mm1 \n\t"
453 "movq %%mm0, %%mm2 \n\t"
454 "movq %%mm3, %%mm4 \n\t"
455 "movq %%mm3, %%mm5 \n\t"
456 "psllq $7, %%mm0 \n\t"
457 "psllq $7, %%mm3 \n\t"
458 "pand %%mm7, %%mm0 \n\t"
459 "pand %%mm7, %%mm3 \n\t"
460 "psrlq $6, %%mm1 \n\t"
461 "psrlq $6, %%mm4 \n\t"
462 "pand %%mm6, %%mm1 \n\t"
463 "pand %%mm6, %%mm4 \n\t"
464 "psrlq $19, %%mm2 \n\t"
465 "psrlq $19, %%mm5 \n\t"
466 "pand %2, %%mm2 \n\t"
467 "pand %2, %%mm5 \n\t"
468 "por %%mm1, %%mm0 \n\t"
469 "por %%mm4, %%mm3 \n\t"
470 "por %%mm2, %%mm0 \n\t"
471 "por %%mm5, %%mm3 \n\t"
472 "psllq $16, %%mm3 \n\t"
473 "por %%mm3, %%mm0 \n\t"
475 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
479 __asm__
volatile(
SFENCE:::
"memory");
480 __asm__
volatile(
EMMS:::
"memory");
/* Read one 32-bit value, advance the source by 4 bytes, and pack it into a
 * 5-5-5 value with the two outer fields exchanged (result bit 15 stays zero):
 *   top 5 bits of bits 0..7   -> result bits 10..14
 *   top 5 bits of bits 8..15  -> result bits 5..9
 *   top 5 bits of bits 16..23 -> result bits 0..4
 * Bits 24..31 of the input are ignored. */
482 register int rgb = *(
const uint32_t*)s; s += 4;
483 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
492 uint16_t *d = (uint16_t *)dst;
494 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
496 "movq %0, %%mm7 \n\t"
497 "movq %1, %%mm6 \n\t"
498 ::
"m"(red_16mask),
"m"(green_16mask));
503 "movd (%1), %%mm0 \n\t"
504 "movd 3(%1), %%mm3 \n\t"
505 "punpckldq 6(%1), %%mm0 \n\t"
506 "punpckldq 9(%1), %%mm3 \n\t"
507 "movq %%mm0, %%mm1 \n\t"
508 "movq %%mm0, %%mm2 \n\t"
509 "movq %%mm3, %%mm4 \n\t"
510 "movq %%mm3, %%mm5 \n\t"
511 "psrlq $3, %%mm0 \n\t"
512 "psrlq $3, %%mm3 \n\t"
513 "pand %2, %%mm0 \n\t"
514 "pand %2, %%mm3 \n\t"
515 "psrlq $5, %%mm1 \n\t"
516 "psrlq $5, %%mm4 \n\t"
517 "pand %%mm6, %%mm1 \n\t"
518 "pand %%mm6, %%mm4 \n\t"
519 "psrlq $8, %%mm2 \n\t"
520 "psrlq $8, %%mm5 \n\t"
521 "pand %%mm7, %%mm2 \n\t"
522 "pand %%mm7, %%mm5 \n\t"
523 "por %%mm1, %%mm0 \n\t"
524 "por %%mm4, %%mm3 \n\t"
525 "por %%mm2, %%mm0 \n\t"
526 "por %%mm5, %%mm3 \n\t"
527 "psllq $16, %%mm3 \n\t"
528 "por %%mm3, %%mm0 \n\t"
530 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
534 __asm__
volatile(
SFENCE:::
"memory");
535 __asm__
volatile(
EMMS:::
"memory");
540 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
549 uint16_t *d = (uint16_t *)dst;
551 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
553 "movq %0, %%mm7 \n\t"
554 "movq %1, %%mm6 \n\t"
555 ::
"m"(red_16mask),
"m"(green_16mask));
560 "movd (%1), %%mm0 \n\t"
561 "movd 3(%1), %%mm3 \n\t"
562 "punpckldq 6(%1), %%mm0 \n\t"
563 "punpckldq 9(%1), %%mm3 \n\t"
564 "movq %%mm0, %%mm1 \n\t"
565 "movq %%mm0, %%mm2 \n\t"
566 "movq %%mm3, %%mm4 \n\t"
567 "movq %%mm3, %%mm5 \n\t"
568 "psllq $8, %%mm0 \n\t"
569 "psllq $8, %%mm3 \n\t"
570 "pand %%mm7, %%mm0 \n\t"
571 "pand %%mm7, %%mm3 \n\t"
572 "psrlq $5, %%mm1 \n\t"
573 "psrlq $5, %%mm4 \n\t"
574 "pand %%mm6, %%mm1 \n\t"
575 "pand %%mm6, %%mm4 \n\t"
576 "psrlq $19, %%mm2 \n\t"
577 "psrlq $19, %%mm5 \n\t"
578 "pand %2, %%mm2 \n\t"
579 "pand %2, %%mm5 \n\t"
580 "por %%mm1, %%mm0 \n\t"
581 "por %%mm4, %%mm3 \n\t"
582 "por %%mm2, %%mm0 \n\t"
583 "por %%mm5, %%mm3 \n\t"
584 "psllq $16, %%mm3 \n\t"
585 "por %%mm3, %%mm0 \n\t"
587 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
591 __asm__
volatile(
SFENCE:::
"memory");
592 __asm__
volatile(
EMMS:::
"memory");
597 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
606 uint16_t *d = (uint16_t *)dst;
608 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
610 "movq %0, %%mm7 \n\t"
611 "movq %1, %%mm6 \n\t"
612 ::
"m"(red_15mask),
"m"(green_15mask));
617 "movd (%1), %%mm0 \n\t"
618 "movd 3(%1), %%mm3 \n\t"
619 "punpckldq 6(%1), %%mm0 \n\t"
620 "punpckldq 9(%1), %%mm3 \n\t"
621 "movq %%mm0, %%mm1 \n\t"
622 "movq %%mm0, %%mm2 \n\t"
623 "movq %%mm3, %%mm4 \n\t"
624 "movq %%mm3, %%mm5 \n\t"
625 "psrlq $3, %%mm0 \n\t"
626 "psrlq $3, %%mm3 \n\t"
627 "pand %2, %%mm0 \n\t"
628 "pand %2, %%mm3 \n\t"
629 "psrlq $6, %%mm1 \n\t"
630 "psrlq $6, %%mm4 \n\t"
631 "pand %%mm6, %%mm1 \n\t"
632 "pand %%mm6, %%mm4 \n\t"
633 "psrlq $9, %%mm2 \n\t"
634 "psrlq $9, %%mm5 \n\t"
635 "pand %%mm7, %%mm2 \n\t"
636 "pand %%mm7, %%mm5 \n\t"
637 "por %%mm1, %%mm0 \n\t"
638 "por %%mm4, %%mm3 \n\t"
639 "por %%mm2, %%mm0 \n\t"
640 "por %%mm5, %%mm3 \n\t"
641 "psllq $16, %%mm3 \n\t"
642 "por %%mm3, %%mm0 \n\t"
644 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
648 __asm__
volatile(
SFENCE:::
"memory");
649 __asm__
volatile(
EMMS:::
"memory");
654 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
663 uint16_t *d = (uint16_t *)dst;
665 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
667 "movq %0, %%mm7 \n\t"
668 "movq %1, %%mm6 \n\t"
669 ::
"m"(red_15mask),
"m"(green_15mask));
674 "movd (%1), %%mm0 \n\t"
675 "movd 3(%1), %%mm3 \n\t"
676 "punpckldq 6(%1), %%mm0 \n\t"
677 "punpckldq 9(%1), %%mm3 \n\t"
678 "movq %%mm0, %%mm1 \n\t"
679 "movq %%mm0, %%mm2 \n\t"
680 "movq %%mm3, %%mm4 \n\t"
681 "movq %%mm3, %%mm5 \n\t"
682 "psllq $7, %%mm0 \n\t"
683 "psllq $7, %%mm3 \n\t"
684 "pand %%mm7, %%mm0 \n\t"
685 "pand %%mm7, %%mm3 \n\t"
686 "psrlq $6, %%mm1 \n\t"
687 "psrlq $6, %%mm4 \n\t"
688 "pand %%mm6, %%mm1 \n\t"
689 "pand %%mm6, %%mm4 \n\t"
690 "psrlq $19, %%mm2 \n\t"
691 "psrlq $19, %%mm5 \n\t"
692 "pand %2, %%mm2 \n\t"
693 "pand %2, %%mm5 \n\t"
694 "por %%mm1, %%mm0 \n\t"
695 "por %%mm4, %%mm3 \n\t"
696 "por %%mm2, %%mm0 \n\t"
697 "por %%mm5, %%mm3 \n\t"
698 "psllq $16, %%mm3 \n\t"
699 "por %%mm3, %%mm0 \n\t"
701 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
705 __asm__
volatile(
SFENCE:::
"memory");
706 __asm__
volatile(
EMMS:::
"memory");
711 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
718 const uint16_t *mm_end;
720 const uint16_t *
s = (
const uint16_t*)
src;
721 end = s + src_size/2;
722 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
727 "movq (%1), %%mm0 \n\t"
728 "movq (%1), %%mm1 \n\t"
729 "movq (%1), %%mm2 \n\t"
730 "pand %2, %%mm0 \n\t"
731 "pand %3, %%mm1 \n\t"
732 "pand %4, %%mm2 \n\t"
733 "psllq $5, %%mm0 \n\t"
734 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
735 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t"
736 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
737 "movq %%mm0, %%mm3 \n\t"
738 "movq %%mm1, %%mm4 \n\t"
739 "movq %%mm2, %%mm5 \n\t"
740 "punpcklwd %5, %%mm0 \n\t"
741 "punpcklwd %5, %%mm1 \n\t"
742 "punpcklwd %5, %%mm2 \n\t"
743 "punpckhwd %5, %%mm3 \n\t"
744 "punpckhwd %5, %%mm4 \n\t"
745 "punpckhwd %5, %%mm5 \n\t"
746 "psllq $8, %%mm1 \n\t"
747 "psllq $16, %%mm2 \n\t"
748 "por %%mm1, %%mm0 \n\t"
749 "por %%mm2, %%mm0 \n\t"
750 "psllq $8, %%mm4 \n\t"
751 "psllq $16, %%mm5 \n\t"
752 "por %%mm4, %%mm3 \n\t"
753 "por %%mm5, %%mm3 \n\t"
755 "movq %%mm0, %%mm6 \n\t"
756 "movq %%mm3, %%mm7 \n\t"
758 "movq 8(%1), %%mm0 \n\t"
759 "movq 8(%1), %%mm1 \n\t"
760 "movq 8(%1), %%mm2 \n\t"
761 "pand %2, %%mm0 \n\t"
762 "pand %3, %%mm1 \n\t"
763 "pand %4, %%mm2 \n\t"
764 "psllq $5, %%mm0 \n\t"
765 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
766 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t"
767 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
768 "movq %%mm0, %%mm3 \n\t"
769 "movq %%mm1, %%mm4 \n\t"
770 "movq %%mm2, %%mm5 \n\t"
771 "punpcklwd %5, %%mm0 \n\t"
772 "punpcklwd %5, %%mm1 \n\t"
773 "punpcklwd %5, %%mm2 \n\t"
774 "punpckhwd %5, %%mm3 \n\t"
775 "punpckhwd %5, %%mm4 \n\t"
776 "punpckhwd %5, %%mm5 \n\t"
777 "psllq $8, %%mm1 \n\t"
778 "psllq $16, %%mm2 \n\t"
779 "por %%mm1, %%mm0 \n\t"
780 "por %%mm2, %%mm0 \n\t"
781 "psllq $8, %%mm4 \n\t"
782 "psllq $16, %%mm5 \n\t"
783 "por %%mm4, %%mm3 \n\t"
784 "por %%mm5, %%mm3 \n\t"
787 :
"r"(
s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r),
"m"(mmx_null)
791 "movq %%mm0, %%mm4 \n\t"
792 "movq %%mm3, %%mm5 \n\t"
793 "movq %%mm6, %%mm0 \n\t"
794 "movq %%mm7, %%mm1 \n\t"
796 "movq %%mm4, %%mm6 \n\t"
797 "movq %%mm5, %%mm7 \n\t"
798 "movq %%mm0, %%mm2 \n\t"
799 "movq %%mm1, %%mm3 \n\t"
808 __asm__
volatile(
SFENCE:::
"memory");
809 __asm__
volatile(
EMMS:::
"memory");
/* Expand the three 5-bit fields of bgr (bits 0..4, 5..9, 10..14) into three
 * consecutive output values, lowest field first. Each field is placed in
 * bits 3..7 and its own top 3 bits are repeated in bits 0..2, so an all-ones
 * field becomes 0xFF and an all-zero field becomes 0.
 * NOTE(review): the statement that loads bgr and the type of d are not
 * visible here; d is presumably a byte pointer -- confirm. */
811 register uint16_t bgr;
813 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
814 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
815 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
822 const uint16_t *mm_end;
824 const uint16_t *
s = (
const uint16_t *)
src;
825 end = s + src_size/2;
826 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
831 "movq (%1), %%mm0 \n\t"
832 "movq (%1), %%mm1 \n\t"
833 "movq (%1), %%mm2 \n\t"
834 "pand %2, %%mm0 \n\t"
835 "pand %3, %%mm1 \n\t"
836 "pand %4, %%mm2 \n\t"
837 "psllq $5, %%mm0 \n\t"
838 "psrlq $1, %%mm2 \n\t"
839 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
840 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t"
841 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
842 "movq %%mm0, %%mm3 \n\t"
843 "movq %%mm1, %%mm4 \n\t"
844 "movq %%mm2, %%mm5 \n\t"
845 "punpcklwd %5, %%mm0 \n\t"
846 "punpcklwd %5, %%mm1 \n\t"
847 "punpcklwd %5, %%mm2 \n\t"
848 "punpckhwd %5, %%mm3 \n\t"
849 "punpckhwd %5, %%mm4 \n\t"
850 "punpckhwd %5, %%mm5 \n\t"
851 "psllq $8, %%mm1 \n\t"
852 "psllq $16, %%mm2 \n\t"
853 "por %%mm1, %%mm0 \n\t"
854 "por %%mm2, %%mm0 \n\t"
855 "psllq $8, %%mm4 \n\t"
856 "psllq $16, %%mm5 \n\t"
857 "por %%mm4, %%mm3 \n\t"
858 "por %%mm5, %%mm3 \n\t"
860 "movq %%mm0, %%mm6 \n\t"
861 "movq %%mm3, %%mm7 \n\t"
863 "movq 8(%1), %%mm0 \n\t"
864 "movq 8(%1), %%mm1 \n\t"
865 "movq 8(%1), %%mm2 \n\t"
866 "pand %2, %%mm0 \n\t"
867 "pand %3, %%mm1 \n\t"
868 "pand %4, %%mm2 \n\t"
869 "psllq $5, %%mm0 \n\t"
870 "psrlq $1, %%mm2 \n\t"
871 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
872 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t"
873 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
874 "movq %%mm0, %%mm3 \n\t"
875 "movq %%mm1, %%mm4 \n\t"
876 "movq %%mm2, %%mm5 \n\t"
877 "punpcklwd %5, %%mm0 \n\t"
878 "punpcklwd %5, %%mm1 \n\t"
879 "punpcklwd %5, %%mm2 \n\t"
880 "punpckhwd %5, %%mm3 \n\t"
881 "punpckhwd %5, %%mm4 \n\t"
882 "punpckhwd %5, %%mm5 \n\t"
883 "psllq $8, %%mm1 \n\t"
884 "psllq $16, %%mm2 \n\t"
885 "por %%mm1, %%mm0 \n\t"
886 "por %%mm2, %%mm0 \n\t"
887 "psllq $8, %%mm4 \n\t"
888 "psllq $16, %%mm5 \n\t"
889 "por %%mm4, %%mm3 \n\t"
890 "por %%mm5, %%mm3 \n\t"
892 :
"r"(
s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mmx_null)
896 "movq %%mm0, %%mm4 \n\t"
897 "movq %%mm3, %%mm5 \n\t"
898 "movq %%mm6, %%mm0 \n\t"
899 "movq %%mm7, %%mm1 \n\t"
901 "movq %%mm4, %%mm6 \n\t"
902 "movq %%mm5, %%mm7 \n\t"
903 "movq %%mm0, %%mm2 \n\t"
904 "movq %%mm1, %%mm3 \n\t"
913 __asm__
volatile(
SFENCE:::
"memory");
914 __asm__
volatile(
EMMS:::
"memory");
/* Expand the 5-6-5 fields of bgr (bits 0..4, 5..10, 11..15) into three
 * consecutive output values, lowest field first. The 5-bit fields go to
 * bits 3..7 with their top 3 bits repeated in bits 0..2; the 6-bit field goes
 * to bits 2..7 with its top 2 bits repeated in bits 0..1. An all-ones field
 * therefore becomes 0xFF and an all-zero field becomes 0.
 * NOTE(review): the statement that loads bgr and the type of d are not
 * visible here; d is presumably a byte pointer -- confirm. */
916 register uint16_t bgr;
918 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
919 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
920 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
932 "packuswb %%mm7, %%mm0 \n\t" \
933 "packuswb %%mm7, %%mm1 \n\t" \
934 "packuswb %%mm7, %%mm2 \n\t" \
935 "punpcklbw %%mm1, %%mm0 \n\t" \
936 "punpcklbw %%mm6, %%mm2 \n\t" \
937 "movq %%mm0, %%mm3 \n\t" \
938 "punpcklwd %%mm2, %%mm0 \n\t" \
939 "punpckhwd %%mm2, %%mm3 \n\t" \
940 MOVNTQ" %%mm0, (%0) \n\t" \
941 MOVNTQ" %%mm3, 8(%0) \n\t" \
946 const uint16_t *mm_end;
948 const uint16_t *
s = (
const uint16_t *)
src;
949 end = s + src_size/2;
950 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
951 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
952 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
957 "movq (%1), %%mm0 \n\t"
958 "movq (%1), %%mm1 \n\t"
959 "movq (%1), %%mm2 \n\t"
960 "pand %2, %%mm0 \n\t"
961 "pand %3, %%mm1 \n\t"
962 "pand %4, %%mm2 \n\t"
963 "psllq $5, %%mm0 \n\t"
964 "pmulhw %5, %%mm0 \n\t"
965 "pmulhw %5, %%mm1 \n\t"
966 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
968 ::
"r"(d),
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r) ,
"m"(mul15_mid)
973 __asm__
volatile(
SFENCE:::
"memory");
974 __asm__
volatile(
EMMS:::
"memory");
976 register uint16_t bgr;
978 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
979 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
980 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
988 const uint16_t *mm_end;
990 const uint16_t *
s = (
const uint16_t*)
src;
991 end = s + src_size/2;
992 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
993 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
994 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
999 "movq (%1), %%mm0 \n\t"
1000 "movq (%1), %%mm1 \n\t"
1001 "movq (%1), %%mm2 \n\t"
1002 "pand %2, %%mm0 \n\t"
1003 "pand %3, %%mm1 \n\t"
1004 "pand %4, %%mm2 \n\t"
1005 "psllq $5, %%mm0 \n\t"
1006 "psrlq $1, %%mm2 \n\t"
1007 "pmulhw %5, %%mm0 \n\t"
1008 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t"
1009 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
1011 ::
"r"(d),
"r"(s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mul15_mid)
1016 __asm__
volatile(
SFENCE:::
"memory");
1017 __asm__
volatile(
EMMS:::
"memory");
1019 register uint16_t bgr;
1021 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1022 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
1023 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1037 "movq %3, %%mm7 \n\t"
1038 "pxor %4, %%mm7 \n\t"
1039 "movq %%mm7, %%mm6 \n\t"
1040 "pxor %5, %%mm7 \n\t"
1044 "movq (%1, %0), %%mm0 \n\t"
1045 "movq 8(%1, %0), %%mm1 \n\t"
1046 # if COMPILE_TEMPLATE_MMXEXT
1047 "pshufw $177, %%mm0, %%mm3 \n\t"
1048 "pshufw $177, %%mm1, %%mm5 \n\t"
1049 "pand %%mm7, %%mm0 \n\t"
1050 "pand %%mm6, %%mm3 \n\t"
1051 "pand %%mm7, %%mm1 \n\t"
1052 "pand %%mm6, %%mm5 \n\t"
1053 "por %%mm3, %%mm0 \n\t"
1054 "por %%mm5, %%mm1 \n\t"
1056 "movq %%mm0, %%mm2 \n\t"
1057 "movq %%mm1, %%mm4 \n\t"
1058 "pand %%mm7, %%mm0 \n\t"
1059 "pand %%mm6, %%mm2 \n\t"
1060 "pand %%mm7, %%mm1 \n\t"
1061 "pand %%mm6, %%mm4 \n\t"
1062 "movq %%mm2, %%mm3 \n\t"
1063 "movq %%mm4, %%mm5 \n\t"
1064 "pslld $16, %%mm2 \n\t"
1065 "psrld $16, %%mm3 \n\t"
1066 "pslld $16, %%mm4 \n\t"
1067 "psrld $16, %%mm5 \n\t"
1068 "por %%mm2, %%mm0 \n\t"
1069 "por %%mm4, %%mm1 \n\t"
1070 "por %%mm3, %%mm0 \n\t"
1071 "por %%mm5, %%mm1 \n\t"
1073 MOVNTQ" %%mm0, (%2, %0) \n\t"
1074 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1081 :
"r" (s),
"r" (d),
"m" (mask32b),
"m" (mask32r),
"m" (mmx_one)
1083 for (; idx<15; idx+=4) {
1084 register int v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1086 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
1093 x86_reg mmx_size= 23 - src_size;
1095 "test %%"REG_a
", %%"REG_a
" \n\t"
1097 "movq "MANGLE(mask24r)
", %%mm5 \n\t"
1098 "movq "MANGLE(mask24g)
", %%mm6 \n\t"
1099 "movq "MANGLE(mask24b)
", %%mm7 \n\t"
1103 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1104 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1105 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t"
1106 "psllq $16, %%mm0 \n\t"
1107 "pand %%mm5, %%mm0 \n\t"
1108 "pand %%mm6, %%mm1 \n\t"
1109 "pand %%mm7, %%mm2 \n\t"
1110 "por %%mm0, %%mm1 \n\t"
1111 "por %%mm2, %%mm1 \n\t"
1112 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t"
1113 MOVNTQ" %%mm1, (%2, %%"REG_a
") \n\t"
1114 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t"
1115 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t"
1116 "pand %%mm7, %%mm0 \n\t"
1117 "pand %%mm5, %%mm1 \n\t"
1118 "pand %%mm6, %%mm2 \n\t"
1119 "por %%mm0, %%mm1 \n\t"
1120 "por %%mm2, %%mm1 \n\t"
1121 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t"
1122 MOVNTQ" %%mm1, 8(%2, %%"REG_a
") \n\t"
1123 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t"
1124 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t"
1125 "pand %%mm6, %%mm0 \n\t"
1126 "pand %%mm7, %%mm1 \n\t"
1127 "pand %%mm5, %%mm2 \n\t"
1128 "por %%mm0, %%mm1 \n\t"
1129 "por %%mm2, %%mm1 \n\t"
1130 MOVNTQ" %%mm1, 16(%2, %%"REG_a
") \n\t"
1131 "add $24, %%"REG_a
" \n\t"
1135 :
"r" (
src-mmx_size),
"r"(dst-mmx_size)
1138 __asm__
volatile(
SFENCE:::
"memory");
1139 __asm__
volatile(
EMMS:::
"memory");
1141 if (mmx_size==23)
return;
1145 src_size= 23-mmx_size;
1148 for (i=0; i<src_size; i+=3) {
1151 dst[i + 1] =
src[i + 1];
1152 dst[i + 2] =
src[i + 0];
1159 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1163 for (y=0; y<
height; y++) {
1166 "xor %%"REG_a
", %%"REG_a
" \n\t"
1169 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1172 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1173 "movq %%mm0, %%mm2 \n\t"
1174 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1175 "punpcklbw %%mm1, %%mm0 \n\t"
1176 "punpckhbw %%mm1, %%mm2 \n\t"
1178 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1179 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1180 "movq %%mm3, %%mm4 \n\t"
1181 "movq %%mm5, %%mm6 \n\t"
1182 "punpcklbw %%mm0, %%mm3 \n\t"
1183 "punpckhbw %%mm0, %%mm4 \n\t"
1184 "punpcklbw %%mm2, %%mm5 \n\t"
1185 "punpckhbw %%mm2, %%mm6 \n\t"
1187 MOVNTQ" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1188 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1189 MOVNTQ" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1190 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1192 "add $8, %%"REG_a
" \n\t"
1193 "cmp %4, %%"REG_a
" \n\t"
1195 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1198 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1199 usrc += chromStride;
1200 vsrc += chromStride;
1216 int lumStride,
int chromStride,
int dstStride)
1224 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1228 for (y=0; y<
height; y++) {
1231 "xor %%"REG_a
", %%"REG_a
" \n\t"
1234 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1237 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1238 "movq %%mm0, %%mm2 \n\t"
1239 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1240 "punpcklbw %%mm1, %%mm0 \n\t"
1241 "punpckhbw %%mm1, %%mm2 \n\t"
1243 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1244 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1245 "movq %%mm0, %%mm4 \n\t"
1246 "movq %%mm2, %%mm6 \n\t"
1247 "punpcklbw %%mm3, %%mm0 \n\t"
1248 "punpckhbw %%mm3, %%mm4 \n\t"
1249 "punpcklbw %%mm5, %%mm2 \n\t"
1250 "punpckhbw %%mm5, %%mm6 \n\t"
1252 MOVNTQ" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1253 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1254 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1255 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1257 "add $8, %%"REG_a
" \n\t"
1258 "cmp %4, %%"REG_a
" \n\t"
1260 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1263 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1264 usrc += chromStride;
1265 vsrc += chromStride;
1281 int lumStride,
int chromStride,
int dstStride)
1292 int lumStride,
int chromStride,
int dstStride)
1302 int lumStride,
int chromStride,
int dstStride)
1313 int lumStride,
int chromStride,
int srcStride)
1317 for (y=0; y<
height; y+=2) {
1319 "xor %%"REG_a
", %%"REG_a
" \n\t"
1320 "pcmpeqw %%mm7, %%mm7 \n\t"
1321 "psrlw $8, %%mm7 \n\t"
1324 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1325 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1326 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1327 "movq %%mm0, %%mm2 \n\t"
1328 "movq %%mm1, %%mm3 \n\t"
1329 "psrlw $8, %%mm0 \n\t"
1330 "psrlw $8, %%mm1 \n\t"
1331 "pand %%mm7, %%mm2 \n\t"
1332 "pand %%mm7, %%mm3 \n\t"
1333 "packuswb %%mm1, %%mm0 \n\t"
1334 "packuswb %%mm3, %%mm2 \n\t"
1336 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1338 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1339 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1340 "movq %%mm1, %%mm3 \n\t"
1341 "movq %%mm2, %%mm4 \n\t"
1342 "psrlw $8, %%mm1 \n\t"
1343 "psrlw $8, %%mm2 \n\t"
1344 "pand %%mm7, %%mm3 \n\t"
1345 "pand %%mm7, %%mm4 \n\t"
1346 "packuswb %%mm2, %%mm1 \n\t"
1347 "packuswb %%mm4, %%mm3 \n\t"
1349 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1351 "movq %%mm0, %%mm2 \n\t"
1352 "movq %%mm1, %%mm3 \n\t"
1353 "psrlw $8, %%mm0 \n\t"
1354 "psrlw $8, %%mm1 \n\t"
1355 "pand %%mm7, %%mm2 \n\t"
1356 "pand %%mm7, %%mm3 \n\t"
1357 "packuswb %%mm1, %%mm0 \n\t"
1358 "packuswb %%mm3, %%mm2 \n\t"
1360 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1361 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1363 "add $8, %%"REG_a
" \n\t"
1364 "cmp %4, %%"REG_a
" \n\t"
1366 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1367 :
"memory",
"%"REG_a
1374 "xor %%"REG_a
", %%"REG_a
" \n\t"
1377 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1378 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1379 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1380 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1381 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1382 "pand %%mm7, %%mm0 \n\t"
1383 "pand %%mm7, %%mm1 \n\t"
1384 "pand %%mm7, %%mm2 \n\t"
1385 "pand %%mm7, %%mm3 \n\t"
1386 "packuswb %%mm1, %%mm0 \n\t"
1387 "packuswb %%mm3, %%mm2 \n\t"
1389 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1390 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1392 "add $8, %%"REG_a
" \n\t"
1393 "cmp %4, %%"REG_a
" \n\t"
1396 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1397 :
"memory",
"%"REG_a
1399 udst += chromStride;
1400 vdst += chromStride;
1404 __asm__
volatile(
EMMS" \n\t"
1410 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1418 for (x=0; x<srcWidth-1; x++) {
1419 dst[2*x+1]= (3*
src[x] +
src[x+1])>>2;
1420 dst[2*x+2]= (
src[x] + 3*
src[x+1])>>2;
1422 dst[2*srcWidth-1]=
src[srcWidth-1];
1426 for (y=1; y<srcHeight; y++) {
1427 const x86_reg mmxSize= srcWidth&~15;
1429 "mov %4, %%"REG_a
" \n\t"
1430 "movq "MANGLE(mmx_ff)
", %%mm0 \n\t"
1431 "movq (%0, %%"REG_a
"), %%mm4 \n\t"
1432 "movq %%mm4, %%mm2 \n\t"
1433 "psllq $8, %%mm4 \n\t"
1434 "pand %%mm0, %%mm2 \n\t"
1435 "por %%mm2, %%mm4 \n\t"
1436 "movq (%1, %%"REG_a
"), %%mm5 \n\t"
1437 "movq %%mm5, %%mm3 \n\t"
1438 "psllq $8, %%mm5 \n\t"
1439 "pand %%mm0, %%mm3 \n\t"
1440 "por %%mm3, %%mm5 \n\t"
1442 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1443 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1444 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1445 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1446 PAVGB" %%mm0, %%mm5 \n\t"
1447 PAVGB" %%mm0, %%mm3 \n\t"
1448 PAVGB" %%mm0, %%mm5 \n\t"
1449 PAVGB" %%mm0, %%mm3 \n\t"
1450 PAVGB" %%mm1, %%mm4 \n\t"
1451 PAVGB" %%mm1, %%mm2 \n\t"
1452 PAVGB" %%mm1, %%mm4 \n\t"
1453 PAVGB" %%mm1, %%mm2 \n\t"
1454 "movq %%mm5, %%mm7 \n\t"
1455 "movq %%mm4, %%mm6 \n\t"
1456 "punpcklbw %%mm3, %%mm5 \n\t"
1457 "punpckhbw %%mm3, %%mm7 \n\t"
1458 "punpcklbw %%mm2, %%mm4 \n\t"
1459 "punpckhbw %%mm2, %%mm6 \n\t"
1460 MOVNTQ" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1461 MOVNTQ" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1462 MOVNTQ" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1463 MOVNTQ" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1464 "add $8, %%"REG_a
" \n\t"
1465 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1466 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1468 ::
"r" (
src + mmxSize ),
"r" (
src + srcStride + mmxSize ),
1469 "r" (dst + mmxSize*2),
"r" (dst + dstStride + mmxSize*2),
1474 for (x=mmxSize-1; x<srcWidth-1; x++) {
1475 dst[2*x +1]= (3*
src[x+0] +
src[x+srcStride+1])>>2;
1476 dst[2*x+dstStride+2]= (
src[x+0] + 3*
src[x+srcStride+1])>>2;
1477 dst[2*x+dstStride+1]= (
src[x+1] + 3*
src[x+srcStride ])>>2;
1478 dst[2*x +2]= (3*
src[x+1] +
src[x+srcStride ])>>2;
1480 dst[srcWidth*2 -1 ]= (3*
src[srcWidth-1] +
src[srcWidth-1 + srcStride])>>2;
1481 dst[srcWidth*2 -1 + dstStride]= (
src[srcWidth-1] + 3*
src[srcWidth-1 + srcStride])>>2;
1490 for (x=0; x<srcWidth-1; x++) {
1491 dst[2*x+1]= (3*
src[x] +
src[x+1])>>2;
1492 dst[2*x+2]= (
src[x] + 3*
src[x+1])>>2;
1494 dst[2*srcWidth-1]=
src[srcWidth-1];
1496 __asm__
volatile(
EMMS" \n\t"
1502 #if !COMPILE_TEMPLATE_AMD3DNOW
1511 int lumStride,
int chromStride,
int srcStride)
1514 const x86_reg chromWidth= width>>1;
1515 for (y=0; y<
height; y+=2) {
1517 "xor %%"REG_a
", %%"REG_a
" \n\t"
1518 "pcmpeqw %%mm7, %%mm7 \n\t"
1519 "psrlw $8, %%mm7 \n\t"
1522 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1523 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1524 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1525 "movq %%mm0, %%mm2 \n\t"
1526 "movq %%mm1, %%mm3 \n\t"
1527 "pand %%mm7, %%mm0 \n\t"
1528 "pand %%mm7, %%mm1 \n\t"
1529 "psrlw $8, %%mm2 \n\t"
1530 "psrlw $8, %%mm3 \n\t"
1531 "packuswb %%mm1, %%mm0 \n\t"
1532 "packuswb %%mm3, %%mm2 \n\t"
1534 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1536 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1537 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1538 "movq %%mm1, %%mm3 \n\t"
1539 "movq %%mm2, %%mm4 \n\t"
1540 "pand %%mm7, %%mm1 \n\t"
1541 "pand %%mm7, %%mm2 \n\t"
1542 "psrlw $8, %%mm3 \n\t"
1543 "psrlw $8, %%mm4 \n\t"
1544 "packuswb %%mm2, %%mm1 \n\t"
1545 "packuswb %%mm4, %%mm3 \n\t"
1547 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1549 "movq %%mm0, %%mm2 \n\t"
1550 "movq %%mm1, %%mm3 \n\t"
1551 "psrlw $8, %%mm0 \n\t"
1552 "psrlw $8, %%mm1 \n\t"
1553 "pand %%mm7, %%mm2 \n\t"
1554 "pand %%mm7, %%mm3 \n\t"
1555 "packuswb %%mm1, %%mm0 \n\t"
1556 "packuswb %%mm3, %%mm2 \n\t"
1558 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1559 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1561 "add $8, %%"REG_a
" \n\t"
1562 "cmp %4, %%"REG_a
" \n\t"
1564 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1565 :
"memory",
"%"REG_a
1572 "xor %%"REG_a
", %%"REG_a
" \n\t"
1575 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1576 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1577 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1578 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1579 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1580 "psrlw $8, %%mm0 \n\t"
1581 "psrlw $8, %%mm1 \n\t"
1582 "psrlw $8, %%mm2 \n\t"
1583 "psrlw $8, %%mm3 \n\t"
1584 "packuswb %%mm1, %%mm0 \n\t"
1585 "packuswb %%mm3, %%mm2 \n\t"
1587 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1588 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1590 "add $8, %%"REG_a
" \n\t"
1591 "cmp %4, %%"REG_a
" \n\t"
1594 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1595 :
"memory",
"%"REG_a
1597 udst += chromStride;
1598 vdst += chromStride;
1602 __asm__
volatile(
EMMS" \n\t"
1618 int lumStride,
int chromStride,
int srcStride,
/*
 * Displacements into the coefficient table passed as rgb2yuv.
 * They are deliberately string literals, not integers: each one is
 * concatenated directly in front of a "(%N)" memory operand in the inline
 * asm below (e.g. "movq "BGR2Y_IDX"(%3), %%mm6"), so the arithmetic is
 * evaluated by the assembler, not by the C compiler.
 * The quadword loaded from each displacement is used as the pmaddwd
 * multiplier for the luma pass (BGR2Y_IDX) and for the two chroma results
 * that are stored through udst (BGR2U_IDX) and vdst (BGR2V_IDX).
 * NOTE(review): the layout of the table behind these offsets is not visible
 * here -- confirm against the code that fills rgb2yuv before changing them.
 */
1621 #define BGR2Y_IDX "16*4+16*32"
1622 #define BGR2U_IDX "16*4+16*33"
1623 #define BGR2V_IDX "16*4+16*34"
1625 const x86_reg chromWidth= width>>1;
1626 for (y=0; y<height-2; y+=2) {
1628 for (i=0; i<2; i++) {
1630 "mov %2, %%"REG_a
" \n\t"
1631 "movq "BGR2Y_IDX
"(%3), %%mm6 \n\t"
1632 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1633 "pxor %%mm7, %%mm7 \n\t"
1634 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1638 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1639 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
1640 "punpcklbw %%mm7, %%mm0 \n\t"
1641 "punpcklbw %%mm7, %%mm1 \n\t"
1642 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
1643 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
1644 "punpcklbw %%mm7, %%mm2 \n\t"
1645 "punpcklbw %%mm7, %%mm3 \n\t"
1646 "pmaddwd %%mm6, %%mm0 \n\t"
1647 "pmaddwd %%mm6, %%mm1 \n\t"
1648 "pmaddwd %%mm6, %%mm2 \n\t"
1649 "pmaddwd %%mm6, %%mm3 \n\t"
1650 "psrad $8, %%mm0 \n\t"
1651 "psrad $8, %%mm1 \n\t"
1652 "psrad $8, %%mm2 \n\t"
1653 "psrad $8, %%mm3 \n\t"
1654 "packssdw %%mm1, %%mm0 \n\t"
1655 "packssdw %%mm3, %%mm2 \n\t"
1656 "pmaddwd %%mm5, %%mm0 \n\t"
1657 "pmaddwd %%mm5, %%mm2 \n\t"
1658 "packssdw %%mm2, %%mm0 \n\t"
1659 "psraw $7, %%mm0 \n\t"
1661 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1662 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
1663 "punpcklbw %%mm7, %%mm4 \n\t"
1664 "punpcklbw %%mm7, %%mm1 \n\t"
1665 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
1666 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
1667 "punpcklbw %%mm7, %%mm2 \n\t"
1668 "punpcklbw %%mm7, %%mm3 \n\t"
1669 "pmaddwd %%mm6, %%mm4 \n\t"
1670 "pmaddwd %%mm6, %%mm1 \n\t"
1671 "pmaddwd %%mm6, %%mm2 \n\t"
1672 "pmaddwd %%mm6, %%mm3 \n\t"
1673 "psrad $8, %%mm4 \n\t"
1674 "psrad $8, %%mm1 \n\t"
1675 "psrad $8, %%mm2 \n\t"
1676 "psrad $8, %%mm3 \n\t"
1677 "packssdw %%mm1, %%mm4 \n\t"
1678 "packssdw %%mm3, %%mm2 \n\t"
1679 "pmaddwd %%mm5, %%mm4 \n\t"
1680 "pmaddwd %%mm5, %%mm2 \n\t"
1681 "add $24, %%"REG_d
" \n\t"
1682 "packssdw %%mm2, %%mm4 \n\t"
1683 "psraw $7, %%mm4 \n\t"
1685 "packuswb %%mm4, %%mm0 \n\t"
1686 "paddusb "MANGLE(ff_bgr2YOffset)
", %%mm0 \n\t"
1688 MOVNTQ" %%mm0, (%1, %%"REG_a
") \n\t"
1689 "add $8, %%"REG_a
" \n\t"
1691 : :
"r" (src+width*3),
"r" (ydst+width),
"g" ((
x86_reg)-width),
"r"(rgb2yuv)
1692 :
"%"REG_a,
"%"REG_d
1699 "mov %4, %%"REG_a
" \n\t"
1700 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1701 "movq "BGR2U_IDX
"(%5), %%mm6 \n\t"
1702 "pxor %%mm7, %%mm7 \n\t"
1703 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1704 "add %%"REG_d
", %%"REG_d
" \n\t"
1709 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1710 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
1711 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
1712 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
1713 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
1714 PAVGB" %%mm1, %%mm0 \n\t"
1715 PAVGB" %%mm3, %%mm2 \n\t"
1716 "movq %%mm0, %%mm1 \n\t"
1717 "movq %%mm2, %%mm3 \n\t"
1718 "psrlq $24, %%mm0 \n\t"
1719 "psrlq $24, %%mm2 \n\t"
1720 PAVGB" %%mm1, %%mm0 \n\t"
1721 PAVGB" %%mm3, %%mm2 \n\t"
1722 "punpcklbw %%mm7, %%mm0 \n\t"
1723 "punpcklbw %%mm7, %%mm2 \n\t"
1725 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1726 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
1727 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
1728 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
1729 "punpcklbw %%mm7, %%mm0 \n\t"
1730 "punpcklbw %%mm7, %%mm1 \n\t"
1731 "punpcklbw %%mm7, %%mm2 \n\t"
1732 "punpcklbw %%mm7, %%mm3 \n\t"
1733 "paddw %%mm1, %%mm0 \n\t"
1734 "paddw %%mm3, %%mm2 \n\t"
1735 "paddw %%mm2, %%mm0 \n\t"
1736 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
1737 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
1738 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
1739 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
1740 "punpcklbw %%mm7, %%mm4 \n\t"
1741 "punpcklbw %%mm7, %%mm1 \n\t"
1742 "punpcklbw %%mm7, %%mm2 \n\t"
1743 "punpcklbw %%mm7, %%mm3 \n\t"
1744 "paddw %%mm1, %%mm4 \n\t"
1745 "paddw %%mm3, %%mm2 \n\t"
1746 "paddw %%mm4, %%mm2 \n\t"
1747 "psrlw $2, %%mm0 \n\t"
1748 "psrlw $2, %%mm2 \n\t"
1750 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t"
1751 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t"
1753 "pmaddwd %%mm0, %%mm1 \n\t"
1754 "pmaddwd %%mm2, %%mm3 \n\t"
1755 "pmaddwd %%mm6, %%mm0 \n\t"
1756 "pmaddwd %%mm6, %%mm2 \n\t"
1757 "psrad $8, %%mm0 \n\t"
1758 "psrad $8, %%mm1 \n\t"
1759 "psrad $8, %%mm2 \n\t"
1760 "psrad $8, %%mm3 \n\t"
1761 "packssdw %%mm2, %%mm0 \n\t"
1762 "packssdw %%mm3, %%mm1 \n\t"
1763 "pmaddwd %%mm5, %%mm0 \n\t"
1764 "pmaddwd %%mm5, %%mm1 \n\t"
1765 "packssdw %%mm1, %%mm0 \n\t"
1766 "psraw $7, %%mm0 \n\t"
1768 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1769 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
1770 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
1771 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
1772 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
1773 PAVGB" %%mm1, %%mm4 \n\t"
1774 PAVGB" %%mm3, %%mm2 \n\t"
1775 "movq %%mm4, %%mm1 \n\t"
1776 "movq %%mm2, %%mm3 \n\t"
1777 "psrlq $24, %%mm4 \n\t"
1778 "psrlq $24, %%mm2 \n\t"
1779 PAVGB" %%mm1, %%mm4 \n\t"
1780 PAVGB" %%mm3, %%mm2 \n\t"
1781 "punpcklbw %%mm7, %%mm4 \n\t"
1782 "punpcklbw %%mm7, %%mm2 \n\t"
1784 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1785 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
1786 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
1787 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
1788 "punpcklbw %%mm7, %%mm4 \n\t"
1789 "punpcklbw %%mm7, %%mm1 \n\t"
1790 "punpcklbw %%mm7, %%mm2 \n\t"
1791 "punpcklbw %%mm7, %%mm3 \n\t"
1792 "paddw %%mm1, %%mm4 \n\t"
1793 "paddw %%mm3, %%mm2 \n\t"
1794 "paddw %%mm2, %%mm4 \n\t"
1795 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
1796 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
1797 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
1798 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
1799 "punpcklbw %%mm7, %%mm5 \n\t"
1800 "punpcklbw %%mm7, %%mm1 \n\t"
1801 "punpcklbw %%mm7, %%mm2 \n\t"
1802 "punpcklbw %%mm7, %%mm3 \n\t"
1803 "paddw %%mm1, %%mm5 \n\t"
1804 "paddw %%mm3, %%mm2 \n\t"
1805 "paddw %%mm5, %%mm2 \n\t"
1806 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1807 "psrlw $2, %%mm4 \n\t"
1808 "psrlw $2, %%mm2 \n\t"
1810 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t"
1811 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t"
1813 "pmaddwd %%mm4, %%mm1 \n\t"
1814 "pmaddwd %%mm2, %%mm3 \n\t"
1815 "pmaddwd %%mm6, %%mm4 \n\t"
1816 "pmaddwd %%mm6, %%mm2 \n\t"
1817 "psrad $8, %%mm4 \n\t"
1818 "psrad $8, %%mm1 \n\t"
1819 "psrad $8, %%mm2 \n\t"
1820 "psrad $8, %%mm3 \n\t"
1821 "packssdw %%mm2, %%mm4 \n\t"
1822 "packssdw %%mm3, %%mm1 \n\t"
1823 "pmaddwd %%mm5, %%mm4 \n\t"
1824 "pmaddwd %%mm5, %%mm1 \n\t"
1825 "add $24, %%"REG_d
" \n\t"
1826 "packssdw %%mm1, %%mm4 \n\t"
1827 "psraw $7, %%mm4 \n\t"
1829 "movq %%mm0, %%mm1 \n\t"
1830 "punpckldq %%mm4, %%mm0 \n\t"
1831 "punpckhdq %%mm4, %%mm1 \n\t"
1832 "packsswb %%mm1, %%mm0 \n\t"
1833 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t"
1834 "movd %%mm0, (%2, %%"REG_a
") \n\t"
1835 "punpckhdq %%mm0, %%mm0 \n\t"
1836 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1837 "add $4, %%"REG_a
" \n\t"
1839 : :
"r" (src+chromWidth*6),
"r" (src+srcStride+chromWidth*6),
"r" (udst+chromWidth),
"r" (vdst+chromWidth),
"g" (-chromWidth),
"r"(rgb2yuv)
1840 :
"%"REG_a,
"%"REG_d
1843 udst += chromStride;
1844 vdst += chromStride;
1848 __asm__
volatile(
EMMS" \n\t"
1852 ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
1857 #if !COMPILE_TEMPLATE_AMD3DNOW
1860 int src2Stride,
int dstStride)
1864 for (h=0; h <
height; h++) {
1867 #if COMPILE_TEMPLATE_SSE2
1869 "xor %%"REG_a
", %%"REG_a
" \n\t"
1873 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
1874 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
1875 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
1876 "punpcklbw %%xmm2, %%xmm0 \n\t"
1877 "punpckhbw %%xmm2, %%xmm1 \n\t"
1878 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
1879 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
1880 "add $16, %%"REG_a
" \n\t"
1881 "cmp %3, %%"REG_a
" \n\t"
1883 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
1884 :
"memory",
"%"REG_a
""
1888 "xor %%"REG_a
", %%"REG_a
" \n\t"
1892 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1893 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
1894 "movq %%mm0, %%mm1 \n\t"
1895 "movq %%mm2, %%mm3 \n\t"
1896 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
1897 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
1898 "punpcklbw %%mm4, %%mm0 \n\t"
1899 "punpckhbw %%mm4, %%mm1 \n\t"
1900 "punpcklbw %%mm5, %%mm2 \n\t"
1901 "punpckhbw %%mm5, %%mm3 \n\t"
1902 MOVNTQ" %%mm0, (%0, %%"REG_a
", 2) \n\t"
1903 MOVNTQ" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
1904 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
1905 MOVNTQ" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
1906 "add $16, %%"REG_a
" \n\t"
1907 "cmp %3, %%"REG_a
" \n\t"
1909 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
1910 :
"memory",
"%"REG_a
1913 for (w= (width&(~15)); w <
width; w++) {
1914 dest[2*w+0] = src1[w];
1915 dest[2*w+1] = src2[w];
1929 #if !COMPILE_TEMPLATE_SSE2
1930 #if !COMPILE_TEMPLATE_AMD3DNOW
1934 int srcStride1,
int srcStride2,
1935 int dstStride1,
int dstStride2)
1939 w=width/2; h=height/2;
1943 ::
"m"(*(src1+srcStride1)),
"m"(*(src2+srcStride2)):
"memory");
1945 const uint8_t*
s1=src1+srcStride1*(y>>1);
1948 for (;x<w-31;x+=32) {
1951 "movq (%1,%2), %%mm0 \n\t"
1952 "movq 8(%1,%2), %%mm2 \n\t"
1953 "movq 16(%1,%2), %%mm4 \n\t"
1954 "movq 24(%1,%2), %%mm6 \n\t"
1955 "movq %%mm0, %%mm1 \n\t"
1956 "movq %%mm2, %%mm3 \n\t"
1957 "movq %%mm4, %%mm5 \n\t"
1958 "movq %%mm6, %%mm7 \n\t"
1959 "punpcklbw %%mm0, %%mm0 \n\t"
1960 "punpckhbw %%mm1, %%mm1 \n\t"
1961 "punpcklbw %%mm2, %%mm2 \n\t"
1962 "punpckhbw %%mm3, %%mm3 \n\t"
1963 "punpcklbw %%mm4, %%mm4 \n\t"
1964 "punpckhbw %%mm5, %%mm5 \n\t"
1965 "punpcklbw %%mm6, %%mm6 \n\t"
1966 "punpckhbw %%mm7, %%mm7 \n\t"
1967 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1968 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1969 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1970 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1971 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1972 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1973 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1974 MOVNTQ" %%mm7, 56(%0,%2,2)"
1975 ::
"r"(d),
"r"(s1),
"r"(x)
1978 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
1981 const uint8_t*
s2=src2+srcStride2*(y>>1);
1984 for (;x<w-31;x+=32) {
1987 "movq (%1,%2), %%mm0 \n\t"
1988 "movq 8(%1,%2), %%mm2 \n\t"
1989 "movq 16(%1,%2), %%mm4 \n\t"
1990 "movq 24(%1,%2), %%mm6 \n\t"
1991 "movq %%mm0, %%mm1 \n\t"
1992 "movq %%mm2, %%mm3 \n\t"
1993 "movq %%mm4, %%mm5 \n\t"
1994 "movq %%mm6, %%mm7 \n\t"
1995 "punpcklbw %%mm0, %%mm0 \n\t"
1996 "punpckhbw %%mm1, %%mm1 \n\t"
1997 "punpcklbw %%mm2, %%mm2 \n\t"
1998 "punpckhbw %%mm3, %%mm3 \n\t"
1999 "punpcklbw %%mm4, %%mm4 \n\t"
2000 "punpckhbw %%mm5, %%mm5 \n\t"
2001 "punpcklbw %%mm6, %%mm6 \n\t"
2002 "punpckhbw %%mm7, %%mm7 \n\t"
2003 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2004 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2005 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2006 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2007 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2008 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2009 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2010 MOVNTQ" %%mm7, 56(%0,%2,2)"
2011 ::
"r"(d),
"r"(s2),
"r"(x)
2014 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2026 int srcStride1,
int srcStride2,
2027 int srcStride3,
int dstStride)
2033 const uint8_t* yp=src1+srcStride1*
y;
2034 const uint8_t* up=src2+srcStride2*(y>>2);
2035 const uint8_t* vp=src3+srcStride3*(y>>2);
2043 "movq (%1, %0, 4), %%mm0 \n\t"
2044 "movq (%2, %0), %%mm1 \n\t"
2045 "movq (%3, %0), %%mm2 \n\t"
2046 "movq %%mm0, %%mm3 \n\t"
2047 "movq %%mm1, %%mm4 \n\t"
2048 "movq %%mm2, %%mm5 \n\t"
2049 "punpcklbw %%mm1, %%mm1 \n\t"
2050 "punpcklbw %%mm2, %%mm2 \n\t"
2051 "punpckhbw %%mm4, %%mm4 \n\t"
2052 "punpckhbw %%mm5, %%mm5 \n\t"
2054 "movq %%mm1, %%mm6 \n\t"
2055 "punpcklbw %%mm2, %%mm1 \n\t"
2056 "punpcklbw %%mm1, %%mm0 \n\t"
2057 "punpckhbw %%mm1, %%mm3 \n\t"
2058 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2059 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2061 "punpckhbw %%mm2, %%mm6 \n\t"
2062 "movq 8(%1, %0, 4), %%mm0 \n\t"
2063 "movq %%mm0, %%mm3 \n\t"
2064 "punpcklbw %%mm6, %%mm0 \n\t"
2065 "punpckhbw %%mm6, %%mm3 \n\t"
2066 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2067 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2069 "movq %%mm4, %%mm6 \n\t"
2070 "movq 16(%1, %0, 4), %%mm0 \n\t"
2071 "movq %%mm0, %%mm3 \n\t"
2072 "punpcklbw %%mm5, %%mm4 \n\t"
2073 "punpcklbw %%mm4, %%mm0 \n\t"
2074 "punpckhbw %%mm4, %%mm3 \n\t"
2075 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2076 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2078 "punpckhbw %%mm5, %%mm6 \n\t"
2079 "movq 24(%1, %0, 4), %%mm0 \n\t"
2080 "movq %%mm0, %%mm3 \n\t"
2081 "punpcklbw %%mm6, %%mm0 \n\t"
2082 "punpckhbw %%mm6, %%mm3 \n\t"
2083 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2084 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2087 :
"r"(yp),
"r" (up),
"r"(vp),
"r"(d)
2091 const int x2 = x<<2;
2094 d[8*x+2] = yp[x2+1];
2096 d[8*x+4] = yp[x2+2];
2098 d[8*x+6] = yp[x2+3];
2119 "pcmpeqw %%mm7, %%mm7 \n\t"
2120 "psrlw $8, %%mm7 \n\t"
2122 "movq -30(%1, %0, 2), %%mm0 \n\t"
2123 "movq -22(%1, %0, 2), %%mm1 \n\t"
2124 "movq -14(%1, %0, 2), %%mm2 \n\t"
2125 "movq -6(%1, %0, 2), %%mm3 \n\t"
2126 "pand %%mm7, %%mm0 \n\t"
2127 "pand %%mm7, %%mm1 \n\t"
2128 "pand %%mm7, %%mm2 \n\t"
2129 "pand %%mm7, %%mm3 \n\t"
2130 "packuswb %%mm1, %%mm0 \n\t"
2131 "packuswb %%mm3, %%mm2 \n\t"
2132 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2133 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2137 :
"r"(src),
"r"(dst)
2147 #if !COMPILE_TEMPLATE_AMD3DNOW
2157 "pcmpeqw %%mm7, %%mm7 \n\t"
2158 "psrlw $8, %%mm7 \n\t"
2160 "movq -28(%1, %0, 4), %%mm0 \n\t"
2161 "movq -20(%1, %0, 4), %%mm1 \n\t"
2162 "movq -12(%1, %0, 4), %%mm2 \n\t"
2163 "movq -4(%1, %0, 4), %%mm3 \n\t"
2164 "pand %%mm7, %%mm0 \n\t"
2165 "pand %%mm7, %%mm1 \n\t"
2166 "pand %%mm7, %%mm2 \n\t"
2167 "pand %%mm7, %%mm3 \n\t"
2168 "packuswb %%mm1, %%mm0 \n\t"
2169 "packuswb %%mm3, %%mm2 \n\t"
2170 "movq %%mm0, %%mm1 \n\t"
2171 "movq %%mm2, %%mm3 \n\t"
2172 "psrlw $8, %%mm0 \n\t"
2173 "psrlw $8, %%mm2 \n\t"
2174 "pand %%mm7, %%mm1 \n\t"
2175 "pand %%mm7, %%mm3 \n\t"
2176 "packuswb %%mm2, %%mm0 \n\t"
2177 "packuswb %%mm3, %%mm1 \n\t"
2178 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2179 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2183 :
"r"(src),
"r"(dst0),
"r"(dst1)
2206 "pcmpeqw %%mm7, %%mm7 \n\t"
2207 "psrlw $8, %%mm7 \n\t"
2209 "movq -28(%1, %0, 4), %%mm0 \n\t"
2210 "movq -20(%1, %0, 4), %%mm1 \n\t"
2211 "movq -12(%1, %0, 4), %%mm2 \n\t"
2212 "movq -4(%1, %0, 4), %%mm3 \n\t"
2213 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2214 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2215 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2216 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2217 "pand %%mm7, %%mm0 \n\t"
2218 "pand %%mm7, %%mm1 \n\t"
2219 "pand %%mm7, %%mm2 \n\t"
2220 "pand %%mm7, %%mm3 \n\t"
2221 "packuswb %%mm1, %%mm0 \n\t"
2222 "packuswb %%mm3, %%mm2 \n\t"
2223 "movq %%mm0, %%mm1 \n\t"
2224 "movq %%mm2, %%mm3 \n\t"
2225 "psrlw $8, %%mm0 \n\t"
2226 "psrlw $8, %%mm2 \n\t"
2227 "pand %%mm7, %%mm1 \n\t"
2228 "pand %%mm7, %%mm3 \n\t"
2229 "packuswb %%mm2, %%mm0 \n\t"
2230 "packuswb %%mm3, %%mm1 \n\t"
2231 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2232 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2236 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2248 #if !COMPILE_TEMPLATE_AMD3DNOW
2258 "pcmpeqw %%mm7, %%mm7 \n\t"
2259 "psrlw $8, %%mm7 \n\t"
2261 "movq -28(%1, %0, 4), %%mm0 \n\t"
2262 "movq -20(%1, %0, 4), %%mm1 \n\t"
2263 "movq -12(%1, %0, 4), %%mm2 \n\t"
2264 "movq -4(%1, %0, 4), %%mm3 \n\t"
2265 "psrlw $8, %%mm0 \n\t"
2266 "psrlw $8, %%mm1 \n\t"
2267 "psrlw $8, %%mm2 \n\t"
2268 "psrlw $8, %%mm3 \n\t"
2269 "packuswb %%mm1, %%mm0 \n\t"
2270 "packuswb %%mm3, %%mm2 \n\t"
2271 "movq %%mm0, %%mm1 \n\t"
2272 "movq %%mm2, %%mm3 \n\t"
2273 "psrlw $8, %%mm0 \n\t"
2274 "psrlw $8, %%mm2 \n\t"
2275 "pand %%mm7, %%mm1 \n\t"
2276 "pand %%mm7, %%mm3 \n\t"
2277 "packuswb %%mm2, %%mm0 \n\t"
2278 "packuswb %%mm3, %%mm1 \n\t"
2279 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2280 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2284 :
"r"(src),
"r"(dst0),
"r"(dst1)
2308 "pcmpeqw %%mm7, %%mm7 \n\t"
2309 "psrlw $8, %%mm7 \n\t"
2311 "movq -28(%1, %0, 4), %%mm0 \n\t"
2312 "movq -20(%1, %0, 4), %%mm1 \n\t"
2313 "movq -12(%1, %0, 4), %%mm2 \n\t"
2314 "movq -4(%1, %0, 4), %%mm3 \n\t"
2315 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2316 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2317 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2318 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2319 "psrlw $8, %%mm0 \n\t"
2320 "psrlw $8, %%mm1 \n\t"
2321 "psrlw $8, %%mm2 \n\t"
2322 "psrlw $8, %%mm3 \n\t"
2323 "packuswb %%mm1, %%mm0 \n\t"
2324 "packuswb %%mm3, %%mm2 \n\t"
2325 "movq %%mm0, %%mm1 \n\t"
2326 "movq %%mm2, %%mm3 \n\t"
2327 "psrlw $8, %%mm0 \n\t"
2328 "psrlw $8, %%mm2 \n\t"
2329 "pand %%mm7, %%mm1 \n\t"
2330 "pand %%mm7, %%mm3 \n\t"
2331 "packuswb %%mm2, %%mm0 \n\t"
2332 "packuswb %%mm3, %%mm1 \n\t"
2333 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2334 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2338 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2354 int lumStride,
int chromStride,
int srcStride)
2359 for (y=0; y<
height; y++) {
2377 #if !COMPILE_TEMPLATE_AMD3DNOW
2380 int lumStride,
int chromStride,
int srcStride)
2385 for (y=0; y<
height; y++) {
2404 int lumStride,
int chromStride,
int srcStride)
2409 for (y=0; y<
height; y++) {
2427 #if !COMPILE_TEMPLATE_AMD3DNOW
2430 int lumStride,
int chromStride,
int srcStride)
2435 for (y=0; y<
height; y++) {
2455 #if !COMPILE_TEMPLATE_SSE2
2456 #if !COMPILE_TEMPLATE_AMD3DNOW
2486 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2497 #if !COMPILE_TEMPLATE_AMD3DNOW
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv)
Height should be a multiple of 2 and width should be a multiple of 2.
void(* ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv)
Height should be a multiple of 2 and width should be a multiple of 2.
void(* planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride)
void(* yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
void(* yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
static void RENAME() extract_even2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
void(* rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
void(* rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
void(* uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
void(* uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
void(* yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
static av_cold int end(AVCodecContext *avctx)
#define FF_CEIL_RSHIFT(a, b)
void(* rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_odd2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
void(* yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
void(* interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, int width, int height, int src1Stride, int src2Stride, int dstStride)
void(* rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
void(* rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
void(* rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
static av_cold void RENAME() rgb2rgb_init(void)
static void RENAME() yuvPlanartouyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
void(* rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_even2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
void(* yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
void(* rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
typedef void(RENAME(mix_any_func_type))
static void RENAME() yuvPlanartoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
void(* rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
void(* vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride1, int srcStride2, int dstStride1, int dstStride2)
static void RENAME() extract_even(const uint8_t *src, uint8_t *dst, x86_reg count)
void(* rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
BYTE int const BYTE int int int height
void(* shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
void(* rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
void(* yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
static void RENAME() extract_odd2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
void(* yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
void(* rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
void(* yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, uint8_t *dst, int width, int height, int srcStride1, int srcStride2, int srcStride3, int dstStride)
void(* rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
void(* rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
void(* rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
void(* rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
void(* rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)