mirror of
https://github.com/godotengine/godot.git
synced 2024-11-10 06:03:09 +00:00
Update libtheora to GIT (2020.10)
This commit is contained in:
parent
92bcd3c01d
commit
b87584a070
@ -15,7 +15,7 @@ if env["builtin_libtheora"]:
|
|||||||
# "analyze.c",
|
# "analyze.c",
|
||||||
# "apiwrapper.c",
|
# "apiwrapper.c",
|
||||||
"bitpack.c",
|
"bitpack.c",
|
||||||
"cpu.c",
|
# "collect.c",
|
||||||
# "decapiwrapper.c",
|
# "decapiwrapper.c",
|
||||||
"decinfo.c",
|
"decinfo.c",
|
||||||
"decode.c",
|
"decode.c",
|
||||||
@ -47,8 +47,12 @@ if env["builtin_libtheora"]:
|
|||||||
"x86/mmxfrag.c",
|
"x86/mmxfrag.c",
|
||||||
"x86/mmxidct.c",
|
"x86/mmxidct.c",
|
||||||
"x86/mmxstate.c",
|
"x86/mmxstate.c",
|
||||||
|
# "x86/sse2encfrag.c",
|
||||||
# "x86/sse2fdct.c",
|
# "x86/sse2fdct.c",
|
||||||
|
"x86/sse2idct.c",
|
||||||
|
"x86/x86cpu.c",
|
||||||
# "x86/x86enc.c",
|
# "x86/x86enc.c",
|
||||||
|
# "x86/x86enquant.c"
|
||||||
"x86/x86state.c",
|
"x86/x86state.c",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -58,6 +62,7 @@ if env["builtin_libtheora"]:
|
|||||||
"x86_vc/mmxfrag.c",
|
"x86_vc/mmxfrag.c",
|
||||||
"x86_vc/mmxidct.c",
|
"x86_vc/mmxidct.c",
|
||||||
"x86_vc/mmxstate.c",
|
"x86_vc/mmxstate.c",
|
||||||
|
"x86_vc/x86cpu.c",
|
||||||
# "x86_vc/x86enc.c",
|
# "x86_vc/x86enc.c",
|
||||||
"x86_vc/x86state.c",
|
"x86_vc/x86state.c",
|
||||||
]
|
]
|
||||||
|
7
thirdparty/README.md
vendored
7
thirdparty/README.md
vendored
@ -291,18 +291,15 @@ Files extracted from upstream source:
|
|||||||
## libtheora
|
## libtheora
|
||||||
|
|
||||||
- Upstream: https://www.theora.org
|
- Upstream: https://www.theora.org
|
||||||
- Version: 1.1.1 (2010)
|
- Version: git (7180717276af1ebc7da15c83162d6c5d6203aabf, 2020)
|
||||||
- License: BSD-3-Clause
|
- License: BSD-3-Clause
|
||||||
|
|
||||||
Files extracted from upstream source:
|
Files extracted from upstream source:
|
||||||
|
|
||||||
- all .c, .h in lib/
|
- all .c, .h in lib/, except arm/ and c64x/ folders
|
||||||
- all .h files in include/theora/ as theora/
|
- all .h files in include/theora/ as theora/
|
||||||
- COPYING and LICENSE
|
- COPYING and LICENSE
|
||||||
|
|
||||||
Upstream patches included in the `patches` directory have been applied
|
|
||||||
on top of the 1.1.1 source (not included in any stable release yet).
|
|
||||||
|
|
||||||
|
|
||||||
## libvorbis
|
## libvorbis
|
||||||
|
|
||||||
|
8
thirdparty/libtheora/LICENSE
vendored
8
thirdparty/libtheora/LICENSE
vendored
@ -4,13 +4,13 @@ In addition to and irrespective of the copyright license associated
|
|||||||
with this software, On2 Technologies, Inc. makes the following statement
|
with this software, On2 Technologies, Inc. makes the following statement
|
||||||
regarding technology used in this software:
|
regarding technology used in this software:
|
||||||
|
|
||||||
On2 represents and warrants that it shall not assert any rights
|
On2 represents and warrants that it shall not assert any rights
|
||||||
relating to infringement of On2's registered patents, nor initiate
|
relating to infringement of On2's registered patents, nor initiate
|
||||||
any litigation asserting such rights, against any person who, or
|
any litigation asserting such rights, against any person who, or
|
||||||
entity which utilizes the On2 VP3 Codec Software, including any
|
entity which utilizes the On2 VP3 Codec Software, including any
|
||||||
use, distribution, and sale of said Software; which make changes,
|
use, distribution, and sale of said Software; which make changes,
|
||||||
modifications, and improvements in said Software; and to use,
|
modifications, and improvements in said Software; and to use,
|
||||||
distribute, and sell said changes as well as applications for other
|
distribute, and sell said changes as well as applications for other
|
||||||
fields of use.
|
fields of use.
|
||||||
|
|
||||||
This reference implementation is originally derived from the On2 VP3
|
This reference implementation is originally derived from the On2 VP3
|
||||||
|
2299
thirdparty/libtheora/analyze.c
vendored
2299
thirdparty/libtheora/analyze.c
vendored
File diff suppressed because it is too large
Load Diff
2
thirdparty/libtheora/apiwrapper.c
vendored
2
thirdparty/libtheora/apiwrapper.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: apiwrapper.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
|
2
thirdparty/libtheora/apiwrapper.h
vendored
2
thirdparty/libtheora/apiwrapper.h
vendored
@ -21,7 +21,7 @@
|
|||||||
# include <theora/theora.h>
|
# include <theora/theora.h>
|
||||||
# include "theora/theoradec.h"
|
# include "theora/theoradec.h"
|
||||||
# include "theora/theoraenc.h"
|
# include "theora/theoraenc.h"
|
||||||
# include "internal.h"
|
# include "state.h"
|
||||||
|
|
||||||
typedef struct th_api_wrapper th_api_wrapper;
|
typedef struct th_api_wrapper th_api_wrapper;
|
||||||
typedef struct th_api_info th_api_info;
|
typedef struct th_api_info th_api_info;
|
||||||
|
23
thirdparty/libtheora/bitpack.c
vendored
23
thirdparty/libtheora/bitpack.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function: packing variable sized words into an octet stream
|
function: packing variable sized words into an octet stream
|
||||||
last mod: $Id: bitpack.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@ -32,15 +32,18 @@ static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
|
|||||||
const unsigned char *stop;
|
const unsigned char *stop;
|
||||||
oc_pb_window window;
|
oc_pb_window window;
|
||||||
int available;
|
int available;
|
||||||
|
unsigned shift;
|
||||||
|
stop=_b->stop;
|
||||||
|
ptr=_b->ptr;
|
||||||
window=_b->window;
|
window=_b->window;
|
||||||
available=_b->bits;
|
available=_b->bits;
|
||||||
ptr=_b->ptr;
|
shift=OC_PB_WINDOW_SIZE-available;
|
||||||
stop=_b->stop;
|
while(7<shift&&ptr<stop){
|
||||||
while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){
|
shift-=8;
|
||||||
available+=8;
|
window|=(oc_pb_window)*ptr++<<shift;
|
||||||
window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
|
|
||||||
}
|
}
|
||||||
_b->ptr=ptr;
|
_b->ptr=ptr;
|
||||||
|
available=OC_PB_WINDOW_SIZE-shift;
|
||||||
if(_bits>available){
|
if(_bits>available){
|
||||||
if(ptr>=stop){
|
if(ptr>=stop){
|
||||||
_b->eof=1;
|
_b->eof=1;
|
||||||
@ -67,7 +70,7 @@ void oc_pack_adv1(oc_pack_buf *_b){
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*Here we assume that 0<=_bits&&_bits<=32.*/
|
/*Here we assume that 0<=_bits&&_bits<=32.*/
|
||||||
long oc_pack_read(oc_pack_buf *_b,int _bits){
|
long oc_pack_read_c(oc_pack_buf *_b,int _bits){
|
||||||
oc_pb_window window;
|
oc_pb_window window;
|
||||||
int available;
|
int available;
|
||||||
long result;
|
long result;
|
||||||
@ -82,12 +85,12 @@ long oc_pack_read(oc_pack_buf *_b,int _bits){
|
|||||||
available-=_bits;
|
available-=_bits;
|
||||||
window<<=1;
|
window<<=1;
|
||||||
window<<=_bits-1;
|
window<<=_bits-1;
|
||||||
_b->bits=available;
|
|
||||||
_b->window=window;
|
_b->window=window;
|
||||||
|
_b->bits=available;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
int oc_pack_read1(oc_pack_buf *_b){
|
int oc_pack_read1_c(oc_pack_buf *_b){
|
||||||
oc_pb_window window;
|
oc_pb_window window;
|
||||||
int available;
|
int available;
|
||||||
int result;
|
int result;
|
||||||
@ -100,8 +103,8 @@ int oc_pack_read1(oc_pack_buf *_b){
|
|||||||
result=window>>OC_PB_WINDOW_SIZE-1;
|
result=window>>OC_PB_WINDOW_SIZE-1;
|
||||||
available--;
|
available--;
|
||||||
window<<=1;
|
window<<=1;
|
||||||
_b->bits=available;
|
|
||||||
_b->window=window;
|
_b->window=window;
|
||||||
|
_b->bits=available;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
27
thirdparty/libtheora/bitpack.h
vendored
27
thirdparty/libtheora/bitpack.h
vendored
@ -16,15 +16,32 @@
|
|||||||
********************************************************************/
|
********************************************************************/
|
||||||
#if !defined(_bitpack_H)
|
#if !defined(_bitpack_H)
|
||||||
# define _bitpack_H (1)
|
# define _bitpack_H (1)
|
||||||
|
# include <stddef.h>
|
||||||
# include <limits.h>
|
# include <limits.h>
|
||||||
|
# include "internal.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
typedef unsigned long oc_pb_window;
|
typedef size_t oc_pb_window;
|
||||||
typedef struct oc_pack_buf oc_pack_buf;
|
typedef struct oc_pack_buf oc_pack_buf;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Custom bitpacker implementations.*/
|
||||||
|
# if defined(OC_ARM_ASM)
|
||||||
|
# include "arm/armbits.h"
|
||||||
|
# endif
|
||||||
|
|
||||||
|
# if !defined(oc_pack_read)
|
||||||
|
# define oc_pack_read oc_pack_read_c
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_pack_read1)
|
||||||
|
# define oc_pack_read1 oc_pack_read1_c
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_huff_token_decode)
|
||||||
|
# define oc_huff_token_decode oc_huff_token_decode_c
|
||||||
|
# endif
|
||||||
|
|
||||||
# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
|
# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
|
||||||
/*This is meant to be a large, positive constant that can still be efficiently
|
/*This is meant to be a large, positive constant that can still be efficiently
|
||||||
loaded as an immediate (on platforms like ARM, for example).
|
loaded as an immediate (on platforms like ARM, for example).
|
||||||
@ -34,9 +51,9 @@ typedef struct oc_pack_buf oc_pack_buf;
|
|||||||
|
|
||||||
|
|
||||||
struct oc_pack_buf{
|
struct oc_pack_buf{
|
||||||
oc_pb_window window;
|
|
||||||
const unsigned char *ptr;
|
|
||||||
const unsigned char *stop;
|
const unsigned char *stop;
|
||||||
|
const unsigned char *ptr;
|
||||||
|
oc_pb_window window;
|
||||||
int bits;
|
int bits;
|
||||||
int eof;
|
int eof;
|
||||||
};
|
};
|
||||||
@ -45,8 +62,8 @@ void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes);
|
|||||||
int oc_pack_look1(oc_pack_buf *_b);
|
int oc_pack_look1(oc_pack_buf *_b);
|
||||||
void oc_pack_adv1(oc_pack_buf *_b);
|
void oc_pack_adv1(oc_pack_buf *_b);
|
||||||
/*Here we assume 0<=_bits&&_bits<=32.*/
|
/*Here we assume 0<=_bits&&_bits<=32.*/
|
||||||
long oc_pack_read(oc_pack_buf *_b,int _bits);
|
long oc_pack_read_c(oc_pack_buf *_b,int _bits);
|
||||||
int oc_pack_read1(oc_pack_buf *_b);
|
int oc_pack_read1_c(oc_pack_buf *_b);
|
||||||
/* returns -1 for read beyond EOF, or the number of whole bytes available */
|
/* returns -1 for read beyond EOF, or the number of whole bytes available */
|
||||||
long oc_pack_bytes_left(oc_pack_buf *_b);
|
long oc_pack_bytes_left(oc_pack_buf *_b);
|
||||||
|
|
||||||
|
974
thirdparty/libtheora/collect.c
vendored
Normal file
974
thirdparty/libtheora/collect.c
vendored
Normal file
@ -0,0 +1,974 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2011 *
|
||||||
|
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
function: mode selection code
|
||||||
|
last mod: $Id$
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include "collect.h"
|
||||||
|
|
||||||
|
#if defined(OC_COLLECT_METRICS)
|
||||||
|
|
||||||
|
int OC_HAS_MODE_METRICS;
|
||||||
|
double OC_MODE_RD_WEIGHT_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
|
||||||
|
double OC_MODE_RD_WEIGHT_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
|
||||||
|
oc_mode_metrics OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
|
||||||
|
oc_mode_metrics OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
|
||||||
|
const char *OC_MODE_METRICS_FILENAME="modedec.stats";
|
||||||
|
|
||||||
|
/*Folds one weighted sample into a running set of mode metrics.
  The w field holds the total weight and the s, q, r, and d fields hold
   weighted sums, so each of the latter divided by w is the current weighted
   mean.
  The remaining fields are sums of products of deviations from those means
   (central (co-)moments), which have to be corrected whenever a new sample
   shifts the means.
  _metrics: The accumulator, updated in place.
  _w:       The weight of the new sample.
  _s:       The s value of the sample.
  _q:       The q value of the sample.
  _r:       The r value of the sample.
  _d:       The d value of the sample.
  NOTE(review): s, q, r, and d are presumably SATD (or SAD), log-quantizer,
   rate, and RMSE, going by the comment on oc_mode_metrics_update(); confirm
   against the callers.*/
void oc_mode_metrics_add(oc_mode_metrics *_metrics,
 double _w,int _s,int _q,int _r,double _d){
  /*With no prior weight there are no means to deviate from, so the
     (co-)moment sums are left untouched for the very first sample.*/
  if(_metrics->w>0){
    /*Offsets of the new sample from the means accumulated so far.*/
    double w_old=_metrics->w;
    double ds=_s-_metrics->s/w_old;
    double dq=_q-_metrics->q/w_old;
    double dr=_r-_metrics->r/w_old;
    double dd=_d-_metrics->d/w_old;
    double ds2=ds*ds;
    double dq2=dq*dq;
    /*Snapshot the lower-order sums: the higher-order updates below need
       their values from before this sample was added.*/
    double s2=_metrics->s2;
    double sq=_metrics->sq;
    double q2=_metrics->q2;
    double sr=_metrics->sr;
    double qr=_metrics->qr;
    double sd=_metrics->sd;
    double qd=_metrics->qd;
    double s2q=_metrics->s2q;
    double sq2=_metrics->sq2;
    /*Fractions of the new total weight owned by the old data and by the new
       sample, and the scale factors built from them.*/
    double w_tot=w_old+_w;
    double f_old=w_old/w_tot;
    double f_new=_w/w_tot;
    double f_old2=f_old*f_old;
    double f_new2=f_new*f_new;
    double k2=w_old*f_new;
    double k3=k2*(f_old2-f_new2);
    double k4=_w*f_old2*f_old2+w_old*f_new2*f_new2;
    /*Fourth- and third-order sums.*/
    _metrics->s2q2+=-2*(ds*sq2+dq*s2q)*f_new
     +(ds2*q2+4*ds*dq*sq+dq2*s2)*f_new2+ds2*dq2*k4;
    _metrics->s2q+=(-2*ds*sq-dq*s2)*f_new+ds2*dq*k3;
    _metrics->sq2+=(-ds*q2-2*dq*sq)*f_new+ds*dq2*k3;
    _metrics->sqr+=(-ds*qr-dq*sr-dr*sq)*f_new+ds*dq*dr*k3;
    _metrics->sqd+=(-ds*qd-dq*sd-dd*sq)*f_new+ds*dq*dd*k3;
    /*Second-order sums.*/
    _metrics->s2+=ds2*k2;
    _metrics->sq+=ds*dq*k2;
    _metrics->q2+=dq2*k2;
    _metrics->sr+=ds*dr*k2;
    _metrics->qr+=dq*dr*k2;
    _metrics->r2+=dr*dr*k2;
    _metrics->sd+=ds*dd*k2;
    _metrics->qd+=dq*dd*k2;
    _metrics->d2+=dd*dd*k2;
  }
  /*Finally, fold the sample into the total weight and the weighted sums.*/
  _metrics->w+=_w;
  _metrics->s+=_s*_w;
  _metrics->q+=_q*_w;
  _metrics->r+=_r*_w;
  _metrics->d+=_d*_w;
}
|
||||||
|
|
||||||
|
/*Combines several sets of mode metrics into one.
  Sets whose weight is zero are ignored; if every set is empty (or _n is not
   positive), the destination is cleared to all zeros.
  _dst: Receives the combined metrics.
  _src: The array of metric sets to combine.
  _n:   The number of entries in _src.*/
void oc_mode_metrics_merge(oc_mode_metrics *_dst,
 const oc_mode_metrics *_src,int _n){
  int i;
  /*Locate the first set that actually holds data.*/
  i=0;
  while(i<_n&&_src[i].w==0)i++;
  if(i>=_n){
    memset(_dst,0,sizeof(*_dst));
    return;
  }
  /*Start from that set, then fold in each later non-empty one.*/
  memcpy(_dst,_src+i,sizeof(*_dst));
  for(i++;i<_n;i++){
    const oc_mode_metrics *b;
    double ds,dq,dr,dd;
    double ds2,dq2;
    double s2a,sqa,q2a,sra,qra,sda,qda,s2qa,sq2a;
    double s2b,sqb,q2b,srb,qrb,sdb,qdb,s2qb,sq2b;
    double w,wa,wb;
    double rwa,rwb,rwa2,rwb2;
    double rw2,rw3,rw4;
    b=_src+i;
    if(b->w==0)continue;
    wa=_dst->w;
    wb=b->w;
    /*Differences between the means of the two sets.*/
    ds=b->s/wb-_dst->s/wa;
    dq=b->q/wb-_dst->q/wa;
    dr=b->r/wb-_dst->r/wa;
    dd=b->d/wb-_dst->d/wa;
    ds2=ds*ds;
    dq2=dq*dq;
    /*Snapshot the lower-order sums of both sets ("a" is the destination so
       far, "b" is the set being merged), since the destination's copies are
       overwritten below.*/
    s2a=_dst->s2;
    sqa=_dst->sq;
    q2a=_dst->q2;
    sra=_dst->sr;
    qra=_dst->qr;
    sda=_dst->sd;
    qda=_dst->qd;
    s2qa=_dst->s2q;
    sq2a=_dst->sq2;
    s2b=b->s2;
    sqb=b->sq;
    q2b=b->q2;
    srb=b->sr;
    qrb=b->qr;
    sdb=b->sd;
    qdb=b->qd;
    s2qb=b->s2q;
    sq2b=b->sq2;
    /*Relative weights of the two sets; both are forced to zero if the
       weights cancel, to avoid dividing by zero.*/
    w=wa+wb;
    rwa=w==0?0:wa/w;
    rwb=w==0?0:wb/w;
    rwa2=rwa*rwa;
    rwb2=rwb*rwb;
    rw2=wa*rwb;
    rw3=rw2*(rwa2-rwb2);
    rw4=wb*rwa2*rwa2+wa*rwb2*rwb2;
    /*
    (1,1,1) ->
     (0,0,0)#
     (1,0,0) C(1,1)*C(1,0)*C(1,0)->  d^{1,0,0}*(rwa*B_{0,1,1}-rwb*A_{0,1,1})
     (0,1,0) C(1,0)*C(1,1)*C(1,0)->  d^{0,1,0}*(rwa*B_{1,0,1}-rwb*A_{1,0,1})
     (0,0,1) C(1,0)*C(1,0)*C(1,1)->  d^{0,0,1}*(rwa*B_{1,1,0}-rwb*A_{1,1,0})
     (1,1,0)*
     (1,0,1)*
     (0,1,1)*
     (1,1,1) C(1,1)*C(1,1)*C(1,1)->  d^{1,1,1}*(rwa^3*wb-rwb^3*wa)
    (2,1) ->
     (0,0)#
     (1,0) C(2,1)*C(1,1)->2*d^{1,0}*(rwa*B_{1,1}-rwb*A_{1,1})
     (0,1) C(2,0)*C(1,1)->  d^{0,1}*(rwa*B_{2,0}-rwb*A_{2,0})
     (2,0)*
     (1,1)*
     (2,1) C(2,2)*C(1,1)->  d^{2,1}*(rwa^3*wb-rwb^3*wa)
    (2,2) ->
     (0,0)#
     (1,0) C(2,1)*C(2,0)->2*d^{1,0}*(rwa*B_{1,2}-rwb*A_{1,2})
     (0,1) C(2,0)*C(2,1)->2*d^{0,1}*(rwa*B_{2,1}-rwb*A_{2,1})
     (2,0) C(2,2)*C(2,0)->  d^{2,0}*(rwa^2*B_{0,2}+rwb^2*A_{0,2})
     (1,1) C(2,1)*C(2,1)->4*d^{1,1}*(rwa^2*B_{1,1}+rwb^2*A_{1,1})
     (0,2) C(2,0)*C(2,2)->  d^{0,2}*(rwa^2*B_{2,0}+rwb^2*A_{2,0})
     (1,2)*
     (2,1)*
     (2,2) C(2,2)*C(2,2)*d^{2,2}*(rwa^4*wb+rwb^4*wa)
    */
    _dst->s2q2+=b->s2q2+2*(ds*(rwa*sq2b-rwb*sq2a)+dq*(rwa*s2qb-rwb*s2qa))
     +ds2*(rwa2*q2b+rwb2*q2a)+4*ds*dq*(rwa2*sqb+rwb2*sqa)
     +dq2*(rwa2*s2b+rwb2*s2a)+ds2*dq2*rw4;
    _dst->s2q+=b->s2q+2*ds*(rwa*sqb-rwb*sqa)
     +dq*(rwa*s2b-rwb*s2a)+ds2*dq*rw3;
    _dst->sq2+=b->sq2+ds*(rwa*q2b-rwb*q2a)
     +2*dq*(rwa*sqb-rwb*sqa)+ds*dq2*rw3;
    _dst->sqr+=b->sqr+ds*(rwa*qrb-rwb*qra)+dq*(rwa*srb-rwb*sra)
     +dr*(rwa*sqb-rwb*sqa)+ds*dq*dr*rw3;
    _dst->sqd+=b->sqd+ds*(rwa*qdb-rwb*qda)+dq*(rwa*sdb-rwb*sda)
     +dd*(rwa*sqb-rwb*sqa)+ds*dq*dd*rw3;
    _dst->s2+=b->s2+ds2*rw2;
    _dst->sq+=b->sq+ds*dq*rw2;
    _dst->q2+=b->q2+dq2*rw2;
    _dst->sr+=b->sr+ds*dr*rw2;
    _dst->qr+=b->qr+dq*dr*rw2;
    _dst->r2+=b->r2+dr*dr*rw2;
    _dst->sd+=b->sd+ds*dd*rw2;
    _dst->qd+=b->qd+dq*dd*rw2;
    _dst->d2+=b->d2+dd*dd*rw2;
    /*The total weight and the weighted sums simply add.*/
    _dst->w+=b->w;
    _dst->s+=b->s;
    _dst->q+=b->q;
    _dst->r+=b->r;
    _dst->d+=b->d;
  }
}
|
||||||
|
|
||||||
|
/*Adjust a single corner of a set of metric bins to minimize the squared
|
||||||
|
prediction error of R and D.
|
||||||
|
Each bin is assumed to cover a quad like so:
|
||||||
|
(s0,q0) (s1,q0)
|
||||||
|
A----------B
|
||||||
|
| |
|
||||||
|
| |
|
||||||
|
| |
|
||||||
|
| |
|
||||||
|
C----------Z
|
||||||
|
(s0,q1) (s1,q1)
|
||||||
|
The values A, B, and C are fixed, and Z is the free parameter.
|
||||||
|
Then, for example, R_i is predicted via bilinear interpolation as
|
||||||
|
x_i=(s_i-s0)/(s1-s0)
|
||||||
|
y_i=(q_i-q0)/(q1-q0)
|
||||||
|
dRds1_i=A+(B-A)*x_i
|
||||||
|
dRds2_i=C+(Z-C)*x_i
|
||||||
|
R_i=dRds1_i+(dRds2_i-dRds1_i)*y_i
|
||||||
|
To find the Z that minimizes the squared prediction error over i, this can
|
||||||
|
be rewritten as
|
||||||
|
R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i)=x_i*y_i*Z
|
||||||
|
Letting X={...,x_i*y_i,...}^T and
|
||||||
|
Y={...,R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i),...}^T,
|
||||||
|
the optimal Z is given by Z=(X^T.Y)/(X^T.X).
|
||||||
|
Now, we need to compute these dot products without actually storing data for
|
||||||
|
each sample.
|
||||||
|
Starting with X^T.X, we have
|
||||||
|
X^T.X = sum(x_i^2*y_i^2) = sum((s_i-s0)^2*(q_i-q0)^2)/((s1-s0)^2*(q1-q0)^2).
|
||||||
|
Expanding the interior of the sum in a monomial basis of s_i and q_i gives
|
||||||
|
s0^2*q0^2 *(1)
|
||||||
|
-2*s0*q0^2*(s_i)
|
||||||
|
-2*s0^2*q0*(q_i)
|
||||||
|
+q0^2 *(s_i^2)
|
||||||
|
+4*s0*q0 *(s_i*q_i)
|
||||||
|
+s0^2 *(q_i^2)
|
||||||
|
-2*q0 *(s_i^2*q_i)
|
||||||
|
-2*s0 *(s_i*q_i^2)
|
||||||
|
+1 *(s_i^2*q_i^2).
|
||||||
|
However, computing things directly in this basis leads to gross numerical
|
||||||
|
errors, as most of the terms will have similar size and destructive
|
||||||
|
cancellation results.
|
||||||
|
A much better basis is the central (co-)moment basis:
|
||||||
|
{1,s_i-sbar,q_i-qbar,(s_i-sbar)^2,(s_i-sbar)*(q_i-qbar),(q_i-qbar)^2,
|
||||||
|
(s_i-sbar)^2*(q_i-qbar),(s_i-sbar)*(q_i-qbar)^2,(s_i-sbar)^2*(q_i-qbar)^2},
|
||||||
|
where sbar and qbar are the average s and q values over the bin,
|
||||||
|
respectively.
|
||||||
|
In that basis, letting ds=sbar-s0 and dq=qbar-q0, (s_i-s0)^2*(q_i-q0)^2 is
|
||||||
|
ds^2*dq^2*(1)
|
||||||
|
+dq^2 *((s_i-sbar)^2)
|
||||||
|
+4*ds*dq*((s_i-sbar)*(q_i-qbar))
|
||||||
|
+ds^2 *((q_i-qbar)^2)
|
||||||
|
+2*dq *((s_i-sbar)^2*(q_i-qbar))
|
||||||
|
+2*ds *((s_i-sbar)*(q_i-qbar)^2)
|
||||||
|
+1 *((s_i-sbar)^2*(q_i-qbar)^2).
|
||||||
|
With these expressions in the central (co-)moment bases, all we need to do
|
||||||
|
is compute sums over the (co-)moment terms, which can be done
|
||||||
|
incrementally (see oc_mode_metrics_add() and oc_mode_metrics_merge()),
|
||||||
|
with no need to store the individual samples.
|
||||||
|
Now, for X^T.Y, we have
|
||||||
|
X^T.Y = sum((R_i-A-((B-A)/(s1-s0))*(s_i-s0)-((C-A)/(q1-q0))*(q_i-q0)
|
||||||
|
-((A-B-C)/((s1-s0)*(q1-q0)))*(s_i-s0)*(q_i-q0))*(s_i-s0)*(q_i-q0))/
|
||||||
|
((s1-s0)*(q1-q0)),
|
||||||
|
or, rewriting the constants to simplify notation,
|
||||||
|
X^T.Y = sum((C0+C1*(s_i-s0)+C2*(q_i-q0)
|
||||||
|
+C3*(s_i-s0)*(q_i-q0)+R_i)*(s_i-s0)*(q_i-q0))/((s1-s0)*(q1-q0)).
|
||||||
|
Again, converting to the central (co-)moment basis, the interior of the
|
||||||
|
above sum is
|
||||||
|
ds*dq*(rbar+C0+C1*ds+C2*dq+C3*ds*dq) *(1)
|
||||||
|
+(C1*dq+C3*dq^2) *((s_i-sbar)^2)
|
||||||
|
+(rbar+C0+2*C1*ds+2*C2*dq+4*C3*ds*dq)*((s_i-sbar)*(q_i-qbar))
|
||||||
|
+(C2*ds+C3*ds^2) *((q_i-qbar)^2)
|
||||||
|
+dq *((s_i-sbar)*(r_i-rbar))
|
||||||
|
+ds *((q_i-qbar)*(r_i-rbar))
|
||||||
|
+(C1+2*C3*dq) *((s_i-sbar)^2*(q_i-qbar))
|
||||||
|
+(C2+2*C3*ds) *((s_i-sbar)*(q_i-qbar)^2)
|
||||||
|
+1 *((s_i-sbar)*(q_i-qbar)*(r_i-rbar))
|
||||||
|
+C3 *((s_i-sbar)^2*(q_i-qbar)^2).
|
||||||
|
You might think it would be easier (if perhaps slightly less robust) to
|
||||||
|
accumulate terms directly around s0 and q0.
|
||||||
|
However, we update each corner of the bins in turn, so we would have to
|
||||||
|
change basis to move the sums from corner to corner anyway.*/
|
||||||
|
double oc_mode_metrics_solve(double *_r,double *_d,
|
||||||
|
const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
|
||||||
|
const int *_q0,const int *_q1,
|
||||||
|
const double *_ra,const double *_rb,const double *_rc,
|
||||||
|
const double *_da,const double *_db,const double *_dc,int _n){
|
||||||
|
double xx;
|
||||||
|
double rxy;
|
||||||
|
double dxy;
|
||||||
|
double wt;
|
||||||
|
int i;
|
||||||
|
xx=rxy=dxy=wt=0;
|
||||||
|
for(i=0;i<_n;i++)if(_metrics[i].w>0){
|
||||||
|
double s10;
|
||||||
|
double q10;
|
||||||
|
double sq10;
|
||||||
|
double ds;
|
||||||
|
double dq;
|
||||||
|
double ds2;
|
||||||
|
double dq2;
|
||||||
|
double r;
|
||||||
|
double d;
|
||||||
|
double s2;
|
||||||
|
double sq;
|
||||||
|
double q2;
|
||||||
|
double sr;
|
||||||
|
double qr;
|
||||||
|
double sd;
|
||||||
|
double qd;
|
||||||
|
double s2q;
|
||||||
|
double sq2;
|
||||||
|
double sqr;
|
||||||
|
double sqd;
|
||||||
|
double s2q2;
|
||||||
|
double c0;
|
||||||
|
double c1;
|
||||||
|
double c2;
|
||||||
|
double c3;
|
||||||
|
double w;
|
||||||
|
w=_metrics[i].w;
|
||||||
|
wt+=w;
|
||||||
|
s10=_s1[i]-_s0[i];
|
||||||
|
q10=_q1[i]-_q0[i];
|
||||||
|
sq10=s10*q10;
|
||||||
|
ds=_metrics[i].s/w-_s0[i];
|
||||||
|
dq=_metrics[i].q/w-_q0[i];
|
||||||
|
ds2=ds*ds;
|
||||||
|
dq2=dq*dq;
|
||||||
|
s2=_metrics[i].s2;
|
||||||
|
sq=_metrics[i].sq;
|
||||||
|
q2=_metrics[i].q2;
|
||||||
|
s2q=_metrics[i].s2q;
|
||||||
|
sq2=_metrics[i].sq2;
|
||||||
|
s2q2=_metrics[i].s2q2;
|
||||||
|
xx+=(dq2*(ds2*w+s2)+4*ds*dq*sq+ds2*q2+2*(dq*s2q+ds*sq2)+s2q2)/(sq10*sq10);
|
||||||
|
r=_metrics[i].r/w;
|
||||||
|
sr=_metrics[i].sr;
|
||||||
|
qr=_metrics[i].qr;
|
||||||
|
sqr=_metrics[i].sqr;
|
||||||
|
c0=-_ra[i];
|
||||||
|
c1=-(_rb[i]-_ra[i])/s10;
|
||||||
|
c2=-(_rc[i]-_ra[i])/q10;
|
||||||
|
c3=-(_ra[i]-_rb[i]-_rc[i])/sq10;
|
||||||
|
rxy+=(ds*dq*(r+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
|
||||||
|
+(r+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sr+ds*qr
|
||||||
|
+(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqr+c3*s2q2)/sq10;
|
||||||
|
d=_metrics[i].d/w;
|
||||||
|
sd=_metrics[i].sd;
|
||||||
|
qd=_metrics[i].qd;
|
||||||
|
sqd=_metrics[i].sqd;
|
||||||
|
c0=-_da[i];
|
||||||
|
c1=-(_db[i]-_da[i])/s10;
|
||||||
|
c2=-(_dc[i]-_da[i])/q10;
|
||||||
|
c3=-(_da[i]-_db[i]-_dc[i])/sq10;
|
||||||
|
dxy+=(ds*dq*(d+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
|
||||||
|
+(d+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sd+ds*qd
|
||||||
|
+(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqd+c3*s2q2)/sq10;
|
||||||
|
}
|
||||||
|
if(xx>1E-3){
|
||||||
|
*_r=rxy/xx;
|
||||||
|
*_d=dxy/xx;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
*_r=0;
|
||||||
|
*_d=0;
|
||||||
|
}
|
||||||
|
return wt;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Compile collected SATD/logq/rate/RMSE metrics into a form that's immediately
|
||||||
|
useful for mode decision.*/
|
||||||
|
void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
|
||||||
|
int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
|
||||||
|
int _shift,double (*_weight)[3][2][OC_COMP_BINS]){
|
||||||
|
int niters;
|
||||||
|
int prevdr;
|
||||||
|
int prevdd;
|
||||||
|
int dr;
|
||||||
|
int dd;
|
||||||
|
int pli;
|
||||||
|
int qti;
|
||||||
|
int qi;
|
||||||
|
int si;
|
||||||
|
dd=dr=INT_MAX;
|
||||||
|
niters=0;
|
||||||
|
/*The encoder interpolates rate and RMSE terms bilinearly from an
|
||||||
|
OC_LOGQ_BINS by OC_COMP_BINS grid of sample points in _table.
|
||||||
|
To find the sample values at the grid points that minimize the total
|
||||||
|
squared prediction error actually requires solving a relatively sparse
|
||||||
|
linear system with a number of variables equal to the number of grid
|
||||||
|
points.
|
||||||
|
Instead of writing a general sparse linear system solver, we just use
|
||||||
|
Gauss-Seidel iteration, i.e., we update one grid point at time until
|
||||||
|
they stop changing.*/
|
||||||
|
do{
|
||||||
|
prevdr=dr;
|
||||||
|
prevdd=dd;
|
||||||
|
dd=dr=0;
|
||||||
|
for(pli=0;pli<3;pli++){
|
||||||
|
for(qti=0;qti<2;qti++){
|
||||||
|
for(qi=0;qi<OC_LOGQ_BINS;qi++){
|
||||||
|
for(si=0;si<OC_COMP_BINS;si++){
|
||||||
|
oc_mode_metrics m[4];
|
||||||
|
int s0[4];
|
||||||
|
int s1[4];
|
||||||
|
int q0[4];
|
||||||
|
int q1[4];
|
||||||
|
double ra[4];
|
||||||
|
double rb[4];
|
||||||
|
double rc[4];
|
||||||
|
double da[4];
|
||||||
|
double db[4];
|
||||||
|
double dc[4];
|
||||||
|
double r;
|
||||||
|
double d;
|
||||||
|
int rate;
|
||||||
|
int rmse;
|
||||||
|
int ds;
|
||||||
|
int n;
|
||||||
|
n=0;
|
||||||
|
/*Collect the statistics for the (up to) four bins grid point
|
||||||
|
(si,qi) touches.*/
|
||||||
|
if(qi>0&&si>0){
|
||||||
|
q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
|
||||||
|
q1[n]=OC_MODE_LOGQ[qi][pli][qti];
|
||||||
|
s0[n]=si-1<<_shift;
|
||||||
|
s1[n]=si<<_shift;
|
||||||
|
ra[n]=ldexp(_table[qi-1][pli][qti][si-1].rate,-OC_BIT_SCALE);
|
||||||
|
da[n]=ldexp(_table[qi-1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
|
||||||
|
rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
|
||||||
|
db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
|
||||||
|
rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
|
||||||
|
dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
|
||||||
|
*(m+n++)=*(_metrics[qi-1][pli][qti]+si-1);
|
||||||
|
}
|
||||||
|
if(qi>0){
|
||||||
|
ds=si+1<OC_COMP_BINS?1:-1;
|
||||||
|
q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
|
||||||
|
q1[n]=OC_MODE_LOGQ[qi][pli][qti];
|
||||||
|
s0[n]=si+ds<<_shift;
|
||||||
|
s1[n]=si<<_shift;
|
||||||
|
ra[n]=ldexp(_table[qi-1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
|
||||||
|
da[n]=
|
||||||
|
ldexp(_table[qi-1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
|
||||||
|
rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
|
||||||
|
db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
|
||||||
|
rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
|
||||||
|
dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
|
||||||
|
*(m+n++)=*(_metrics[qi-1][pli][qti]+si);
|
||||||
|
}
|
||||||
|
if(qi+1<OC_LOGQ_BINS&&si>0){
|
||||||
|
q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
|
||||||
|
q1[n]=OC_MODE_LOGQ[qi][pli][qti];
|
||||||
|
s0[n]=si-1<<_shift;
|
||||||
|
s1[n]=si<<_shift;
|
||||||
|
ra[n]=ldexp(_table[qi+1][pli][qti][si-1].rate,-OC_BIT_SCALE);
|
||||||
|
da[n]=ldexp(_table[qi+1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
|
||||||
|
rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
|
||||||
|
db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
|
||||||
|
rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
|
||||||
|
dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
|
||||||
|
*(m+n++)=*(_metrics[qi][pli][qti]+si-1);
|
||||||
|
}
|
||||||
|
if(qi+1<OC_LOGQ_BINS){
|
||||||
|
ds=si+1<OC_COMP_BINS?1:-1;
|
||||||
|
q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
|
||||||
|
q1[n]=OC_MODE_LOGQ[qi][pli][qti];
|
||||||
|
s0[n]=si+ds<<_shift;
|
||||||
|
s1[n]=si<<_shift;
|
||||||
|
ra[n]=ldexp(_table[qi+1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
|
||||||
|
da[n]=
|
||||||
|
ldexp(_table[qi+1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
|
||||||
|
rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
|
||||||
|
db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
|
||||||
|
rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
|
||||||
|
dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
|
||||||
|
*(m+n++)=*(_metrics[qi][pli][qti]+si);
|
||||||
|
}
|
||||||
|
/*On the first pass, initialize with a simple weighted average of
|
||||||
|
the neighboring bins.*/
|
||||||
|
if(!OC_HAS_MODE_METRICS&&niters==0){
|
||||||
|
double w;
|
||||||
|
w=r=d=0;
|
||||||
|
while(n-->0){
|
||||||
|
w+=m[n].w;
|
||||||
|
r+=m[n].r;
|
||||||
|
d+=m[n].d;
|
||||||
|
}
|
||||||
|
r=w>1E-3?r/w:0;
|
||||||
|
d=w>1E-3?d/w:0;
|
||||||
|
_weight[qi][pli][qti][si]=w;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
/*Update the grid point and save the weight for later.*/
|
||||||
|
_weight[qi][pli][qti][si]=
|
||||||
|
oc_mode_metrics_solve(&r,&d,m,s0,s1,q0,q1,ra,rb,rc,da,db,dc,n);
|
||||||
|
}
|
||||||
|
rate=OC_CLAMPI(-32768,(int)(ldexp(r,OC_BIT_SCALE)+0.5),32767);
|
||||||
|
rmse=OC_CLAMPI(-32768,(int)(ldexp(d,OC_RMSE_SCALE)+0.5),32767);
|
||||||
|
dr+=abs(rate-_table[qi][pli][qti][si].rate);
|
||||||
|
dd+=abs(rmse-_table[qi][pli][qti][si].rmse);
|
||||||
|
_table[qi][pli][qti][si].rate=(ogg_int16_t)rate;
|
||||||
|
_table[qi][pli][qti][si].rmse=(ogg_int16_t)rmse;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*After a fixed number of initial iterations, only iterate so long as the
|
||||||
|
total change is decreasing.
|
||||||
|
This ensures we don't oscillate forever, which is a danger, as all of our
|
||||||
|
results are rounded fairly coarsely.*/
|
||||||
|
while((dr>0||dd>0)&&(niters++<_niters_min||(dr<prevdr&&dd<prevdd)));
|
||||||
|
if(_reweight){
|
||||||
|
/*Now, reduce the values of the optimal solution until we get enough
|
||||||
|
samples in each bin to overcome the constant OC_ZWEIGHT factor.
|
||||||
|
This encourages sampling under-populated bins and prevents a single large
|
||||||
|
sample early on from discouraging coding in that bin ever again.*/
|
||||||
|
for(pli=0;pli<3;pli++){
|
||||||
|
for(qti=0;qti<2;qti++){
|
||||||
|
for(qi=0;qi<OC_LOGQ_BINS;qi++){
|
||||||
|
for(si=0;si<OC_COMP_BINS;si++){
|
||||||
|
double wt;
|
||||||
|
wt=_weight[qi][pli][qti][si];
|
||||||
|
wt/=OC_ZWEIGHT+wt;
|
||||||
|
_table[qi][pli][qti][si].rate=(ogg_int16_t)
|
||||||
|
(_table[qi][pli][qti][si].rate*wt+0.5);
|
||||||
|
_table[qi][pli][qti][si].rmse=(ogg_int16_t)
|
||||||
|
(_table[qi][pli][qti][si].rmse*wt+0.5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Dump the in memory mode metrics to a file.
|
||||||
|
Note this data format isn't portable between different platforms.*/
|
||||||
|
void oc_mode_metrics_dump(void){
|
||||||
|
FILE *fmetrics;
|
||||||
|
fmetrics=fopen(OC_MODE_METRICS_FILENAME,"wb");
|
||||||
|
if(fmetrics!=NULL){
|
||||||
|
(void)fwrite(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics);
|
||||||
|
(void)fwrite(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics);
|
||||||
|
(void)fwrite(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics);
|
||||||
|
fclose(fmetrics);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name,
|
||||||
|
#if !defined(OC_COLLECT_METRICS)
|
||||||
|
const oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
|
||||||
|
#else
|
||||||
|
oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
|
||||||
|
#endif
|
||||||
|
int qii;
|
||||||
|
fprintf(_fout,
|
||||||
|
"# if !defined(OC_COLLECT_METRICS)\n"
|
||||||
|
"static const\n"
|
||||||
|
"# endif\n"
|
||||||
|
"oc_mode_rd %s[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={\n",_table_name);
|
||||||
|
for(qii=0;qii<OC_LOGQ_BINS;qii++){
|
||||||
|
int pli;
|
||||||
|
fprintf(_fout," {\n");
|
||||||
|
for(pli=0;pli<3;pli++){
|
||||||
|
int qti;
|
||||||
|
fprintf(_fout," {\n");
|
||||||
|
for(qti=0;qti<2;qti++){
|
||||||
|
int bin;
|
||||||
|
int qi;
|
||||||
|
static const char *pl_names[3]={"Y'","Cb","Cr"};
|
||||||
|
static const char *qti_names[2]={"INTRA","INTER"};
|
||||||
|
qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
|
||||||
|
fprintf(_fout," /*%s qi=%i %s*/\n",
|
||||||
|
pl_names[pli],qi,qti_names[qti]);
|
||||||
|
fprintf(_fout," {\n");
|
||||||
|
fprintf(_fout," ");
|
||||||
|
for(bin=0;bin<OC_COMP_BINS;bin++){
|
||||||
|
if(bin&&!(bin&0x3))fprintf(_fout,"\n ");
|
||||||
|
fprintf(_fout,"{%5i,%5i}",
|
||||||
|
_mode_rd_table[qii][pli][qti][bin].rate,
|
||||||
|
_mode_rd_table[qii][pli][qti][bin].rmse);
|
||||||
|
if(bin+1<OC_COMP_BINS)fprintf(_fout,",");
|
||||||
|
}
|
||||||
|
fprintf(_fout,"\n }");
|
||||||
|
if(qti<1)fprintf(_fout,",");
|
||||||
|
fprintf(_fout,"\n");
|
||||||
|
}
|
||||||
|
fprintf(_fout," }");
|
||||||
|
if(pli<2)fprintf(_fout,",");
|
||||||
|
fprintf(_fout,"\n");
|
||||||
|
}
|
||||||
|
fprintf(_fout," }");
|
||||||
|
if(qii+1<OC_LOGQ_BINS)fprintf(_fout,",");
|
||||||
|
fprintf(_fout,"\n");
|
||||||
|
}
|
||||||
|
fprintf(_fout,
|
||||||
|
"};\n"
|
||||||
|
"\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void oc_mode_metrics_print(FILE *_fout){
|
||||||
|
int qii;
|
||||||
|
fprintf(_fout,
|
||||||
|
"/*File generated by libtheora with OC_COLLECT_METRICS"
|
||||||
|
" defined at compile time.*/\n"
|
||||||
|
"#if !defined(_modedec_H)\n"
|
||||||
|
"# define _modedec_H (1)\n"
|
||||||
|
"# include \"encint.h\"\n"
|
||||||
|
"\n"
|
||||||
|
"\n"
|
||||||
|
"\n"
|
||||||
|
"/*The log of the average quantizer for each of the OC_MODE_RD table rows\n"
|
||||||
|
" (e.g., for the represented qi's, and each pli and qti), in Q10 format.\n"
|
||||||
|
" The actual statistics used by the encoder will be interpolated from\n"
|
||||||
|
" that table based on log_plq for the actual quantization matrix used.*/\n"
|
||||||
|
"# if !defined(OC_COLLECT_METRICS)\n"
|
||||||
|
"static const\n"
|
||||||
|
"# endif\n"
|
||||||
|
"ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={\n");
|
||||||
|
for(qii=0;qii<OC_LOGQ_BINS;qii++){
|
||||||
|
fprintf(_fout," { {0x%04X,0x%04X},{0x%04X,0x%04X},{0x%04X,0x%04X} }%s\n",
|
||||||
|
OC_MODE_LOGQ[qii][0][0],OC_MODE_LOGQ[qii][0][1],OC_MODE_LOGQ[qii][1][0],
|
||||||
|
OC_MODE_LOGQ[qii][1][1],OC_MODE_LOGQ[qii][2][0],OC_MODE_LOGQ[qii][2][1],
|
||||||
|
qii+1<OC_LOGQ_BINS?",":"");
|
||||||
|
}
|
||||||
|
fprintf(_fout,
|
||||||
|
"};\n"
|
||||||
|
"\n");
|
||||||
|
oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SATD",OC_MODE_RD_SATD);
|
||||||
|
oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SAD",OC_MODE_RD_SAD);
|
||||||
|
fprintf(_fout,
|
||||||
|
"#endif\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# if !defined(OC_COLLECT_NO_ENC_FUNCS)
|
||||||
|
/*Loads the mode metrics tables the first time it is called, then rebuilds the
   rate/distortion tables from them.
  If OC_MODE_METRICS_FILENAME can be opened and contains a complete dump (as
   written by oc_mode_metrics_dump()), the tables are read from it.
  Otherwise the metrics start out zeroed and OC_MODE_LOGQ is filled in from
   this encoder's log_plq table.
  A truncated or unreadable file is treated exactly like a missing one, so a
   partially overwritten table is never used.
  Subsequent calls do nothing once OC_HAS_MODE_METRICS has been set.
  _enc: The encoder context; its state and log_plq table are read.*/
void oc_enc_mode_metrics_load(oc_enc_ctx *_enc){
  FILE *fmetrics;
  int   loaded;
  oc_restore_fpu(&_enc->state);
  /*Load any existing mode metrics if we haven't already.*/
  if(OC_HAS_MODE_METRICS)return;
  memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD));
  memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD));
  loaded=0;
  fmetrics=fopen(OC_MODE_METRICS_FILENAME,"rb");
  if(fmetrics!=NULL){
    /*Read in the binary structures as written by oc_mode_metrics_dump().
      Note this format isn't portable between different platforms.*/
    loaded=fread(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics)==1
     &&fread(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics)==1
     &&fread(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics)==1;
    fclose(fmetrics);
    if(!loaded){
      /*A short read may have left the metrics partially overwritten; discard
         whatever was read and start from scratch instead.*/
      memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD));
      memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD));
    }
  }
  if(!loaded){
    int qii;
    int qi;
    int pli;
    int qti;
    /*No usable file: seed the log quantizer bins from the current encoder,
       sampling qi evenly over 0...63.*/
    for(qii=0;qii<OC_LOGQ_BINS;qii++){
      qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
      for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
        OC_MODE_LOGQ[qii][pli][qti]=_enc->log_plq[qi][pli][qti];
      }
    }
  }
  oc_mode_metrics_update(OC_MODE_METRICS_SATD,100,1,
   OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
  oc_mode_metrics_update(OC_MODE_METRICS_SAD,100,1,
   OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
  OC_HAS_MODE_METRICS=1;
}
|
||||||
|
|
||||||
|
/*The following token skipping code used to also be used in the decoder (and
|
||||||
|
even at one point other places in the encoder).
|
||||||
|
However, it was obsoleted by other optimizations, and is now only used here.
|
||||||
|
It has been moved here to avoid generating the code when it's not needed.*/
|
||||||
|
|
||||||
|
/*Determines the number of blocks or coefficients to be skipped for a given
|
||||||
|
token value.
|
||||||
|
_token: The token value to skip.
|
||||||
|
_extra_bits: The extra bits attached to this token.
|
||||||
|
Return: A positive value indicates that number of coefficients are to be
|
||||||
|
skipped in the current block.
|
||||||
|
Otherwise, the negative of the return value indicates that number of
|
||||||
|
blocks are to be ended.*/
|
||||||
|
typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
|
||||||
|
|
||||||
|
/*Skip function for the simple end of block tokens.
  _token:      The token value; selects the base run length from a packed
                table.
  _extra_bits: The extra bits attached to this token.
  Return: The negative of the number of blocks to end.*/
static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
  int run_base;
  /*The packed table is indexed by the token value; one more is added to the
     entry it yields.*/
  run_base=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
  return -_extra_bits-run_base;
}
|
||||||
|
|
||||||
|
/*The last EOB token has a special case, where an EOB run of size zero ends all
   the remaining blocks in the frame.
  _token:      The token value (unused).
  _extra_bits: The extra bits attached to this token, i.e., the run length.
  Return: The negative of the number of blocks to end; a run length of zero
           yields the most negative count we can represent.*/
static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
  /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
     yet available everywhere; this should be equivalent.
    The value is converted to ptrdiff_t before it is negated, so the negation
     is done in signed arithmetic instead of wrapping an unsigned value and
     relying on an implementation-defined conversion of the result.*/
  if(!_extra_bits)return -(ptrdiff_t)(~(size_t)0>>1);
  return -_extra_bits;
}
|
||||||
|
|
||||||
|
/*Skip function for the pure zero run tokens.
  _token:      The token value (unused).
  _extra_bits: The extra bits attached to this token.
  Return: The number of coefficients to skip, which is one more than the value
           of the extra bits.*/
static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
  int ncoeffs;
  ncoeffs=_extra_bits+1;
  return ncoeffs;
}
|
||||||
|
|
||||||
|
/*Handles a normal coefficient value token.
  The signature matches oc_token_skip_func exactly, because this function is
   only ever called through a pointer of that type with two arguments; calling
   a function through an incompatible function pointer type is undefined
   behavior.
  _token:      The token value (unused).
  _extra_bits: The extra bits attached to this token (unused).
  Return: Always 1: a value token covers exactly one coefficient.*/
static ptrdiff_t oc_token_skip_val(int _token,int _extra_bits){
  return 1;
}
|
||||||
|
|
||||||
|
/*Handles a category 1A zero run/coefficient value combo token.*/
|
||||||
|
static ptrdiff_t oc_token_skip_run_cat1a(int _token){
|
||||||
|
return _token-OC_DCT_RUN_CAT1A+2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
|
||||||
|
static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
|
||||||
|
int run_cati;
|
||||||
|
int ncoeffs_mask;
|
||||||
|
int ncoeffs_adjust;
|
||||||
|
run_cati=_token-OC_DCT_RUN_CAT1B;
|
||||||
|
ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
|
||||||
|
ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
|
||||||
|
return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*A jump table for computing the number of coefficients or blocks to skip for
|
||||||
|
a given token value.
|
||||||
|
This reduces all the conditional branches, etc., needed to parse these token
|
||||||
|
values down to one indirect jump.*/
|
||||||
|
static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
|
||||||
|
oc_token_skip_eob,
|
||||||
|
oc_token_skip_eob,
|
||||||
|
oc_token_skip_eob,
|
||||||
|
oc_token_skip_eob,
|
||||||
|
oc_token_skip_eob,
|
||||||
|
oc_token_skip_eob,
|
||||||
|
oc_token_skip_eob6,
|
||||||
|
oc_token_skip_zrl,
|
||||||
|
oc_token_skip_zrl,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_val,
|
||||||
|
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||||
|
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||||
|
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||||
|
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||||
|
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||||
|
oc_token_skip_run,
|
||||||
|
oc_token_skip_run,
|
||||||
|
oc_token_skip_run,
|
||||||
|
oc_token_skip_run
|
||||||
|
};
|
||||||
|
|
||||||
|
/*Determines the number of blocks or coefficients to be skipped for a given
|
||||||
|
token value.
|
||||||
|
_token: The token value to skip.
|
||||||
|
_extra_bits: The extra bits attached to this token.
|
||||||
|
Return: A positive value indicates that number of coefficients are to be
|
||||||
|
skipped in the current block.
|
||||||
|
Otherwise, the negative of the return value indicates that number of
|
||||||
|
blocks are to be ended.
|
||||||
|
0 will never be returned, so that at least one coefficient in one
|
||||||
|
block will always be decoded for every token.*/
|
||||||
|
static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
|
||||||
|
return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*Accumulates mode metrics from the frame that was just encoded, then updates
   the global rate/distortion tables.
  For every coded fragment, the bits spent on its DCT tokens are tallied and
   added, together with the fragment's SATD, SAD and the square root of its
   SSD, to the metrics bin selected by its quantizer, plane and coding type.
  _enc: The encoder context holding the token lists and per-fragment
         statistics for the current frame.*/
void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
  /*The offset of the Huffman table group used by each zig-zag index.
    All 64 entries are listed explicitly: entries left out of the initializer
     would silently default to 0, selecting the group meant for index 0 for
     the highest-frequency coefficients.*/
  static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
     0,16,16,16,16,16,32,32,
    32,32,32,32,32,32,32,48,
    48,48,48,48,48,48,48,48,
    48,48,48,48,64,64,64,64,
    64,64,64,64,64,64,64,64,
    64,64,64,64,64,64,64,64,
    64,64,64,64,64,64,64,64,
    64,64,64,64,64,64,64,64
  };
  const oc_fragment *frags;
  const unsigned    *frag_sad;
  const unsigned    *frag_satd;
  const unsigned    *frag_ssd;
  const ptrdiff_t   *coded_fragis;
  ptrdiff_t          ncoded_fragis;
  ptrdiff_t          fragii;
  double             fragw;
  int                modelines[3][3][2];
  int                qti;
  int                qii;
  int                qi;
  int                pli;
  int                zzi;
  int                token;
  int                eb;
  oc_restore_fpu(&_enc->state);
  /*Figure out which metric bins to use for this frame's quantizers.*/
  for(qii=0;qii<_enc->state.nqis;qii++){
    for(pli=0;pli<3;pli++){
      for(qti=0;qti<2;qti++){
        int log_plq;
        int modeline;
        log_plq=_enc->log_plq[_enc->state.qis[qii]][pli][qti];
        /*The metrics tables only have OC_LOGQ_BINS-1 rows, so the search must
           stop at row OC_LOGQ_BINS-2; a quantizer beyond the last bin is
           attributed to the last row instead of indexing past the end.*/
        for(modeline=0;modeline<OC_LOGQ_BINS-2&&
         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
        modelines[qii][pli][qti]=modeline;
      }
    }
  }
  qti=_enc->state.frame_type;
  frags=_enc->state.frags;
  frag_sad=_enc->frag_sad;
  frag_satd=_enc->frag_satd;
  frag_ssd=_enc->frag_ssd;
  coded_fragis=_enc->state.coded_fragis;
  ncoded_fragis=fragii=0;
  /*Weight the fragments by the inverse frame size; this prevents HD content
     from dominating the statistics.*/
  fragw=1.0/_enc->state.nfrags;
  for(pli=0;pli<3;pli++){
    ptrdiff_t ti[64];
    int       eob_token[64];
    int       eob_run[64];
    /*Set up token indices and eob run counts.
      We don't bother trying to figure out the real cost of the runs that span
       coefficients; instead we use the costs that were available when R-D
       token optimization was done.*/
    for(zzi=0;zzi<64;zzi++){
      ti[zzi]=_enc->dct_token_offs[pli][zzi];
      if(ti[zzi]>0){
        /*A non-zero offset means the first token in this list is handled
           here as a pending EOB run.*/
        token=_enc->dct_tokens[pli][zzi][0];
        eb=_enc->extra_bits[pli][zzi][0];
        eob_token[zzi]=token;
        eob_run[zzi]=-oc_dct_token_skip(token,eb);
      }
      else{
        eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
        eob_run[zzi]=0;
      }
    }
    /*Scan the list of coded fragments for this plane.*/
    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
    for(;fragii<ncoded_fragis;fragii++){
      ptrdiff_t fragi;
      int       frag_bits;
      int       huffi;
      /*NOTE(review): oc_dct_token_skip() returns a ptrdiff_t, but skip and
         eob_run are ints; presumably the zero-length EOB run (which yields a
         huge count) never appears in encoder output -- TODO confirm.*/
      int       skip;
      int       mb_mode;
      unsigned  sad;
      unsigned  satd;
      double    sqrt_ssd;
      int       bin;
      int       qtj;
      fragi=coded_fragis[fragii];
      frag_bits=0;
      for(zzi=0;zzi<64;){
        if(eob_run[zzi]>0){
          /*We've reached the end of the block.*/
          eob_run[zzi]--;
          break;
        }
        huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1]
         +OC_ZZI_HUFF_OFFSET[zzi];
        if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){
          /*This token caused an EOB run to be flushed.
            Therefore it gets the bits associated with it.*/
          frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits
           +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]];
          eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
        }
        token=_enc->dct_tokens[pli][zzi][ti[zzi]];
        eb=_enc->extra_bits[pli][zzi][ti[zzi]];
        ti[zzi]++;
        skip=oc_dct_token_skip(token,eb);
        if(skip<0){
          /*An EOB run starts here; remember it so the block that eventually
             flushes it is charged for its bits.*/
          eob_token[zzi]=token;
          eob_run[zzi]=-skip;
        }
        else{
          /*A regular DCT value token; accumulate the bits for it.*/
          frag_bits+=_enc->huff_codes[huffi][token].nbits
           +OC_DCT_TOKEN_EXTRA_BITS[token];
          zzi+=skip;
        }
      }
      mb_mode=frags[fragi].mb_mode;
      qii=frags[fragi].qii;
      qi=_enc->state.qis[qii];
      /*The shift amount is 0 for plane 0 and 2 for planes 1 and 2.*/
      sad=frag_sad[fragi]<<(pli+1&2);
      satd=frag_satd[fragi]<<(pli+1&2);
      sqrt_ssd=sqrt(frag_ssd[fragi]);
      qtj=mb_mode!=OC_MODE_INTRA;
      /*Accumulate statistics.
        The rate (frag_bits) and RMSE (sqrt(frag_ssd)) are not scaled by
         OC_BIT_SCALE and OC_RMSE_SCALE; this lets us change the scale factor
         yet still use old data.*/
      bin=OC_MINI(satd>>OC_SATD_SHIFT,OC_COMP_BINS-1);
      oc_mode_metrics_add(
       OC_MODE_METRICS_SATD[modelines[qii][pli][qtj]][pli][qtj]+bin,
       fragw,satd,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
      bin=OC_MINI(sad>>OC_SAD_SHIFT,OC_COMP_BINS-1);
      oc_mode_metrics_add(
       OC_MODE_METRICS_SAD[modelines[qii][pli][qtj]][pli][qtj]+bin,
       fragw,sad,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
    }
  }
  /*Update global SA(T)D/logq/rate/RMSE estimation matrix.*/
  oc_mode_metrics_update(OC_MODE_METRICS_SATD,4,1,
   OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
  oc_mode_metrics_update(OC_MODE_METRICS_SAD,4,1,
   OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
}
|
||||||
|
# endif
|
||||||
|
|
||||||
|
#endif
|
109
thirdparty/libtheora/collect.h
vendored
Normal file
109
thirdparty/libtheora/collect.h
vendored
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
function: mode selection code
|
||||||
|
last mod: $Id$
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
#if !defined(_collect_H)
|
||||||
|
# define _collect_H (1)
|
||||||
|
# include "encint.h"
|
||||||
|
# if defined(OC_COLLECT_METRICS)
|
||||||
|
# include <stdio.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct oc_mode_metrics oc_mode_metrics;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**Sets the file name to load/store mode metrics from/to.
|
||||||
|
* The file name string is stored by reference, and so must be valid for the
|
||||||
|
* lifetime of the encoder.
|
||||||
|
* Mode metric collection uses global tables; do not attempt to perform
|
||||||
|
* multiple collections at once.
|
||||||
|
* \param[in] _buf <tt>char[]</tt> The file name.
|
||||||
|
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||||
|
#define TH_ENCCTL_SET_METRICS_FILE (0x8000)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Accumulated weighted sums of the measurements that fall into one bin.
  The letters stand for:
   w -> weight
   s -> SATD (or SAD, for the SAD tables)
   q -> log quantizer
   r -> rate (in bits)
   d -> RMSE
  The single-letter fields are direct, weighted sums, e.g., w=sum(w_i),
   s=sum(s_i*w_i), etc.
  The multi-letter fields are central moments (or co-moments) of the given
   order, e.g., sq=sum((s_i-s/w)*(q_i-q/w)*w_i).
  Some moments are needed up to fourth order, so central moments are used to
   minimize the dynamic range and prevent rounding error from dominating the
   calculations.
  The field order must not change: these structures are dumped to and loaded
   from disk as raw memory images.*/
struct oc_mode_metrics{
  /*Direct weighted sums.*/
  double w,s,q,r,d;
  /*Second-order central (co-)moments.*/
  double s2,sq,q2,sr,qr,r2,sd,qd,d2;
  /*Third-order central co-moments.*/
  double s2q,sq2,sqr,sqd;
  /*Fourth-order central co-moment.*/
  double s2q2;
};
|
||||||
|
|
||||||
|
|
||||||
|
# define OC_ZWEIGHT (0.25)
|
||||||
|
|
||||||
|
/*TODO: It may be helpful (for block-level quantizers especially) to separate
|
||||||
|
out the contributions from AC and DC into separate tables.*/
|
||||||
|
|
||||||
|
extern ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2];
|
||||||
|
extern oc_mode_rd OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
|
||||||
|
extern oc_mode_rd OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
|
||||||
|
|
||||||
|
extern int OC_HAS_MODE_METRICS;
|
||||||
|
extern oc_mode_metrics OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
|
||||||
|
extern oc_mode_metrics OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
|
||||||
|
extern const char *OC_MODE_METRICS_FILENAME;
|
||||||
|
|
||||||
|
/*Dumps the in-memory mode metrics to a file.
  Declared with an explicit (void) parameter list so that this is a real
   prototype and calls with arguments are rejected.*/
void oc_mode_metrics_dump(void);
|
||||||
|
void oc_mode_metrics_print(FILE *_fout);
|
||||||
|
|
||||||
|
void oc_mode_metrics_add(oc_mode_metrics *_metrics,
|
||||||
|
double _w,int _s,int _q,int _r,double _d);
|
||||||
|
void oc_mode_metrics_merge(oc_mode_metrics *_dst,
|
||||||
|
const oc_mode_metrics *_src,int _n);
|
||||||
|
double oc_mode_metrics_solve(double *_r,double *_d,
|
||||||
|
const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
|
||||||
|
const int *_q0,const int *_q1,
|
||||||
|
const double *_ra,const double *_rb,const double *_rc,
|
||||||
|
const double *_da,const double *_db,const double *_dc,int _n);
|
||||||
|
void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
|
||||||
|
int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
|
||||||
|
int shift,double (*_weight)[3][2][OC_COMP_BINS]);
|
||||||
|
void oc_enc_mode_metrics_load(oc_enc_ctx *_enc);
|
||||||
|
void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
|
||||||
|
|
||||||
|
# endif
|
||||||
|
#endif
|
2
thirdparty/libtheora/dct.h
vendored
2
thirdparty/libtheora/dct.h
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: dct.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
|
40
thirdparty/libtheora/decinfo.c
vendored
40
thirdparty/libtheora/decinfo.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: decinfo.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -20,6 +20,11 @@
|
|||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include "decint.h"
|
#include "decint.h"
|
||||||
|
|
||||||
|
/*Only used for fuzzing.*/
|
||||||
|
#if defined(HAVE_MEMORY_CONSTRAINT)
|
||||||
|
static const int MAX_FUZZING_WIDTH = 16384;
|
||||||
|
static const int MAX_FUZZING_HEIGHT = 16384;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/*Unpacks a series of octets from a given byte array into the pack buffer.
|
/*Unpacks a series of octets from a given byte array into the pack buffer.
|
||||||
@ -55,8 +60,8 @@ static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){
|
|||||||
/*verify we can parse this bitstream version.
|
/*verify we can parse this bitstream version.
|
||||||
We accept earlier minors and all subminors, by spec*/
|
We accept earlier minors and all subminors, by spec*/
|
||||||
if(_info->version_major>TH_VERSION_MAJOR||
|
if(_info->version_major>TH_VERSION_MAJOR||
|
||||||
_info->version_major==TH_VERSION_MAJOR&&
|
(_info->version_major==TH_VERSION_MAJOR&&
|
||||||
_info->version_minor>TH_VERSION_MINOR){
|
_info->version_minor>TH_VERSION_MINOR)){
|
||||||
return TH_EVERSION;
|
return TH_EVERSION;
|
||||||
}
|
}
|
||||||
/*Read the encoded frame description.*/
|
/*Read the encoded frame description.*/
|
||||||
@ -82,6 +87,11 @@ static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){
|
|||||||
_info->fps_numerator==0||_info->fps_denominator==0){
|
_info->fps_numerator==0||_info->fps_denominator==0){
|
||||||
return TH_EBADHEADER;
|
return TH_EBADHEADER;
|
||||||
}
|
}
|
||||||
|
#if defined(HAVE_MEMORY_CONSTRAINT)
|
||||||
|
if(_info->frame_width>=MAX_FUZZING_WIDTH&&_info->frame_height>=MAX_FUZZING_HEIGHT){
|
||||||
|
return TH_EBADHEADER;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
/*Note: The sense of pic_y is inverted in what we pass back to the
|
/*Note: The sense of pic_y is inverted in what we pass back to the
|
||||||
application compared to how it is stored in the bitstream.
|
application compared to how it is stored in the bitstream.
|
||||||
This is because the bitstream uses a right-handed coordinate system, while
|
This is because the bitstream uses a right-handed coordinate system, while
|
||||||
@ -128,6 +138,10 @@ static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){
|
|||||||
_tc->comments*sizeof(_tc->comment_lengths[0]));
|
_tc->comments*sizeof(_tc->comment_lengths[0]));
|
||||||
_tc->user_comments=(char **)_ogg_malloc(
|
_tc->user_comments=(char **)_ogg_malloc(
|
||||||
_tc->comments*sizeof(_tc->user_comments[0]));
|
_tc->comments*sizeof(_tc->user_comments[0]));
|
||||||
|
if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){
|
||||||
|
_tc->comments=0;
|
||||||
|
return TH_EFAULT;
|
||||||
|
}
|
||||||
for(i=0;i<_tc->comments;i++){
|
for(i=0;i<_tc->comments;i++){
|
||||||
len=oc_unpack_length(_opb);
|
len=oc_unpack_length(_opb);
|
||||||
if(len<0||len>oc_pack_bytes_left(_opb)){
|
if(len<0||len>oc_pack_bytes_left(_opb)){
|
||||||
@ -168,9 +182,23 @@ static int oc_dec_headerin(oc_pack_buf *_opb,th_info *_info,
|
|||||||
int ret;
|
int ret;
|
||||||
val=oc_pack_read(_opb,8);
|
val=oc_pack_read(_opb,8);
|
||||||
packtype=(int)val;
|
packtype=(int)val;
|
||||||
/*If we're at a data packet and we have received all three headers, we're
|
/*If we're at a data packet...*/
|
||||||
done.*/
|
if(!(packtype&0x80)){
|
||||||
if(!(packtype&0x80)&&_info->frame_width>0&&_tc->vendor!=NULL&&*_setup!=NULL){
|
/*Check to make sure we received all three headers...
|
||||||
|
If we haven't seen any valid headers, assume this is not actually
|
||||||
|
Theora.*/
|
||||||
|
if(_info->frame_width<=0)return TH_ENOTFORMAT;
|
||||||
|
/*Follow our documentation, which says we'll return TH_EFAULT if these
|
||||||
|
are NULL (_info was checked by our caller).*/
|
||||||
|
if(_tc==NULL)return TH_EFAULT;
|
||||||
|
/*And if any other headers were missing, declare this packet "out of
|
||||||
|
sequence" instead.*/
|
||||||
|
if(_tc->vendor==NULL)return TH_EBADHEADER;
|
||||||
|
/*Don't check this until it's needed, since we allow passing NULL for the
|
||||||
|
arguments that we're not expecting the next header to fill in yet.*/
|
||||||
|
if(_setup==NULL)return TH_EFAULT;
|
||||||
|
if(*_setup==NULL)return TH_EBADHEADER;
|
||||||
|
/*If we got everything, we're done.*/
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
/*Check the codec string.*/
|
/*Check the codec string.*/
|
||||||
|
152
thirdparty/libtheora/decint.h
vendored
152
thirdparty/libtheora/decint.h
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: decint.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -19,15 +19,39 @@
|
|||||||
#if !defined(_decint_H)
|
#if !defined(_decint_H)
|
||||||
# define _decint_H (1)
|
# define _decint_H (1)
|
||||||
# include "theora/theoradec.h"
|
# include "theora/theoradec.h"
|
||||||
# include "internal.h"
|
# include "state.h"
|
||||||
# include "bitpack.h"
|
# include "bitpack.h"
|
||||||
|
|
||||||
typedef struct th_setup_info oc_setup_info;
|
|
||||||
typedef struct th_dec_ctx oc_dec_ctx;
|
|
||||||
|
|
||||||
# include "huffdec.h"
|
# include "huffdec.h"
|
||||||
# include "dequant.h"
|
# include "dequant.h"
|
||||||
|
|
||||||
|
typedef struct th_setup_info oc_setup_info;
|
||||||
|
typedef struct oc_dec_opt_vtable oc_dec_opt_vtable;
|
||||||
|
typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
|
||||||
|
typedef struct th_dec_ctx oc_dec_ctx;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Decoder-specific accelerated functions.*/
|
||||||
|
# if defined(OC_C64X_ASM)
|
||||||
|
# include "c64x/c64xdec.h"
|
||||||
|
# endif
|
||||||
|
|
||||||
|
# if !defined(oc_dec_accel_init)
|
||||||
|
# define oc_dec_accel_init oc_dec_accel_init_c
|
||||||
|
# endif
|
||||||
|
# if defined(OC_DEC_USE_VTABLE)
|
||||||
|
# if !defined(oc_dec_dc_unpredict_mcu_plane)
|
||||||
|
# define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
|
||||||
|
((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
|
||||||
|
# endif
|
||||||
|
# else
|
||||||
|
# if !defined(oc_dec_dc_unpredict_mcu_plane)
|
||||||
|
# define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*Constants for the packet-in state machine specific to the decoder.*/
|
/*Constants for the packet-in state machine specific to the decoder.*/
|
||||||
|
|
||||||
/*Next packet to read: Data packet.*/
|
/*Next packet to read: Data packet.*/
|
||||||
@ -37,71 +61,125 @@ typedef struct th_dec_ctx oc_dec_ctx;
|
|||||||
|
|
||||||
struct th_setup_info{
|
struct th_setup_info{
|
||||||
/*The Huffman codes.*/
|
/*The Huffman codes.*/
|
||||||
oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
|
ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
|
||||||
/*The quantization parameters.*/
|
/*The quantization parameters.*/
|
||||||
th_quant_info qinfo;
|
th_quant_info qinfo;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Decoder specific functions with accelerated variants.*/
|
||||||
|
struct oc_dec_opt_vtable{
|
||||||
|
void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
|
||||||
|
oc_dec_pipeline_state *_pipe,int _pli);
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
struct oc_dec_pipeline_state{
|
||||||
|
/*Decoded DCT coefficients.
|
||||||
|
These are placed here instead of on the stack so that they can persist
|
||||||
|
between blocks, which makes clearing them back to zero much faster when
|
||||||
|
only a few non-zero coefficients were decoded.
|
||||||
|
It requires at least 65 elements because the zig-zag index array uses the
|
||||||
|
65th element as a dumping ground for out-of-range indices to protect us
|
||||||
|
from buffer overflow.
|
||||||
|
We make it fully twice as large so that the second half can serve as the
|
||||||
|
reconstruction buffer, which saves passing another parameter to all the
|
||||||
|
acceleration functions.
|
||||||
|
It also solves problems with 16-byte alignment for NEON on ARM.
|
||||||
|
gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
|
||||||
|
alignment, and silently produces incorrect results if you ask for 16.
|
||||||
|
Finally, keeping it off the stack means there's less likely to be a data
|
||||||
|
hazard between the NEON co-processor and the regular ARM core, which avoids
|
||||||
|
unnecessary stalls.*/
|
||||||
|
OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
|
||||||
|
OC_ALIGN16(signed char bounding_values[256]);
|
||||||
|
ptrdiff_t ti[3][64];
|
||||||
|
ptrdiff_t ebi[3][64];
|
||||||
|
ptrdiff_t eob_runs[3][64];
|
||||||
|
const ptrdiff_t *coded_fragis[3];
|
||||||
|
const ptrdiff_t *uncoded_fragis[3];
|
||||||
|
ptrdiff_t ncoded_fragis[3];
|
||||||
|
ptrdiff_t nuncoded_fragis[3];
|
||||||
|
const ogg_uint16_t *dequant[3][3][2];
|
||||||
|
int fragy0[3];
|
||||||
|
int fragy_end[3];
|
||||||
|
int pred_last[3][4];
|
||||||
|
int mcu_nvfrags;
|
||||||
|
int loop_filter;
|
||||||
|
int pp_level;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
struct th_dec_ctx{
|
struct th_dec_ctx{
|
||||||
/*Shared encoder/decoder state.*/
|
/*Shared encoder/decoder state.*/
|
||||||
oc_theora_state state;
|
oc_theora_state state;
|
||||||
/*Whether or not packets are ready to be emitted.
|
/*Whether or not packets are ready to be emitted.
|
||||||
This takes on negative values while there are remaining header packets to
|
This takes on negative values while there are remaining header packets to
|
||||||
be emitted, reaches 0 when the codec is ready for input, and goes to 1
|
be emitted, reaches 0 when the codec is ready for input, and goes to 1
|
||||||
when a frame has been processed and a data packet is ready.*/
|
when a frame has been processed and a data packet is ready.*/
|
||||||
int packet_state;
|
int packet_state;
|
||||||
/*Buffer in which to assemble packets.*/
|
/*Buffer in which to assemble packets.*/
|
||||||
oc_pack_buf opb;
|
oc_pack_buf opb;
|
||||||
/*Huffman decode trees.*/
|
/*Huffman decode trees.*/
|
||||||
oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
|
ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
|
||||||
/*The index of the first token in each plane for each coefficient.*/
|
/*The index of the first token in each plane for each coefficient.*/
|
||||||
ptrdiff_t ti0[3][64];
|
ptrdiff_t ti0[3][64];
|
||||||
/*The number of outstanding EOB runs at the start of each coefficient in each
|
/*The number of outstanding EOB runs at the start of each coefficient in each
|
||||||
plane.*/
|
plane.*/
|
||||||
ptrdiff_t eob_runs[3][64];
|
ptrdiff_t eob_runs[3][64];
|
||||||
/*The DCT token lists.*/
|
/*The DCT token lists.*/
|
||||||
unsigned char *dct_tokens;
|
unsigned char *dct_tokens;
|
||||||
/*The extra bits associated with DCT tokens.*/
|
/*The extra bits associated with DCT tokens.*/
|
||||||
unsigned char *extra_bits;
|
unsigned char *extra_bits;
|
||||||
/*The number of dct tokens unpacked so far.*/
|
/*The number of dct tokens unpacked so far.*/
|
||||||
int dct_tokens_count;
|
int dct_tokens_count;
|
||||||
/*The out-of-loop post-processing level.*/
|
/*The out-of-loop post-processing level.*/
|
||||||
int pp_level;
|
int pp_level;
|
||||||
/*The DC scale used for out-of-loop deblocking.*/
|
/*The DC scale used for out-of-loop deblocking.*/
|
||||||
int pp_dc_scale[64];
|
int pp_dc_scale[64];
|
||||||
/*The sharpen modifier used for out-of-loop deringing.*/
|
/*The sharpen modifier used for out-of-loop deringing.*/
|
||||||
int pp_sharp_mod[64];
|
int pp_sharp_mod[64];
|
||||||
/*The DC quantization index of each block.*/
|
/*The DC quantization index of each block.*/
|
||||||
unsigned char *dc_qis;
|
unsigned char *dc_qis;
|
||||||
/*The variance of each block.*/
|
/*The variance of each block.*/
|
||||||
int *variances;
|
int *variances;
|
||||||
/*The storage for the post-processed frame buffer.*/
|
/*The storage for the post-processed frame buffer.*/
|
||||||
unsigned char *pp_frame_data;
|
unsigned char *pp_frame_data;
|
||||||
/*Whether or not the post-processed frame buffer has space for chroma.*/
|
/*Whether or not the post-processed frame buffer has space for chroma.*/
|
||||||
int pp_frame_state;
|
int pp_frame_state;
|
||||||
/*The buffer used for the post-processed frame.
|
/*The buffer used for the post-processed frame.
|
||||||
Note that this is _not_ guaranteed to have the same strides and offsets as
|
Note that this is _not_ guaranteed to have the same strides and offsets as
|
||||||
the reference frame buffers.*/
|
the reference frame buffers.*/
|
||||||
th_ycbcr_buffer pp_frame_buf;
|
th_ycbcr_buffer pp_frame_buf;
|
||||||
/*The striped decode callback function.*/
|
/*The striped decode callback function.*/
|
||||||
th_stripe_callback stripe_cb;
|
th_stripe_callback stripe_cb;
|
||||||
|
oc_dec_pipeline_state pipe;
|
||||||
|
# if defined(OC_DEC_USE_VTABLE)
|
||||||
|
/*Table for decoder acceleration functions.*/
|
||||||
|
oc_dec_opt_vtable opt_vtable;
|
||||||
|
# endif
|
||||||
# if defined(HAVE_CAIRO)
|
# if defined(HAVE_CAIRO)
|
||||||
/*Output metrics for debugging.*/
|
/*Output metrics for debugging.*/
|
||||||
int telemetry;
|
int telemetry_mbmode;
|
||||||
int telemetry_mbmode;
|
int telemetry_mv;
|
||||||
int telemetry_mv;
|
int telemetry_qi;
|
||||||
int telemetry_qi;
|
int telemetry_bits;
|
||||||
int telemetry_bits;
|
int telemetry_frame_bytes;
|
||||||
int telemetry_frame_bytes;
|
int telemetry_coding_bytes;
|
||||||
int telemetry_coding_bytes;
|
int telemetry_mode_bytes;
|
||||||
int telemetry_mode_bytes;
|
int telemetry_mv_bytes;
|
||||||
int telemetry_mv_bytes;
|
int telemetry_qi_bytes;
|
||||||
int telemetry_qi_bytes;
|
int telemetry_dc_bytes;
|
||||||
int telemetry_dc_bytes;
|
unsigned char *telemetry_frame_data;
|
||||||
unsigned char *telemetry_frame_data;
|
|
||||||
# endif
|
# endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*Default pure-C implementations of decoder-specific accelerated functions.*/
|
||||||
|
void oc_dec_accel_init_c(oc_dec_ctx *_dec);
|
||||||
|
|
||||||
|
void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
|
||||||
|
oc_dec_pipeline_state *_pipe,int _pli);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
1999
thirdparty/libtheora/decode.c
vendored
1999
thirdparty/libtheora/decode.c
vendored
File diff suppressed because it is too large
Load Diff
2
thirdparty/libtheora/dequant.c
vendored
2
thirdparty/libtheora/dequant.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: dequant.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
|
2
thirdparty/libtheora/dequant.h
vendored
2
thirdparty/libtheora/dequant.h
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: dequant.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
|
159
thirdparty/libtheora/encfrag.c
vendored
159
thirdparty/libtheora/encfrag.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: encfrag.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -19,11 +19,6 @@
|
|||||||
#include "encint.h"
|
#include "encint.h"
|
||||||
|
|
||||||
|
|
||||||
void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
|
|
||||||
const unsigned char *_src,const unsigned char *_ref,int _ystride){
|
|
||||||
(*_enc->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
|
void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride){
|
const unsigned char *_ref,int _ystride){
|
||||||
int i;
|
int i;
|
||||||
@ -35,11 +30,6 @@ void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
|
|
||||||
const unsigned char *_src,int _ystride){
|
|
||||||
(*_enc->opt_vtable.frag_sub_128)(_diff,_src,_ystride);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
|
void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
|
||||||
const unsigned char *_src,int _ystride){
|
const unsigned char *_src,int _ystride){
|
||||||
int i;
|
int i;
|
||||||
@ -50,11 +40,6 @@ void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_x,
|
|
||||||
const unsigned char *_y,int _ystride){
|
|
||||||
return (*_enc->opt_vtable.frag_sad)(_x,_y,_ystride);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned oc_enc_frag_sad_c(const unsigned char *_src,
|
unsigned oc_enc_frag_sad_c(const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride){
|
const unsigned char *_ref,int _ystride){
|
||||||
unsigned sad;
|
unsigned sad;
|
||||||
@ -69,12 +54,6 @@ unsigned oc_enc_frag_sad_c(const unsigned char *_src,
|
|||||||
return sad;
|
return sad;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
|
|
||||||
const unsigned char *_src,const unsigned char *_ref,int _ystride,
|
|
||||||
unsigned _thresh){
|
|
||||||
return (*_enc->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
|
unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride,unsigned _thresh){
|
const unsigned char *_ref,int _ystride,unsigned _thresh){
|
||||||
unsigned sad;
|
unsigned sad;
|
||||||
@ -90,13 +69,6 @@ unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
|
|||||||
return sad;
|
return sad;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
|
|
||||||
const unsigned char *_src,const unsigned char *_ref1,
|
|
||||||
const unsigned char *_ref2,int _ystride,unsigned _thresh){
|
|
||||||
return (*_enc->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,
|
|
||||||
_thresh);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
|
unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||||
unsigned _thresh){
|
unsigned _thresh){
|
||||||
@ -114,6 +86,27 @@ unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
|
|||||||
return sad;
|
return sad;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride){
|
||||||
|
const unsigned char *src = _src;
|
||||||
|
unsigned dc;
|
||||||
|
unsigned sad;
|
||||||
|
int i;
|
||||||
|
dc=0;
|
||||||
|
for(i=8;i-->0;){
|
||||||
|
int j;
|
||||||
|
for(j=0;j<8;j++)dc+=src[j];
|
||||||
|
src+=_ystride;
|
||||||
|
}
|
||||||
|
dc=dc+32>>6;
|
||||||
|
sad=0;
|
||||||
|
for(i=8;i-->0;){
|
||||||
|
int j;
|
||||||
|
for(j=0;j<8;j++)sad+=abs(_src[j]-dc);
|
||||||
|
_src+=_ystride;
|
||||||
|
}
|
||||||
|
return sad;
|
||||||
|
}
|
||||||
|
|
||||||
static void oc_diff_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
|
static void oc_diff_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride){
|
const unsigned char *_ref,int _ystride){
|
||||||
int i;
|
int i;
|
||||||
@ -269,19 +262,20 @@ static void oc_intra_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){
|
unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
|
||||||
unsigned sad;
|
unsigned sad;
|
||||||
int t0;
|
int dc;
|
||||||
int t1;
|
int t0;
|
||||||
int t2;
|
int t1;
|
||||||
int t3;
|
int t2;
|
||||||
int t4;
|
int t3;
|
||||||
int t5;
|
int t4;
|
||||||
int t6;
|
int t5;
|
||||||
int t7;
|
int t6;
|
||||||
int r;
|
int t7;
|
||||||
int i;
|
int r;
|
||||||
sad=0;
|
int i;
|
||||||
|
sad=dc=0;
|
||||||
for(i=0;i<8;i++){
|
for(i=0;i<8;i++){
|
||||||
/*Hadamard stage 1:*/
|
/*Hadamard stage 1:*/
|
||||||
t0=_buf[i*8+0]+_buf[i*8+4];
|
t0=_buf[i*8+0]+_buf[i*8+4];
|
||||||
@ -306,7 +300,7 @@ unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){
|
|||||||
t5+=t7;
|
t5+=t7;
|
||||||
t7=r-t7;
|
t7=r-t7;
|
||||||
/*Hadamard stage 3:*/
|
/*Hadamard stage 3:*/
|
||||||
r=abs(t0+t1);
|
r=abs(t0+t1)&-(i>0);
|
||||||
r+=abs(t0-t1);
|
r+=abs(t0-t1);
|
||||||
r+=abs(t2+t3);
|
r+=abs(t2+t3);
|
||||||
r+=abs(t2-t3);
|
r+=abs(t2-t3);
|
||||||
@ -315,54 +309,61 @@ unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){
|
|||||||
r+=abs(t6+t7);
|
r+=abs(t6+t7);
|
||||||
r+=abs(t6-t7);
|
r+=abs(t6-t7);
|
||||||
sad+=r;
|
sad+=r;
|
||||||
if(sad>_thresh)break;
|
|
||||||
}
|
}
|
||||||
|
dc=_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7];
|
||||||
|
*_dc=dc;
|
||||||
return sad;
|
return sad;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
|
unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_src,const unsigned char *_ref,int _ystride,
|
const unsigned char *_ref,int _ystride){
|
||||||
unsigned _thresh){
|
|
||||||
return (*_enc->opt_vtable.frag_satd_thresh)(_src,_ref,_ystride,_thresh);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
|
|
||||||
const unsigned char *_ref,int _ystride,unsigned _thresh){
|
|
||||||
ogg_int16_t buf[64];
|
ogg_int16_t buf[64];
|
||||||
oc_diff_hadamard(buf,_src,_ref,_ystride);
|
oc_diff_hadamard(buf,_src,_ref,_ystride);
|
||||||
return oc_hadamard_sad_thresh(buf,_thresh);
|
return oc_hadamard_sad(_dc,buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
|
unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_src,const unsigned char *_ref1,
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
|
||||||
const unsigned char *_ref2,int _ystride,unsigned _thresh){
|
|
||||||
return (*_enc->opt_vtable.frag_satd2_thresh)(_src,_ref1,_ref2,_ystride,
|
|
||||||
_thresh);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
|
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
|
||||||
unsigned _thresh){
|
|
||||||
ogg_int16_t buf[64];
|
ogg_int16_t buf[64];
|
||||||
oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride);
|
oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride);
|
||||||
return oc_hadamard_sad_thresh(buf,_thresh);
|
return oc_hadamard_sad(_dc,buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
|
unsigned oc_enc_frag_intra_satd_c(int *_dc,
|
||||||
const unsigned char *_src,int _ystride){
|
const unsigned char *_src,int _ystride){
|
||||||
return (*_enc->opt_vtable.frag_intra_satd)(_src,_ystride);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride){
|
|
||||||
ogg_int16_t buf[64];
|
ogg_int16_t buf[64];
|
||||||
oc_intra_hadamard(buf,_src,_ystride);
|
oc_intra_hadamard(buf,_src,_ystride);
|
||||||
return oc_hadamard_sad_thresh(buf,UINT_MAX)
|
return oc_hadamard_sad(_dc,buf);
|
||||||
-abs(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
|
unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
|
||||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride){
|
const unsigned char *_ref,int _ystride){
|
||||||
(*_enc->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride);
|
unsigned ret;
|
||||||
|
int y;
|
||||||
|
int x;
|
||||||
|
ret=0;
|
||||||
|
for(y=0;y<8;y++){
|
||||||
|
for(x=0;x<8;x++)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
|
||||||
|
_src+=_ystride;
|
||||||
|
_ref+=_ystride;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
|
||||||
|
unsigned ret;
|
||||||
|
int y;
|
||||||
|
int x;
|
||||||
|
ret=0;
|
||||||
|
for(y=0;y<8;y++){
|
||||||
|
for(x=0;x<8;x++,_mask>>=1){
|
||||||
|
if(_mask&1)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
|
||||||
|
}
|
||||||
|
_src+=_ystride;
|
||||||
|
_ref+=_ystride;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_enc_frag_copy2_c(unsigned char *_dst,
|
void oc_enc_frag_copy2_c(unsigned char *_dst,
|
||||||
@ -376,13 +377,3 @@ void oc_enc_frag_copy2_c(unsigned char *_dst,
|
|||||||
_src2+=_ystride;
|
_src2+=_ystride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
|
|
||||||
unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]){
|
|
||||||
(*_enc->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
|
|
||||||
(*_enc->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue);
|
|
||||||
}
|
|
||||||
|
2
thirdparty/libtheora/encinfo.c
vendored
2
thirdparty/libtheora/encinfo.c
vendored
@ -1,6 +1,6 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "internal.h"
|
#include "state.h"
|
||||||
#include "enquant.h"
|
#include "enquant.h"
|
||||||
#include "huffenc.h"
|
#include "huffenc.h"
|
||||||
|
|
||||||
|
502
thirdparty/libtheora/encint.h
vendored
502
thirdparty/libtheora/encint.h
vendored
@ -11,17 +11,13 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: encint.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#if !defined(_encint_H)
|
#if !defined(_encint_H)
|
||||||
# define _encint_H (1)
|
# define _encint_H (1)
|
||||||
# if defined(HAVE_CONFIG_H)
|
|
||||||
# include "config.h"
|
|
||||||
# endif
|
|
||||||
# include "theora/theoraenc.h"
|
# include "theora/theoraenc.h"
|
||||||
# include "internal.h"
|
# include "state.h"
|
||||||
# include "ocintrin.h"
|
|
||||||
# include "mathops.h"
|
# include "mathops.h"
|
||||||
# include "enquant.h"
|
# include "enquant.h"
|
||||||
# include "huffenc.h"
|
# include "huffenc.h"
|
||||||
@ -32,8 +28,13 @@
|
|||||||
typedef oc_mv oc_mv2[2];
|
typedef oc_mv oc_mv2[2];
|
||||||
|
|
||||||
typedef struct oc_enc_opt_vtable oc_enc_opt_vtable;
|
typedef struct oc_enc_opt_vtable oc_enc_opt_vtable;
|
||||||
|
typedef struct oc_enc_opt_data oc_enc_opt_data;
|
||||||
typedef struct oc_mb_enc_info oc_mb_enc_info;
|
typedef struct oc_mb_enc_info oc_mb_enc_info;
|
||||||
typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
|
typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
|
||||||
|
typedef struct oc_fr_state oc_fr_state;
|
||||||
|
typedef struct oc_qii_state oc_qii_state;
|
||||||
|
typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
|
||||||
|
typedef struct oc_mode_rd oc_mode_rd;
|
||||||
typedef struct oc_iir_filter oc_iir_filter;
|
typedef struct oc_iir_filter oc_iir_filter;
|
||||||
typedef struct oc_frame_metrics oc_frame_metrics;
|
typedef struct oc_frame_metrics oc_frame_metrics;
|
||||||
typedef struct oc_rc_state oc_rc_state;
|
typedef struct oc_rc_state oc_rc_state;
|
||||||
@ -42,6 +43,170 @@ typedef struct oc_token_checkpoint oc_token_checkpoint;
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Encoder-specific accelerated functions.*/
|
||||||
|
# if defined(OC_X86_ASM)
|
||||||
|
# if defined(_MSC_VER)
|
||||||
|
# include "x86_vc/x86enc.h"
|
||||||
|
# else
|
||||||
|
# include "x86/x86enc.h"
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
# if defined(OC_ARM_ASM)
|
||||||
|
# include "arm/armenc.h"
|
||||||
|
# endif
|
||||||
|
|
||||||
|
# if !defined(oc_enc_accel_init)
|
||||||
|
# define oc_enc_accel_init oc_enc_accel_init_c
|
||||||
|
# endif
|
||||||
|
# if defined(OC_ENC_USE_VTABLE)
|
||||||
|
# if !defined(oc_enc_frag_sub)
|
||||||
|
# define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
|
||||||
|
((*(_enc)->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_sub_128)
|
||||||
|
# define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
|
||||||
|
((*(_enc)->opt_vtable.frag_sub_128)(_diff,_src,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_sad)
|
||||||
|
# define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
|
||||||
|
((*(_enc)->opt_vtable.frag_sad)(_src,_ref,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_sad_thresh)
|
||||||
|
# define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
|
||||||
|
((*(_enc)->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_sad2_thresh)
|
||||||
|
# define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
|
||||||
|
((*(_enc)->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,_thresh))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_intra_sad)
|
||||||
|
# define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
|
||||||
|
((*(_enc)->opt_vtable.frag_intra_sad)(_src,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_satd)
|
||||||
|
# define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
|
||||||
|
((*(_enc)->opt_vtable.frag_satd)(_dc,_src,_ref,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_satd2)
|
||||||
|
# define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
|
||||||
|
((*(_enc)->opt_vtable.frag_satd2)(_dc,_src,_ref1,_ref2,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_intra_satd)
|
||||||
|
# define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
|
||||||
|
((*(_enc)->opt_vtable.frag_intra_satd)(_dc,_src,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_ssd)
|
||||||
|
# define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
|
||||||
|
((*(_enc)->opt_vtable.frag_ssd)(_src,_ref,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_border_ssd)
|
||||||
|
# define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
|
||||||
|
((*(_enc)->opt_vtable.frag_border_ssd)(_src,_ref,_ystride,_mask))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_copy2)
|
||||||
|
# define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
|
||||||
|
((*(_enc)->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_enquant_table_init)
|
||||||
|
# define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
|
||||||
|
((*(_enc)->opt_vtable.enquant_table_init)(_enquant,_dequant))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_enquant_table_fixup)
|
||||||
|
# define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
|
||||||
|
((*(_enc)->opt_vtable.enquant_table_fixup)(_enquant,_nqis))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_quantize)
|
||||||
|
# define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
|
||||||
|
((*(_enc)->opt_vtable.quantize)(_qdct,_dct,_dequant,_enquant))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_recon_intra)
|
||||||
|
# define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
|
||||||
|
((*(_enc)->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_recon_inter)
|
||||||
|
# define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
|
||||||
|
((*(_enc)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_fdct8x8)
|
||||||
|
# define oc_enc_fdct8x8(_enc,_y,_x) \
|
||||||
|
((*(_enc)->opt_vtable.fdct8x8)(_y,_x))
|
||||||
|
# endif
|
||||||
|
# else
|
||||||
|
# if !defined(oc_enc_frag_sub)
|
||||||
|
# define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
|
||||||
|
oc_enc_frag_sub_c(_diff,_src,_ref,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_sub_128)
|
||||||
|
# define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
|
||||||
|
oc_enc_frag_sub_128_c(_diff,_src,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_sad)
|
||||||
|
# define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
|
||||||
|
oc_enc_frag_sad_c(_src,_ref,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_sad_thresh)
|
||||||
|
# define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
|
||||||
|
oc_enc_frag_sad_thresh_c(_src,_ref,_ystride,_thresh)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_sad2_thresh)
|
||||||
|
# define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
|
||||||
|
oc_enc_frag_sad2_thresh_c(_src,_ref1,_ref2,_ystride,_thresh)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_intra_sad)
|
||||||
|
# define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
|
||||||
|
oc_enc_frag_intra_sad_c(_src,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_satd)
|
||||||
|
# define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
|
||||||
|
oc_enc_frag_satd_c(_dc,_src,_ref,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_satd2)
|
||||||
|
# define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
|
||||||
|
oc_enc_frag_satd2_c(_dc,_src,_ref1,_ref2,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_intra_satd)
|
||||||
|
# define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
|
||||||
|
oc_enc_frag_intra_satd_c(_dc,_src,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_ssd)
|
||||||
|
# define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
|
||||||
|
oc_enc_frag_ssd_c(_src,_ref,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_border_ssd)
|
||||||
|
# define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
|
||||||
|
oc_enc_frag_border_ssd_c(_src,_ref,_ystride,_mask)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_copy2)
|
||||||
|
# define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
|
||||||
|
oc_enc_frag_copy2_c(_dst,_src1,_src2,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_enquant_table_init)
|
||||||
|
# define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
|
||||||
|
oc_enc_enquant_table_init_c(_enquant,_dequant)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_enquant_table_fixup)
|
||||||
|
# define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
|
||||||
|
oc_enc_enquant_table_fixup_c(_enquant,_nqis)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_quantize)
|
||||||
|
# define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
|
||||||
|
oc_enc_quantize_c(_qdct,_dct,_dequant,_enquant)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_recon_intra)
|
||||||
|
# define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
|
||||||
|
oc_frag_recon_intra_c(_dst,_ystride,_residue)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_frag_recon_inter)
|
||||||
|
# define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
|
||||||
|
oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_enc_fdct8x8)
|
||||||
|
# define oc_enc_fdct8x8(_enc,_y,_x) oc_enc_fdct8x8_c(_y,_x)
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*Constants for the packet-out state machine specific to the encoder.*/
|
/*Constants for the packet-out state machine specific to the encoder.*/
|
||||||
|
|
||||||
/*Next packet to emit: Data packet, but none are ready yet.*/
|
/*Next packet to emit: Data packet, but none are ready yet.*/
|
||||||
@ -50,13 +215,61 @@ typedef struct oc_token_checkpoint oc_token_checkpoint;
|
|||||||
#define OC_PACKET_READY (1)
|
#define OC_PACKET_READY (1)
|
||||||
|
|
||||||
/*All features enabled.*/
|
/*All features enabled.*/
|
||||||
#define OC_SP_LEVEL_SLOW (0)
|
#define OC_SP_LEVEL_SLOW (0)
|
||||||
/*Enable early skip.*/
|
/*Enable early skip.*/
|
||||||
#define OC_SP_LEVEL_EARLY_SKIP (1)
|
#define OC_SP_LEVEL_EARLY_SKIP (1)
|
||||||
|
/*Use analysis shortcuts, single quantizer, and faster tokenization.*/
|
||||||
|
#define OC_SP_LEVEL_FAST_ANALYSIS (2)
|
||||||
|
/*Use SAD instead of SATD*/
|
||||||
|
#define OC_SP_LEVEL_NOSATD (3)
|
||||||
/*Disable motion compensation.*/
|
/*Disable motion compensation.*/
|
||||||
#define OC_SP_LEVEL_NOMC (2)
|
#define OC_SP_LEVEL_NOMC (4)
|
||||||
/*Maximum valid speed level.*/
|
/*Maximum valid speed level.*/
|
||||||
#define OC_SP_LEVEL_MAX (2)
|
#define OC_SP_LEVEL_MAX (4)
|
||||||
|
|
||||||
|
|
||||||
|
/*The number of extra bits of precision at which to store rate metrics.*/
|
||||||
|
# define OC_BIT_SCALE (6)
|
||||||
|
/*The number of extra bits of precision at which to store RMSE metrics.
|
||||||
|
This must be at least half OC_BIT_SCALE (rounded up).*/
|
||||||
|
# define OC_RMSE_SCALE (5)
|
||||||
|
/*The number of quantizer bins to partition statistics into.*/
|
||||||
|
# define OC_LOGQ_BINS (8)
|
||||||
|
/*The number of SAD/SATD bins to partition statistics into.*/
|
||||||
|
# define OC_COMP_BINS (24)
|
||||||
|
/*The number of bits of precision to drop from SAD and SATD scores
|
||||||
|
to assign them to a bin.*/
|
||||||
|
# define OC_SAD_SHIFT (6)
|
||||||
|
# define OC_SATD_SHIFT (9)
|
||||||
|
|
||||||
|
/*Masking is applied by scaling the D used in R-D optimization (via rd_scale)
|
||||||
|
or the lambda parameter (via rd_iscale).
|
||||||
|
These are only equivalent within a single block; when more than one block is
|
||||||
|
being considered, the former is the interpretation used.*/
|
||||||
|
|
||||||
|
/*This must be at least 4 for OC_RD_SKIP_SCALE() to work below.*/
|
||||||
|
# define OC_RD_SCALE_BITS (12-OC_BIT_SCALE)
|
||||||
|
# define OC_RD_ISCALE_BITS (11)
|
||||||
|
|
||||||
|
/*This macro is applied to _ssd values with just 4 bits of headroom
|
||||||
|
((15-OC_RMSE_SCALE)*2+OC_BIT_SCALE+2); since we want to allow rd_scales as
|
||||||
|
large as 16, and need additional fractional bits, our only recourse that
|
||||||
|
doesn't lose precision on blocks with very small SSDs is to use a wider
|
||||||
|
multiply.*/
|
||||||
|
# if LONG_MAX>2147483647
|
||||||
|
# define OC_RD_SCALE(_ssd,_rd_scale) \
|
||||||
|
((unsigned)((unsigned long)(_ssd)*(_rd_scale) \
|
||||||
|
+((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS))
|
||||||
|
# else
|
||||||
|
# define OC_RD_SCALE(_ssd,_rd_scale) \
|
||||||
|
(((_ssd)>>OC_RD_SCALE_BITS)*(_rd_scale) \
|
||||||
|
+(((_ssd)&(1<<OC_RD_SCALE_BITS)-1)*(_rd_scale) \
|
||||||
|
+((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS))
|
||||||
|
# endif
|
||||||
|
# define OC_RD_SKIP_SCALE(_ssd,_rd_scale) \
|
||||||
|
((_ssd)*(_rd_scale)+((1<<OC_RD_SCALE_BITS-4)>>1)>>OC_RD_SCALE_BITS-4)
|
||||||
|
# define OC_RD_ISCALE(_lambda,_rd_iscale) \
|
||||||
|
((_lambda)*(_rd_iscale)+((1<<OC_RD_ISCALE_BITS)>>1)>>OC_RD_ISCALE_BITS)
|
||||||
|
|
||||||
|
|
||||||
/*The bits used for each of the MB mode codebooks.*/
|
/*The bits used for each of the MB mode codebooks.*/
|
||||||
@ -78,6 +291,10 @@ extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
|
|||||||
|
|
||||||
/*Encoder specific functions with accelerated variants.*/
|
/*Encoder specific functions with accelerated variants.*/
|
||||||
struct oc_enc_opt_vtable{
|
struct oc_enc_opt_vtable{
|
||||||
|
void (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride);
|
||||||
|
void (*frag_sub_128)(ogg_int16_t _diff[64],
|
||||||
|
const unsigned char *_src,int _ystride);
|
||||||
unsigned (*frag_sad)(const unsigned char *_src,
|
unsigned (*frag_sad)(const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride);
|
const unsigned char *_ref,int _ystride);
|
||||||
unsigned (*frag_sad_thresh)(const unsigned char *_src,
|
unsigned (*frag_sad_thresh)(const unsigned char *_src,
|
||||||
@ -85,18 +302,23 @@ struct oc_enc_opt_vtable{
|
|||||||
unsigned (*frag_sad2_thresh)(const unsigned char *_src,
|
unsigned (*frag_sad2_thresh)(const unsigned char *_src,
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||||
unsigned _thresh);
|
unsigned _thresh);
|
||||||
unsigned (*frag_satd_thresh)(const unsigned char *_src,
|
unsigned (*frag_intra_sad)(const unsigned char *_src,int _ystride);
|
||||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
unsigned (*frag_satd)(int *_dc,const unsigned char *_src,
|
||||||
unsigned (*frag_satd2_thresh)(const unsigned char *_src,
|
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
|
||||||
unsigned _thresh);
|
|
||||||
unsigned (*frag_intra_satd)(const unsigned char *_src,int _ystride);
|
|
||||||
void (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
|
|
||||||
const unsigned char *_ref,int _ystride);
|
const unsigned char *_ref,int _ystride);
|
||||||
void (*frag_sub_128)(ogg_int16_t _diff[64],
|
unsigned (*frag_satd2)(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_src,int _ystride);
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||||
|
unsigned (*frag_intra_satd)(int *_dc,const unsigned char *_src,int _ystride);
|
||||||
|
unsigned (*frag_ssd)(const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride);
|
||||||
|
unsigned (*frag_border_ssd)(const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
|
||||||
void (*frag_copy2)(unsigned char *_dst,
|
void (*frag_copy2)(unsigned char *_dst,
|
||||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||||
|
void (*enquant_table_init)(void *_enquant,
|
||||||
|
const ogg_uint16_t _dequant[64]);
|
||||||
|
void (*enquant_table_fixup)(void *_enquant[3][3][2],int _nqis);
|
||||||
|
int (*quantize)(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||||
|
const ogg_uint16_t _dequant[64],const void *_enquant);
|
||||||
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
|
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
|
||||||
const ogg_int16_t _residue[64]);
|
const ogg_int16_t _residue[64]);
|
||||||
void (*frag_recon_inter)(unsigned char *_dst,
|
void (*frag_recon_inter)(unsigned char *_dst,
|
||||||
@ -105,7 +327,19 @@ struct oc_enc_opt_vtable{
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
void oc_enc_vtable_init(oc_enc_ctx *_enc);
|
/*Encoder specific data that varies according to which variants of the above
|
||||||
|
functions are used.*/
|
||||||
|
struct oc_enc_opt_data{
|
||||||
|
/*The size of a single quantizer table.
|
||||||
|
This must be a multiple of enquant_table_alignment.*/
|
||||||
|
size_t enquant_table_size;
|
||||||
|
/*The alignment required for the quantizer tables.
|
||||||
|
This must be a positive power of two.*/
|
||||||
|
int enquant_table_alignment;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
void oc_enc_accel_init(oc_enc_ctx *_enc);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -158,7 +392,7 @@ struct oc_mode_scheme_chooser{
|
|||||||
corresponds to the ranks above.*/
|
corresponds to the ranks above.*/
|
||||||
unsigned char scheme0_list[OC_NMODES];
|
unsigned char scheme0_list[OC_NMODES];
|
||||||
/*The number of times each mode has been chosen so far.*/
|
/*The number of times each mode has been chosen so far.*/
|
||||||
int mode_counts[OC_NMODES];
|
unsigned mode_counts[OC_NMODES];
|
||||||
/*The list of mode coding schemes, sorted in ascending order of bit cost.*/
|
/*The list of mode coding schemes, sorted in ascending order of bit cost.*/
|
||||||
unsigned char scheme_list[8];
|
unsigned char scheme_list[8];
|
||||||
/*The number of bits used by each mode coding scheme.*/
|
/*The number of bits used by each mode coding scheme.*/
|
||||||
@ -170,6 +404,106 @@ void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*State to track coded block flags and their bit cost.
|
||||||
|
We use opportunity cost to measure the bits required to code or skip the next
|
||||||
|
block, using the cheaper of the cost to code it fully or partially, so long
|
||||||
|
as both are possible.*/
|
||||||
|
struct oc_fr_state{
|
||||||
|
/*The number of bits required for the coded block flags so far this frame.*/
|
||||||
|
ptrdiff_t bits;
|
||||||
|
/*The length of the current run for the partial super block flag, not
|
||||||
|
including the current super block.*/
|
||||||
|
unsigned sb_partial_count:16;
|
||||||
|
/*The length of the current run for the full super block flag, not
|
||||||
|
including the current super block.*/
|
||||||
|
unsigned sb_full_count:16;
|
||||||
|
/*The length of the coded block flag run when the current super block
|
||||||
|
started.*/
|
||||||
|
unsigned b_coded_count_prev:6;
|
||||||
|
/*The coded block flag when the current super block started.*/
|
||||||
|
signed int b_coded_prev:2;
|
||||||
|
/*The length of the current coded block flag run.*/
|
||||||
|
unsigned b_coded_count:6;
|
||||||
|
/*The current coded block flag.*/
|
||||||
|
signed int b_coded:2;
|
||||||
|
/*The number of blocks processed in the current super block.*/
|
||||||
|
unsigned b_count:5;
|
||||||
|
/*Whether or not it is cheaper to code the current super block partially,
|
||||||
|
even if it could still be coded fully.*/
|
||||||
|
unsigned sb_prefer_partial:1;
|
||||||
|
/*Whether the last super block was coded partially.*/
|
||||||
|
signed int sb_partial:2;
|
||||||
|
/*The number of bits required for the flags for the current super block.*/
|
||||||
|
unsigned sb_bits:6;
|
||||||
|
/*Whether the last non-partial super block was coded fully.*/
|
||||||
|
signed int sb_full:2;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
struct oc_qii_state{
|
||||||
|
ptrdiff_t bits;
|
||||||
|
unsigned qi01_count:14;
|
||||||
|
signed int qi01:2;
|
||||||
|
unsigned qi12_count:14;
|
||||||
|
signed int qi12:2;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Temporary encoder state for the analysis pipeline.*/
|
||||||
|
struct oc_enc_pipeline_state{
|
||||||
|
/*DCT coefficient storage.
|
||||||
|
This is kept off the stack because a) gcc can't align things on the stack
|
||||||
|
reliably on ARM, and b) it avoids (unintentional) data hazards between
|
||||||
|
ARM and NEON code.*/
|
||||||
|
OC_ALIGN16(ogg_int16_t dct_data[64*3]);
|
||||||
|
OC_ALIGN16(signed char bounding_values[256]);
|
||||||
|
oc_fr_state fr[3];
|
||||||
|
oc_qii_state qs[3];
|
||||||
|
/*Skip SSD storage for the current MCU in each plane.*/
|
||||||
|
unsigned *skip_ssd[3];
|
||||||
|
/*Coded/uncoded fragment lists for each plane for the current MCU.*/
|
||||||
|
ptrdiff_t *coded_fragis[3];
|
||||||
|
ptrdiff_t *uncoded_fragis[3];
|
||||||
|
ptrdiff_t ncoded_fragis[3];
|
||||||
|
ptrdiff_t nuncoded_fragis[3];
|
||||||
|
/*The starting fragment for the current MCU in each plane.*/
|
||||||
|
ptrdiff_t froffset[3];
|
||||||
|
/*The starting row for the current MCU in each plane.*/
|
||||||
|
int fragy0[3];
|
||||||
|
/*The ending row for the current MCU in each plane.*/
|
||||||
|
int fragy_end[3];
|
||||||
|
/*The starting superblock for the current MCU in each plane.*/
|
||||||
|
unsigned sbi0[3];
|
||||||
|
/*The ending superblock for the current MCU in each plane.*/
|
||||||
|
unsigned sbi_end[3];
|
||||||
|
/*The number of tokens for zzi=1 for each color plane.*/
|
||||||
|
int ndct_tokens1[3];
|
||||||
|
/*The outstanding eob_run count for zzi=1 for each color plane.*/
|
||||||
|
int eob_run1[3];
|
||||||
|
/*Whether or not the loop filter is enabled.*/
|
||||||
|
int loop_filter;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Statistics used to estimate R-D cost of a block in a given coding mode.
|
||||||
|
See modedec.h for more details.*/
|
||||||
|
struct oc_mode_rd{
|
||||||
|
/*The expected bits used by the DCT tokens, shifted by OC_BIT_SCALE.*/
|
||||||
|
ogg_int16_t rate;
|
||||||
|
/*The expected square root of the sum of squared errors, shifted by
|
||||||
|
OC_RMSE_SCALE.*/
|
||||||
|
ogg_int16_t rmse;
|
||||||
|
};
|
||||||
|
|
||||||
|
# if defined(OC_COLLECT_METRICS)
|
||||||
|
# include "collect.h"
|
||||||
|
# endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*A 2nd order low-pass Bessel follower.
|
/*A 2nd order low-pass Bessel follower.
|
||||||
We use this for rate control because it has fast reaction time, but is
|
We use this for rate control because it has fast reaction time, but is
|
||||||
critically damped.*/
|
critically damped.*/
|
||||||
@ -190,6 +524,8 @@ struct oc_frame_metrics{
|
|||||||
unsigned dup_count:31;
|
unsigned dup_count:31;
|
||||||
/*The frame type from pass 1.*/
|
/*The frame type from pass 1.*/
|
||||||
unsigned frame_type:1;
|
unsigned frame_type:1;
|
||||||
|
/*The frame activity average from pass 1.*/
|
||||||
|
unsigned activity_avg;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -335,10 +671,15 @@ struct th_enc_ctx{
|
|||||||
size_t mv_bits[2];
|
size_t mv_bits[2];
|
||||||
/*The mode scheme chooser for estimating mode coding costs.*/
|
/*The mode scheme chooser for estimating mode coding costs.*/
|
||||||
oc_mode_scheme_chooser chooser;
|
oc_mode_scheme_chooser chooser;
|
||||||
|
/*Temporary encoder state for the analysis pipeline.*/
|
||||||
|
oc_enc_pipeline_state pipe;
|
||||||
/*The number of vertical super blocks in an MCU.*/
|
/*The number of vertical super blocks in an MCU.*/
|
||||||
int mcu_nvsbs;
|
int mcu_nvsbs;
|
||||||
/*The SSD error for skipping each fragment in the current MCU.*/
|
/*The SSD error for skipping each fragment in the current MCU.*/
|
||||||
unsigned *mcu_skip_ssd;
|
unsigned *mcu_skip_ssd;
|
||||||
|
/*The masking scale factors for chroma blocks in the current MCU.*/
|
||||||
|
ogg_uint16_t *mcu_rd_scale;
|
||||||
|
ogg_uint16_t *mcu_rd_iscale;
|
||||||
/*The DCT token lists for each coefficient and each plane.*/
|
/*The DCT token lists for each coefficient and each plane.*/
|
||||||
unsigned char **dct_tokens[3];
|
unsigned char **dct_tokens[3];
|
||||||
/*The extra bits associated with each DCT token.*/
|
/*The extra bits associated with each DCT token.*/
|
||||||
@ -350,8 +691,10 @@ struct th_enc_ctx{
|
|||||||
/*The offset of the first DCT token for each coefficient for each plane.*/
|
/*The offset of the first DCT token for each coefficient for each plane.*/
|
||||||
unsigned char dct_token_offs[3][64];
|
unsigned char dct_token_offs[3][64];
|
||||||
/*The last DC coefficient for each plane and reference frame.*/
|
/*The last DC coefficient for each plane and reference frame.*/
|
||||||
int dc_pred_last[3][3];
|
int dc_pred_last[3][4];
|
||||||
#if defined(OC_COLLECT_METRICS)
|
#if defined(OC_COLLECT_METRICS)
|
||||||
|
/*Fragment SAD statistics for MB mode estimation metrics.*/
|
||||||
|
unsigned *frag_sad;
|
||||||
/*Fragment SATD statistics for MB mode estimation metrics.*/
|
/*Fragment SATD statistics for MB mode estimation metrics.*/
|
||||||
unsigned *frag_satd;
|
unsigned *frag_satd;
|
||||||
/*Fragment SSD statistics for MB mode estimation metrics.*/
|
/*Fragment SSD statistics for MB mode estimation metrics.*/
|
||||||
@ -359,32 +702,56 @@ struct th_enc_ctx{
|
|||||||
#endif
|
#endif
|
||||||
/*The R-D optimization parameter.*/
|
/*The R-D optimization parameter.*/
|
||||||
int lambda;
|
int lambda;
|
||||||
|
/*The average block "activity" of the previous frame.*/
|
||||||
|
unsigned activity_avg;
|
||||||
|
/*The average MB luma of the previous frame.*/
|
||||||
|
unsigned luma_avg;
|
||||||
/*The huffman tables in use.*/
|
/*The huffman tables in use.*/
|
||||||
th_huff_code huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
th_huff_code huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
||||||
/*The quantization parameters in use.*/
|
/*The quantization parameters in use.*/
|
||||||
th_quant_info qinfo;
|
th_quant_info qinfo;
|
||||||
oc_iquant *enquant_tables[64][3][2];
|
/*The original DC coefficients saved off from the dequantization tables.*/
|
||||||
oc_iquant_table enquant_table_data[64][3][2];
|
ogg_uint16_t dequant_dc[64][3][2];
|
||||||
/*An "average" quantizer for each quantizer type (INTRA or INTER) and qi
|
/*Condensed dequantization tables.*/
|
||||||
value.
|
const ogg_uint16_t *dequant[3][3][2];
|
||||||
This is used to paramterize the rate control decisions.
|
/*Condensed quantization tables.*/
|
||||||
|
void *enquant[3][3][2];
|
||||||
|
/*The full set of quantization tables.*/
|
||||||
|
void *enquant_tables[64][3][2];
|
||||||
|
/*Storage for the quantization tables.*/
|
||||||
|
unsigned char *enquant_table_data;
|
||||||
|
/*An "average" quantizer for each frame type (INTRA or INTER) and qi value.
|
||||||
|
This is used to parameterize the rate control decisions.
|
||||||
They are kept in the log domain to simplify later processing.
|
They are kept in the log domain to simplify later processing.
|
||||||
Keep in mind these are DCT domain quantizers, and so are scaled by an
|
These are DCT domain quantizers, and so are scaled by an additional factor
|
||||||
additional factor of 4 from the pixel domain.*/
|
of 4 from the pixel domain.*/
|
||||||
ogg_int64_t log_qavg[2][64];
|
ogg_int64_t log_qavg[2][64];
|
||||||
|
/*The "average" quantizer further partitioned by color plane.
|
||||||
|
This is used to parameterize mode decision.
|
||||||
|
These are DCT domain quantizers, and so are scaled by an additional factor
|
||||||
|
of 4 from the pixel domain.*/
|
||||||
|
ogg_int16_t log_plq[64][3][2];
|
||||||
|
/*The R-D scale factors to apply to chroma blocks for a given frame type
|
||||||
|
(INTRA or INTER) and qi value.
|
||||||
|
The first is the "D" modifier (rd_scale), while the second is the "lambda"
|
||||||
|
modifier (rd_iscale).*/
|
||||||
|
ogg_uint16_t chroma_rd_scale[2][64][2];
|
||||||
|
/*The interpolated mode decision R-D lookup tables for the current
|
||||||
|
quantizers, color plane, and quantization type.*/
|
||||||
|
oc_mode_rd mode_rd[3][3][2][OC_COMP_BINS];
|
||||||
/*The buffer state used to drive rate control.*/
|
/*The buffer state used to drive rate control.*/
|
||||||
oc_rc_state rc;
|
oc_rc_state rc;
|
||||||
|
# if defined(OC_ENC_USE_VTABLE)
|
||||||
/*Table for encoder acceleration functions.*/
|
/*Table for encoder acceleration functions.*/
|
||||||
oc_enc_opt_vtable opt_vtable;
|
oc_enc_opt_vtable opt_vtable;
|
||||||
|
# endif
|
||||||
|
/*Table for encoder data used by accelerated functions.*/
|
||||||
|
oc_enc_opt_data opt_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
|
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
|
||||||
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
|
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
|
||||||
#if defined(OC_COLLECT_METRICS)
|
|
||||||
void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
|
|
||||||
void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -415,8 +782,13 @@ struct oc_token_checkpoint{
|
|||||||
|
|
||||||
void oc_enc_tokenize_start(oc_enc_ctx *_enc);
|
void oc_enc_tokenize_start(oc_enc_ctx *_enc);
|
||||||
int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
|
int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
|
||||||
ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
|
ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
|
||||||
int _zzi,oc_token_checkpoint **_stack,int _acmin);
|
const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
|
||||||
|
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
|
||||||
|
int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
|
||||||
|
ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
|
||||||
|
const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
|
||||||
|
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
|
||||||
void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
|
void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
|
||||||
const oc_token_checkpoint *_stack,int _n);
|
const oc_token_checkpoint *_stack,int _n);
|
||||||
void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
|
void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
|
||||||
@ -436,45 +808,13 @@ int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*Encoder-specific accelerated functions.*/
|
/*Default pure-C implementations of encoder-specific accelerated functions.*/
|
||||||
void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
|
void oc_enc_accel_init_c(oc_enc_ctx *_enc);
|
||||||
const unsigned char *_src,const unsigned char *_ref,int _ystride);
|
|
||||||
void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
|
|
||||||
const unsigned char *_src,int _ystride);
|
|
||||||
unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_src,
|
|
||||||
const unsigned char *_ref,int _ystride);
|
|
||||||
unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
|
|
||||||
const unsigned char *_src,const unsigned char *_ref,int _ystride,
|
|
||||||
unsigned _thresh);
|
|
||||||
unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
|
|
||||||
const unsigned char *_src,const unsigned char *_ref1,
|
|
||||||
const unsigned char *_ref2,int _ystride,unsigned _thresh);
|
|
||||||
unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
|
|
||||||
const unsigned char *_src,const unsigned char *_ref,int _ystride,
|
|
||||||
unsigned _thresh);
|
|
||||||
unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
|
|
||||||
const unsigned char *_src,const unsigned char *_ref1,
|
|
||||||
const unsigned char *_ref2,int _ystride,unsigned _thresh);
|
|
||||||
unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
|
|
||||||
const unsigned char *_src,int _ystride);
|
|
||||||
void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
|
|
||||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
|
||||||
void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
|
|
||||||
unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
|
|
||||||
void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
|
||||||
void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
|
|
||||||
const ogg_int16_t _x[64]);
|
|
||||||
|
|
||||||
/*Default pure-C implementations.*/
|
|
||||||
void oc_enc_vtable_init_c(oc_enc_ctx *_enc);
|
|
||||||
|
|
||||||
void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
|
void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
|
||||||
const unsigned char *_src,const unsigned char *_ref,int _ystride);
|
const unsigned char *_src,const unsigned char *_ref,int _ystride);
|
||||||
void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
|
void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
|
||||||
const unsigned char *_src,int _ystride);
|
const unsigned char *_src,int _ystride);
|
||||||
void oc_enc_frag_copy2_c(unsigned char *_dst,
|
|
||||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
|
||||||
unsigned oc_enc_frag_sad_c(const unsigned char *_src,
|
unsigned oc_enc_frag_sad_c(const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride);
|
const unsigned char *_ref,int _ystride);
|
||||||
unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
|
unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
|
||||||
@ -482,12 +822,24 @@ unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
|
|||||||
unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
|
unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||||
unsigned _thresh);
|
unsigned _thresh);
|
||||||
unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
|
unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride);
|
||||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
|
||||||
unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
|
const unsigned char *_ref,int _ystride);
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
|
||||||
unsigned _thresh);
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||||
unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride);
|
unsigned oc_enc_frag_intra_satd_c(int *_dc,
|
||||||
|
const unsigned char *_src,int _ystride);
|
||||||
|
unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride);
|
||||||
|
unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
|
||||||
|
void oc_enc_frag_copy2_c(unsigned char *_dst,
|
||||||
|
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||||
|
void oc_enc_enquant_table_init_c(void *_enquant,
|
||||||
|
const ogg_uint16_t _dequant[64]);
|
||||||
|
void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis);
|
||||||
|
int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||||
|
const ogg_uint16_t _dequant[64],const void *_enquant);
|
||||||
void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
428
thirdparty/libtheora/encode.c
vendored
428
thirdparty/libtheora/encode.c
vendored
@ -11,15 +11,13 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: encode.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "encint.h"
|
#include "encint.h"
|
||||||
#if defined(OC_X86_ASM)
|
#include "dequant.h"
|
||||||
# include "x86/x86enc.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -288,12 +286,12 @@ const th_quant_info TH_DEF_QUANT_INFO={
|
|||||||
28, 25, 24, 22, 20, 17, 14, 10
|
28, 25, 24, 22, 20, 17, 14, 10
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
30,25,20,20,15,15,14,14,
|
15,12, 9, 8, 6, 6, 5, 5,
|
||||||
13,13,12,12,11,11,10,10,
|
5, 5, 5, 5, 5, 5, 5, 5,
|
||||||
9, 9, 8, 8, 7, 7, 7, 7,
|
4, 4, 4, 4, 4, 4, 3, 3,
|
||||||
6, 6, 6, 6, 5, 5, 5, 5,
|
3, 3, 3, 3, 3, 3, 3, 3,
|
||||||
4, 4, 4, 4, 3, 3, 3, 3,
|
|
||||||
2, 2, 2, 2, 2, 2, 2, 2,
|
2, 2, 2, 2, 2, 2, 2, 2,
|
||||||
|
2, 2, 2, 2, 2, 2, 2, 0,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0
|
0, 0, 0, 0, 0, 0, 0, 0
|
||||||
},
|
},
|
||||||
@ -623,11 +621,15 @@ static void oc_enc_mb_modes_pack(oc_enc_ctx *_enc){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,int _dx,int _dy){
|
static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,oc_mv _mv){
|
||||||
|
int dx;
|
||||||
|
int dy;
|
||||||
|
dx=OC_MV_X(_mv);
|
||||||
|
dy=OC_MV_Y(_mv);
|
||||||
oggpackB_write(&_enc->opb,
|
oggpackB_write(&_enc->opb,
|
||||||
OC_MV_CODES[_mv_scheme][_dx+31],OC_MV_BITS[_mv_scheme][_dx+31]);
|
OC_MV_CODES[_mv_scheme][dx+31],OC_MV_BITS[_mv_scheme][dx+31]);
|
||||||
oggpackB_write(&_enc->opb,
|
oggpackB_write(&_enc->opb,
|
||||||
OC_MV_CODES[_mv_scheme][_dy+31],OC_MV_BITS[_mv_scheme][_dy+31]);
|
OC_MV_CODES[_mv_scheme][dy+31],OC_MV_BITS[_mv_scheme][dy+31]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
|
static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
|
||||||
@ -650,7 +652,7 @@ static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
|
|||||||
mb_modes=_enc->state.mb_modes;
|
mb_modes=_enc->state.mb_modes;
|
||||||
mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
|
mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
|
||||||
frags=_enc->state.frags;
|
frags=_enc->state.frags;
|
||||||
frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
|
frag_mvs=_enc->state.frag_mvs;
|
||||||
for(mbii=0;mbii<ncoded_mbis;mbii++){
|
for(mbii=0;mbii<ncoded_mbis;mbii++){
|
||||||
ptrdiff_t fragi;
|
ptrdiff_t fragi;
|
||||||
unsigned mbi;
|
unsigned mbi;
|
||||||
@ -662,8 +664,7 @@ static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
|
|||||||
for(bi=0;;bi++){
|
for(bi=0;;bi++){
|
||||||
fragi=mb_maps[mbi][0][bi];
|
fragi=mb_maps[mbi][0][bi];
|
||||||
if(frags[fragi].coded){
|
if(frags[fragi].coded){
|
||||||
oc_enc_mv_pack(_enc,mv_scheme,
|
oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]);
|
||||||
frag_mvs[fragi][0],frag_mvs[fragi][1]);
|
|
||||||
/*Only code a single MV for this macro block.*/
|
/*Only code a single MV for this macro block.*/
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -673,8 +674,7 @@ static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
|
|||||||
for(bi=0;bi<4;bi++){
|
for(bi=0;bi<4;bi++){
|
||||||
fragi=mb_maps[mbi][0][bi];
|
fragi=mb_maps[mbi][0][bi];
|
||||||
if(frags[fragi].coded){
|
if(frags[fragi].coded){
|
||||||
oc_enc_mv_pack(_enc,mv_scheme,
|
oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]);
|
||||||
frag_mvs[fragi][0],frag_mvs[fragi][1]);
|
|
||||||
/*Keep coding all the MVs for this macro block.*/
|
/*Keep coding all the MVs for this macro block.*/
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -863,11 +863,55 @@ static void oc_enc_residual_tokens_pack(oc_enc_ctx *_enc){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*Packs an explicit drop frame, instead of using the more efficient 0-byte
|
||||||
|
packet.
|
||||||
|
This is only enabled in VP3-compatibility mode, even though it is not
|
||||||
|
strictly required for VP3 compatibility (VP3 could be encoded in AVI, which
|
||||||
|
also supports dropping frames by inserting 0 byte packets).
|
||||||
|
However, almost every _Theora_ player used to get this wrong (and many still
|
||||||
|
do), and it wasn't until we started shipping a post-VP3 encoder that
|
||||||
|
actually used non-VP3 features that this began to be discovered and fixed,
|
||||||
|
despite being in the standard since 2004.
|
||||||
|
The pack buffer must be reset before calling this function.*/
|
||||||
|
static void oc_enc_drop_frame_pack(oc_enc_ctx *_enc){
|
||||||
|
unsigned nsbs;
|
||||||
|
/*Mark this as a data packet.*/
|
||||||
|
oggpackB_write(&_enc->opb,0,1);
|
||||||
|
/*Output the frame type (key frame or delta frame).*/
|
||||||
|
oggpackB_write(&_enc->opb,OC_INTER_FRAME,1);
|
||||||
|
/*Write out the current qi list.
|
||||||
|
We always use just 1 qi, to avoid wasting bits on the others.*/
|
||||||
|
oggpackB_write(&_enc->opb,_enc->state.qis[0],6);
|
||||||
|
oggpackB_write(&_enc->opb,0,1);
|
||||||
|
/*Coded block flags: everything is uncoded.*/
|
||||||
|
nsbs=_enc->state.nsbs;
|
||||||
|
/*No partially coded SBs.*/
|
||||||
|
oggpackB_write(&_enc->opb,0,1);
|
||||||
|
oc_sb_run_pack(&_enc->opb,nsbs,0,1);
|
||||||
|
/*No fully coded SBs.*/
|
||||||
|
oggpackB_write(&_enc->opb,0,1);
|
||||||
|
oc_sb_run_pack(&_enc->opb,nsbs,0,1);
|
||||||
|
/*MB modes: just need to write which scheme to use.
|
||||||
|
Since we have no coded MBs, we can pick any of them except 0, which would
|
||||||
|
require writing out an additional mode list.*/
|
||||||
|
oggpackB_write(&_enc->opb,7,3);
|
||||||
|
/*MVs: just need to write which scheme to use.
|
||||||
|
We can pick either one, since we have no MVs.*/
|
||||||
|
oggpackB_write(&_enc->opb,1,1);
|
||||||
|
/*Write the chosen DC token tables.*/
|
||||||
|
oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][0],4);
|
||||||
|
oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][1],4);
|
||||||
|
/*Write the chosen AC token tables.*/
|
||||||
|
oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][0],4);
|
||||||
|
oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][1],4);
|
||||||
|
}
|
||||||
|
|
||||||
static void oc_enc_frame_pack(oc_enc_ctx *_enc){
|
static void oc_enc_frame_pack(oc_enc_ctx *_enc){
|
||||||
|
/*musl libc malloc()/realloc() calls might use floating point, so make sure
|
||||||
|
we've cleared the MMX state for them.*/
|
||||||
|
oc_restore_fpu(&_enc->state);
|
||||||
oggpackB_reset(&_enc->opb);
|
oggpackB_reset(&_enc->opb);
|
||||||
/*Only proceed if we have some coded blocks.
|
/*Only proceed if we have some coded blocks.*/
|
||||||
If there are no coded blocks, we can drop this frame simply by emitting a
|
|
||||||
0 byte packet.*/
|
|
||||||
if(_enc->state.ntotal_coded_fragis>0){
|
if(_enc->state.ntotal_coded_fragis>0){
|
||||||
oc_enc_frame_header_pack(_enc);
|
oc_enc_frame_header_pack(_enc);
|
||||||
if(_enc->state.frame_type==OC_INTER_FRAME){
|
if(_enc->state.frame_type==OC_INTER_FRAME){
|
||||||
@ -880,6 +924,10 @@ static void oc_enc_frame_pack(oc_enc_ctx *_enc){
|
|||||||
oc_enc_tokenize_finish(_enc);
|
oc_enc_tokenize_finish(_enc);
|
||||||
oc_enc_residual_tokens_pack(_enc);
|
oc_enc_residual_tokens_pack(_enc);
|
||||||
}
|
}
|
||||||
|
/*If there are no coded blocks, we can drop this frame simply by emitting a
|
||||||
|
0 byte packet.
|
||||||
|
We emit an inter frame with no coded blocks in VP3-compatibility mode.*/
|
||||||
|
else if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc);
|
||||||
/*Success: Mark the packet as ready to be flushed.*/
|
/*Success: Mark the packet as ready to be flushed.*/
|
||||||
_enc->packet_state=OC_PACKET_READY;
|
_enc->packet_state=OC_PACKET_READY;
|
||||||
#if defined(OC_COLLECT_METRICS)
|
#if defined(OC_COLLECT_METRICS)
|
||||||
@ -888,21 +936,31 @@ static void oc_enc_frame_pack(oc_enc_ctx *_enc){
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void oc_enc_vtable_init_c(oc_enc_ctx *_enc){
|
void oc_enc_accel_init_c(oc_enc_ctx *_enc){
|
||||||
/*The implementations prefixed with oc_enc_ are encoder-specific.
|
/*The implementations prefixed with oc_enc_ are encoder-specific.
|
||||||
The rest we re-use from the decoder.*/
|
The rest we re-use from the decoder.*/
|
||||||
|
# if defined(OC_ENC_USE_VTABLE)
|
||||||
|
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
|
||||||
|
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
|
||||||
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_c;
|
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_c;
|
||||||
_enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
|
_enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
|
||||||
_enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
|
_enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
|
||||||
_enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_c;
|
_enc->opt_vtable.frag_intra_sad=oc_enc_frag_intra_sad_c;
|
||||||
_enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_c;
|
_enc->opt_vtable.frag_satd=oc_enc_frag_satd_c;
|
||||||
|
_enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_c;
|
||||||
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
|
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
|
||||||
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
|
_enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_c;
|
||||||
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
|
_enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_c;
|
||||||
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
|
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
|
||||||
|
_enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_c;
|
||||||
|
_enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_c;
|
||||||
|
_enc->opt_vtable.quantize=oc_enc_quantize_c;
|
||||||
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
|
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
|
||||||
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
|
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
|
||||||
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
|
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
|
||||||
|
# endif
|
||||||
|
_enc->opt_data.enquant_table_size=64*sizeof(oc_iquant);
|
||||||
|
_enc->opt_data.enquant_table_alignment=16;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Initialize the macro block neighbor lists for MC analysis.
|
/*Initialize the macro block neighbor lists for MC analysis.
|
||||||
@ -1003,6 +1061,55 @@ static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void oc_enc_enquant_tables_init(oc_enc_ctx *_enc,
|
||||||
|
const th_quant_info *_qinfo){
|
||||||
|
unsigned char *etd;
|
||||||
|
size_t ets;
|
||||||
|
int align;
|
||||||
|
int qii;
|
||||||
|
int qi;
|
||||||
|
int pli;
|
||||||
|
int qti;
|
||||||
|
for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
|
||||||
|
_enc->state.dequant_tables[qi][pli][qti]=
|
||||||
|
_enc->state.dequant_table_data[qi][pli][qti];
|
||||||
|
}
|
||||||
|
/*Initialize the dequantization tables.*/
|
||||||
|
oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo);
|
||||||
|
/*And save off the DC values.*/
|
||||||
|
for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
|
||||||
|
_enc->dequant_dc[qi][pli][qti]=_enc->state.dequant_tables[qi][pli][qti][0];
|
||||||
|
}
|
||||||
|
/*Set up storage for the quantization tables.*/
|
||||||
|
etd=_enc->enquant_table_data;
|
||||||
|
ets=_enc->opt_data.enquant_table_size;
|
||||||
|
align=-(etd-(unsigned char *)0)&_enc->opt_data.enquant_table_alignment-1;
|
||||||
|
etd+=align;
|
||||||
|
/*Set up the main tables.*/
|
||||||
|
for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
|
||||||
|
_enc->enquant_tables[qi][pli][qti]=etd;
|
||||||
|
oc_enc_enquant_table_init(_enc,etd,
|
||||||
|
_enc->state.dequant_tables[qi][pli][qti]);
|
||||||
|
etd+=ets;
|
||||||
|
}
|
||||||
|
/*Set up storage for the local copies we modify for each frame.*/
|
||||||
|
for(pli=0;pli<3;pli++)for(qii=0;qii<3;qii++)for(qti=0;qti<2;qti++){
|
||||||
|
_enc->enquant[pli][qii][qti]=etd;
|
||||||
|
etd+=ets;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Updates the encoder state after the quantization parameters have been
|
||||||
|
changed.*/
|
||||||
|
static void oc_enc_quant_params_updated(oc_enc_ctx *_enc,
|
||||||
|
const th_quant_info *_qinfo){
|
||||||
|
oc_enc_enquant_tables_init(_enc,_qinfo);
|
||||||
|
memcpy(_enc->state.loop_filter_limits,_qinfo->loop_filter_limits,
|
||||||
|
sizeof(_enc->state.loop_filter_limits));
|
||||||
|
oc_enquant_qavg_init(_enc->log_qavg,_enc->log_plq,_enc->chroma_rd_scale,
|
||||||
|
_enc->state.dequant_tables,_enc->state.info.pixel_fmt);
|
||||||
|
}
|
||||||
|
|
||||||
/*Sets the quantization parameters to use.
|
/*Sets the quantization parameters to use.
|
||||||
This may only be called before the setup header is written.
|
This may only be called before the setup header is written.
|
||||||
If it is called multiple times, only the last call has any effect.
|
If it is called multiple times, only the last call has any effect.
|
||||||
@ -1012,25 +1119,20 @@ static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc,
|
|||||||
will be used.*/
|
will be used.*/
|
||||||
static int oc_enc_set_quant_params(oc_enc_ctx *_enc,
|
static int oc_enc_set_quant_params(oc_enc_ctx *_enc,
|
||||||
const th_quant_info *_qinfo){
|
const th_quant_info *_qinfo){
|
||||||
int qi;
|
th_quant_info old_qinfo;
|
||||||
int pli;
|
int ret;
|
||||||
int qti;
|
|
||||||
if(_enc==NULL)return TH_EFAULT;
|
if(_enc==NULL)return TH_EFAULT;
|
||||||
if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
|
if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
|
||||||
if(_qinfo==NULL)_qinfo=&TH_DEF_QUANT_INFO;
|
if(_qinfo==NULL)_qinfo=&TH_DEF_QUANT_INFO;
|
||||||
/*TODO: Analyze for packing purposes instead of just doing a shallow copy.*/
|
memcpy(&old_qinfo,&_enc->qinfo,sizeof(old_qinfo));
|
||||||
memcpy(&_enc->qinfo,_qinfo,sizeof(_enc->qinfo));
|
ret=oc_quant_params_clone(&_enc->qinfo,_qinfo);
|
||||||
for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
|
if(ret<0){
|
||||||
_enc->state.dequant_tables[qi][pli][qti]=
|
oc_quant_params_clear(&_enc->qinfo);
|
||||||
_enc->state.dequant_table_data[qi][pli][qti];
|
memcpy(&_enc->qinfo,&old_qinfo,sizeof(old_qinfo));
|
||||||
_enc->enquant_tables[qi][pli][qti]=_enc->enquant_table_data[qi][pli][qti];
|
return ret;
|
||||||
}
|
}
|
||||||
oc_enquant_tables_init(_enc->state.dequant_tables,
|
else oc_quant_params_clear(&old_qinfo);
|
||||||
_enc->enquant_tables,_qinfo);
|
oc_enc_quant_params_updated(_enc,_qinfo);
|
||||||
memcpy(_enc->state.loop_filter_limits,_qinfo->loop_filter_limits,
|
|
||||||
sizeof(_enc->state.loop_filter_limits));
|
|
||||||
oc_enquant_qavg_init(_enc->log_qavg,_enc->state.dequant_tables,
|
|
||||||
_enc->state.info.pixel_fmt);
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1039,6 +1141,7 @@ static void oc_enc_clear(oc_enc_ctx *_enc);
|
|||||||
static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
|
static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
|
||||||
th_info info;
|
th_info info;
|
||||||
size_t mcu_nmbs;
|
size_t mcu_nmbs;
|
||||||
|
ptrdiff_t mcu_ncfrags;
|
||||||
ptrdiff_t mcu_nfrags;
|
ptrdiff_t mcu_nfrags;
|
||||||
int hdec;
|
int hdec;
|
||||||
int vdec;
|
int vdec;
|
||||||
@ -1053,8 +1156,9 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
|
|||||||
if(info.quality<0)info.quality=32;
|
if(info.quality<0)info.quality=32;
|
||||||
if(info.target_bitrate<0)info.target_bitrate=0;
|
if(info.target_bitrate<0)info.target_bitrate=0;
|
||||||
/*Initialize the shared encoder/decoder state.*/
|
/*Initialize the shared encoder/decoder state.*/
|
||||||
ret=oc_state_init(&_enc->state,&info,4);
|
ret=oc_state_init(&_enc->state,&info,6);
|
||||||
if(ret<0)return ret;
|
if(ret<0)return ret;
|
||||||
|
oc_enc_accel_init(_enc);
|
||||||
_enc->mb_info=_ogg_calloc(_enc->state.nmbs,sizeof(*_enc->mb_info));
|
_enc->mb_info=_ogg_calloc(_enc->state.nmbs,sizeof(*_enc->mb_info));
|
||||||
_enc->frag_dc=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_dc));
|
_enc->frag_dc=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_dc));
|
||||||
_enc->coded_mbis=
|
_enc->coded_mbis=
|
||||||
@ -1065,9 +1169,14 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
|
|||||||
super block rows of Y' for each super block row of Cb and Cr.*/
|
super block rows of Y' for each super block row of Cb and Cr.*/
|
||||||
_enc->mcu_nvsbs=1<<vdec;
|
_enc->mcu_nvsbs=1<<vdec;
|
||||||
mcu_nmbs=_enc->mcu_nvsbs*_enc->state.fplanes[0].nhsbs*(size_t)4;
|
mcu_nmbs=_enc->mcu_nvsbs*_enc->state.fplanes[0].nhsbs*(size_t)4;
|
||||||
mcu_nfrags=4*mcu_nmbs+(8*mcu_nmbs>>hdec+vdec);
|
mcu_ncfrags=mcu_nmbs<<3-(hdec+vdec);
|
||||||
|
mcu_nfrags=4*mcu_nmbs+mcu_ncfrags;
|
||||||
_enc->mcu_skip_ssd=(unsigned *)_ogg_malloc(
|
_enc->mcu_skip_ssd=(unsigned *)_ogg_malloc(
|
||||||
mcu_nfrags*sizeof(*_enc->mcu_skip_ssd));
|
mcu_nfrags*sizeof(*_enc->mcu_skip_ssd));
|
||||||
|
_enc->mcu_rd_scale=(ogg_uint16_t *)_ogg_malloc(
|
||||||
|
(mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_scale));
|
||||||
|
_enc->mcu_rd_iscale=(ogg_uint16_t *)_ogg_malloc(
|
||||||
|
(mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_iscale));
|
||||||
for(pli=0;pli<3;pli++){
|
for(pli=0;pli<3;pli++){
|
||||||
_enc->dct_tokens[pli]=(unsigned char **)oc_malloc_2d(64,
|
_enc->dct_tokens[pli]=(unsigned char **)oc_malloc_2d(64,
|
||||||
_enc->state.fplanes[pli].nfrags,sizeof(**_enc->dct_tokens));
|
_enc->state.fplanes[pli].nfrags,sizeof(**_enc->dct_tokens));
|
||||||
@ -1075,34 +1184,22 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
|
|||||||
_enc->state.fplanes[pli].nfrags,sizeof(**_enc->extra_bits));
|
_enc->state.fplanes[pli].nfrags,sizeof(**_enc->extra_bits));
|
||||||
}
|
}
|
||||||
#if defined(OC_COLLECT_METRICS)
|
#if defined(OC_COLLECT_METRICS)
|
||||||
|
_enc->frag_sad=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_sad));
|
||||||
_enc->frag_satd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_satd));
|
_enc->frag_satd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_satd));
|
||||||
_enc->frag_ssd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_ssd));
|
_enc->frag_ssd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_ssd));
|
||||||
#endif
|
#endif
|
||||||
#if defined(OC_X86_ASM)
|
_enc->enquant_table_data=(unsigned char *)_ogg_malloc(
|
||||||
oc_enc_vtable_init_x86(_enc);
|
(64+3)*3*2*_enc->opt_data.enquant_table_size
|
||||||
#else
|
+_enc->opt_data.enquant_table_alignment-1);
|
||||||
oc_enc_vtable_init_c(_enc);
|
|
||||||
#endif
|
|
||||||
_enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift;
|
_enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift;
|
||||||
_enc->state.qis[0]=_enc->state.info.quality;
|
_enc->state.qis[0]=_enc->state.info.quality;
|
||||||
_enc->state.nqis=1;
|
_enc->state.nqis=1;
|
||||||
|
_enc->activity_avg=90<<12;
|
||||||
|
_enc->luma_avg=128<<8;
|
||||||
oc_rc_state_init(&_enc->rc,_enc);
|
oc_rc_state_init(&_enc->rc,_enc);
|
||||||
oggpackB_writeinit(&_enc->opb);
|
oggpackB_writeinit(&_enc->opb);
|
||||||
if(_enc->mb_info==NULL||_enc->frag_dc==NULL||_enc->coded_mbis==NULL||
|
memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes));
|
||||||
_enc->mcu_skip_ssd==NULL||_enc->dct_tokens[0]==NULL||
|
memset(_enc->qinfo.qi_ranges,0,sizeof(_enc->qinfo.qi_ranges));
|
||||||
_enc->dct_tokens[1]==NULL||_enc->dct_tokens[2]==NULL||
|
|
||||||
_enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL||
|
|
||||||
_enc->extra_bits[2]==NULL
|
|
||||||
#if defined(OC_COLLECT_METRICS)
|
|
||||||
||_enc->frag_satd==NULL||_enc->frag_ssd==NULL
|
|
||||||
#endif
|
|
||||||
){
|
|
||||||
oc_enc_clear(_enc);
|
|
||||||
return TH_EFAULT;
|
|
||||||
}
|
|
||||||
oc_mode_scheme_chooser_init(&_enc->chooser);
|
|
||||||
oc_enc_mb_info_init(_enc);
|
|
||||||
memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs));
|
|
||||||
/*Reset the packet-out state machine.*/
|
/*Reset the packet-out state machine.*/
|
||||||
_enc->packet_state=OC_PACKET_INFO_HDR;
|
_enc->packet_state=OC_PACKET_INFO_HDR;
|
||||||
_enc->dup_count=0;
|
_enc->dup_count=0;
|
||||||
@ -1114,26 +1211,45 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
|
|||||||
_enc->vp3_compatible=0;
|
_enc->vp3_compatible=0;
|
||||||
/*No INTER frames coded yet.*/
|
/*No INTER frames coded yet.*/
|
||||||
_enc->coded_inter_frame=0;
|
_enc->coded_inter_frame=0;
|
||||||
memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes));
|
if(_enc->mb_info==NULL||_enc->frag_dc==NULL||_enc->coded_mbis==NULL
|
||||||
oc_enc_set_quant_params(_enc,NULL);
|
||_enc->mcu_skip_ssd==NULL||_enc->dct_tokens[0]==NULL
|
||||||
|
||_enc->dct_tokens[1]==NULL||_enc->dct_tokens[2]==NULL
|
||||||
|
||_enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL
|
||||||
|
||_enc->extra_bits[2]==NULL
|
||||||
|
#if defined(OC_COLLECT_METRICS)
|
||||||
|
||_enc->frag_sad==NULL||_enc->frag_satd==NULL||_enc->frag_ssd==NULL
|
||||||
|
#endif
|
||||||
|
||oc_enc_set_quant_params(_enc,NULL)<0){
|
||||||
|
oc_enc_clear(_enc);
|
||||||
|
return TH_EFAULT;
|
||||||
|
}
|
||||||
|
oc_mode_scheme_chooser_init(&_enc->chooser);
|
||||||
|
oc_enc_mb_info_init(_enc);
|
||||||
|
memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void oc_enc_clear(oc_enc_ctx *_enc){
|
static void oc_enc_clear(oc_enc_ctx *_enc){
|
||||||
int pli;
|
int pli;
|
||||||
oc_rc_state_clear(&_enc->rc);
|
oc_rc_state_clear(&_enc->rc);
|
||||||
#if defined(OC_COLLECT_METRICS)
|
|
||||||
oc_enc_mode_metrics_dump(_enc);
|
|
||||||
#endif
|
|
||||||
oggpackB_writeclear(&_enc->opb);
|
oggpackB_writeclear(&_enc->opb);
|
||||||
|
oc_quant_params_clear(&_enc->qinfo);
|
||||||
|
_ogg_free(_enc->enquant_table_data);
|
||||||
#if defined(OC_COLLECT_METRICS)
|
#if defined(OC_COLLECT_METRICS)
|
||||||
|
/*Save the collected metrics from this run.
|
||||||
|
Use tools/process_modedec_stats to actually generate modedec.h from the
|
||||||
|
resulting file.*/
|
||||||
|
oc_mode_metrics_dump();
|
||||||
_ogg_free(_enc->frag_ssd);
|
_ogg_free(_enc->frag_ssd);
|
||||||
_ogg_free(_enc->frag_satd);
|
_ogg_free(_enc->frag_satd);
|
||||||
|
_ogg_free(_enc->frag_sad);
|
||||||
#endif
|
#endif
|
||||||
for(pli=3;pli-->0;){
|
for(pli=3;pli-->0;){
|
||||||
oc_free_2d(_enc->extra_bits[pli]);
|
oc_free_2d(_enc->extra_bits[pli]);
|
||||||
oc_free_2d(_enc->dct_tokens[pli]);
|
oc_free_2d(_enc->dct_tokens[pli]);
|
||||||
}
|
}
|
||||||
|
_ogg_free(_enc->mcu_rd_iscale);
|
||||||
|
_ogg_free(_enc->mcu_rd_scale);
|
||||||
_ogg_free(_enc->mcu_skip_ssd);
|
_ogg_free(_enc->mcu_skip_ssd);
|
||||||
_ogg_free(_enc->coded_mbis);
|
_ogg_free(_enc->coded_mbis);
|
||||||
_ogg_free(_enc->frag_dc);
|
_ogg_free(_enc->frag_dc);
|
||||||
@ -1145,10 +1261,14 @@ static void oc_enc_drop_frame(th_enc_ctx *_enc){
|
|||||||
/*Use the previous frame's reconstruction.*/
|
/*Use the previous frame's reconstruction.*/
|
||||||
_enc->state.ref_frame_idx[OC_FRAME_SELF]=
|
_enc->state.ref_frame_idx[OC_FRAME_SELF]=
|
||||||
_enc->state.ref_frame_idx[OC_FRAME_PREV];
|
_enc->state.ref_frame_idx[OC_FRAME_PREV];
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_SELF]=
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_PREV];
|
||||||
/*Flag motion vector analysis about the frame drop.*/
|
/*Flag motion vector analysis about the frame drop.*/
|
||||||
_enc->prevframe_dropped=1;
|
_enc->prevframe_dropped=1;
|
||||||
/*Zero the packet.*/
|
/*Zero the packet.*/
|
||||||
oggpackB_reset(&_enc->opb);
|
oggpackB_reset(&_enc->opb);
|
||||||
|
/*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/
|
||||||
|
if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void oc_enc_compress_keyframe(oc_enc_ctx *_enc,int _recode){
|
static void oc_enc_compress_keyframe(oc_enc_ctx *_enc,int _recode){
|
||||||
@ -1222,9 +1342,9 @@ static void oc_enc_set_granpos(oc_enc_ctx *_enc){
|
|||||||
th_enc_ctx *th_encode_alloc(const th_info *_info){
|
th_enc_ctx *th_encode_alloc(const th_info *_info){
|
||||||
oc_enc_ctx *enc;
|
oc_enc_ctx *enc;
|
||||||
if(_info==NULL)return NULL;
|
if(_info==NULL)return NULL;
|
||||||
enc=_ogg_malloc(sizeof(*enc));
|
enc=oc_aligned_malloc(sizeof(*enc),16);
|
||||||
if(enc==NULL||oc_enc_init(enc,_info)<0){
|
if(enc==NULL||oc_enc_init(enc,_info)<0){
|
||||||
_ogg_free(enc);
|
oc_aligned_free(enc);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return enc;
|
return enc;
|
||||||
@ -1233,7 +1353,7 @@ th_enc_ctx *th_encode_alloc(const th_info *_info){
|
|||||||
void th_encode_free(th_enc_ctx *_enc){
|
void th_encode_free(th_enc_ctx *_enc){
|
||||||
if(_enc!=NULL){
|
if(_enc!=NULL){
|
||||||
oc_enc_clear(_enc);
|
oc_enc_clear(_enc);
|
||||||
_ogg_free(_enc);
|
oc_aligned_free(_enc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1272,12 +1392,17 @@ int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
|
|||||||
}break;
|
}break;
|
||||||
case TH_ENCCTL_SET_VP3_COMPATIBLE:{
|
case TH_ENCCTL_SET_VP3_COMPATIBLE:{
|
||||||
int vp3_compatible;
|
int vp3_compatible;
|
||||||
|
int ret;
|
||||||
if(_enc==NULL||_buf==NULL)return TH_EFAULT;
|
if(_enc==NULL||_buf==NULL)return TH_EFAULT;
|
||||||
if(_buf_sz!=sizeof(vp3_compatible))return TH_EINVAL;
|
if(_buf_sz!=sizeof(vp3_compatible))return TH_EINVAL;
|
||||||
|
/*Try this before we change anything else, because it can fail.*/
|
||||||
|
ret=oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO);
|
||||||
|
/*If we can't allocate enough memory, don't change any of the state.*/
|
||||||
|
if(ret==TH_EFAULT)return ret;
|
||||||
vp3_compatible=*(int *)_buf;
|
vp3_compatible=*(int *)_buf;
|
||||||
_enc->vp3_compatible=vp3_compatible;
|
_enc->vp3_compatible=vp3_compatible;
|
||||||
if(oc_enc_set_huffman_codes(_enc,TH_VP31_HUFF_CODES)<0)vp3_compatible=0;
|
if(oc_enc_set_huffman_codes(_enc,TH_VP31_HUFF_CODES)<0)vp3_compatible=0;
|
||||||
if(oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO)<0)vp3_compatible=0;
|
if(ret<0)vp3_compatible=0;
|
||||||
if(_enc->state.info.pixel_fmt!=TH_PF_420||
|
if(_enc->state.info.pixel_fmt!=TH_PF_420||
|
||||||
_enc->state.info.pic_width<_enc->state.info.frame_width||
|
_enc->state.info.pic_width<_enc->state.info.frame_width||
|
||||||
_enc->state.info.pic_height<_enc->state.info.frame_height||
|
_enc->state.info.pic_height<_enc->state.info.frame_height||
|
||||||
@ -1386,6 +1511,44 @@ int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
|
|||||||
}
|
}
|
||||||
return oc_enc_rc_2pass_in(_enc,_buf,_buf_sz);
|
return oc_enc_rc_2pass_in(_enc,_buf,_buf_sz);
|
||||||
}break;
|
}break;
|
||||||
|
case TH_ENCCTL_SET_COMPAT_CONFIG:{
|
||||||
|
unsigned char buf[7];
|
||||||
|
oc_pack_buf opb;
|
||||||
|
th_quant_info qinfo;
|
||||||
|
th_huff_code huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
||||||
|
int ret;
|
||||||
|
int i;
|
||||||
|
if(_enc==NULL||_buf==NULL)return TH_EFAULT;
|
||||||
|
if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
|
||||||
|
oc_pack_readinit(&opb,_buf,_buf_sz);
|
||||||
|
/*Validate the setup packet header.*/
|
||||||
|
for(i=0;i<7;i++)buf[i]=(unsigned char)oc_pack_read(&opb,8);
|
||||||
|
if(!(buf[0]&0x80)||memcmp(buf+1,"theora",6)!=0)return TH_ENOTFORMAT;
|
||||||
|
if(buf[0]!=0x82)return TH_EBADHEADER;
|
||||||
|
/*Reads its contents.*/
|
||||||
|
ret=oc_quant_params_unpack(&opb,&qinfo);
|
||||||
|
if(ret<0){
|
||||||
|
oc_quant_params_clear(&qinfo);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
ret=oc_huff_codes_unpack(&opb,huff_codes);
|
||||||
|
if(ret<0){
|
||||||
|
oc_quant_params_clear(&qinfo);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
/*Install the new state.*/
|
||||||
|
oc_quant_params_clear(&_enc->qinfo);
|
||||||
|
memcpy(&_enc->qinfo,&qinfo,sizeof(qinfo));
|
||||||
|
oc_enc_quant_params_updated(_enc,&qinfo);
|
||||||
|
memcpy(_enc->huff_codes,huff_codes,sizeof(_enc->huff_codes));
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#if defined(OC_COLLECT_METRICS)
|
||||||
|
case TH_ENCCTL_SET_METRICS_FILE:{
|
||||||
|
OC_MODE_METRICS_FILENAME=(const char *)_buf;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
default:return TH_EIMPL;
|
default:return TH_EIMPL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1477,6 +1640,12 @@ static void oc_img_plane_copy_pad(th_img_plane *_dst,th_img_plane *_src,
|
|||||||
|
|
||||||
int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
|
int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
|
||||||
th_ycbcr_buffer img;
|
th_ycbcr_buffer img;
|
||||||
|
int frame_width;
|
||||||
|
int frame_height;
|
||||||
|
int pic_width;
|
||||||
|
int pic_height;
|
||||||
|
int pic_x;
|
||||||
|
int pic_y;
|
||||||
int cframe_width;
|
int cframe_width;
|
||||||
int cframe_height;
|
int cframe_height;
|
||||||
int cpic_width;
|
int cpic_width;
|
||||||
@ -1492,53 +1661,94 @@ int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
|
|||||||
if(_enc==NULL||_img==NULL)return TH_EFAULT;
|
if(_enc==NULL||_img==NULL)return TH_EFAULT;
|
||||||
if(_enc->packet_state==OC_PACKET_DONE)return TH_EINVAL;
|
if(_enc->packet_state==OC_PACKET_DONE)return TH_EINVAL;
|
||||||
if(_enc->rc.twopass&&_enc->rc.twopass_buffer_bytes==0)return TH_EINVAL;
|
if(_enc->rc.twopass&&_enc->rc.twopass_buffer_bytes==0)return TH_EINVAL;
|
||||||
if((ogg_uint32_t)_img[0].width!=_enc->state.info.frame_width||
|
|
||||||
(ogg_uint32_t)_img[0].height!=_enc->state.info.frame_height){
|
|
||||||
return TH_EINVAL;
|
|
||||||
}
|
|
||||||
hdec=!(_enc->state.info.pixel_fmt&1);
|
hdec=!(_enc->state.info.pixel_fmt&1);
|
||||||
vdec=!(_enc->state.info.pixel_fmt&2);
|
vdec=!(_enc->state.info.pixel_fmt&2);
|
||||||
cframe_width=_enc->state.info.frame_width>>hdec;
|
frame_width=_enc->state.info.frame_width;
|
||||||
cframe_height=_enc->state.info.frame_height>>vdec;
|
frame_height=_enc->state.info.frame_height;
|
||||||
if(_img[1].width!=cframe_width||_img[2].width!=cframe_width||
|
pic_x=_enc->state.info.pic_x;
|
||||||
_img[1].height!=cframe_height||_img[2].height!=cframe_height){
|
pic_y=_enc->state.info.pic_y;
|
||||||
return TH_EINVAL;
|
pic_width=_enc->state.info.pic_width;
|
||||||
}
|
pic_height=_enc->state.info.pic_height;
|
||||||
/*Step 2: Copy the input to our internal buffer.
|
cframe_width=frame_width>>hdec;
|
||||||
This lets us add padding, if necessary, so we don't have to worry about
|
cframe_height=frame_height>>vdec;
|
||||||
dereferencing possibly invalid addresses, and allows us to use the same
|
cpic_x=pic_x>>hdec;
|
||||||
strides and fragment offsets for both the input frame and the reference
|
cpic_y=pic_y>>vdec;
|
||||||
frames.*/
|
cpic_width=(pic_x+pic_width+hdec>>hdec)-cpic_x;
|
||||||
|
cpic_height=(pic_y+pic_height+vdec>>vdec)-cpic_y;
|
||||||
/*Flip the input buffer upside down.*/
|
/*Flip the input buffer upside down.*/
|
||||||
oc_ycbcr_buffer_flip(img,_img);
|
oc_ycbcr_buffer_flip(img,_img);
|
||||||
oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+0,img+0,
|
if(img[0].width!=frame_width||img[0].height!=frame_height||
|
||||||
_enc->state.info.pic_x,_enc->state.info.pic_y,
|
img[1].width!=cframe_width||img[2].width!=cframe_width||
|
||||||
_enc->state.info.pic_width,_enc->state.info.pic_height);
|
img[1].height!=cframe_height||img[2].height!=cframe_height){
|
||||||
cpic_x=_enc->state.info.pic_x>>hdec;
|
/*The buffer does not match the frame size.
|
||||||
cpic_y=_enc->state.info.pic_y>>vdec;
|
Check to see if it matches the picture size.*/
|
||||||
cpic_width=(_enc->state.info.pic_x+_enc->state.info.pic_width+hdec>>hdec)
|
if(img[0].width!=pic_width||img[0].height!=pic_height||
|
||||||
-cpic_x;
|
img[1].width!=cpic_width||img[2].width!=cpic_width||
|
||||||
cpic_height=(_enc->state.info.pic_y+_enc->state.info.pic_height+vdec>>vdec)
|
img[1].height!=cpic_height||img[2].height!=cpic_height){
|
||||||
-cpic_y;
|
/*It doesn't; we don't know how to handle it.*/
|
||||||
for(pli=1;pli<3;pli++){
|
return TH_EINVAL;
|
||||||
oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+pli,img+pli,
|
}
|
||||||
cpic_x,cpic_y,cpic_width,cpic_height);
|
/*Adjust the pointers to address a full frame.
|
||||||
|
We still only use the picture region, however.*/
|
||||||
|
img[0].data-=pic_y*(ptrdiff_t)img[0].stride+pic_x;
|
||||||
|
img[1].data-=cpic_y*(ptrdiff_t)img[1].stride+cpic_x;
|
||||||
|
img[2].data-=cpic_y*(ptrdiff_t)img[2].stride+cpic_x;
|
||||||
}
|
}
|
||||||
/*Step 3: Update the buffer state.*/
|
/*Step 2: Update the buffer state.*/
|
||||||
if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){
|
if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){
|
||||||
_enc->state.ref_frame_idx[OC_FRAME_PREV]=
|
_enc->state.ref_frame_idx[OC_FRAME_PREV]=
|
||||||
_enc->state.ref_frame_idx[OC_FRAME_SELF];
|
_enc->state.ref_frame_idx[OC_FRAME_SELF];
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_PREV]=
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_SELF];
|
||||||
if(_enc->state.frame_type==OC_INTRA_FRAME){
|
if(_enc->state.frame_type==OC_INTRA_FRAME){
|
||||||
/*The new frame becomes both the previous and gold reference frames.*/
|
/*The new frame becomes both the previous and gold reference frames.*/
|
||||||
_enc->state.keyframe_num=_enc->state.curframe_num;
|
_enc->state.keyframe_num=_enc->state.curframe_num;
|
||||||
_enc->state.ref_frame_idx[OC_FRAME_GOLD]=
|
_enc->state.ref_frame_idx[OC_FRAME_GOLD]=
|
||||||
_enc->state.ref_frame_idx[OC_FRAME_SELF];
|
_enc->state.ref_frame_idx[OC_FRAME_SELF];
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_GOLD]=
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_SELF];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if(_enc->state.ref_frame_idx[OC_FRAME_IO]>=0&&_enc->prevframe_dropped==0){
|
||||||
|
_enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG]=
|
||||||
|
_enc->state.ref_frame_idx[OC_FRAME_IO];
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_PREV_ORIG]=
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_IO];
|
||||||
|
if(_enc->state.frame_type==OC_INTRA_FRAME){
|
||||||
|
/*The new input frame becomes both the previous and gold
|
||||||
|
original-reference frames.*/
|
||||||
|
_enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]=
|
||||||
|
_enc->state.ref_frame_idx[OC_FRAME_IO];
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_GOLD_ORIG]=
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_IO];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*Select a free buffer to use for the incoming frame*/
|
||||||
|
for(refi=3;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]||
|
||||||
|
refi==_enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG];refi++);
|
||||||
|
_enc->state.ref_frame_idx[OC_FRAME_IO]=refi;
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_IO]=
|
||||||
|
_enc->state.ref_frame_bufs[refi][0].data;
|
||||||
|
/*Step 3: Copy the input to our internal buffer.
|
||||||
|
This lets us add padding, so we don't have to worry about dereferencing
|
||||||
|
possibly invalid addresses, and allows us to use the same strides and
|
||||||
|
fragment offsets for both the input frame and the reference frames.*/
|
||||||
|
oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+0,img+0,
|
||||||
|
pic_x,pic_y,pic_width,pic_height);
|
||||||
|
oc_state_borders_fill_rows(&_enc->state,refi,0,0,frame_height);
|
||||||
|
oc_state_borders_fill_caps(&_enc->state,refi,0);
|
||||||
|
for(pli=1;pli<3;pli++){
|
||||||
|
oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+pli,img+pli,
|
||||||
|
cpic_x,cpic_y,cpic_width,cpic_height);
|
||||||
|
oc_state_borders_fill_rows(&_enc->state,refi,pli,0,cframe_height);
|
||||||
|
oc_state_borders_fill_caps(&_enc->state,refi,pli);
|
||||||
|
}
|
||||||
/*Select a free buffer to use for the reconstructed version of this frame.*/
|
/*Select a free buffer to use for the reconstructed version of this frame.*/
|
||||||
for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]||
|
for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]||
|
||||||
refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++);
|
refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++);
|
||||||
_enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
|
_enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
|
||||||
|
_enc->state.ref_frame_data[OC_FRAME_SELF]=
|
||||||
|
_enc->state.ref_frame_bufs[refi][0].data;
|
||||||
_enc->state.curframe_num+=_enc->prev_dup_count+1;
|
_enc->state.curframe_num+=_enc->prev_dup_count+1;
|
||||||
/*Step 4: Compress the frame.*/
|
/*Step 4: Compress the frame.*/
|
||||||
/*Start with a keyframe, and don't allow the generation of invalid files that
|
/*Start with a keyframe, and don't allow the generation of invalid files that
|
||||||
@ -1575,11 +1785,11 @@ int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
|
|||||||
}
|
}
|
||||||
|
|
||||||
int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
|
int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
|
||||||
|
unsigned char *packet;
|
||||||
if(_enc==NULL||_op==NULL)return TH_EFAULT;
|
if(_enc==NULL||_op==NULL)return TH_EFAULT;
|
||||||
if(_enc->packet_state==OC_PACKET_READY){
|
if(_enc->packet_state==OC_PACKET_READY){
|
||||||
_enc->packet_state=OC_PACKET_EMPTY;
|
_enc->packet_state=OC_PACKET_EMPTY;
|
||||||
if(_enc->rc.twopass!=1){
|
if(_enc->rc.twopass!=1){
|
||||||
unsigned char *packet;
|
|
||||||
packet=oggpackB_get_buffer(&_enc->opb);
|
packet=oggpackB_get_buffer(&_enc->opb);
|
||||||
/*If there's no packet, malloc failed while writing; it's lost forever.*/
|
/*If there's no packet, malloc failed while writing; it's lost forever.*/
|
||||||
if(packet==NULL)return TH_EFAULT;
|
if(packet==NULL)return TH_EFAULT;
|
||||||
@ -1595,8 +1805,22 @@ int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
|
|||||||
else if(_enc->packet_state==OC_PACKET_EMPTY){
|
else if(_enc->packet_state==OC_PACKET_EMPTY){
|
||||||
if(_enc->nqueued_dups>0){
|
if(_enc->nqueued_dups>0){
|
||||||
_enc->nqueued_dups--;
|
_enc->nqueued_dups--;
|
||||||
_op->packet=NULL;
|
/*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/
|
||||||
_op->bytes=0;
|
if(_enc->vp3_compatible){
|
||||||
|
oggpackB_reset(&_enc->opb);
|
||||||
|
oc_enc_drop_frame_pack(_enc);
|
||||||
|
packet=oggpackB_get_buffer(&_enc->opb);
|
||||||
|
/*If there's no packet, malloc failed while writing; it's lost
|
||||||
|
forever.*/
|
||||||
|
if(packet==NULL)return TH_EFAULT;
|
||||||
|
_op->packet=packet;
|
||||||
|
_op->bytes=oggpackB_bytes(&_enc->opb);
|
||||||
|
}
|
||||||
|
/*Otherwise emit a 0-byte packet.*/
|
||||||
|
else{
|
||||||
|
_op->packet=NULL;
|
||||||
|
_op->bytes=0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
if(_last_p)_enc->packet_state=OC_PACKET_DONE;
|
if(_last_p)_enc->packet_state=OC_PACKET_DONE;
|
||||||
|
5
thirdparty/libtheora/encoder_disabled.c
vendored
5
thirdparty/libtheora/encoder_disabled.c
vendored
@ -11,12 +11,15 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: encoder_disabled.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#include "apiwrapper.h"
|
#include "apiwrapper.h"
|
||||||
#include "encint.h"
|
#include "encint.h"
|
||||||
|
|
||||||
|
/*Placeholder definition for builds with the encoder disabled; every field is
  zero.
  An explicit {0} is used because an empty initializer list is not valid C
  before C23.*/
const th_quant_info TH_VP31_QUANT_INFO = {0};
|
||||||
|
const th_huff_code TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
||||||
|
|
||||||
th_enc_ctx *th_encode_alloc(const th_info *_info){
|
th_enc_ctx *th_encode_alloc(const th_info *_info){
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
200
thirdparty/libtheora/enquant.c
vendored
200
thirdparty/libtheora/enquant.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: enquant.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -20,6 +20,69 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Makes a deep copy of a set of quantization parameters.
  The sizes and base_matrices arrays of each of the six qi_ranges entries are
  duplicated with _ogg_malloc(), except where _src shares one array between
  entries, in which case _dst shares its own copy in the same way.
  _dst: The structure to fill in.
  _src: The parameters to copy.
  Return: 0 on success, or TH_EFAULT if an allocation fails, in which case _dst
  is left partially constructed.*/
int oc_quant_params_clone(th_quant_info *_dst,const th_quant_info *_src){
|
||||||
|
int i;
|
||||||
|
/*Start from a shallow copy, then zero the range tables so that _dst never
  retains array pointers owned by _src.*/
memcpy(_dst,_src,sizeof(*_dst));
|
||||||
|
memset(_dst->qi_ranges,0,sizeof(_dst->qi_ranges));
|
||||||
|
/*i enumerates the six qi_ranges[qti][pli] entries in order.*/
for(i=0;i<6;i++){
|
||||||
|
int nranges;
|
||||||
|
int qti;
|
||||||
|
int pli;
|
||||||
|
int qtj;
|
||||||
|
int plj;
|
||||||
|
int pdup;
|
||||||
|
int qdup;
|
||||||
|
qti=i/3;
|
||||||
|
pli=i%3;
|
||||||
|
/*(qtj,plj) is the entry immediately before (qti,pli).
  For i==0 these values are not meaningful; the i>0 test in pdup keeps them
  from being used as indices.*/
qtj=(i-1)/3;
|
||||||
|
plj=(i-1)%3;
|
||||||
|
nranges=_src->qi_ranges[qti][pli].nranges;
|
||||||
|
/*Check for those duplicates that can be cleanly handled by
|
||||||
|
oc_quant_params_clear().*/
|
||||||
|
pdup=i>0&&nranges<=_src->qi_ranges[qtj][plj].nranges;
|
||||||
|
qdup=qti>0&&nranges<=_src->qi_ranges[0][pli].nranges;
|
||||||
|
_dst->qi_ranges[qti][pli].nranges=nranges;
|
||||||
|
/*Copy the sizes array: share the previous entry's copy, share the copy made
  for the same plane at qti 0, or allocate a new one.*/
if(pdup&&_src->qi_ranges[qti][pli].sizes==_src->qi_ranges[qtj][plj].sizes){
|
||||||
|
_dst->qi_ranges[qti][pli].sizes=_dst->qi_ranges[qtj][plj].sizes;
|
||||||
|
}
|
||||||
|
else if(qdup&&_src->qi_ranges[1][pli].sizes==_src->qi_ranges[0][pli].sizes){
|
||||||
|
_dst->qi_ranges[1][pli].sizes=_dst->qi_ranges[0][pli].sizes;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
int *sizes;
|
||||||
|
sizes=(int *)_ogg_malloc(nranges*sizeof(*sizes));
|
||||||
|
/*Note: The caller is responsible for cleaning up any partially
|
||||||
|
constructed qinfo.*/
|
||||||
|
if(sizes==NULL)return TH_EFAULT;
|
||||||
|
memcpy(sizes,_src->qi_ranges[qti][pli].sizes,nranges*sizeof(*sizes));
|
||||||
|
_dst->qi_ranges[qti][pli].sizes=sizes;
|
||||||
|
}
|
||||||
|
/*Copy the base matrices the same way; this array holds nranges+1 entries.*/
if(pdup&&_src->qi_ranges[qti][pli].base_matrices==
|
||||||
|
_src->qi_ranges[qtj][plj].base_matrices){
|
||||||
|
_dst->qi_ranges[qti][pli].base_matrices=
|
||||||
|
_dst->qi_ranges[qtj][plj].base_matrices;
|
||||||
|
}
|
||||||
|
else if(qdup&&_src->qi_ranges[1][pli].base_matrices==
|
||||||
|
_src->qi_ranges[0][pli].base_matrices){
|
||||||
|
_dst->qi_ranges[1][pli].base_matrices=
|
||||||
|
_dst->qi_ranges[0][pli].base_matrices;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
th_quant_base *base_matrices;
|
||||||
|
base_matrices=(th_quant_base *)_ogg_malloc(
|
||||||
|
(nranges+1)*sizeof(*base_matrices));
|
||||||
|
/*Note: The caller is responsible for cleaning up any partially
|
||||||
|
constructed qinfo.*/
|
||||||
|
if(base_matrices==NULL)return TH_EFAULT;
|
||||||
|
memcpy(base_matrices,_src->qi_ranges[qti][pli].base_matrices,
|
||||||
|
(nranges+1)*sizeof(*base_matrices));
|
||||||
|
_dst->qi_ranges[qti][pli].base_matrices=
|
||||||
|
(const th_quant_base *)base_matrices;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
|
void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
|
||||||
const th_quant_ranges *qranges;
|
const th_quant_ranges *qranges;
|
||||||
const th_quant_base *base_mats[2*3*64];
|
const th_quant_base *base_mats[2*3*64];
|
||||||
@ -119,7 +182,7 @@ void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
|
void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
|
||||||
ogg_uint32_t t;
|
ogg_uint32_t t;
|
||||||
int l;
|
int l;
|
||||||
_d<<=1;
|
_d<<=1;
|
||||||
@ -129,50 +192,63 @@ static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
|
|||||||
_this->l=l;
|
_this->l=l;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*See comments at oc_dequant_tables_init() for how the quantization tables'
|
void oc_enc_enquant_table_init_c(void *_enquant,
|
||||||
storage should be initialized.*/
|
const ogg_uint16_t _dequant[64]){
|
||||||
void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
|
oc_iquant *enquant;
|
||||||
oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo){
|
int zzi;
|
||||||
int qi;
|
/*In the original VP3.2 code, the rounding offset and the size of the
|
||||||
|
dead zone around 0 were controlled by a "sharpness" parameter.
|
||||||
|
We now R-D optimize the tokens for each block after quantization,
|
||||||
|
so the rounding offset should always be 1/2, and an explicit dead
|
||||||
|
zone is unnecessary.
|
||||||
|
Hence, all of that VP3.2 code is gone from here, and the remaining
|
||||||
|
floating point code has been implemented as equivalent integer
|
||||||
|
code with exact precision.*/
|
||||||
|
enquant=(oc_iquant *)_enquant;
|
||||||
|
for(zzi=0;zzi<64;zzi++)oc_iquant_init(enquant+zzi,_dequant[zzi]);
|
||||||
|
}
|
||||||
|
|
||||||
|
void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis){
|
||||||
int pli;
|
int pli;
|
||||||
|
int qii;
|
||||||
int qti;
|
int qti;
|
||||||
/*Initialize the dequantization tables first.*/
|
for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
|
||||||
oc_dequant_tables_init(_dequant,NULL,_qinfo);
|
*((oc_iquant *)_enquant[pli][qii][qti])=
|
||||||
/*Derive the quantization tables directly from the dequantization tables.*/
|
*((oc_iquant *)_enquant[pli][0][qti]);
|
||||||
for(qi=0;qi<64;qi++)for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
|
|
||||||
int zzi;
|
|
||||||
int plj;
|
|
||||||
int qtj;
|
|
||||||
int dupe;
|
|
||||||
dupe=0;
|
|
||||||
for(qtj=0;qtj<=qti;qtj++){
|
|
||||||
for(plj=0;plj<(qtj<qti?3:pli);plj++){
|
|
||||||
if(_dequant[qi][pli][qti]==_dequant[qi][plj][qtj]){
|
|
||||||
dupe=1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(dupe)break;
|
|
||||||
}
|
|
||||||
if(dupe){
|
|
||||||
_enquant[qi][pli][qti]=_enquant[qi][plj][qtj];
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/*In the original VP3.2 code, the rounding offset and the size of the
|
|
||||||
dead zone around 0 were controlled by a "sharpness" parameter.
|
|
||||||
We now R-D optimize the tokens for each block after quantization,
|
|
||||||
so the rounding offset should always be 1/2, and an explicit dead
|
|
||||||
zone is unnecessary.
|
|
||||||
Hence, all of that VP3.2 code is gone from here, and the remaining
|
|
||||||
floating point code has been implemented as equivalent integer
|
|
||||||
code with exact precision.*/
|
|
||||||
for(zzi=0;zzi<64;zzi++){
|
|
||||||
oc_iquant_init(_enquant[qi][pli][qti]+zzi,
|
|
||||||
_dequant[qi][pli][qti][zzi]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*Quantizes a block of 64 DCT coefficients.
  _qdct: Receives the 64 quantized coefficients.
  _dct: The 64 input coefficients.
  _dequant: The dequantizer for each coefficient.
  _enquant: A table of 64 oc_iquant entries (passed as an opaque pointer)
  whose m and l fields are used to divide by the matching dequantizer.
  Return: The index of the last coefficient that was not forced to zero by the
  threshold test, or 0 if there was none.*/
int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||||
|
const ogg_uint16_t _dequant[64],const void *_enquant){
|
||||||
|
const oc_iquant *enquant;
|
||||||
|
int nonzero;
|
||||||
|
int zzi;
|
||||||
|
int val;
|
||||||
|
int d;
|
||||||
|
int s;
|
||||||
|
enquant=(const oc_iquant *)_enquant;
|
||||||
|
nonzero=0;
|
||||||
|
for(zzi=0;zzi<64;zzi++){
|
||||||
|
val=_dct[zzi];
|
||||||
|
d=_dequant[zzi];
|
||||||
|
/*Work with twice the coefficient, so the test below compares its magnitude
  against half of the dequantizer.*/
val=val<<1;
|
||||||
|
if(abs(val)>=d){
|
||||||
|
s=OC_SIGNMASK(val);
|
||||||
|
/*The bias added here rounds ties away from zero, since token
|
||||||
|
optimization can only decrease the magnitude of the quantized
|
||||||
|
value.*/
|
||||||
|
/*+ binds tighter than ^, so this adds (d+s)^s.
  Presumably OC_SIGNMASK() yields -1 for negative values and 0 otherwise (the
  macro is defined elsewhere), making the bias -d or +d.*/
val+=d+s^s;
|
||||||
|
/*Note the arithmetic right shift is not guaranteed by ANSI C.
|
||||||
|
Hopefully no one still uses ones-complement architectures.*/
|
||||||
|
val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
|
||||||
|
_qdct[zzi]=(ogg_int16_t)val;
|
||||||
|
/*Remember the highest index that passed the threshold test.*/
nonzero=zzi;
|
||||||
|
}
|
||||||
|
else _qdct[zzi]=0;
|
||||||
|
}
|
||||||
|
return nonzero;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*This table gives the square root of the fraction of the squared magnitude of
|
/*This table gives the square root of the fraction of the squared magnitude of
|
||||||
@ -226,7 +302,7 @@ static const ogg_uint16_t OC_RPSD[2][64]={
|
|||||||
relative to the total, scaled by 2**16, for each pixel format.
|
relative to the total, scaled by 2**16, for each pixel format.
|
||||||
These values were measured after motion-compensated prediction, before
|
These values were measured after motion-compensated prediction, before
|
||||||
quantization, over a large set of test video encoded at all possible rates.
|
quantization, over a large set of test video encoded at all possible rates.
|
||||||
TODO: These values are only from INTER frames; it should be re-measured for
|
TODO: These values are only from INTER frames; they should be re-measured for
|
||||||
INTRA frames.*/
|
INTRA frames.*/
|
||||||
static const ogg_uint16_t OC_PCD[4][3]={
|
static const ogg_uint16_t OC_PCD[4][3]={
|
||||||
{59926, 3038, 2572},
|
{59926, 3038, 2572},
|
||||||
@ -236,38 +312,58 @@ static const ogg_uint16_t OC_PCD[4][3]={
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/*Compute an "average" quantizer for each qi level.
|
/*Compute "average" quantizers for each qi level to use for rate control.
|
||||||
We do one for INTER and one for INTRA, since their behavior is very
|
We do one for each color channel, as well as an average across color
|
||||||
different, but average across chroma channels.
|
channels, separately for INTER and INTRA, since their behavior is very
|
||||||
|
different.
|
||||||
The basic approach is to compute a harmonic average of the squared quantizer,
|
The basic approach is to compute a harmonic average of the squared quantizer,
|
||||||
weighted by the expected squared magnitude of the DCT coefficients.
|
weighted by the expected squared magnitude of the DCT coefficients.
|
||||||
Under the (not quite true) assumption that DCT coefficients are
|
Under the (not quite true) assumption that DCT coefficients are
|
||||||
Laplacian-distributed, this preserves the product Q*lambda, where
|
Laplacian-distributed, this preserves the product Q*lambda, where
|
||||||
lambda=sqrt(2/sigma**2) is the Laplacian distribution parameter (not to be
|
lambda=sqrt(2/sigma**2) is the Laplacian distribution parameter (not to be
|
||||||
confused with the lambda used in R-D optimization throughout most of the
|
confused with the lambda used in R-D optimization throughout most of the
|
||||||
rest of the code).
|
rest of the code), when the distributions from multiple coefficients are
|
||||||
The value Q*lambda completely determines the entropy of the coefficients.*/
|
pooled.
|
||||||
|
The value Q*lambda completely determines the entropy of coefficients drawn
|
||||||
|
from a Laplacian distribution, and thus the expected bitrate.*/
|
||||||
void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
|
void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
|
||||||
|
ogg_int16_t _log_plq[64][3][2],ogg_uint16_t _chroma_rd_scale[2][64][2],
|
||||||
ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt){
|
ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt){
|
||||||
int qi;
|
int qi;
|
||||||
int pli;
|
int pli;
|
||||||
int qti;
|
int qti;
|
||||||
int ci;
|
int ci;
|
||||||
for(qti=0;qti<2;qti++)for(qi=0;qi<64;qi++){
|
for(qti=0;qti<2;qti++)for(qi=0;qi<64;qi++){
|
||||||
ogg_int64_t q2;
|
ogg_int64_t q2;
|
||||||
|
ogg_uint32_t qp[3];
|
||||||
|
ogg_uint32_t cqp;
|
||||||
|
ogg_uint32_t d;
|
||||||
q2=0;
|
q2=0;
|
||||||
for(pli=0;pli<3;pli++){
|
for(pli=0;pli<3;pli++){
|
||||||
ogg_uint32_t qp;
|
qp[pli]=0;
|
||||||
qp=0;
|
|
||||||
for(ci=0;ci<64;ci++){
|
for(ci=0;ci<64;ci++){
|
||||||
unsigned rq;
|
unsigned rq;
|
||||||
unsigned qd;
|
unsigned qd;
|
||||||
qd=_dequant[qi][pli][qti][OC_IZIG_ZAG[ci]];
|
qd=_dequant[qi][pli][qti][OC_IZIG_ZAG[ci]];
|
||||||
rq=(OC_RPSD[qti][ci]+(qd>>1))/qd;
|
rq=(OC_RPSD[qti][ci]+(qd>>1))/qd;
|
||||||
qp+=rq*(ogg_uint32_t)rq;
|
qp[pli]+=rq*(ogg_uint32_t)rq;
|
||||||
}
|
}
|
||||||
q2+=OC_PCD[_pixel_fmt][pli]*(ogg_int64_t)qp;
|
q2+=OC_PCD[_pixel_fmt][pli]*(ogg_int64_t)qp[pli];
|
||||||
|
/*plq=1.0/sqrt(qp)*/
|
||||||
|
_log_plq[qi][pli][qti]=
|
||||||
|
(ogg_int16_t)(OC_Q10(32)-oc_blog32_q10(qp[pli])>>1);
|
||||||
}
|
}
|
||||||
|
d=OC_PCD[_pixel_fmt][1]+OC_PCD[_pixel_fmt][2];
|
||||||
|
cqp=(ogg_uint32_t)((OC_PCD[_pixel_fmt][1]*(ogg_int64_t)qp[1]+
|
||||||
|
OC_PCD[_pixel_fmt][2]*(ogg_int64_t)qp[2]+(d>>1))/d);
|
||||||
|
/*chroma_rd_scale=clamp(0.25,cqp/qp[0],4)*/
|
||||||
|
d=OC_MAXI(qp[0]+(1<<OC_RD_SCALE_BITS-1)>>OC_RD_SCALE_BITS,1);
|
||||||
|
d=OC_CLAMPI(1<<OC_RD_SCALE_BITS-2,(cqp+(d>>1))/d,4<<OC_RD_SCALE_BITS);
|
||||||
|
_chroma_rd_scale[qti][qi][0]=(ogg_int16_t)d;
|
||||||
|
/*chroma_rd_iscale=clamp(0.25,qp[0]/cqp,4)*/
|
||||||
|
d=OC_MAXI(OC_RD_ISCALE(cqp,1),1);
|
||||||
|
d=OC_CLAMPI(1<<OC_RD_ISCALE_BITS-2,(qp[0]+(d>>1))/d,4<<OC_RD_ISCALE_BITS);
|
||||||
|
_chroma_rd_scale[qti][qi][1]=(ogg_int16_t)d;
|
||||||
/*qavg=1.0/sqrt(q2).*/
|
/*qavg=1.0/sqrt(q2).*/
|
||||||
_log_qavg[qti][qi]=OC_Q57(48)-oc_blog64(q2)>>1;
|
_log_qavg[qti][qi]=OC_Q57(48)-oc_blog64(q2)>>1;
|
||||||
}
|
}
|
||||||
|
7
thirdparty/libtheora/enquant.h
vendored
7
thirdparty/libtheora/enquant.h
vendored
@ -14,14 +14,13 @@ struct oc_iquant{
|
|||||||
ogg_int16_t l;
|
ogg_int16_t l;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef oc_iquant oc_iquant_table[64];
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int oc_quant_params_clone(th_quant_info *_dst,const th_quant_info *_src);
|
||||||
void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
|
void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
|
||||||
void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
|
void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d);
|
||||||
oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo);
|
|
||||||
void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
|
void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
|
||||||
|
ogg_int16_t _log_plq[64][3][2],ogg_uint16_t _pl_rd_scale[2][64][2],
|
||||||
ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);
|
ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
9
thirdparty/libtheora/fdct.c
vendored
9
thirdparty/libtheora/fdct.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: fdct.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#include "encint.h"
|
#include "encint.h"
|
||||||
@ -120,11 +120,6 @@ static void oc_fdct8(ogg_int16_t _y[8],const ogg_int16_t *_x){
|
|||||||
_y[7]=v;
|
_y[7]=v;
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
|
|
||||||
const ogg_int16_t _x[64]){
|
|
||||||
(*_enc->opt_vtable.fdct8x8)(_y,_x);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*Performs a forward 8x8 Type-II DCT transform.
|
/*Performs a forward 8x8 Type-II DCT transform.
|
||||||
The output is scaled by a factor of 4 relative to the orthonormal version
|
The output is scaled by a factor of 4 relative to the orthonormal version
|
||||||
of the transform.
|
of the transform.
|
||||||
@ -152,7 +147,7 @@ void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
|||||||
/*Round the result back to the external working precision (which is still
|
/*Round the result back to the external working precision (which is still
|
||||||
scaled by four relative to the orthogonal result).
|
scaled by four relative to the orthogonal result).
|
||||||
TODO: We should just update the external working precision.*/
|
TODO: We should just update the external working precision.*/
|
||||||
for(i=0;i<64;i++)_y[i]=w[i]+2>>2;
|
for(i=0;i<64;i++)_y[i]=w[OC_FZIG_ZAG[i]]+2>>2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
43
thirdparty/libtheora/fragment.c
vendored
43
thirdparty/libtheora/fragment.c
vendored
@ -11,17 +11,12 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: fragment.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "internal.h"
|
#include "internal.h"
|
||||||
|
|
||||||
void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _ystride){
|
|
||||||
(*_state->opt_vtable.frag_copy)(_dst,_src,_ystride);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
|
void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
|
||||||
int i;
|
int i;
|
||||||
for(i=8;i-->0;){
|
for(i=8;i-->0;){
|
||||||
@ -31,9 +26,24 @@ void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,
|
/*Copies the fragments specified by the lists of fragment indices from one
|
||||||
int _ystride,const ogg_int16_t _residue[64]){
|
frame to another.
|
||||||
_state->opt_vtable.frag_recon_intra(_dst,_ystride,_residue);
|
_dst_frame: The reference frame to copy to.
|
||||||
|
_src_frame: The reference frame to copy from.
|
||||||
|
_ystride: The row stride of the reference frames.
|
||||||
|
_fragis: A pointer to a list of fragment indices.
|
||||||
|
_nfragis: The number of fragment indices to copy.
|
||||||
|
_frag_buf_offs: The offsets of fragments in the reference frames.*/
|
||||||
|
void oc_frag_copy_list_c(unsigned char *_dst_frame,
|
||||||
|
const unsigned char *_src_frame,int _ystride,
|
||||||
|
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
|
||||||
|
ptrdiff_t fragii;
|
||||||
|
for(fragii=0;fragii<_nfragis;fragii++){
|
||||||
|
ptrdiff_t frag_buf_off;
|
||||||
|
/*The same offset is applied to both frames, so each listed fragment is
  copied between matching positions.*/
frag_buf_off=_frag_buf_offs[_fragis[fragii]];
|
||||||
|
oc_frag_copy_c(_dst_frame+frag_buf_off,
|
||||||
|
_src_frame+frag_buf_off,_ystride);
|
||||||
|
}
|
||||||
|
}
|
}
|
||||||
|
|
||||||
void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
|
void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
|
||||||
@ -46,11 +56,6 @@ void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
|
|
||||||
_state->opt_vtable.frag_recon_inter(_dst,_src,_ystride,_residue);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_frag_recon_inter_c(unsigned char *_dst,
|
void oc_frag_recon_inter_c(unsigned char *_dst,
|
||||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
|
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
|
||||||
int i;
|
int i;
|
||||||
@ -62,12 +67,6 @@ void oc_frag_recon_inter_c(unsigned char *_dst,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,
|
|
||||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride,
|
|
||||||
const ogg_int16_t _residue[64]){
|
|
||||||
_state->opt_vtable.frag_recon_inter2(_dst,_src1,_src2,_ystride,_residue);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
||||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){
|
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){
|
||||||
int i;
|
int i;
|
||||||
@ -80,8 +79,4 @@ void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_restore_fpu(const oc_theora_state *_state){
|
|
||||||
_state->opt_vtable.restore_fpu();
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_restore_fpu_c(void){}
|
void oc_restore_fpu_c(void){}
|
||||||
|
712
thirdparty/libtheora/huffdec.c
vendored
712
thirdparty/libtheora/huffdec.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: huffdec.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -22,14 +22,60 @@
|
|||||||
#include "decint.h"
|
#include "decint.h"
|
||||||
|
|
||||||
|
|
||||||
/*The ANSI offsetof macro is broken on some platforms (e.g., older DECs).*/
|
|
||||||
#define _ogg_offsetof(_type,_field)\
|
|
||||||
((size_t)((char *)&((_type *)0)->_field-(char *)0))
|
|
||||||
|
|
||||||
/*The number of internal tokens associated with each of the spec tokens.*/
|
/*Instead of storing every branching in the tree, subtrees can be collapsed
|
||||||
static const unsigned char OC_DCT_TOKEN_MAP_ENTRIES[TH_NDCT_TOKENS]={
|
into one node, with a table of size 1<<nbits pointing directly to its
|
||||||
1,1,1,4,8,1,1,8,1,1,1,1,1,2,2,2,2,4,8,2,2,2,4,2,2,2,2,2,8,2,4,8
|
descendants nbits levels down.
|
||||||
};
|
This allows more than one bit to be read at a time, and avoids following all
|
||||||
|
the intermediate branches with next to no increased code complexity once
|
||||||
|
the collapsed tree has been built.
|
||||||
|
We do _not_ require that a subtree be complete to be collapsed, but instead
|
||||||
|
store duplicate pointers in the table, and record the actual depth of the
|
||||||
|
node below its parent.
|
||||||
|
This tells us the number of bits to advance the stream after reaching it.
|
||||||
|
|
||||||
|
This turns out to be equivalent to the method described in \cite{Hash95},
|
||||||
|
without the requirement that codewords be sorted by length.
|
||||||
|
If the codewords were sorted by length (so-called ``canonical-codes''), they
|
||||||
|
could be decoded much faster via either Lindell and Moffat's approach or
|
||||||
|
Hashemian's Condensed Huffman Code approach, the latter of which has an
|
||||||
|
extremely small memory footprint.
|
||||||
|
We can't use Choueka et al.'s finite state machine approach, which is
|
||||||
|
extremely fast, because we can't allow multiple symbols to be output at a
|
||||||
|
time; the codebook can and does change between symbols.
|
||||||
|
It also has very large memory requirements, which impairs cache coherency.
|
||||||
|
|
||||||
|
We store the tree packed in an array of 16-bit integers (words).
|
||||||
|
Each node consists of a single word, followed consecutively by two or more
|
||||||
|
indices of its children.
|
||||||
|
Let n be the value of this first word.
|
||||||
|
This is the number of bits that need to be read to traverse the node, and
|
||||||
|
must be positive.
|
||||||
|
1<<n entries follow in the array, each an index to a child node.
|
||||||
|
If the child is positive, then it is the index of another internal node in
|
||||||
|
the table.
|
||||||
|
If the child is negative or zero, then it is a leaf node.
|
||||||
|
These are stored directly in the child pointer to save space, since they only
|
||||||
|
require a single word.
|
||||||
|
If a leaf node would have been encountered before reading n bits, then it is
|
||||||
|
duplicated the necessary number of times in this table.
|
||||||
|
Leaf nodes pack both a token value and their actual depth in the tree.
|
||||||
|
The token in the leaf node is (-leaf&255).
|
||||||
|
The number of bits that need to be consumed to reach the leaf, starting from
|
||||||
|
the current node, is (-leaf>>8).
|
||||||
|
|
||||||
|
@ARTICLE{Hash95,
|
||||||
|
author="Reza Hashemian",
|
||||||
|
title="Memory Efficient and High-Speed Search {Huffman} Coding",
|
||||||
|
journal="{IEEE} Transactions on Communications",
|
||||||
|
volume=43,
|
||||||
|
number=10,
|
||||||
|
pages="2576--2581",
|
||||||
|
month=Oct,
|
||||||
|
year=1995
|
||||||
|
}*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*The map from external spec-defined tokens to internal tokens.
|
/*The map from external spec-defined tokens to internal tokens.
|
||||||
This is constructed so that any extra bits read with the original token value
|
This is constructed so that any extra bits read with the original token value
|
||||||
@ -99,391 +145,371 @@ static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS]={
|
|||||||
40
|
40
|
||||||
};
|
};
|
||||||
|
|
||||||
/*These three functions are really part of the bitpack.c module, but
|
/*The log base 2 of number of internal tokens associated with each of the spec
|
||||||
they are only used here.
|
tokens (i.e., how many of the extra bits are folded into the token value).
|
||||||
Declaring local static versions so they can be inlined saves considerable
|
Increasing the maximum value beyond 3 will enlarge the amount of stack
|
||||||
function call overhead.*/
|
required for tree construction.*/
|
||||||
|
static const unsigned char OC_DCT_TOKEN_MAP_LOG_NENTRIES[TH_NDCT_TOKENS]={
|
||||||
static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
|
0,0,0,2,3,0,0,3,0,0,0,0,0,1,1,1,1,2,3,1,1,1,2,1,1,1,1,1,3,1,2,3
|
||||||
const unsigned char *ptr;
|
};
|
||||||
const unsigned char *stop;
|
|
||||||
oc_pb_window window;
|
|
||||||
int available;
|
|
||||||
window=_b->window;
|
|
||||||
available=_b->bits;
|
|
||||||
ptr=_b->ptr;
|
|
||||||
stop=_b->stop;
|
|
||||||
/*This version of _refill() doesn't bother setting eof because we won't
|
|
||||||
check for it after we've started decoding DCT tokens.*/
|
|
||||||
if(ptr>=stop)available=OC_LOTS_OF_BITS;
|
|
||||||
while(available<=OC_PB_WINDOW_SIZE-8){
|
|
||||||
available+=8;
|
|
||||||
window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
|
|
||||||
if(ptr>=stop)available=OC_LOTS_OF_BITS;
|
|
||||||
}
|
|
||||||
_b->ptr=ptr;
|
|
||||||
if(_bits>available)window|=*ptr>>(available&7);
|
|
||||||
_b->bits=available;
|
|
||||||
return window;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*Read in bits without advancing the bit pointer.
|
/*The size a lookup table is allowed to grow to relative to the number of
|
||||||
Here we assume 0<=_bits&&_bits<=32.*/
|
unique nodes it contains.
|
||||||
static long oc_pack_look(oc_pack_buf *_b,int _bits){
|
E.g., if OC_HUFF_SLUSH is 4, then at most 75% of the space in the tree is
|
||||||
oc_pb_window window;
|
wasted (1/4 of the space must be used).
|
||||||
int available;
|
|
||||||
long result;
|
|
||||||
window=_b->window;
|
|
||||||
available=_b->bits;
|
|
||||||
if(_bits==0)return 0;
|
|
||||||
if(_bits>available)_b->window=window=oc_pack_refill(_b,_bits);
|
|
||||||
result=window>>OC_PB_WINDOW_SIZE-_bits;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*Advance the bit pointer.*/
|
|
||||||
static void oc_pack_adv(oc_pack_buf *_b,int _bits){
|
|
||||||
/*We ignore the special cases for _bits==0 and _bits==32 here, since they are
|
|
||||||
never used actually used.
|
|
||||||
OC_HUFF_SLUSH (defined below) would have to be at least 27 to actually read
|
|
||||||
32 bits in a single go, and would require a 32 GB lookup table (assuming
|
|
||||||
8 byte pointers, since 4 byte pointers couldn't fit such a table).*/
|
|
||||||
_b->window<<=_bits;
|
|
||||||
_b->bits-=_bits;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*The log_2 of the size of a lookup table is allowed to grow to relative to
|
|
||||||
the number of unique nodes it contains.
|
|
||||||
E.g., if OC_HUFF_SLUSH is 2, then at most 75% of the space in the tree is
|
|
||||||
wasted (each node will have an amortized cost of at most 20 bytes when using
|
|
||||||
4-byte pointers).
|
|
||||||
Larger numbers can decode tokens with fewer read operations, while smaller
|
Larger numbers can decode tokens with fewer read operations, while smaller
|
||||||
numbers may save more space (requiring as little as 8 bytes amortized per
|
numbers may save more space.
|
||||||
node, though there will be more nodes).
|
|
||||||
With a sample file:
|
With a sample file:
|
||||||
32233473 read calls are required when no tree collapsing is done (100.0%).
|
32233473 read calls are required when no tree collapsing is done (100.0%).
|
||||||
19269269 read calls are required when OC_HUFF_SLUSH is 0 (59.8%).
|
19269269 read calls are required when OC_HUFF_SLUSH is 1 (59.8%).
|
||||||
11144969 read calls are required when OC_HUFF_SLUSH is 1 (34.6%).
|
11144969 read calls are required when OC_HUFF_SLUSH is 2 (34.6%).
|
||||||
10538563 read calls are required when OC_HUFF_SLUSH is 2 (32.7%).
|
10538563 read calls are required when OC_HUFF_SLUSH is 4 (32.7%).
|
||||||
10192578 read calls are required when OC_HUFF_SLUSH is 3 (31.6%).
|
10192578 read calls are required when OC_HUFF_SLUSH is 8 (31.6%).
|
||||||
Since a value of 1 gets us the vast majority of the speed-up with only a
|
Since a value of 2 gets us the vast majority of the speed-up with only a
|
||||||
small amount of wasted memory, this is what we use.*/
|
small amount of wasted memory, this is what we use.
|
||||||
#define OC_HUFF_SLUSH (1)
|
This value must be less than 128, or you could create a tree with more than
|
||||||
|
32767 entries, which would overflow the 16-bit words used to index it.*/
|
||||||
|
#define OC_HUFF_SLUSH (2)
|
||||||
|
/*The root of the tree is on the fast path, and a larger value here is more
|
||||||
|
beneficial than elsewhere in the tree.
|
||||||
|
7 appears to give the best performance, trading off between increased use of
|
||||||
|
the single-read fast path and cache footprint for the tables, though
|
||||||
|
obviously this will depend on your cache size.
|
||||||
|
Using 7 here, the VP3 tables are about twice as large compared to using 2.*/
|
||||||
|
#define OC_ROOT_HUFF_SLUSH (7)
|
||||||
|
|
||||||
|
|
||||||
/*Determines the size in bytes of a Huffman tree node that represents a
|
|
||||||
|
/*Unpacks a Huffman codebook.
|
||||||
|
_opb: The buffer to unpack from.
|
||||||
|
_tokens: Stores a list of internal tokens, in the order they were found in
|
||||||
|
the codebook, and the lengths of their corresponding codewords.
|
||||||
|
This is enough to completely define the codebook, while minimizing
|
||||||
|
stack usage and avoiding temporary allocations (for platforms
|
||||||
|
where free() is a no-op).
|
||||||
|
Return: The number of internal tokens in the codebook, or a negative value
|
||||||
|
on error.*/
|
||||||
|
int oc_huff_tree_unpack(oc_pack_buf *_opb,unsigned char _tokens[256][2]){
|
||||||
|
ogg_uint32_t code;
|
||||||
|
int len;
|
||||||
|
int ntokens;
|
||||||
|
int nleaves;
|
||||||
|
code=0;
|
||||||
|
len=ntokens=nleaves=0;
|
||||||
|
for(;;){
|
||||||
|
long bits;
|
||||||
|
bits=oc_pack_read1(_opb);
|
||||||
|
/*Only process nodes so long as there's more bits in the buffer.*/
|
||||||
|
if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
|
||||||
|
/*Read an internal node:*/
|
||||||
|
if(!bits){
|
||||||
|
len++;
|
||||||
|
/*Don't allow codewords longer than 32 bits.*/
|
||||||
|
if(len>32)return TH_EBADHEADER;
|
||||||
|
}
|
||||||
|
/*Read a leaf node:*/
|
||||||
|
else{
|
||||||
|
ogg_uint32_t code_bit;
|
||||||
|
int neb;
|
||||||
|
int nentries;
|
||||||
|
int token;
|
||||||
|
/*Don't allow more than 32 spec-tokens per codebook.*/
|
||||||
|
if(++nleaves>32)return TH_EBADHEADER;
|
||||||
|
bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
|
||||||
|
neb=OC_DCT_TOKEN_MAP_LOG_NENTRIES[bits];
|
||||||
|
token=OC_DCT_TOKEN_MAP[bits];
|
||||||
|
nentries=1<<neb;
|
||||||
|
while(nentries-->0){
|
||||||
|
_tokens[ntokens][0]=(unsigned char)token++;
|
||||||
|
_tokens[ntokens][1]=(unsigned char)(len+neb);
|
||||||
|
ntokens++;
|
||||||
|
}
|
||||||
|
code_bit=0x80000000U>>len-1;
|
||||||
|
while(len>0&&(code&code_bit)){
|
||||||
|
code^=code_bit;
|
||||||
|
code_bit<<=1;
|
||||||
|
len--;
|
||||||
|
}
|
||||||
|
if(len<=0)break;
|
||||||
|
code|=code_bit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ntokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Count how many tokens would be required to fill a subtree at depth _depth.
|
||||||
|
_tokens: A list of internal tokens, in the order they are found in the
|
||||||
|
codebook, and the lengths of their corresponding codewords.
|
||||||
|
_depth: The depth of the desired node in the corresponding tree structure.
|
||||||
|
Return: The number of tokens that belong to that subtree.*/
|
||||||
|
static int oc_huff_subtree_tokens(unsigned char _tokens[][2],int _depth){
|
||||||
|
ogg_uint32_t code;
|
||||||
|
int ti;
|
||||||
|
code=0;
|
||||||
|
ti=0;
|
||||||
|
do{
|
||||||
|
if(_tokens[ti][1]-_depth<32)code+=0x80000000U>>_tokens[ti++][1]-_depth;
|
||||||
|
else{
|
||||||
|
/*Because of the expanded internal tokens, we can have codewords as long
|
||||||
|
as 35 bits.
|
||||||
|
A single recursion here is enough to advance past them.*/
|
||||||
|
code++;
|
||||||
|
ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+31);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while(code<0x80000000U);
|
||||||
|
return ti;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Compute the number of bits to use for a collapsed tree node at the given
|
||||||
|
depth.
|
||||||
|
_tokens: A list of internal tokens, in the order they are found in the
|
||||||
|
codebook, and the lengths of their corresponding codewords.
|
||||||
|
_ntokens: The number of tokens corresponding to this tree node.
|
||||||
|
_depth: The depth of this tree node.
|
||||||
|
Return: The number of bits to use for a collapsed tree node rooted here.
|
||||||
|
This is always at least one, even if this was a leaf node.*/
|
||||||
|
static int oc_huff_tree_collapse_depth(unsigned char _tokens[][2],
|
||||||
|
int _ntokens,int _depth){
|
||||||
|
int got_leaves;
|
||||||
|
int loccupancy;
|
||||||
|
int occupancy;
|
||||||
|
int slush;
|
||||||
|
int nbits;
|
||||||
|
int best_nbits;
|
||||||
|
slush=_depth>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH;
|
||||||
|
/*It's legal to have a tree with just a single node, which requires no bits
|
||||||
|
to decode and always returns the same token.
|
||||||
|
However, no encoder actually does this (yet).
|
||||||
|
To avoid a special case in oc_huff_token_decode(), we force the number of
|
||||||
|
lookahead bits to be at least one.
|
||||||
|
This will produce a tree that looks ahead one bit and then advances the
|
||||||
|
stream zero bits.*/
|
||||||
|
nbits=1;
|
||||||
|
occupancy=2;
|
||||||
|
got_leaves=1;
|
||||||
|
do{
|
||||||
|
int ti;
|
||||||
|
if(got_leaves)best_nbits=nbits;
|
||||||
|
nbits++;
|
||||||
|
got_leaves=0;
|
||||||
|
loccupancy=occupancy;
|
||||||
|
for(occupancy=ti=0;ti<_ntokens;occupancy++){
|
||||||
|
if(_tokens[ti][1]<_depth+nbits)ti++;
|
||||||
|
else if(_tokens[ti][1]==_depth+nbits){
|
||||||
|
got_leaves=1;
|
||||||
|
ti++;
|
||||||
|
}
|
||||||
|
else ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+nbits);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while(occupancy>loccupancy&&occupancy*slush>=1<<nbits);
|
||||||
|
return best_nbits;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Determines the size in words of a Huffman tree node that represents a
|
||||||
subtree of depth _nbits.
|
subtree of depth _nbits.
|
||||||
_nbits: The depth of the subtree.
|
_nbits: The depth of the subtree.
|
||||||
If this is 0, the node is a leaf node.
|
This must be greater than zero.
|
||||||
Otherwise 1<<_nbits pointers are allocated for children.
|
Return: The number of words required to store the node.*/
|
||||||
Return: The number of bytes required to store the node.*/
|
|
||||||
static size_t oc_huff_node_size(int _nbits){
|
static size_t oc_huff_node_size(int _nbits){
|
||||||
size_t size;
|
return 1+(1<<_nbits);
|
||||||
size=_ogg_offsetof(oc_huff_node,nodes);
|
|
||||||
if(_nbits>0)size+=sizeof(oc_huff_node *)*(1<<_nbits);
|
|
||||||
return size;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static oc_huff_node *oc_huff_node_init(char **_storage,size_t _size,int _nbits){
|
/*Produces a collapsed-tree representation of the given token list.
|
||||||
oc_huff_node *ret;
|
_tree: The storage for the collapsed Huffman tree.
|
||||||
ret=(oc_huff_node *)*_storage;
|
This may be NULL to compute the required storage size instead of
|
||||||
ret->nbits=(unsigned char)_nbits;
|
constructing the tree.
|
||||||
(*_storage)+=_size;
|
_tokens: A list of internal tokens, in the order they are found in the
|
||||||
return ret;
|
codebook, and the lengths of their corresponding codewords.
|
||||||
}
|
_ntokens: The number of tokens corresponding to this tree node.
|
||||||
|
Return: The number of words required to store the tree.*/
|
||||||
|
static size_t oc_huff_tree_collapse(ogg_int16_t *_tree,
|
||||||
/*Determines the size in bytes of a Huffman tree.
|
unsigned char _tokens[][2],int _ntokens){
|
||||||
_nbits: The depth of the subtree.
|
ogg_int16_t node[34];
|
||||||
If this is 0, the node is a leaf node.
|
unsigned char depth[34];
|
||||||
Otherwise storage for 1<<_nbits pointers is added for children.
|
unsigned char last[34];
|
||||||
Return: The number of bytes required to store the tree.*/
|
size_t ntree;
|
||||||
static size_t oc_huff_tree_size(const oc_huff_node *_node){
|
int ti;
|
||||||
size_t size;
|
int l;
|
||||||
size=oc_huff_node_size(_node->nbits);
|
depth[0]=0;
|
||||||
if(_node->nbits){
|
last[0]=(unsigned char)(_ntokens-1);
|
||||||
int nchildren;
|
ntree=0;
|
||||||
int i;
|
ti=0;
|
||||||
nchildren=1<<_node->nbits;
|
l=0;
|
||||||
for(i=0;i<nchildren;i+=1<<_node->nbits-_node->nodes[i]->depth){
|
do{
|
||||||
size+=oc_huff_tree_size(_node->nodes[i]);
|
int nbits;
|
||||||
}
|
nbits=oc_huff_tree_collapse_depth(_tokens+ti,last[l]+1-ti,depth[l]);
|
||||||
}
|
node[l]=(ogg_int16_t)ntree;
|
||||||
return size;
|
ntree+=oc_huff_node_size(nbits);
|
||||||
}
|
if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)nbits;
|
||||||
|
do{
|
||||||
|
while(ti<=last[l]&&_tokens[ti][1]<=depth[l]+nbits){
|
||||||
/*Unpacks a sub-tree from the given buffer.
|
if(_tree!=NULL){
|
||||||
_opb: The buffer to unpack from.
|
ogg_int16_t leaf;
|
||||||
_binodes: The nodes to store the sub-tree in.
|
int nentries;
|
||||||
_nbinodes: The number of nodes available for the sub-tree.
|
nentries=1<<depth[l]+nbits-_tokens[ti][1];
|
||||||
Return: 0 on success, or a negative value on error.*/
|
leaf=(ogg_int16_t)-(_tokens[ti][1]-depth[l]<<8|_tokens[ti][0]);
|
||||||
static int oc_huff_tree_unpack(oc_pack_buf *_opb,
|
while(nentries-->0)_tree[node[l]++]=leaf;
|
||||||
oc_huff_node *_binodes,int _nbinodes){
|
}
|
||||||
oc_huff_node *binode;
|
ti++;
|
||||||
long bits;
|
|
||||||
int nused;
|
|
||||||
if(_nbinodes<1)return TH_EBADHEADER;
|
|
||||||
binode=_binodes;
|
|
||||||
nused=0;
|
|
||||||
bits=oc_pack_read1(_opb);
|
|
||||||
if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
|
|
||||||
/*Read an internal node:*/
|
|
||||||
if(!bits){
|
|
||||||
int ret;
|
|
||||||
nused++;
|
|
||||||
binode->nbits=1;
|
|
||||||
binode->depth=1;
|
|
||||||
binode->nodes[0]=_binodes+nused;
|
|
||||||
ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
|
|
||||||
if(ret>=0){
|
|
||||||
nused+=ret;
|
|
||||||
binode->nodes[1]=_binodes+nused;
|
|
||||||
ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
|
|
||||||
}
|
|
||||||
if(ret<0)return ret;
|
|
||||||
nused+=ret;
|
|
||||||
}
|
|
||||||
/*Read a leaf node:*/
|
|
||||||
else{
|
|
||||||
int ntokens;
|
|
||||||
int token;
|
|
||||||
int i;
|
|
||||||
bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
|
|
||||||
if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
|
|
||||||
/*Find out how many internal tokens we translate this external token into.*/
|
|
||||||
ntokens=OC_DCT_TOKEN_MAP_ENTRIES[bits];
|
|
||||||
if(_nbinodes<2*ntokens-1)return TH_EBADHEADER;
|
|
||||||
/*Fill in a complete binary tree pointing to the internal tokens.*/
|
|
||||||
for(i=1;i<ntokens;i<<=1){
|
|
||||||
int j;
|
|
||||||
binode=_binodes+nused;
|
|
||||||
nused+=i;
|
|
||||||
for(j=0;j<i;j++){
|
|
||||||
binode[j].nbits=1;
|
|
||||||
binode[j].depth=1;
|
|
||||||
binode[j].nodes[0]=_binodes+nused+2*j;
|
|
||||||
binode[j].nodes[1]=_binodes+nused+2*j+1;
|
|
||||||
}
|
}
|
||||||
|
if(ti<=last[l]){
|
||||||
|
/*We need to recurse*/
|
||||||
|
depth[l+1]=(unsigned char)(depth[l]+nbits);
|
||||||
|
if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)ntree;
|
||||||
|
l++;
|
||||||
|
last[l]=
|
||||||
|
(unsigned char)(ti+oc_huff_subtree_tokens(_tokens+ti,depth[l])-1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/*Pop back up a level of recursion.*/
|
||||||
|
else if(l-->0)nbits=depth[l+1]-depth[l];
|
||||||
}
|
}
|
||||||
/*And now the leaf nodes with those tokens.*/
|
while(l>=0);
|
||||||
token=OC_DCT_TOKEN_MAP[bits];
|
|
||||||
for(i=0;i<ntokens;i++){
|
|
||||||
binode=_binodes+nused++;
|
|
||||||
binode->nbits=0;
|
|
||||||
binode->depth=1;
|
|
||||||
binode->token=token+i;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return nused;
|
while(l>=0);
|
||||||
}
|
return ntree;
|
||||||
|
|
||||||
/*Finds the depth of the shortest branch of the given sub-tree.
|
|
||||||
The tree must be binary.
|
|
||||||
_binode: The root of the given sub-tree.
|
|
||||||
_binode->nbits must be 0 or 1.
|
|
||||||
Return: The smallest depth of a leaf node in this sub-tree.
|
|
||||||
0 indicates this sub-tree is a leaf node.*/
|
|
||||||
static int oc_huff_tree_mindepth(oc_huff_node *_binode){
|
|
||||||
int depth0;
|
|
||||||
int depth1;
|
|
||||||
if(_binode->nbits==0)return 0;
|
|
||||||
depth0=oc_huff_tree_mindepth(_binode->nodes[0]);
|
|
||||||
depth1=oc_huff_tree_mindepth(_binode->nodes[1]);
|
|
||||||
return OC_MINI(depth0,depth1)+1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*Finds the number of internal nodes at a given depth, plus the number of
|
|
||||||
leaves at that depth or shallower.
|
|
||||||
The tree must be binary.
|
|
||||||
_binode: The root of the given sub-tree.
|
|
||||||
_binode->nbits must be 0 or 1.
|
|
||||||
Return: The number of entries that would be contained in a jump table of the
|
|
||||||
given depth.*/
|
|
||||||
static int oc_huff_tree_occupancy(oc_huff_node *_binode,int _depth){
|
|
||||||
if(_binode->nbits==0||_depth<=0)return 1;
|
|
||||||
else{
|
|
||||||
return oc_huff_tree_occupancy(_binode->nodes[0],_depth-1)+
|
|
||||||
oc_huff_tree_occupancy(_binode->nodes[1],_depth-1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*Makes a copy of the given Huffman tree.
|
|
||||||
_node: The Huffman tree to copy.
|
|
||||||
Return: The copy of the Huffman tree.*/
|
|
||||||
static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node,
|
|
||||||
char **_storage){
|
|
||||||
oc_huff_node *ret;
|
|
||||||
ret=oc_huff_node_init(_storage,oc_huff_node_size(_node->nbits),_node->nbits);
|
|
||||||
ret->depth=_node->depth;
|
|
||||||
if(_node->nbits){
|
|
||||||
int nchildren;
|
|
||||||
int i;
|
|
||||||
int inext;
|
|
||||||
nchildren=1<<_node->nbits;
|
|
||||||
for(i=0;i<nchildren;){
|
|
||||||
ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i],_storage);
|
|
||||||
inext=i+(1<<_node->nbits-ret->nodes[i]->depth);
|
|
||||||
while(++i<inext)ret->nodes[i]=ret->nodes[i-1];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else ret->token=_node->token;
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static size_t oc_huff_tree_collapse_size(oc_huff_node *_binode,int _depth){
|
|
||||||
size_t size;
|
|
||||||
int mindepth;
|
|
||||||
int depth;
|
|
||||||
int loccupancy;
|
|
||||||
int occupancy;
|
|
||||||
if(_binode->nbits!=0&&_depth>0){
|
|
||||||
return oc_huff_tree_collapse_size(_binode->nodes[0],_depth-1)+
|
|
||||||
oc_huff_tree_collapse_size(_binode->nodes[1],_depth-1);
|
|
||||||
}
|
|
||||||
depth=mindepth=oc_huff_tree_mindepth(_binode);
|
|
||||||
occupancy=1<<mindepth;
|
|
||||||
do{
|
|
||||||
loccupancy=occupancy;
|
|
||||||
occupancy=oc_huff_tree_occupancy(_binode,++depth);
|
|
||||||
}
|
|
||||||
while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
|
|
||||||
depth--;
|
|
||||||
size=oc_huff_node_size(depth);
|
|
||||||
if(depth>0){
|
|
||||||
size+=oc_huff_tree_collapse_size(_binode->nodes[0],depth-1);
|
|
||||||
size+=oc_huff_tree_collapse_size(_binode->nodes[1],depth-1);
|
|
||||||
}
|
|
||||||
return size;
|
|
||||||
}
|
|
||||||
|
|
||||||
static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
|
|
||||||
char **_storage);
|
|
||||||
|
|
||||||
/*Fills the given nodes table with all the children in the sub-tree at the
|
|
||||||
given depth.
|
|
||||||
The nodes in the sub-tree with a depth less than that stored in the table
|
|
||||||
are freed.
|
|
||||||
The sub-tree must be binary and complete up until the given depth.
|
|
||||||
_nodes: The nodes table to fill.
|
|
||||||
_binode: The root of the sub-tree to fill it with.
|
|
||||||
_binode->nbits must be 0 or 1.
|
|
||||||
_level: The current level in the table.
|
|
||||||
0 indicates that the current node should be stored, regardless of
|
|
||||||
whether it is a leaf node or an internal node.
|
|
||||||
_depth: The depth of the nodes to fill the table with, relative to their
|
|
||||||
parent.*/
|
|
||||||
static void oc_huff_node_fill(oc_huff_node **_nodes,
|
|
||||||
oc_huff_node *_binode,int _level,int _depth,char **_storage){
|
|
||||||
if(_level<=0||_binode->nbits==0){
|
|
||||||
int i;
|
|
||||||
_binode->depth=(unsigned char)(_depth-_level);
|
|
||||||
_nodes[0]=oc_huff_tree_collapse(_binode,_storage);
|
|
||||||
for(i=1;i<1<<_level;i++)_nodes[i]=_nodes[0];
|
|
||||||
}
|
|
||||||
else{
|
|
||||||
_level--;
|
|
||||||
oc_huff_node_fill(_nodes,_binode->nodes[0],_level,_depth,_storage);
|
|
||||||
_nodes+=1<<_level;
|
|
||||||
oc_huff_node_fill(_nodes,_binode->nodes[1],_level,_depth,_storage);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*Finds the largest complete sub-tree rooted at the current node and collapses
|
|
||||||
it into a single node.
|
|
||||||
This procedure is then applied recursively to all the children of that node.
|
|
||||||
_binode: The root of the sub-tree to collapse.
|
|
||||||
_binode->nbits must be 0 or 1.
|
|
||||||
Return: The new root of the collapsed sub-tree.*/
|
|
||||||
static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
|
|
||||||
char **_storage){
|
|
||||||
oc_huff_node *root;
|
|
||||||
size_t size;
|
|
||||||
int mindepth;
|
|
||||||
int depth;
|
|
||||||
int loccupancy;
|
|
||||||
int occupancy;
|
|
||||||
depth=mindepth=oc_huff_tree_mindepth(_binode);
|
|
||||||
occupancy=1<<mindepth;
|
|
||||||
do{
|
|
||||||
loccupancy=occupancy;
|
|
||||||
occupancy=oc_huff_tree_occupancy(_binode,++depth);
|
|
||||||
}
|
|
||||||
while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
|
|
||||||
depth--;
|
|
||||||
if(depth<=1)return oc_huff_tree_copy(_binode,_storage);
|
|
||||||
size=oc_huff_node_size(depth);
|
|
||||||
root=oc_huff_node_init(_storage,size,depth);
|
|
||||||
root->depth=_binode->depth;
|
|
||||||
oc_huff_node_fill(root->nodes,_binode,depth,depth,_storage);
|
|
||||||
return root;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Unpacks a set of Huffman trees, and reduces them to a collapsed
|
/*Unpacks a set of Huffman trees, and reduces them to a collapsed
|
||||||
representation.
|
representation.
|
||||||
_opb: The buffer to unpack the trees from.
|
_opb: The buffer to unpack the trees from.
|
||||||
_nodes: The table to fill with the Huffman trees.
|
_nodes: The table to fill with the Huffman trees.
|
||||||
Return: 0 on success, or a negative value on error.*/
|
Return: 0 on success, or a negative value on error.
|
||||||
|
The caller is responsible for cleaning up any partially initialized
|
||||||
|
_nodes on failure.*/
|
||||||
int oc_huff_trees_unpack(oc_pack_buf *_opb,
|
int oc_huff_trees_unpack(oc_pack_buf *_opb,
|
||||||
oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
|
ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
|
||||||
int i;
|
int i;
|
||||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
||||||
oc_huff_node nodes[511];
|
unsigned char tokens[256][2];
|
||||||
char *storage;
|
int ntokens;
|
||||||
size_t size;
|
ogg_int16_t *tree;
|
||||||
int ret;
|
size_t size;
|
||||||
/*Unpack the full tree into a temporary buffer.*/
|
/*Unpack the full tree into a temporary buffer.*/
|
||||||
ret=oc_huff_tree_unpack(_opb,nodes,sizeof(nodes)/sizeof(*nodes));
|
ntokens=oc_huff_tree_unpack(_opb,tokens);
|
||||||
if(ret<0)return ret;
|
if(ntokens<0)return ntokens;
|
||||||
/*Figure out how big the collapsed tree will be.*/
|
/*Figure out how big the collapsed tree will be and allocate space for it.*/
|
||||||
size=oc_huff_tree_collapse_size(nodes,0);
|
size=oc_huff_tree_collapse(NULL,tokens,ntokens);
|
||||||
storage=(char *)_ogg_calloc(1,size);
|
/*This should never happen; if it does it means you set OC_HUFF_SLUSH or
|
||||||
if(storage==NULL)return TH_EFAULT;
|
OC_ROOT_HUFF_SLUSH too large.*/
|
||||||
/*And collapse it.*/
|
if(size>32767)return TH_EIMPL;
|
||||||
_nodes[i]=oc_huff_tree_collapse(nodes,&storage);
|
tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
|
||||||
|
if(tree==NULL)return TH_EFAULT;
|
||||||
|
/*Construct the collapsed tree.*/
|
||||||
|
oc_huff_tree_collapse(tree,tokens,ntokens);
|
||||||
|
_nodes[i]=tree;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*Determines the size in words of a Huffman subtree.
|
||||||
|
_tree: The complete Huffman tree.
|
||||||
|
_node: The index of the root of the desired subtree.
|
||||||
|
Return: The number of words required to store the tree.*/
|
||||||
|
static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){
|
||||||
|
size_t size;
|
||||||
|
int nchildren;
|
||||||
|
int n;
|
||||||
|
int i;
|
||||||
|
n=_tree[_node];
|
||||||
|
size=oc_huff_node_size(n);
|
||||||
|
nchildren=1<<n;
|
||||||
|
i=0;
|
||||||
|
do{
|
||||||
|
int child;
|
||||||
|
child=_tree[_node+i+1];
|
||||||
|
if(child<=0)i+=1<<n-(-child>>8);
|
||||||
|
else{
|
||||||
|
size+=oc_huff_tree_size(_tree,child);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while(i<nchildren);
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
/*Makes a copy of the given set of Huffman trees.
|
/*Makes a copy of the given set of Huffman trees.
|
||||||
_dst: The array to store the copy in.
|
_dst: The array to store the copy in.
|
||||||
_src: The array of trees to copy.*/
|
_src: The array of trees to copy.*/
|
||||||
int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
|
int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
|
||||||
const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){
|
const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){
|
||||||
|
int total;
|
||||||
int i;
|
int i;
|
||||||
|
total=0;
|
||||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
||||||
size_t size;
|
size_t size;
|
||||||
char *storage;
|
size=oc_huff_tree_size(_src[i],0);
|
||||||
size=oc_huff_tree_size(_src[i]);
|
total+=size;
|
||||||
storage=(char *)_ogg_calloc(1,size);
|
_dst[i]=(ogg_int16_t *)_ogg_malloc(size*sizeof(*_dst[i]));
|
||||||
if(storage==NULL){
|
if(_dst[i]==NULL){
|
||||||
while(i-->0)_ogg_free(_dst[i]);
|
while(i-->0)_ogg_free(_dst[i]);
|
||||||
return TH_EFAULT;
|
return TH_EFAULT;
|
||||||
}
|
}
|
||||||
_dst[i]=oc_huff_tree_copy(_src[i],&storage);
|
memcpy(_dst[i],_src[i],size*sizeof(*_dst[i]));
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Frees the memory used by a set of Huffman trees.
|
/*Frees the memory used by a set of Huffman trees.
|
||||||
_nodes: The array of trees to free.*/
|
_nodes: The array of trees to free.*/
|
||||||
void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
|
void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
|
||||||
int i;
|
int i;
|
||||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]);
|
for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*Unpacks a single token using the given Huffman tree.
|
/*Unpacks a single token using the given Huffman tree.
|
||||||
_opb: The buffer to unpack the token from.
|
_opb: The buffer to unpack the token from.
|
||||||
_node: The tree to unpack the token with.
|
_node: The tree to unpack the token with.
|
||||||
Return: The token value.*/
|
Return: The token value.*/
|
||||||
int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node){
|
int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){
|
||||||
long bits;
|
const unsigned char *ptr;
|
||||||
while(_node->nbits!=0){
|
const unsigned char *stop;
|
||||||
bits=oc_pack_look(_opb,_node->nbits);
|
oc_pb_window window;
|
||||||
_node=_node->nodes[bits];
|
int available;
|
||||||
oc_pack_adv(_opb,_node->depth);
|
long bits;
|
||||||
|
int node;
|
||||||
|
int n;
|
||||||
|
ptr=_opb->ptr;
|
||||||
|
window=_opb->window;
|
||||||
|
stop=_opb->stop;
|
||||||
|
available=_opb->bits;
|
||||||
|
node=0;
|
||||||
|
for(;;){
|
||||||
|
n=_tree[node];
|
||||||
|
if(n>available){
|
||||||
|
unsigned shift;
|
||||||
|
shift=OC_PB_WINDOW_SIZE-available;
|
||||||
|
do{
|
||||||
|
/*We don't bother setting eof because we won't check for it after we've
|
||||||
|
started decoding DCT tokens.*/
|
||||||
|
if(ptr>=stop){
|
||||||
|
shift=(unsigned)-OC_LOTS_OF_BITS;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
shift-=8;
|
||||||
|
window|=(oc_pb_window)*ptr++<<shift;
|
||||||
|
}
|
||||||
|
while(shift>=8);
|
||||||
|
/*Note: We never request more than 24 bits, so there's no need to fill in
|
||||||
|
the last partial byte here.*/
|
||||||
|
available=OC_PB_WINDOW_SIZE-shift;
|
||||||
|
}
|
||||||
|
bits=window>>OC_PB_WINDOW_SIZE-n;
|
||||||
|
node=_tree[node+1+bits];
|
||||||
|
if(node<=0)break;
|
||||||
|
window<<=n;
|
||||||
|
available-=n;
|
||||||
}
|
}
|
||||||
return _node->token;
|
node=-node;
|
||||||
|
n=node>>8;
|
||||||
|
window<<=n;
|
||||||
|
available-=n;
|
||||||
|
_opb->ptr=ptr;
|
||||||
|
_opb->window=window;
|
||||||
|
_opb->bits=available;
|
||||||
|
return node&255;
|
||||||
}
|
}
|
||||||
|
72
thirdparty/libtheora/huffdec.h
vendored
72
thirdparty/libtheora/huffdec.h
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: huffdec.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -22,71 +22,11 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct oc_huff_node oc_huff_node;
|
|
||||||
|
|
||||||
/*A node in the Huffman tree.
|
|
||||||
Instead of storing every branching in the tree, subtrees can be collapsed
|
|
||||||
into one node, with a table of size 1<<nbits pointing directly to its
|
|
||||||
descendants nbits levels down.
|
|
||||||
This allows more than one bit to be read at a time, and avoids following all
|
|
||||||
the intermediate branches with next to no increased code complexity once
|
|
||||||
the collapsed tree has been built.
|
|
||||||
We do _not_ require that a subtree be complete to be collapsed, but instead
|
|
||||||
store duplicate pointers in the table, and record the actual depth of the
|
|
||||||
node below its parent.
|
|
||||||
This tells us the number of bits to advance the stream after reaching it.
|
|
||||||
|
|
||||||
This turns out to be equivalent to the method described in \cite{Hash95},
|
|
||||||
without the requirement that codewords be sorted by length.
|
|
||||||
If the codewords were sorted by length (so-called ``canonical-codes''), they
|
|
||||||
could be decoded much faster via either Lindell and Moffat's approach or
|
|
||||||
Hashemian's Condensed Huffman Code approach, the latter of which has an
|
|
||||||
extremely small memory footprint.
|
|
||||||
We can't use Choueka et al.'s finite state machine approach, which is
|
|
||||||
extremely fast, because we can't allow multiple symbols to be output at a
|
|
||||||
time; the codebook can and does change between symbols.
|
|
||||||
It also has very large memory requirements, which impairs cache coherency.
|
|
||||||
|
|
||||||
@ARTICLE{Hash95,
|
|
||||||
author="Reza Hashemian",
|
|
||||||
title="Memory Efficient and High-Speed Search {Huffman} Coding",
|
|
||||||
journal="{IEEE} Transactions on Communications",
|
|
||||||
volume=43,
|
|
||||||
number=10,
|
|
||||||
pages="2576--2581",
|
|
||||||
month=Oct,
|
|
||||||
year=1995
|
|
||||||
}*/
|
|
||||||
struct oc_huff_node{
|
|
||||||
/*The number of bits of the code needed to descend through this node.
|
|
||||||
0 indicates a leaf node.
|
|
||||||
Otherwise there are 1<<nbits nodes in the nodes table, which can be
|
|
||||||
indexed by reading nbits bits from the stream.*/
|
|
||||||
unsigned char nbits;
|
|
||||||
/*The value of a token stored in a leaf node.
|
|
||||||
The value in non-leaf nodes is undefined.*/
|
|
||||||
unsigned char token;
|
|
||||||
/*The depth of the current node, relative to its parent in the collapsed
|
|
||||||
tree.
|
|
||||||
This can be less than its parent's nbits value, in which case there are
|
|
||||||
1<<nbits-depth copies of this node in the table, and the bitstream should
|
|
||||||
only be advanced depth bits after reaching this node.*/
|
|
||||||
unsigned char depth;
|
|
||||||
/*The table of child nodes.
|
|
||||||
The ACTUAL size of this array is 1<<nbits, despite what the declaration
|
|
||||||
below claims.
|
|
||||||
The exception is that for leaf nodes the size is 0.*/
|
|
||||||
oc_huff_node *nodes[2];
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int oc_huff_trees_unpack(oc_pack_buf *_opb,
|
int oc_huff_trees_unpack(oc_pack_buf *_opb,
|
||||||
oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
|
ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
|
||||||
int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
|
int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
|
||||||
const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]);
|
const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]);
|
||||||
void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
|
void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
|
||||||
int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node);
|
int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node);
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
112
thirdparty/libtheora/huffenc.c
vendored
112
thirdparty/libtheora/huffenc.c
vendored
@ -859,9 +859,10 @@ int oc_huff_codes_pack(oggpack_buffer *_opb,
|
|||||||
/*First, find the maximum code length so we can align all the bit
|
/*First, find the maximum code length so we can align all the bit
|
||||||
patterns.*/
|
patterns.*/
|
||||||
maxlen=_codes[i][0].nbits;
|
maxlen=_codes[i][0].nbits;
|
||||||
for(j=1;j<TH_NDCT_TOKENS;j++){
|
for(j=1;j<TH_NDCT_TOKENS;j++)maxlen=OC_MAXI(_codes[i][j].nbits,maxlen);
|
||||||
maxlen=OC_MAXI(_codes[i][j].nbits,maxlen);
|
/*It's improbable that a code with more than 32 bits could pass the
|
||||||
}
|
validation below, but abort early in any case.*/
|
||||||
|
if(maxlen>32)return TH_EINVAL;
|
||||||
mask=(1<<(maxlen>>1)<<(maxlen+1>>1))-1;
|
mask=(1<<(maxlen>>1)<<(maxlen+1>>1))-1;
|
||||||
/*Copy over the codes into our temporary workspace.
|
/*Copy over the codes into our temporary workspace.
|
||||||
The bit patterns are aligned, and the original entry each code is from
|
The bit patterns are aligned, and the original entry each code is from
|
||||||
@ -877,34 +878,89 @@ int oc_huff_codes_pack(oggpack_buffer *_opb,
|
|||||||
/*For each leaf of the tree:*/
|
/*For each leaf of the tree:*/
|
||||||
bpos=maxlen;
|
bpos=maxlen;
|
||||||
for(j=0;j<TH_NDCT_TOKENS;j++){
|
for(j=0;j<TH_NDCT_TOKENS;j++){
|
||||||
int bit;
|
ogg_uint32_t bit;
|
||||||
/*If this code has any bits at all.*/
|
/*Fail if this code has no bits at all.
|
||||||
if(entries[j].shift<maxlen){
|
Technically a codebook with a single 0-bit entry is legal, but the
|
||||||
/*Descend into the tree, writing a bit for each branch.*/
|
encoder currently does not support codebooks which do not contain all
|
||||||
for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1);
|
the tokens.*/
|
||||||
/*Mark this as a leaf node, and write its value.*/
|
if(entries[j].shift>=maxlen)return TH_EINVAL;
|
||||||
oggpackB_write(_opb,1,1);
|
/*Descend into the tree, writing a bit for each branch.*/
|
||||||
oggpackB_write(_opb,entries[j].token,5);
|
for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1);
|
||||||
/*For each 1 branch we've descended, back up the tree until we reach a
|
/*Mark this as a leaf node, and write its value.*/
|
||||||
0 branch.*/
|
oggpackB_write(_opb,1,1);
|
||||||
bit=1<<bpos;
|
oggpackB_write(_opb,entries[j].token,5);
|
||||||
for(;entries[j].pattern&bit;bpos++)bit<<=1;
|
/*For each 1 branch we've descended, back up the tree until we reach a
|
||||||
/*Validate the code.*/
|
0 branch.*/
|
||||||
if(j+1<TH_NDCT_TOKENS){
|
bit=(ogg_uint32_t)1<<bpos;
|
||||||
mask=~(bit-1)<<1;
|
for(;entries[j].pattern&bit;bpos++)bit<<=1;
|
||||||
/*The next entry should have a 1 bit where we had a 0, and should
|
/*Validate the code.*/
|
||||||
match our code above that bit.
|
if(j+1<TH_NDCT_TOKENS){
|
||||||
This verifies both fullness and prefix-freeness simultaneously.*/
|
mask=~(bit-1)<<1;
|
||||||
if(!(entries[j+1].pattern&bit)||
|
/*The next entry should have a 1 bit where we had a 0, and should
|
||||||
(entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){
|
match our code above that bit.
|
||||||
return TH_EINVAL;
|
This verifies both fullness and prefix-freeness simultaneously.*/
|
||||||
}
|
if(!(entries[j+1].pattern&bit)||
|
||||||
|
(entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){
|
||||||
|
return TH_EINVAL;
|
||||||
}
|
}
|
||||||
/*If there are no more codes, we should have ascended back to the top
|
|
||||||
of the tree.*/
|
|
||||||
else if(bpos<maxlen)return TH_EINVAL;
|
|
||||||
}
|
}
|
||||||
|
/*If there are no more codes, we should have ascended back to the top
|
||||||
|
of the tree.*/
|
||||||
|
else if(bpos<maxlen)return TH_EINVAL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*This is used to copy the configuration of an existing setup header for use by
|
||||||
|
the encoder.
|
||||||
|
The decoder uses a completely different data structure for the Huffman
|
||||||
|
codebooks.*/
|
||||||
|
int oc_huff_codes_unpack(oc_pack_buf *_opb,
|
||||||
|
th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
|
||||||
|
int i;
|
||||||
|
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
||||||
|
ogg_uint32_t code;
|
||||||
|
int len;
|
||||||
|
int nleaves;
|
||||||
|
code=0;
|
||||||
|
len=nleaves=0;
|
||||||
|
memset(_codes[i],0,TH_NDCT_TOKENS*sizeof(*_codes[i]));
|
||||||
|
for(;;){
|
||||||
|
long bits;
|
||||||
|
bits=oc_pack_read1(_opb);
|
||||||
|
/*Only process nodes so long as there's more bits in the buffer.*/
|
||||||
|
if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
|
||||||
|
/*Read an internal node:*/
|
||||||
|
if(!bits){
|
||||||
|
len++;
|
||||||
|
/*Don't allow codewords longer than 32 bits.*/
|
||||||
|
if(len>32)return TH_EBADHEADER;
|
||||||
|
}
|
||||||
|
/*Read a leaf node:*/
|
||||||
|
else{
|
||||||
|
ogg_uint32_t code_bit;
|
||||||
|
/*Don't allow more than 32 tokens per codebook.*/
|
||||||
|
if(++nleaves>32)return TH_EBADHEADER;
|
||||||
|
bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
|
||||||
|
/*The current encoder does not support codebooks that do not contain
|
||||||
|
all of the tokens.*/
|
||||||
|
if(_codes[i][bits].nbits>0)return TH_EINVAL;
|
||||||
|
_codes[i][bits].pattern=code>>32-len;
|
||||||
|
_codes[i][bits].nbits=len;
|
||||||
|
code_bit=0x80000000U>>len-1;
|
||||||
|
while(len>0&&(code&code_bit)){
|
||||||
|
code^=code_bit;
|
||||||
|
code_bit<<=1;
|
||||||
|
len--;
|
||||||
|
}
|
||||||
|
if(len<=0)break;
|
||||||
|
code|=code_bit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*The current encoder does not support codebooks that do not contain all of
|
||||||
|
the tokens.*/
|
||||||
|
if(nleaves<32)return TH_EINVAL;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
3
thirdparty/libtheora/huffenc.h
vendored
3
thirdparty/libtheora/huffenc.h
vendored
@ -1,6 +1,7 @@
|
|||||||
#if !defined(_huffenc_H)
|
#if !defined(_huffenc_H)
|
||||||
# define _huffenc_H (1)
|
# define _huffenc_H (1)
|
||||||
# include "huffman.h"
|
# include "huffman.h"
|
||||||
|
# include "bitpack.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -15,5 +16,7 @@ extern const th_huff_code
|
|||||||
|
|
||||||
int oc_huff_codes_pack(oggpack_buffer *_opb,
|
int oc_huff_codes_pack(oggpack_buffer *_opb,
|
||||||
const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
|
const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
|
||||||
|
int oc_huff_codes_unpack(oc_pack_buf *_opb,
|
||||||
|
th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
4
thirdparty/libtheora/huffman.h
vendored
4
thirdparty/libtheora/huffman.h
vendored
@ -11,12 +11,12 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: huffman.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
#if !defined(_huffman_H)
|
#if !defined(_huffman_H)
|
||||||
# define _hufffman_H (1)
|
# define _huffman_H (1)
|
||||||
# include "theora/codec.h"
|
# include "theora/codec.h"
|
||||||
# include "ocintrin.h"
|
# include "ocintrin.h"
|
||||||
|
|
||||||
|
59
thirdparty/libtheora/idct.c
vendored
59
thirdparty/libtheora/idct.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: idct.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -231,18 +231,18 @@ static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){
|
|||||||
_y: The buffer to store the result in.
|
_y: The buffer to store the result in.
|
||||||
This may be the same as _x.
|
This may be the same as _x.
|
||||||
_x: The input coefficients.*/
|
_x: The input coefficients.*/
|
||||||
static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||||
const ogg_int16_t *in;
|
ogg_int16_t w[64];
|
||||||
ogg_int16_t *end;
|
int i;
|
||||||
ogg_int16_t *out;
|
|
||||||
ogg_int16_t w[64];
|
|
||||||
/*Transform rows of x into columns of w.*/
|
/*Transform rows of x into columns of w.*/
|
||||||
idct8_2(w,_x);
|
idct8_2(w,_x);
|
||||||
idct8_1(w+1,_x+8);
|
idct8_1(w+1,_x+8);
|
||||||
/*Transform rows of w into columns of y.*/
|
/*Transform rows of w into columns of y.*/
|
||||||
for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
|
for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
|
||||||
/*Adjust for the scale factor.*/
|
/*Adjust for the scale factor.*/
|
||||||
for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
|
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
|
||||||
|
/*Clear input data for next block.*/
|
||||||
|
_x[0]=_x[1]=_x[8]=0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||||
@ -260,20 +260,20 @@ static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
|||||||
_y: The buffer to store the result in.
|
_y: The buffer to store the result in.
|
||||||
This may be the same as _x.
|
This may be the same as _x.
|
||||||
_x: The input coefficients.*/
|
_x: The input coefficients.*/
|
||||||
static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||||
const ogg_int16_t *in;
|
ogg_int16_t w[64];
|
||||||
ogg_int16_t *end;
|
int i;
|
||||||
ogg_int16_t *out;
|
|
||||||
ogg_int16_t w[64];
|
|
||||||
/*Transform rows of x into columns of w.*/
|
/*Transform rows of x into columns of w.*/
|
||||||
idct8_4(w,_x);
|
idct8_4(w,_x);
|
||||||
idct8_3(w+1,_x+8);
|
idct8_3(w+1,_x+8);
|
||||||
idct8_2(w+2,_x+16);
|
idct8_2(w+2,_x+16);
|
||||||
idct8_1(w+3,_x+24);
|
idct8_1(w+3,_x+24);
|
||||||
/*Transform rows of w into columns of y.*/
|
/*Transform rows of w into columns of y.*/
|
||||||
for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
|
for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
|
||||||
/*Adjust for the scale factor.*/
|
/*Adjust for the scale factor.*/
|
||||||
for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
|
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
|
||||||
|
/*Clear input data for next block.*/
|
||||||
|
_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||||
@ -282,28 +282,23 @@ static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
|||||||
_y: The buffer to store the result in.
|
_y: The buffer to store the result in.
|
||||||
This may be the same as _x.
|
This may be the same as _x.
|
||||||
_x: The input coefficients.*/
|
_x: The input coefficients.*/
|
||||||
static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||||
const ogg_int16_t *in;
|
ogg_int16_t w[64];
|
||||||
ogg_int16_t *end;
|
int i;
|
||||||
ogg_int16_t *out;
|
|
||||||
ogg_int16_t w[64];
|
|
||||||
/*Transform rows of x into columns of w.*/
|
/*Transform rows of x into columns of w.*/
|
||||||
for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
|
for(i=0;i<8;i++)idct8(w+i,_x+i*8);
|
||||||
/*Transform rows of w into columns of y.*/
|
/*Transform rows of w into columns of y.*/
|
||||||
for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
|
for(i=0;i<8;i++)idct8(_y+i,w+i*8);
|
||||||
/*Adjust for the scale factor.*/
|
/*Adjust for the scale factor.*/
|
||||||
for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
|
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
|
||||||
}
|
/*Clear input data for next block.*/
|
||||||
|
for(i=0;i<64;i++)_x[i]=0;
|
||||||
void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
|
|
||||||
int _last_zzi){
|
|
||||||
(*_state->opt_vtable.idct8x8)(_y,_last_zzi);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||||
version of the transform.*/
|
version of the transform.*/
|
||||||
void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
|
void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||||
/*_last_zzi is subtly different from an actual count of the number of
|
/*_last_zzi is subtly different from an actual count of the number of
|
||||||
coefficients we decoded for this block.
|
coefficients we decoded for this block.
|
||||||
It contains the value of zzi BEFORE the final token in the block was
|
It contains the value of zzi BEFORE the final token in the block was
|
||||||
@ -329,7 +324,7 @@ void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
|
|||||||
gets.
|
gets.
|
||||||
Needless to say we inherited this approach from VP3.*/
|
Needless to say we inherited this approach from VP3.*/
|
||||||
/*Then perform the iDCT.*/
|
/*Then perform the iDCT.*/
|
||||||
if(_last_zzi<3)oc_idct8x8_3(_y,_y);
|
if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
|
||||||
else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
|
else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
|
||||||
else oc_idct8x8_slow(_y,_y);
|
else oc_idct8x8_slow(_y,_x);
|
||||||
}
|
}
|
||||||
|
10
thirdparty/libtheora/info.c
vendored
10
thirdparty/libtheora/info.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: info.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -54,7 +54,7 @@ void th_comment_init(th_comment *_tc){
|
|||||||
memset(_tc,0,sizeof(*_tc));
|
memset(_tc,0,sizeof(*_tc));
|
||||||
}
|
}
|
||||||
|
|
||||||
void th_comment_add(th_comment *_tc,char *_comment){
|
void th_comment_add(th_comment *_tc,const char *_comment){
|
||||||
char **user_comments;
|
char **user_comments;
|
||||||
int *comment_lengths;
|
int *comment_lengths;
|
||||||
int comment_len;
|
int comment_len;
|
||||||
@ -75,7 +75,7 @@ void th_comment_add(th_comment *_tc,char *_comment){
|
|||||||
_tc->user_comments[_tc->comments]=NULL;
|
_tc->user_comments[_tc->comments]=NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val){
|
void th_comment_add_tag(th_comment *_tc,const char *_tag,const char *_val){
|
||||||
char *comment;
|
char *comment;
|
||||||
int tag_len;
|
int tag_len;
|
||||||
int val_len;
|
int val_len;
|
||||||
@ -91,7 +91,7 @@ void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val){
|
|||||||
_ogg_free(comment);
|
_ogg_free(comment);
|
||||||
}
|
}
|
||||||
|
|
||||||
char *th_comment_query(th_comment *_tc,char *_tag,int _count){
|
char *th_comment_query(th_comment *_tc,const char *_tag,int _count){
|
||||||
long i;
|
long i;
|
||||||
int found;
|
int found;
|
||||||
int tag_len;
|
int tag_len;
|
||||||
@ -107,7 +107,7 @@ char *th_comment_query(th_comment *_tc,char *_tag,int _count){
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
int th_comment_query_count(th_comment *_tc,char *_tag){
|
int th_comment_query_count(th_comment *_tc,const char *_tag){
|
||||||
long i;
|
long i;
|
||||||
int tag_len;
|
int tag_len;
|
||||||
int count;
|
int count;
|
||||||
|
92
thirdparty/libtheora/internal.c
vendored
92
thirdparty/libtheora/internal.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: internal.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -97,79 +97,29 @@ int oc_ilog(unsigned _v){
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
void *oc_aligned_malloc(size_t _sz,size_t _align){
|
||||||
block when 4 different motion vectors are specified in the luma plane.
|
unsigned char *p;
|
||||||
This version is for use with chroma decimated in the X and Y directions
|
if(_align-1>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL;
|
||||||
(4:2:0).
|
p=(unsigned char *)_ogg_malloc(_sz+_align);
|
||||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
if(p!=NULL){
|
||||||
_lbmvs: The luma block-level motion vectors.*/
|
int offs;
|
||||||
static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
offs=((p-(unsigned char *)0)-1&_align-1);
|
||||||
int dx;
|
p[offs]=offs;
|
||||||
int dy;
|
p+=offs+1;
|
||||||
dx=_lbmvs[0][0]+_lbmvs[1][0]+_lbmvs[2][0]+_lbmvs[3][0];
|
}
|
||||||
dy=_lbmvs[0][1]+_lbmvs[1][1]+_lbmvs[2][1]+_lbmvs[3][1];
|
return p;
|
||||||
_cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,2,2);
|
|
||||||
_cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,2,2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
void oc_aligned_free(void *_ptr){
|
||||||
block when 4 different motion vectors are specified in the luma plane.
|
unsigned char *p;
|
||||||
This version is for use with chroma decimated in the Y direction.
|
p=(unsigned char *)_ptr;
|
||||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
if(p!=NULL){
|
||||||
_lbmvs: The luma block-level motion vectors.*/
|
int offs;
|
||||||
static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
offs=*--p;
|
||||||
int dx;
|
_ogg_free(p-offs);
|
||||||
int dy;
|
}
|
||||||
dx=_lbmvs[0][0]+_lbmvs[2][0];
|
|
||||||
dy=_lbmvs[0][1]+_lbmvs[2][1];
|
|
||||||
_cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
|
|
||||||
_cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
|
|
||||||
dx=_lbmvs[1][0]+_lbmvs[3][0];
|
|
||||||
dy=_lbmvs[1][1]+_lbmvs[3][1];
|
|
||||||
_cbmvs[1][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
|
|
||||||
_cbmvs[1][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
|
||||||
block when 4 different motion vectors are specified in the luma plane.
|
|
||||||
This version is for use with chroma decimated in the X direction (4:2:2).
|
|
||||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
|
||||||
_lbmvs: The luma block-level motion vectors.*/
|
|
||||||
static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
|
||||||
int dx;
|
|
||||||
int dy;
|
|
||||||
dx=_lbmvs[0][0]+_lbmvs[1][0];
|
|
||||||
dy=_lbmvs[0][1]+_lbmvs[1][1];
|
|
||||||
_cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
|
|
||||||
_cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
|
|
||||||
dx=_lbmvs[2][0]+_lbmvs[3][0];
|
|
||||||
dy=_lbmvs[2][1]+_lbmvs[3][1];
|
|
||||||
_cbmvs[2][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
|
|
||||||
_cbmvs[2][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*The function used to fill in the chroma plane motion vectors for a macro
|
|
||||||
block when 4 different motion vectors are specified in the luma plane.
|
|
||||||
This version is for use with no chroma decimation (4:4:4).
|
|
||||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
|
||||||
_lmbmv: The luma macro-block level motion vector to fill in for use in
|
|
||||||
prediction.
|
|
||||||
_lbmvs: The luma block-level motion vectors.*/
|
|
||||||
static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
|
||||||
memcpy(_cbmvs,_lbmvs,4*sizeof(_lbmvs[0]));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*A table of functions used to fill in the chroma plane motion vectors for a
|
|
||||||
macro block when 4 different motion vectors are specified in the luma
|
|
||||||
plane.*/
|
|
||||||
const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
|
|
||||||
(oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
|
|
||||||
(oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
|
|
||||||
(oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
|
|
||||||
(oc_set_chroma_mvs_func)oc_set_chroma_mvs11
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
|
void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
|
||||||
size_t rowsz;
|
size_t rowsz;
|
||||||
@ -181,7 +131,6 @@ void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
|
|||||||
datsz=rowsz*_height;
|
datsz=rowsz*_height;
|
||||||
/*Alloc array and row pointers.*/
|
/*Alloc array and row pointers.*/
|
||||||
ret=(char *)_ogg_malloc(datsz+colsz);
|
ret=(char *)_ogg_malloc(datsz+colsz);
|
||||||
if(ret==NULL)return NULL;
|
|
||||||
/*Initialize the array.*/
|
/*Initialize the array.*/
|
||||||
if(ret!=NULL){
|
if(ret!=NULL){
|
||||||
size_t i;
|
size_t i;
|
||||||
@ -204,7 +153,6 @@ void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz){
|
|||||||
datsz=rowsz*_height;
|
datsz=rowsz*_height;
|
||||||
/*Alloc array and row pointers.*/
|
/*Alloc array and row pointers.*/
|
||||||
ret=(char *)_ogg_calloc(datsz+colsz,1);
|
ret=(char *)_ogg_calloc(datsz+colsz,1);
|
||||||
if(ret==NULL)return NULL;
|
|
||||||
/*Initialize the array.*/
|
/*Initialize the array.*/
|
||||||
if(ret!=NULL){
|
if(ret!=NULL){
|
||||||
size_t i;
|
size_t i;
|
||||||
|
445
thirdparty/libtheora/internal.h
vendored
445
thirdparty/libtheora/internal.h
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: internal.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#if !defined(_internal_H)
|
#if !defined(_internal_H)
|
||||||
@ -19,10 +19,20 @@
|
|||||||
# include <stdlib.h>
|
# include <stdlib.h>
|
||||||
# include <limits.h>
|
# include <limits.h>
|
||||||
# if defined(HAVE_CONFIG_H)
|
# if defined(HAVE_CONFIG_H)
|
||||||
# include <config.h>
|
# include "config.h"
|
||||||
# endif
|
# endif
|
||||||
# include "theora/codec.h"
|
# include "theora/codec.h"
|
||||||
# include "theora/theora.h"
|
# include "theora/theora.h"
|
||||||
|
# include "ocintrin.h"
|
||||||
|
|
||||||
|
# if !defined(__GNUC_PREREQ)
|
||||||
|
# if defined(__GNUC__)&&defined(__GNUC_MINOR__)
|
||||||
|
# define __GNUC_PREREQ(_maj,_min) \
|
||||||
|
((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
|
||||||
|
# else
|
||||||
|
# define __GNUC_PREREQ(_maj,_min) 0
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
|
||||||
# if defined(_MSC_VER)
|
# if defined(_MSC_VER)
|
||||||
/*Disable missing EMMS warnings.*/
|
/*Disable missing EMMS warnings.*/
|
||||||
@ -31,24 +41,25 @@
|
|||||||
# pragma warning(disable:4554)
|
# pragma warning(disable:4554)
|
||||||
# endif
|
# endif
|
||||||
/*You, too, gcc.*/
|
/*You, too, gcc.*/
|
||||||
# if defined(__GNUC_PREREQ)
|
# if __GNUC_PREREQ(4,2)
|
||||||
# if __GNUC_PREREQ(4,2)
|
# pragma GCC diagnostic ignored "-Wparentheses"
|
||||||
# pragma GCC diagnostic ignored "-Wparentheses"
|
|
||||||
# endif
|
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
# include "ocintrin.h"
|
/*Some assembly constructs require aligned operands.
|
||||||
# include "huffman.h"
|
The following macros are _only_ intended for structure member declarations.
|
||||||
# include "quant.h"
|
Although they will sometimes work on stack variables, gcc will often silently
|
||||||
|
ignore them.
|
||||||
/*Some assembly constructs require aligned operands.*/
|
A separate set of macros could be made for manual stack alignment, but we
|
||||||
# if defined(OC_X86_ASM)
|
don't actually require it anywhere.*/
|
||||||
|
# if defined(OC_X86_ASM)||defined(OC_ARM_ASM)
|
||||||
# if defined(__GNUC__)
|
# if defined(__GNUC__)
|
||||||
# define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
|
# define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
|
||||||
# define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
|
# define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
|
||||||
# elif defined(_MSC_VER)
|
# elif defined(_MSC_VER)
|
||||||
# define OC_ALIGN8(expr) __declspec (align(8)) expr
|
# define OC_ALIGN8(expr) __declspec (align(8)) expr
|
||||||
# define OC_ALIGN16(expr) __declspec (align(16)) expr
|
# define OC_ALIGN16(expr) __declspec (align(16)) expr
|
||||||
|
# else
|
||||||
|
# error "Alignment macros required for this platform."
|
||||||
# endif
|
# endif
|
||||||
# endif
|
# endif
|
||||||
# if !defined(OC_ALIGN8)
|
# if !defined(OC_ALIGN8)
|
||||||
@ -60,19 +71,8 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct oc_sb_flags oc_sb_flags;
|
|
||||||
typedef struct oc_border_info oc_border_info;
|
|
||||||
typedef struct oc_fragment oc_fragment;
|
|
||||||
typedef struct oc_fragment_plane oc_fragment_plane;
|
|
||||||
typedef struct oc_base_opt_vtable oc_base_opt_vtable;
|
|
||||||
typedef struct oc_base_opt_data oc_base_opt_data;
|
|
||||||
typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
|
|
||||||
typedef struct oc_theora_state oc_theora_state;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*This library's version.*/
|
/*This library's version.*/
|
||||||
# define OC_VENDOR_STRING "Xiph.Org libtheora 1.1 20090822 (Thusnelda)"
|
# define OC_VENDOR_STRING "Xiph.Org libtheora 1.2.0alpha 20100924 (Ptalarbvorm)"
|
||||||
|
|
||||||
/*Theora bitstream version.*/
|
/*Theora bitstream version.*/
|
||||||
# define TH_VERSION_MAJOR (3)
|
# define TH_VERSION_MAJOR (3)
|
||||||
@ -83,315 +83,6 @@ typedef struct oc_theora_state oc_theora_state;
|
|||||||
((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \
|
((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \
|
||||||
(_info)->version_subminor>=(_sub)))
|
(_info)->version_subminor>=(_sub)))
|
||||||
|
|
||||||
/*A keyframe.*/
|
|
||||||
#define OC_INTRA_FRAME (0)
|
|
||||||
/*A predicted frame.*/
|
|
||||||
#define OC_INTER_FRAME (1)
|
|
||||||
/*A frame of unknown type (frame type decision has not yet been made).*/
|
|
||||||
#define OC_UNKWN_FRAME (-1)
|
|
||||||
|
|
||||||
/*The amount of padding to add to the reconstructed frame buffers on all
|
|
||||||
sides.
|
|
||||||
This is used to allow unrestricted motion vectors without special casing.
|
|
||||||
This must be a multiple of 2.*/
|
|
||||||
#define OC_UMV_PADDING (16)
|
|
||||||
|
|
||||||
/*Frame classification indices.*/
|
|
||||||
/*The previous golden frame.*/
|
|
||||||
#define OC_FRAME_GOLD (0)
|
|
||||||
/*The previous frame.*/
|
|
||||||
#define OC_FRAME_PREV (1)
|
|
||||||
/*The current frame.*/
|
|
||||||
#define OC_FRAME_SELF (2)
|
|
||||||
|
|
||||||
/*The input or output buffer.*/
|
|
||||||
#define OC_FRAME_IO (3)
|
|
||||||
|
|
||||||
/*Macroblock modes.*/
|
|
||||||
/*Macro block is invalid: It is never coded.*/
|
|
||||||
#define OC_MODE_INVALID (-1)
|
|
||||||
/*Encoded difference from the same macro block in the previous frame.*/
|
|
||||||
#define OC_MODE_INTER_NOMV (0)
|
|
||||||
/*Encoded with no motion compensated prediction.*/
|
|
||||||
#define OC_MODE_INTRA (1)
|
|
||||||
/*Encoded difference from the previous frame offset by the given motion
|
|
||||||
vector.*/
|
|
||||||
#define OC_MODE_INTER_MV (2)
|
|
||||||
/*Encoded difference from the previous frame offset by the last coded motion
|
|
||||||
vector.*/
|
|
||||||
#define OC_MODE_INTER_MV_LAST (3)
|
|
||||||
/*Encoded difference from the previous frame offset by the second to last
|
|
||||||
coded motion vector.*/
|
|
||||||
#define OC_MODE_INTER_MV_LAST2 (4)
|
|
||||||
/*Encoded difference from the same macro block in the previous golden
|
|
||||||
frame.*/
|
|
||||||
#define OC_MODE_GOLDEN_NOMV (5)
|
|
||||||
/*Encoded difference from the previous golden frame offset by the given motion
|
|
||||||
vector.*/
|
|
||||||
#define OC_MODE_GOLDEN_MV (6)
|
|
||||||
/*Encoded difference from the previous frame offset by the individual motion
|
|
||||||
vectors given for each block.*/
|
|
||||||
#define OC_MODE_INTER_MV_FOUR (7)
|
|
||||||
/*The number of (coded) modes.*/
|
|
||||||
#define OC_NMODES (8)
|
|
||||||
|
|
||||||
/*Determines the reference frame used for a given MB mode.*/
|
|
||||||
#define OC_FRAME_FOR_MODE(_x) \
|
|
||||||
OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
|
|
||||||
OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
|
|
||||||
|
|
||||||
/*Constants for the packet state machine common between encoder and decoder.*/
|
|
||||||
|
|
||||||
/*Next packet to emit/read: Codec info header.*/
|
|
||||||
#define OC_PACKET_INFO_HDR (-3)
|
|
||||||
/*Next packet to emit/read: Comment header.*/
|
|
||||||
#define OC_PACKET_COMMENT_HDR (-2)
|
|
||||||
/*Next packet to emit/read: Codec setup header.*/
|
|
||||||
#define OC_PACKET_SETUP_HDR (-1)
|
|
||||||
/*No more packets to emit/read.*/
|
|
||||||
#define OC_PACKET_DONE (INT_MAX)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*Super blocks are 32x32 segments of pixels in a single color plane indexed
|
|
||||||
in image order.
|
|
||||||
Internally, super blocks are broken up into four quadrants, each of which
|
|
||||||
contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
|
|
||||||
Quadrants, and the blocks within them, are indexed in a special order called
|
|
||||||
a "Hilbert curve" within the super block.
|
|
||||||
|
|
||||||
In order to differentiate between the Hilbert-curve indexing strategy and
|
|
||||||
the regular image order indexing strategy, blocks indexed in image order
|
|
||||||
are called "fragments".
|
|
||||||
Fragments are indexed in image order, left to right, then bottom to top,
|
|
||||||
from Y' plane to Cb plane to Cr plane.
|
|
||||||
|
|
||||||
The co-located fragments in all image planes corresponding to the location
|
|
||||||
of a single quadrant of a luma plane super block form a macro block.
|
|
||||||
Thus there is only a single set of macro blocks for all planes, each of which
|
|
||||||
contains between 6 and 12 fragments, depending on the pixel format.
|
|
||||||
Therefore macro block information is kept in a separate set of arrays from
|
|
||||||
super blocks to avoid unused space in the other planes.
|
|
||||||
The lists are indexed in super block order.
|
|
||||||
That is, the macro block corresponding to the macro block mbi in (luma plane)
|
|
||||||
super block sbi is at index (sbi<<2|mbi).
|
|
||||||
Thus the number of macro blocks in each dimension is always twice the number
|
|
||||||
of super blocks, even when only an odd number fall inside the coded frame.
|
|
||||||
These "extra" macro blocks are just an artifact of our internal data layout,
|
|
||||||
and not part of the coded stream; they are flagged with a negative MB mode.*/
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*A single quadrant of the map from a super block to fragment numbers.*/
|
|
||||||
typedef ptrdiff_t oc_sb_map_quad[4];
|
|
||||||
/*A map from a super block to fragment numbers.*/
|
|
||||||
typedef oc_sb_map_quad oc_sb_map[4];
|
|
||||||
/*A single plane of the map from a macro block to fragment numbers.*/
|
|
||||||
typedef ptrdiff_t oc_mb_map_plane[4];
|
|
||||||
/*A map from a macro block to fragment numbers.*/
|
|
||||||
typedef oc_mb_map_plane oc_mb_map[3];
|
|
||||||
/*A motion vector.*/
|
|
||||||
typedef signed char oc_mv[2];
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*Super block information.*/
|
|
||||||
struct oc_sb_flags{
|
|
||||||
unsigned char coded_fully:1;
|
|
||||||
unsigned char coded_partially:1;
|
|
||||||
unsigned char quad_valid:4;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*Information about a fragment which intersects the border of the displayable
|
|
||||||
region.
|
|
||||||
This marks which pixels belong to the displayable region.*/
|
|
||||||
struct oc_border_info{
|
|
||||||
/*A bit mask marking which pixels are in the displayable region.
|
|
||||||
Pixel (x,y) corresponds to bit (y<<3|x).*/
|
|
||||||
ogg_int64_t mask;
|
|
||||||
/*The number of pixels in the displayable region.
|
|
||||||
This is always positive, and always less than 64.*/
|
|
||||||
int npixels;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*Fragment information.*/
|
|
||||||
struct oc_fragment{
|
|
||||||
/*A flag indicating whether or not this fragment is coded.*/
|
|
||||||
unsigned coded:1;
|
|
||||||
/*A flag indicating that this entire fragment lies outside the displayable
|
|
||||||
region of the frame.
|
|
||||||
Note the contrast with an invalid macro block, which is outside the coded
|
|
||||||
frame, not just the displayable one.
|
|
||||||
There are no fragments outside the coded frame by construction.*/
|
|
||||||
unsigned invalid:1;
|
|
||||||
/*The index of the quality index used for this fragment's AC coefficients.*/
|
|
||||||
unsigned qii:6;
|
|
||||||
/*The mode of the macroblock this fragment belongs to.*/
|
|
||||||
unsigned mb_mode:3;
|
|
||||||
/*The index of the associated border information for fragments which lie
|
|
||||||
partially outside the displayable region.
|
|
||||||
For fragments completely inside or outside this region, this is -1.
|
|
||||||
Note that the C standard requires an explicit signed keyword for bitfield
|
|
||||||
types, since some compilers may treat them as unsigned without it.*/
|
|
||||||
signed int borderi:5;
|
|
||||||
/*The prediction-corrected DC component.
|
|
||||||
Note that the C standard requires an explicit signed keyword for bitfield
|
|
||||||
types, since some compilers may treat them as unsigned without it.*/
|
|
||||||
signed int dc:16;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*A description of each fragment plane.*/
|
|
||||||
struct oc_fragment_plane{
|
|
||||||
/*The number of fragments in the horizontal direction.*/
|
|
||||||
int nhfrags;
|
|
||||||
/*The number of fragments in the vertical direction.*/
|
|
||||||
int nvfrags;
|
|
||||||
/*The offset of the first fragment in the plane.*/
|
|
||||||
ptrdiff_t froffset;
|
|
||||||
/*The total number of fragments in the plane.*/
|
|
||||||
ptrdiff_t nfrags;
|
|
||||||
/*The number of super blocks in the horizontal direction.*/
|
|
||||||
unsigned nhsbs;
|
|
||||||
/*The number of super blocks in the vertical direction.*/
|
|
||||||
unsigned nvsbs;
|
|
||||||
/*The offset of the first super block in the plane.*/
|
|
||||||
unsigned sboffset;
|
|
||||||
/*The total number of super blocks in the plane.*/
|
|
||||||
unsigned nsbs;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*The shared (encoder and decoder) functions that have accelerated variants.*/
|
|
||||||
struct oc_base_opt_vtable{
|
|
||||||
void (*frag_copy)(unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _ystride);
|
|
||||||
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
|
|
||||||
const ogg_int16_t _residue[64]);
|
|
||||||
void (*frag_recon_inter)(unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
|
||||||
void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
|
|
||||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
|
||||||
void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi);
|
|
||||||
void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
|
|
||||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
|
||||||
void (*state_frag_copy_list)(const oc_theora_state *_state,
|
|
||||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
|
||||||
int _dst_frame,int _src_frame,int _pli);
|
|
||||||
void (*state_loop_filter_frag_rows)(const oc_theora_state *_state,
|
|
||||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
|
||||||
void (*restore_fpu)(void);
|
|
||||||
};
|
|
||||||
|
|
||||||
/*The shared (encoder and decoder) tables that vary according to which variants
|
|
||||||
of the above functions are used.*/
|
|
||||||
struct oc_base_opt_data{
|
|
||||||
const unsigned char *dct_fzig_zag;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/*State information common to both the encoder and decoder.*/
|
|
||||||
struct oc_theora_state{
|
|
||||||
/*The stream information.*/
|
|
||||||
th_info info;
|
|
||||||
/*Table for shared accelerated functions.*/
|
|
||||||
oc_base_opt_vtable opt_vtable;
|
|
||||||
/*Table for shared data used by accelerated functions.*/
|
|
||||||
oc_base_opt_data opt_data;
|
|
||||||
/*CPU flags to detect the presence of extended instruction sets.*/
|
|
||||||
ogg_uint32_t cpu_flags;
|
|
||||||
/*The fragment plane descriptions.*/
|
|
||||||
oc_fragment_plane fplanes[3];
|
|
||||||
/*The list of fragments, indexed in image order.*/
|
|
||||||
oc_fragment *frags;
|
|
||||||
/*The the offset into the reference frame buffer to the upper-left pixel of
|
|
||||||
each fragment.*/
|
|
||||||
ptrdiff_t *frag_buf_offs;
|
|
||||||
/*The motion vector for each fragment.*/
|
|
||||||
oc_mv *frag_mvs;
|
|
||||||
/*The total number of fragments in a single frame.*/
|
|
||||||
ptrdiff_t nfrags;
|
|
||||||
/*The list of super block maps, indexed in image order.*/
|
|
||||||
oc_sb_map *sb_maps;
|
|
||||||
/*The list of super block flags, indexed in image order.*/
|
|
||||||
oc_sb_flags *sb_flags;
|
|
||||||
/*The total number of super blocks in a single frame.*/
|
|
||||||
unsigned nsbs;
|
|
||||||
/*The fragments from each color plane that belong to each macro block.
|
|
||||||
Fragments are stored in image order (left to right then top to bottom).
|
|
||||||
When chroma components are decimated, the extra fragments have an index of
|
|
||||||
-1.*/
|
|
||||||
oc_mb_map *mb_maps;
|
|
||||||
/*The list of macro block modes.
|
|
||||||
A negative number indicates the macro block lies entirely outside the
|
|
||||||
coded frame.*/
|
|
||||||
signed char *mb_modes;
|
|
||||||
/*The number of macro blocks in the X direction.*/
|
|
||||||
unsigned nhmbs;
|
|
||||||
/*The number of macro blocks in the Y direction.*/
|
|
||||||
unsigned nvmbs;
|
|
||||||
/*The total number of macro blocks.*/
|
|
||||||
size_t nmbs;
|
|
||||||
/*The list of coded fragments, in coded order.
|
|
||||||
Uncoded fragments are stored in reverse order from the end of the list.*/
|
|
||||||
ptrdiff_t *coded_fragis;
|
|
||||||
/*The number of coded fragments in each plane.*/
|
|
||||||
ptrdiff_t ncoded_fragis[3];
|
|
||||||
/*The total number of coded fragments.*/
|
|
||||||
ptrdiff_t ntotal_coded_fragis;
|
|
||||||
/*The index of the buffers being used for each OC_FRAME_* reference frame.*/
|
|
||||||
int ref_frame_idx[4];
|
|
||||||
/*The actual buffers used for the previously decoded frames.*/
|
|
||||||
th_ycbcr_buffer ref_frame_bufs[4];
|
|
||||||
/*The storage for the reference frame buffers.*/
|
|
||||||
unsigned char *ref_frame_data[4];
|
|
||||||
/*The strides for each plane in the reference frames.*/
|
|
||||||
int ref_ystride[3];
|
|
||||||
/*The number of unique border patterns.*/
|
|
||||||
int nborders;
|
|
||||||
/*The unique border patterns for all border fragments.
|
|
||||||
The borderi field of fragments which straddle the border indexes this
|
|
||||||
list.*/
|
|
||||||
oc_border_info borders[16];
|
|
||||||
/*The frame number of the last keyframe.*/
|
|
||||||
ogg_int64_t keyframe_num;
|
|
||||||
/*The frame number of the current frame.*/
|
|
||||||
ogg_int64_t curframe_num;
|
|
||||||
/*The granpos of the current frame.*/
|
|
||||||
ogg_int64_t granpos;
|
|
||||||
/*The type of the current frame.*/
|
|
||||||
unsigned char frame_type;
|
|
||||||
/*The bias to add to the frame count when computing granule positions.*/
|
|
||||||
unsigned char granpos_bias;
|
|
||||||
/*The number of quality indices used in the current frame.*/
|
|
||||||
unsigned char nqis;
|
|
||||||
/*The quality indices of the current frame.*/
|
|
||||||
unsigned char qis[3];
|
|
||||||
/*The dequantization tables, stored in zig-zag order, and indexed by
|
|
||||||
qi, pli, qti, and zzi.*/
|
|
||||||
ogg_uint16_t *dequant_tables[64][3][2];
|
|
||||||
OC_ALIGN16(oc_quant_table dequant_table_data[64][3][2]);
|
|
||||||
/*Loop filter strength parameters.*/
|
|
||||||
unsigned char loop_filter_limits[64];
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*The function type used to fill in the chroma plane motion vectors for a
|
|
||||||
macro block when 4 different motion vectors are specified in the luma
|
|
||||||
plane.
|
|
||||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
|
||||||
_lmbmv: The luma macro-block level motion vector to fill in for use in
|
|
||||||
prediction.
|
|
||||||
_lbmvs: The luma block-level motion vectors.*/
|
|
||||||
typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*A map from the index in the zig zag scan to the coefficient number in a
|
/*A map from the index in the zig zag scan to the coefficient number in a
|
||||||
@ -409,14 +100,12 @@ extern const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12];
|
|||||||
/*The number of indices in the oc_mb_map array that can be valid for each of
|
/*The number of indices in the oc_mb_map array that can be valid for each of
|
||||||
the various chroma decimation types.*/
|
the various chroma decimation types.*/
|
||||||
extern const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS];
|
extern const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS];
|
||||||
/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
|
|
||||||
macro block when 4 different motion vectors are specified in the luma
|
|
||||||
plane.*/
|
|
||||||
extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int oc_ilog(unsigned _v);
|
int oc_ilog(unsigned _v);
|
||||||
|
void *oc_aligned_malloc(size_t _sz,size_t _align);
|
||||||
|
void oc_aligned_free(void *_ptr);
|
||||||
void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz);
|
void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz);
|
||||||
void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz);
|
void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz);
|
||||||
void oc_free_2d(void *_ptr);
|
void oc_free_2d(void *_ptr);
|
||||||
@ -424,86 +113,4 @@ void oc_free_2d(void *_ptr);
|
|||||||
void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
|
void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
|
||||||
const th_ycbcr_buffer _src);
|
const th_ycbcr_buffer _src);
|
||||||
|
|
||||||
int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
|
|
||||||
void oc_state_clear(oc_theora_state *_state);
|
|
||||||
void oc_state_vtable_init_c(oc_theora_state *_state);
|
|
||||||
void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
|
|
||||||
int _y0,int _yend);
|
|
||||||
void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
|
|
||||||
void oc_state_borders_fill(oc_theora_state *_state,int _refi);
|
|
||||||
void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
|
|
||||||
th_ycbcr_buffer _img);
|
|
||||||
int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
|
|
||||||
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
|
||||||
int _pli,int _dx,int _dy);
|
|
||||||
|
|
||||||
int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
|
|
||||||
void oc_state_loop_filter(oc_theora_state *_state,int _frame);
|
|
||||||
#if defined(OC_DUMP_IMAGES)
|
|
||||||
int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
|
|
||||||
const char *_suf);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*Shared accelerated functions.*/
|
|
||||||
void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _ystride);
|
|
||||||
void oc_frag_recon_intra(const oc_theora_state *_state,
|
|
||||||
unsigned char *_dst,int _dst_ystride,const ogg_int16_t _residue[64]);
|
|
||||||
void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
|
||||||
void oc_frag_recon_inter2(const oc_theora_state *_state,
|
|
||||||
unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2,
|
|
||||||
int _ystride,const ogg_int16_t _residue[64]);
|
|
||||||
void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],int _last_zzi);
|
|
||||||
void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
|
|
||||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
|
||||||
void oc_state_frag_copy_list(const oc_theora_state *_state,
|
|
||||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
|
||||||
int _dst_frame,int _src_frame,int _pli);
|
|
||||||
void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,
|
|
||||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
|
||||||
void oc_restore_fpu(const oc_theora_state *_state);
|
|
||||||
|
|
||||||
/*Default pure-C implementations.*/
|
|
||||||
void oc_frag_copy_c(unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _src_ystride);
|
|
||||||
void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
|
|
||||||
const ogg_int16_t _residue[64]);
|
|
||||||
void oc_frag_recon_inter_c(unsigned char *_dst,
|
|
||||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
|
||||||
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
|
||||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
|
||||||
void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi);
|
|
||||||
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
|
||||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
|
||||||
void oc_state_frag_copy_list_c(const oc_theora_state *_state,
|
|
||||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
|
||||||
int _dst_frame,int _src_frame,int _pli);
|
|
||||||
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
|
|
||||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
|
||||||
void oc_restore_fpu_c(void);
|
|
||||||
|
|
||||||
/*We need a way to call a few encoder functions without introducing a link-time
|
|
||||||
dependency into the decoder, while still allowing the old alpha API which
|
|
||||||
does not distinguish between encoder and decoder objects to be used.
|
|
||||||
We do this by placing a function table at the start of the encoder object
|
|
||||||
which can dispatch into the encoder library.
|
|
||||||
We do a similar thing for the decoder in case we ever decide to split off a
|
|
||||||
common base library.*/
|
|
||||||
typedef void (*oc_state_clear_func)(theora_state *_th);
|
|
||||||
typedef int (*oc_state_control_func)(theora_state *th,int _req,
|
|
||||||
void *_buf,size_t _buf_sz);
|
|
||||||
typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
|
|
||||||
ogg_int64_t _granulepos);
|
|
||||||
typedef double (*oc_state_granule_time_func)(theora_state *_th,
|
|
||||||
ogg_int64_t _granulepos);
|
|
||||||
|
|
||||||
|
|
||||||
struct oc_state_dispatch_vtable{
|
|
||||||
oc_state_clear_func clear;
|
|
||||||
oc_state_control_func control;
|
|
||||||
oc_state_granule_frame_func granule_frame;
|
|
||||||
oc_state_granule_time_func granule_time;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
68
thirdparty/libtheora/mathops.c
vendored
68
thirdparty/libtheora/mathops.c
vendored
@ -1,10 +1,8 @@
|
|||||||
|
#include "internal.h"
|
||||||
#include "mathops.h"
|
#include "mathops.h"
|
||||||
#include <limits.h>
|
|
||||||
|
|
||||||
/*The fastest fallback strategy for platforms with fast multiplication appears
|
/*The fastest fallback strategy for platforms with fast multiplication appears
|
||||||
to be based on de Bruijn sequences~\cite{LP98}.
|
to be based on de Bruijn sequences~\cite{LP98}.
|
||||||
Tests confirmed this to be true even on an ARM11, where it is actually faster
|
|
||||||
than using the native clz instruction.
|
|
||||||
Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
|
Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
|
||||||
multiplication or table lookups are too expensive.
|
multiplication or table lookups are too expensive.
|
||||||
|
|
||||||
@ -15,8 +13,7 @@
|
|||||||
year=1998,
|
year=1998,
|
||||||
note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
|
note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
|
||||||
}*/
|
}*/
|
||||||
#if !defined(OC_ILOG_NODEBRUIJN)&& \
|
#if !defined(OC_ILOG_NODEBRUIJN)&&!defined(OC_CLZ32)
|
||||||
!defined(OC_CLZ32)||!defined(OC_CLZ64)&&LONG_MAX<9223372036854775807LL
|
|
||||||
static const unsigned char OC_DEBRUIJN_IDX32[32]={
|
static const unsigned char OC_DEBRUIJN_IDX32[32]={
|
||||||
0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
|
0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
|
||||||
31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
|
31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
|
||||||
@ -25,7 +22,7 @@ static const unsigned char OC_DEBRUIJN_IDX32[32]={
|
|||||||
|
|
||||||
int oc_ilog32(ogg_uint32_t _v){
|
int oc_ilog32(ogg_uint32_t _v){
|
||||||
#if defined(OC_CLZ32)
|
#if defined(OC_CLZ32)
|
||||||
return (OC_CLZ32_OFFS-OC_CLZ32(_v))&-!!_v;
|
return OC_CLZ32_OFFS-OC_CLZ32(_v)&-!!_v;
|
||||||
#else
|
#else
|
||||||
/*On a Pentium M, this branchless version tested as the fastest version without
|
/*On a Pentium M, this branchless version tested as the fastest version without
|
||||||
multiplications on 1,000,000,000 random 32-bit integers, edging out a
|
multiplications on 1,000,000,000 random 32-bit integers, edging out a
|
||||||
@ -51,12 +48,12 @@ int oc_ilog32(ogg_uint32_t _v){
|
|||||||
/*This de Bruijn sequence version is faster if you have a fast multiplier.*/
|
/*This de Bruijn sequence version is faster if you have a fast multiplier.*/
|
||||||
# else
|
# else
|
||||||
int ret;
|
int ret;
|
||||||
ret=_v>0;
|
|
||||||
_v|=_v>>1;
|
_v|=_v>>1;
|
||||||
_v|=_v>>2;
|
_v|=_v>>2;
|
||||||
_v|=_v>>4;
|
_v|=_v>>4;
|
||||||
_v|=_v>>8;
|
_v|=_v>>8;
|
||||||
_v|=_v>>16;
|
_v|=_v>>16;
|
||||||
|
ret=_v&1;
|
||||||
_v=(_v>>1)+1;
|
_v=(_v>>1)+1;
|
||||||
ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
|
ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
|
||||||
return ret;
|
return ret;
|
||||||
@ -66,16 +63,21 @@ int oc_ilog32(ogg_uint32_t _v){
|
|||||||
|
|
||||||
int oc_ilog64(ogg_int64_t _v){
|
int oc_ilog64(ogg_int64_t _v){
|
||||||
#if defined(OC_CLZ64)
|
#if defined(OC_CLZ64)
|
||||||
return (OC_CLZ64_OFFS-OC_CLZ64(_v))&-!!_v;
|
return OC_CLZ64_OFFS-OC_CLZ64(_v)&-!!_v;
|
||||||
#else
|
#else
|
||||||
# if defined(OC_ILOG_NODEBRUIJN)
|
/*If we don't have a fast 64-bit word implementation, split it into two 32-bit
|
||||||
|
halves.*/
|
||||||
|
# if defined(OC_ILOG_NODEBRUIJN)|| \
|
||||||
|
defined(OC_CLZ32)||LONG_MAX<9223372036854775807LL
|
||||||
ogg_uint32_t v;
|
ogg_uint32_t v;
|
||||||
int ret;
|
int ret;
|
||||||
int m;
|
int m;
|
||||||
ret=_v>0;
|
|
||||||
m=(_v>0xFFFFFFFFU)<<5;
|
m=(_v>0xFFFFFFFFU)<<5;
|
||||||
v=(ogg_uint32_t)(_v>>m);
|
v=(ogg_uint32_t)(_v>>m);
|
||||||
ret|=m;
|
# if defined(OC_CLZ32)
|
||||||
|
ret=m+OC_CLZ32_OFFS-OC_CLZ32(v)&-!!v;
|
||||||
|
# elif defined(OC_ILOG_NODEBRUIJN)
|
||||||
|
ret=v>0|m;
|
||||||
m=(v>0xFFFFU)<<4;
|
m=(v>0xFFFFU)<<4;
|
||||||
v>>=m;
|
v>>=m;
|
||||||
ret|=m;
|
ret|=m;
|
||||||
@ -90,26 +92,19 @@ int oc_ilog64(ogg_int64_t _v){
|
|||||||
ret|=m;
|
ret|=m;
|
||||||
ret+=v>1;
|
ret+=v>1;
|
||||||
return ret;
|
return ret;
|
||||||
# else
|
# else
|
||||||
/*If we don't have a 64-bit word, split it into two 32-bit halves.*/
|
|
||||||
# if LONG_MAX<9223372036854775807LL
|
|
||||||
ogg_uint32_t v;
|
|
||||||
int ret;
|
|
||||||
int m;
|
|
||||||
ret=_v>0;
|
|
||||||
m=(_v>0xFFFFFFFFU)<<5;
|
|
||||||
v=(ogg_uint32_t)(_v>>m);
|
|
||||||
ret|=m;
|
|
||||||
v|=v>>1;
|
v|=v>>1;
|
||||||
v|=v>>2;
|
v|=v>>2;
|
||||||
v|=v>>4;
|
v|=v>>4;
|
||||||
v|=v>>8;
|
v|=v>>8;
|
||||||
v|=v>>16;
|
v|=v>>16;
|
||||||
|
ret=v&1|m;
|
||||||
v=(v>>1)+1;
|
v=(v>>1)+1;
|
||||||
ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
|
ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
|
||||||
|
# endif
|
||||||
return ret;
|
return ret;
|
||||||
/*Otherwise do it in one 64-bit operation.*/
|
/*Otherwise do it in one 64-bit multiply.*/
|
||||||
# else
|
# else
|
||||||
static const unsigned char OC_DEBRUIJN_IDX64[64]={
|
static const unsigned char OC_DEBRUIJN_IDX64[64]={
|
||||||
0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
|
0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
|
||||||
5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
|
5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
|
||||||
@ -117,17 +112,16 @@ int oc_ilog64(ogg_int64_t _v){
|
|||||||
62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
|
62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
|
||||||
};
|
};
|
||||||
int ret;
|
int ret;
|
||||||
ret=_v>0;
|
|
||||||
_v|=_v>>1;
|
_v|=_v>>1;
|
||||||
_v|=_v>>2;
|
_v|=_v>>2;
|
||||||
_v|=_v>>4;
|
_v|=_v>>4;
|
||||||
_v|=_v>>8;
|
_v|=_v>>8;
|
||||||
_v|=_v>>16;
|
_v|=_v>>16;
|
||||||
_v|=_v>>32;
|
_v|=_v>>32;
|
||||||
|
ret=(int)_v&1;
|
||||||
_v=(_v>>1)+1;
|
_v=(_v>>1)+1;
|
||||||
ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
|
ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
|
||||||
return ret;
|
return ret;
|
||||||
# endif
|
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -294,3 +288,27 @@ ogg_int64_t oc_blog64(ogg_int64_t _w){
|
|||||||
}
|
}
|
||||||
return OC_Q57(ipart)+z;
|
return OC_Q57(ipart)+z;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*Polynomial approximation of a binary exponential.
|
||||||
|
Q10 input, Q0 output.*/
|
||||||
|
ogg_uint32_t oc_bexp32_q10(int _z){
|
||||||
|
unsigned n;
|
||||||
|
int ipart;
|
||||||
|
ipart=_z>>10;
|
||||||
|
n=(_z&(1<<10)-1)<<4;
|
||||||
|
n=(n*((n*((n*((n*3548>>15)+6817)>>15)+15823)>>15)+22708)>>15)+16384;
|
||||||
|
return 14-ipart>0?n+(1<<13-ipart)>>14-ipart:n<<ipart-14;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Polynomial approximation of a binary logarithm.
|
||||||
|
Q0 input, Q10 output.*/
|
||||||
|
int oc_blog32_q10(ogg_uint32_t _w){
|
||||||
|
int n;
|
||||||
|
int ipart;
|
||||||
|
int fpart;
|
||||||
|
if(_w<=0)return -1;
|
||||||
|
ipart=OC_ILOGNZ_32(_w);
|
||||||
|
n=(ipart-16>0?_w>>ipart-16:_w<<16-ipart)-32768-16384;
|
||||||
|
fpart=(n*((n*((n*((n*-1402>>15)+2546)>>15)-5216)>>15)+15745)>>15)-6793;
|
||||||
|
return (ipart<<10)+(fpart>>4);
|
||||||
|
}
|
||||||
|
44
thirdparty/libtheora/mathops.h
vendored
44
thirdparty/libtheora/mathops.h
vendored
@ -2,29 +2,27 @@
|
|||||||
# define _mathops_H (1)
|
# define _mathops_H (1)
|
||||||
# include <ogg/ogg.h>
|
# include <ogg/ogg.h>
|
||||||
|
|
||||||
# ifdef __GNUC_PREREQ
|
# if __GNUC_PREREQ(3,4)
|
||||||
# if __GNUC_PREREQ(3,4)
|
# include <limits.h>
|
||||||
# include <limits.h>
|
|
||||||
/*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from
|
/*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from
|
||||||
"upgrading" the type of an entire expression to an (unsigned) size_t.*/
|
"upgrading" the type of an entire expression to an (unsigned) size_t.*/
|
||||||
# if INT_MAX>=2147483647
|
# if INT_MAX>=2147483647
|
||||||
# define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
|
# define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
|
||||||
# define OC_CLZ32(_x) (__builtin_clz(_x))
|
# define OC_CLZ32(_x) (__builtin_clz(_x))
|
||||||
# elif LONG_MAX>=2147483647L
|
# elif LONG_MAX>=2147483647L
|
||||||
# define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
|
# define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
|
||||||
# define OC_CLZ32(_x) (__builtin_clzl(_x))
|
# define OC_CLZ32(_x) (__builtin_clzl(_x))
|
||||||
# endif
|
# endif
|
||||||
# if INT_MAX>=9223372036854775807LL
|
# if INT_MAX>=9223372036854775807LL
|
||||||
# define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
|
# define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
|
||||||
# define OC_CLZ64(_x) (__builtin_clz(_x))
|
# define OC_CLZ64(_x) (__builtin_clz(_x))
|
||||||
# elif LONG_MAX>=9223372036854775807LL
|
# elif LONG_MAX>=9223372036854775807LL
|
||||||
# define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
|
# define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
|
||||||
# define OC_CLZ64(_x) (__builtin_clzl(_x))
|
# define OC_CLZ64(_x) (__builtin_clzl(_x))
|
||||||
# elif LLONG_MAX>=9223372036854775807LL|| \
|
# elif LLONG_MAX>=9223372036854775807LL|| \
|
||||||
__LONG_LONG_MAX__>=9223372036854775807LL
|
__LONG_LONG_MAX__>=9223372036854775807LL
|
||||||
# define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
|
# define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
|
||||||
# define OC_CLZ64(_x) (__builtin_clzll(_x))
|
# define OC_CLZ64(_x) (__builtin_clzll(_x))
|
||||||
# endif
|
|
||||||
# endif
|
# endif
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
@ -134,8 +132,12 @@ int oc_ilog64(ogg_int64_t _v);
|
|||||||
# define OC_STATIC_ILOG_64(_v) (OC_STATIC_ILOG6((ogg_int64_t)(_v)))
|
# define OC_STATIC_ILOG_64(_v) (OC_STATIC_ILOG6((ogg_int64_t)(_v)))
|
||||||
|
|
||||||
#define OC_Q57(_v) ((ogg_int64_t)(_v)<<57)
|
#define OC_Q57(_v) ((ogg_int64_t)(_v)<<57)
|
||||||
|
#define OC_Q10(_v) ((_v)<<10)
|
||||||
|
|
||||||
ogg_int64_t oc_bexp64(ogg_int64_t _z);
|
ogg_int64_t oc_bexp64(ogg_int64_t _z);
|
||||||
ogg_int64_t oc_blog64(ogg_int64_t _w);
|
ogg_int64_t oc_blog64(ogg_int64_t _w);
|
||||||
|
|
||||||
|
ogg_uint32_t oc_bexp32_q10(int _z);
|
||||||
|
int oc_blog32_q10(ogg_uint32_t _w);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
215
thirdparty/libtheora/mcenc.c
vendored
215
thirdparty/libtheora/mcenc.c
vendored
@ -88,9 +88,11 @@ static const int OC_SQUARE_SITES[11][8]={
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
|
static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
|
||||||
int _accum[2],int _mbi,int _frame){
|
oc_mv _accum,int _mbi,int _frame){
|
||||||
oc_mb_enc_info *embs;
|
oc_mb_enc_info *embs;
|
||||||
|
int accum_x;
|
||||||
|
int accum_y;
|
||||||
int a[3][2];
|
int a[3][2];
|
||||||
int ncandidates;
|
int ncandidates;
|
||||||
unsigned nmbi;
|
unsigned nmbi;
|
||||||
@ -102,20 +104,24 @@ static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
|
|||||||
/*Fill in the first part of set A: the vectors from adjacent blocks.*/
|
/*Fill in the first part of set A: the vectors from adjacent blocks.*/
|
||||||
for(i=0;i<embs[_mbi].ncneighbors;i++){
|
for(i=0;i<embs[_mbi].ncneighbors;i++){
|
||||||
nmbi=embs[_mbi].cneighbors[i];
|
nmbi=embs[_mbi].cneighbors[i];
|
||||||
_mcenc->candidates[ncandidates][0]=embs[nmbi].analysis_mv[0][_frame][0];
|
_mcenc->candidates[ncandidates][0]=
|
||||||
_mcenc->candidates[ncandidates][1]=embs[nmbi].analysis_mv[0][_frame][1];
|
OC_MV_X(embs[nmbi].analysis_mv[0][_frame]);
|
||||||
|
_mcenc->candidates[ncandidates][1]=
|
||||||
|
OC_MV_Y(embs[nmbi].analysis_mv[0][_frame]);
|
||||||
ncandidates++;
|
ncandidates++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
accum_x=OC_MV_X(_accum);
|
||||||
|
accum_y=OC_MV_Y(_accum);
|
||||||
/*Add a few additional vectors to set A: the vectors used in the previous
|
/*Add a few additional vectors to set A: the vectors used in the previous
|
||||||
frames and the (0,0) vector.*/
|
frames and the (0,0) vector.*/
|
||||||
_mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,_accum[0],31);
|
_mcenc->candidates[ncandidates][0]=accum_x;
|
||||||
_mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,_accum[1],31);
|
_mcenc->candidates[ncandidates][1]=accum_y;
|
||||||
ncandidates++;
|
ncandidates++;
|
||||||
_mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
|
_mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
|
||||||
embs[_mbi].analysis_mv[1][_frame][0]+_accum[0],31);
|
OC_MV_X(embs[_mbi].analysis_mv[1][_frame])+accum_x,31);
|
||||||
_mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
|
_mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
|
||||||
embs[_mbi].analysis_mv[1][_frame][1]+_accum[1],31);
|
OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])+accum_y,31);
|
||||||
ncandidates++;
|
ncandidates++;
|
||||||
_mcenc->candidates[ncandidates][0]=0;
|
_mcenc->candidates[ncandidates][0]=0;
|
||||||
_mcenc->candidates[ncandidates][1]=0;
|
_mcenc->candidates[ncandidates][1]=0;
|
||||||
@ -131,30 +137,33 @@ static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
|
|||||||
OC_SORT2I(a[0][1],a[1][1]);
|
OC_SORT2I(a[0][1],a[1][1]);
|
||||||
_mcenc->candidates[0][0]=a[1][0];
|
_mcenc->candidates[0][0]=a[1][0];
|
||||||
_mcenc->candidates[0][1]=a[1][1];
|
_mcenc->candidates[0][1]=a[1][1];
|
||||||
/*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/
|
|
||||||
_mcenc->setb0=ncandidates;
|
_mcenc->setb0=ncandidates;
|
||||||
/*The first time through the loop use the current macro block.*/
|
}
|
||||||
nmbi=_mbi;
|
|
||||||
for(i=0;;i++){
|
static void oc_mcenc_find_candidates_b(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
|
||||||
_mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
|
oc_mv _accum,int _mbi,int _frame){
|
||||||
2*embs[_mbi].analysis_mv[1][_frame][0]
|
oc_mb_enc_info *embs;
|
||||||
-embs[_mbi].analysis_mv[2][_frame][0]+_accum[0],31);
|
int accum_x;
|
||||||
_mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
|
int accum_y;
|
||||||
2*embs[_mbi].analysis_mv[1][_frame][1]
|
int ncandidates;
|
||||||
-embs[_mbi].analysis_mv[2][_frame][1]+_accum[1],31);
|
embs=_enc->mb_info;
|
||||||
ncandidates++;
|
accum_x=OC_MV_X(_accum);
|
||||||
if(i>=embs[_mbi].npneighbors)break;
|
accum_y=OC_MV_Y(_accum);
|
||||||
nmbi=embs[_mbi].pneighbors[i];
|
/*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/
|
||||||
}
|
ncandidates=_mcenc->setb0;
|
||||||
/*Truncate to full-pel positions.*/
|
/*Use only the current block. Using more did not appear to be helpful
|
||||||
for(i=0;i<ncandidates;i++){
|
with the current selection logic due to escaping the local search too
|
||||||
_mcenc->candidates[i][0]=OC_DIV2(_mcenc->candidates[i][0]);
|
quickly.*/
|
||||||
_mcenc->candidates[i][1]=OC_DIV2(_mcenc->candidates[i][1]);
|
_mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
|
||||||
}
|
2*OC_MV_X(embs[_mbi].analysis_mv[1][_frame])
|
||||||
|
-OC_MV_X(embs[_mbi].analysis_mv[2][_frame])+accum_x,31);
|
||||||
|
_mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
|
||||||
|
2*OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])
|
||||||
|
-OC_MV_Y(embs[_mbi].analysis_mv[2][_frame])+accum_y,31);
|
||||||
|
ncandidates++;
|
||||||
_mcenc->ncandidates=ncandidates;
|
_mcenc->ncandidates=ncandidates;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
|
||||||
static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
|
static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
|
||||||
const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
|
const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
|
||||||
int _mvoffset0,int _mvoffset1,const unsigned char *_src,
|
int _mvoffset0,int _mvoffset1,const unsigned char *_src,
|
||||||
@ -170,20 +179,21 @@ static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
|
|||||||
}
|
}
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc,
|
static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc,
|
||||||
const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
|
const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
|
||||||
int _mvoffset0,int _mvoffset1,const unsigned char *_src,
|
int _mvoffset0,int _mvoffset1,const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride,unsigned _best_err){
|
const unsigned char *_ref,int _ystride,unsigned _best_err){
|
||||||
unsigned err;
|
unsigned err;
|
||||||
|
int dc;
|
||||||
int bi;
|
int bi;
|
||||||
err=0;
|
err=0;
|
||||||
for(bi=0;bi<4;bi++){
|
for(bi=0;bi<4;bi++){
|
||||||
ptrdiff_t frag_offs;
|
ptrdiff_t frag_offs;
|
||||||
frag_offs=_frag_buf_offs[_fragis[bi]];
|
frag_offs=_frag_buf_offs[_fragis[bi]];
|
||||||
err+=oc_enc_frag_satd2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0,
|
err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs,
|
||||||
_ref+frag_offs+_mvoffset1,_ystride,_best_err-err);
|
_ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride);
|
||||||
|
err+=abs(dc);
|
||||||
}
|
}
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
@ -219,9 +229,17 @@ static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
|
|||||||
err=0;
|
err=0;
|
||||||
for(bi=0;bi<4;bi++){
|
for(bi=0;bi<4;bi++){
|
||||||
ptrdiff_t frag_offs;
|
ptrdiff_t frag_offs;
|
||||||
|
int dc;
|
||||||
frag_offs=_frag_buf_offs[_fragis[bi]];
|
frag_offs=_frag_buf_offs[_fragis[bi]];
|
||||||
err+=oc_enc_frag_satd_thresh(_enc,
|
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
|
||||||
_src+frag_offs,_ref+frag_offs+mvoffset,_ystride,UINT_MAX);
|
err+=oc_enc_frag_satd(_enc,&dc,
|
||||||
|
_src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
|
||||||
|
err+=abs(dc);
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
err+=oc_enc_frag_sad(_enc,
|
||||||
|
_src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
@ -229,8 +247,11 @@ static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
|
|||||||
static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
|
static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
|
||||||
ptrdiff_t _frag_offs,int _dx,int _dy,
|
ptrdiff_t _frag_offs,int _dx,int _dy,
|
||||||
const unsigned char *_src,const unsigned char *_ref,int _ystride){
|
const unsigned char *_src,const unsigned char *_ref,int _ystride){
|
||||||
return oc_enc_frag_satd_thresh(_enc,
|
unsigned err;
|
||||||
_src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride,UINT_MAX);
|
int dc;
|
||||||
|
err=oc_enc_frag_satd(_enc,&dc,
|
||||||
|
_src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride);
|
||||||
|
return err+abs(dc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Perform a motion vector search for this macro block against a single
|
/*Perform a motion vector search for this macro block against a single
|
||||||
@ -239,11 +260,14 @@ static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
|
|||||||
the work can be shared.
|
the work can be shared.
|
||||||
The actual motion vector is stored in the appropriate place in the
|
The actual motion vector is stored in the appropriate place in the
|
||||||
oc_mb_enc_info structure.
|
oc_mb_enc_info structure.
|
||||||
_mcenc: The motion compensation context.
|
_accum: Drop frame/golden MV accumulators.
|
||||||
_accum: Drop frame/golden MV accumulators.
|
_mbi: The macro block index.
|
||||||
_mbi: The macro block index.
|
_frame: The frame to use for SATD calculations and refinement,
|
||||||
_frame: The frame to search, either OC_FRAME_PREV or OC_FRAME_GOLD.*/
|
either OC_FRAME_PREV or OC_FRAME_GOLD.
|
||||||
void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
|
_frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV,
|
||||||
|
OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/
|
||||||
|
void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame,
|
||||||
|
int _frame_full){
|
||||||
/*Note: Traditionally this search is done using a rate-distortion objective
|
/*Note: Traditionally this search is done using a rate-distortion objective
|
||||||
function of the form D+lambda*R.
|
function of the form D+lambda*R.
|
||||||
However, xiphmont tested this and found it produced a small degredation,
|
However, xiphmont tested this and found it produced a small degredation,
|
||||||
@ -264,6 +288,7 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
|
|||||||
const ptrdiff_t *fragis;
|
const ptrdiff_t *fragis;
|
||||||
const unsigned char *src;
|
const unsigned char *src;
|
||||||
const unsigned char *ref;
|
const unsigned char *ref;
|
||||||
|
const unsigned char *satd_ref;
|
||||||
int ystride;
|
int ystride;
|
||||||
oc_mb_enc_info *embs;
|
oc_mb_enc_info *embs;
|
||||||
ogg_int32_t hit_cache[31];
|
ogg_int32_t hit_cache[31];
|
||||||
@ -278,17 +303,18 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
|
|||||||
int bi;
|
int bi;
|
||||||
embs=_enc->mb_info;
|
embs=_enc->mb_info;
|
||||||
/*Find some candidate motion vectors.*/
|
/*Find some candidate motion vectors.*/
|
||||||
oc_mcenc_find_candidates(_enc,&mcenc,_accum,_mbi,_frame);
|
oc_mcenc_find_candidates_a(_enc,&mcenc,_accum,_mbi,_frame);
|
||||||
/*Clear the cache of locations we've examined.*/
|
/*Clear the cache of locations we've examined.*/
|
||||||
memset(hit_cache,0,sizeof(hit_cache));
|
memset(hit_cache,0,sizeof(hit_cache));
|
||||||
/*Start with the median predictor.*/
|
/*Start with the median predictor.*/
|
||||||
candx=mcenc.candidates[0][0];
|
candx=OC_DIV2(mcenc.candidates[0][0]);
|
||||||
candy=mcenc.candidates[0][1];
|
candy=OC_DIV2(mcenc.candidates[0][1]);
|
||||||
hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
|
hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
|
||||||
frag_buf_offs=_enc->state.frag_buf_offs;
|
frag_buf_offs=_enc->state.frag_buf_offs;
|
||||||
fragis=_enc->state.mb_maps[_mbi][0];
|
fragis=_enc->state.mb_maps[_mbi][0];
|
||||||
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
||||||
ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
|
ref=_enc->state.ref_frame_data[_frame_full];
|
||||||
|
satd_ref=_enc->state.ref_frame_data[_frame];
|
||||||
ystride=_enc->state.ref_ystride[0];
|
ystride=_enc->state.ref_ystride[0];
|
||||||
/*TODO: customize error function for speed/(quality+size) tradeoff.*/
|
/*TODO: customize error function for speed/(quality+size) tradeoff.*/
|
||||||
best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
|
best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
|
||||||
@ -317,8 +343,8 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
|
|||||||
t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
|
t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
|
||||||
/*Examine the candidates in set A.*/
|
/*Examine the candidates in set A.*/
|
||||||
for(ci=1;ci<mcenc.setb0;ci++){
|
for(ci=1;ci<mcenc.setb0;ci++){
|
||||||
candx=mcenc.candidates[ci][0];
|
candx=OC_DIV2(mcenc.candidates[ci][0]);
|
||||||
candy=mcenc.candidates[ci][1];
|
candy=OC_DIV2(mcenc.candidates[ci][1]);
|
||||||
/*If we've already examined this vector, then we would be using it if it
|
/*If we've already examined this vector, then we would be using it if it
|
||||||
was better than what we are using.*/
|
was better than what we are using.*/
|
||||||
hitbit=(ogg_int32_t)1<<candx+15;
|
hitbit=(ogg_int32_t)1<<candx+15;
|
||||||
@ -340,10 +366,11 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(best_err>t2){
|
if(best_err>t2){
|
||||||
|
oc_mcenc_find_candidates_b(_enc,&mcenc,_accum,_mbi,_frame);
|
||||||
/*Examine the candidates in set B.*/
|
/*Examine the candidates in set B.*/
|
||||||
for(;ci<mcenc.ncandidates;ci++){
|
for(;ci<mcenc.ncandidates;ci++){
|
||||||
candx=mcenc.candidates[ci][0];
|
candx=OC_DIV2(mcenc.candidates[ci][0]);
|
||||||
candy=mcenc.candidates[ci][1];
|
candy=OC_DIV2(mcenc.candidates[ci][1]);
|
||||||
hitbit=(ogg_int32_t)1<<candx+15;
|
hitbit=(ogg_int32_t)1<<candx+15;
|
||||||
if(hit_cache[candy+15]&hitbit)continue;
|
if(hit_cache[candy+15]&hitbit)continue;
|
||||||
hit_cache[candy+15]|=hitbit;
|
hit_cache[candy+15]|=hitbit;
|
||||||
@ -475,58 +502,50 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
|
|||||||
candx=best_vec[0];
|
candx=best_vec[0];
|
||||||
candy=best_vec[1];
|
candy=best_vec[1];
|
||||||
embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc,
|
embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc,
|
||||||
frag_buf_offs,fragis,candx,candy,src,ref,ystride);
|
frag_buf_offs,fragis,candx,candy,src,satd_ref,ystride);
|
||||||
embs[_mbi].analysis_mv[0][_frame][0]=(signed char)(candx<<1);
|
embs[_mbi].analysis_mv[0][_frame]=OC_MV(candx<<1,candy<<1);
|
||||||
embs[_mbi].analysis_mv[0][_frame][1]=(signed char)(candy<<1);
|
if(_frame==OC_FRAME_PREV&&_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
|
||||||
if(_frame==OC_FRAME_PREV){
|
|
||||||
for(bi=0;bi<4;bi++){
|
for(bi=0;bi<4;bi++){
|
||||||
candx=best_block_vec[bi][0];
|
candx=best_block_vec[bi][0];
|
||||||
candy=best_block_vec[bi][1];
|
candy=best_block_vec[bi][1];
|
||||||
embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc,
|
embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc,
|
||||||
frag_buf_offs[fragis[bi]],candx,candy,src,ref,ystride);
|
frag_buf_offs[fragis[bi]],candx,candy,src,satd_ref,ystride);
|
||||||
embs[_mbi].block_mv[bi][0]=(signed char)(candx<<1);
|
embs[_mbi].block_mv[bi]=OC_MV(candx<<1,candy<<1);
|
||||||
embs[_mbi].block_mv[bi][1]=(signed char)(candy<<1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){
|
void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){
|
||||||
oc_mv2 *mvs;
|
oc_mv2 *mvs;
|
||||||
int accum_p[2];
|
oc_mv accum_p;
|
||||||
int accum_g[2];
|
oc_mv accum_g;
|
||||||
|
oc_mv mv2_p;
|
||||||
mvs=_enc->mb_info[_mbi].analysis_mv;
|
mvs=_enc->mb_info[_mbi].analysis_mv;
|
||||||
if(_enc->prevframe_dropped){
|
if(_enc->prevframe_dropped)accum_p=mvs[0][OC_FRAME_PREV];
|
||||||
accum_p[0]=mvs[0][OC_FRAME_PREV][0];
|
else accum_p=0;
|
||||||
accum_p[1]=mvs[0][OC_FRAME_PREV][1];
|
accum_g=mvs[2][OC_FRAME_GOLD];
|
||||||
}
|
|
||||||
else accum_p[1]=accum_p[0]=0;
|
|
||||||
accum_g[0]=mvs[2][OC_FRAME_GOLD][0];
|
|
||||||
accum_g[1]=mvs[2][OC_FRAME_GOLD][1];
|
|
||||||
mvs[0][OC_FRAME_PREV][0]-=mvs[2][OC_FRAME_PREV][0];
|
|
||||||
mvs[0][OC_FRAME_PREV][1]-=mvs[2][OC_FRAME_PREV][1];
|
|
||||||
/*Move the motion vector predictors back a frame.*/
|
/*Move the motion vector predictors back a frame.*/
|
||||||
memmove(mvs+1,mvs,2*sizeof(*mvs));
|
mv2_p=mvs[2][OC_FRAME_PREV];
|
||||||
|
mvs[2][OC_FRAME_GOLD]=mvs[1][OC_FRAME_GOLD];
|
||||||
|
mvs[2][OC_FRAME_PREV]=mvs[1][OC_FRAME_PREV];
|
||||||
|
mvs[1][OC_FRAME_GOLD]=mvs[0][OC_FRAME_GOLD];
|
||||||
|
mvs[1][OC_FRAME_PREV]=OC_MV_SUB(mvs[0][OC_FRAME_PREV],mv2_p);
|
||||||
/*Search the last frame.*/
|
/*Search the last frame.*/
|
||||||
oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV);
|
oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV,OC_FRAME_PREV_ORIG);
|
||||||
mvs[2][OC_FRAME_PREV][0]=accum_p[0];
|
mvs[2][OC_FRAME_PREV]=accum_p;
|
||||||
mvs[2][OC_FRAME_PREV][1]=accum_p[1];
|
|
||||||
/*GOLDEN MVs are different from PREV MVs in that they're each absolute
|
/*GOLDEN MVs are different from PREV MVs in that they're each absolute
|
||||||
offsets from some frame in the past rather than relative offsets from the
|
offsets from some frame in the past rather than relative offsets from the
|
||||||
frame before.
|
frame before.
|
||||||
For predictor calculation to make sense, we need them to be in the same
|
For predictor calculation to make sense, we need them to be in the same
|
||||||
form as PREV MVs.*/
|
form as PREV MVs.*/
|
||||||
mvs[1][OC_FRAME_GOLD][0]-=mvs[2][OC_FRAME_GOLD][0];
|
mvs[1][OC_FRAME_GOLD]=OC_MV_SUB(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]);
|
||||||
mvs[1][OC_FRAME_GOLD][1]-=mvs[2][OC_FRAME_GOLD][1];
|
mvs[2][OC_FRAME_GOLD]=OC_MV_SUB(mvs[2][OC_FRAME_GOLD],accum_g);
|
||||||
mvs[2][OC_FRAME_GOLD][0]-=accum_g[0];
|
|
||||||
mvs[2][OC_FRAME_GOLD][1]-=accum_g[1];
|
|
||||||
/*Search the golden frame.*/
|
/*Search the golden frame.*/
|
||||||
oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD);
|
oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD,OC_FRAME_GOLD_ORIG);
|
||||||
/*Put GOLDEN MVs back into absolute offset form.
|
/*Put GOLDEN MVs back into absolute offset form.
|
||||||
The newest MV is already an absolute offset.*/
|
The newest MV is already an absolute offset.*/
|
||||||
mvs[2][OC_FRAME_GOLD][0]+=accum_g[0];
|
mvs[2][OC_FRAME_GOLD]=OC_MV_ADD(mvs[2][OC_FRAME_GOLD],accum_g);
|
||||||
mvs[2][OC_FRAME_GOLD][1]+=accum_g[1];
|
mvs[1][OC_FRAME_GOLD]=OC_MV_ADD(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]);
|
||||||
mvs[1][OC_FRAME_GOLD][0]+=mvs[2][OC_FRAME_GOLD][0];
|
|
||||||
mvs[1][OC_FRAME_GOLD][1]+=mvs[2][OC_FRAME_GOLD][1];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
@ -543,7 +562,7 @@ static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
|
|||||||
int sitei;
|
int sitei;
|
||||||
int err;
|
int err;
|
||||||
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
||||||
ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_framei]];
|
ref=_enc->state.ref_frame_data[_framei];
|
||||||
frag_buf_offs=_enc->state.frag_buf_offs;
|
frag_buf_offs=_enc->state.frag_buf_offs;
|
||||||
fragis=_enc->state.mb_maps[_mbi][0];
|
fragis=_enc->state.mb_maps[_mbi][0];
|
||||||
ystride=_enc->state.ref_ystride[0];
|
ystride=_enc->state.ref_ystride[0];
|
||||||
@ -598,7 +617,7 @@ static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
|
|||||||
int sitei;
|
int sitei;
|
||||||
int err;
|
int err;
|
||||||
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
||||||
ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
|
ref=_enc->state.ref_frame_data[_frame];
|
||||||
frag_buf_offs=_enc->state.frag_buf_offs;
|
frag_buf_offs=_enc->state.frag_buf_offs;
|
||||||
fragis=_enc->state.mb_maps[_mbi][0];
|
fragis=_enc->state.mb_maps[_mbi][0];
|
||||||
ystride=_enc->state.ref_ystride[0];
|
ystride=_enc->state.ref_ystride[0];
|
||||||
@ -627,8 +646,14 @@ static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
|
|||||||
ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
|
ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
|
||||||
mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
|
mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
|
||||||
mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
|
mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
|
||||||
err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
|
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
|
||||||
mvoffset0,mvoffset1,src,ref,ystride,_best_err);
|
err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
|
||||||
|
mvoffset0,mvoffset1,src,ref,ystride,_best_err);
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
|
||||||
|
mvoffset0,mvoffset1,src,ref,ystride,_best_err);
|
||||||
|
}
|
||||||
if(err<_best_err){
|
if(err<_best_err){
|
||||||
_best_err=err;
|
_best_err=err;
|
||||||
best_site=site;
|
best_site=site;
|
||||||
@ -643,12 +668,11 @@ void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){
|
|||||||
oc_mb_enc_info *embs;
|
oc_mb_enc_info *embs;
|
||||||
int vec[2];
|
int vec[2];
|
||||||
embs=_enc->mb_info;
|
embs=_enc->mb_info;
|
||||||
vec[0]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][0]);
|
vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].analysis_mv[0][_frame]));
|
||||||
vec[1]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][1]);
|
vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].analysis_mv[0][_frame]));
|
||||||
embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc,
|
embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc,
|
||||||
_mbi,vec,embs[_mbi].satd[_frame],_frame);
|
_mbi,vec,embs[_mbi].satd[_frame],_frame);
|
||||||
embs[_mbi].analysis_mv[0][_frame][0]=(signed char)vec[0];
|
embs[_mbi].analysis_mv[0][_frame]=OC_MV(vec[0],vec[1]);
|
||||||
embs[_mbi].analysis_mv[0][_frame][1]=(signed char)vec[1];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
@ -704,6 +728,7 @@ static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
|
|||||||
best_site=4;
|
best_site=4;
|
||||||
for(sitei=0;sitei<8;sitei++){
|
for(sitei=0;sitei<8;sitei++){
|
||||||
unsigned err;
|
unsigned err;
|
||||||
|
int dc;
|
||||||
int site;
|
int site;
|
||||||
int xmask;
|
int xmask;
|
||||||
int ymask;
|
int ymask;
|
||||||
@ -723,8 +748,9 @@ static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
|
|||||||
ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
|
ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
|
||||||
mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
|
mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
|
||||||
mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
|
mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
|
||||||
err=oc_enc_frag_satd2_thresh(_enc,_src,
|
err=oc_enc_frag_satd2(_enc,&dc,_src,
|
||||||
_ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
|
_ref+mvoffset0,_ref+mvoffset1,_ystride);
|
||||||
|
err+=abs(dc);
|
||||||
if(err<_best_err){
|
if(err<_best_err){
|
||||||
_best_err=err;
|
_best_err=err;
|
||||||
best_site=site;
|
best_site=site;
|
||||||
@ -748,7 +774,7 @@ void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
|
|||||||
frag_buf_offs=_enc->state.frag_buf_offs;
|
frag_buf_offs=_enc->state.frag_buf_offs;
|
||||||
fragis=_enc->state.mb_maps[_mbi][0];
|
fragis=_enc->state.mb_maps[_mbi][0];
|
||||||
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
||||||
ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
|
ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
|
||||||
offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
|
offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
|
||||||
offset_y[3]=offset_y[5]=0;
|
offset_y[3]=offset_y[5]=0;
|
||||||
offset_y[6]=offset_y[7]=offset_y[8]=ystride;
|
offset_y[6]=offset_y[7]=offset_y[8]=ystride;
|
||||||
@ -757,11 +783,10 @@ void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
|
|||||||
ptrdiff_t frag_offs;
|
ptrdiff_t frag_offs;
|
||||||
int vec[2];
|
int vec[2];
|
||||||
frag_offs=frag_buf_offs[fragis[bi]];
|
frag_offs=frag_buf_offs[fragis[bi]];
|
||||||
vec[0]=OC_DIV2(embs[_mbi].block_mv[bi][0]);
|
vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].block_mv[bi]));
|
||||||
vec[1]=OC_DIV2(embs[_mbi].block_mv[bi][1]);
|
vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].block_mv[bi]));
|
||||||
embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec,
|
embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec,
|
||||||
src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]);
|
src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]);
|
||||||
embs[_mbi].ref_mv[bi][0]=(signed char)vec[0];
|
embs[_mbi].ref_mv[bi]=OC_MV(vec[0],vec[1]);
|
||||||
embs[_mbi].ref_mv[bi][1]=(signed char)vec[1];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
4611
thirdparty/libtheora/modedec.h
vendored
4611
thirdparty/libtheora/modedec.h
vendored
File diff suppressed because it is too large
Load Diff
2
thirdparty/libtheora/ocintrin.h
vendored
2
thirdparty/libtheora/ocintrin.h
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: ocintrin.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
|
@ -1,38 +0,0 @@
|
|||||||
From 0ae66d565e6bead8604d312bc1a4e9dccf245c88 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Tim Terriberry <tterribe@xiph.org>
|
|
||||||
Date: Tue, 8 May 2012 02:51:57 +0000
|
|
||||||
Subject: [PATCH] Fix pp_sharp_mod calculation.
|
|
||||||
|
|
||||||
This was broken when the dequant_tables indexing changed in commit
|
|
||||||
r16102, but it only affected post-processing quality, so we never
|
|
||||||
noticed.
|
|
||||||
With gcc 4.8.0, this can now trigger a segfault during decoder
|
|
||||||
initialization.
|
|
||||||
|
|
||||||
svn path=/trunk/theora/; revision=18268
|
|
||||||
---
|
|
||||||
decode.c | 8 ++++----
|
|
||||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/decode.c b/decode.c
|
|
||||||
index b803505..9f2516a 100644
|
|
||||||
--- a/decode.c
|
|
||||||
+++ b/decode.c
|
|
||||||
@@ -400,10 +400,10 @@ static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
|
|
||||||
int qsum;
|
|
||||||
qsum=0;
|
|
||||||
for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
|
|
||||||
- qsum+=_dec->state.dequant_tables[qti][pli][qi][12]+
|
|
||||||
- _dec->state.dequant_tables[qti][pli][qi][17]+
|
|
||||||
- _dec->state.dequant_tables[qti][pli][qi][18]+
|
|
||||||
- _dec->state.dequant_tables[qti][pli][qi][24]<<(pli==0);
|
|
||||||
+ qsum+=_dec->state.dequant_tables[qi][pli][qti][12]+
|
|
||||||
+ _dec->state.dequant_tables[qi][pli][qti][17]+
|
|
||||||
+ _dec->state.dequant_tables[qi][pli][qti][18]+
|
|
||||||
+ _dec->state.dequant_tables[qi][pli][qti][24]<<(pli==0);
|
|
||||||
}
|
|
||||||
_dec->pp_sharp_mod[qi]=-(qsum>>11);
|
|
||||||
}
|
|
||||||
--
|
|
||||||
2.11.0
|
|
||||||
|
|
10
thirdparty/libtheora/quant.c
vendored
10
thirdparty/libtheora/quant.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: quant.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -21,6 +21,14 @@
|
|||||||
#include "quant.h"
|
#include "quant.h"
|
||||||
#include "decint.h"
|
#include "decint.h"
|
||||||
|
|
||||||
|
/*The maximum output of the DCT with +/- 255 inputs is +/- 8157.
|
||||||
|
These minimum quantizers ensure the result after quantization (and after
|
||||||
|
prediction for DC) will be no more than +/- 510.
|
||||||
|
The tokenization system can handle values up to +/- 580, so there is no need
|
||||||
|
to do any coefficient clamping.
|
||||||
|
I would rather have allowed smaller quantizers and had to clamp, but these
|
||||||
|
minimums were required when constructing the original VP3 matrices and have
|
||||||
|
been formalized in the spec.*/
|
||||||
static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
|
static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
|
||||||
static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
|
static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
|
||||||
|
|
||||||
|
2
thirdparty/libtheora/quant.h
vendored
2
thirdparty/libtheora/quant.h
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: quant.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
|
26
thirdparty/libtheora/rate.c
vendored
26
thirdparty/libtheora/rate.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: rate.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@ -190,7 +190,8 @@ void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _qti){
|
|||||||
This may need to be revised if the R-D cost estimation or qii flag
|
This may need to be revised if the R-D cost estimation or qii flag
|
||||||
optimization strategies change.*/
|
optimization strategies change.*/
|
||||||
nqis=1;
|
nqis=1;
|
||||||
if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible){
|
if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible&&
|
||||||
|
_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
|
||||||
qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MAXI(qi-1,0),0,
|
qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MAXI(qi-1,0),0,
|
||||||
lq+(OC_Q57(7)+5)/10);
|
lq+(OC_Q57(7)+5)/10);
|
||||||
if(qi1!=qi)_enc->state.qis[nqis++]=qi1;
|
if(qi1!=qi)_enc->state.qis[nqis++]=qi1;
|
||||||
@ -761,6 +762,7 @@ int oc_enc_update_rc_state(oc_enc_ctx *_enc,
|
|||||||
_enc->rc.cur_metrics.log_scale=oc_q57_to_q24(log_scale);
|
_enc->rc.cur_metrics.log_scale=oc_q57_to_q24(log_scale);
|
||||||
_enc->rc.cur_metrics.dup_count=_enc->dup_count;
|
_enc->rc.cur_metrics.dup_count=_enc->dup_count;
|
||||||
_enc->rc.cur_metrics.frame_type=_enc->state.frame_type;
|
_enc->rc.cur_metrics.frame_type=_enc->state.frame_type;
|
||||||
|
_enc->rc.cur_metrics.activity_avg=_enc->activity_avg;
|
||||||
_enc->rc.twopass_buffer_bytes=0;
|
_enc->rc.twopass_buffer_bytes=0;
|
||||||
}break;
|
}break;
|
||||||
case 2:{
|
case 2:{
|
||||||
@ -863,9 +865,9 @@ int oc_enc_update_rc_state(oc_enc_ctx *_enc,
|
|||||||
return dropped;
|
return dropped;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define OC_RC_2PASS_VERSION (1)
|
#define OC_RC_2PASS_VERSION (2)
|
||||||
#define OC_RC_2PASS_HDR_SZ (38)
|
#define OC_RC_2PASS_HDR_SZ (38)
|
||||||
#define OC_RC_2PASS_PACKET_SZ (8)
|
#define OC_RC_2PASS_PACKET_SZ (12)
|
||||||
|
|
||||||
static void oc_rc_buffer_val(oc_rc_state *_rc,ogg_int64_t _val,int _bytes){
|
static void oc_rc_buffer_val(oc_rc_state *_rc,ogg_int64_t _val,int _bytes){
|
||||||
while(_bytes-->0){
|
while(_bytes-->0){
|
||||||
@ -900,6 +902,7 @@ int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf){
|
|||||||
oc_rc_buffer_val(&_enc->rc,
|
oc_rc_buffer_val(&_enc->rc,
|
||||||
_enc->rc.cur_metrics.dup_count|_enc->rc.cur_metrics.frame_type<<31,4);
|
_enc->rc.cur_metrics.dup_count|_enc->rc.cur_metrics.frame_type<<31,4);
|
||||||
oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.log_scale,4);
|
oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.log_scale,4);
|
||||||
|
oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.activity_avg,4);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if(_enc->packet_state==OC_PACKET_DONE&&
|
else if(_enc->packet_state==OC_PACKET_DONE&&
|
||||||
@ -1050,16 +1053,19 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
|
|||||||
if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_PACKET_SZ){
|
if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_PACKET_SZ){
|
||||||
ogg_uint32_t dup_count;
|
ogg_uint32_t dup_count;
|
||||||
ogg_int32_t log_scale;
|
ogg_int32_t log_scale;
|
||||||
|
unsigned activity;
|
||||||
int qti;
|
int qti;
|
||||||
int arg;
|
int arg;
|
||||||
/*Read the metrics for the next frame.*/
|
/*Read the metrics for the next frame.*/
|
||||||
dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
|
dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
|
||||||
log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
|
log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
|
||||||
|
activity=oc_rc_unbuffer_val(&_enc->rc,4);
|
||||||
_enc->rc.cur_metrics.log_scale=log_scale;
|
_enc->rc.cur_metrics.log_scale=log_scale;
|
||||||
qti=(dup_count&0x80000000)>>31;
|
qti=(dup_count&0x80000000)>>31;
|
||||||
_enc->rc.cur_metrics.dup_count=dup_count&0x7FFFFFFF;
|
_enc->rc.cur_metrics.dup_count=dup_count&0x7FFFFFFF;
|
||||||
_enc->rc.cur_metrics.frame_type=qti;
|
_enc->rc.cur_metrics.frame_type=qti;
|
||||||
_enc->rc.twopass_force_kf=qti==OC_INTRA_FRAME;
|
_enc->rc.twopass_force_kf=qti==OC_INTRA_FRAME;
|
||||||
|
_enc->activity_avg=_enc->rc.cur_metrics.activity_avg=activity;
|
||||||
/*"Helpfully" set the dup count back to what it was in pass 1.*/
|
/*"Helpfully" set the dup count back to what it was in pass 1.*/
|
||||||
arg=_enc->rc.cur_metrics.dup_count;
|
arg=_enc->rc.cur_metrics.dup_count;
|
||||||
th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
|
th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
|
||||||
@ -1070,8 +1076,8 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
|
|||||||
else{
|
else{
|
||||||
int frames_needed;
|
int frames_needed;
|
||||||
/*We're using a finite buffer:*/
|
/*We're using a finite buffer:*/
|
||||||
frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay
|
frames_needed=OC_MINI(_enc->rc.buf_delay-OC_MINI(_enc->rc.buf_delay,
|
||||||
-(_enc->rc.scale_window_end-_enc->rc.scale_window0),
|
_enc->rc.scale_window_end-_enc->rc.scale_window0),
|
||||||
_enc->rc.frames_left[0]+_enc->rc.frames_left[1]
|
_enc->rc.frames_left[0]+_enc->rc.frames_left[1]
|
||||||
-_enc->rc.nframes[0]-_enc->rc.nframes[1]);
|
-_enc->rc.nframes[0]-_enc->rc.nframes[1]);
|
||||||
while(frames_needed>0){
|
while(frames_needed>0){
|
||||||
@ -1087,9 +1093,11 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
|
|||||||
ogg_uint32_t dup_count;
|
ogg_uint32_t dup_count;
|
||||||
ogg_int32_t log_scale;
|
ogg_int32_t log_scale;
|
||||||
int qti;
|
int qti;
|
||||||
|
unsigned activity;
|
||||||
/*Read the metrics for the next frame.*/
|
/*Read the metrics for the next frame.*/
|
||||||
dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
|
dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
|
||||||
log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
|
log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
|
||||||
|
activity=oc_rc_unbuffer_val(&_enc->rc,4);
|
||||||
/*Add the to the circular buffer.*/
|
/*Add the to the circular buffer.*/
|
||||||
fmi=_enc->rc.frame_metrics_head+_enc->rc.nframe_metrics++;
|
fmi=_enc->rc.frame_metrics_head+_enc->rc.nframe_metrics++;
|
||||||
if(fmi>=_enc->rc.cframe_metrics)fmi-=_enc->rc.cframe_metrics;
|
if(fmi>=_enc->rc.cframe_metrics)fmi-=_enc->rc.cframe_metrics;
|
||||||
@ -1098,6 +1106,7 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
|
|||||||
qti=(dup_count&0x80000000)>>31;
|
qti=(dup_count&0x80000000)>>31;
|
||||||
m->dup_count=dup_count&0x7FFFFFFF;
|
m->dup_count=dup_count&0x7FFFFFFF;
|
||||||
m->frame_type=qti;
|
m->frame_type=qti;
|
||||||
|
m->activity_avg=activity;
|
||||||
/*And accumulate the statistics over the window.*/
|
/*And accumulate the statistics over the window.*/
|
||||||
_enc->rc.nframes[qti]++;
|
_enc->rc.nframes[qti]++;
|
||||||
_enc->rc.nframes[2]+=m->dup_count;
|
_enc->rc.nframes[2]+=m->dup_count;
|
||||||
@ -1105,8 +1114,8 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
|
|||||||
_enc->rc.scale_window_end+=m->dup_count+1;
|
_enc->rc.scale_window_end+=m->dup_count+1;
|
||||||
/*Compute an upper bound on the number of remaining packets needed
|
/*Compute an upper bound on the number of remaining packets needed
|
||||||
for the current window.*/
|
for the current window.*/
|
||||||
frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay
|
frames_needed=OC_MINI(_enc->rc.buf_delay-OC_MINI(_enc->rc.buf_delay,
|
||||||
-(_enc->rc.scale_window_end-_enc->rc.scale_window0),
|
_enc->rc.scale_window_end-_enc->rc.scale_window0),
|
||||||
_enc->rc.frames_left[0]+_enc->rc.frames_left[1]
|
_enc->rc.frames_left[0]+_enc->rc.frames_left[1]
|
||||||
-_enc->rc.nframes[0]-_enc->rc.nframes[1]);
|
-_enc->rc.nframes[0]-_enc->rc.nframes[1]);
|
||||||
/*Clear the buffer for the next frame.*/
|
/*Clear the buffer for the next frame.*/
|
||||||
@ -1124,6 +1133,7 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
|
|||||||
*(_enc->rc.frame_metrics+_enc->rc.frame_metrics_head);
|
*(_enc->rc.frame_metrics+_enc->rc.frame_metrics_head);
|
||||||
_enc->rc.twopass_force_kf=
|
_enc->rc.twopass_force_kf=
|
||||||
_enc->rc.cur_metrics.frame_type==OC_INTRA_FRAME;
|
_enc->rc.cur_metrics.frame_type==OC_INTRA_FRAME;
|
||||||
|
_enc->activity_avg=_enc->rc.cur_metrics.activity_avg;
|
||||||
/*"Helpfully" set the dup count back to what it was in pass 1.*/
|
/*"Helpfully" set the dup count back to what it was in pass 1.*/
|
||||||
arg=_enc->rc.cur_metrics.dup_count;
|
arg=_enc->rc.cur_metrics.dup_count;
|
||||||
th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
|
th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
|
||||||
|
298
thirdparty/libtheora/state.c
vendored
298
thirdparty/libtheora/state.c
vendored
@ -11,25 +11,93 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: state.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "internal.h"
|
#include "state.h"
|
||||||
#if defined(OC_X86_ASM)
|
|
||||||
#if defined(_MSC_VER)
|
|
||||||
# include "x86_vc/x86int.h"
|
|
||||||
#else
|
|
||||||
# include "x86/x86int.h"
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#if defined(OC_DUMP_IMAGES)
|
#if defined(OC_DUMP_IMAGES)
|
||||||
# include <stdio.h>
|
# include <stdio.h>
|
||||||
# include "png.h"
|
# include "png.h"
|
||||||
|
# include "zlib.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||||
|
block when 4 different motion vectors are specified in the luma plane.
|
||||||
|
This version is for use with chroma decimated in the X and Y directions
|
||||||
|
(4:2:0).
|
||||||
|
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||||
|
_lbmvs: The luma block-level motion vectors.*/
|
||||||
|
static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||||
|
int dx;
|
||||||
|
int dy;
|
||||||
|
dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1])
|
||||||
|
+OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]);
|
||||||
|
dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1])
|
||||||
|
+OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]);
|
||||||
|
_cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,2,2),OC_DIV_ROUND_POW2(dy,2,2));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||||
|
block when 4 different motion vectors are specified in the luma plane.
|
||||||
|
This version is for use with chroma decimated in the Y direction.
|
||||||
|
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||||
|
_lbmvs: The luma block-level motion vectors.*/
|
||||||
|
static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||||
|
int dx;
|
||||||
|
int dy;
|
||||||
|
dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[2]);
|
||||||
|
dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[2]);
|
||||||
|
_cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
|
||||||
|
dx=OC_MV_X(_lbmvs[1])+OC_MV_X(_lbmvs[3]);
|
||||||
|
dy=OC_MV_Y(_lbmvs[1])+OC_MV_Y(_lbmvs[3]);
|
||||||
|
_cbmvs[1]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||||
|
block when 4 different motion vectors are specified in the luma plane.
|
||||||
|
This version is for use with chroma decimated in the X direction (4:2:2).
|
||||||
|
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||||
|
_lbmvs: The luma block-level motion vectors.*/
|
||||||
|
static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||||
|
int dx;
|
||||||
|
int dy;
|
||||||
|
dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1]);
|
||||||
|
dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1]);
|
||||||
|
_cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
|
||||||
|
dx=OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]);
|
||||||
|
dy=OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]);
|
||||||
|
_cbmvs[2]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*The function used to fill in the chroma plane motion vectors for a macro
|
||||||
|
block when 4 different motion vectors are specified in the luma plane.
|
||||||
|
This version is for use with no chroma decimation (4:4:4).
|
||||||
|
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||||
|
_lmbmv: The luma macro-block level motion vector to fill in for use in
|
||||||
|
prediction.
|
||||||
|
_lbmvs: The luma block-level motion vectors.*/
|
||||||
|
static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
|
||||||
|
_cbmvs[0]=_lbmvs[0];
|
||||||
|
_cbmvs[1]=_lbmvs[1];
|
||||||
|
_cbmvs[2]=_lbmvs[2];
|
||||||
|
_cbmvs[3]=_lbmvs[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
/*A table of functions used to fill in the chroma plane motion vectors for a
|
||||||
|
macro block when 4 different motion vectors are specified in the luma
|
||||||
|
plane.*/
|
||||||
|
const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
|
||||||
|
(oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
|
||||||
|
(oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
|
||||||
|
(oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
|
||||||
|
(oc_set_chroma_mvs_func)oc_set_chroma_mvs11
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*Returns the fragment index of the top-left block in a macro block.
|
/*Returns the fragment index of the top-left block in a macro block.
|
||||||
This can be used to test whether or not the whole macro block is valid.
|
This can be used to test whether or not the whole macro block is valid.
|
||||||
_sb_map: The super block map.
|
_sb_map: The super block map.
|
||||||
@ -92,7 +160,7 @@ static void oc_sb_create_plane_mapping(oc_sb_map _sb_maps[],
|
|||||||
if(jmax>4)jmax=4;
|
if(jmax>4)jmax=4;
|
||||||
else if(jmax<=0)break;
|
else if(jmax<=0)break;
|
||||||
/*By default, set all fragment indices to -1.*/
|
/*By default, set all fragment indices to -1.*/
|
||||||
memset(_sb_maps[sbi][0],0xFF,sizeof(_sb_maps[sbi]));
|
memset(_sb_maps[sbi],0xFF,sizeof(_sb_maps[sbi]));
|
||||||
/*Fill in the fragment map for this super block.*/
|
/*Fill in the fragment map for this super block.*/
|
||||||
xfrag=yfrag+x;
|
xfrag=yfrag+x;
|
||||||
for(i=0;i<imax;i++){
|
for(i=0;i<imax;i++){
|
||||||
@ -186,10 +254,14 @@ static void oc_mb_fill_cmapping10(oc_mb_map_plane _mb_map[3],
|
|||||||
This version is for use with no chroma decimation (4:4:4).
|
This version is for use with no chroma decimation (4:4:4).
|
||||||
This uses the already filled-in luma plane values.
|
This uses the already filled-in luma plane values.
|
||||||
_mb_map: The macro block map to fill.
|
_mb_map: The macro block map to fill.
|
||||||
_fplanes: The descriptions of the fragment planes.*/
|
_fplanes: The descriptions of the fragment planes.
|
||||||
|
_xfrag0: The X location of the upper-left hand fragment in the luma plane.
|
||||||
|
_yfrag0: The Y location of the upper-left hand fragment in the luma plane.*/
|
||||||
static void oc_mb_fill_cmapping11(oc_mb_map_plane _mb_map[3],
|
static void oc_mb_fill_cmapping11(oc_mb_map_plane _mb_map[3],
|
||||||
const oc_fragment_plane _fplanes[3]){
|
const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){
|
||||||
int k;
|
int k;
|
||||||
|
(void)_xfrag0;
|
||||||
|
(void)_yfrag0;
|
||||||
for(k=0;k<4;k++){
|
for(k=0;k<4;k++){
|
||||||
_mb_map[1][k]=_mb_map[0][k]+_fplanes[1].froffset;
|
_mb_map[1][k]=_mb_map[0][k]+_fplanes[1].froffset;
|
||||||
_mb_map[2][k]=_mb_map[0][k]+_fplanes[2].froffset;
|
_mb_map[2][k]=_mb_map[0][k]+_fplanes[2].froffset;
|
||||||
@ -211,7 +283,7 @@ static const oc_mb_fill_cmapping_func OC_MB_FILL_CMAPPING_TABLE[4]={
|
|||||||
oc_mb_fill_cmapping00,
|
oc_mb_fill_cmapping00,
|
||||||
oc_mb_fill_cmapping01,
|
oc_mb_fill_cmapping01,
|
||||||
oc_mb_fill_cmapping10,
|
oc_mb_fill_cmapping10,
|
||||||
(oc_mb_fill_cmapping_func)oc_mb_fill_cmapping11
|
oc_mb_fill_cmapping11
|
||||||
};
|
};
|
||||||
|
|
||||||
/*Fills in the mapping from macro blocks to their corresponding fragment
|
/*Fills in the mapping from macro blocks to their corresponding fragment
|
||||||
@ -469,7 +541,7 @@ static void oc_state_frarray_clear(oc_theora_state *_state){
|
|||||||
unrestricted motion vectors without special casing the boundary.
|
unrestricted motion vectors without special casing the boundary.
|
||||||
If chroma is decimated in either direction, the padding is reduced by a
|
If chroma is decimated in either direction, the padding is reduced by a
|
||||||
factor of 2 on the appropriate sides.
|
factor of 2 on the appropriate sides.
|
||||||
_nrefs: The number of reference buffers to init; must be 3 or 4.*/
|
_nrefs: The number of reference buffers to init; must be in the range 3...6.*/
|
||||||
static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
||||||
th_info *info;
|
th_info *info;
|
||||||
unsigned char *ref_frame_data;
|
unsigned char *ref_frame_data;
|
||||||
@ -481,6 +553,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
|||||||
int yheight;
|
int yheight;
|
||||||
int chstride;
|
int chstride;
|
||||||
int cheight;
|
int cheight;
|
||||||
|
ptrdiff_t align;
|
||||||
ptrdiff_t yoffset;
|
ptrdiff_t yoffset;
|
||||||
ptrdiff_t coffset;
|
ptrdiff_t coffset;
|
||||||
ptrdiff_t *frag_buf_offs;
|
ptrdiff_t *frag_buf_offs;
|
||||||
@ -489,33 +562,38 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
|||||||
int vdec;
|
int vdec;
|
||||||
int rfi;
|
int rfi;
|
||||||
int pli;
|
int pli;
|
||||||
if(_nrefs<3||_nrefs>4)return TH_EINVAL;
|
if(_nrefs<3||_nrefs>6)return TH_EINVAL;
|
||||||
info=&_state->info;
|
info=&_state->info;
|
||||||
/*Compute the image buffer parameters for each plane.*/
|
/*Compute the image buffer parameters for each plane.*/
|
||||||
hdec=!(info->pixel_fmt&1);
|
hdec=!(info->pixel_fmt&1);
|
||||||
vdec=!(info->pixel_fmt&2);
|
vdec=!(info->pixel_fmt&2);
|
||||||
yhstride=info->frame_width+2*OC_UMV_PADDING;
|
yhstride=info->frame_width+2*OC_UMV_PADDING;
|
||||||
yheight=info->frame_height+2*OC_UMV_PADDING;
|
yheight=info->frame_height+2*OC_UMV_PADDING;
|
||||||
chstride=yhstride>>hdec;
|
/*Require 16-byte aligned rows in the chroma planes.*/
|
||||||
|
chstride=(yhstride>>hdec)+15&~15;
|
||||||
cheight=yheight>>vdec;
|
cheight=yheight>>vdec;
|
||||||
yplane_sz=yhstride*(size_t)yheight;
|
yplane_sz=yhstride*(size_t)yheight;
|
||||||
cplane_sz=chstride*(size_t)cheight;
|
cplane_sz=chstride*(size_t)cheight;
|
||||||
yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride;
|
yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride;
|
||||||
coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride;
|
coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride;
|
||||||
ref_frame_sz=yplane_sz+2*cplane_sz;
|
/*Although we guarantee the rows of the chroma planes are a multiple of 16
|
||||||
|
bytes, the initial padding on the first row may only be 8 bytes.
|
||||||
|
Compute the offset needed to the actual image data to a multiple of 16.*/
|
||||||
|
align=-coffset&15;
|
||||||
|
ref_frame_sz=yplane_sz+2*cplane_sz+16;
|
||||||
ref_frame_data_sz=_nrefs*ref_frame_sz;
|
ref_frame_data_sz=_nrefs*ref_frame_sz;
|
||||||
/*Check for overflow.
|
/*Check for overflow.
|
||||||
The same caveats apply as for oc_state_frarray_init().*/
|
The same caveats apply as for oc_state_frarray_init().*/
|
||||||
if(yplane_sz/yhstride!=yheight||2*cplane_sz<cplane_sz||
|
if(yplane_sz/yhstride!=(size_t)yheight||2*cplane_sz+16<cplane_sz||
|
||||||
ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){
|
ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){
|
||||||
return TH_EIMPL;
|
return TH_EIMPL;
|
||||||
}
|
}
|
||||||
ref_frame_data=_ogg_malloc(ref_frame_data_sz);
|
ref_frame_data=oc_aligned_malloc(ref_frame_data_sz,16);
|
||||||
frag_buf_offs=_state->frag_buf_offs=
|
frag_buf_offs=_state->frag_buf_offs=
|
||||||
_ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs));
|
_ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs));
|
||||||
if(ref_frame_data==NULL||frag_buf_offs==NULL){
|
if(ref_frame_data==NULL||frag_buf_offs==NULL){
|
||||||
_ogg_free(frag_buf_offs);
|
_ogg_free(frag_buf_offs);
|
||||||
_ogg_free(ref_frame_data);
|
oc_aligned_free(ref_frame_data);
|
||||||
return TH_EFAULT;
|
return TH_EFAULT;
|
||||||
}
|
}
|
||||||
/*Set up the width, height and stride for the image buffers.*/
|
/*Set up the width, height and stride for the image buffers.*/
|
||||||
@ -532,15 +610,15 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
|||||||
memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0],
|
memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0],
|
||||||
sizeof(_state->ref_frame_bufs[0]));
|
sizeof(_state->ref_frame_bufs[0]));
|
||||||
}
|
}
|
||||||
|
_state->ref_frame_handle=ref_frame_data;
|
||||||
/*Set up the data pointers for the image buffers.*/
|
/*Set up the data pointers for the image buffers.*/
|
||||||
for(rfi=0;rfi<_nrefs;rfi++){
|
for(rfi=0;rfi<_nrefs;rfi++){
|
||||||
_state->ref_frame_data[rfi]=ref_frame_data;
|
|
||||||
_state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset;
|
_state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset;
|
||||||
ref_frame_data+=yplane_sz;
|
ref_frame_data+=yplane_sz+align;
|
||||||
_state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset;
|
_state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset;
|
||||||
ref_frame_data+=cplane_sz;
|
ref_frame_data+=cplane_sz;
|
||||||
_state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset;
|
_state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset;
|
||||||
ref_frame_data+=cplane_sz;
|
ref_frame_data+=cplane_sz+(16-align);
|
||||||
/*Flip the buffer upside down.
|
/*Flip the buffer upside down.
|
||||||
This allows us to decode Theora's bottom-up frames in their natural
|
This allows us to decode Theora's bottom-up frames in their natural
|
||||||
order, yet return a top-down buffer with a positive stride to the user.*/
|
order, yet return a top-down buffer with a positive stride to the user.*/
|
||||||
@ -550,7 +628,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
|||||||
_state->ref_ystride[0]=-yhstride;
|
_state->ref_ystride[0]=-yhstride;
|
||||||
_state->ref_ystride[1]=_state->ref_ystride[2]=-chstride;
|
_state->ref_ystride[1]=_state->ref_ystride[2]=-chstride;
|
||||||
/*Initialize the fragment buffer offsets.*/
|
/*Initialize the fragment buffer offsets.*/
|
||||||
ref_frame_data=_state->ref_frame_data[0];
|
ref_frame_data=_state->ref_frame_bufs[0][0].data;
|
||||||
fragi=0;
|
fragi=0;
|
||||||
for(pli=0;pli<3;pli++){
|
for(pli=0;pli<3;pli++){
|
||||||
th_img_plane *iplane;
|
th_img_plane *iplane;
|
||||||
@ -576,41 +654,44 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
|
|||||||
vpix+=stride<<3;
|
vpix+=stride<<3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*Initialize the reference frame indices.*/
|
/*Initialize the reference frame pointers and indices.*/
|
||||||
_state->ref_frame_idx[OC_FRAME_GOLD]=
|
_state->ref_frame_idx[OC_FRAME_GOLD]=
|
||||||
_state->ref_frame_idx[OC_FRAME_PREV]=
|
_state->ref_frame_idx[OC_FRAME_PREV]=
|
||||||
_state->ref_frame_idx[OC_FRAME_SELF]=-1;
|
_state->ref_frame_idx[OC_FRAME_GOLD_ORIG]=
|
||||||
_state->ref_frame_idx[OC_FRAME_IO]=_nrefs>3?3:-1;
|
_state->ref_frame_idx[OC_FRAME_PREV_ORIG]=
|
||||||
|
_state->ref_frame_idx[OC_FRAME_SELF]=
|
||||||
|
_state->ref_frame_idx[OC_FRAME_IO]=-1;
|
||||||
|
_state->ref_frame_data[OC_FRAME_GOLD]=
|
||||||
|
_state->ref_frame_data[OC_FRAME_PREV]=
|
||||||
|
_state->ref_frame_data[OC_FRAME_GOLD_ORIG]=
|
||||||
|
_state->ref_frame_data[OC_FRAME_PREV_ORIG]=
|
||||||
|
_state->ref_frame_data[OC_FRAME_SELF]=
|
||||||
|
_state->ref_frame_data[OC_FRAME_IO]=NULL;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void oc_state_ref_bufs_clear(oc_theora_state *_state){
|
static void oc_state_ref_bufs_clear(oc_theora_state *_state){
|
||||||
_ogg_free(_state->frag_buf_offs);
|
_ogg_free(_state->frag_buf_offs);
|
||||||
_ogg_free(_state->ref_frame_data[0]);
|
oc_aligned_free(_state->ref_frame_handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void oc_state_vtable_init_c(oc_theora_state *_state){
|
void oc_state_accel_init_c(oc_theora_state *_state){
|
||||||
|
_state->cpu_flags=0;
|
||||||
|
#if defined(OC_STATE_USE_VTABLE)
|
||||||
_state->opt_vtable.frag_copy=oc_frag_copy_c;
|
_state->opt_vtable.frag_copy=oc_frag_copy_c;
|
||||||
|
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_c;
|
||||||
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
|
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
|
||||||
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
|
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
|
||||||
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
|
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
|
||||||
_state->opt_vtable.idct8x8=oc_idct8x8_c;
|
_state->opt_vtable.idct8x8=oc_idct8x8_c;
|
||||||
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
|
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
|
||||||
_state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
|
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_c;
|
||||||
_state->opt_vtable.state_loop_filter_frag_rows=
|
_state->opt_vtable.state_loop_filter_frag_rows=
|
||||||
oc_state_loop_filter_frag_rows_c;
|
oc_state_loop_filter_frag_rows_c;
|
||||||
_state->opt_vtable.restore_fpu=oc_restore_fpu_c;
|
_state->opt_vtable.restore_fpu=oc_restore_fpu_c;
|
||||||
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*Initialize the accelerated function pointers.*/
|
|
||||||
void oc_state_vtable_init(oc_theora_state *_state){
|
|
||||||
#if defined(OC_X86_ASM)
|
|
||||||
oc_state_vtable_init_x86(_state);
|
|
||||||
#else
|
|
||||||
oc_state_vtable_init_c(_state);
|
|
||||||
#endif
|
#endif
|
||||||
|
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -626,7 +707,8 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
|
|||||||
how it is specified in the bitstream, because the Y axis is flipped in
|
how it is specified in the bitstream, because the Y axis is flipped in
|
||||||
the bitstream.
|
the bitstream.
|
||||||
The displayable frame must fit inside the encoded frame.
|
The displayable frame must fit inside the encoded frame.
|
||||||
The color space must be one known by the encoder.*/
|
The color space must be one known by the encoder.
|
||||||
|
The framerate ratio must not contain a zero value.*/
|
||||||
if((_info->frame_width&0xF)||(_info->frame_height&0xF)||
|
if((_info->frame_width&0xF)||(_info->frame_height&0xF)||
|
||||||
_info->frame_width<=0||_info->frame_width>=0x100000||
|
_info->frame_width<=0||_info->frame_width>=0x100000||
|
||||||
_info->frame_height<=0||_info->frame_height>=0x100000||
|
_info->frame_height<=0||_info->frame_height>=0x100000||
|
||||||
@ -639,7 +721,8 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
|
|||||||
but there are a number of compilers which will mis-optimize this.
|
but there are a number of compilers which will mis-optimize this.
|
||||||
It's better to live with the spurious warnings.*/
|
It's better to live with the spurious warnings.*/
|
||||||
_info->colorspace<0||_info->colorspace>=TH_CS_NSPACES||
|
_info->colorspace<0||_info->colorspace>=TH_CS_NSPACES||
|
||||||
_info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS){
|
_info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS||
|
||||||
|
_info->fps_numerator<1||_info->fps_denominator<1){
|
||||||
return TH_EINVAL;
|
return TH_EINVAL;
|
||||||
}
|
}
|
||||||
memset(_state,0,sizeof(*_state));
|
memset(_state,0,sizeof(*_state));
|
||||||
@ -648,7 +731,7 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
|
|||||||
system.*/
|
system.*/
|
||||||
_state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
|
_state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
|
||||||
_state->frame_type=OC_UNKWN_FRAME;
|
_state->frame_type=OC_UNKWN_FRAME;
|
||||||
oc_state_vtable_init(_state);
|
oc_state_accel_init(_state);
|
||||||
ret=oc_state_frarray_init(_state);
|
ret=oc_state_frarray_init(_state);
|
||||||
if(ret>=0)ret=oc_state_ref_bufs_init(_state,_nrefs);
|
if(ret>=0)ret=oc_state_ref_bufs_init(_state,_nrefs);
|
||||||
if(ret<0){
|
if(ret<0){
|
||||||
@ -758,11 +841,10 @@ void oc_state_borders_fill(oc_theora_state *_state,int _refi){
|
|||||||
_offsets[1] is set if the motion vector has non-zero fractional
|
_offsets[1] is set if the motion vector has non-zero fractional
|
||||||
components.
|
components.
|
||||||
_pli: The color plane index.
|
_pli: The color plane index.
|
||||||
_dx: The X component of the motion vector.
|
_mv: The motion vector.
|
||||||
_dy: The Y component of the motion vector.
|
|
||||||
Return: The number of offsets returned: 1 or 2.*/
|
Return: The number of offsets returned: 1 or 2.*/
|
||||||
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
||||||
int _pli,int _dx,int _dy){
|
int _pli,oc_mv _mv){
|
||||||
/*Here is a brief description of how Theora handles motion vectors:
|
/*Here is a brief description of how Theora handles motion vectors:
|
||||||
Motion vector components are specified to half-pixel accuracy in
|
Motion vector components are specified to half-pixel accuracy in
|
||||||
undecimated directions of each plane, and quarter-pixel accuracy in
|
undecimated directions of each plane, and quarter-pixel accuracy in
|
||||||
@ -785,21 +867,25 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
|||||||
int xfrac;
|
int xfrac;
|
||||||
int yfrac;
|
int yfrac;
|
||||||
int offs;
|
int offs;
|
||||||
|
int dx;
|
||||||
|
int dy;
|
||||||
ystride=_state->ref_ystride[_pli];
|
ystride=_state->ref_ystride[_pli];
|
||||||
/*These two variables decide whether we are in half- or quarter-pixel
|
/*These two variables decide whether we are in half- or quarter-pixel
|
||||||
precision in each component.*/
|
precision in each component.*/
|
||||||
xprec=1+(_pli!=0&&!(_state->info.pixel_fmt&1));
|
xprec=1+(_pli!=0&&!(_state->info.pixel_fmt&1));
|
||||||
yprec=1+(_pli!=0&&!(_state->info.pixel_fmt&2));
|
yprec=1+(_pli!=0&&!(_state->info.pixel_fmt&2));
|
||||||
|
dx=OC_MV_X(_mv);
|
||||||
|
dy=OC_MV_Y(_mv);
|
||||||
/*These two variables are either 0 if all the fractional bits are zero or -1
|
/*These two variables are either 0 if all the fractional bits are zero or -1
|
||||||
if any of them are non-zero.*/
|
if any of them are non-zero.*/
|
||||||
xfrac=OC_SIGNMASK(-(_dx&(xprec|1)));
|
xfrac=OC_SIGNMASK(-(dx&(xprec|1)));
|
||||||
yfrac=OC_SIGNMASK(-(_dy&(yprec|1)));
|
yfrac=OC_SIGNMASK(-(dy&(yprec|1)));
|
||||||
offs=(_dx>>xprec)+(_dy>>yprec)*ystride;
|
offs=(dx>>xprec)+(dy>>yprec)*ystride;
|
||||||
if(xfrac||yfrac){
|
if(xfrac||yfrac){
|
||||||
int xmask;
|
int xmask;
|
||||||
int ymask;
|
int ymask;
|
||||||
xmask=OC_SIGNMASK(_dx);
|
xmask=OC_SIGNMASK(dx);
|
||||||
ymask=OC_SIGNMASK(_dy);
|
ymask=OC_SIGNMASK(dy);
|
||||||
yfrac&=ystride;
|
yfrac&=ystride;
|
||||||
_offsets[0]=offs-(xfrac&xmask)+(yfrac&ymask);
|
_offsets[0]=offs-(xfrac&xmask)+(yfrac&ymask);
|
||||||
_offsets[1]=offs-(xfrac&~xmask)+(yfrac&~ymask);
|
_offsets[1]=offs-(xfrac&~xmask)+(yfrac&~ymask);
|
||||||
@ -848,13 +934,17 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
|||||||
int mx2;
|
int mx2;
|
||||||
int my2;
|
int my2;
|
||||||
int offs;
|
int offs;
|
||||||
|
int dx;
|
||||||
|
int dy;
|
||||||
ystride=_state->ref_ystride[_pli];
|
ystride=_state->ref_ystride[_pli];
|
||||||
qpy=_pli!=0&&!(_state->info.pixel_fmt&2);
|
qpy=_pli!=0&&!(_state->info.pixel_fmt&2);
|
||||||
my=OC_MVMAP[qpy][_dy+31];
|
dx=OC_MV_X(_mv);
|
||||||
my2=OC_MVMAP2[qpy][_dy+31];
|
dy=OC_MV_Y(_mv);
|
||||||
|
my=OC_MVMAP[qpy][dy+31];
|
||||||
|
my2=OC_MVMAP2[qpy][dy+31];
|
||||||
qpx=_pli!=0&&!(_state->info.pixel_fmt&1);
|
qpx=_pli!=0&&!(_state->info.pixel_fmt&1);
|
||||||
mx=OC_MVMAP[qpx][_dx+31];
|
mx=OC_MVMAP[qpx][dx+31];
|
||||||
mx2=OC_MVMAP2[qpx][_dx+31];
|
mx2=OC_MVMAP2[qpx][dx+31];
|
||||||
offs=my*ystride+mx;
|
offs=my*ystride+mx;
|
||||||
if(mx2||my2){
|
if(mx2||my2){
|
||||||
_offsets[1]=offs+my2*ystride+mx2;
|
_offsets[1]=offs+my2*ystride+mx2;
|
||||||
@ -866,18 +956,12 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
|
|
||||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
|
|
||||||
_state->opt_vtable.state_frag_recon(_state,_fragi,_pli,_dct_coeffs,
|
|
||||||
_last_zzi,_dc_quant);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
|
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||||
unsigned char *dst;
|
unsigned char *dst;
|
||||||
ptrdiff_t frag_buf_off;
|
ptrdiff_t frag_buf_off;
|
||||||
int ystride;
|
int ystride;
|
||||||
int mb_mode;
|
int refi;
|
||||||
/*Apply the inverse transform.*/
|
/*Apply the inverse transform.*/
|
||||||
/*Special case only having a DC component.*/
|
/*Special case only having a DC component.*/
|
||||||
if(_last_zzi<2){
|
if(_last_zzi<2){
|
||||||
@ -887,69 +971,35 @@ void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
|||||||
no iDCT rounding.*/
|
no iDCT rounding.*/
|
||||||
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
|
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
|
||||||
/*LOOP VECTORIZES.*/
|
/*LOOP VECTORIZES.*/
|
||||||
for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
|
for(ci=0;ci<64;ci++)_dct_coeffs[64+ci]=p;
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
/*First, dequantize the DC coefficient.*/
|
/*First, dequantize the DC coefficient.*/
|
||||||
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
|
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
|
||||||
oc_idct8x8(_state,_dct_coeffs,_last_zzi);
|
oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
|
||||||
}
|
}
|
||||||
/*Fill in the target buffer.*/
|
/*Fill in the target buffer.*/
|
||||||
frag_buf_off=_state->frag_buf_offs[_fragi];
|
frag_buf_off=_state->frag_buf_offs[_fragi];
|
||||||
mb_mode=_state->frags[_fragi].mb_mode;
|
refi=_state->frags[_fragi].refi;
|
||||||
ystride=_state->ref_ystride[_pli];
|
ystride=_state->ref_ystride[_pli];
|
||||||
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
|
dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
|
||||||
if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs);
|
if(refi==OC_FRAME_SELF)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64);
|
||||||
else{
|
else{
|
||||||
const unsigned char *ref;
|
const unsigned char *ref;
|
||||||
int mvoffsets[2];
|
int mvoffsets[2];
|
||||||
ref=
|
ref=_state->ref_frame_data[refi]+frag_buf_off;
|
||||||
_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
|
|
||||||
+frag_buf_off;
|
|
||||||
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
|
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
|
||||||
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
|
_state->frag_mvs[_fragi])>1){
|
||||||
oc_frag_recon_inter2(_state,
|
oc_frag_recon_inter2(_state,
|
||||||
dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs);
|
dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs+64);
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
|
||||||
}
|
}
|
||||||
else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Copies the fragments specified by the lists of fragment indices from one
|
static void loop_filter_h(unsigned char *_pix,int _ystride,signed char *_bv){
|
||||||
frame to another.
|
|
||||||
_fragis: A pointer to a list of fragment indices.
|
|
||||||
_nfragis: The number of fragment indices to copy.
|
|
||||||
_dst_frame: The reference frame to copy to.
|
|
||||||
_src_frame: The reference frame to copy from.
|
|
||||||
_pli: The color plane the fragments lie in.*/
|
|
||||||
void oc_state_frag_copy_list(const oc_theora_state *_state,
|
|
||||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
|
||||||
int _dst_frame,int _src_frame,int _pli){
|
|
||||||
_state->opt_vtable.state_frag_copy_list(_state,_fragis,_nfragis,_dst_frame,
|
|
||||||
_src_frame,_pli);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_state_frag_copy_list_c(const oc_theora_state *_state,
|
|
||||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
|
||||||
int _dst_frame,int _src_frame,int _pli){
|
|
||||||
const ptrdiff_t *frag_buf_offs;
|
|
||||||
const unsigned char *src_frame_data;
|
|
||||||
unsigned char *dst_frame_data;
|
|
||||||
ptrdiff_t fragii;
|
|
||||||
int ystride;
|
|
||||||
dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
|
|
||||||
src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
|
|
||||||
ystride=_state->ref_ystride[_pli];
|
|
||||||
frag_buf_offs=_state->frag_buf_offs;
|
|
||||||
for(fragii=0;fragii<_nfragis;fragii++){
|
|
||||||
ptrdiff_t frag_buf_off;
|
|
||||||
frag_buf_off=frag_buf_offs[_fragis[fragii]];
|
|
||||||
oc_frag_copy(_state,dst_frame_data+frag_buf_off,
|
|
||||||
src_frame_data+frag_buf_off,ystride);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
|
|
||||||
int y;
|
int y;
|
||||||
_pix-=2;
|
_pix-=2;
|
||||||
for(y=0;y<8;y++){
|
for(y=0;y<8;y++){
|
||||||
@ -965,7 +1015,7 @@ static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
|
static void loop_filter_v(unsigned char *_pix,int _ystride,signed char *_bv){
|
||||||
int x;
|
int x;
|
||||||
_pix-=_ystride*2;
|
_pix-=_ystride*2;
|
||||||
for(x=0;x<8;x++){
|
for(x=0;x<8;x++){
|
||||||
@ -982,20 +1032,16 @@ static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
|
|||||||
|
|
||||||
/*Initialize the bounding values array used by the loop filter.
|
/*Initialize the bounding values array used by the loop filter.
|
||||||
_bv: Storage for the array.
|
_bv: Storage for the array.
|
||||||
Return: 0 on success, or a non-zero value if no filtering need be applied.*/
|
_flimit: The filter limit as defined in Section 7.10 of the spec.*/
|
||||||
int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
|
void oc_loop_filter_init_c(signed char _bv[256],int _flimit){
|
||||||
int flimit;
|
|
||||||
int i;
|
int i;
|
||||||
flimit=_state->loop_filter_limits[_state->qis[0]];
|
|
||||||
if(flimit==0)return 1;
|
|
||||||
memset(_bv,0,sizeof(_bv[0])*256);
|
memset(_bv,0,sizeof(_bv[0])*256);
|
||||||
for(i=0;i<flimit;i++){
|
for(i=0;i<_flimit;i++){
|
||||||
if(127-i-flimit>=0)_bv[127-i-flimit]=i-flimit;
|
if(127-i-_flimit>=0)_bv[127-i-_flimit]=(signed char)(i-_flimit);
|
||||||
_bv[127-i]=-i;
|
_bv[127-i]=(signed char)(-i);
|
||||||
_bv[127+i]=i;
|
_bv[127+i]=(signed char)(i);
|
||||||
if(127+i+flimit<256)_bv[127+i+flimit]=flimit-i;
|
if(127+i+_flimit<256)_bv[127+i+_flimit]=(signed char)(_flimit-i);
|
||||||
}
|
}
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||||
@ -1006,14 +1052,8 @@ int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
|
|||||||
_pli: The color plane to filter.
|
_pli: The color plane to filter.
|
||||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||||
void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,int _bv[256],
|
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
|
||||||
int _refi,int _pli,int _fragy0,int _fragy_end){
|
signed char *_bv,int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||||
_state->opt_vtable.state_loop_filter_frag_rows(_state,_bv,_refi,_pli,
|
|
||||||
_fragy0,_fragy_end);
|
|
||||||
}
|
|
||||||
|
|
||||||
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
|
|
||||||
int _refi,int _pli,int _fragy0,int _fragy_end){
|
|
||||||
const oc_fragment_plane *fplane;
|
const oc_fragment_plane *fplane;
|
||||||
const oc_fragment *frags;
|
const oc_fragment *frags;
|
||||||
const ptrdiff_t *frag_buf_offs;
|
const ptrdiff_t *frag_buf_offs;
|
||||||
@ -1030,7 +1070,7 @@ void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
|
|||||||
fragi_top=fplane->froffset;
|
fragi_top=fplane->froffset;
|
||||||
fragi_bot=fragi_top+fplane->nfrags;
|
fragi_bot=fragi_top+fplane->nfrags;
|
||||||
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
||||||
fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
|
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
|
||||||
ystride=_state->ref_ystride[_pli];
|
ystride=_state->ref_ystride[_pli];
|
||||||
frags=_state->frags;
|
frags=_state->frags;
|
||||||
frag_buf_offs=_state->frag_buf_offs;
|
frag_buf_offs=_state->frag_buf_offs;
|
||||||
|
552
thirdparty/libtheora/state.h
vendored
Normal file
552
thirdparty/libtheora/state.h
vendored
Normal file
@ -0,0 +1,552 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
function:
|
||||||
|
last mod: $Id: internal.h 17337 2010-07-19 16:08:54Z tterribe $
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
#if !defined(_state_H)
|
||||||
|
# define _state_H (1)
|
||||||
|
# include "internal.h"
|
||||||
|
# include "huffman.h"
|
||||||
|
# include "quant.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*A single quadrant of the map from a super block to fragment numbers.*/
|
||||||
|
typedef ptrdiff_t oc_sb_map_quad[4];
|
||||||
|
/*A map from a super block to fragment numbers.*/
|
||||||
|
typedef oc_sb_map_quad oc_sb_map[4];
|
||||||
|
/*A single plane of the map from a macro block to fragment numbers.*/
|
||||||
|
typedef ptrdiff_t oc_mb_map_plane[4];
|
||||||
|
/*A map from a macro block to fragment numbers.*/
|
||||||
|
typedef oc_mb_map_plane oc_mb_map[3];
|
||||||
|
/*A motion vector.*/
|
||||||
|
typedef ogg_int16_t oc_mv;
|
||||||
|
|
||||||
|
typedef struct oc_sb_flags oc_sb_flags;
|
||||||
|
typedef struct oc_border_info oc_border_info;
|
||||||
|
typedef struct oc_fragment oc_fragment;
|
||||||
|
typedef struct oc_fragment_plane oc_fragment_plane;
|
||||||
|
typedef struct oc_base_opt_vtable oc_base_opt_vtable;
|
||||||
|
typedef struct oc_base_opt_data oc_base_opt_data;
|
||||||
|
typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
|
||||||
|
typedef struct oc_theora_state oc_theora_state;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Shared accelerated functions.*/
|
||||||
|
# if defined(OC_X86_ASM)
|
||||||
|
# if defined(_MSC_VER)
|
||||||
|
# include "x86_vc/x86int.h"
|
||||||
|
# else
|
||||||
|
# include "x86/x86int.h"
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
# if defined(OC_ARM_ASM)
|
||||||
|
# include "arm/armint.h"
|
||||||
|
# endif
|
||||||
|
# if defined(OC_C64X_ASM)
|
||||||
|
# include "c64x/c64xint.h"
|
||||||
|
# endif
|
||||||
|
|
||||||
|
# if !defined(oc_state_accel_init)
|
||||||
|
# define oc_state_accel_init oc_state_accel_init_c
|
||||||
|
# endif
|
||||||
|
# if defined(OC_STATE_USE_VTABLE)
|
||||||
|
# if !defined(oc_frag_copy)
|
||||||
|
# define oc_frag_copy(_state,_dst,_src,_ystride) \
|
||||||
|
((*(_state)->opt_vtable.frag_copy)(_dst,_src,_ystride))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_frag_copy_list)
|
||||||
|
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
|
||||||
|
_fragis,_nfragis,_frag_buf_offs) \
|
||||||
|
((*(_state)->opt_vtable.frag_copy_list)(_dst_frame,_src_frame,_ystride, \
|
||||||
|
_fragis,_nfragis,_frag_buf_offs))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_frag_recon_intra)
|
||||||
|
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
|
||||||
|
((*(_state)->opt_vtable.frag_recon_intra)(_dst,_dst_ystride,_residue))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_frag_recon_inter)
|
||||||
|
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
|
||||||
|
((*(_state)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_frag_recon_inter2)
|
||||||
|
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
|
||||||
|
((*(_state)->opt_vtable.frag_recon_inter2)(_dst, \
|
||||||
|
_src1,_src2,_ystride,_residue))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_idct8x8)
|
||||||
|
# define oc_idct8x8(_state,_y,_x,_last_zzi) \
|
||||||
|
((*(_state)->opt_vtable.idct8x8)(_y,_x,_last_zzi))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_state_frag_recon)
|
||||||
|
# define oc_state_frag_recon(_state,_fragi, \
|
||||||
|
_pli,_dct_coeffs,_last_zzi,_dc_quant) \
|
||||||
|
((*(_state)->opt_vtable.state_frag_recon)(_state,_fragi, \
|
||||||
|
_pli,_dct_coeffs,_last_zzi,_dc_quant))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_loop_filter_init)
|
||||||
|
# define oc_loop_filter_init(_state,_bv,_flimit) \
|
||||||
|
((*(_state)->opt_vtable.loop_filter_init)(_bv,_flimit))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_state_loop_filter_frag_rows)
|
||||||
|
# define oc_state_loop_filter_frag_rows(_state, \
|
||||||
|
_bv,_refi,_pli,_fragy0,_fragy_end) \
|
||||||
|
((*(_state)->opt_vtable.state_loop_filter_frag_rows)(_state, \
|
||||||
|
_bv,_refi,_pli,_fragy0,_fragy_end))
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_restore_fpu)
|
||||||
|
# define oc_restore_fpu(_state) \
|
||||||
|
((*(_state)->opt_vtable.restore_fpu)())
|
||||||
|
# endif
|
||||||
|
# else
|
||||||
|
# if !defined(oc_frag_copy)
|
||||||
|
# define oc_frag_copy(_state,_dst,_src,_ystride) \
|
||||||
|
oc_frag_copy_c(_dst,_src,_ystride)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_frag_copy_list)
|
||||||
|
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
|
||||||
|
_fragis,_nfragis,_frag_buf_offs) \
|
||||||
|
oc_frag_copy_list_c(_dst_frame,_src_frame,_ystride, \
|
||||||
|
_fragis,_nfragis,_frag_buf_offs)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_frag_recon_intra)
|
||||||
|
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
|
||||||
|
oc_frag_recon_intra_c(_dst,_dst_ystride,_residue)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_frag_recon_inter)
|
||||||
|
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
|
||||||
|
oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_frag_recon_inter2)
|
||||||
|
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
|
||||||
|
oc_frag_recon_inter2_c(_dst,_src1,_src2,_ystride,_residue)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_idct8x8)
|
||||||
|
# define oc_idct8x8(_state,_y,_x,_last_zzi) oc_idct8x8_c(_y,_x,_last_zzi)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_state_frag_recon)
|
||||||
|
# define oc_state_frag_recon oc_state_frag_recon_c
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_loop_filter_init)
|
||||||
|
# define oc_loop_filter_init(_state,_bv,_flimit) \
|
||||||
|
oc_loop_filter_init_c(_bv,_flimit)
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_state_loop_filter_frag_rows)
|
||||||
|
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c
|
||||||
|
# endif
|
||||||
|
# if !defined(oc_restore_fpu)
|
||||||
|
# define oc_restore_fpu(_state) do{}while(0)
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*A keyframe.*/
|
||||||
|
# define OC_INTRA_FRAME (0)
|
||||||
|
/*A predicted frame.*/
|
||||||
|
# define OC_INTER_FRAME (1)
|
||||||
|
/*A frame of unknown type (frame type decision has not yet been made).*/
|
||||||
|
# define OC_UNKWN_FRAME (-1)
|
||||||
|
|
||||||
|
/*The amount of padding to add to the reconstructed frame buffers on all
|
||||||
|
sides.
|
||||||
|
This is used to allow unrestricted motion vectors without special casing.
|
||||||
|
This must be a multiple of 2.*/
|
||||||
|
# define OC_UMV_PADDING (16)
|
||||||
|
|
||||||
|
/*Frame classification indices.*/
|
||||||
|
/*The previous golden frame.*/
|
||||||
|
# define OC_FRAME_GOLD (0)
|
||||||
|
/*The previous frame.*/
|
||||||
|
# define OC_FRAME_PREV (1)
|
||||||
|
/*The current frame.*/
|
||||||
|
# define OC_FRAME_SELF (2)
|
||||||
|
/*Used to mark uncoded fragments (for DC prediction).*/
|
||||||
|
# define OC_FRAME_NONE (3)
|
||||||
|
|
||||||
|
/*The input or output buffer.*/
|
||||||
|
# define OC_FRAME_IO (3)
|
||||||
|
/*Uncompressed prev golden frame.*/
|
||||||
|
# define OC_FRAME_GOLD_ORIG (4)
|
||||||
|
/*Uncompressed previous frame. */
|
||||||
|
# define OC_FRAME_PREV_ORIG (5)
|
||||||
|
|
||||||
|
/*Macroblock modes.*/
|
||||||
|
/*Macro block is invalid: It is never coded.*/
|
||||||
|
# define OC_MODE_INVALID (-1)
|
||||||
|
/*Encoded difference from the same macro block in the previous frame.*/
|
||||||
|
# define OC_MODE_INTER_NOMV (0)
|
||||||
|
/*Encoded with no motion compensated prediction.*/
|
||||||
|
# define OC_MODE_INTRA (1)
|
||||||
|
/*Encoded difference from the previous frame offset by the given motion
|
||||||
|
vector.*/
|
||||||
|
# define OC_MODE_INTER_MV (2)
|
||||||
|
/*Encoded difference from the previous frame offset by the last coded motion
|
||||||
|
vector.*/
|
||||||
|
# define OC_MODE_INTER_MV_LAST (3)
|
||||||
|
/*Encoded difference from the previous frame offset by the second to last
|
||||||
|
coded motion vector.*/
|
||||||
|
# define OC_MODE_INTER_MV_LAST2 (4)
|
||||||
|
/*Encoded difference from the same macro block in the previous golden
|
||||||
|
frame.*/
|
||||||
|
# define OC_MODE_GOLDEN_NOMV (5)
|
||||||
|
/*Encoded difference from the previous golden frame offset by the given motion
|
||||||
|
vector.*/
|
||||||
|
# define OC_MODE_GOLDEN_MV (6)
|
||||||
|
/*Encoded difference from the previous frame offset by the individual motion
|
||||||
|
vectors given for each block.*/
|
||||||
|
# define OC_MODE_INTER_MV_FOUR (7)
|
||||||
|
/*The number of (coded) modes.*/
|
||||||
|
# define OC_NMODES (8)
|
||||||
|
|
||||||
|
/*Determines the reference frame used for a given MB mode.*/
|
||||||
|
# define OC_FRAME_FOR_MODE(_x) \
|
||||||
|
OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
|
||||||
|
OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
|
||||||
|
|
||||||
|
/*Constants for the packet state machine common between encoder and decoder.*/
|
||||||
|
|
||||||
|
/*Next packet to emit/read: Codec info header.*/
|
||||||
|
# define OC_PACKET_INFO_HDR (-3)
|
||||||
|
/*Next packet to emit/read: Comment header.*/
|
||||||
|
# define OC_PACKET_COMMENT_HDR (-2)
|
||||||
|
/*Next packet to emit/read: Codec setup header.*/
|
||||||
|
# define OC_PACKET_SETUP_HDR (-1)
|
||||||
|
/*No more packets to emit/read.*/
|
||||||
|
# define OC_PACKET_DONE (INT_MAX)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define OC_MV(_x,_y) ((oc_mv)((_x)&0xFF|(_y)<<8))
|
||||||
|
#define OC_MV_X(_mv) ((signed char)(_mv))
|
||||||
|
#define OC_MV_Y(_mv) ((_mv)>>8)
|
||||||
|
#define OC_MV_ADD(_mv1,_mv2) \
|
||||||
|
OC_MV(OC_MV_X(_mv1)+OC_MV_X(_mv2), \
|
||||||
|
OC_MV_Y(_mv1)+OC_MV_Y(_mv2))
|
||||||
|
#define OC_MV_SUB(_mv1,_mv2) \
|
||||||
|
OC_MV(OC_MV_X(_mv1)-OC_MV_X(_mv2), \
|
||||||
|
OC_MV_Y(_mv1)-OC_MV_Y(_mv2))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Super blocks are 32x32 segments of pixels in a single color plane indexed
|
||||||
|
in image order.
|
||||||
|
Internally, super blocks are broken up into four quadrants, each of which
|
||||||
|
contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
|
||||||
|
Quadrants, and the blocks within them, are indexed in a special order called
|
||||||
|
a "Hilbert curve" within the super block.
|
||||||
|
|
||||||
|
In order to differentiate between the Hilbert-curve indexing strategy and
|
||||||
|
the regular image order indexing strategy, blocks indexed in image order
|
||||||
|
are called "fragments".
|
||||||
|
Fragments are indexed in image order, left to right, then bottom to top,
|
||||||
|
from Y' plane to Cb plane to Cr plane.
|
||||||
|
|
||||||
|
The co-located fragments in all image planes corresponding to the location
|
||||||
|
of a single quadrant of a luma plane super block form a macro block.
|
||||||
|
Thus there is only a single set of macro blocks for all planes, each of which
|
||||||
|
contains between 6 and 12 fragments, depending on the pixel format.
|
||||||
|
Therefore macro block information is kept in a separate set of arrays from
|
||||||
|
super blocks to avoid unused space in the other planes.
|
||||||
|
The lists are indexed in super block order.
|
||||||
|
That is, the macro block corresponding to the macro block mbi in (luma plane)
|
||||||
|
super block sbi is at index (sbi<<2|mbi).
|
||||||
|
Thus the number of macro blocks in each dimension is always twice the number
|
||||||
|
of super blocks, even when only an odd number fall inside the coded frame.
|
||||||
|
These "extra" macro blocks are just an artifact of our internal data layout,
|
||||||
|
and not part of the coded stream; they are flagged with a negative MB mode.*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Super block information.*/
|
||||||
|
struct oc_sb_flags{
|
||||||
|
unsigned char coded_fully:1;
|
||||||
|
unsigned char coded_partially:1;
|
||||||
|
unsigned char quad_valid:4;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Information about a fragment which intersects the border of the displayable
|
||||||
|
region.
|
||||||
|
This marks which pixels belong to the displayable region.*/
|
||||||
|
struct oc_border_info{
|
||||||
|
/*A bit mask marking which pixels are in the displayable region.
|
||||||
|
Pixel (x,y) corresponds to bit (y<<3|x).*/
|
||||||
|
ogg_int64_t mask;
|
||||||
|
/*The number of pixels in the displayable region.
|
||||||
|
This is always positive, and always less than 64.*/
|
||||||
|
int npixels;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Fragment information.*/
|
||||||
|
struct oc_fragment{
|
||||||
|
/*A flag indicating whether or not this fragment is coded.*/
|
||||||
|
unsigned coded:1;
|
||||||
|
/*A flag indicating that this entire fragment lies outside the displayable
|
||||||
|
region of the frame.
|
||||||
|
Note the contrast with an invalid macro block, which is outside the coded
|
||||||
|
frame, not just the displayable one.
|
||||||
|
There are no fragments outside the coded frame by construction.*/
|
||||||
|
unsigned invalid:1;
|
||||||
|
/*The index of the quality index used for this fragment's AC coefficients.*/
|
||||||
|
unsigned qii:4;
|
||||||
|
/*The index of the reference frame this fragment is predicted from.*/
|
||||||
|
unsigned refi:2;
|
||||||
|
/*The mode of the macroblock this fragment belongs to.*/
|
||||||
|
unsigned mb_mode:3;
|
||||||
|
/*The index of the associated border information for fragments which lie
|
||||||
|
partially outside the displayable region.
|
||||||
|
For fragments completely inside or outside this region, this is -1.
|
||||||
|
Note that the C standard requires an explicit signed keyword for bitfield
|
||||||
|
types, since some compilers may treat them as unsigned without it.*/
|
||||||
|
signed int borderi:5;
|
||||||
|
/*The prediction-corrected DC component.
|
||||||
|
Note that the C standard requires an explicit signed keyword for bitfield
|
||||||
|
types, since some compilers may treat them as unsigned without it.*/
|
||||||
|
signed int dc:16;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*A description of each fragment plane.*/
|
||||||
|
struct oc_fragment_plane{
|
||||||
|
/*The number of fragments in the horizontal direction.*/
|
||||||
|
int nhfrags;
|
||||||
|
/*The number of fragments in the vertical direction.*/
|
||||||
|
int nvfrags;
|
||||||
|
/*The offset of the first fragment in the plane.*/
|
||||||
|
ptrdiff_t froffset;
|
||||||
|
/*The total number of fragments in the plane.*/
|
||||||
|
ptrdiff_t nfrags;
|
||||||
|
/*The number of super blocks in the horizontal direction.*/
|
||||||
|
unsigned nhsbs;
|
||||||
|
/*The number of super blocks in the vertical direction.*/
|
||||||
|
unsigned nvsbs;
|
||||||
|
/*The offset of the first super block in the plane.*/
|
||||||
|
unsigned sboffset;
|
||||||
|
/*The total number of super blocks in the plane.*/
|
||||||
|
unsigned nsbs;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
typedef void (*oc_state_loop_filter_frag_rows_func)(
|
||||||
|
const oc_theora_state *_state,signed char _bv[256],int _refi,int _pli,
|
||||||
|
int _fragy0,int _fragy_end);
|
||||||
|
|
||||||
|
/*The shared (encoder and decoder) functions that have accelerated variants.*/
|
||||||
|
struct oc_base_opt_vtable{
|
||||||
|
void (*frag_copy)(unsigned char *_dst,
|
||||||
|
const unsigned char *_src,int _ystride);
|
||||||
|
void (*frag_copy_list)(unsigned char *_dst_frame,
|
||||||
|
const unsigned char *_src_frame,int _ystride,
|
||||||
|
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||||
|
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
|
||||||
|
const ogg_int16_t _residue[64]);
|
||||||
|
void (*frag_recon_inter)(unsigned char *_dst,
|
||||||
|
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||||
|
void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
|
||||||
|
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
||||||
|
void (*idct8x8)(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||||
|
void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||||
|
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||||
|
void (*loop_filter_init)(signed char _bv[256],int _flimit);
|
||||||
|
oc_state_loop_filter_frag_rows_func state_loop_filter_frag_rows;
|
||||||
|
void (*restore_fpu)(void);
|
||||||
|
};
|
||||||
|
|
||||||
|
/*The shared (encoder and decoder) tables that vary according to which variants
|
||||||
|
of the above functions are used.*/
|
||||||
|
struct oc_base_opt_data{
|
||||||
|
const unsigned char *dct_fzig_zag;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/*State information common to both the encoder and decoder.*/
|
||||||
|
struct oc_theora_state{
|
||||||
|
/*The stream information.*/
|
||||||
|
th_info info;
|
||||||
|
# if defined(OC_STATE_USE_VTABLE)
|
||||||
|
/*Table for shared accelerated functions.*/
|
||||||
|
oc_base_opt_vtable opt_vtable;
|
||||||
|
# endif
|
||||||
|
/*Table for shared data used by accelerated functions.*/
|
||||||
|
oc_base_opt_data opt_data;
|
||||||
|
/*CPU flags to detect the presence of extended instruction sets.*/
|
||||||
|
ogg_uint32_t cpu_flags;
|
||||||
|
/*The fragment plane descriptions.*/
|
||||||
|
oc_fragment_plane fplanes[3];
|
||||||
|
/*The list of fragments, indexed in image order.*/
|
||||||
|
oc_fragment *frags;
|
||||||
|
/*The offset into the reference frame buffer to the upper-left pixel of
|
||||||
|
each fragment.*/
|
||||||
|
ptrdiff_t *frag_buf_offs;
|
||||||
|
/*The motion vector for each fragment.*/
|
||||||
|
oc_mv *frag_mvs;
|
||||||
|
/*The total number of fragments in a single frame.*/
|
||||||
|
ptrdiff_t nfrags;
|
||||||
|
/*The list of super block maps, indexed in image order.*/
|
||||||
|
oc_sb_map *sb_maps;
|
||||||
|
/*The list of super block flags, indexed in image order.*/
|
||||||
|
oc_sb_flags *sb_flags;
|
||||||
|
/*The total number of super blocks in a single frame.*/
|
||||||
|
unsigned nsbs;
|
||||||
|
/*The fragments from each color plane that belong to each macro block.
|
||||||
|
Fragments are stored in image order (left to right then top to bottom).
|
||||||
|
When chroma components are decimated, the extra fragments have an index of
|
||||||
|
-1.*/
|
||||||
|
oc_mb_map *mb_maps;
|
||||||
|
/*The list of macro block modes.
|
||||||
|
A negative number indicates the macro block lies entirely outside the
|
||||||
|
coded frame.*/
|
||||||
|
signed char *mb_modes;
|
||||||
|
/*The number of macro blocks in the X direction.*/
|
||||||
|
unsigned nhmbs;
|
||||||
|
/*The number of macro blocks in the Y direction.*/
|
||||||
|
unsigned nvmbs;
|
||||||
|
/*The total number of macro blocks.*/
|
||||||
|
size_t nmbs;
|
||||||
|
/*The list of coded fragments, in coded order.
|
||||||
|
Uncoded fragments are stored in reverse order from the end of the list.*/
|
||||||
|
ptrdiff_t *coded_fragis;
|
||||||
|
/*The number of coded fragments in each plane.*/
|
||||||
|
ptrdiff_t ncoded_fragis[3];
|
||||||
|
/*The total number of coded fragments.*/
|
||||||
|
ptrdiff_t ntotal_coded_fragis;
|
||||||
|
/*The actual buffers used for the reference frames.*/
|
||||||
|
th_ycbcr_buffer ref_frame_bufs[6];
|
||||||
|
/*The index of the buffers being used for each OC_FRAME_* reference frame.*/
|
||||||
|
int ref_frame_idx[6];
|
||||||
|
/*The storage for the reference frame buffers.
|
||||||
|
This is just ref_frame_bufs[ref_frame_idx[i]][0].data, but is cached here
|
||||||
|
for faster look-up.*/
|
||||||
|
unsigned char *ref_frame_data[6];
|
||||||
|
/*The handle used to allocate the reference frame buffers.*/
|
||||||
|
unsigned char *ref_frame_handle;
|
||||||
|
/*The strides for each plane in the reference frames.*/
|
||||||
|
int ref_ystride[3];
|
||||||
|
/*The number of unique border patterns.*/
|
||||||
|
int nborders;
|
||||||
|
/*The unique border patterns for all border fragments.
|
||||||
|
The borderi field of fragments which straddle the border indexes this
|
||||||
|
list.*/
|
||||||
|
oc_border_info borders[16];
|
||||||
|
/*The frame number of the last keyframe.*/
|
||||||
|
ogg_int64_t keyframe_num;
|
||||||
|
/*The frame number of the current frame.*/
|
||||||
|
ogg_int64_t curframe_num;
|
||||||
|
/*The granpos of the current frame.*/
|
||||||
|
ogg_int64_t granpos;
|
||||||
|
/*The type of the current frame.*/
|
||||||
|
signed char frame_type;
|
||||||
|
/*The bias to add to the frame count when computing granule positions.*/
|
||||||
|
unsigned char granpos_bias;
|
||||||
|
/*The number of quality indices used in the current frame.*/
|
||||||
|
unsigned char nqis;
|
||||||
|
/*The quality indices of the current frame.*/
|
||||||
|
unsigned char qis[3];
|
||||||
|
/*The dequantization tables, stored in zig-zag order, and indexed by
|
||||||
|
qi, pli, qti, and zzi.*/
|
||||||
|
ogg_uint16_t *dequant_tables[64][3][2];
|
||||||
|
OC_ALIGN16(oc_quant_table dequant_table_data[64][3][2]);
|
||||||
|
/*Loop filter strength parameters.*/
|
||||||
|
unsigned char loop_filter_limits[64];
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*The function type used to fill in the chroma plane motion vectors for a
|
||||||
|
macro block when 4 different motion vectors are specified in the luma
|
||||||
|
plane.
|
||||||
|
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||||
|
_lmbmv: The luma macro-block level motion vector to fill in for use in
|
||||||
|
prediction.
|
||||||
|
_lbmvs: The luma block-level motion vectors.*/
|
||||||
|
typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
|
||||||
|
macro block when 4 different motion vectors are specified in the luma
|
||||||
|
plane.*/
|
||||||
|
extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
|
||||||
|
void oc_state_clear(oc_theora_state *_state);
|
||||||
|
void oc_state_accel_init_c(oc_theora_state *_state);
|
||||||
|
void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
|
||||||
|
int _y0,int _yend);
|
||||||
|
void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
|
||||||
|
void oc_state_borders_fill(oc_theora_state *_state,int _refi);
|
||||||
|
void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
|
||||||
|
th_ycbcr_buffer _img);
|
||||||
|
int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
|
||||||
|
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
||||||
|
int _pli,oc_mv _mv);
|
||||||
|
|
||||||
|
void oc_loop_filter_init_c(signed char _bv[256],int _flimit);
|
||||||
|
void oc_state_loop_filter(oc_theora_state *_state,int _frame);
|
||||||
|
# if defined(OC_DUMP_IMAGES)
|
||||||
|
int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
|
||||||
|
const char *_suf);
|
||||||
|
# endif
|
||||||
|
|
||||||
|
/*Default pure-C implementations of shared accelerated functions.*/
|
||||||
|
void oc_frag_copy_c(unsigned char *_dst,
|
||||||
|
const unsigned char *_src,int _src_ystride);
|
||||||
|
void oc_frag_copy_list_c(unsigned char *_dst_frame,
|
||||||
|
const unsigned char *_src_frame,int _ystride,
|
||||||
|
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||||
|
void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
|
||||||
|
const ogg_int16_t _residue[64]);
|
||||||
|
void oc_frag_recon_inter_c(unsigned char *_dst,
|
||||||
|
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||||
|
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
||||||
|
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
||||||
|
void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||||
|
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||||
|
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||||
|
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
|
||||||
|
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||||
|
void oc_restore_fpu_c(void);
|
||||||
|
|
||||||
|
/*We need a way to call a few encoder functions without introducing a link-time
|
||||||
|
dependency into the decoder, while still allowing the old alpha API which
|
||||||
|
does not distinguish between encoder and decoder objects to be used.
|
||||||
|
We do this by placing a function table at the start of the encoder object
|
||||||
|
which can dispatch into the encoder library.
|
||||||
|
We do a similar thing for the decoder in case we ever decide to split off a
|
||||||
|
common base library.*/
|
||||||
|
typedef void (*oc_state_clear_func)(theora_state *_th);
|
||||||
|
typedef int (*oc_state_control_func)(theora_state *th,int _req,
|
||||||
|
void *_buf,size_t _buf_sz);
|
||||||
|
typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
|
||||||
|
ogg_int64_t _granulepos);
|
||||||
|
typedef double (*oc_state_granule_time_func)(theora_state *_th,
|
||||||
|
ogg_int64_t _granulepos);
|
||||||
|
|
||||||
|
|
||||||
|
struct oc_state_dispatch_vtable{
|
||||||
|
oc_state_clear_func clear;
|
||||||
|
oc_state_control_func control;
|
||||||
|
oc_state_granule_frame_func granule_frame;
|
||||||
|
oc_state_granule_time_func granule_time;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
77
thirdparty/libtheora/theora/codec.h
vendored
77
thirdparty/libtheora/theora/codec.h
vendored
@ -16,11 +16,12 @@
|
|||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
/**\mainpage
|
/**\mainpage
|
||||||
*
|
*
|
||||||
* \section intro Introduction
|
* \section intro Introduction
|
||||||
*
|
*
|
||||||
* This is the documentation for <tt>libtheora</tt> C API.
|
* This is the documentation for the <tt>libtheora</tt> C API.
|
||||||
* The current reference
|
*
|
||||||
|
* The \c libtheora package is the current reference
|
||||||
* implementation for <a href="http://www.theora.org/">Theora</a>, a free,
|
* implementation for <a href="http://www.theora.org/">Theora</a>, a free,
|
||||||
* patent-unencumbered video codec.
|
* patent-unencumbered video codec.
|
||||||
* Theora is derived from On2's VP3 codec with additional features and
|
* Theora is derived from On2's VP3 codec with additional features and
|
||||||
@ -30,29 +31,31 @@
|
|||||||
* <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
* <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
||||||
* specification</a>.
|
* specification</a>.
|
||||||
*
|
*
|
||||||
* \subsection Organization
|
* \section Organization
|
||||||
*
|
*
|
||||||
* The functions documented here are actually subdivided into three
|
* The functions documented here are divided between two
|
||||||
* separate libraries:
|
* separate libraries:
|
||||||
* - <tt>libtheoraenc</tt> contains the encoder interface,
|
* - \c libtheoraenc contains the encoder interface,
|
||||||
* described in \ref encfuncs.
|
* described in \ref encfuncs.
|
||||||
* - <tt>libtheoradec</tt> contains the decoder interface and
|
* - \c libtheoradec contains the decoder interface,
|
||||||
* routines shared with the encoder.
|
* described in \ref decfuncs, \n
|
||||||
* You must also link to this if you link to <tt>libtheoraenc</tt>.
|
* and additional \ref basefuncs.
|
||||||
* The routines in this library are described in \ref decfuncs and
|
|
||||||
* \ref basefuncs.
|
|
||||||
* - <tt>libtheora</tt> contains the \ref oldfuncs.
|
|
||||||
*
|
*
|
||||||
* New code should link to <tt>libtheoradec</tt> and, if using encoder
|
* New code should link to \c libtheoradec. If using encoder
|
||||||
* features, <tt>libtheoraenc</tt>. Together these two export both
|
* features, it must also link to \c libtheoraenc.
|
||||||
* the standard and the legacy API, so this is all that is needed by
|
|
||||||
* any code. The older <tt>libtheora</tt> library is provided just for
|
|
||||||
* compatibility with older build configurations.
|
|
||||||
*
|
*
|
||||||
* In general the recommended 1.x API symbols can be distinguished
|
* During initial development, prior to the 1.0 release,
|
||||||
* by their <tt>th_</tt> or <tt>TH_</tt> namespace prefix.
|
* \c libtheora exported a different \ref oldfuncs which
|
||||||
* The older, legacy API uses <tt>theora_</tt> or <tt>OC_</tt>
|
* combined both encode and decode functions.
|
||||||
* prefixes instead.
|
* In general, legacy API symbols can be identified
|
||||||
|
* by their \c theora_ or \c OC_ namespace prefixes.
|
||||||
|
* The current API uses \c th_ or \c TH_ instead.
|
||||||
|
*
|
||||||
|
* While deprecated, \c libtheoraenc and \c libtheoradec
|
||||||
|
* together export the legacy API as well as the one documented above.
|
||||||
|
* Likewise, the legacy \c libtheora included with this package
|
||||||
|
* exports the new 1.x API. Older code and build scripts can therefore
|
||||||
|
* be updated independently to the current scheme.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**\file
|
/**\file
|
||||||
@ -168,7 +171,7 @@ typedef struct{
|
|||||||
typedef th_img_plane th_ycbcr_buffer[3];
|
typedef th_img_plane th_ycbcr_buffer[3];
|
||||||
|
|
||||||
/**Theora bitstream information.
|
/**Theora bitstream information.
|
||||||
* This contains the basic playback parameters for a stream, and corresponds to
|
* This contains the basic playback parameters for a stream, and corresponds to
|
||||||
* the initial 'info' header packet.
|
* the initial 'info' header packet.
|
||||||
* To initialize an encoder, the application fills in this structure and
|
* To initialize an encoder, the application fills in this structure and
|
||||||
* passes it to th_encode_alloc().
|
* passes it to th_encode_alloc().
|
||||||
@ -317,7 +320,7 @@ typedef struct{
|
|||||||
* In filling in this structure, th_decode_headerin() will null-terminate
|
* In filling in this structure, th_decode_headerin() will null-terminate
|
||||||
* the user_comment strings for safety.
|
* the user_comment strings for safety.
|
||||||
* However, the bitstream format itself treats them as 8-bit clean vectors,
|
* However, the bitstream format itself treats them as 8-bit clean vectors,
|
||||||
* possibly containing null characters, and so the length array should be
|
* possibly containing null characters, so the length array should be
|
||||||
* treated as their authoritative length.
|
* treated as their authoritative length.
|
||||||
*/
|
*/
|
||||||
typedef struct th_comment{
|
typedef struct th_comment{
|
||||||
@ -448,7 +451,13 @@ typedef struct{
|
|||||||
|
|
||||||
/**\defgroup basefuncs Functions Shared by Encode and Decode*/
|
/**\defgroup basefuncs Functions Shared by Encode and Decode*/
|
||||||
/*@{*/
|
/*@{*/
|
||||||
/**\name Basic shared functions*/
|
/**\name Basic shared functions
|
||||||
|
* These functions return information about the library itself,
|
||||||
|
* or provide high-level information about codec state
|
||||||
|
* and packet type.
|
||||||
|
*
|
||||||
|
* You must link to \c libtheoradec if you use any of the
|
||||||
|
* functions in this section.*/
|
||||||
/*@{*/
|
/*@{*/
|
||||||
/**Retrieves a human-readable string to identify the library vendor and
|
/**Retrieves a human-readable string to identify the library vendor and
|
||||||
* version.
|
* version.
|
||||||
@ -510,7 +519,12 @@ extern int th_packet_iskeyframe(ogg_packet *_op);
|
|||||||
/*@}*/
|
/*@}*/
|
||||||
|
|
||||||
|
|
||||||
/**\name Functions for manipulating header data*/
|
/**\name Functions for manipulating header data
|
||||||
|
* These functions manipulate the #th_info and #th_comment structures
|
||||||
|
* which describe video parameters and key-value metadata, respectively.
|
||||||
|
*
|
||||||
|
* You must link to \c libtheoradec if you use any of the
|
||||||
|
* functions in this section.*/
|
||||||
/*@{*/
|
/*@{*/
|
||||||
/**Initializes a th_info structure.
|
/**Initializes a th_info structure.
|
||||||
* This should be called on a freshly allocated #th_info structure before
|
* This should be called on a freshly allocated #th_info structure before
|
||||||
@ -537,7 +551,7 @@ extern void th_comment_init(th_comment *_tc);
|
|||||||
* \param _tc The #th_comment struct to add the comment to.
|
* \param _tc The #th_comment struct to add the comment to.
|
||||||
* \param _comment Must be a null-terminated UTF-8 string containing the
|
* \param _comment Must be a null-terminated UTF-8 string containing the
|
||||||
* comment in "TAG=the value" form.*/
|
* comment in "TAG=the value" form.*/
|
||||||
extern void th_comment_add(th_comment *_tc, char *_comment);
|
extern void th_comment_add(th_comment *_tc,const char *_comment);
|
||||||
/**Add a comment to an initialized #th_comment structure.
|
/**Add a comment to an initialized #th_comment structure.
|
||||||
* \note Neither th_comment_add() nor th_comment_add_tag() support
|
* \note Neither th_comment_add() nor th_comment_add_tag() support
|
||||||
* comments containing null values, although the bitstream format does
|
* comments containing null values, although the bitstream format does
|
||||||
@ -545,10 +559,11 @@ extern void th_comment_add(th_comment *_tc, char *_comment);
|
|||||||
* To add such comments you will need to manipulate the #th_comment
|
* To add such comments you will need to manipulate the #th_comment
|
||||||
* structure directly.
|
* structure directly.
|
||||||
* \param _tc The #th_comment struct to add the comment to.
|
* \param _tc The #th_comment struct to add the comment to.
|
||||||
* \param _tag A null-terminated string containing the tag associated with
|
* \param _tag A null-terminated string containing the tag associated with
|
||||||
* the comment.
|
* the comment.
|
||||||
* \param _val The corresponding value as a null-terminated string.*/
|
* \param _val The corresponding value as a null-terminated string.*/
|
||||||
extern void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val);
|
extern void th_comment_add_tag(th_comment *_tc,const char *_tag,
|
||||||
|
const char *_val);
|
||||||
/**Look up a comment value by its tag.
|
/**Look up a comment value by its tag.
|
||||||
* \param _tc An initialized #th_comment structure.
|
* \param _tc An initialized #th_comment structure.
|
||||||
* \param _tag The tag to look up.
|
* \param _tag The tag to look up.
|
||||||
@ -564,15 +579,15 @@ extern void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val);
|
|||||||
* It should not be modified or freed by the application, and
|
* It should not be modified or freed by the application, and
|
||||||
* modifications to the structure may invalidate the pointer.
|
* modifications to the structure may invalidate the pointer.
|
||||||
* \retval NULL If no matching tag is found.*/
|
* \retval NULL If no matching tag is found.*/
|
||||||
extern char *th_comment_query(th_comment *_tc,char *_tag,int _count);
|
extern char *th_comment_query(th_comment *_tc,const char *_tag,int _count);
|
||||||
/**Look up the number of instances of a tag.
|
/**Look up the number of instances of a tag.
|
||||||
* Call this first when querying for a specific tag and then iterate over the
|
* Call this first when querying for a specific tag and then iterate over the
|
||||||
* number of instances with separate calls to th_comment_query() to
|
* number of instances with separate calls to th_comment_query() to
|
||||||
* retrieve all the values for that tag in order.
|
* retrieve all the values for that tag in order.
|
||||||
* \param _tc An initialized #th_comment structure.
|
* \param _tc An initialized #th_comment structure.
|
||||||
* \param _tag The tag to look up.
|
* \param _tag The tag to look up.
|
||||||
* \return The number on instances of this particular tag.*/
|
* \return The number of instances of this particular tag.*/
|
||||||
extern int th_comment_query_count(th_comment *_tc,char *_tag);
|
extern int th_comment_query_count(th_comment *_tc,const char *_tag);
|
||||||
/**Clears a #th_comment structure.
|
/**Clears a #th_comment structure.
|
||||||
* This should be called on a #th_comment structure after it is no longer
|
* This should be called on a #th_comment structure after it is no longer
|
||||||
* needed.
|
* needed.
|
||||||
|
114
thirdparty/libtheora/theora/theora.h
vendored
114
thirdparty/libtheora/theora/theora.h
vendored
@ -34,41 +34,41 @@ extern "C"
|
|||||||
*
|
*
|
||||||
* \section intro Introduction
|
* \section intro Introduction
|
||||||
*
|
*
|
||||||
* This is the documentation for the libtheora legacy C API, declared in
|
* This is the documentation for the libtheora legacy C API, declared in
|
||||||
* the theora.h header, which describes the old interface used before
|
* the theora.h header, which describes the old interface used before
|
||||||
* the 1.0 release. This API was widely deployed for several years and
|
* the 1.0 release. This API was widely deployed for several years and
|
||||||
* remains supported, but for new code we recommend the cleaner API
|
* remains supported, but for new code we recommend the cleaner API
|
||||||
* declared in theoradec.h and theoraenc.h.
|
* declared in theoradec.h and theoraenc.h.
|
||||||
*
|
*
|
||||||
* libtheora is the reference implementation for
|
* libtheora is the reference implementation for
|
||||||
* <a href="http://www.theora.org/">Theora</a>, a free video codec.
|
* <a href="http://www.theora.org/">Theora</a>, a free video codec.
|
||||||
* Theora is derived from On2's VP3 codec with improved integration with
|
* Theora is derived from On2's VP3 codec with improved integration with
|
||||||
* Ogg multimedia formats by <a href="http://www.xiph.org/">Xiph.Org</a>.
|
* Ogg multimedia formats by <a href="http://www.xiph.org/">Xiph.Org</a>.
|
||||||
*
|
*
|
||||||
* \section overview Overview
|
* \section overview Overview
|
||||||
*
|
*
|
||||||
* This library will both decode and encode theora packets to/from raw YUV
|
* This library will both decode and encode theora packets to/from raw YUV
|
||||||
* frames. In either case, the packets will most likely either come from or
|
* frames. In either case, the packets will most likely either come from or
|
||||||
* need to be embedded in an Ogg stream. Use
|
* need to be embedded in an Ogg stream. Use
|
||||||
* <a href="http://xiph.org/ogg/">libogg</a> or
|
* <a href="http://xiph.org/ogg/">libogg</a> or
|
||||||
* <a href="http://www.annodex.net/software/liboggz/index.html">liboggz</a>
|
* <a href="http://www.annodex.net/software/liboggz/index.html">liboggz</a>
|
||||||
* to extract/package these packets.
|
* to extract/package these packets.
|
||||||
*
|
*
|
||||||
* \section decoding Decoding Process
|
* \section decoding Decoding Process
|
||||||
*
|
*
|
||||||
* Decoding can be separated into the following steps:
|
* Decoding can be separated into the following steps:
|
||||||
* -# initialise theora_info and theora_comment structures using
|
* -# initialise theora_info and theora_comment structures using
|
||||||
* theora_info_init() and theora_comment_init():
|
* theora_info_init() and theora_comment_init():
|
||||||
\verbatim
|
\verbatim
|
||||||
theora_info info;
|
theora_info info;
|
||||||
theora_comment comment;
|
theora_comment comment;
|
||||||
|
|
||||||
theora_info_init(&info);
|
theora_info_init(&info);
|
||||||
theora_comment_init(&comment);
|
theora_comment_init(&comment);
|
||||||
\endverbatim
|
\endverbatim
|
||||||
* -# retrieve header packets from Ogg stream (there should be 3) and decode
|
* -# retrieve header packets from Ogg stream (there should be 3) and decode
|
||||||
* into theora_info and theora_comment structures using
|
* into theora_info and theora_comment structures using
|
||||||
* theora_decode_header(). See \ref identification for more information on
|
* theora_decode_header(). See \ref identification for more information on
|
||||||
* identifying which packets are theora packets.
|
* identifying which packets are theora packets.
|
||||||
\verbatim
|
\verbatim
|
||||||
int i;
|
int i;
|
||||||
@ -79,14 +79,14 @@ extern "C"
|
|||||||
}
|
}
|
||||||
\endverbatim
|
\endverbatim
|
||||||
* -# initialise the decoder based on the information retrieved into the
|
* -# initialise the decoder based on the information retrieved into the
|
||||||
* theora_info struct by theora_decode_header(). You will need a
|
* theora_info struct by theora_decode_header(). You will need a
|
||||||
* theora_state struct.
|
* theora_state struct.
|
||||||
\verbatim
|
\verbatim
|
||||||
theora_state state;
|
theora_state state;
|
||||||
|
|
||||||
theora_decode_init(&state, &info);
|
theora_decode_init(&state, &info);
|
||||||
\endverbatim
|
\endverbatim
|
||||||
* -# pass in packets and retrieve decoded frames! See the yuv_buffer
|
* -# pass in packets and retrieve decoded frames! See the yuv_buffer
|
||||||
* documentation for information on how to retrieve raw YUV data.
|
* documentation for information on how to retrieve raw YUV data.
|
||||||
\verbatim
|
\verbatim
|
||||||
yuv_buffer buffer;
|
yuv_buffer buffer;
|
||||||
@ -96,20 +96,20 @@ extern "C"
|
|||||||
theora_decode_YUVout(&state, &buffer);
|
theora_decode_YUVout(&state, &buffer);
|
||||||
}
|
}
|
||||||
\endverbatim
|
\endverbatim
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
* \subsection identification Identifying Theora Packets
|
* \subsection identification Identifying Theora Packets
|
||||||
*
|
*
|
||||||
* All streams inside an Ogg file have a unique serial_no attached to the
|
* All streams inside an Ogg file have a unique serial_no attached to the
|
||||||
* stream. Typically, you will want to
|
* stream. Typically, you will want to
|
||||||
* - retrieve the serial_no for each b_o_s (beginning of stream) page
|
* - retrieve the serial_no for each b_o_s (beginning of stream) page
|
||||||
* encountered within the Ogg file;
|
* encountered within the Ogg file;
|
||||||
* - test the first (only) packet on that page to determine if it is a theora
|
* - test the first (only) packet on that page to determine if it is a theora
|
||||||
* packet;
|
* packet;
|
||||||
* - once you have found a theora b_o_s page then use the retrieved serial_no
|
* - once you have found a theora b_o_s page then use the retrieved serial_no
|
||||||
* to identify future packets belonging to the same theora stream.
|
* to identify future packets belonging to the same theora stream.
|
||||||
*
|
*
|
||||||
* Note that you \e cannot use theora_packet_isheader() to determine if a
|
* Note that you \e cannot use theora_packet_isheader() to determine if a
|
||||||
* packet is a theora packet or not, as this function does not perform any
|
* packet is a theora packet or not, as this function does not perform any
|
||||||
* checking beyond whether a header bit is present. Instead, use the
|
* checking beyond whether a header bit is present. Instead, use the
|
||||||
* theora_decode_header() function and check the return value; or examine the
|
* theora_decode_header() function and check the return value; or examine the
|
||||||
@ -124,9 +124,9 @@ extern "C"
|
|||||||
* A YUV buffer for passing uncompressed frames to and from the codec.
|
* A YUV buffer for passing uncompressed frames to and from the codec.
|
||||||
* This holds a Y'CbCr frame in planar format. The CbCr planes can be
|
* This holds a Y'CbCr frame in planar format. The CbCr planes can be
|
||||||
* subsampled and have their own separate dimensions and row stride
|
* subsampled and have their own separate dimensions and row stride
|
||||||
* offsets. Note that the strides may be negative in some
|
* offsets. Note that the strides may be negative in some
|
||||||
* configurations. For theora the width and height of the largest plane
|
* configurations. For theora the width and height of the largest plane
|
||||||
* must be a multiple of 16. The actual meaningful picture size and
|
* must be a multiple of 16. The actual meaningful picture size and
|
||||||
* offset are stored in the theora_info structure; frames returned by
|
* offset are stored in the theora_info structure; frames returned by
|
||||||
* the decoder may need to be cropped for display.
|
* the decoder may need to be cropped for display.
|
||||||
*
|
*
|
||||||
@ -135,8 +135,8 @@ extern "C"
|
|||||||
* are ordered from left to right.
|
* are ordered from left to right.
|
||||||
*
|
*
|
||||||
* During decode, the yuv_buffer struct is allocated by the user, but all
|
* During decode, the yuv_buffer struct is allocated by the user, but all
|
||||||
* fields (including luma and chroma pointers) are filled by the library.
|
* fields (including luma and chroma pointers) are filled by the library.
|
||||||
* These pointers address library-internal memory and their contents should
|
* These pointers address library-internal memory and their contents should
|
||||||
* not be modified.
|
* not be modified.
|
||||||
*
|
*
|
||||||
* Conversely, during encode the user allocates the struct and fills out all
|
* Conversely, during encode the user allocates the struct and fills out all
|
||||||
@ -179,14 +179,14 @@ typedef enum {
|
|||||||
OC_PF_420, /**< Chroma subsampling by 2 in each direction (4:2:0) */
|
OC_PF_420, /**< Chroma subsampling by 2 in each direction (4:2:0) */
|
||||||
OC_PF_RSVD, /**< Reserved value */
|
OC_PF_RSVD, /**< Reserved value */
|
||||||
OC_PF_422, /**< Horizontal chroma subsampling by 2 (4:2:2) */
|
OC_PF_422, /**< Horizontal chroma subsampling by 2 (4:2:2) */
|
||||||
OC_PF_444, /**< No chroma subsampling at all (4:4:4) */
|
OC_PF_444 /**< No chroma subsampling at all (4:4:4) */
|
||||||
} theora_pixelformat;
|
} theora_pixelformat;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Theora bitstream info.
|
* Theora bitstream info.
|
||||||
* Contains the basic playback parameters for a stream,
|
* Contains the basic playback parameters for a stream,
|
||||||
* corresponding to the initial 'info' header packet.
|
* corresponding to the initial 'info' header packet.
|
||||||
*
|
*
|
||||||
* Encoded theora frames must be a multiple of 16 in width and height.
|
* Encoded theora frames must be a multiple of 16 in width and height.
|
||||||
* To handle other frame sizes, a crop rectangle is specified in
|
* To handle other frame sizes, a crop rectangle is specified in
|
||||||
* frame_height and frame_width, offset_x and offset_y. The offset
|
* frame_height and frame_width, offset_x and offset_y. The offset
|
||||||
@ -198,10 +198,10 @@ typedef enum {
|
|||||||
* fraction. Aspect ratio is also stored as a rational fraction, and
|
* fraction. Aspect ratio is also stored as a rational fraction, and
|
||||||
* refers to the aspect ratio of the frame pixels, not of the
|
* refers to the aspect ratio of the frame pixels, not of the
|
||||||
* overall frame itself.
|
* overall frame itself.
|
||||||
*
|
*
|
||||||
* See <a href="http://svn.xiph.org/trunk/theora/examples/encoder_example.c">
|
* See <a href="http://svn.xiph.org/trunk/theora/examples/encoder_example.c">
|
||||||
* examples/encoder_example.c</a> for usage examples of the
|
* examples/encoder_example.c</a> for usage examples of the
|
||||||
* other paramters and good default settings for the encoder parameters.
|
* other parameters and good default settings for the encoder parameters.
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ogg_uint32_t width; /**< encoded frame width */
|
ogg_uint32_t width; /**< encoded frame width */
|
||||||
@ -253,14 +253,14 @@ typedef struct{
|
|||||||
|
|
||||||
} theora_state;
|
} theora_state;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Comment header metadata.
|
* Comment header metadata.
|
||||||
*
|
*
|
||||||
* This structure holds the in-stream metadata corresponding to
|
* This structure holds the in-stream metadata corresponding to
|
||||||
* the 'comment' header packet.
|
* the 'comment' header packet.
|
||||||
*
|
*
|
||||||
* Meta data is stored as a series of (tag, value) pairs, in
|
* Meta data is stored as a series of (tag, value) pairs, in
|
||||||
* length-encoded string vectors. The first occurrence of the
|
* length-encoded string vectors. The first occurrence of the
|
||||||
* '=' character delimits the tag and value. A particular tag
|
* '=' character delimits the tag and value. A particular tag
|
||||||
* may occur more than once. The character set encoding for
|
* may occur more than once. The character set encoding for
|
||||||
* the strings is always UTF-8, but the tag names are limited
|
* the strings is always UTF-8, but the tag names are limited
|
||||||
@ -285,7 +285,7 @@ typedef struct theora_comment{
|
|||||||
/* \anchor decctlcodes_old
|
/* \anchor decctlcodes_old
|
||||||
* These are the available request codes for theora_control()
|
* These are the available request codes for theora_control()
|
||||||
* when called with a decoder instance.
|
* when called with a decoder instance.
|
||||||
* By convention decoder control codes are odd, to distinguish
|
* By convention decoder control codes are odd, to distinguish
|
||||||
* them from \ref encctlcodes_old "encoder control codes" which
|
* them from \ref encctlcodes_old "encoder control codes" which
|
||||||
* are even.
|
* are even.
|
||||||
*
|
*
|
||||||
@ -306,7 +306,7 @@ typedef struct theora_comment{
|
|||||||
#define TH_DECCTL_GET_PPLEVEL_MAX (1)
|
#define TH_DECCTL_GET_PPLEVEL_MAX (1)
|
||||||
|
|
||||||
/**Set the post-processing level.
|
/**Set the post-processing level.
|
||||||
* Sets the level of post-processing to use when decoding the
|
* Sets the level of post-processing to use when decoding the
|
||||||
* compressed stream. This must be a value between zero (off)
|
* compressed stream. This must be a value between zero (off)
|
||||||
* and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
|
* and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
|
||||||
*/
|
*/
|
||||||
@ -345,9 +345,9 @@ typedef struct theora_comment{
|
|||||||
* \param[in] buf #th_quant_info
|
* \param[in] buf #th_quant_info
|
||||||
* \retval OC_FAULT \a theora_state is <tt>NULL</tt>.
|
* \retval OC_FAULT \a theora_state is <tt>NULL</tt>.
|
||||||
* \retval OC_EINVAL Encoding has already begun, the quantization parameters
|
* \retval OC_EINVAL Encoding has already begun, the quantization parameters
|
||||||
* are not acceptable to this version of the encoder,
|
* are not acceptable to this version of the encoder,
|
||||||
* \a buf is <tt>NULL</tt> and \a buf_sz is not zero,
|
* \a buf is <tt>NULL</tt> and \a buf_sz is not zero,
|
||||||
* or \a buf is non-<tt>NULL</tt> and \a buf_sz is
|
* or \a buf is non-<tt>NULL</tt> and \a buf_sz is
|
||||||
* not <tt>sizeof(#th_quant_info)</tt>.
|
* not <tt>sizeof(#th_quant_info)</tt>.
|
||||||
* \retval OC_IMPL Not supported by this implementation.*/
|
* \retval OC_IMPL Not supported by this implementation.*/
|
||||||
#define TH_ENCCTL_SET_QUANT_PARAMS (2)
|
#define TH_ENCCTL_SET_QUANT_PARAMS (2)
|
||||||
@ -424,7 +424,7 @@ typedef struct theora_comment{
|
|||||||
#define OC_NEWPACKET -25 /**< Packet is an (ignorable) unhandled extension */
|
#define OC_NEWPACKET -25 /**< Packet is an (ignorable) unhandled extension */
|
||||||
#define OC_DUPFRAME 1 /**< Packet is a dropped frame */
|
#define OC_DUPFRAME 1 /**< Packet is a dropped frame */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieve a human-readable string to identify the encoder vendor and version.
|
* Retrieve a human-readable string to identify the encoder vendor and version.
|
||||||
* \returns A version string.
|
* \returns A version string.
|
||||||
*/
|
*/
|
||||||
@ -462,7 +462,7 @@ extern int theora_encode_init(theora_state *th, theora_info *ti);
|
|||||||
extern int theora_encode_YUVin(theora_state *t, yuv_buffer *yuv);
|
extern int theora_encode_YUVin(theora_state *t, yuv_buffer *yuv);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Request the next packet of encoded video.
|
* Request the next packet of encoded video.
|
||||||
* The encoded data is placed in a user-provided ogg_packet structure.
|
* The encoded data is placed in a user-provided ogg_packet structure.
|
||||||
* \param t A theora_state handle previously initialized for encoding.
|
* \param t A theora_state handle previously initialized for encoding.
|
||||||
* \param last_p whether this is the last packet the encoder should produce.
|
* \param last_p whether this is the last packet the encoder should produce.
|
||||||
@ -496,7 +496,11 @@ extern int theora_encode_header(theora_state *t, ogg_packet *op);
|
|||||||
* \param op An ogg_packet structure to fill. libtheora will set all
|
* \param op An ogg_packet structure to fill. libtheora will set all
|
||||||
* elements of this structure, including a pointer to the encoded
|
* elements of this structure, including a pointer to the encoded
|
||||||
* comment data. The memory for the comment data is owned by
|
* comment data. The memory for the comment data is owned by
|
||||||
* libtheora.
|
* the application, and must be freed by it using _ogg_free().
|
||||||
|
* On some systems (such as Windows when using dynamic linking), this
|
||||||
|
* may mean the free is executed in a different module from the
|
||||||
|
* malloc, which will crash; there is no way to free this memory on
|
||||||
|
* such systems.
|
||||||
* \retval 0 Success
|
* \retval 0 Success
|
||||||
*/
|
*/
|
||||||
extern int theora_encode_comment(theora_comment *tc, ogg_packet *op);
|
extern int theora_encode_comment(theora_comment *tc, ogg_packet *op);
|
||||||
@ -581,8 +585,8 @@ extern int theora_decode_packetin(theora_state *th,ogg_packet *op);
|
|||||||
* \param th A theora_state handle previously initialized for decoding.
|
* \param th A theora_state handle previously initialized for decoding.
|
||||||
* \param yuv A yuv_buffer in which libtheora should place the decoded data.
|
* \param yuv A yuv_buffer in which libtheora should place the decoded data.
|
||||||
* Note that the buffer struct itself is allocated by the user, but
|
* Note that the buffer struct itself is allocated by the user, but
|
||||||
* that the luma and chroma pointers will be filled in by the
|
* that the luma and chroma pointers will be filled in by the
|
||||||
* library. Also note that these luma and chroma regions should be
|
* library. Also note that these luma and chroma regions should be
|
||||||
* considered read-only by the user.
|
* considered read-only by the user.
|
||||||
* \retval 0 Success
|
* \retval 0 Success
|
||||||
*/
|
*/
|
||||||
@ -617,22 +621,22 @@ extern int theora_packet_iskeyframe(ogg_packet *op);
|
|||||||
/**
|
/**
|
||||||
* Report the granulepos shift radix
|
* Report the granulepos shift radix
|
||||||
*
|
*
|
||||||
* When embedded in Ogg, Theora uses a two-part granulepos,
|
* When embedded in Ogg, Theora uses a two-part granulepos,
|
||||||
* splitting the 64-bit field into two pieces. The more-significant
|
* splitting the 64-bit field into two pieces. The more-significant
|
||||||
* section represents the frame count at the last keyframe,
|
* section represents the frame count at the last keyframe,
|
||||||
* and the less-significant section represents the count of
|
* and the less-significant section represents the count of
|
||||||
* frames since the last keyframe. In this way the overall
|
* frames since the last keyframe. In this way the overall
|
||||||
* field is still non-decreasing with time, but usefully encodes
|
* field is still non-decreasing with time, but usefully encodes
|
||||||
* a pointer to the last keyframe, which is necessary for
|
* a pointer to the last keyframe, which is necessary for
|
||||||
* correctly restarting decode after a seek.
|
* correctly restarting decode after a seek.
|
||||||
*
|
*
|
||||||
* This function reports the number of bits used to represent
|
* This function reports the number of bits used to represent
|
||||||
* the distance to the last keyframe, and thus how the granulepos
|
* the distance to the last keyframe, and thus how the granulepos
|
||||||
* field must be shifted or masked to obtain the two parts.
|
* field must be shifted or masked to obtain the two parts.
|
||||||
*
|
*
|
||||||
* Since libtheora returns compressed data in an ogg_packet
|
* Since libtheora returns compressed data in an ogg_packet
|
||||||
* structure, this may be generally useful even if the Theora
|
* structure, this may be generally useful even if the Theora
|
||||||
* packets are not being used in an Ogg container.
|
* packets are not being used in an Ogg container.
|
||||||
*
|
*
|
||||||
* \param ti A previously initialized theora_info struct
|
* \param ti A previously initialized theora_info struct
|
||||||
* \returns The bit shift dividing the two granulepos fields
|
* \returns The bit shift dividing the two granulepos fields
|
||||||
@ -644,7 +648,7 @@ int theora_granule_shift(theora_info *ti);
|
|||||||
/**
|
/**
|
||||||
* Convert a granulepos to an absolute frame index, starting at 0.
|
* Convert a granulepos to an absolute frame index, starting at 0.
|
||||||
* The granulepos is interpreted in the context of a given theora_state handle.
|
* The granulepos is interpreted in the context of a given theora_state handle.
|
||||||
*
|
*
|
||||||
* Note that while the granulepos encodes the frame count (i.e. starting
|
* Note that while the granulepos encodes the frame count (i.e. starting
|
||||||
* from 1) this call returns the frame index, starting from zero. Thus
|
* from 1) this call returns the frame index, starting from zero. Thus
|
||||||
* One can calculate the presentation time by multiplying the index by
|
* One can calculate the presentation time by multiplying the index by
|
||||||
@ -670,9 +674,7 @@ extern ogg_int64_t theora_granule_frame(theora_state *th,ogg_int64_t granulepos)
|
|||||||
* This is the "end time" for the frame, or the latest time it should
|
* This is the "end time" for the frame, or the latest time it should
|
||||||
* be displayed.
|
* be displayed.
|
||||||
* It is not the presentation time.
|
* It is not the presentation time.
|
||||||
* \retval -1. The given granulepos is undefined (i.e. negative), or
|
* \retval -1. The given granulepos is undefined (i.e. negative).
|
||||||
* \retval -1. The function has been disabled because floating
|
|
||||||
* point support is not available.
|
|
||||||
*/
|
*/
|
||||||
extern double theora_granule_time(theora_state *th,ogg_int64_t granulepos);
|
extern double theora_granule_time(theora_state *th,ogg_int64_t granulepos);
|
||||||
|
|
||||||
@ -699,7 +701,7 @@ extern void theora_clear(theora_state *t);
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize an allocated theora_comment structure
|
* Initialize an allocated theora_comment structure
|
||||||
* \param tc An allocated theora_comment structure
|
* \param tc An allocated theora_comment structure
|
||||||
**/
|
**/
|
||||||
extern void theora_comment_init(theora_comment *tc);
|
extern void theora_comment_init(theora_comment *tc);
|
||||||
|
|
||||||
@ -720,7 +722,7 @@ extern void theora_comment_add(theora_comment *tc, char *comment);
|
|||||||
/**
|
/**
|
||||||
* Add a comment to an initialized theora_comment structure.
|
* Add a comment to an initialized theora_comment structure.
|
||||||
* \param tc A previously initialized theora comment structure
|
* \param tc A previously initialized theora comment structure
|
||||||
* \param tag A null-terminated string containing the tag
|
* \param tag A null-terminated string containing the tag
|
||||||
* associated with the comment.
|
* associated with the comment.
|
||||||
* \param value The corresponding value as a null-terminated string
|
* \param value The corresponding value as a null-terminated string
|
||||||
*
|
*
|
||||||
@ -752,9 +754,9 @@ extern char *theora_comment_query(theora_comment *tc, char *tag, int count);
|
|||||||
* \param tc An initialized theora_comment structure
|
* \param tc An initialized theora_comment structure
|
||||||
* \param tag The tag to look up
|
* \param tag The tag to look up
|
||||||
* \returns The number of instances of a particular tag.
|
* \returns The number of instances of a particular tag.
|
||||||
*
|
*
|
||||||
* Call this first when querying for a specific tag and then iterate
|
* Call this first when querying for a specific tag and then iterate
|
||||||
* over the number of instances with separate calls to
|
* over the number of instances with separate calls to
|
||||||
* theora_comment_query() to retrieve all instances in order.
|
* theora_comment_query() to retrieve all instances in order.
|
||||||
**/
|
**/
|
||||||
extern int theora_comment_query_count(theora_comment *tc, char *tag);
|
extern int theora_comment_query_count(theora_comment *tc, char *tag);
|
||||||
@ -769,7 +771,7 @@ extern void theora_comment_clear(theora_comment *tc);
|
|||||||
* This is used to provide advanced control of the encoding process.
|
* This is used to provide advanced control of the encoding process.
|
||||||
* \param th A #theora_state handle.
|
* \param th A #theora_state handle.
|
||||||
* \param req The control code to process.
|
* \param req The control code to process.
|
||||||
* See \ref encctlcodes_old "the list of available
|
* See \ref encctlcodes_old "the list of available
|
||||||
* control codes" for details.
|
* control codes" for details.
|
||||||
* \param buf The parameters for this control code.
|
* \param buf The parameters for this control code.
|
||||||
* \param buf_sz The size of the parameter buffer.*/
|
* \param buf_sz The size of the parameter buffer.*/
|
||||||
|
22
thirdparty/libtheora/theora/theoradec.h
vendored
22
thirdparty/libtheora/theora/theoradec.h
vendored
@ -92,13 +92,17 @@ extern "C" {
|
|||||||
* <tt>sizeof(th_stripe_callback)</tt>.*/
|
* <tt>sizeof(th_stripe_callback)</tt>.*/
|
||||||
#define TH_DECCTL_SET_STRIPE_CB (7)
|
#define TH_DECCTL_SET_STRIPE_CB (7)
|
||||||
|
|
||||||
/**Enables telemetry and sets the macroblock display mode */
|
/**Sets the macroblock display mode. Set to 0 to disable displaying
|
||||||
|
* macroblocks.*/
|
||||||
#define TH_DECCTL_SET_TELEMETRY_MBMODE (9)
|
#define TH_DECCTL_SET_TELEMETRY_MBMODE (9)
|
||||||
/**Enables telemetry and sets the motion vector display mode */
|
/**Sets the motion vector display mode. Set to 0 to disable displaying motion
|
||||||
|
* vectors.*/
|
||||||
#define TH_DECCTL_SET_TELEMETRY_MV (11)
|
#define TH_DECCTL_SET_TELEMETRY_MV (11)
|
||||||
/**Enables telemetry and sets the adaptive quantization display mode */
|
/**Sets the adaptive quantization display mode. Set to 0 to disable displaying
|
||||||
|
* adaptive quantization. */
|
||||||
#define TH_DECCTL_SET_TELEMETRY_QI (13)
|
#define TH_DECCTL_SET_TELEMETRY_QI (13)
|
||||||
/**Enables telemetry and sets the bitstream breakdown visualization mode */
|
/**Sets the bitstream breakdown visualization mode. Set to 0 to disable
|
||||||
|
* displaying bitstream breakdown.*/
|
||||||
#define TH_DECCTL_SET_TELEMETRY_BITS (15)
|
#define TH_DECCTL_SET_TELEMETRY_BITS (15)
|
||||||
/*@}*/
|
/*@}*/
|
||||||
|
|
||||||
@ -171,7 +175,7 @@ typedef struct th_setup_info th_setup_info;
|
|||||||
/**\defgroup decfuncs Functions for Decoding*/
|
/**\defgroup decfuncs Functions for Decoding*/
|
||||||
/*@{*/
|
/*@{*/
|
||||||
/**\name Functions for decoding
|
/**\name Functions for decoding
|
||||||
* You must link to <tt>libtheoradec</tt> if you use any of the
|
* You must link to <tt>libtheoradec</tt> if you use any of the
|
||||||
* functions in this section.
|
* functions in this section.
|
||||||
*
|
*
|
||||||
* The functions are listed in the order they are used in a typical decode.
|
* The functions are listed in the order they are used in a typical decode.
|
||||||
@ -267,7 +271,10 @@ extern void th_setup_free(th_setup_info *_setup);
|
|||||||
* See \ref decctlcodes "the list of available control codes"
|
* See \ref decctlcodes "the list of available control codes"
|
||||||
* for details.
|
* for details.
|
||||||
* \param _buf The parameters for this control code.
|
* \param _buf The parameters for this control code.
|
||||||
* \param _buf_sz The size of the parameter buffer.*/
|
* \param _buf_sz The size of the parameter buffer.
|
||||||
|
* \return Possible return values depend on the control code used.
|
||||||
|
* See \ref decctlcodes "the list of control codes" for
|
||||||
|
* specific values. Generally 0 indicates success.*/
|
||||||
extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
|
extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
|
||||||
size_t _buf_sz);
|
size_t _buf_sz);
|
||||||
/**Submits a packet containing encoded video data to the decoder.
|
/**Submits a packet containing encoded video data to the decoder.
|
||||||
@ -283,7 +290,8 @@ extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
|
|||||||
* \retval 0 Success.
|
* \retval 0 Success.
|
||||||
* A new decoded frame can be retrieved by calling
|
* A new decoded frame can be retrieved by calling
|
||||||
* th_decode_ycbcr_out().
|
* th_decode_ycbcr_out().
|
||||||
* \retval TH_DUPFRAME The packet represented a dropped (0-byte) frame.
|
* \retval TH_DUPFRAME The packet represented a dropped frame (either a
|
||||||
|
* 0-byte frame or an INTER frame with no coded blocks).
|
||||||
* The player can skip the call to th_decode_ycbcr_out(),
|
* The player can skip the call to th_decode_ycbcr_out(),
|
||||||
* as the contents of the decoded frame buffer have not
|
* as the contents of the decoded frame buffer have not
|
||||||
* changed.
|
* changed.
|
||||||
|
112
thirdparty/libtheora/theora/theoraenc.h
vendored
112
thirdparty/libtheora/theora/theoraenc.h
vendored
@ -43,7 +43,7 @@ extern "C" {
|
|||||||
* <tt>NULL</tt> may be specified to revert to the default tables.
|
* <tt>NULL</tt> may be specified to revert to the default tables.
|
||||||
*
|
*
|
||||||
* \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
|
* \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
|
||||||
* \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL Encoding has already begun or one or more of the given
|
* \retval TH_EINVAL Encoding has already begun or one or more of the given
|
||||||
* tables is not full or prefix-free, \a _buf is
|
* tables is not full or prefix-free, \a _buf is
|
||||||
* <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
|
* <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
|
||||||
@ -57,8 +57,8 @@ extern "C" {
|
|||||||
* <tt>NULL</tt> may be specified to revert to the default parameters.
|
* <tt>NULL</tt> may be specified to revert to the default parameters.
|
||||||
*
|
*
|
||||||
* \param[in] _buf #th_quant_info
|
* \param[in] _buf #th_quant_info
|
||||||
* \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL Encoding has already begun, \a _buf is
|
* \retval TH_EINVAL Encoding has already begun, \a _buf is
|
||||||
* <tt>NULL</tt> and \a _buf_sz is not zero,
|
* <tt>NULL</tt> and \a _buf_sz is not zero,
|
||||||
* or \a _buf is non-<tt>NULL</tt> and
|
* or \a _buf is non-<tt>NULL</tt> and
|
||||||
* \a _buf_sz is not <tt>sizeof(#th_quant_info)</tt>.
|
* \a _buf_sz is not <tt>sizeof(#th_quant_info)</tt>.
|
||||||
@ -73,7 +73,7 @@ extern "C" {
|
|||||||
* \param[in] _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
|
* \param[in] _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
|
||||||
* frames.
|
* frames.
|
||||||
* \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
|
* \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
|
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
|
||||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||||
#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
|
#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
|
||||||
@ -101,7 +101,7 @@ extern "C" {
|
|||||||
* 4:2:0, the picture region is smaller than the full frame,
|
* 4:2:0, the picture region is smaller than the full frame,
|
||||||
* or if encoding has begun, preventing the quantization
|
* or if encoding has begun, preventing the quantization
|
||||||
* tables and codebooks from being set.
|
* tables and codebooks from being set.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||||
#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
|
#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
|
||||||
@ -114,7 +114,7 @@ extern "C" {
|
|||||||
* the current encoding mode (VBR vs. constant quality, etc.).
|
* the current encoding mode (VBR vs. constant quality, etc.).
|
||||||
*
|
*
|
||||||
* \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
|
* \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||||
* encoding mode.*/
|
* encoding mode.*/
|
||||||
@ -124,7 +124,7 @@ extern "C" {
|
|||||||
*
|
*
|
||||||
* \param[in] _buf <tt>int</tt>: The new encoding speed level.
|
* \param[in] _buf <tt>int</tt>: The new encoding speed level.
|
||||||
* 0 is slowest, larger values use less CPU.
|
* 0 is slowest, larger values use less CPU.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
|
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
|
||||||
* encoding speed level is out of bounds.
|
* encoding speed level is out of bounds.
|
||||||
* The maximum encoding speed level may be
|
* The maximum encoding speed level may be
|
||||||
@ -142,7 +142,7 @@ extern "C" {
|
|||||||
*
|
*
|
||||||
* \param[out] _buf <tt>int</tt>: The current encoding speed level.
|
* \param[out] _buf <tt>int</tt>: The current encoding speed level.
|
||||||
* 0 is slowest, larger values use less CPU.
|
* 0 is slowest, larger values use less CPU.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||||
* encoding mode.*/
|
* encoding mode.*/
|
||||||
@ -162,7 +162,7 @@ extern "C" {
|
|||||||
*
|
*
|
||||||
* \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
|
* \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
|
||||||
* If this is negative or zero, no duplicates will be produced.
|
* If this is negative or zero, no duplicates will be produced.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
|
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
|
||||||
* number of duplicates is greater than or equal to the
|
* number of duplicates is greater than or equal to the
|
||||||
* maximum keyframe interval.
|
* maximum keyframe interval.
|
||||||
@ -187,7 +187,7 @@ extern "C" {
|
|||||||
* use.
|
* use.
|
||||||
* - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
|
* - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
|
||||||
* later.
|
* later.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
|
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
|
||||||
* is not enabled.
|
* is not enabled.
|
||||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||||
@ -211,7 +211,7 @@ extern "C" {
|
|||||||
* \param[in] _buf <tt>int</tt>: Requested size of the reservoir measured in
|
* \param[in] _buf <tt>int</tt>: Requested size of the reservoir measured in
|
||||||
* frames.
|
* frames.
|
||||||
* \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
|
* \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
|
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
|
||||||
* is not enabled. The buffer has an implementation
|
* is not enabled. The buffer has an implementation
|
||||||
* defined minimum and maximum size and the value in _buf
|
* defined minimum and maximum size and the value in _buf
|
||||||
@ -243,7 +243,7 @@ extern "C" {
|
|||||||
* application.
|
* application.
|
||||||
* \retval >=0 The number of bytes of metric data available in the
|
* \retval >=0 The number of bytes of metric data available in the
|
||||||
* returned buffer.
|
* returned buffer.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
|
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
|
||||||
* bitrate has been set, or the first call was made after
|
* bitrate has been set, or the first call was made after
|
||||||
* the first frame was submitted for encoding.
|
* the first frame was submitted for encoding.
|
||||||
@ -283,7 +283,7 @@ extern "C" {
|
|||||||
* of bytes consumed.
|
* of bytes consumed.
|
||||||
* \retval >0 The number of bytes of metric data required/consumed.
|
* \retval >0 The number of bytes of metric data required/consumed.
|
||||||
* \retval 0 No more data is required before the next frame.
|
* \retval 0 No more data is required before the next frame.
|
||||||
* \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL No target bitrate has been set, or the first call was
|
* \retval TH_EINVAL No target bitrate has been set, or the first call was
|
||||||
* made after the first frame was submitted for
|
* made after the first frame was submitted for
|
||||||
* encoding.
|
* encoding.
|
||||||
@ -306,7 +306,7 @@ extern "C" {
|
|||||||
* \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63,
|
* \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63,
|
||||||
* inclusive.
|
* inclusive.
|
||||||
* \retval 0 Success.
|
* \retval 0 Success.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL A target bitrate has already been specified, or the
|
* \retval TH_EINVAL A target bitrate has already been specified, or the
|
||||||
* quality index was not in the range 0...63.
|
* quality index was not in the range 0...63.
|
||||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||||
@ -328,10 +328,54 @@ extern "C" {
|
|||||||
*
|
*
|
||||||
* \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second.
|
* \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second.
|
||||||
* \retval 0 Success.
|
* \retval 0 Success.
|
||||||
* \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL The target bitrate was not positive.
|
* \retval TH_EINVAL The target bitrate was not positive.
|
||||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
* A future version of this library may allow passing 0
|
||||||
|
* to disable rate-controlled mode and return to a
|
||||||
|
* quality-based mode, in which case this function will
|
||||||
|
* not return an error for that value.
|
||||||
|
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||||
#define TH_ENCCTL_SET_BITRATE (30)
|
#define TH_ENCCTL_SET_BITRATE (30)
|
||||||
|
/**Sets the configuration to be compatible with that from the given setup
|
||||||
|
* header.
|
||||||
|
* This sets the Huffman codebooks and quantization parameters to match those
|
||||||
|
* found in the given setup header.
|
||||||
|
* This guarantees that packets encoded by this encoder will be decodable using
|
||||||
|
* a decoder configured with the passed-in setup header.
|
||||||
|
* It does <em>not</em> guarantee that th_encode_flushheader() will produce a
|
||||||
|
* bit-identical setup header, only that they will be compatible.
|
||||||
|
* If you need a bit-identical setup header, then use the one you passed into
|
||||||
|
* this command, and not the one returned by th_encode_flushheader().
|
||||||
|
*
|
||||||
|
* This also does <em>not</em> enable or disable VP3 compatibility; that is not
|
||||||
|
* signaled in the setup header (or anywhere else in the encoded stream), and
|
||||||
|
* is controlled independently by the #TH_ENCCTL_SET_VP3_COMPATIBLE function.
|
||||||
|
* If you wish to enable VP3 compatibility mode <em>and</em> want the codebooks
|
||||||
|
* and quantization parameters to match the given setup header, you should
|
||||||
|
* enable VP3 compatibility before invoking this command, otherwise the
|
||||||
|
* codebooks and quantization parameters will be reset to the VP3 defaults.
|
||||||
|
*
|
||||||
|
* The current encoder does not support Huffman codebooks which do not contain
|
||||||
|
* codewords for all 32 tokens.
|
||||||
|
* Such codebooks are legal, according to the specification, but cannot be
|
||||||
|
* configured with this function.
|
||||||
|
*
|
||||||
|
* \param[in] _buf <tt>unsigned char[]</tt>: The encoded setup header to copy
|
||||||
|
* the configuration from.
|
||||||
|
* This should be the original,
|
||||||
|
* undecoded setup header packet,
|
||||||
|
* and <em>not</em> a #th_setup_info
|
||||||
|
* structure filled in by
|
||||||
|
* th_decode_headerin().
|
||||||
|
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||||
|
* \retval TH_EINVAL Encoding has already begun, so the codebooks and
|
||||||
|
* quantization parameters cannot be changed, or the
|
||||||
|
* data in the setup header was not supported by this
|
||||||
|
* encoder.
|
||||||
|
* \retval TH_EBADHEADER \a _buf did not contain a valid setup header packet.
|
||||||
|
* \retval TH_ENOTFORMAT \a _buf did not contain a Theora header at all.
|
||||||
|
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||||
|
#define TH_ENCCTL_SET_COMPAT_CONFIG (32)
|
||||||
|
|
||||||
/*@}*/
|
/*@}*/
|
||||||
|
|
||||||
@ -342,7 +386,8 @@ extern "C" {
|
|||||||
/*@{*/
|
/*@{*/
|
||||||
/**Drop frames to keep within bitrate buffer constraints.
|
/**Drop frames to keep within bitrate buffer constraints.
|
||||||
* This can have a severe impact on quality, but is the only way to ensure that
|
* This can have a severe impact on quality, but is the only way to ensure that
|
||||||
* bitrate targets are met at low rates during sudden bursts of activity.*/
|
* bitrate targets are met at low rates during sudden bursts of activity.
|
||||||
|
* It is enabled by default.*/
|
||||||
#define TH_RATECTL_DROP_FRAMES (0x1)
|
#define TH_RATECTL_DROP_FRAMES (0x1)
|
||||||
/**Ignore bitrate buffer overflows.
|
/**Ignore bitrate buffer overflows.
|
||||||
* If the encoder uses so few bits that the reservoir of available bits
|
* If the encoder uses so few bits that the reservoir of available bits
|
||||||
@ -350,14 +395,14 @@ extern "C" {
|
|||||||
* The encoder will not try to use these extra bits in future frames.
|
* The encoder will not try to use these extra bits in future frames.
|
||||||
* At high rates this may cause the result to be undersized, but allows a
|
* At high rates this may cause the result to be undersized, but allows a
|
||||||
* client to play the stream using a finite buffer; it should normally be
|
* client to play the stream using a finite buffer; it should normally be
|
||||||
* enabled.*/
|
* enabled, which is the default.*/
|
||||||
#define TH_RATECTL_CAP_OVERFLOW (0x2)
|
#define TH_RATECTL_CAP_OVERFLOW (0x2)
|
||||||
/**Ignore bitrate buffer underflows.
|
/**Ignore bitrate buffer underflows.
|
||||||
* If the encoder uses so many bits that the reservoir of available bits
|
* If the encoder uses so many bits that the reservoir of available bits
|
||||||
* underflows, ignore the deficit.
|
* underflows, ignore the deficit.
|
||||||
* The encoder will not try to make up these extra bits in future frames.
|
* The encoder will not try to make up these extra bits in future frames.
|
||||||
* At low rates this may cause the result to be oversized; it should normally
|
* At low rates this may cause the result to be oversized; it should normally
|
||||||
* be disabled.*/
|
* be disabled, which is the default.*/
|
||||||
#define TH_RATECTL_CAP_UNDERFLOW (0x4)
|
#define TH_RATECTL_CAP_UNDERFLOW (0x4)
|
||||||
/*@}*/
|
/*@}*/
|
||||||
|
|
||||||
@ -401,8 +446,8 @@ typedef struct th_enc_ctx th_enc_ctx;
|
|||||||
* packets.
|
* packets.
|
||||||
* - For each uncompressed frame:
|
* - For each uncompressed frame:
|
||||||
* - Submit the uncompressed frame via th_encode_ycbcr_in()
|
* - Submit the uncompressed frame via th_encode_ycbcr_in()
|
||||||
* - Repeatedly call th_encode_packetout() to retrieve any video data packets
|
* - Repeatedly call th_encode_packetout() to retrieve any video
|
||||||
* that are ready.
|
* data packets that are ready.
|
||||||
* - Call th_encode_free() to release all encoder memory.*/
|
* - Call th_encode_free() to release all encoder memory.*/
|
||||||
/*@{*/
|
/*@{*/
|
||||||
/**Allocates an encoder instance.
|
/**Allocates an encoder instance.
|
||||||
@ -417,7 +462,10 @@ extern th_enc_ctx *th_encode_alloc(const th_info *_info);
|
|||||||
* See \ref encctlcodes "the list of available control codes"
|
* See \ref encctlcodes "the list of available control codes"
|
||||||
* for details.
|
* for details.
|
||||||
* \param _buf The parameters for this control code.
|
* \param _buf The parameters for this control code.
|
||||||
* \param _buf_sz The size of the parameter buffer.*/
|
* \param _buf_sz The size of the parameter buffer.
|
||||||
|
* \return Possible return values depend on the control code used.
|
||||||
|
* See \ref encctlcodes "the list of control codes" for
|
||||||
|
* specific values. Generally 0 indicates success.*/
|
||||||
extern int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz);
|
extern int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz);
|
||||||
/**Outputs the next header packet.
|
/**Outputs the next header packet.
|
||||||
* This should be called repeatedly after encoder initialization until it
|
* This should be called repeatedly after encoder initialization until it
|
||||||
@ -441,11 +489,25 @@ extern int th_encode_flushheader(th_enc_ctx *_enc,
|
|||||||
/**Submits an uncompressed frame to the encoder.
|
/**Submits an uncompressed frame to the encoder.
|
||||||
* \param _enc A #th_enc_ctx handle.
|
* \param _enc A #th_enc_ctx handle.
|
||||||
* \param _ycbcr A buffer of Y'CbCr data to encode.
|
* \param _ycbcr A buffer of Y'CbCr data to encode.
|
||||||
|
* If the width and height of the buffer matches the frame size
|
||||||
|
* the encoder was initialized with, the encoder will only
|
||||||
|
* reference the portion inside the picture region.
|
||||||
|
* Any data outside this region will be ignored, and need not map
|
||||||
|
* to a valid address.
|
||||||
|
* Alternatively, you can pass a buffer equal to the size of the
|
||||||
|
* picture region, if this is less than the full frame size.
|
||||||
|
* When using subsampled chroma planes, odd picture sizes or odd
|
||||||
|
* picture offsets may require an unexpected chroma plane size,
|
||||||
|
* and their use is generally discouraged, as they will not be
|
||||||
|
* well-supported by players and other media frameworks.
|
||||||
|
* See Section 4.4 of
|
||||||
|
* <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
||||||
|
* specification</a> for details if you wish to use them anyway.
|
||||||
* \retval 0 Success.
|
* \retval 0 Success.
|
||||||
* \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
|
* \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
|
||||||
* \retval TH_EINVAL The buffer size does not match the frame size the encoder
|
* \retval TH_EINVAL The buffer size matches neither the frame size nor the
|
||||||
* was initialized with, or encoding has already
|
* picture size the encoder was initialized with, or
|
||||||
* completed.*/
|
* encoding has already completed.*/
|
||||||
extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
|
extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
|
||||||
/**Retrieves encoded video data packets.
|
/**Retrieves encoded video data packets.
|
||||||
* This should be called repeatedly after each frame is submitted to flush any
|
* This should be called repeatedly after each frame is submitted to flush any
|
||||||
|
1006
thirdparty/libtheora/tokenize.c
vendored
1006
thirdparty/libtheora/tokenize.c
vendored
File diff suppressed because it is too large
Load Diff
290
thirdparty/libtheora/x86/mmxencfrag.c
vendored
290
thirdparty/libtheora/x86/mmxencfrag.c
vendored
@ -65,7 +65,7 @@ unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
|
|||||||
"paddw %%mm6,%%mm0\n\t"
|
"paddw %%mm6,%%mm0\n\t"
|
||||||
"paddw %%mm2,%%mm0\n\t"
|
"paddw %%mm2,%%mm0\n\t"
|
||||||
"movd %%mm0,%[ret]\n\t"
|
"movd %%mm0,%[ret]\n\t"
|
||||||
:[ret]"=a"(ret),[src]"+%r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
|
:[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
|
||||||
:[ystride]"r"((ptrdiff_t)_ystride)
|
:[ystride]"r"((ptrdiff_t)_ystride)
|
||||||
);
|
);
|
||||||
return (unsigned)ret;
|
return (unsigned)ret;
|
||||||
@ -87,7 +87,9 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
|
|||||||
The latter is exactly 1 too large when the low bit of two corresponding \
|
The latter is exactly 1 too large when the low bit of two corresponding \
|
||||||
bytes is only set in one of them. \
|
bytes is only set in one of them. \
|
||||||
Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
|
Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
|
||||||
correct the output of pavgb.*/ \
|
correct the output of pavgb. \
|
||||||
|
TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
|
||||||
|
schedules better; currently, however, this function is unused.*/ \
|
||||||
"movq %%mm0,%%mm6\n\t" \
|
"movq %%mm0,%%mm6\n\t" \
|
||||||
"lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
|
"lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
|
||||||
"pxor %%mm1,%%mm0\n\t" \
|
"pxor %%mm1,%%mm0\n\t" \
|
||||||
@ -153,7 +155,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
OC_SAD2_LOOP
|
OC_SAD2_LOOP
|
||||||
OC_SAD2_LOOP
|
OC_SAD2_LOOP
|
||||||
OC_SAD2_TAIL
|
OC_SAD2_TAIL
|
||||||
:[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+%r"(_ref1),[ref2]"+r"(_ref2)
|
:[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
|
||||||
:[ystride]"r"((ptrdiff_t)_ystride)
|
:[ystride]"r"((ptrdiff_t)_ystride)
|
||||||
);
|
);
|
||||||
return (unsigned)ret;
|
return (unsigned)ret;
|
||||||
@ -163,54 +165,54 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
16-bit difference in %%mm0...%%mm7.*/
|
16-bit difference in %%mm0...%%mm7.*/
|
||||||
#define OC_LOAD_SUB_8x4(_off) \
|
#define OC_LOAD_SUB_8x4(_off) \
|
||||||
"#OC_LOAD_SUB_8x4\n\t" \
|
"#OC_LOAD_SUB_8x4\n\t" \
|
||||||
"movd "_off"(%[src]),%%mm0\n\t" \
|
"movd "#_off"(%[src]),%%mm0\n\t" \
|
||||||
"movd "_off"(%[ref]),%%mm4\n\t" \
|
"movd "#_off"(%[ref]),%%mm4\n\t" \
|
||||||
"movd "_off"(%[src],%[src_ystride]),%%mm1\n\t" \
|
"movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
|
||||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||||
"movd "_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
|
"movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
|
||||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||||
"movd "_off"(%[src]),%%mm2\n\t" \
|
"movd "#_off"(%[src]),%%mm2\n\t" \
|
||||||
"movd "_off"(%[ref]),%%mm7\n\t" \
|
"movd "#_off"(%[ref]),%%mm7\n\t" \
|
||||||
"movd "_off"(%[src],%[src_ystride]),%%mm3\n\t" \
|
"movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
|
||||||
"movd "_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
|
"movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
|
||||||
"punpcklbw %%mm4,%%mm0\n\t" \
|
"punpcklbw %%mm4,%%mm0\n\t" \
|
||||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||||
"punpcklbw %%mm4,%%mm4\n\t" \
|
"punpcklbw %%mm4,%%mm4\n\t" \
|
||||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||||
"psubw %%mm4,%%mm0\n\t" \
|
"psubw %%mm4,%%mm0\n\t" \
|
||||||
"movd "_off"(%[src]),%%mm4\n\t" \
|
"movd "#_off"(%[src]),%%mm4\n\t" \
|
||||||
"movq %%mm0,"_off"*2(%[buf])\n\t" \
|
"movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
|
||||||
"movd "_off"(%[ref]),%%mm0\n\t" \
|
"movd "#_off"(%[ref]),%%mm0\n\t" \
|
||||||
"punpcklbw %%mm5,%%mm1\n\t" \
|
"punpcklbw %%mm5,%%mm1\n\t" \
|
||||||
"punpcklbw %%mm5,%%mm5\n\t" \
|
"punpcklbw %%mm5,%%mm5\n\t" \
|
||||||
"psubw %%mm5,%%mm1\n\t" \
|
"psubw %%mm5,%%mm1\n\t" \
|
||||||
"movd "_off"(%[src],%[src_ystride]),%%mm5\n\t" \
|
"movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm2\n\t" \
|
"punpcklbw %%mm7,%%mm2\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm7\n\t" \
|
"punpcklbw %%mm7,%%mm7\n\t" \
|
||||||
"psubw %%mm7,%%mm2\n\t" \
|
"psubw %%mm7,%%mm2\n\t" \
|
||||||
"movd "_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
|
"movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
|
||||||
"punpcklbw %%mm6,%%mm3\n\t" \
|
"punpcklbw %%mm6,%%mm3\n\t" \
|
||||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||||
"punpcklbw %%mm6,%%mm6\n\t" \
|
"punpcklbw %%mm6,%%mm6\n\t" \
|
||||||
"psubw %%mm6,%%mm3\n\t" \
|
"psubw %%mm6,%%mm3\n\t" \
|
||||||
"movd "_off"(%[src]),%%mm6\n\t" \
|
"movd "#_off"(%[src]),%%mm6\n\t" \
|
||||||
"punpcklbw %%mm0,%%mm4\n\t" \
|
"punpcklbw %%mm0,%%mm4\n\t" \
|
||||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||||
"punpcklbw %%mm0,%%mm0\n\t" \
|
"punpcklbw %%mm0,%%mm0\n\t" \
|
||||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||||
"psubw %%mm0,%%mm4\n\t" \
|
"psubw %%mm0,%%mm4\n\t" \
|
||||||
"movd "_off"(%[ref]),%%mm0\n\t" \
|
"movd "#_off"(%[ref]),%%mm0\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm5\n\t" \
|
"punpcklbw %%mm7,%%mm5\n\t" \
|
||||||
"neg %[src_ystride]\n\t" \
|
"neg %[src_ystride]\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm7\n\t" \
|
"punpcklbw %%mm7,%%mm7\n\t" \
|
||||||
"psubw %%mm7,%%mm5\n\t" \
|
"psubw %%mm7,%%mm5\n\t" \
|
||||||
"movd "_off"(%[src],%[src_ystride]),%%mm7\n\t" \
|
"movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
|
||||||
"punpcklbw %%mm0,%%mm6\n\t" \
|
"punpcklbw %%mm0,%%mm6\n\t" \
|
||||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||||
"punpcklbw %%mm0,%%mm0\n\t" \
|
"punpcklbw %%mm0,%%mm0\n\t" \
|
||||||
"neg %[ref_ystride]\n\t" \
|
"neg %[ref_ystride]\n\t" \
|
||||||
"psubw %%mm0,%%mm6\n\t" \
|
"psubw %%mm0,%%mm6\n\t" \
|
||||||
"movd "_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
|
"movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
|
||||||
"lea (%[src],%[src_ystride],8),%[src]\n\t" \
|
"lea (%[src],%[src_ystride],8),%[src]\n\t" \
|
||||||
"punpcklbw %%mm0,%%mm7\n\t" \
|
"punpcklbw %%mm0,%%mm7\n\t" \
|
||||||
"neg %[src_ystride]\n\t" \
|
"neg %[src_ystride]\n\t" \
|
||||||
@ -218,24 +220,24 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
"lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
|
"lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
|
||||||
"psubw %%mm0,%%mm7\n\t" \
|
"psubw %%mm0,%%mm7\n\t" \
|
||||||
"neg %[ref_ystride]\n\t" \
|
"neg %[ref_ystride]\n\t" \
|
||||||
"movq "_off"*2(%[buf]),%%mm0\n\t" \
|
"movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \
|
||||||
|
|
||||||
/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
|
/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
|
||||||
#define OC_LOAD_8x4(_off) \
|
#define OC_LOAD_8x4(_off) \
|
||||||
"#OC_LOAD_8x4\n\t" \
|
"#OC_LOAD_8x4\n\t" \
|
||||||
"movd "_off"(%[src]),%%mm0\n\t" \
|
"movd "#_off"(%[src]),%%mm0\n\t" \
|
||||||
"movd "_off"(%[src],%[ystride]),%%mm1\n\t" \
|
"movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
|
||||||
"movd "_off"(%[src],%[ystride],2),%%mm2\n\t" \
|
"movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
|
||||||
"pxor %%mm7,%%mm7\n\t" \
|
"pxor %%mm7,%%mm7\n\t" \
|
||||||
"movd "_off"(%[src],%[ystride3]),%%mm3\n\t" \
|
"movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm0\n\t" \
|
"punpcklbw %%mm7,%%mm0\n\t" \
|
||||||
"movd "_off"(%[src4]),%%mm4\n\t" \
|
"movd "#_off"(%[src4]),%%mm4\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm1\n\t" \
|
"punpcklbw %%mm7,%%mm1\n\t" \
|
||||||
"movd "_off"(%[src4],%[ystride]),%%mm5\n\t" \
|
"movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm2\n\t" \
|
"punpcklbw %%mm7,%%mm2\n\t" \
|
||||||
"movd "_off"(%[src4],%[ystride],2),%%mm6\n\t" \
|
"movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm3\n\t" \
|
"punpcklbw %%mm7,%%mm3\n\t" \
|
||||||
"movd "_off"(%[src4],%[ystride3]),%%mm7\n\t" \
|
"movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
|
||||||
"punpcklbw %%mm4,%%mm4\n\t" \
|
"punpcklbw %%mm4,%%mm4\n\t" \
|
||||||
"punpcklbw %%mm5,%%mm5\n\t" \
|
"punpcklbw %%mm5,%%mm5\n\t" \
|
||||||
"psrlw $8,%%mm4\n\t" \
|
"psrlw $8,%%mm4\n\t" \
|
||||||
@ -248,7 +250,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
|
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
|
||||||
The transform is performed in place, except that outputs 0-3 are swapped with
|
The transform is performed in place, except that outputs 0-3 are swapped with
|
||||||
outputs 4-7.
|
outputs 4-7.
|
||||||
Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
|
Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
|
||||||
perform this stage in place with no temporary registers).*/
|
perform this stage in place with no temporary registers).*/
|
||||||
#define OC_HADAMARD_AB_8x4 \
|
#define OC_HADAMARD_AB_8x4 \
|
||||||
"#OC_HADAMARD_AB_8x4\n\t" \
|
"#OC_HADAMARD_AB_8x4\n\t" \
|
||||||
@ -281,7 +283,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
"psubw %%mm5,%%mm7\n\t" \
|
"psubw %%mm5,%%mm7\n\t" \
|
||||||
|
|
||||||
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
|
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
|
||||||
Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
|
Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
|
||||||
place with no temporary registers).*/
|
place with no temporary registers).*/
|
||||||
#define OC_HADAMARD_C_8x4 \
|
#define OC_HADAMARD_C_8x4 \
|
||||||
"#OC_HADAMARD_C_8x4\n\t" \
|
"#OC_HADAMARD_C_8x4\n\t" \
|
||||||
@ -324,8 +326,8 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
|
Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
|
||||||
This implementation is only 26 (+4 for spilling registers).*/ \
|
This implementation is only 26 (+4 for spilling registers).*/ \
|
||||||
"#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
|
"#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
|
||||||
"movq %%mm7,"_r7"(%[buf])\n\t" \
|
"movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
|
||||||
"movq %%mm6,"_r6"(%[buf])\n\t" \
|
"movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
|
||||||
/*mm7={0x7FFF}x4 \
|
/*mm7={0x7FFF}x4 \
|
||||||
mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
|
mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
|
||||||
"pcmpeqb %%mm7,%%mm7\n\t" \
|
"pcmpeqb %%mm7,%%mm7\n\t" \
|
||||||
@ -343,14 +345,14 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
"pmaxsw %%mm5,%%mm4\n\t" \
|
"pmaxsw %%mm5,%%mm4\n\t" \
|
||||||
"paddw %%mm3,%%mm6\n\t" \
|
"paddw %%mm3,%%mm6\n\t" \
|
||||||
"paddw %%mm5,%%mm1\n\t" \
|
"paddw %%mm5,%%mm1\n\t" \
|
||||||
"movq "_r7"(%[buf]),%%mm3\n\t" \
|
"movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \
|
||||||
|
|
||||||
/*Performs the second part of the final stage of the Hadamard transform and
|
/*Performs the second part of the final stage of the Hadamard transform and
|
||||||
summing of absolute values.*/
|
summing of absolute values.*/
|
||||||
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
|
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
|
||||||
"#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
|
"#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
|
||||||
"paddsw %%mm7,%%mm6\n\t" \
|
"paddsw %%mm7,%%mm6\n\t" \
|
||||||
"movq "_r6"(%[buf]),%%mm5\n\t" \
|
"movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
|
||||||
"paddsw %%mm7,%%mm1\n\t" \
|
"paddsw %%mm7,%%mm1\n\t" \
|
||||||
"psubw %%mm6,%%mm2\n\t" \
|
"psubw %%mm6,%%mm2\n\t" \
|
||||||
"psubw %%mm1,%%mm4\n\t" \
|
"psubw %%mm1,%%mm4\n\t" \
|
||||||
@ -391,7 +393,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
#define OC_TRANSPOSE_4x4x2(_off) \
|
#define OC_TRANSPOSE_4x4x2(_off) \
|
||||||
"#OC_TRANSPOSE_4x4x2\n\t" \
|
"#OC_TRANSPOSE_4x4x2\n\t" \
|
||||||
/*First 4x4 transpose:*/ \
|
/*First 4x4 transpose:*/ \
|
||||||
"movq %%mm5,0x10+"_off"(%[buf])\n\t" \
|
"movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
|
||||||
/*mm0 = e3 e2 e1 e0 \
|
/*mm0 = e3 e2 e1 e0 \
|
||||||
mm1 = f3 f2 f1 f0 \
|
mm1 = f3 f2 f1 f0 \
|
||||||
mm2 = g3 g2 g1 g0 \
|
mm2 = g3 g2 g1 g0 \
|
||||||
@ -411,13 +413,13 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
"punpckhdq %%mm2,%%mm1\n\t" \
|
"punpckhdq %%mm2,%%mm1\n\t" \
|
||||||
"movq %%mm3,%%mm2\n\t" \
|
"movq %%mm3,%%mm2\n\t" \
|
||||||
"punpckhdq %%mm5,%%mm3\n\t" \
|
"punpckhdq %%mm5,%%mm3\n\t" \
|
||||||
"movq %%mm0,0x40+"_off"(%[buf])\n\t" \
|
"movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
|
||||||
"punpckldq %%mm5,%%mm2\n\t" \
|
"punpckldq %%mm5,%%mm2\n\t" \
|
||||||
/*mm0 = h0 g0 f0 e0 \
|
/*mm0 = h0 g0 f0 e0 \
|
||||||
mm1 = h1 g1 f1 e1 \
|
mm1 = h1 g1 f1 e1 \
|
||||||
mm2 = h2 g2 f2 e2 \
|
mm2 = h2 g2 f2 e2 \
|
||||||
mm3 = h3 g3 f3 e3*/ \
|
mm3 = h3 g3 f3 e3*/ \
|
||||||
"movq 0x10+"_off"(%[buf]),%%mm5\n\t" \
|
"movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
|
||||||
/*Second 4x4 transpose:*/ \
|
/*Second 4x4 transpose:*/ \
|
||||||
/*mm4 = a3 a2 a1 a0 \
|
/*mm4 = a3 a2 a1 a0 \
|
||||||
mm5 = b3 b2 b1 b0 \
|
mm5 = b3 b2 b1 b0 \
|
||||||
@ -425,11 +427,11 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
mm7 = d3 d2 d1 d0*/ \
|
mm7 = d3 d2 d1 d0*/ \
|
||||||
"movq %%mm6,%%mm0\n\t" \
|
"movq %%mm6,%%mm0\n\t" \
|
||||||
"punpcklwd %%mm7,%%mm6\n\t" \
|
"punpcklwd %%mm7,%%mm6\n\t" \
|
||||||
"movq %%mm1,0x50+"_off"(%[buf])\n\t" \
|
"movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
|
||||||
"punpckhwd %%mm7,%%mm0\n\t" \
|
"punpckhwd %%mm7,%%mm0\n\t" \
|
||||||
"movq %%mm4,%%mm7\n\t" \
|
"movq %%mm4,%%mm7\n\t" \
|
||||||
"punpcklwd %%mm5,%%mm4\n\t" \
|
"punpcklwd %%mm5,%%mm4\n\t" \
|
||||||
"movq %%mm2,0x60+"_off"(%[buf])\n\t" \
|
"movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
|
||||||
"punpckhwd %%mm5,%%mm7\n\t" \
|
"punpckhwd %%mm5,%%mm7\n\t" \
|
||||||
/*mm4 = b1 a1 b0 a0 \
|
/*mm4 = b1 a1 b0 a0 \
|
||||||
mm7 = b3 a3 b2 a2 \
|
mm7 = b3 a3 b2 a2 \
|
||||||
@ -437,7 +439,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
mm0 = d3 c3 d2 c2*/ \
|
mm0 = d3 c3 d2 c2*/ \
|
||||||
"movq %%mm4,%%mm5\n\t" \
|
"movq %%mm4,%%mm5\n\t" \
|
||||||
"punpckldq %%mm6,%%mm4\n\t" \
|
"punpckldq %%mm6,%%mm4\n\t" \
|
||||||
"movq %%mm3,0x70+"_off"(%[buf])\n\t" \
|
"movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
|
||||||
"punpckhdq %%mm6,%%mm5\n\t" \
|
"punpckhdq %%mm6,%%mm5\n\t" \
|
||||||
"movq %%mm7,%%mm6\n\t" \
|
"movq %%mm7,%%mm6\n\t" \
|
||||||
"punpckhdq %%mm0,%%mm7\n\t" \
|
"punpckhdq %%mm0,%%mm7\n\t" \
|
||||||
@ -447,100 +449,102 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
mm6 = d2 c2 b2 a2 \
|
mm6 = d2 c2 b2 a2 \
|
||||||
mm7 = d3 c3 b3 a3*/ \
|
mm7 = d3 c3 b3 a3*/ \
|
||||||
|
|
||||||
static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
|
static unsigned oc_int_frag_satd_mmxext(int *_dc,
|
||||||
int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
|
const unsigned char *_src,int _src_ystride,
|
||||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
const unsigned char *_ref,int _ref_ystride){
|
||||||
ogg_int16_t *bufp;
|
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||||
unsigned ret;
|
unsigned ret;
|
||||||
unsigned ret2;
|
unsigned ret2;
|
||||||
bufp=buf;
|
int dc;
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
OC_LOAD_SUB_8x4("0x00")
|
OC_LOAD_SUB_8x4(0x00)
|
||||||
OC_HADAMARD_8x4
|
OC_HADAMARD_8x4
|
||||||
OC_TRANSPOSE_4x4x2("0x00")
|
OC_TRANSPOSE_4x4x2(0x00)
|
||||||
/*Finish swapping out this 8x4 block to make room for the next one.
|
/*Finish swapping out this 8x4 block to make room for the next one.
|
||||||
mm0...mm3 have been swapped out already.*/
|
mm0...mm3 have been swapped out already.*/
|
||||||
"movq %%mm4,0x00(%[buf])\n\t"
|
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
|
||||||
"movq %%mm5,0x10(%[buf])\n\t"
|
"movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
|
||||||
"movq %%mm6,0x20(%[buf])\n\t"
|
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
|
||||||
"movq %%mm7,0x30(%[buf])\n\t"
|
"movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
|
||||||
OC_LOAD_SUB_8x4("0x04")
|
OC_LOAD_SUB_8x4(0x04)
|
||||||
OC_HADAMARD_8x4
|
OC_HADAMARD_8x4
|
||||||
OC_TRANSPOSE_4x4x2("0x08")
|
OC_TRANSPOSE_4x4x2(0x08)
|
||||||
/*Here the first 4x4 block of output from the last transpose is the second
|
/*Here the first 4x4 block of output from the last transpose is the second
|
||||||
4x4 block of input for the next transform.
|
4x4 block of input for the next transform.
|
||||||
We have cleverly arranged that it already be in the appropriate place, so
|
We have cleverly arranged that it already be in the appropriate place, so
|
||||||
we only have to do half the loads.*/
|
we only have to do half the loads.*/
|
||||||
"movq 0x10(%[buf]),%%mm1\n\t"
|
"movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
|
||||||
"movq 0x20(%[buf]),%%mm2\n\t"
|
"movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
|
||||||
"movq 0x30(%[buf]),%%mm3\n\t"
|
"movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
|
||||||
"movq 0x00(%[buf]),%%mm0\n\t"
|
"movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
|
||||||
OC_HADAMARD_ABS_ACCUM_8x4("0x28","0x38")
|
/*We split out the stages here so we can save the DC coefficient in the
|
||||||
|
middle.*/
|
||||||
|
OC_HADAMARD_AB_8x4
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
|
||||||
|
"movd %%mm1,%[dc]\n\t"
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
|
||||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||||
for the factor of two we dropped + 3 for the vertical accumulation).
|
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||||
Now we finally have to promote things to dwords.
|
Now we finally have to promote things to dwords.
|
||||||
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
||||||
latency of pmaddwd by starting the next series of loads now.*/
|
latency of pmaddwd by starting the next series of loads now.*/
|
||||||
"mov %[thresh],%[ret2]\n\t"
|
|
||||||
"pmaddwd %%mm7,%%mm0\n\t"
|
"pmaddwd %%mm7,%%mm0\n\t"
|
||||||
"movq 0x50(%[buf]),%%mm1\n\t"
|
"movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
|
||||||
"movq 0x58(%[buf]),%%mm5\n\t"
|
"movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
|
||||||
"movq %%mm0,%%mm4\n\t"
|
|
||||||
"movq 0x60(%[buf]),%%mm2\n\t"
|
|
||||||
"punpckhdq %%mm0,%%mm0\n\t"
|
|
||||||
"movq 0x68(%[buf]),%%mm6\n\t"
|
|
||||||
"paddd %%mm0,%%mm4\n\t"
|
|
||||||
"movq 0x70(%[buf]),%%mm3\n\t"
|
|
||||||
"movd %%mm4,%[ret]\n\t"
|
|
||||||
"movq 0x78(%[buf]),%%mm7\n\t"
|
|
||||||
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
|
|
||||||
added to them, and a factor of two removed; correct the final sum here.*/
|
|
||||||
"lea -32(%[ret],%[ret]),%[ret]\n\t"
|
|
||||||
"movq 0x40(%[buf]),%%mm0\n\t"
|
|
||||||
"cmp %[ret2],%[ret]\n\t"
|
|
||||||
"movq 0x48(%[buf]),%%mm4\n\t"
|
|
||||||
"jae 1f\n\t"
|
|
||||||
OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
|
|
||||||
"pmaddwd %%mm7,%%mm0\n\t"
|
|
||||||
/*There isn't much to stick in here to hide the latency this time, but the
|
|
||||||
alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
|
|
||||||
latency is even worse.*/
|
|
||||||
"sub $32,%[ret]\n\t"
|
|
||||||
"movq %%mm0,%%mm4\n\t"
|
"movq %%mm0,%%mm4\n\t"
|
||||||
|
"movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
|
||||||
"punpckhdq %%mm0,%%mm0\n\t"
|
"punpckhdq %%mm0,%%mm0\n\t"
|
||||||
|
"movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
|
||||||
"paddd %%mm0,%%mm4\n\t"
|
"paddd %%mm0,%%mm4\n\t"
|
||||||
|
"movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
|
||||||
"movd %%mm4,%[ret2]\n\t"
|
"movd %%mm4,%[ret2]\n\t"
|
||||||
"lea (%[ret],%[ret2],2),%[ret]\n\t"
|
"movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
|
||||||
".p2align 4,,15\n\t"
|
"movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
|
||||||
"1:\n\t"
|
"movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
|
||||||
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
|
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
||||||
|
"pmaddwd %%mm7,%%mm0\n\t"
|
||||||
|
/*Subtract abs(dc) from 2*ret2.*/
|
||||||
|
"movsx %w[dc],%[dc]\n\t"
|
||||||
|
"cdq\n\t"
|
||||||
|
"lea (%[ret],%[ret2],2),%[ret2]\n\t"
|
||||||
|
"movq %%mm0,%%mm4\n\t"
|
||||||
|
"punpckhdq %%mm0,%%mm0\n\t"
|
||||||
|
"xor %[dc],%[ret]\n\t"
|
||||||
|
"paddd %%mm0,%%mm4\n\t"
|
||||||
|
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
|
||||||
|
added to them, a factor of two removed, and the DC value included;
|
||||||
|
correct the final sum here.*/
|
||||||
|
"sub %[ret],%[ret2]\n\t"
|
||||||
|
"movd %%mm4,%[ret]\n\t"
|
||||||
|
"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
|
||||||
|
/*Although it looks like we're using 8 registers here, gcc can alias %[ret]
|
||||||
and %[ret2] with some of the inputs, since for once we don't write to
|
and %[ret2] with some of the inputs, since for once we don't write to
|
||||||
them until after we're done using everything but %[buf] (which is also
|
them until after we're done using everything but %[buf].*/
|
||||||
listed as an output to ensure gcc _doesn't_ alias them against it).*/
|
|
||||||
/*Note that _src_ystride and _ref_ystride must be given non-overlapping
|
/*Note that _src_ystride and _ref_ystride must be given non-overlapping
|
||||||
constraints, otherwise if gcc can prove they're equal it will allocate
|
constraints, otherwise if gcc can prove they're equal it will allocate
|
||||||
them to the same register (which is bad); _src and _ref face a similar
|
them to the same register (which is bad); _src and _ref face a similar
|
||||||
problem, though those are never actually the same.*/
|
problem, though those are never actually the same.*/
|
||||||
:[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
|
:[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
|
||||||
|
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
|
||||||
:[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
|
:[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
|
||||||
[ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride),
|
[ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
|
||||||
[thresh]"m"(_thresh)
|
|
||||||
/*We have to use neg, so we actually clobber the condition codes for once
|
/*We have to use neg, so we actually clobber the condition codes for once
|
||||||
(not to mention cmp, sub, and add).*/
|
(not to mention cmp, sub, and add).*/
|
||||||
:"cc"
|
:"cc"
|
||||||
);
|
);
|
||||||
|
*_dc=dc;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride,unsigned _thresh){
|
const unsigned char *_ref,int _ystride){
|
||||||
return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
|
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
|
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
|
||||||
we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
|
we can share code with oc_enc_frag_satd2_mmxext().*/
|
||||||
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
||||||
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
|
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
/*Load the first 3 rows.*/
|
/*Load the first 3 rows.*/
|
||||||
@ -649,55 +653,53 @@ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
|||||||
"psubb %%mm4,%%mm2\n\t"
|
"psubb %%mm4,%%mm2\n\t"
|
||||||
/*%%mm2 (row 7) is done, write it out.*/
|
/*%%mm2 (row 7) is done, write it out.*/
|
||||||
"movq %%mm2,(%[dst],%[dst_ystride])\n\t"
|
"movq %%mm2,(%[dst],%[dst_ystride])\n\t"
|
||||||
:[dst]"+r"(_dst),[src1]"+%r"(_src1),[src2]"+r"(_src2)
|
:[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
|
||||||
:[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
|
:[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
|
||||||
[src_ystride]"r"((ptrdiff_t)_src_ystride)
|
[src_ystride]"r"((ptrdiff_t)_src_ystride)
|
||||||
:"memory"
|
:"memory"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
|
||||||
unsigned _thresh){
|
|
||||||
OC_ALIGN8(unsigned char ref[64]);
|
OC_ALIGN8(unsigned char ref[64]);
|
||||||
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
|
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
|
||||||
return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
|
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
|
||||||
int _ystride){
|
const unsigned char *_src,int _ystride){
|
||||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||||
ogg_int16_t *bufp;
|
unsigned ret;
|
||||||
unsigned ret;
|
unsigned ret2;
|
||||||
unsigned ret2;
|
int dc;
|
||||||
bufp=buf;
|
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
OC_LOAD_8x4("0x00")
|
OC_LOAD_8x4(0x00)
|
||||||
OC_HADAMARD_8x4
|
OC_HADAMARD_8x4
|
||||||
OC_TRANSPOSE_4x4x2("0x00")
|
OC_TRANSPOSE_4x4x2(0x00)
|
||||||
/*Finish swapping out this 8x4 block to make room for the next one.
|
/*Finish swapping out this 8x4 block to make room for the next one.
|
||||||
mm0...mm3 have been swapped out already.*/
|
mm0...mm3 have been swapped out already.*/
|
||||||
"movq %%mm4,0x00(%[buf])\n\t"
|
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
|
||||||
"movq %%mm5,0x10(%[buf])\n\t"
|
"movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
|
||||||
"movq %%mm6,0x20(%[buf])\n\t"
|
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
|
||||||
"movq %%mm7,0x30(%[buf])\n\t"
|
"movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
|
||||||
OC_LOAD_8x4("0x04")
|
OC_LOAD_8x4(0x04)
|
||||||
OC_HADAMARD_8x4
|
OC_HADAMARD_8x4
|
||||||
OC_TRANSPOSE_4x4x2("0x08")
|
OC_TRANSPOSE_4x4x2(0x08)
|
||||||
/*Here the first 4x4 block of output from the last transpose is the second
|
/*Here the first 4x4 block of output from the last transpose is the second
|
||||||
4x4 block of input for the next transform.
|
4x4 block of input for the next transform.
|
||||||
We have cleverly arranged that it already be in the appropriate place, so
|
We have cleverly arranged that it already be in the appropriate place, so
|
||||||
we only have to do half the loads.*/
|
we only have to do half the loads.*/
|
||||||
"movq 0x10(%[buf]),%%mm1\n\t"
|
"movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
|
||||||
"movq 0x20(%[buf]),%%mm2\n\t"
|
"movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
|
||||||
"movq 0x30(%[buf]),%%mm3\n\t"
|
"movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
|
||||||
"movq 0x00(%[buf]),%%mm0\n\t"
|
"movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
|
||||||
/*We split out the stages here so we can save the DC coefficient in the
|
/*We split out the stages here so we can save the DC coefficient in the
|
||||||
middle.*/
|
middle.*/
|
||||||
OC_HADAMARD_AB_8x4
|
OC_HADAMARD_AB_8x4
|
||||||
OC_HADAMARD_C_ABS_ACCUM_A_8x4("0x28","0x38")
|
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
|
||||||
"movd %%mm1,%[ret]\n\t"
|
"movd %%mm1,%[dc]\n\t"
|
||||||
OC_HADAMARD_C_ABS_ACCUM_B_8x4("0x28","0x38")
|
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
|
||||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||||
for the factor of two we dropped + 3 for the vertical accumulation).
|
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||||
@ -705,41 +707,43 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
|
|||||||
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
||||||
latency of pmaddwd by starting the next series of loads now.*/
|
latency of pmaddwd by starting the next series of loads now.*/
|
||||||
"pmaddwd %%mm7,%%mm0\n\t"
|
"pmaddwd %%mm7,%%mm0\n\t"
|
||||||
"movq 0x50(%[buf]),%%mm1\n\t"
|
"movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
|
||||||
"movq 0x58(%[buf]),%%mm5\n\t"
|
"movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
|
||||||
"movq 0x60(%[buf]),%%mm2\n\t"
|
"movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
|
||||||
"movq %%mm0,%%mm4\n\t"
|
"movq %%mm0,%%mm4\n\t"
|
||||||
"movq 0x68(%[buf]),%%mm6\n\t"
|
"movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
|
||||||
"punpckhdq %%mm0,%%mm0\n\t"
|
"punpckhdq %%mm0,%%mm0\n\t"
|
||||||
"movq 0x70(%[buf]),%%mm3\n\t"
|
"movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
|
||||||
"paddd %%mm0,%%mm4\n\t"
|
"paddd %%mm0,%%mm4\n\t"
|
||||||
"movq 0x78(%[buf]),%%mm7\n\t"
|
"movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
|
||||||
"movd %%mm4,%[ret2]\n\t"
|
"movd %%mm4,%[ret]\n\t"
|
||||||
"movq 0x40(%[buf]),%%mm0\n\t"
|
"movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
|
||||||
"movq 0x48(%[buf]),%%mm4\n\t"
|
"movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
|
||||||
OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
|
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
||||||
"pmaddwd %%mm7,%%mm0\n\t"
|
"pmaddwd %%mm7,%%mm0\n\t"
|
||||||
/*We assume that the DC coefficient is always positive (which is true,
|
/*We assume that the DC coefficient is always positive (which is true,
|
||||||
because the input to the INTRA transform was not a difference).*/
|
because the input to the INTRA transform was not a difference).*/
|
||||||
"movzx %w[ret],%[ret]\n\t"
|
"movzx %w[dc],%[dc]\n\t"
|
||||||
"add %[ret2],%[ret2]\n\t"
|
"add %[ret],%[ret]\n\t"
|
||||||
"sub %[ret],%[ret2]\n\t"
|
"sub %[dc],%[ret]\n\t"
|
||||||
"movq %%mm0,%%mm4\n\t"
|
"movq %%mm0,%%mm4\n\t"
|
||||||
"punpckhdq %%mm0,%%mm0\n\t"
|
"punpckhdq %%mm0,%%mm0\n\t"
|
||||||
"paddd %%mm0,%%mm4\n\t"
|
"paddd %%mm0,%%mm4\n\t"
|
||||||
"movd %%mm4,%[ret]\n\t"
|
"movd %%mm4,%[ret2]\n\t"
|
||||||
"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
|
"lea -64(%[ret],%[ret2],2),%[ret]\n\t"
|
||||||
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
|
/*Although it looks like we're using 8 registers here, gcc can alias %[ret]
|
||||||
and %[ret2] with some of the inputs, since for once we don't write to
|
and %[ret2] with some of the inputs, since for once we don't write to
|
||||||
them until after we're done using everything but %[buf] (which is also
|
them until after we're done using everything but %[buf] (which is also
|
||||||
listed as an output to ensure gcc _doesn't_ alias them against it).*/
|
listed as an output to ensure gcc _doesn't_ alias them against it).*/
|
||||||
:[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
|
:[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
|
||||||
|
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
|
||||||
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
|
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
|
||||||
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
|
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
|
||||||
/*We have to use sub, so we actually clobber the condition codes for once
|
/*We have to use sub, so we actually clobber the condition codes for once
|
||||||
(not to mention add).*/
|
(not to mention add).*/
|
||||||
:"cc"
|
:"cc"
|
||||||
);
|
);
|
||||||
|
*_dc=dc;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
129
thirdparty/libtheora/x86/mmxfdct.c
vendored
129
thirdparty/libtheora/x86/mmxfdct.c
vendored
@ -12,6 +12,7 @@
|
|||||||
/*MMX fDCT implementation for x86_32*/
|
/*MMX fDCT implementation for x86_32*/
|
||||||
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
|
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
|
||||||
#include "x86enc.h"
|
#include "x86enc.h"
|
||||||
|
#include "x86zigzag.h"
|
||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
@ -462,8 +463,9 @@
|
|||||||
mm7 = d3 c3 b3 a3*/ \
|
mm7 = d3 c3 b3 a3*/ \
|
||||||
|
|
||||||
/*MMX implementation of the fDCT.*/
|
/*MMX implementation of the fDCT.*/
|
||||||
void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||||
ptrdiff_t a;
|
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||||
|
ptrdiff_t a;
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
/*Add two extra bits of working precision to improve accuracy; any more and
|
/*Add two extra bits of working precision to improve accuracy; any more and
|
||||||
we could overflow.*/
|
we could overflow.*/
|
||||||
@ -586,77 +588,88 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
|||||||
"movq 0x30(%[y]),%%mm3\n\t"
|
"movq 0x30(%[y]),%%mm3\n\t"
|
||||||
OC_FDCT_STAGE1_8x4
|
OC_FDCT_STAGE1_8x4
|
||||||
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
|
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
|
||||||
OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
|
/*mm2={-2}x4*/
|
||||||
/*mm0={-2}x4*/
|
"pcmpeqw %%mm2,%%mm2\n\t"
|
||||||
"pcmpeqw %%mm0,%%mm0\n\t"
|
"paddw %%mm2,%%mm2\n\t"
|
||||||
"paddw %%mm0,%%mm0\n\t"
|
/*Round and store the results (no transpose).*/
|
||||||
/*Round the results.*/
|
"movq 0x10(%[y]),%%mm7\n\t"
|
||||||
"psubw %%mm0,%%mm1\n\t"
|
"psubw %%mm2,%%mm4\n\t"
|
||||||
"psubw %%mm0,%%mm2\n\t"
|
"psubw %%mm2,%%mm6\n\t"
|
||||||
"psraw $2,%%mm1\n\t"
|
|
||||||
"psubw %%mm0,%%mm3\n\t"
|
|
||||||
"movq %%mm1,0x18(%[y])\n\t"
|
|
||||||
"psraw $2,%%mm2\n\t"
|
|
||||||
"psubw %%mm0,%%mm4\n\t"
|
|
||||||
"movq 0x08(%[y]),%%mm1\n\t"
|
|
||||||
"psraw $2,%%mm3\n\t"
|
|
||||||
"psubw %%mm0,%%mm5\n\t"
|
|
||||||
"psraw $2,%%mm4\n\t"
|
"psraw $2,%%mm4\n\t"
|
||||||
"psubw %%mm0,%%mm6\n\t"
|
"psubw %%mm2,%%mm0\n\t"
|
||||||
"psraw $2,%%mm5\n\t"
|
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
|
||||||
"psubw %%mm0,%%mm7\n\t"
|
"movq 0x30(%[y]),%%mm4\n\t"
|
||||||
"psraw $2,%%mm6\n\t"
|
"psraw $2,%%mm6\n\t"
|
||||||
"psubw %%mm0,%%mm1\n\t"
|
"psubw %%mm2,%%mm5\n\t"
|
||||||
"psraw $2,%%mm7\n\t"
|
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
|
||||||
"movq 0x40(%[y]),%%mm0\n\t"
|
"psraw $2,%%mm0\n\t"
|
||||||
|
"psubw %%mm2,%%mm3\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
|
||||||
|
"psraw $2,%%mm5\n\t"
|
||||||
|
"psubw %%mm2,%%mm1\n\t"
|
||||||
|
"movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
|
||||||
|
"psraw $2,%%mm3\n\t"
|
||||||
|
"psubw %%mm2,%%mm7\n\t"
|
||||||
|
"movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
|
||||||
"psraw $2,%%mm1\n\t"
|
"psraw $2,%%mm1\n\t"
|
||||||
"movq %%mm7,0x30(%[y])\n\t"
|
"psubw %%mm2,%%mm4\n\t"
|
||||||
|
"movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
|
||||||
|
"psraw $2,%%mm7\n\t"
|
||||||
|
"movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
|
||||||
|
"psraw $2,%%mm4\n\t"
|
||||||
|
"movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
|
||||||
|
/*Load the next block.*/
|
||||||
|
"movq 0x40(%[y]),%%mm0\n\t"
|
||||||
"movq 0x78(%[y]),%%mm7\n\t"
|
"movq 0x78(%[y]),%%mm7\n\t"
|
||||||
"movq %%mm1,0x08(%[y])\n\t"
|
|
||||||
"movq 0x50(%[y]),%%mm1\n\t"
|
"movq 0x50(%[y]),%%mm1\n\t"
|
||||||
"movq %%mm6,0x20(%[y])\n\t"
|
|
||||||
"movq 0x68(%[y]),%%mm6\n\t"
|
"movq 0x68(%[y]),%%mm6\n\t"
|
||||||
"movq %%mm2,0x28(%[y])\n\t"
|
|
||||||
"movq 0x60(%[y]),%%mm2\n\t"
|
"movq 0x60(%[y]),%%mm2\n\t"
|
||||||
"movq %%mm5,0x10(%[y])\n\t"
|
|
||||||
"movq 0x58(%[y]),%%mm5\n\t"
|
"movq 0x58(%[y]),%%mm5\n\t"
|
||||||
"movq %%mm3,0x38(%[y])\n\t"
|
|
||||||
"movq 0x70(%[y]),%%mm3\n\t"
|
"movq 0x70(%[y]),%%mm3\n\t"
|
||||||
"movq %%mm4,0x00(%[y])\n\t"
|
|
||||||
"movq 0x48(%[y]),%%mm4\n\t"
|
"movq 0x48(%[y]),%%mm4\n\t"
|
||||||
OC_FDCT_STAGE1_8x4
|
OC_FDCT_STAGE1_8x4
|
||||||
OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
|
OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
|
||||||
OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
|
/*mm2={-2}x4*/
|
||||||
/*mm0={-2}x4*/
|
"pcmpeqw %%mm2,%%mm2\n\t"
|
||||||
"pcmpeqw %%mm0,%%mm0\n\t"
|
"paddw %%mm2,%%mm2\n\t"
|
||||||
"paddw %%mm0,%%mm0\n\t"
|
/*Round and store the results (no transpose).*/
|
||||||
/*Round the results.*/
|
"movq 0x50(%[y]),%%mm7\n\t"
|
||||||
"psubw %%mm0,%%mm1\n\t"
|
"psubw %%mm2,%%mm4\n\t"
|
||||||
"psubw %%mm0,%%mm2\n\t"
|
"psubw %%mm2,%%mm6\n\t"
|
||||||
"psraw $2,%%mm1\n\t"
|
|
||||||
"psubw %%mm0,%%mm3\n\t"
|
|
||||||
"movq %%mm1,0x58(%[y])\n\t"
|
|
||||||
"psraw $2,%%mm2\n\t"
|
|
||||||
"psubw %%mm0,%%mm4\n\t"
|
|
||||||
"movq 0x48(%[y]),%%mm1\n\t"
|
|
||||||
"psraw $2,%%mm3\n\t"
|
|
||||||
"psubw %%mm0,%%mm5\n\t"
|
|
||||||
"movq %%mm2,0x68(%[y])\n\t"
|
|
||||||
"psraw $2,%%mm4\n\t"
|
"psraw $2,%%mm4\n\t"
|
||||||
"psubw %%mm0,%%mm6\n\t"
|
"psubw %%mm2,%%mm0\n\t"
|
||||||
"movq %%mm3,0x78(%[y])\n\t"
|
"movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
|
||||||
"psraw $2,%%mm5\n\t"
|
"movq 0x70(%[y]),%%mm4\n\t"
|
||||||
"psubw %%mm0,%%mm7\n\t"
|
|
||||||
"movq %%mm4,0x40(%[y])\n\t"
|
|
||||||
"psraw $2,%%mm6\n\t"
|
"psraw $2,%%mm6\n\t"
|
||||||
"psubw %%mm0,%%mm1\n\t"
|
"psubw %%mm2,%%mm5\n\t"
|
||||||
"movq %%mm5,0x50(%[y])\n\t"
|
"movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
|
||||||
"psraw $2,%%mm7\n\t"
|
"psraw $2,%%mm0\n\t"
|
||||||
"movq %%mm6,0x60(%[y])\n\t"
|
"psubw %%mm2,%%mm3\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
|
||||||
|
"psraw $2,%%mm5\n\t"
|
||||||
|
"psubw %%mm2,%%mm1\n\t"
|
||||||
|
"movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
|
||||||
|
"psraw $2,%%mm3\n\t"
|
||||||
|
"psubw %%mm2,%%mm7\n\t"
|
||||||
|
"movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
|
||||||
"psraw $2,%%mm1\n\t"
|
"psraw $2,%%mm1\n\t"
|
||||||
"movq %%mm7,0x70(%[y])\n\t"
|
"psubw %%mm2,%%mm4\n\t"
|
||||||
"movq %%mm1,0x48(%[y])\n\t"
|
"movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
|
||||||
:[a]"=&r"(a)
|
"psraw $2,%%mm7\n\t"
|
||||||
|
"movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
|
||||||
|
"psraw $2,%%mm4\n\t"
|
||||||
|
"movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
|
||||||
|
/*Final transpose and zig-zag.*/
|
||||||
|
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
|
||||||
|
"movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \
|
||||||
|
|
||||||
|
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
|
||||||
|
"movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \
|
||||||
|
|
||||||
|
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
|
||||||
|
#undef OC_ZZ_LOAD_ROW_LO
|
||||||
|
#undef OC_ZZ_LOAD_ROW_HI
|
||||||
|
:[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
|
||||||
:[y]"r"(_y),[x]"r"(_x)
|
:[y]"r"(_y),[x]"r"(_x)
|
||||||
:"memory"
|
:"memory"
|
||||||
);
|
);
|
||||||
|
81
thirdparty/libtheora/x86/mmxfrag.c
vendored
81
thirdparty/libtheora/x86/mmxfrag.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -22,10 +22,64 @@
|
|||||||
The iteration each instruction belongs to is marked in the comments as #i.*/
|
The iteration each instruction belongs to is marked in the comments as #i.*/
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include "x86int.h"
|
#include "x86int.h"
|
||||||
#include "mmxfrag.h"
|
|
||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
|
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||||
|
between rows.*/
|
||||||
|
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
|
||||||
|
do{ \
|
||||||
|
const unsigned char *src; \
|
||||||
|
unsigned char *dst; \
|
||||||
|
ptrdiff_t ystride3; \
|
||||||
|
src=(_src); \
|
||||||
|
dst=(_dst); \
|
||||||
|
__asm__ __volatile__( \
|
||||||
|
/*src+0*ystride*/ \
|
||||||
|
"movq (%[src]),%%mm0\n\t" \
|
||||||
|
/*src+1*ystride*/ \
|
||||||
|
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
||||||
|
/*ystride3=ystride*3*/ \
|
||||||
|
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
|
||||||
|
/*src+2*ystride*/ \
|
||||||
|
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
||||||
|
/*src+3*ystride*/ \
|
||||||
|
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
||||||
|
/*dst+0*ystride*/ \
|
||||||
|
"movq %%mm0,(%[dst])\n\t" \
|
||||||
|
/*dst+1*ystride*/ \
|
||||||
|
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
||||||
|
/*Pointer to next 4.*/ \
|
||||||
|
"lea (%[src],%[ystride],4),%[src]\n\t" \
|
||||||
|
/*dst+2*ystride*/ \
|
||||||
|
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
||||||
|
/*dst+3*ystride*/ \
|
||||||
|
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
||||||
|
/*Pointer to next 4.*/ \
|
||||||
|
"lea (%[dst],%[ystride],4),%[dst]\n\t" \
|
||||||
|
/*src+0*ystride*/ \
|
||||||
|
"movq (%[src]),%%mm0\n\t" \
|
||||||
|
/*src+1*ystride*/ \
|
||||||
|
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
||||||
|
/*src+2*ystride*/ \
|
||||||
|
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
||||||
|
/*src+3*ystride*/ \
|
||||||
|
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
||||||
|
/*dst+0*ystride*/ \
|
||||||
|
"movq %%mm0,(%[dst])\n\t" \
|
||||||
|
/*dst+1*ystride*/ \
|
||||||
|
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
||||||
|
/*dst+2*ystride*/ \
|
||||||
|
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
||||||
|
/*dst+3*ystride*/ \
|
||||||
|
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
||||||
|
:[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
|
||||||
|
:[ystride]"r"((ptrdiff_t)(_ystride)) \
|
||||||
|
:"memory" \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
while(0)
|
||||||
|
|
||||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||||
between rows.*/
|
between rows.*/
|
||||||
void oc_frag_copy_mmx(unsigned char *_dst,
|
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||||
@ -33,6 +87,27 @@ void oc_frag_copy_mmx(unsigned char *_dst,
|
|||||||
OC_FRAG_COPY_MMX(_dst,_src,_ystride);
|
OC_FRAG_COPY_MMX(_dst,_src,_ystride);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*Copies the fragments specified by the lists of fragment indices from one
|
||||||
|
frame to another.
|
||||||
|
_dst_frame: The reference frame to copy to.
|
||||||
|
_src_frame: The reference frame to copy from.
|
||||||
|
_ystride: The row stride of the reference frames.
|
||||||
|
_fragis: A pointer to a list of fragment indices.
|
||||||
|
_nfragis: The number of fragment indices to copy.
|
||||||
|
_frag_buf_offs: The offsets of fragments in the reference frames.*/
|
||||||
|
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
|
||||||
|
const unsigned char *_src_frame,int _ystride,
|
||||||
|
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
|
||||||
|
ptrdiff_t fragii;
|
||||||
|
for(fragii=0;fragii<_nfragis;fragii++){
|
||||||
|
ptrdiff_t frag_buf_off;
|
||||||
|
frag_buf_off=_frag_buf_offs[_fragis[fragii]];
|
||||||
|
OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
|
||||||
|
_src_frame+frag_buf_off,_ystride);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||||
const ogg_int16_t *_residue){
|
const ogg_int16_t *_residue){
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
@ -280,7 +355,7 @@ void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
|||||||
/*Advance dest ptr.*/
|
/*Advance dest ptr.*/
|
||||||
"lea (%[dst],%[ystride],2),%[dst]\n\t"
|
"lea (%[dst],%[ystride],2),%[dst]\n\t"
|
||||||
:[dst]"+r"(_dst),[residue]"+r"(_residue),
|
:[dst]"+r"(_dst),[residue]"+r"(_residue),
|
||||||
[src1]"+%r"(_src1),[src2]"+r"(_src2)
|
[src1]"+r"(_src1),[src2]"+r"(_src2)
|
||||||
:[ystride]"r"((ptrdiff_t)_ystride)
|
:[ystride]"r"((ptrdiff_t)_ystride)
|
||||||
:"memory"
|
:"memory"
|
||||||
);
|
);
|
||||||
|
64
thirdparty/libtheora/x86/mmxfrag.h
vendored
64
thirdparty/libtheora/x86/mmxfrag.h
vendored
@ -1,64 +0,0 @@
|
|||||||
#if !defined(_x86_mmxfrag_H)
|
|
||||||
# define _x86_mmxfrag_H (1)
|
|
||||||
# include <stddef.h>
|
|
||||||
# include "x86int.h"
|
|
||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
|
||||||
|
|
||||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
|
||||||
between rows.*/
|
|
||||||
#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
|
|
||||||
do{ \
|
|
||||||
const unsigned char *src; \
|
|
||||||
unsigned char *dst; \
|
|
||||||
ptrdiff_t ystride3; \
|
|
||||||
src=(_src); \
|
|
||||||
dst=(_dst); \
|
|
||||||
__asm__ __volatile__( \
|
|
||||||
/*src+0*ystride*/ \
|
|
||||||
"movq (%[src]),%%mm0\n\t" \
|
|
||||||
/*src+1*ystride*/ \
|
|
||||||
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
|
||||||
/*ystride3=ystride*3*/ \
|
|
||||||
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
|
|
||||||
/*src+2*ystride*/ \
|
|
||||||
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
|
||||||
/*src+3*ystride*/ \
|
|
||||||
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
|
||||||
/*dst+0*ystride*/ \
|
|
||||||
"movq %%mm0,(%[dst])\n\t" \
|
|
||||||
/*dst+1*ystride*/ \
|
|
||||||
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
|
||||||
/*Pointer to next 4.*/ \
|
|
||||||
"lea (%[src],%[ystride],4),%[src]\n\t" \
|
|
||||||
/*dst+2*ystride*/ \
|
|
||||||
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
|
||||||
/*dst+3*ystride*/ \
|
|
||||||
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
|
||||||
/*Pointer to next 4.*/ \
|
|
||||||
"lea (%[dst],%[ystride],4),%[dst]\n\t" \
|
|
||||||
/*src+0*ystride*/ \
|
|
||||||
"movq (%[src]),%%mm0\n\t" \
|
|
||||||
/*src+1*ystride*/ \
|
|
||||||
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
|
||||||
/*src+2*ystride*/ \
|
|
||||||
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
|
||||||
/*src+3*ystride*/ \
|
|
||||||
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
|
||||||
/*dst+0*ystride*/ \
|
|
||||||
"movq %%mm0,(%[dst])\n\t" \
|
|
||||||
/*dst+1*ystride*/ \
|
|
||||||
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
|
||||||
/*dst+2*ystride*/ \
|
|
||||||
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
|
||||||
/*dst+3*ystride*/ \
|
|
||||||
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
|
||||||
:[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
|
|
||||||
:[ystride]"r"((ptrdiff_t)(_ystride)) \
|
|
||||||
:"memory" \
|
|
||||||
); \
|
|
||||||
} \
|
|
||||||
while(0)
|
|
||||||
|
|
||||||
# endif
|
|
||||||
#endif
|
|
292
thirdparty/libtheora/x86/mmxidct.c
vendored
292
thirdparty/libtheora/x86/mmxidct.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -30,89 +30,66 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*A table of constants used by the MMX routines.*/
|
|
||||||
static const ogg_uint16_t __attribute__((aligned(8),used))
|
|
||||||
OC_IDCT_CONSTS[(7+1)*4]={
|
|
||||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
|
||||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
|
||||||
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
|
|
||||||
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
|
|
||||||
(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
|
|
||||||
(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
|
|
||||||
(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
|
|
||||||
(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
|
|
||||||
(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
|
|
||||||
(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
|
|
||||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
|
||||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
|
||||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
|
||||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
|
||||||
8, 8, 8, 8
|
|
||||||
};
|
|
||||||
|
|
||||||
/*Converts the expression in the argument to a string.*/
|
|
||||||
#define OC_M2STR(_s) #_s
|
|
||||||
|
|
||||||
/*38 cycles*/
|
/*38 cycles*/
|
||||||
#define OC_IDCT_BEGIN \
|
#define OC_IDCT_BEGIN(_y,_x) \
|
||||||
"#OC_IDCT_BEGIN\n\t" \
|
"#OC_IDCT_BEGIN\n\t" \
|
||||||
"movq "OC_I(3)",%%mm2\n\t" \
|
"movq "OC_I(3,_x)",%%mm2\n\t" \
|
||||||
"movq "OC_C(3)",%%mm6\n\t" \
|
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
|
||||||
"movq %%mm2,%%mm4\n\t" \
|
"movq %%mm2,%%mm4\n\t" \
|
||||||
"movq "OC_J(5)",%%mm7\n\t" \
|
"movq "OC_J(5,_x)",%%mm7\n\t" \
|
||||||
"pmulhw %%mm6,%%mm4\n\t" \
|
"pmulhw %%mm6,%%mm4\n\t" \
|
||||||
"movq "OC_C(5)",%%mm1\n\t" \
|
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
|
||||||
"pmulhw %%mm7,%%mm6\n\t" \
|
"pmulhw %%mm7,%%mm6\n\t" \
|
||||||
"movq %%mm1,%%mm5\n\t" \
|
"movq %%mm1,%%mm5\n\t" \
|
||||||
"pmulhw %%mm2,%%mm1\n\t" \
|
"pmulhw %%mm2,%%mm1\n\t" \
|
||||||
"movq "OC_I(1)",%%mm3\n\t" \
|
"movq "OC_I(1,_x)",%%mm3\n\t" \
|
||||||
"pmulhw %%mm7,%%mm5\n\t" \
|
"pmulhw %%mm7,%%mm5\n\t" \
|
||||||
"movq "OC_C(1)",%%mm0\n\t" \
|
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
|
||||||
"paddw %%mm2,%%mm4\n\t" \
|
"paddw %%mm2,%%mm4\n\t" \
|
||||||
"paddw %%mm7,%%mm6\n\t" \
|
"paddw %%mm7,%%mm6\n\t" \
|
||||||
"paddw %%mm1,%%mm2\n\t" \
|
"paddw %%mm1,%%mm2\n\t" \
|
||||||
"movq "OC_J(7)",%%mm1\n\t" \
|
"movq "OC_J(7,_x)",%%mm1\n\t" \
|
||||||
"paddw %%mm5,%%mm7\n\t" \
|
"paddw %%mm5,%%mm7\n\t" \
|
||||||
"movq %%mm0,%%mm5\n\t" \
|
"movq %%mm0,%%mm5\n\t" \
|
||||||
"pmulhw %%mm3,%%mm0\n\t" \
|
"pmulhw %%mm3,%%mm0\n\t" \
|
||||||
"paddw %%mm7,%%mm4\n\t" \
|
"paddw %%mm7,%%mm4\n\t" \
|
||||||
"pmulhw %%mm1,%%mm5\n\t" \
|
"pmulhw %%mm1,%%mm5\n\t" \
|
||||||
"movq "OC_C(7)",%%mm7\n\t" \
|
"movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
|
||||||
"psubw %%mm2,%%mm6\n\t" \
|
"psubw %%mm2,%%mm6\n\t" \
|
||||||
"paddw %%mm3,%%mm0\n\t" \
|
"paddw %%mm3,%%mm0\n\t" \
|
||||||
"pmulhw %%mm7,%%mm3\n\t" \
|
"pmulhw %%mm7,%%mm3\n\t" \
|
||||||
"movq "OC_I(2)",%%mm2\n\t" \
|
"movq "OC_I(2,_x)",%%mm2\n\t" \
|
||||||
"pmulhw %%mm1,%%mm7\n\t" \
|
"pmulhw %%mm1,%%mm7\n\t" \
|
||||||
"paddw %%mm1,%%mm5\n\t" \
|
"paddw %%mm1,%%mm5\n\t" \
|
||||||
"movq %%mm2,%%mm1\n\t" \
|
"movq %%mm2,%%mm1\n\t" \
|
||||||
"pmulhw "OC_C(2)",%%mm2\n\t" \
|
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
|
||||||
"psubw %%mm5,%%mm3\n\t" \
|
"psubw %%mm5,%%mm3\n\t" \
|
||||||
"movq "OC_J(6)",%%mm5\n\t" \
|
"movq "OC_J(6,_x)",%%mm5\n\t" \
|
||||||
"paddw %%mm7,%%mm0\n\t" \
|
"paddw %%mm7,%%mm0\n\t" \
|
||||||
"movq %%mm5,%%mm7\n\t" \
|
"movq %%mm5,%%mm7\n\t" \
|
||||||
"psubw %%mm4,%%mm0\n\t" \
|
"psubw %%mm4,%%mm0\n\t" \
|
||||||
"pmulhw "OC_C(2)",%%mm5\n\t" \
|
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
|
||||||
"paddw %%mm1,%%mm2\n\t" \
|
"paddw %%mm1,%%mm2\n\t" \
|
||||||
"pmulhw "OC_C(6)",%%mm1\n\t" \
|
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
|
||||||
"paddw %%mm4,%%mm4\n\t" \
|
"paddw %%mm4,%%mm4\n\t" \
|
||||||
"paddw %%mm0,%%mm4\n\t" \
|
"paddw %%mm0,%%mm4\n\t" \
|
||||||
"psubw %%mm6,%%mm3\n\t" \
|
"psubw %%mm6,%%mm3\n\t" \
|
||||||
"paddw %%mm7,%%mm5\n\t" \
|
"paddw %%mm7,%%mm5\n\t" \
|
||||||
"paddw %%mm6,%%mm6\n\t" \
|
"paddw %%mm6,%%mm6\n\t" \
|
||||||
"pmulhw "OC_C(6)",%%mm7\n\t" \
|
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
|
||||||
"paddw %%mm3,%%mm6\n\t" \
|
"paddw %%mm3,%%mm6\n\t" \
|
||||||
"movq %%mm4,"OC_I(1)"\n\t" \
|
"movq %%mm4,"OC_I(1,_y)"\n\t" \
|
||||||
"psubw %%mm5,%%mm1\n\t" \
|
"psubw %%mm5,%%mm1\n\t" \
|
||||||
"movq "OC_C(4)",%%mm4\n\t" \
|
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
|
||||||
"movq %%mm3,%%mm5\n\t" \
|
"movq %%mm3,%%mm5\n\t" \
|
||||||
"pmulhw %%mm4,%%mm3\n\t" \
|
"pmulhw %%mm4,%%mm3\n\t" \
|
||||||
"paddw %%mm2,%%mm7\n\t" \
|
"paddw %%mm2,%%mm7\n\t" \
|
||||||
"movq %%mm6,"OC_I(2)"\n\t" \
|
"movq %%mm6,"OC_I(2,_y)"\n\t" \
|
||||||
"movq %%mm0,%%mm2\n\t" \
|
"movq %%mm0,%%mm2\n\t" \
|
||||||
"movq "OC_I(0)",%%mm6\n\t" \
|
"movq "OC_I(0,_x)",%%mm6\n\t" \
|
||||||
"pmulhw %%mm4,%%mm0\n\t" \
|
"pmulhw %%mm4,%%mm0\n\t" \
|
||||||
"paddw %%mm3,%%mm5\n\t" \
|
"paddw %%mm3,%%mm5\n\t" \
|
||||||
"movq "OC_J(4)",%%mm3\n\t" \
|
"movq "OC_J(4,_x)",%%mm3\n\t" \
|
||||||
"psubw %%mm1,%%mm5\n\t" \
|
"psubw %%mm1,%%mm5\n\t" \
|
||||||
"paddw %%mm0,%%mm2\n\t" \
|
"paddw %%mm0,%%mm2\n\t" \
|
||||||
"psubw %%mm3,%%mm6\n\t" \
|
"psubw %%mm3,%%mm6\n\t" \
|
||||||
@ -126,18 +103,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
|||||||
"paddw %%mm0,%%mm6\n\t" \
|
"paddw %%mm0,%%mm6\n\t" \
|
||||||
"psubw %%mm2,%%mm6\n\t" \
|
"psubw %%mm2,%%mm6\n\t" \
|
||||||
"paddw %%mm2,%%mm2\n\t" \
|
"paddw %%mm2,%%mm2\n\t" \
|
||||||
"movq "OC_I(1)",%%mm0\n\t" \
|
"movq "OC_I(1,_y)",%%mm0\n\t" \
|
||||||
"paddw %%mm6,%%mm2\n\t" \
|
"paddw %%mm6,%%mm2\n\t" \
|
||||||
"paddw %%mm3,%%mm4\n\t" \
|
"paddw %%mm3,%%mm4\n\t" \
|
||||||
"psubw %%mm1,%%mm2\n\t" \
|
"psubw %%mm1,%%mm2\n\t" \
|
||||||
"#end OC_IDCT_BEGIN\n\t" \
|
"#end OC_IDCT_BEGIN\n\t" \
|
||||||
|
|
||||||
/*38+8=46 cycles.*/
|
/*38+8=46 cycles.*/
|
||||||
#define OC_ROW_IDCT \
|
#define OC_ROW_IDCT(_y,_x) \
|
||||||
"#OC_ROW_IDCT\n" \
|
"#OC_ROW_IDCT\n" \
|
||||||
OC_IDCT_BEGIN \
|
OC_IDCT_BEGIN(_y,_x) \
|
||||||
/*r3=D'*/ \
|
/*r3=D'*/ \
|
||||||
"movq "OC_I(2)",%%mm3\n\t" \
|
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||||
/*r4=E'=E-G*/ \
|
/*r4=E'=E-G*/ \
|
||||||
"psubw %%mm7,%%mm4\n\t" \
|
"psubw %%mm7,%%mm4\n\t" \
|
||||||
/*r1=H'+H'*/ \
|
/*r1=H'+H'*/ \
|
||||||
@ -162,7 +139,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
|||||||
"psubw %%mm0,%%mm7\n\t" \
|
"psubw %%mm0,%%mm7\n\t" \
|
||||||
"paddw %%mm0,%%mm0\n\t" \
|
"paddw %%mm0,%%mm0\n\t" \
|
||||||
/*Save R1.*/ \
|
/*Save R1.*/ \
|
||||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||||
/*r0=R0=G.+C.*/ \
|
/*r0=R0=G.+C.*/ \
|
||||||
"paddw %%mm7,%%mm0\n\t" \
|
"paddw %%mm7,%%mm0\n\t" \
|
||||||
"#end OC_ROW_IDCT\n\t" \
|
"#end OC_ROW_IDCT\n\t" \
|
||||||
@ -195,11 +172,11 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
|||||||
|
|
||||||
Since r1 is free at entry, we calculate the Js first.*/
|
Since r1 is free at entry, we calculate the Js first.*/
|
||||||
/*19 cycles.*/
|
/*19 cycles.*/
|
||||||
#define OC_TRANSPOSE \
|
#define OC_TRANSPOSE(_y) \
|
||||||
"#OC_TRANSPOSE\n\t" \
|
"#OC_TRANSPOSE\n\t" \
|
||||||
"movq %%mm4,%%mm1\n\t" \
|
"movq %%mm4,%%mm1\n\t" \
|
||||||
"punpcklwd %%mm5,%%mm4\n\t" \
|
"punpcklwd %%mm5,%%mm4\n\t" \
|
||||||
"movq %%mm0,"OC_I(0)"\n\t" \
|
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||||
"punpckhwd %%mm5,%%mm1\n\t" \
|
"punpckhwd %%mm5,%%mm1\n\t" \
|
||||||
"movq %%mm6,%%mm0\n\t" \
|
"movq %%mm6,%%mm0\n\t" \
|
||||||
"punpcklwd %%mm7,%%mm6\n\t" \
|
"punpcklwd %%mm7,%%mm6\n\t" \
|
||||||
@ -207,17 +184,17 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
|||||||
"punpckldq %%mm6,%%mm4\n\t" \
|
"punpckldq %%mm6,%%mm4\n\t" \
|
||||||
"punpckhdq %%mm6,%%mm5\n\t" \
|
"punpckhdq %%mm6,%%mm5\n\t" \
|
||||||
"movq %%mm1,%%mm6\n\t" \
|
"movq %%mm1,%%mm6\n\t" \
|
||||||
"movq %%mm4,"OC_J(4)"\n\t" \
|
"movq %%mm4,"OC_J(4,_y)"\n\t" \
|
||||||
"punpckhwd %%mm7,%%mm0\n\t" \
|
"punpckhwd %%mm7,%%mm0\n\t" \
|
||||||
"movq %%mm5,"OC_J(5)"\n\t" \
|
"movq %%mm5,"OC_J(5,_y)"\n\t" \
|
||||||
"punpckhdq %%mm0,%%mm6\n\t" \
|
"punpckhdq %%mm0,%%mm6\n\t" \
|
||||||
"movq "OC_I(0)",%%mm4\n\t" \
|
"movq "OC_I(0,_y)",%%mm4\n\t" \
|
||||||
"punpckldq %%mm0,%%mm1\n\t" \
|
"punpckldq %%mm0,%%mm1\n\t" \
|
||||||
"movq "OC_I(1)",%%mm5\n\t" \
|
"movq "OC_I(1,_y)",%%mm5\n\t" \
|
||||||
"movq %%mm4,%%mm0\n\t" \
|
"movq %%mm4,%%mm0\n\t" \
|
||||||
"movq %%mm6,"OC_J(7)"\n\t" \
|
"movq %%mm6,"OC_J(7,_y)"\n\t" \
|
||||||
"punpcklwd %%mm5,%%mm0\n\t" \
|
"punpcklwd %%mm5,%%mm0\n\t" \
|
||||||
"movq %%mm1,"OC_J(6)"\n\t" \
|
"movq %%mm1,"OC_J(6,_y)"\n\t" \
|
||||||
"punpckhwd %%mm5,%%mm4\n\t" \
|
"punpckhwd %%mm5,%%mm4\n\t" \
|
||||||
"movq %%mm2,%%mm5\n\t" \
|
"movq %%mm2,%%mm5\n\t" \
|
||||||
"punpcklwd %%mm3,%%mm2\n\t" \
|
"punpcklwd %%mm3,%%mm2\n\t" \
|
||||||
@ -225,20 +202,20 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
|||||||
"punpckldq %%mm2,%%mm0\n\t" \
|
"punpckldq %%mm2,%%mm0\n\t" \
|
||||||
"punpckhdq %%mm2,%%mm1\n\t" \
|
"punpckhdq %%mm2,%%mm1\n\t" \
|
||||||
"movq %%mm4,%%mm2\n\t" \
|
"movq %%mm4,%%mm2\n\t" \
|
||||||
"movq %%mm0,"OC_I(0)"\n\t" \
|
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||||
"punpckhwd %%mm3,%%mm5\n\t" \
|
"punpckhwd %%mm3,%%mm5\n\t" \
|
||||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||||
"punpckhdq %%mm5,%%mm4\n\t" \
|
"punpckhdq %%mm5,%%mm4\n\t" \
|
||||||
"punpckldq %%mm5,%%mm2\n\t" \
|
"punpckldq %%mm5,%%mm2\n\t" \
|
||||||
"movq %%mm4,"OC_I(3)"\n\t" \
|
"movq %%mm4,"OC_I(3,_y)"\n\t" \
|
||||||
"movq %%mm2,"OC_I(2)"\n\t" \
|
"movq %%mm2,"OC_I(2,_y)"\n\t" \
|
||||||
"#end OC_TRANSPOSE\n\t" \
|
"#end OC_TRANSPOSE\n\t" \
|
||||||
|
|
||||||
/*38+19=57 cycles.*/
|
/*38+19=57 cycles.*/
|
||||||
#define OC_COLUMN_IDCT \
|
#define OC_COLUMN_IDCT(_y) \
|
||||||
"#OC_COLUMN_IDCT\n" \
|
"#OC_COLUMN_IDCT\n" \
|
||||||
OC_IDCT_BEGIN \
|
OC_IDCT_BEGIN(_y,_y) \
|
||||||
"paddw "OC_8",%%mm2\n\t" \
|
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
|
||||||
/*r1=H'+H'*/ \
|
/*r1=H'+H'*/ \
|
||||||
"paddw %%mm1,%%mm1\n\t" \
|
"paddw %%mm1,%%mm1\n\t" \
|
||||||
/*r1=R1=A''+H'*/ \
|
/*r1=R1=A''+H'*/ \
|
||||||
@ -250,18 +227,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
|||||||
/*r1=NR1*/ \
|
/*r1=NR1*/ \
|
||||||
"psraw $4,%%mm1\n\t" \
|
"psraw $4,%%mm1\n\t" \
|
||||||
/*r3=D'*/ \
|
/*r3=D'*/ \
|
||||||
"movq "OC_I(2)",%%mm3\n\t" \
|
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||||
/*r7=G+G*/ \
|
/*r7=G+G*/ \
|
||||||
"paddw %%mm7,%%mm7\n\t" \
|
"paddw %%mm7,%%mm7\n\t" \
|
||||||
/*Store NR2 at I(2).*/ \
|
/*Store NR2 at I(2).*/ \
|
||||||
"movq %%mm2,"OC_I(2)"\n\t" \
|
"movq %%mm2,"OC_I(2,_y)"\n\t" \
|
||||||
/*r7=G'=E+G*/ \
|
/*r7=G'=E+G*/ \
|
||||||
"paddw %%mm4,%%mm7\n\t" \
|
"paddw %%mm4,%%mm7\n\t" \
|
||||||
/*Store NR1 at I(1).*/ \
|
/*Store NR1 at I(1).*/ \
|
||||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||||
/*r4=R4=E'-D'*/ \
|
/*r4=R4=E'-D'*/ \
|
||||||
"psubw %%mm3,%%mm4\n\t" \
|
"psubw %%mm3,%%mm4\n\t" \
|
||||||
"paddw "OC_8",%%mm4\n\t" \
|
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
|
||||||
/*r3=D'+D'*/ \
|
/*r3=D'+D'*/ \
|
||||||
"paddw %%mm3,%%mm3\n\t" \
|
"paddw %%mm3,%%mm3\n\t" \
|
||||||
/*r3=R3=E'+D'*/ \
|
/*r3=R3=E'+D'*/ \
|
||||||
@ -272,7 +249,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
|||||||
"psubw %%mm5,%%mm6\n\t" \
|
"psubw %%mm5,%%mm6\n\t" \
|
||||||
/*r3=NR3*/ \
|
/*r3=NR3*/ \
|
||||||
"psraw $4,%%mm3\n\t" \
|
"psraw $4,%%mm3\n\t" \
|
||||||
"paddw "OC_8",%%mm6\n\t" \
|
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
|
||||||
/*r5=B''+B''*/ \
|
/*r5=B''+B''*/ \
|
||||||
"paddw %%mm5,%%mm5\n\t" \
|
"paddw %%mm5,%%mm5\n\t" \
|
||||||
/*r5=R5=F'+B''*/ \
|
/*r5=R5=F'+B''*/ \
|
||||||
@ -280,14 +257,14 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
|||||||
/*r6=NR6*/ \
|
/*r6=NR6*/ \
|
||||||
"psraw $4,%%mm6\n\t" \
|
"psraw $4,%%mm6\n\t" \
|
||||||
/*Store NR4 at J(4).*/ \
|
/*Store NR4 at J(4).*/ \
|
||||||
"movq %%mm4,"OC_J(4)"\n\t" \
|
"movq %%mm4,"OC_J(4,_y)"\n\t" \
|
||||||
/*r5=NR5*/ \
|
/*r5=NR5*/ \
|
||||||
"psraw $4,%%mm5\n\t" \
|
"psraw $4,%%mm5\n\t" \
|
||||||
/*Store NR3 at I(3).*/ \
|
/*Store NR3 at I(3).*/ \
|
||||||
"movq %%mm3,"OC_I(3)"\n\t" \
|
"movq %%mm3,"OC_I(3,_y)"\n\t" \
|
||||||
/*r7=R7=G'-C'*/ \
|
/*r7=R7=G'-C'*/ \
|
||||||
"psubw %%mm0,%%mm7\n\t" \
|
"psubw %%mm0,%%mm7\n\t" \
|
||||||
"paddw "OC_8",%%mm7\n\t" \
|
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
|
||||||
/*r0=C'+C'*/ \
|
/*r0=C'+C'*/ \
|
||||||
"paddw %%mm0,%%mm0\n\t" \
|
"paddw %%mm0,%%mm0\n\t" \
|
||||||
/*r0=R0=G'+C'*/ \
|
/*r0=R0=G'+C'*/ \
|
||||||
@ -295,113 +272,121 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
|
|||||||
/*r7=NR7*/ \
|
/*r7=NR7*/ \
|
||||||
"psraw $4,%%mm7\n\t" \
|
"psraw $4,%%mm7\n\t" \
|
||||||
/*Store NR6 at J(6).*/ \
|
/*Store NR6 at J(6).*/ \
|
||||||
"movq %%mm6,"OC_J(6)"\n\t" \
|
"movq %%mm6,"OC_J(6,_y)"\n\t" \
|
||||||
/*r0=NR0*/ \
|
/*r0=NR0*/ \
|
||||||
"psraw $4,%%mm0\n\t" \
|
"psraw $4,%%mm0\n\t" \
|
||||||
/*Store NR5 at J(5).*/ \
|
/*Store NR5 at J(5).*/ \
|
||||||
"movq %%mm5,"OC_J(5)"\n\t" \
|
"movq %%mm5,"OC_J(5,_y)"\n\t" \
|
||||||
/*Store NR7 at J(7).*/ \
|
/*Store NR7 at J(7).*/ \
|
||||||
"movq %%mm7,"OC_J(7)"\n\t" \
|
"movq %%mm7,"OC_J(7,_y)"\n\t" \
|
||||||
/*Store NR0 at I(0).*/ \
|
/*Store NR0 at I(0).*/ \
|
||||||
"movq %%mm0,"OC_I(0)"\n\t" \
|
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||||
"#end OC_COLUMN_IDCT\n\t" \
|
"#end OC_COLUMN_IDCT\n\t" \
|
||||||
|
|
||||||
#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
|
static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||||
#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
|
int i;
|
||||||
#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
|
|
||||||
|
|
||||||
static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|
||||||
/*This routine accepts an 8x8 matrix, but in partially transposed form.
|
/*This routine accepts an 8x8 matrix, but in partially transposed form.
|
||||||
Every 4x4 block is transposed.*/
|
Every 4x4 block is transposed.*/
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
|
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||||
#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
|
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
|
||||||
OC_ROW_IDCT
|
OC_ROW_IDCT(y,x)
|
||||||
OC_TRANSPOSE
|
OC_TRANSPOSE(y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) OC_M2STR((_k*16)+64)"(%[y])"
|
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y)
|
||||||
#define OC_J(_k) OC_M2STR(((_k-4)*16)+72)"(%[y])"
|
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y)
|
||||||
OC_ROW_IDCT
|
OC_ROW_IDCT(y,x)
|
||||||
OC_TRANSPOSE
|
OC_TRANSPOSE(y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
|
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||||
#define OC_J(_k) OC_I(_k)
|
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||||
OC_COLUMN_IDCT
|
OC_COLUMN_IDCT(y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
|
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
|
||||||
#define OC_J(_k) OC_I(_k)
|
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||||
OC_COLUMN_IDCT
|
OC_COLUMN_IDCT(y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
:
|
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
|
||||||
:[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
|
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
|
||||||
|
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
|
||||||
);
|
);
|
||||||
|
__asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
|
||||||
|
for(i=0;i<4;i++){
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
|
||||||
|
:[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*25 cycles.*/
|
/*25 cycles.*/
|
||||||
#define OC_IDCT_BEGIN_10 \
|
#define OC_IDCT_BEGIN_10(_y,_x) \
|
||||||
"#OC_IDCT_BEGIN_10\n\t" \
|
"#OC_IDCT_BEGIN_10\n\t" \
|
||||||
"movq "OC_I(3)",%%mm2\n\t" \
|
"movq "OC_I(3,_x)",%%mm2\n\t" \
|
||||||
"nop\n\t" \
|
"nop\n\t" \
|
||||||
"movq "OC_C(3)",%%mm6\n\t" \
|
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
|
||||||
"movq %%mm2,%%mm4\n\t" \
|
"movq %%mm2,%%mm4\n\t" \
|
||||||
"movq "OC_C(5)",%%mm1\n\t" \
|
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
|
||||||
"pmulhw %%mm6,%%mm4\n\t" \
|
"pmulhw %%mm6,%%mm4\n\t" \
|
||||||
"movq "OC_I(1)",%%mm3\n\t" \
|
"movq "OC_I(1,_x)",%%mm3\n\t" \
|
||||||
"pmulhw %%mm2,%%mm1\n\t" \
|
"pmulhw %%mm2,%%mm1\n\t" \
|
||||||
"movq "OC_C(1)",%%mm0\n\t" \
|
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
|
||||||
"paddw %%mm2,%%mm4\n\t" \
|
"paddw %%mm2,%%mm4\n\t" \
|
||||||
"pxor %%mm6,%%mm6\n\t" \
|
"pxor %%mm6,%%mm6\n\t" \
|
||||||
"paddw %%mm1,%%mm2\n\t" \
|
"paddw %%mm1,%%mm2\n\t" \
|
||||||
"movq "OC_I(2)",%%mm5\n\t" \
|
"movq "OC_I(2,_x)",%%mm5\n\t" \
|
||||||
"pmulhw %%mm3,%%mm0\n\t" \
|
"pmulhw %%mm3,%%mm0\n\t" \
|
||||||
"movq %%mm5,%%mm1\n\t" \
|
"movq %%mm5,%%mm1\n\t" \
|
||||||
"paddw %%mm3,%%mm0\n\t" \
|
"paddw %%mm3,%%mm0\n\t" \
|
||||||
"pmulhw "OC_C(7)",%%mm3\n\t" \
|
"pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
|
||||||
"psubw %%mm2,%%mm6\n\t" \
|
"psubw %%mm2,%%mm6\n\t" \
|
||||||
"pmulhw "OC_C(2)",%%mm5\n\t" \
|
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
|
||||||
"psubw %%mm4,%%mm0\n\t" \
|
"psubw %%mm4,%%mm0\n\t" \
|
||||||
"movq "OC_I(2)",%%mm7\n\t" \
|
"movq "OC_I(2,_x)",%%mm7\n\t" \
|
||||||
"paddw %%mm4,%%mm4\n\t" \
|
"paddw %%mm4,%%mm4\n\t" \
|
||||||
"paddw %%mm5,%%mm7\n\t" \
|
"paddw %%mm5,%%mm7\n\t" \
|
||||||
"paddw %%mm0,%%mm4\n\t" \
|
"paddw %%mm0,%%mm4\n\t" \
|
||||||
"pmulhw "OC_C(6)",%%mm1\n\t" \
|
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
|
||||||
"psubw %%mm6,%%mm3\n\t" \
|
"psubw %%mm6,%%mm3\n\t" \
|
||||||
"movq %%mm4,"OC_I(1)"\n\t" \
|
"movq %%mm4,"OC_I(1,_y)"\n\t" \
|
||||||
"paddw %%mm6,%%mm6\n\t" \
|
"paddw %%mm6,%%mm6\n\t" \
|
||||||
"movq "OC_C(4)",%%mm4\n\t" \
|
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
|
||||||
"paddw %%mm3,%%mm6\n\t" \
|
"paddw %%mm3,%%mm6\n\t" \
|
||||||
"movq %%mm3,%%mm5\n\t" \
|
"movq %%mm3,%%mm5\n\t" \
|
||||||
"pmulhw %%mm4,%%mm3\n\t" \
|
"pmulhw %%mm4,%%mm3\n\t" \
|
||||||
"movq %%mm6,"OC_I(2)"\n\t" \
|
"movq %%mm6,"OC_I(2,_y)"\n\t" \
|
||||||
"movq %%mm0,%%mm2\n\t" \
|
"movq %%mm0,%%mm2\n\t" \
|
||||||
"movq "OC_I(0)",%%mm6\n\t" \
|
"movq "OC_I(0,_x)",%%mm6\n\t" \
|
||||||
"pmulhw %%mm4,%%mm0\n\t" \
|
"pmulhw %%mm4,%%mm0\n\t" \
|
||||||
"paddw %%mm3,%%mm5\n\t" \
|
"paddw %%mm3,%%mm5\n\t" \
|
||||||
"paddw %%mm0,%%mm2\n\t" \
|
"paddw %%mm0,%%mm2\n\t" \
|
||||||
"psubw %%mm1,%%mm5\n\t" \
|
"psubw %%mm1,%%mm5\n\t" \
|
||||||
"pmulhw %%mm4,%%mm6\n\t" \
|
"pmulhw %%mm4,%%mm6\n\t" \
|
||||||
"paddw "OC_I(0)",%%mm6\n\t" \
|
"paddw "OC_I(0,_x)",%%mm6\n\t" \
|
||||||
"paddw %%mm1,%%mm1\n\t" \
|
"paddw %%mm1,%%mm1\n\t" \
|
||||||
"movq %%mm6,%%mm4\n\t" \
|
"movq %%mm6,%%mm4\n\t" \
|
||||||
"paddw %%mm5,%%mm1\n\t" \
|
"paddw %%mm5,%%mm1\n\t" \
|
||||||
"psubw %%mm2,%%mm6\n\t" \
|
"psubw %%mm2,%%mm6\n\t" \
|
||||||
"paddw %%mm2,%%mm2\n\t" \
|
"paddw %%mm2,%%mm2\n\t" \
|
||||||
"movq "OC_I(1)",%%mm0\n\t" \
|
"movq "OC_I(1,_y)",%%mm0\n\t" \
|
||||||
"paddw %%mm6,%%mm2\n\t" \
|
"paddw %%mm6,%%mm2\n\t" \
|
||||||
"psubw %%mm1,%%mm2\n\t" \
|
"psubw %%mm1,%%mm2\n\t" \
|
||||||
"nop\n\t" \
|
"nop\n\t" \
|
||||||
"#end OC_IDCT_BEGIN_10\n\t" \
|
"#end OC_IDCT_BEGIN_10\n\t" \
|
||||||
|
|
||||||
/*25+8=33 cycles.*/
|
/*25+8=33 cycles.*/
|
||||||
#define OC_ROW_IDCT_10 \
|
#define OC_ROW_IDCT_10(_y,_x) \
|
||||||
"#OC_ROW_IDCT_10\n\t" \
|
"#OC_ROW_IDCT_10\n\t" \
|
||||||
OC_IDCT_BEGIN_10 \
|
OC_IDCT_BEGIN_10(_y,_x) \
|
||||||
/*r3=D'*/ \
|
/*r3=D'*/ \
|
||||||
"movq "OC_I(2)",%%mm3\n\t" \
|
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||||
/*r4=E'=E-G*/ \
|
/*r4=E'=E-G*/ \
|
||||||
"psubw %%mm7,%%mm4\n\t" \
|
"psubw %%mm7,%%mm4\n\t" \
|
||||||
/*r1=H'+H'*/ \
|
/*r1=H'+H'*/ \
|
||||||
@ -426,16 +411,16 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
"psubw %%mm0,%%mm7\n\t" \
|
"psubw %%mm0,%%mm7\n\t" \
|
||||||
"paddw %%mm0,%%mm0\n\t" \
|
"paddw %%mm0,%%mm0\n\t" \
|
||||||
/*Save R1.*/ \
|
/*Save R1.*/ \
|
||||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||||
/*r0=R0=G'+C'*/ \
|
/*r0=R0=G'+C'*/ \
|
||||||
"paddw %%mm7,%%mm0\n\t" \
|
"paddw %%mm7,%%mm0\n\t" \
|
||||||
"#end OC_ROW_IDCT_10\n\t" \
|
"#end OC_ROW_IDCT_10\n\t" \
|
||||||
|
|
||||||
/*25+19=44 cycles'*/
|
/*25+19=44 cycles'*/
|
||||||
#define OC_COLUMN_IDCT_10 \
|
#define OC_COLUMN_IDCT_10(_y) \
|
||||||
"#OC_COLUMN_IDCT_10\n\t" \
|
"#OC_COLUMN_IDCT_10\n\t" \
|
||||||
OC_IDCT_BEGIN_10 \
|
OC_IDCT_BEGIN_10(_y,_y) \
|
||||||
"paddw "OC_8",%%mm2\n\t" \
|
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
|
||||||
/*r1=H'+H'*/ \
|
/*r1=H'+H'*/ \
|
||||||
"paddw %%mm1,%%mm1\n\t" \
|
"paddw %%mm1,%%mm1\n\t" \
|
||||||
/*r1=R1=A''+H'*/ \
|
/*r1=R1=A''+H'*/ \
|
||||||
@ -447,18 +432,18 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
/*r1=NR1*/ \
|
/*r1=NR1*/ \
|
||||||
"psraw $4,%%mm1\n\t" \
|
"psraw $4,%%mm1\n\t" \
|
||||||
/*r3=D'*/ \
|
/*r3=D'*/ \
|
||||||
"movq "OC_I(2)",%%mm3\n\t" \
|
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||||
/*r7=G+G*/ \
|
/*r7=G+G*/ \
|
||||||
"paddw %%mm7,%%mm7\n\t" \
|
"paddw %%mm7,%%mm7\n\t" \
|
||||||
/*Store NR2 at I(2).*/ \
|
/*Store NR2 at I(2).*/ \
|
||||||
"movq %%mm2,"OC_I(2)"\n\t" \
|
"movq %%mm2,"OC_I(2,_y)"\n\t" \
|
||||||
/*r7=G'=E+G*/ \
|
/*r7=G'=E+G*/ \
|
||||||
"paddw %%mm4,%%mm7\n\t" \
|
"paddw %%mm4,%%mm7\n\t" \
|
||||||
/*Store NR1 at I(1).*/ \
|
/*Store NR1 at I(1).*/ \
|
||||||
"movq %%mm1,"OC_I(1)"\n\t" \
|
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||||
/*r4=R4=E'-D'*/ \
|
/*r4=R4=E'-D'*/ \
|
||||||
"psubw %%mm3,%%mm4\n\t" \
|
"psubw %%mm3,%%mm4\n\t" \
|
||||||
"paddw "OC_8",%%mm4\n\t" \
|
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
|
||||||
/*r3=D'+D'*/ \
|
/*r3=D'+D'*/ \
|
||||||
"paddw %%mm3,%%mm3\n\t" \
|
"paddw %%mm3,%%mm3\n\t" \
|
||||||
/*r3=R3=E'+D'*/ \
|
/*r3=R3=E'+D'*/ \
|
||||||
@ -469,7 +454,7 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
"psubw %%mm5,%%mm6\n\t" \
|
"psubw %%mm5,%%mm6\n\t" \
|
||||||
/*r3=NR3*/ \
|
/*r3=NR3*/ \
|
||||||
"psraw $4,%%mm3\n\t" \
|
"psraw $4,%%mm3\n\t" \
|
||||||
"paddw "OC_8",%%mm6\n\t" \
|
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
|
||||||
/*r5=B''+B''*/ \
|
/*r5=B''+B''*/ \
|
||||||
"paddw %%mm5,%%mm5\n\t" \
|
"paddw %%mm5,%%mm5\n\t" \
|
||||||
/*r5=R5=F'+B''*/ \
|
/*r5=R5=F'+B''*/ \
|
||||||
@ -477,14 +462,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
/*r6=NR6*/ \
|
/*r6=NR6*/ \
|
||||||
"psraw $4,%%mm6\n\t" \
|
"psraw $4,%%mm6\n\t" \
|
||||||
/*Store NR4 at J(4).*/ \
|
/*Store NR4 at J(4).*/ \
|
||||||
"movq %%mm4,"OC_J(4)"\n\t" \
|
"movq %%mm4,"OC_J(4,_y)"\n\t" \
|
||||||
/*r5=NR5*/ \
|
/*r5=NR5*/ \
|
||||||
"psraw $4,%%mm5\n\t" \
|
"psraw $4,%%mm5\n\t" \
|
||||||
/*Store NR3 at I(3).*/ \
|
/*Store NR3 at I(3).*/ \
|
||||||
"movq %%mm3,"OC_I(3)"\n\t" \
|
"movq %%mm3,"OC_I(3,_y)"\n\t" \
|
||||||
/*r7=R7=G'-C'*/ \
|
/*r7=R7=G'-C'*/ \
|
||||||
"psubw %%mm0,%%mm7\n\t" \
|
"psubw %%mm0,%%mm7\n\t" \
|
||||||
"paddw "OC_8",%%mm7\n\t" \
|
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
|
||||||
/*r0=C'+C'*/ \
|
/*r0=C'+C'*/ \
|
||||||
"paddw %%mm0,%%mm0\n\t" \
|
"paddw %%mm0,%%mm0\n\t" \
|
||||||
/*r0=R0=G'+C'*/ \
|
/*r0=R0=G'+C'*/ \
|
||||||
@ -492,46 +477,55 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
/*r7=NR7*/ \
|
/*r7=NR7*/ \
|
||||||
"psraw $4,%%mm7\n\t" \
|
"psraw $4,%%mm7\n\t" \
|
||||||
/*Store NR6 at J(6).*/ \
|
/*Store NR6 at J(6).*/ \
|
||||||
"movq %%mm6,"OC_J(6)"\n\t" \
|
"movq %%mm6,"OC_J(6,_y)"\n\t" \
|
||||||
/*r0=NR0*/ \
|
/*r0=NR0*/ \
|
||||||
"psraw $4,%%mm0\n\t" \
|
"psraw $4,%%mm0\n\t" \
|
||||||
/*Store NR5 at J(5).*/ \
|
/*Store NR5 at J(5).*/ \
|
||||||
"movq %%mm5,"OC_J(5)"\n\t" \
|
"movq %%mm5,"OC_J(5,_y)"\n\t" \
|
||||||
/*Store NR7 at J(7).*/ \
|
/*Store NR7 at J(7).*/ \
|
||||||
"movq %%mm7,"OC_J(7)"\n\t" \
|
"movq %%mm7,"OC_J(7,_y)"\n\t" \
|
||||||
/*Store NR0 at I(0).*/ \
|
/*Store NR0 at I(0).*/ \
|
||||||
"movq %%mm0,"OC_I(0)"\n\t" \
|
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||||
"#end OC_COLUMN_IDCT_10\n\t" \
|
"#end OC_COLUMN_IDCT_10\n\t" \
|
||||||
|
|
||||||
static void oc_idct8x8_10(ogg_int16_t _y[64]){
|
static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||||
__asm__ __volatile__(
|
__asm__ __volatile__(
|
||||||
#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
|
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||||
#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
|
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
|
||||||
/*Done with dequant, descramble, and partial transpose.
|
/*Done with dequant, descramble, and partial transpose.
|
||||||
Now do the iDCT itself.*/
|
Now do the iDCT itself.*/
|
||||||
OC_ROW_IDCT_10
|
OC_ROW_IDCT_10(y,x)
|
||||||
OC_TRANSPOSE
|
OC_TRANSPOSE(y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
|
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||||
#define OC_J(_k) OC_I(_k)
|
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||||
OC_COLUMN_IDCT_10
|
OC_COLUMN_IDCT_10(y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
|
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
|
||||||
#define OC_J(_k) OC_I(_k)
|
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||||
OC_COLUMN_IDCT_10
|
OC_COLUMN_IDCT_10(y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
:
|
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
|
||||||
:[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
|
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
|
||||||
|
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
|
||||||
|
);
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"pxor %%mm0,%%mm0\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
|
||||||
|
:[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||||
version of the transform.*/
|
version of the transform.*/
|
||||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
|
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||||
/*_last_zzi is subtly different from an actual count of the number of
|
/*_last_zzi is subtly different from an actual count of the number of
|
||||||
coefficients we decoded for this block.
|
coefficients we decoded for this block.
|
||||||
It contains the value of zzi BEFORE the final token in the block was
|
It contains the value of zzi BEFORE the final token in the block was
|
||||||
@ -557,8 +551,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
|
|||||||
gets.
|
gets.
|
||||||
Needless to say we inherited this approach from VP3.*/
|
Needless to say we inherited this approach from VP3.*/
|
||||||
/*Then perform the iDCT.*/
|
/*Then perform the iDCT.*/
|
||||||
if(_last_zzi<10)oc_idct8x8_10(_y);
|
if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
|
||||||
else oc_idct8x8_slow(_y);
|
else oc_idct8x8_slow_mmx(_y,_x);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
271
thirdparty/libtheora/x86/mmxloop.h
vendored
271
thirdparty/libtheora/x86/mmxloop.h
vendored
@ -9,88 +9,191 @@
|
|||||||
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
|
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
|
||||||
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
|
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
|
||||||
#define OC_LOOP_FILTER8_MMX \
|
#define OC_LOOP_FILTER8_MMX \
|
||||||
"#OC_LOOP_FILTER8_MMX\n\t" \
|
"#OC_LOOP_FILTER8_MMX\n\t" \
|
||||||
/*mm7=0*/ \
|
/*mm7=0*/ \
|
||||||
"pxor %%mm7,%%mm7\n\t" \
|
"pxor %%mm7,%%mm7\n\t" \
|
||||||
/*mm6:mm0={a0,...,a7}*/ \
|
/*mm6:mm0={a0,...,a7}*/ \
|
||||||
"movq %%mm0,%%mm6\n\t" \
|
"movq %%mm0,%%mm6\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm0\n\t" \
|
"punpcklbw %%mm7,%%mm0\n\t" \
|
||||||
"punpckhbw %%mm7,%%mm6\n\t" \
|
"punpckhbw %%mm7,%%mm6\n\t" \
|
||||||
/*mm3:mm5={d0,...,d7}*/ \
|
/*mm3:mm5={d0,...,d7}*/ \
|
||||||
"movq %%mm3,%%mm5\n\t" \
|
"movq %%mm3,%%mm5\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm3\n\t" \
|
"punpcklbw %%mm7,%%mm3\n\t" \
|
||||||
"punpckhbw %%mm7,%%mm5\n\t" \
|
"punpckhbw %%mm7,%%mm5\n\t" \
|
||||||
/*mm6:mm0={a0-d0,...,a7-d7}*/ \
|
/*mm6:mm0={a0-d0,...,a7-d7}*/ \
|
||||||
"psubw %%mm3,%%mm0\n\t" \
|
"psubw %%mm3,%%mm0\n\t" \
|
||||||
"psubw %%mm5,%%mm6\n\t" \
|
"psubw %%mm5,%%mm6\n\t" \
|
||||||
/*mm3:mm1={b0,...,b7}*/ \
|
/*mm3:mm1={b0,...,b7}*/ \
|
||||||
"movq %%mm1,%%mm3\n\t" \
|
"movq %%mm1,%%mm3\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm1\n\t" \
|
"punpcklbw %%mm7,%%mm1\n\t" \
|
||||||
"movq %%mm2,%%mm4\n\t" \
|
"movq %%mm2,%%mm4\n\t" \
|
||||||
"punpckhbw %%mm7,%%mm3\n\t" \
|
"punpckhbw %%mm7,%%mm3\n\t" \
|
||||||
/*mm5:mm4={c0,...,c7}*/ \
|
/*mm5:mm4={c0,...,c7}*/ \
|
||||||
"movq %%mm2,%%mm5\n\t" \
|
"movq %%mm2,%%mm5\n\t" \
|
||||||
"punpcklbw %%mm7,%%mm4\n\t" \
|
"punpcklbw %%mm7,%%mm4\n\t" \
|
||||||
"punpckhbw %%mm7,%%mm5\n\t" \
|
"punpckhbw %%mm7,%%mm5\n\t" \
|
||||||
/*mm7={3}x4 \
|
/*mm7={3}x4 \
|
||||||
mm5:mm4={c0-b0,...,c7-b7}*/ \
|
mm5:mm4={c0-b0,...,c7-b7}*/ \
|
||||||
"pcmpeqw %%mm7,%%mm7\n\t" \
|
"pcmpeqw %%mm7,%%mm7\n\t" \
|
||||||
"psubw %%mm1,%%mm4\n\t" \
|
"psubw %%mm1,%%mm4\n\t" \
|
||||||
"psrlw $14,%%mm7\n\t" \
|
"psrlw $14,%%mm7\n\t" \
|
||||||
"psubw %%mm3,%%mm5\n\t" \
|
"psubw %%mm3,%%mm5\n\t" \
|
||||||
/*Scale by 3.*/ \
|
/*Scale by 3.*/ \
|
||||||
"pmullw %%mm7,%%mm4\n\t" \
|
"pmullw %%mm7,%%mm4\n\t" \
|
||||||
"pmullw %%mm7,%%mm5\n\t" \
|
"pmullw %%mm7,%%mm5\n\t" \
|
||||||
/*mm7={4}x4 \
|
/*mm7={4}x4 \
|
||||||
mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
|
mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
|
||||||
"psrlw $1,%%mm7\n\t" \
|
"psrlw $1,%%mm7\n\t" \
|
||||||
"paddw %%mm0,%%mm4\n\t" \
|
"paddw %%mm0,%%mm4\n\t" \
|
||||||
"psllw $2,%%mm7\n\t" \
|
"psllw $2,%%mm7\n\t" \
|
||||||
"movq (%[ll]),%%mm0\n\t" \
|
"movq (%[ll]),%%mm0\n\t" \
|
||||||
"paddw %%mm6,%%mm5\n\t" \
|
"paddw %%mm6,%%mm5\n\t" \
|
||||||
/*R_i has the range [-127,128], so we compute -R_i instead. \
|
/*R_i has the range [-127,128], so we compute -R_i instead. \
|
||||||
mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
|
mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
|
||||||
"psubw %%mm7,%%mm4\n\t" \
|
"psubw %%mm7,%%mm4\n\t" \
|
||||||
"psubw %%mm7,%%mm5\n\t" \
|
"psubw %%mm7,%%mm5\n\t" \
|
||||||
"psraw $3,%%mm4\n\t" \
|
"psraw $3,%%mm4\n\t" \
|
||||||
"psraw $3,%%mm5\n\t" \
|
"psraw $3,%%mm5\n\t" \
|
||||||
"pcmpeqb %%mm7,%%mm7\n\t" \
|
"pcmpeqb %%mm7,%%mm7\n\t" \
|
||||||
"packsswb %%mm5,%%mm4\n\t" \
|
"packsswb %%mm5,%%mm4\n\t" \
|
||||||
"pxor %%mm6,%%mm6\n\t" \
|
"pxor %%mm6,%%mm6\n\t" \
|
||||||
"pxor %%mm7,%%mm4\n\t" \
|
"pxor %%mm7,%%mm4\n\t" \
|
||||||
"packuswb %%mm3,%%mm1\n\t" \
|
"packuswb %%mm3,%%mm1\n\t" \
|
||||||
/*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \
|
/*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \
|
||||||
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
|
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
|
||||||
we have to split things by sign (the other option is to work in 16 bits, \
|
we have to split things by sign (the other option is to work in 16 bits, \
|
||||||
but working in 8 bits gives much better parallelism). \
|
but working in 8 bits gives much better parallelism). \
|
||||||
We compute abs(R_i), but save a mask of which terms were negative in mm6. \
|
We compute abs(R_i), but save a mask of which terms were negative in mm6. \
|
||||||
Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
|
Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
|
||||||
Finally, we split mm4 into positive and negative pieces using the mask in \
|
Finally, we split mm4 into positive and negative pieces using the mask in \
|
||||||
mm6, and add and subtract them as appropriate.*/ \
|
mm6, and add and subtract them as appropriate.*/ \
|
||||||
/*mm4=abs(-R_i)*/ \
|
/*mm4=abs(-R_i)*/ \
|
||||||
/*mm7=255-2*L*/ \
|
/*mm7=255-2*L*/ \
|
||||||
"pcmpgtb %%mm4,%%mm6\n\t" \
|
"pcmpgtb %%mm4,%%mm6\n\t" \
|
||||||
"psubb %%mm0,%%mm7\n\t" \
|
"psubb %%mm0,%%mm7\n\t" \
|
||||||
"pxor %%mm6,%%mm4\n\t" \
|
"pxor %%mm6,%%mm4\n\t" \
|
||||||
"psubb %%mm0,%%mm7\n\t" \
|
"psubb %%mm0,%%mm7\n\t" \
|
||||||
"psubb %%mm6,%%mm4\n\t" \
|
"psubb %%mm6,%%mm4\n\t" \
|
||||||
/*mm7=255-max(2*L-abs(R_i),0)*/ \
|
/*mm7=255-max(2*L-abs(R_i),0)*/ \
|
||||||
"paddusb %%mm4,%%mm7\n\t" \
|
"paddusb %%mm4,%%mm7\n\t" \
|
||||||
/*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
|
/*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
|
||||||
"paddusb %%mm7,%%mm4\n\t" \
|
"paddusb %%mm7,%%mm4\n\t" \
|
||||||
"psubusb %%mm7,%%mm4\n\t" \
|
"psubusb %%mm7,%%mm4\n\t" \
|
||||||
/*Now split mm4 by the original sign of -R_i.*/ \
|
/*Now split mm4 by the original sign of -R_i.*/ \
|
||||||
"movq %%mm4,%%mm5\n\t" \
|
"movq %%mm4,%%mm5\n\t" \
|
||||||
"pand %%mm6,%%mm4\n\t" \
|
"pand %%mm6,%%mm4\n\t" \
|
||||||
"pandn %%mm5,%%mm6\n\t" \
|
"pandn %%mm5,%%mm6\n\t" \
|
||||||
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
|
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
|
||||||
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
|
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
|
||||||
"paddusb %%mm4,%%mm1\n\t" \
|
"paddusb %%mm4,%%mm1\n\t" \
|
||||||
"psubusb %%mm4,%%mm2\n\t" \
|
"psubusb %%mm4,%%mm2\n\t" \
|
||||||
"psubusb %%mm6,%%mm1\n\t" \
|
"psubusb %%mm6,%%mm1\n\t" \
|
||||||
"paddusb %%mm6,%%mm2\n\t" \
|
"paddusb %%mm6,%%mm2\n\t" \
|
||||||
|
|
||||||
#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
|
/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
|
||||||
|
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
|
||||||
|
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
|
||||||
|
All other MMX registers are clobbered.*/
|
||||||
|
#define OC_LOOP_FILTER8_MMXEXT \
|
||||||
|
"#OC_LOOP_FILTER8_MMXEXT\n\t" \
|
||||||
|
/*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
|
||||||
|
-R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
|
||||||
|
/*This first part is based on the transformation \
|
||||||
|
f = -(3*(c-b)+a-d+4>>3) \
|
||||||
|
= -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
|
||||||
|
= -(3*(c+~b)+(a+~d)-1016>>3) \
|
||||||
|
= 127-(3*(c+~b)+(a+~d)>>3) \
|
||||||
|
= 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
|
||||||
|
Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
|
||||||
|
fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
|
||||||
|
Using this, the last expression above can be computed in 8 bits of working \
|
||||||
|
precision via: \
|
||||||
|
u = ~pavgb(~b,c); \
|
||||||
|
v = pavgb(b,~c); \
|
||||||
|
This mask is 0 or 0xFF, and controls whether t is biased up or down: \
|
||||||
|
m = u-v; \
|
||||||
|
t = m^pavgb(m^~a,m^d); \
|
||||||
|
f = 128+pavgb(pavgb(t,u),v); \
|
||||||
|
This required some careful analysis to ensure that carries are propagated \
|
||||||
|
correctly in all cases, but has been checked exhaustively.*/ \
|
||||||
|
/*input (a, b, c, d, ., ., ., .)*/ \
|
||||||
|
/*ff=0xFF; \
|
||||||
|
u=b; \
|
||||||
|
v=c; \
|
||||||
|
ll=255-2*L;*/ \
|
||||||
|
"pcmpeqb %%mm7,%%mm7\n\t" \
|
||||||
|
"movq %%mm1,%%mm4\n\t" \
|
||||||
|
"movq %%mm2,%%mm5\n\t" \
|
||||||
|
"movq (%[ll]),%%mm6\n\t" \
|
||||||
|
/*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
|
||||||
|
/*u^=ff; \
|
||||||
|
v^=ff;*/ \
|
||||||
|
"pxor %%mm7,%%mm4\n\t" \
|
||||||
|
"pxor %%mm7,%%mm5\n\t" \
|
||||||
|
/*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
|
||||||
|
/*u=pavgb(u,c); \
|
||||||
|
v=pavgb(v,b);*/ \
|
||||||
|
"pavgb %%mm2,%%mm4\n\t" \
|
||||||
|
"pavgb %%mm1,%%mm5\n\t" \
|
||||||
|
/*u^=ff; \
|
||||||
|
a^=ff;*/ \
|
||||||
|
"pxor %%mm7,%%mm4\n\t" \
|
||||||
|
"pxor %%mm7,%%mm0\n\t" \
|
||||||
|
/*m=u-v;*/ \
|
||||||
|
"psubb %%mm5,%%mm4\n\t" \
|
||||||
|
/*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
|
||||||
|
/*a^=m; \
|
||||||
|
d^=m;*/ \
|
||||||
|
"pxor %%mm4,%%mm0\n\t" \
|
||||||
|
"pxor %%mm4,%%mm3\n\t" \
|
||||||
|
/*t=pavgb(a,d);*/ \
|
||||||
|
"pavgb %%mm3,%%mm0\n\t" \
|
||||||
|
"psllw $7,%%mm7\n\t" \
|
||||||
|
/*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
|
||||||
|
/*t^=m; \
|
||||||
|
u=m+v;*/ \
|
||||||
|
"pxor %%mm4,%%mm0\n\t" \
|
||||||
|
"paddb %%mm5,%%mm4\n\t" \
|
||||||
|
/*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
|
||||||
|
/*f=pavgb(f,u); \
|
||||||
|
of=128;*/ \
|
||||||
|
"pavgb %%mm4,%%mm0\n\t" \
|
||||||
|
"packsswb %%mm7,%%mm7\n\t" \
|
||||||
|
/*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
|
||||||
|
/*f=pavgb(f,v);*/ \
|
||||||
|
"pavgb %%mm5,%%mm0\n\t" \
|
||||||
|
"movq %%mm7,%%mm3\n\t" \
|
||||||
|
"movq %%mm6,%%mm4\n\t" \
|
||||||
|
/*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
|
||||||
|
/*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the sepc.*/ \
|
||||||
|
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
|
||||||
|
we have to split things by sign (the other option is to work in 16 bits, \
|
||||||
|
but staying in 8 bits gives much better parallelism).*/ \
|
||||||
|
/*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
|
||||||
|
This is the same number of instructions as computing a mask and splitting \
|
||||||
|
after the lflim computation, but has shorter dependency chains.*/ \
|
||||||
|
/*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\
|
||||||
|
mm3=R_i>0?R_i:0* (denoted abs(R_i>0))*/ \
|
||||||
|
"psubusb %%mm0,%%mm3\n\t" \
|
||||||
|
"psubusb %%mm7,%%mm0\n\t" \
|
||||||
|
/*mm6=255-max(2*L-abs(R_i<0),0) \
|
||||||
|
mm4=255-max(2*L-abs(R_i>0),0)*/ \
|
||||||
|
"paddusb %%mm3,%%mm4\n\t" \
|
||||||
|
"paddusb %%mm0,%%mm6\n\t" \
|
||||||
|
/*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
|
||||||
|
mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
|
||||||
|
"paddusb %%mm4,%%mm3\n\t" \
|
||||||
|
"paddusb %%mm6,%%mm0\n\t" \
|
||||||
|
"psubusb %%mm4,%%mm3\n\t" \
|
||||||
|
"psubusb %%mm6,%%mm0\n\t" \
|
||||||
|
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
|
||||||
|
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
|
||||||
|
"paddusb %%mm3,%%mm1\n\t" \
|
||||||
|
"psubusb %%mm3,%%mm2\n\t" \
|
||||||
|
"psubusb %%mm0,%%mm1\n\t" \
|
||||||
|
"paddusb %%mm0,%%mm2\n\t" \
|
||||||
|
|
||||||
|
#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
|
||||||
do{ \
|
do{ \
|
||||||
ptrdiff_t ystride3__; \
|
ptrdiff_t ystride3__; \
|
||||||
__asm__ __volatile__( \
|
__asm__ __volatile__( \
|
||||||
@ -104,7 +207,7 @@
|
|||||||
"movq (%[pix],%[ystride]),%%mm1\n\t" \
|
"movq (%[pix],%[ystride]),%%mm1\n\t" \
|
||||||
/*mm2={c0,...,c7}*/ \
|
/*mm2={c0,...,c7}*/ \
|
||||||
"movq (%[pix],%[ystride],2),%%mm2\n\t" \
|
"movq (%[pix],%[ystride],2),%%mm2\n\t" \
|
||||||
OC_LOOP_FILTER8_MMX \
|
_filter \
|
||||||
/*Write it back out.*/ \
|
/*Write it back out.*/ \
|
||||||
"movq %%mm1,(%[pix],%[ystride])\n\t" \
|
"movq %%mm1,(%[pix],%[ystride])\n\t" \
|
||||||
"movq %%mm2,(%[pix],%[ystride],2)\n\t" \
|
"movq %%mm2,(%[pix],%[ystride],2)\n\t" \
|
||||||
@ -116,7 +219,7 @@
|
|||||||
} \
|
} \
|
||||||
while(0)
|
while(0)
|
||||||
|
|
||||||
#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
|
#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
|
||||||
do{ \
|
do{ \
|
||||||
unsigned char *pix__; \
|
unsigned char *pix__; \
|
||||||
ptrdiff_t ystride3__; \
|
ptrdiff_t ystride3__; \
|
||||||
@ -174,7 +277,7 @@
|
|||||||
"punpckldq %%mm5,%%mm2\n\t" \
|
"punpckldq %%mm5,%%mm2\n\t" \
|
||||||
/*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
|
/*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
|
||||||
"punpckhdq %%mm5,%%mm3\n\t" \
|
"punpckhdq %%mm5,%%mm3\n\t" \
|
||||||
OC_LOOP_FILTER8_MMX \
|
_filter \
|
||||||
/*mm2={b0+R_0'',...,b7+R_7''}*/ \
|
/*mm2={b0+R_0'',...,b7+R_7''}*/ \
|
||||||
"movq %%mm1,%%mm0\n\t" \
|
"movq %%mm1,%%mm0\n\t" \
|
||||||
/*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
|
/*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
|
||||||
|
162
thirdparty/libtheora/x86/mmxstate.c
vendored
162
thirdparty/libtheora/x86/mmxstate.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -19,23 +19,23 @@
|
|||||||
Originally written by Rudolf Marek.*/
|
Originally written by Rudolf Marek.*/
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "x86int.h"
|
#include "x86int.h"
|
||||||
#include "mmxfrag.h"
|
|
||||||
#include "mmxloop.h"
|
#include "mmxloop.h"
|
||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
|
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||||
unsigned char *dst;
|
unsigned char *dst;
|
||||||
ptrdiff_t frag_buf_off;
|
ptrdiff_t frag_buf_off;
|
||||||
int ystride;
|
int ystride;
|
||||||
int mb_mode;
|
int refi;
|
||||||
/*Apply the inverse transform.*/
|
/*Apply the inverse transform.*/
|
||||||
/*Special case only having a DC component.*/
|
/*Special case only having a DC component.*/
|
||||||
if(_last_zzi<2){
|
if(_last_zzi<2){
|
||||||
/*Note that this value must be unsigned, to keep the __asm__ block from
|
/*Note that this value must be unsigned, to keep the __asm__ block from
|
||||||
sign-extending it when it puts it in a register.*/
|
sign-extending it when it puts it in a register.*/
|
||||||
ogg_uint16_t p;
|
ogg_uint16_t p;
|
||||||
|
int i;
|
||||||
/*We round this dequant product (and not any of the others) because there's
|
/*We round this dequant product (and not any of the others) because there's
|
||||||
no iDCT rounding.*/
|
no iDCT rounding.*/
|
||||||
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
|
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
|
||||||
@ -47,81 +47,48 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
|||||||
"punpcklwd %%mm0,%%mm0\n\t"
|
"punpcklwd %%mm0,%%mm0\n\t"
|
||||||
/*mm0=AAAA AAAA AAAA AAAA*/
|
/*mm0=AAAA AAAA AAAA AAAA*/
|
||||||
"punpckldq %%mm0,%%mm0\n\t"
|
"punpckldq %%mm0,%%mm0\n\t"
|
||||||
"movq %%mm0,(%[y])\n\t"
|
|
||||||
"movq %%mm0,8(%[y])\n\t"
|
|
||||||
"movq %%mm0,16(%[y])\n\t"
|
|
||||||
"movq %%mm0,24(%[y])\n\t"
|
|
||||||
"movq %%mm0,32(%[y])\n\t"
|
|
||||||
"movq %%mm0,40(%[y])\n\t"
|
|
||||||
"movq %%mm0,48(%[y])\n\t"
|
|
||||||
"movq %%mm0,56(%[y])\n\t"
|
|
||||||
"movq %%mm0,64(%[y])\n\t"
|
|
||||||
"movq %%mm0,72(%[y])\n\t"
|
|
||||||
"movq %%mm0,80(%[y])\n\t"
|
|
||||||
"movq %%mm0,88(%[y])\n\t"
|
|
||||||
"movq %%mm0,96(%[y])\n\t"
|
|
||||||
"movq %%mm0,104(%[y])\n\t"
|
|
||||||
"movq %%mm0,112(%[y])\n\t"
|
|
||||||
"movq %%mm0,120(%[y])\n\t"
|
|
||||||
:
|
:
|
||||||
:[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
|
:[p]"r"((unsigned)p)
|
||||||
:"memory"
|
|
||||||
);
|
);
|
||||||
|
for(i=0;i<4;i++){
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
|
||||||
|
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
/*Dequantize the DC coefficient.*/
|
/*Dequantize the DC coefficient.*/
|
||||||
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
|
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
|
||||||
oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
|
oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
|
||||||
}
|
}
|
||||||
/*Fill in the target buffer.*/
|
/*Fill in the target buffer.*/
|
||||||
frag_buf_off=_state->frag_buf_offs[_fragi];
|
frag_buf_off=_state->frag_buf_offs[_fragi];
|
||||||
mb_mode=_state->frags[_fragi].mb_mode;
|
refi=_state->frags[_fragi].refi;
|
||||||
ystride=_state->ref_ystride[_pli];
|
ystride=_state->ref_ystride[_pli];
|
||||||
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
|
dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
|
||||||
if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
|
if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
|
||||||
else{
|
else{
|
||||||
const unsigned char *ref;
|
const unsigned char *ref;
|
||||||
int mvoffsets[2];
|
int mvoffsets[2];
|
||||||
ref=
|
ref=_state->ref_frame_data[refi]+frag_buf_off;
|
||||||
_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
|
|
||||||
+frag_buf_off;
|
|
||||||
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
|
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
|
||||||
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
|
_state->frag_mvs[_fragi])>1){
|
||||||
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
|
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
|
||||||
_dct_coeffs);
|
_dct_coeffs+64);
|
||||||
}
|
}
|
||||||
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
|
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*We copy these entire function to inline the actual MMX routines so that we
|
/*We copy these entire function to inline the actual MMX routines so that we
|
||||||
use only a single indirect call.*/
|
use only a single indirect call.*/
|
||||||
|
|
||||||
/*Copies the fragments specified by the lists of fragment indices from one
|
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
|
||||||
frame to another.
|
memset(_bv,_flimit,8);
|
||||||
_fragis: A pointer to a list of fragment indices.
|
|
||||||
_nfragis: The number of fragment indices to copy.
|
|
||||||
_dst_frame: The reference frame to copy to.
|
|
||||||
_src_frame: The reference frame to copy from.
|
|
||||||
_pli: The color plane the fragments lie in.*/
|
|
||||||
void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
|
||||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
|
||||||
int _dst_frame,int _src_frame,int _pli){
|
|
||||||
const ptrdiff_t *frag_buf_offs;
|
|
||||||
const unsigned char *src_frame_data;
|
|
||||||
unsigned char *dst_frame_data;
|
|
||||||
ptrdiff_t fragii;
|
|
||||||
int ystride;
|
|
||||||
dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
|
|
||||||
src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
|
|
||||||
ystride=_state->ref_ystride[_pli];
|
|
||||||
frag_buf_offs=_state->frag_buf_offs;
|
|
||||||
for(fragii=0;fragii<_nfragis;fragii++){
|
|
||||||
ptrdiff_t frag_buf_off;
|
|
||||||
frag_buf_off=frag_buf_offs[_fragis[fragii]];
|
|
||||||
OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
|
|
||||||
src_frame_data+frag_buf_off,ystride);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||||
@ -133,7 +100,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
|||||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||||
OC_ALIGN8(unsigned char ll[8]);
|
OC_ALIGN8(unsigned char ll[8]);
|
||||||
const oc_fragment_plane *fplane;
|
const oc_fragment_plane *fplane;
|
||||||
const oc_fragment *frags;
|
const oc_fragment *frags;
|
||||||
@ -170,13 +137,84 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
|||||||
if(frags[fragi].coded){
|
if(frags[fragi].coded){
|
||||||
unsigned char *ref;
|
unsigned char *ref;
|
||||||
ref=ref_frame_data+frag_buf_offs[fragi];
|
ref=ref_frame_data+frag_buf_offs[fragi];
|
||||||
if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
|
if(fragi>fragi0){
|
||||||
if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
|
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
|
||||||
|
}
|
||||||
|
if(fragi0>fragi_top){
|
||||||
|
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
|
||||||
|
}
|
||||||
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
||||||
OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
|
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
|
||||||
}
|
}
|
||||||
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
||||||
OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
|
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fragi++;
|
||||||
|
}
|
||||||
|
fragi0+=nhfrags;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
|
||||||
|
memset(_bv,~(_flimit<<1),8);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||||
|
The filter may be run on the bottom edge, affecting pixels in the next row of
|
||||||
|
fragments, so this row also needs to be available.
|
||||||
|
_bv: The bounding values array.
|
||||||
|
_refi: The index of the frame buffer to filter.
|
||||||
|
_pli: The color plane to filter.
|
||||||
|
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||||
|
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||||
|
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
|
||||||
|
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||||
|
const oc_fragment_plane *fplane;
|
||||||
|
const oc_fragment *frags;
|
||||||
|
const ptrdiff_t *frag_buf_offs;
|
||||||
|
unsigned char *ref_frame_data;
|
||||||
|
ptrdiff_t fragi_top;
|
||||||
|
ptrdiff_t fragi_bot;
|
||||||
|
ptrdiff_t fragi0;
|
||||||
|
ptrdiff_t fragi0_end;
|
||||||
|
int ystride;
|
||||||
|
int nhfrags;
|
||||||
|
fplane=_state->fplanes+_pli;
|
||||||
|
nhfrags=fplane->nhfrags;
|
||||||
|
fragi_top=fplane->froffset;
|
||||||
|
fragi_bot=fragi_top+fplane->nfrags;
|
||||||
|
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
||||||
|
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
|
||||||
|
ystride=_state->ref_ystride[_pli];
|
||||||
|
frags=_state->frags;
|
||||||
|
frag_buf_offs=_state->frag_buf_offs;
|
||||||
|
ref_frame_data=_state->ref_frame_data[_refi];
|
||||||
|
/*The following loops are constructed somewhat non-intuitively on purpose.
|
||||||
|
The main idea is: if a block boundary has at least one coded fragment on
|
||||||
|
it, the filter is applied to it.
|
||||||
|
However, the order that the filters are applied in matters, and VP3 chose
|
||||||
|
the somewhat strange ordering used below.*/
|
||||||
|
while(fragi0<fragi0_end){
|
||||||
|
ptrdiff_t fragi;
|
||||||
|
ptrdiff_t fragi_end;
|
||||||
|
fragi=fragi0;
|
||||||
|
fragi_end=fragi+nhfrags;
|
||||||
|
while(fragi<fragi_end){
|
||||||
|
if(frags[fragi].coded){
|
||||||
|
unsigned char *ref;
|
||||||
|
ref=ref_frame_data+frag_buf_offs[fragi];
|
||||||
|
if(fragi>fragi0){
|
||||||
|
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
|
||||||
|
}
|
||||||
|
if(fragi0>fragi_top){
|
||||||
|
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
|
||||||
|
}
|
||||||
|
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
||||||
|
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
|
||||||
|
}
|
||||||
|
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
||||||
|
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fragi++;
|
fragi++;
|
||||||
|
501
thirdparty/libtheora/x86/sse2encfrag.c
vendored
Normal file
501
thirdparty/libtheora/x86/sse2encfrag.c
vendored
Normal file
@ -0,0 +1,501 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
function:
|
||||||
|
last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
#include <stddef.h>
|
||||||
|
#include "x86enc.h"
|
||||||
|
#include "sse2trans.h"
|
||||||
|
|
||||||
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
|
/*Load a 4x8 array of pixels values from %[src] and %[ref] and compute their
|
||||||
|
16-bit differences.
|
||||||
|
On output, these are stored in _m0, xmm1, xmm2, and xmm3.
|
||||||
|
xmm4 and xmm5 are clobbered.*/
|
||||||
|
#define OC_LOAD_SUB_4x8(_m0) \
|
||||||
|
"#OC_LOAD_SUB_4x8\n\t" \
|
||||||
|
/*Load the first three rows.*/ \
|
||||||
|
"movq (%[src]),"_m0"\n\t" \
|
||||||
|
"movq (%[ref]),%%xmm4\n\t" \
|
||||||
|
"movq (%[src],%[ystride]),%%xmm1\n\t" \
|
||||||
|
"movq (%[ref],%[ystride]),%%xmm3\n\t" \
|
||||||
|
"movq (%[src],%[ystride],2),%%xmm2\n\t" \
|
||||||
|
"movq (%[ref],%[ystride],2),%%xmm5\n\t" \
|
||||||
|
/*Unpack and subtract.*/ \
|
||||||
|
"punpcklbw %%xmm4,"_m0"\n\t" \
|
||||||
|
"punpcklbw %%xmm4,%%xmm4\n\t" \
|
||||||
|
"punpcklbw %%xmm3,%%xmm1\n\t" \
|
||||||
|
"punpcklbw %%xmm3,%%xmm3\n\t" \
|
||||||
|
"psubw %%xmm4,"_m0"\n\t" \
|
||||||
|
"psubw %%xmm3,%%xmm1\n\t" \
|
||||||
|
/*Load the last row.*/ \
|
||||||
|
"movq (%[src],%[ystride3]),%%xmm3\n\t" \
|
||||||
|
"movq (%[ref],%[ystride3]),%%xmm4\n\t" \
|
||||||
|
/*Unpack, subtract, and advance the pointers.*/ \
|
||||||
|
"punpcklbw %%xmm5,%%xmm2\n\t" \
|
||||||
|
"punpcklbw %%xmm5,%%xmm5\n\t" \
|
||||||
|
"lea (%[src],%[ystride],4),%[src]\n\t" \
|
||||||
|
"psubw %%xmm5,%%xmm2\n\t" \
|
||||||
|
"punpcklbw %%xmm4,%%xmm3\n\t" \
|
||||||
|
"punpcklbw %%xmm4,%%xmm4\n\t" \
|
||||||
|
"lea (%[ref],%[ystride],4),%[ref]\n\t" \
|
||||||
|
"psubw %%xmm4,%%xmm3\n\t" \
|
||||||
|
|
||||||
|
/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
|
||||||
|
On output, _m0 contains the sum of two of the rows, and the other two are
|
||||||
|
added to xmm7.*/
|
||||||
|
#define OC_SSD_4x8(_m0) \
|
||||||
|
"pmaddwd "_m0","_m0"\n\t" \
|
||||||
|
"pmaddwd %%xmm1,%%xmm1\n\t" \
|
||||||
|
"pmaddwd %%xmm2,%%xmm2\n\t" \
|
||||||
|
"pmaddwd %%xmm3,%%xmm3\n\t" \
|
||||||
|
"paddd %%xmm1,"_m0"\n\t" \
|
||||||
|
"paddd %%xmm3,%%xmm2\n\t" \
|
||||||
|
"paddd %%xmm2,%%xmm7\n\t" \
|
||||||
|
|
||||||
|
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride){
|
||||||
|
unsigned ret;
|
||||||
|
__asm__ __volatile__(
|
||||||
|
OC_LOAD_SUB_4x8("%%xmm7")
|
||||||
|
OC_SSD_4x8("%%xmm7")
|
||||||
|
OC_LOAD_SUB_4x8("%%xmm0")
|
||||||
|
OC_SSD_4x8("%%xmm0")
|
||||||
|
"paddd %%xmm0,%%xmm7\n\t"
|
||||||
|
"movdqa %%xmm7,%%xmm6\n\t"
|
||||||
|
"punpckhqdq %%xmm7,%%xmm7\n\t"
|
||||||
|
"paddd %%xmm6,%%xmm7\n\t"
|
||||||
|
"pshufd $1,%%xmm7,%%xmm6\n\t"
|
||||||
|
"paddd %%xmm6,%%xmm7\n\t"
|
||||||
|
"movd %%xmm7,%[ret]\n\t"
|
||||||
|
:[ret]"=a"(ret)
|
||||||
|
:[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
|
||||||
|
[ystride3]"r"((ptrdiff_t)_ystride*3)
|
||||||
|
);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
|
||||||
|
0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
|
||||||
|
};
|
||||||
|
|
||||||
|
/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
|
||||||
|
horizontal sums as well as their 16-bit differences subject to a mask.
|
||||||
|
%%xmm6 must contain OC_MASK_CONSTS[0...7]; %%xmm0...%%xmm4 are clobbered.*/
|
||||||
|
#define OC_LOAD_SUB_MASK_2x8 \
|
||||||
|
"#OC_LOAD_SUB_MASK_2x8\n\t" \
|
||||||
|
/*Start the loads and expand the next 8 bits of the mask.*/ \
|
||||||
|
"shl $8,%[m]\n\t" \
|
||||||
|
"movq (%[src]),%%xmm0\n\t" \
|
||||||
|
"mov %h[m],%b[m]\n\t" \
|
||||||
|
"movq (%[ref]),%%xmm2\n\t" \
|
||||||
|
"movd %[m],%%xmm4\n\t" \
|
||||||
|
"shr $8,%[m]\n\t" \
|
||||||
|
"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
|
||||||
|
"mov %h[m],%b[m]\n\t" \
|
||||||
|
"pand %%xmm6,%%xmm4\n\t" \
|
||||||
|
"pcmpeqb %%xmm6,%%xmm4\n\t" \
|
||||||
|
/*Perform the masking.*/ \
|
||||||
|
"pand %%xmm4,%%xmm0\n\t" \
|
||||||
|
"pand %%xmm4,%%xmm2\n\t" \
|
||||||
|
/*Finish the loads while unpacking the first set of rows, and expand the next
|
||||||
|
8 bits of the mask.*/ \
|
||||||
|
"movd %[m],%%xmm4\n\t" \
|
||||||
|
"movq (%[src],%[ystride]),%%xmm1\n\t" \
|
||||||
|
"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
|
||||||
|
"movq (%[ref],%[ystride]),%%xmm3\n\t" \
|
||||||
|
"pand %%xmm6,%%xmm4\n\t" \
|
||||||
|
"punpcklbw %%xmm2,%%xmm0\n\t" \
|
||||||
|
"pcmpeqb %%xmm6,%%xmm4\n\t" \
|
||||||
|
"punpcklbw %%xmm2,%%xmm2\n\t" \
|
||||||
|
/*Mask and unpack the second set of rows.*/ \
|
||||||
|
"pand %%xmm4,%%xmm1\n\t" \
|
||||||
|
"pand %%xmm4,%%xmm3\n\t" \
|
||||||
|
"punpcklbw %%xmm3,%%xmm1\n\t" \
|
||||||
|
"punpcklbw %%xmm3,%%xmm3\n\t" \
|
||||||
|
"psubw %%xmm2,%%xmm0\n\t" \
|
||||||
|
"psubw %%xmm3,%%xmm1\n\t" \
|
||||||
|
|
||||||
|
unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
|
||||||
|
ptrdiff_t ystride;
|
||||||
|
unsigned ret;
|
||||||
|
int i;
|
||||||
|
ystride=_ystride;
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"pxor %%xmm7,%%xmm7\n\t"
|
||||||
|
"movq %[c],%%xmm6\n\t"
|
||||||
|
:
|
||||||
|
:[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
|
||||||
|
);
|
||||||
|
for(i=0;i<4;i++){
|
||||||
|
unsigned m;
|
||||||
|
m=_mask&0xFFFF;
|
||||||
|
_mask>>=16;
|
||||||
|
if(m){
|
||||||
|
__asm__ __volatile__(
|
||||||
|
OC_LOAD_SUB_MASK_2x8
|
||||||
|
"pmaddwd %%xmm0,%%xmm0\n\t"
|
||||||
|
"pmaddwd %%xmm1,%%xmm1\n\t"
|
||||||
|
"paddd %%xmm0,%%xmm7\n\t"
|
||||||
|
"paddd %%xmm1,%%xmm7\n\t"
|
||||||
|
:[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
_src+=2*ystride;
|
||||||
|
_ref+=2*ystride;
|
||||||
|
}
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"movdqa %%xmm7,%%xmm6\n\t"
|
||||||
|
"punpckhqdq %%xmm7,%%xmm7\n\t"
|
||||||
|
"paddd %%xmm6,%%xmm7\n\t"
|
||||||
|
"pshufd $1,%%xmm7,%%xmm6\n\t"
|
||||||
|
"paddd %%xmm6,%%xmm7\n\t"
|
||||||
|
"movd %%xmm7,%[ret]\n\t"
|
||||||
|
:[ret]"=a"(ret)
|
||||||
|
);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
|
||||||
|
16-bit difference in %%xmm0...%%xmm7.*/
|
||||||
|
#define OC_LOAD_SUB_8x8 \
|
||||||
|
"#OC_LOAD_SUB_8x8\n\t" \
|
||||||
|
"movq (%[src]),%%xmm0\n\t" \
|
||||||
|
"movq (%[ref]),%%xmm4\n\t" \
|
||||||
|
"movq (%[src],%[src_ystride]),%%xmm1\n\t" \
|
||||||
|
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||||
|
"movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
|
||||||
|
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||||
|
"movq (%[src]),%%xmm2\n\t" \
|
||||||
|
"movq (%[ref]),%%xmm7\n\t" \
|
||||||
|
"movq (%[src],%[src_ystride]),%%xmm3\n\t" \
|
||||||
|
"movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
|
||||||
|
"punpcklbw %%xmm4,%%xmm0\n\t" \
|
||||||
|
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||||
|
"punpcklbw %%xmm4,%%xmm4\n\t" \
|
||||||
|
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||||
|
"psubw %%xmm4,%%xmm0\n\t" \
|
||||||
|
"movq (%[src]),%%xmm4\n\t" \
|
||||||
|
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||||
|
"movq (%[ref]),%%xmm0\n\t" \
|
||||||
|
"punpcklbw %%xmm5,%%xmm1\n\t" \
|
||||||
|
"punpcklbw %%xmm5,%%xmm5\n\t" \
|
||||||
|
"psubw %%xmm5,%%xmm1\n\t" \
|
||||||
|
"movq (%[src],%[src_ystride]),%%xmm5\n\t" \
|
||||||
|
"punpcklbw %%xmm7,%%xmm2\n\t" \
|
||||||
|
"punpcklbw %%xmm7,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm7,%%xmm2\n\t" \
|
||||||
|
"movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
|
||||||
|
"punpcklbw %%xmm6,%%xmm3\n\t" \
|
||||||
|
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||||
|
"punpcklbw %%xmm6,%%xmm6\n\t" \
|
||||||
|
"psubw %%xmm6,%%xmm3\n\t" \
|
||||||
|
"movq (%[src]),%%xmm6\n\t" \
|
||||||
|
"punpcklbw %%xmm0,%%xmm4\n\t" \
|
||||||
|
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||||
|
"punpcklbw %%xmm0,%%xmm0\n\t" \
|
||||||
|
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||||
|
"psubw %%xmm0,%%xmm4\n\t" \
|
||||||
|
"movq (%[ref]),%%xmm0\n\t" \
|
||||||
|
"punpcklbw %%xmm7,%%xmm5\n\t" \
|
||||||
|
"neg %[src_ystride]\n\t" \
|
||||||
|
"punpcklbw %%xmm7,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm7,%%xmm5\n\t" \
|
||||||
|
"movq (%[src],%[src_ystride]),%%xmm7\n\t" \
|
||||||
|
"punpcklbw %%xmm0,%%xmm6\n\t" \
|
||||||
|
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||||
|
"punpcklbw %%xmm0,%%xmm0\n\t" \
|
||||||
|
"neg %[ref_ystride]\n\t" \
|
||||||
|
"psubw %%xmm0,%%xmm6\n\t" \
|
||||||
|
"movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
|
||||||
|
"punpcklbw %%xmm0,%%xmm7\n\t" \
|
||||||
|
"punpcklbw %%xmm0,%%xmm0\n\t" \
|
||||||
|
"psubw %%xmm0,%%xmm7\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
|
||||||
|
|
||||||
|
/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
|
||||||
|
#define OC_LOAD_8x8 \
|
||||||
|
"#OC_LOAD_8x8\n\t" \
|
||||||
|
"movq (%[src]),%%xmm0\n\t" \
|
||||||
|
"movq (%[src],%[ystride]),%%xmm1\n\t" \
|
||||||
|
"movq (%[src],%[ystride],2),%%xmm2\n\t" \
|
||||||
|
"pxor %%xmm7,%%xmm7\n\t" \
|
||||||
|
"movq (%[src],%[ystride3]),%%xmm3\n\t" \
|
||||||
|
"punpcklbw %%xmm7,%%xmm0\n\t" \
|
||||||
|
"movq (%[src4]),%%xmm4\n\t" \
|
||||||
|
"punpcklbw %%xmm7,%%xmm1\n\t" \
|
||||||
|
"movq (%[src4],%[ystride]),%%xmm5\n\t" \
|
||||||
|
"punpcklbw %%xmm7,%%xmm2\n\t" \
|
||||||
|
"movq (%[src4],%[ystride],2),%%xmm6\n\t" \
|
||||||
|
"punpcklbw %%xmm7,%%xmm3\n\t" \
|
||||||
|
"movq (%[src4],%[ystride3]),%%xmm7\n\t" \
|
||||||
|
"punpcklbw %%xmm4,%%xmm4\n\t" \
|
||||||
|
"punpcklbw %%xmm5,%%xmm5\n\t" \
|
||||||
|
"psrlw $8,%%xmm4\n\t" \
|
||||||
|
"psrlw $8,%%xmm5\n\t" \
|
||||||
|
"punpcklbw %%xmm6,%%xmm6\n\t" \
|
||||||
|
"punpcklbw %%xmm7,%%xmm7\n\t" \
|
||||||
|
"psrlw $8,%%xmm6\n\t" \
|
||||||
|
"psrlw $8,%%xmm7\n\t" \
|
||||||
|
|
||||||
|
/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
|
||||||
|
Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
|
||||||
|
perform this stage in place with no temporary registers).*/
|
||||||
|
#define OC_HADAMARD_AB_8x8 \
|
||||||
|
"#OC_HADAMARD_AB_8x8\n\t" \
|
||||||
|
/*Stage A:*/ \
|
||||||
|
"paddw %%xmm5,%%xmm1\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm5\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm6\n\t" \
|
||||||
|
"psubw %%xmm1,%%xmm5\n\t" \
|
||||||
|
"psubw %%xmm2,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm3\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm7\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm4\n\t" \
|
||||||
|
"psubw %%xmm3,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm0,%%xmm4\n\t" \
|
||||||
|
/*Stage B:*/ \
|
||||||
|
"paddw %%xmm2,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm1\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm5\n\t" \
|
||||||
|
"paddw %%xmm2,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm3\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm0,%%xmm2\n\t" \
|
||||||
|
"psubw %%xmm1,%%xmm3\n\t" \
|
||||||
|
"psubw %%xmm4,%%xmm6\n\t" \
|
||||||
|
"psubw %%xmm5,%%xmm7\n\t" \
|
||||||
|
|
||||||
|
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
|
||||||
|
Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
|
||||||
|
place with no temporary registers).*/
|
||||||
|
#define OC_HADAMARD_C_8x8 \
|
||||||
|
"#OC_HADAMARD_C_8x8\n\t" \
|
||||||
|
/*Stage C:*/ \
|
||||||
|
"paddw %%xmm1,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm1,%%xmm1\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm3\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm5\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm0,%%xmm1\n\t" \
|
||||||
|
"psubw %%xmm2,%%xmm3\n\t" \
|
||||||
|
"psubw %%xmm4,%%xmm5\n\t" \
|
||||||
|
"psubw %%xmm6,%%xmm7\n\t" \
|
||||||
|
|
||||||
|
/*Performs an 8-point 1-D Hadamard transform in place.
|
||||||
|
Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
|
||||||
|
in place with no temporary registers).*/
|
||||||
|
#define OC_HADAMARD_8x8 \
|
||||||
|
OC_HADAMARD_AB_8x8 \
|
||||||
|
OC_HADAMARD_C_8x8 \
|
||||||
|
|
||||||
|
/*Performs the first part of the final stage of the Hadamard transform and
|
||||||
|
summing of absolute values.
|
||||||
|
At the end of this part, %%xmm1 will contain the DC coefficient of the
|
||||||
|
transform.*/
|
||||||
|
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
|
||||||
|
/*We use the fact that \
|
||||||
|
(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
|
||||||
|
to merge the final butterfly with the abs and the first stage of \
|
||||||
|
accumulation. \
|
||||||
|
Thus we can avoid using pabsw, which is not available until SSSE3. \
|
||||||
|
Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
|
||||||
|
implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
|
||||||
|
registers). \
|
||||||
|
Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
|
||||||
|
This implementation is only 26 (+4 for spilling registers).*/ \
|
||||||
|
"#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
|
||||||
|
"movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||||
|
"movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||||
|
/*xmm7={0x7FFF}x8 \
|
||||||
|
xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
|
||||||
|
"pcmpeqb %%xmm7,%%xmm7\n\t" \
|
||||||
|
"movdqa %%xmm4,%%xmm6\n\t" \
|
||||||
|
"psrlw $1,%%xmm7\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm6\n\t" \
|
||||||
|
"pmaxsw %%xmm5,%%xmm4\n\t" \
|
||||||
|
"paddsw %%xmm7,%%xmm6\n\t" \
|
||||||
|
"psubw %%xmm6,%%xmm4\n\t" \
|
||||||
|
/*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
|
||||||
|
xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
|
||||||
|
"movdqa %%xmm2,%%xmm6\n\t" \
|
||||||
|
"movdqa %%xmm0,%%xmm5\n\t" \
|
||||||
|
"pmaxsw %%xmm3,%%xmm2\n\t" \
|
||||||
|
"pmaxsw %%xmm1,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm6\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm1\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
|
||||||
|
|
||||||
|
/*Performs the second part of the final stage of the Hadamard transform and
|
||||||
|
summing of absolute values.*/
|
||||||
|
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
|
||||||
|
"#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
|
||||||
|
"paddsw %%xmm7,%%xmm6\n\t" \
|
||||||
|
"paddsw %%xmm7,%%xmm1\n\t" \
|
||||||
|
"psubw %%xmm6,%%xmm2\n\t" \
|
||||||
|
"psubw %%xmm1,%%xmm0\n\t" \
|
||||||
|
/*xmm7={1}x8 (needed for the horizontal add that follows) \
|
||||||
|
xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
|
||||||
|
"movdqa %%xmm3,%%xmm6\n\t" \
|
||||||
|
"pmaxsw %%xmm5,%%xmm3\n\t" \
|
||||||
|
"paddw %%xmm2,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm0\n\t" \
|
||||||
|
"paddsw %%xmm7,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm0\n\t" \
|
||||||
|
"psrlw $14,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm6,%%xmm0\n\t" \
|
||||||
|
|
||||||
|
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
|
||||||
|
absolute value of each component, and accumulates everything into xmm0.*/
|
||||||
|
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
|
||||||
|
|
||||||
|
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
|
||||||
|
component, and accumulates everything into xmm0.
|
||||||
|
Note that xmm0 will have an extra 4 added to each column, and that after
|
||||||
|
removing this value, the remainder will be half the conventional value.*/
|
||||||
|
#define OC_HADAMARD_ABS_ACCUM_8x8 \
|
||||||
|
OC_HADAMARD_AB_8x8 \
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_8x8
|
||||||
|
|
||||||
|
static unsigned oc_int_frag_satd_sse2(int *_dc,
|
||||||
|
const unsigned char *_src,int _src_ystride,
|
||||||
|
const unsigned char *_ref,int _ref_ystride){
|
||||||
|
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||||
|
unsigned ret;
|
||||||
|
unsigned ret2;
|
||||||
|
int dc;
|
||||||
|
__asm__ __volatile__(
|
||||||
|
OC_LOAD_SUB_8x8
|
||||||
|
OC_HADAMARD_8x8
|
||||||
|
OC_TRANSPOSE_8x8
|
||||||
|
/*We split out the stages here so we can save the DC coefficient in the
|
||||||
|
middle.*/
|
||||||
|
OC_HADAMARD_AB_8x8
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_A_8x8
|
||||||
|
"movd %%xmm1,%[dc]\n\t"
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_B_8x8
|
||||||
|
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||||
|
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||||
|
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||||
|
Now we finally have to promote things to dwords.
|
||||||
|
We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
|
||||||
|
latency of pmaddwd by starting to compute abs(dc) here.*/
|
||||||
|
"pmaddwd %%xmm7,%%xmm0\n\t"
|
||||||
|
"movsx %w[dc],%[dc]\n\t"
|
||||||
|
"cdq\n\t"
|
||||||
|
"movdqa %%xmm0,%%xmm1\n\t"
|
||||||
|
"punpckhqdq %%xmm0,%%xmm0\n\t"
|
||||||
|
"paddd %%xmm1,%%xmm0\n\t"
|
||||||
|
"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
|
||||||
|
"paddd %%xmm1,%%xmm0\n\t"
|
||||||
|
"movd %%xmm0,%[ret]\n\t"
|
||||||
|
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
|
||||||
|
added to them, a factor of two removed, and the DC value included;
|
||||||
|
correct the final sum here.*/
|
||||||
|
"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
|
||||||
|
"xor %[dc],%[ret2]\n\t"
|
||||||
|
"sub %[ret2],%[ret]\n\t"
|
||||||
|
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
|
||||||
|
and %[dc] with some of the inputs, since for once we don't write to
|
||||||
|
them until after we're done using everything but %[buf].*/
|
||||||
|
/*Note that _src_ystride and _ref_ystride must be given non-overlapping
|
||||||
|
constraints, otherwise if gcc can prove they're equal it will allocate
|
||||||
|
them to the same register (which is bad); _src and _ref face a similar
|
||||||
|
problem.
|
||||||
|
All four are destructively modified, but if we list them as output
|
||||||
|
constraints, gcc can't alias them with other outputs.*/
|
||||||
|
:[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
|
||||||
|
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
|
||||||
|
:[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
|
||||||
|
[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
|
||||||
|
/*We have to use neg, so we actually clobber the condition codes for once
|
||||||
|
(not to mention sub, and add).*/
|
||||||
|
:"cc"
|
||||||
|
);
|
||||||
|
*_dc=dc;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*SATD entry point for a single reference block.
  The source and the reference share one row stride, so this simply forwards
   to oc_int_frag_satd_sse2() with _ystride used for both.
  _dc receives whatever oc_int_frag_satd_sse2() stores through it.*/
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned satd;
  satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
  return satd;
}
|
||||||
|
|
||||||
|
unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
|
||||||
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
|
||||||
|
OC_ALIGN8(unsigned char ref[64]);
|
||||||
|
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
|
||||||
|
return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
|
||||||
|
const unsigned char *_src,int _ystride){
|
||||||
|
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||||
|
unsigned ret;
|
||||||
|
int dc;
|
||||||
|
__asm__ __volatile__(
|
||||||
|
OC_LOAD_8x8
|
||||||
|
OC_HADAMARD_8x8
|
||||||
|
OC_TRANSPOSE_8x8
|
||||||
|
/*We split out the stages here so we can save the DC coefficient in the
|
||||||
|
middle.*/
|
||||||
|
OC_HADAMARD_AB_8x8
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_A_8x8
|
||||||
|
"movd %%xmm1,%[dc]\n\t"
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_B_8x8
|
||||||
|
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||||
|
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||||
|
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||||
|
Now we finally have to promote things to dwords.*/
|
||||||
|
"pmaddwd %%xmm7,%%xmm0\n\t"
|
||||||
|
/*We assume that the DC coefficient is always positive (which is true,
|
||||||
|
because the input to the INTRA transform was not a difference).*/
|
||||||
|
"movzx %w[dc],%[dc]\n\t"
|
||||||
|
"movdqa %%xmm0,%%xmm1\n\t"
|
||||||
|
"punpckhqdq %%xmm0,%%xmm0\n\t"
|
||||||
|
"paddd %%xmm1,%%xmm0\n\t"
|
||||||
|
"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
|
||||||
|
"paddd %%xmm1,%%xmm0\n\t"
|
||||||
|
"movd %%xmm0,%[ret]\n\t"
|
||||||
|
"lea -64(%[ret],%[ret]),%[ret]\n\t"
|
||||||
|
"sub %[dc],%[ret]\n\t"
|
||||||
|
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
|
||||||
|
and %[dc] with some of the inputs, since for once we don't write to
|
||||||
|
them until after we're done using everything but %[buf].*/
|
||||||
|
:[ret]"=a"(ret),[dc]"=r"(dc),
|
||||||
|
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
|
||||||
|
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
|
||||||
|
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
|
||||||
|
/*We have to use sub, so we actually clobber the condition codes for once.*/
|
||||||
|
:"cc"
|
||||||
|
);
|
||||||
|
*_dc=dc;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
111
thirdparty/libtheora/x86/sse2fdct.c
vendored
111
thirdparty/libtheora/x86/sse2fdct.c
vendored
@ -13,12 +13,14 @@
|
|||||||
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
|
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include "x86enc.h"
|
#include "x86enc.h"
|
||||||
|
#include "x86zigzag.h"
|
||||||
|
#include "sse2trans.h"
|
||||||
|
|
||||||
#if defined(OC_X86_64_ASM)
|
#if defined(OC_X86_64_ASM)
|
||||||
|
|
||||||
# define OC_FDCT8x8 \
|
# define OC_FDCT_8x8 \
|
||||||
/*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
|
/*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
|
||||||
"#OC_FDCT8x8\n\t" \
|
"#OC_FDCT_8x8\n\t" \
|
||||||
/*Stage 1:*/ \
|
/*Stage 1:*/ \
|
||||||
"movdqa %%xmm0,%%xmm11\n\t" \
|
"movdqa %%xmm0,%%xmm11\n\t" \
|
||||||
"movdqa %%xmm1,%%xmm10\n\t" \
|
"movdqa %%xmm1,%%xmm10\n\t" \
|
||||||
@ -349,81 +351,6 @@
|
|||||||
"psubw %%xmm14,%%xmm10\n\t" \
|
"psubw %%xmm14,%%xmm10\n\t" \
|
||||||
"paddw %%xmm10,%%xmm7\n\t " \
|
"paddw %%xmm10,%%xmm7\n\t " \
|
||||||
|
|
||||||
# define OC_TRANSPOSE8x8 \
|
|
||||||
"#OC_TRANSPOSE8x8\n\t" \
|
|
||||||
"movdqa %%xmm4,%%xmm8\n\t" \
|
|
||||||
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
|
||||||
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
|
||||||
/*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
|
|
||||||
"punpckhwd %%xmm5,%%xmm8\n\t" \
|
|
||||||
/*xmm5 is free.*/ \
|
|
||||||
"movdqa %%xmm0,%%xmm5\n\t" \
|
|
||||||
/*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
|
||||||
"punpcklwd %%xmm1,%%xmm0\n\t" \
|
|
||||||
/*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
|
|
||||||
"punpckhwd %%xmm1,%%xmm5\n\t" \
|
|
||||||
/*xmm1 is free.*/ \
|
|
||||||
"movdqa %%xmm6,%%xmm1\n\t" \
|
|
||||||
/*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
|
||||||
"punpcklwd %%xmm7,%%xmm6\n\t" \
|
|
||||||
/*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
|
|
||||||
"punpckhwd %%xmm7,%%xmm1\n\t" \
|
|
||||||
/*xmm7 is free.*/ \
|
|
||||||
"movdqa %%xmm2,%%xmm7\n\t" \
|
|
||||||
/*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
|
||||||
"punpcklwd %%xmm3,%%xmm7\n\t" \
|
|
||||||
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
|
||||||
"punpckhwd %%xmm3,%%xmm2\n\t" \
|
|
||||||
/*xmm3 is free.*/ \
|
|
||||||
"movdqa %%xmm0,%%xmm3\n\t" \
|
|
||||||
/*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
|
||||||
"punpckldq %%xmm7,%%xmm0\n\t" \
|
|
||||||
/*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
|
||||||
"punpckhdq %%xmm7,%%xmm3\n\t" \
|
|
||||||
/*xmm7 is free.*/ \
|
|
||||||
"movdqa %%xmm5,%%xmm7\n\t" \
|
|
||||||
/*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
|
|
||||||
"punpckldq %%xmm2,%%xmm5\n\t" \
|
|
||||||
/*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
|
|
||||||
"punpckhdq %%xmm2,%%xmm7\n\t" \
|
|
||||||
/*xmm2 is free.*/ \
|
|
||||||
"movdqa %%xmm4,%%xmm2\n\t" \
|
|
||||||
/*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
|
||||||
"punpckldq %%xmm6,%%xmm2\n\t" \
|
|
||||||
/*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
|
||||||
"punpckhdq %%xmm6,%%xmm4\n\t" \
|
|
||||||
/*xmm6 is free.*/ \
|
|
||||||
"movdqa %%xmm8,%%xmm6\n\t" \
|
|
||||||
/*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
|
|
||||||
"punpckldq %%xmm1,%%xmm6\n\t" \
|
|
||||||
/*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
|
||||||
"punpckhdq %%xmm1,%%xmm8\n\t" \
|
|
||||||
/*xmm1 is free.*/ \
|
|
||||||
"movdqa %%xmm0,%%xmm1\n\t" \
|
|
||||||
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
|
||||||
"punpcklqdq %%xmm2,%%xmm0\n\t" \
|
|
||||||
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
|
||||||
"punpckhqdq %%xmm2,%%xmm1\n\t" \
|
|
||||||
/*xmm2 is free.*/ \
|
|
||||||
"movdqa %%xmm3,%%xmm2\n\t" \
|
|
||||||
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
|
||||||
"punpcklqdq %%xmm4,%%xmm2\n\t" \
|
|
||||||
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
|
||||||
"punpckhqdq %%xmm4,%%xmm3\n\t" \
|
|
||||||
/*xmm4 is free.*/ \
|
|
||||||
"movdqa %%xmm5,%%xmm4\n\t" \
|
|
||||||
/*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
|
|
||||||
"punpcklqdq %%xmm6,%%xmm4\n\t" \
|
|
||||||
/*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
|
|
||||||
"punpckhqdq %%xmm6,%%xmm5\n\t" \
|
|
||||||
/*xmm6 is free.*/ \
|
|
||||||
"movdqa %%xmm7,%%xmm6\n\t" \
|
|
||||||
/*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
|
|
||||||
"punpcklqdq %%xmm8,%%xmm6\n\t" \
|
|
||||||
/*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
|
|
||||||
"punpckhqdq %%xmm8,%%xmm7\n\t" \
|
|
||||||
/*xmm8 is free.*/ \
|
|
||||||
|
|
||||||
/*SSE2 implementation of the fDCT for x86-64 only.
|
/*SSE2 implementation of the fDCT for x86-64 only.
|
||||||
Because of the 8 extra XMM registers on x86-64, this version can operate
|
Because of the 8 extra XMM registers on x86-64, this version can operate
|
||||||
without any temporary stack access at all.*/
|
without any temporary stack access at all.*/
|
||||||
@ -482,12 +409,10 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
|||||||
/*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
|
/*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
|
||||||
"psubw %%xmm9,%%xmm1\n\t"
|
"psubw %%xmm9,%%xmm1\n\t"
|
||||||
/*Transform columns.*/
|
/*Transform columns.*/
|
||||||
OC_FDCT8x8
|
OC_FDCT_8x8
|
||||||
/*Transform rows.*/
|
/*Transform rows.*/
|
||||||
OC_TRANSPOSE8x8
|
OC_TRANSPOSE_8x8
|
||||||
OC_FDCT8x8
|
OC_FDCT_8x8
|
||||||
/*TODO: zig-zag ordering?*/
|
|
||||||
OC_TRANSPOSE8x8
|
|
||||||
/*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
|
/*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
|
||||||
"paddw %%xmm14,%%xmm14\n\t"
|
"paddw %%xmm14,%%xmm14\n\t"
|
||||||
"psubw %%xmm14,%%xmm0\n\t"
|
"psubw %%xmm14,%%xmm0\n\t"
|
||||||
@ -506,15 +431,19 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
|||||||
"psubw %%xmm14,%%xmm7\n\t"
|
"psubw %%xmm14,%%xmm7\n\t"
|
||||||
"psraw $2,%%xmm6\n\t"
|
"psraw $2,%%xmm6\n\t"
|
||||||
"psraw $2,%%xmm7\n\t"
|
"psraw $2,%%xmm7\n\t"
|
||||||
/*Store the result.*/
|
/*Transpose, zig-zag, and store the result.*/
|
||||||
"movdqa %%xmm0,0x00(%[y])\n\t"
|
/*We could probably do better using SSSE3's palignr, but re-using the MMXEXT
|
||||||
"movdqa %%xmm1,0x10(%[y])\n\t"
|
version will do for now.*/
|
||||||
"movdqa %%xmm2,0x20(%[y])\n\t"
|
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
|
||||||
"movdqa %%xmm3,0x30(%[y])\n\t"
|
"movdq2q %%xmm"#_row","_reg"\n\t" \
|
||||||
"movdqa %%xmm4,0x40(%[y])\n\t"
|
|
||||||
"movdqa %%xmm5,0x50(%[y])\n\t"
|
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
|
||||||
"movdqa %%xmm6,0x60(%[y])\n\t"
|
"punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
|
||||||
"movdqa %%xmm7,0x70(%[y])\n\t"
|
"movdq2q %%xmm"#_row","_reg"\n\t" \
|
||||||
|
|
||||||
|
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
|
||||||
|
#undef OC_ZZ_LOAD_ROW_LO
|
||||||
|
#undef OC_ZZ_LOAD_ROW_HI
|
||||||
:[a]"=&r"(a)
|
:[a]"=&r"(a)
|
||||||
:[y]"r"(_y),[x]"r"(_x)
|
:[y]"r"(_y),[x]"r"(_x)
|
||||||
:"memory"
|
:"memory"
|
||||||
|
456
thirdparty/libtheora/x86/sse2idct.c
vendored
Normal file
456
thirdparty/libtheora/x86/sse2idct.c
vendored
Normal file
@ -0,0 +1,456 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
function:
|
||||||
|
last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
|
/*SSE2 acceleration of Theora's iDCT.*/
|
||||||
|
#include "x86int.h"
|
||||||
|
#include "sse2trans.h"
|
||||||
|
#include "../dct.h"
|
||||||
|
|
||||||
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
|
/*A table of constants used by the SSE2 and MMX iDCT routines below.*/
|
||||||
|
const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
|
||||||
|
8, 8, 8, 8, 8, 8, 8, 8,
|
||||||
|
OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
|
||||||
|
OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
|
||||||
|
OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
|
||||||
|
OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
|
||||||
|
OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
|
||||||
|
OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
|
||||||
|
OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/*Performs the first three stages of the iDCT.
|
||||||
|
xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
|
||||||
|
(accessed in that order).
|
||||||
|
The remaining rows must be in _x at their corresponding locations.
|
||||||
|
On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
|
||||||
|
contain rows 4 through 7.*/
|
||||||
|
#define OC_IDCT_8x8_ABC(_x) \
|
||||||
|
"#OC_IDCT_8x8_ABC\n\t" \
|
||||||
|
/*Stage 1:*/ \
|
||||||
|
/*2-3 rotation by 6pi/16. \
|
||||||
|
xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
|
||||||
|
"movdqa %%xmm1,%%xmm0\n\t" \
|
||||||
|
"pmulhw %%xmm2,%%xmm1\n\t" \
|
||||||
|
"movdqa %%xmm4,%%xmm7\n\t" \
|
||||||
|
"pmulhw %%xmm6,%%xmm0\n\t" \
|
||||||
|
"pmulhw %%xmm2,%%xmm7\n\t" \
|
||||||
|
"pmulhw %%xmm6,%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm0\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm1,%%xmm2\n\t" \
|
||||||
|
"psubw %%xmm0,%%xmm7\n\t" \
|
||||||
|
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm2\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
|
||||||
|
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||||
|
/*5-6 rotation by 3pi/16. \
|
||||||
|
xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
|
||||||
|
"movdqa %%xmm4,%%xmm2\n\t" \
|
||||||
|
"movdqa %%xmm6,%%xmm1\n\t" \
|
||||||
|
"pmulhw %%xmm3,%%xmm4\n\t" \
|
||||||
|
"pmulhw %%xmm5,%%xmm1\n\t" \
|
||||||
|
"pmulhw %%xmm3,%%xmm6\n\t" \
|
||||||
|
"pmulhw %%xmm5,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm3\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm3\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm1\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm2\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
|
||||||
|
"psubw %%xmm4,%%xmm1\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
|
||||||
|
/*4-7 rotation by 7pi/16. \
|
||||||
|
xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
|
||||||
|
"movdqa %%xmm3,%%xmm0\n\t" \
|
||||||
|
"movdqa %%xmm4,%%xmm7\n\t" \
|
||||||
|
"pmulhw %%xmm5,%%xmm3\n\t" \
|
||||||
|
"pmulhw %%xmm5,%%xmm7\n\t" \
|
||||||
|
"pmulhw %%xmm6,%%xmm4\n\t" \
|
||||||
|
"pmulhw %%xmm6,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm4\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm4,%%xmm3\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm0\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
|
||||||
|
/*0-1 butterfly. \
|
||||||
|
xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
|
||||||
|
"paddw %%xmm7,%%xmm6\n\t" \
|
||||||
|
"movdqa %%xmm4,%%xmm5\n\t" \
|
||||||
|
"pmulhw %%xmm6,%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm6,%%xmm7\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm4\n\t" \
|
||||||
|
/*Stage 2:*/ \
|
||||||
|
/*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
|
||||||
|
7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
|
||||||
|
"movdqa %%xmm3,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm1,%%xmm3\n\t" \
|
||||||
|
"psubw %%xmm1,%%xmm6\n\t" \
|
||||||
|
"movdqa %%xmm5,%%xmm1\n\t" \
|
||||||
|
"pmulhw %%xmm7,%%xmm5\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm5\n\t" \
|
||||||
|
"movdqa %%xmm0,%%xmm7\n\t" \
|
||||||
|
"paddw %%xmm2,%%xmm0\n\t" \
|
||||||
|
"psubw %%xmm2,%%xmm7\n\t" \
|
||||||
|
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||||
|
"pmulhw %%xmm6,%%xmm1\n\t" \
|
||||||
|
"pmulhw %%xmm7,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm1\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm2\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
|
||||||
|
/*Stage 3: \
|
||||||
|
6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
|
||||||
|
0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
|
||||||
|
1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
|
||||||
|
"paddw %%xmm2,%%xmm1\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm7\n\t" \
|
||||||
|
"paddw %%xmm2,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm5\n\t" \
|
||||||
|
"psubw %%xmm1,%%xmm2\n\t" \
|
||||||
|
"psubw %%xmm7,%%xmm4\n\t" \
|
||||||
|
"psubw %%xmm6,%%xmm5\n\t" \
|
||||||
|
|
||||||
|
/*Performs the last stage of the iDCT.
|
||||||
|
On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
|
||||||
|
contain rows 4 through 7.
|
||||||
|
On output, xmm0 through xmm7 contain the corresponding rows.*/
|
||||||
|
#define OC_IDCT_8x8_D \
|
||||||
|
"#OC_IDCT_8x8_D\n\t" \
|
||||||
|
/*Stage 4: \
|
||||||
|
0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
|
||||||
|
1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
|
||||||
|
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
|
||||||
|
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
|
||||||
|
"psubw %%xmm0,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm1,%%xmm6\n\t" \
|
||||||
|
"psubw %%xmm2,%%xmm5\n\t" \
|
||||||
|
"psubw %%xmm3,%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm0,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm1,%%xmm1\n\t" \
|
||||||
|
"paddw %%xmm2,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm3\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm1\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm3\n\t" \
|
||||||
|
|
||||||
|
/*Performs the last stage of the iDCT.
|
||||||
|
On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
|
||||||
|
contain rows 4 through 7.
|
||||||
|
On output, xmm0 through xmm7 contain the corresponding rows, rounded (+8) and shifted right by 4, and each row has also been stored to y.*/
|
||||||
|
#define OC_IDCT_8x8_D_STORE \
|
||||||
|
"#OC_IDCT_8x8_D_STORE\n\t" \
|
||||||
|
/*Stage 4: \
|
||||||
|
0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
|
||||||
|
1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
|
||||||
|
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
|
||||||
|
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
|
||||||
|
"psubw %%xmm3,%%xmm4\n\t" \
|
||||||
|
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
|
||||||
|
"psubw %%xmm0,%%xmm7\n\t" \
|
||||||
|
"psubw %%xmm1,%%xmm6\n\t" \
|
||||||
|
"psubw %%xmm2,%%xmm5\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm7\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm5\n\t" \
|
||||||
|
"paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm0,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm1,%%xmm1\n\t" \
|
||||||
|
"paddw %%xmm2,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm3\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm1\n\t" \
|
||||||
|
"psraw $4,%%xmm0\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm2\n\t" \
|
||||||
|
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
|
||||||
|
"psraw $4,%%xmm1\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm3\n\t" \
|
||||||
|
"movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
|
||||||
|
"psraw $4,%%xmm2\n\t" \
|
||||||
|
"movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
|
||||||
|
"psraw $4,%%xmm3\n\t" \
|
||||||
|
"movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
|
||||||
|
"psraw $4,%%xmm4\n\t" \
|
||||||
|
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
|
||||||
|
"psraw $4,%%xmm5\n\t" \
|
||||||
|
"movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
|
||||||
|
"psraw $4,%%xmm6\n\t" \
|
||||||
|
"movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
|
||||||
|
"psraw $4,%%xmm7\n\t" \
|
||||||
|
"movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
|
||||||
|
|
||||||
|
static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||||
|
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||||
|
int i;
|
||||||
|
/*This routine accepts an 8x8 matrix pre-transposed.*/
|
||||||
|
__asm__ __volatile__(
|
||||||
|
/*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
|
||||||
|
"movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
|
||||||
|
"movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
|
||||||
|
"movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
|
||||||
|
"movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
|
||||||
|
OC_IDCT_8x8_ABC(x)
|
||||||
|
OC_IDCT_8x8_D
|
||||||
|
OC_TRANSPOSE_8x8
|
||||||
|
/*Spill rows 0, 1, 4, and 7 to y; the second OC_IDCT_8x8_ABC pass reloads them from memory.*/
|
||||||
|
"movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
|
||||||
|
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
|
||||||
|
"movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
|
||||||
|
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
|
||||||
|
OC_IDCT_8x8_ABC(y)
|
||||||
|
OC_IDCT_8x8_D_STORE
|
||||||
|
:[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
|
||||||
|
[y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
|
||||||
|
:[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
|
||||||
|
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
|
||||||
|
);
|
||||||
|
__asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
|
||||||
|
/*Clear input data for next block (decoder only).*/
|
||||||
|
for(i=0;i<2;i++){
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
|
||||||
|
"movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
|
||||||
|
"movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
|
||||||
|
"movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
|
||||||
|
:[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
|
||||||
|
need to work with four columns at a time.
|
||||||
|
Doing this in MMX is faster on processors with a 64-bit data path.*/
|
||||||
|
#define OC_IDCT_8x8_10_MMX \
|
||||||
|
"#OC_IDCT_8x8_10_MMX\n\t" \
|
||||||
|
/*Stage 1:*/ \
|
||||||
|
/*2-3 rotation by 6pi/16. \
|
||||||
|
mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
|
||||||
|
"movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
|
||||||
|
"movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
|
||||||
|
"pmulhw %%mm2,%%mm6\n\t" \
|
||||||
|
"pmulhw %%mm2,%%mm7\n\t" \
|
||||||
|
"movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
|
||||||
|
"paddw %%mm6,%%mm2\n\t" \
|
||||||
|
"movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||||
|
"movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
|
||||||
|
"movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||||
|
/*5-6 rotation by 3pi/16. \
|
||||||
|
mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
|
||||||
|
"pmulhw %%mm3,%%mm5\n\t" \
|
||||||
|
"pmulhw %%mm3,%%mm2\n\t" \
|
||||||
|
"movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
|
||||||
|
"paddw %%mm3,%%mm5\n\t" \
|
||||||
|
"paddw %%mm3,%%mm2\n\t" \
|
||||||
|
"movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
|
||||||
|
/*4-7 rotation by 7pi/16. \
|
||||||
|
mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
|
||||||
|
"pmulhw %%mm1,%%mm3\n\t" \
|
||||||
|
"pmulhw %%mm1,%%mm7\n\t" \
|
||||||
|
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
|
||||||
|
"movq %%mm3,%%mm6\n\t" \
|
||||||
|
"paddw %%mm1,%%mm7\n\t" \
|
||||||
|
/*0-1 butterfly. \
|
||||||
|
mm4=C4, mm0=X0, X4=0.*/ \
|
||||||
|
/*Stage 2:*/ \
|
||||||
|
/*4-5 butterfly: mm3=t[4], mm5=t[5] \
|
||||||
|
7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
|
||||||
|
"psubw %%mm5,%%mm3\n\t" \
|
||||||
|
"paddw %%mm5,%%mm6\n\t" \
|
||||||
|
"movq %%mm4,%%mm1\n\t" \
|
||||||
|
"pmulhw %%mm0,%%mm4\n\t" \
|
||||||
|
"paddw %%mm0,%%mm4\n\t" \
|
||||||
|
"movq %%mm7,%%mm0\n\t" \
|
||||||
|
"movq %%mm4,%%mm5\n\t" \
|
||||||
|
"paddw %%mm2,%%mm0\n\t" \
|
||||||
|
"psubw %%mm2,%%mm7\n\t" \
|
||||||
|
"movq %%mm1,%%mm2\n\t" \
|
||||||
|
"pmulhw %%mm6,%%mm1\n\t" \
|
||||||
|
"pmulhw %%mm7,%%mm2\n\t" \
|
||||||
|
"paddw %%mm6,%%mm1\n\t" \
|
||||||
|
"movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
|
||||||
|
"paddw %%mm7,%%mm2\n\t" \
|
||||||
|
"movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
|
||||||
|
/*Stage 3: \
|
||||||
|
6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
|
||||||
|
0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
|
||||||
|
1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
|
||||||
|
"paddw %%mm2,%%mm1\n\t" \
|
||||||
|
"paddw %%mm5,%%mm6\n\t" \
|
||||||
|
"paddw %%mm4,%%mm7\n\t" \
|
||||||
|
"paddw %%mm2,%%mm2\n\t" \
|
||||||
|
"paddw %%mm4,%%mm4\n\t" \
|
||||||
|
"paddw %%mm5,%%mm5\n\t" \
|
||||||
|
"psubw %%mm1,%%mm2\n\t" \
|
||||||
|
"psubw %%mm7,%%mm4\n\t" \
|
||||||
|
"psubw %%mm6,%%mm5\n\t" \
|
||||||
|
/*Stage 4: \
|
||||||
|
0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
|
||||||
|
1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
|
||||||
|
2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
|
||||||
|
3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
|
||||||
|
"psubw %%mm0,%%mm7\n\t" \
|
||||||
|
"psubw %%mm1,%%mm6\n\t" \
|
||||||
|
"psubw %%mm2,%%mm5\n\t" \
|
||||||
|
"psubw %%mm3,%%mm4\n\t" \
|
||||||
|
"paddw %%mm0,%%mm0\n\t" \
|
||||||
|
"paddw %%mm1,%%mm1\n\t" \
|
||||||
|
"paddw %%mm2,%%mm2\n\t" \
|
||||||
|
"paddw %%mm3,%%mm3\n\t" \
|
||||||
|
"paddw %%mm7,%%mm0\n\t" \
|
||||||
|
"paddw %%mm6,%%mm1\n\t" \
|
||||||
|
"paddw %%mm5,%%mm2\n\t" \
|
||||||
|
"paddw %%mm4,%%mm3\n\t" \
|
||||||
|
|
||||||
|
#define OC_IDCT_8x8_10_ABC \
|
||||||
|
"#OC_IDCT_8x8_10_ABC\n\t" \
|
||||||
|
/*Stage 1:*/ \
|
||||||
|
/*2-3 rotation by 6pi/16. \
|
||||||
|
xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
|
||||||
|
"pmulhw %%xmm2,%%xmm6\n\t" \
|
||||||
|
"pmulhw %%xmm2,%%xmm7\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm2\n\t" \
|
||||||
|
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
|
||||||
|
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||||
|
/*5-6 rotation by 3pi/16. \
|
||||||
|
xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
|
||||||
|
"pmulhw %%xmm3,%%xmm5\n\t" \
|
||||||
|
"pmulhw %%xmm3,%%xmm2\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm5\n\t" \
|
||||||
|
"paddw %%xmm3,%%xmm2\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
|
||||||
|
/*4-7 rotation by 7pi/16. \
|
||||||
|
xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
|
||||||
|
"pmulhw %%xmm1,%%xmm3\n\t" \
|
||||||
|
"pmulhw %%xmm1,%%xmm7\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
|
||||||
|
"movdqa %%xmm3,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm1,%%xmm7\n\t" \
|
||||||
|
/*0-1 butterfly. \
|
||||||
|
xmm4=C4, xmm0=X0, X4=0.*/ \
|
||||||
|
/*Stage 2:*/ \
|
||||||
|
/*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
|
||||||
|
7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
|
||||||
|
"psubw %%xmm5,%%xmm3\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm6\n\t" \
|
||||||
|
"movdqa %%xmm4,%%xmm1\n\t" \
|
||||||
|
"pmulhw %%xmm0,%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm0,%%xmm4\n\t" \
|
||||||
|
"movdqa %%xmm7,%%xmm0\n\t" \
|
||||||
|
"movdqa %%xmm4,%%xmm5\n\t" \
|
||||||
|
"paddw %%xmm2,%%xmm0\n\t" \
|
||||||
|
"psubw %%xmm2,%%xmm7\n\t" \
|
||||||
|
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||||
|
"pmulhw %%xmm6,%%xmm1\n\t" \
|
||||||
|
"pmulhw %%xmm7,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm6,%%xmm1\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm7,%%xmm2\n\t" \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
|
||||||
|
/*Stage 3: \
|
||||||
|
6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
|
||||||
|
0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
|
||||||
|
1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
|
||||||
|
"paddw %%xmm2,%%xmm1\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm6\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm7\n\t" \
|
||||||
|
"paddw %%xmm2,%%xmm2\n\t" \
|
||||||
|
"paddw %%xmm4,%%xmm4\n\t" \
|
||||||
|
"paddw %%xmm5,%%xmm5\n\t" \
|
||||||
|
"psubw %%xmm1,%%xmm2\n\t" \
|
||||||
|
"psubw %%xmm7,%%xmm4\n\t" \
|
||||||
|
"psubw %%xmm6,%%xmm5\n\t" \
|
||||||
|
|
||||||
|
static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||||
|
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||||
|
/*This routine accepts an 8x8 matrix pre-transposed.*/
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
|
||||||
|
"movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
|
||||||
|
"movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
|
||||||
|
"movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
|
||||||
|
OC_IDCT_8x8_10_MMX
|
||||||
|
OC_TRANSPOSE_8x4_MMX2SSE
|
||||||
|
OC_IDCT_8x8_10_ABC
|
||||||
|
OC_IDCT_8x8_D_STORE
|
||||||
|
:[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)),
|
||||||
|
[y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
|
||||||
|
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
|
||||||
|
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
|
||||||
|
);
|
||||||
|
/*Clear input data for next block (decoder only).*/
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"pxor %%mm0,%%mm0\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
|
||||||
|
"movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
|
||||||
|
:[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||||
|
The input is assumed to be scaled by a factor of 4 relative to the orthonormal
|
||||||
|
version of the transform.*/
|
||||||
|
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||||
|
/*_last_zzi is subtly different from an actual count of the number of
|
||||||
|
coefficients we decoded for this block.
|
||||||
|
It contains the value of zzi BEFORE the final token in the block was
|
||||||
|
decoded.
|
||||||
|
In most cases this is an EOB token (the continuation of an EOB run from a
|
||||||
|
previous block counts), and so this is the same as the coefficient count.
|
||||||
|
However, in the case that the last token was NOT an EOB token, but filled
|
||||||
|
the block up with exactly 64 coefficients, _last_zzi will be less than 64.
|
||||||
|
Provided the last token was not a pure zero run, the minimum value it can
|
||||||
|
be is 46, and so that doesn't affect any of the cases in this routine.
|
||||||
|
However, if the last token WAS a pure zero run of length 63, then _last_zzi
|
||||||
|
will be 1 while the number of coefficients decoded is 64.
|
||||||
|
Thus, we will trigger the following special case, where the real
|
||||||
|
coefficient count would not.
|
||||||
|
Note also that a zero run of length 64 will give _last_zzi a value of 0,
|
||||||
|
but we still process the DC coefficient, which might have a non-zero value
|
||||||
|
due to DC prediction.
|
||||||
|
Although convoluted, this is arguably the correct behavior: it allows us to
|
||||||
|
use a smaller transform when the block ends with a long zero run instead
|
||||||
|
of a normal EOB token.
|
||||||
|
It could be smarter... multiple separate zero runs at the end of a block
|
||||||
|
will fool it, but an encoder that generates these really deserves what it
|
||||||
|
gets.
|
||||||
|
Needless to say we inherited this approach from VP3.*/
|
||||||
|
/*Then perform the iDCT.*/
|
||||||
|
if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
|
||||||
|
else oc_idct8x8_slow_sse2(_y,_x);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
242
thirdparty/libtheora/x86/sse2trans.h
vendored
Normal file
242
thirdparty/libtheora/x86/sse2trans.h
vendored
Normal file
@ -0,0 +1,242 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
function:
|
||||||
|
last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
|
#if !defined(_x86_sse2trans_H)
|
||||||
|
# define _x86_sse2trans_H (1)
|
||||||
|
# include "x86int.h"
|
||||||
|
|
||||||
|
# if defined(OC_X86_64_ASM)
|
||||||
|
/*On x86-64 we can transpose in-place without spilling registers.
|
||||||
|
By clever choices of the order to apply the butterflies and the order of
|
||||||
|
their outputs, we can take the rows in order and output the columns in order
|
||||||
|
without any extra operations and using just one temporary register.*/
|
||||||
|
# define OC_TRANSPOSE_8x8 \
|
||||||
|
"#OC_TRANSPOSE_8x8\n\t" \
|
||||||
|
"movdqa %%xmm4,%%xmm8\n\t" \
|
||||||
|
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
||||||
|
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
||||||
|
/*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
|
||||||
|
"punpckhwd %%xmm5,%%xmm8\n\t" \
|
||||||
|
/*xmm5 is free.*/ \
|
||||||
|
"movdqa %%xmm0,%%xmm5\n\t" \
|
||||||
|
/*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
||||||
|
"punpcklwd %%xmm1,%%xmm0\n\t" \
|
||||||
|
/*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
|
||||||
|
"punpckhwd %%xmm1,%%xmm5\n\t" \
|
||||||
|
/*xmm1 is free.*/ \
|
||||||
|
"movdqa %%xmm6,%%xmm1\n\t" \
|
||||||
|
/*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
||||||
|
"punpcklwd %%xmm7,%%xmm6\n\t" \
|
||||||
|
/*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
|
||||||
|
"punpckhwd %%xmm7,%%xmm1\n\t" \
|
||||||
|
/*xmm7 is free.*/ \
|
||||||
|
"movdqa %%xmm2,%%xmm7\n\t" \
|
||||||
|
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||||
|
"punpckhwd %%xmm3,%%xmm2\n\t" \
|
||||||
|
/*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
||||||
|
"punpcklwd %%xmm3,%%xmm7\n\t" \
|
||||||
|
/*xmm3 is free.*/ \
|
||||||
|
"movdqa %%xmm0,%%xmm3\n\t" \
|
||||||
|
/*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
||||||
|
"punpckldq %%xmm7,%%xmm0\n\t" \
|
||||||
|
/*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
||||||
|
"punpckhdq %%xmm7,%%xmm3\n\t" \
|
||||||
|
/*xmm7 is free.*/ \
|
||||||
|
"movdqa %%xmm5,%%xmm7\n\t" \
|
||||||
|
/*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
|
||||||
|
"punpckldq %%xmm2,%%xmm5\n\t" \
|
||||||
|
/*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
|
||||||
|
"punpckhdq %%xmm2,%%xmm7\n\t" \
|
||||||
|
/*xmm2 is free.*/ \
|
||||||
|
"movdqa %%xmm4,%%xmm2\n\t" \
|
||||||
|
/*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
||||||
|
"punpckhdq %%xmm6,%%xmm4\n\t" \
|
||||||
|
/*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
||||||
|
"punpckldq %%xmm6,%%xmm2\n\t" \
|
||||||
|
/*xmm6 is free.*/ \
|
||||||
|
"movdqa %%xmm8,%%xmm6\n\t" \
|
||||||
|
/*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
|
||||||
|
"punpckldq %%xmm1,%%xmm6\n\t" \
|
||||||
|
/*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||||
|
"punpckhdq %%xmm1,%%xmm8\n\t" \
|
||||||
|
/*xmm1 is free.*/ \
|
||||||
|
"movdqa %%xmm0,%%xmm1\n\t" \
|
||||||
|
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||||
|
"punpcklqdq %%xmm2,%%xmm0\n\t" \
|
||||||
|
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
||||||
|
"punpckhqdq %%xmm2,%%xmm1\n\t" \
|
||||||
|
/*xmm2 is free.*/ \
|
||||||
|
"movdqa %%xmm3,%%xmm2\n\t" \
|
||||||
|
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
||||||
|
"punpckhqdq %%xmm4,%%xmm3\n\t" \
|
||||||
|
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
||||||
|
"punpcklqdq %%xmm4,%%xmm2\n\t" \
|
||||||
|
/*xmm4 is free.*/ \
|
||||||
|
"movdqa %%xmm5,%%xmm4\n\t" \
|
||||||
|
/*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
|
||||||
|
"punpckhqdq %%xmm6,%%xmm5\n\t" \
|
||||||
|
/*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
|
||||||
|
"punpcklqdq %%xmm6,%%xmm4\n\t" \
|
||||||
|
/*xmm6 is free.*/ \
|
||||||
|
"movdqa %%xmm7,%%xmm6\n\t" \
|
||||||
|
/*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
|
||||||
|
"punpckhqdq %%xmm8,%%xmm7\n\t" \
|
||||||
|
/*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
|
||||||
|
"punpcklqdq %%xmm8,%%xmm6\n\t" \
|
||||||
|
/*xmm8 is free.*/ \
|
||||||
|
|
||||||
|
# else
|
||||||
|
/*Otherwise, we need to spill some values to %[buf] temporarily.
|
||||||
|
Again, the butterflies are carefully arranged to get the columns to come out
|
||||||
|
in order, minimizing register spills and maximizing the delay between a load
|
||||||
|
and when the value loaded is actually used.*/
|
||||||
|
# define OC_TRANSPOSE_8x8 \
|
||||||
|
"#OC_TRANSPOSE_8x8\n\t" \
|
||||||
|
/*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
|
||||||
|
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||||
|
/*xmm0 is free.*/ \
|
||||||
|
"movdqa %%xmm2,%%xmm0\n\t" \
|
||||||
|
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||||
|
"punpckhwd %%xmm3,%%xmm2\n\t" \
|
||||||
|
/*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
||||||
|
"punpcklwd %%xmm3,%%xmm0\n\t" \
|
||||||
|
/*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
|
||||||
|
/*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||||
|
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||||
|
/*xmm2 is free.*/ \
|
||||||
|
"movdqa %%xmm6,%%xmm2\n\t" \
|
||||||
|
/*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
||||||
|
"punpcklwd %%xmm7,%%xmm6\n\t" \
|
||||||
|
/*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
|
||||||
|
"punpckhwd %%xmm7,%%xmm2\n\t" \
|
||||||
|
/*xmm7 is free.*/ \
|
||||||
|
"movdqa %%xmm4,%%xmm7\n\t" \
|
||||||
|
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
||||||
|
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
||||||
|
/*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
|
||||||
|
"punpckhwd %%xmm5,%%xmm7\n\t" \
|
||||||
|
/*xmm5 is free.*/ \
|
||||||
|
"movdqa %%xmm3,%%xmm5\n\t" \
|
||||||
|
/*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
||||||
|
"punpcklwd %%xmm1,%%xmm3\n\t" \
|
||||||
|
/*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
|
||||||
|
"punpckhwd %%xmm1,%%xmm5\n\t" \
|
||||||
|
/*xmm1 is free.*/ \
|
||||||
|
"movdqa %%xmm7,%%xmm1\n\t" \
|
||||||
|
/*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
|
||||||
|
"punpckldq %%xmm2,%%xmm7\n\t" \
|
||||||
|
/*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||||
|
"punpckhdq %%xmm2,%%xmm1\n\t" \
|
||||||
|
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
|
||||||
|
/*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||||
|
"movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||||
|
/*xmm1 is free.*/ \
|
||||||
|
"movdqa %%xmm3,%%xmm1\n\t" \
|
||||||
|
/*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
||||||
|
"punpckhdq %%xmm0,%%xmm3\n\t" \
|
||||||
|
/*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
||||||
|
"punpckldq %%xmm0,%%xmm1\n\t" \
|
||||||
|
/*xmm0 is free.*/ \
|
||||||
|
"movdqa %%xmm4,%%xmm0\n\t" \
|
||||||
|
/*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
||||||
|
"punpckhdq %%xmm6,%%xmm4\n\t" \
|
||||||
|
/*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
||||||
|
"punpckldq %%xmm6,%%xmm0\n\t" \
|
||||||
|
/*xmm6 is free.*/ \
|
||||||
|
"movdqa %%xmm5,%%xmm6\n\t" \
|
||||||
|
/*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
|
||||||
|
"punpckldq %%xmm2,%%xmm5\n\t" \
|
||||||
|
/*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
|
||||||
|
"punpckhdq %%xmm2,%%xmm6\n\t" \
|
||||||
|
/*xmm2 is free.*/ \
|
||||||
|
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||||
|
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
||||||
|
"punpckhqdq %%xmm0,%%xmm1\n\t" \
|
||||||
|
/*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||||
|
"punpcklqdq %%xmm0,%%xmm2\n\t" \
|
||||||
|
/*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
|
||||||
|
/*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||||
|
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||||
|
/*xmm2 is free.*/ \
|
||||||
|
"movdqa %%xmm3,%%xmm2\n\t" \
|
||||||
|
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
||||||
|
"punpckhqdq %%xmm4,%%xmm3\n\t" \
|
||||||
|
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
||||||
|
"punpcklqdq %%xmm4,%%xmm2\n\t" \
|
||||||
|
/*xmm4 is free.*/ \
|
||||||
|
"movdqa %%xmm5,%%xmm4\n\t" \
|
||||||
|
/*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
|
||||||
|
"punpckhqdq %%xmm7,%%xmm5\n\t" \
|
||||||
|
/*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
|
||||||
|
"punpcklqdq %%xmm7,%%xmm4\n\t" \
|
||||||
|
/*xmm7 is free.*/ \
|
||||||
|
"movdqa %%xmm6,%%xmm7\n\t" \
|
||||||
|
/*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
|
||||||
|
"punpcklqdq %%xmm0,%%xmm6\n\t" \
|
||||||
|
/*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
|
||||||
|
"punpckhqdq %%xmm0,%%xmm7\n\t" \
|
||||||
|
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||||
|
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
|
||||||
|
|
||||||
|
# endif
|
||||||
|
|
||||||
|
/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
|
||||||
|
four SSE registers.
|
||||||
|
No need to be clever here; we have plenty of room.*/
|
||||||
|
# define OC_TRANSPOSE_8x4_MMX2SSE \
|
||||||
|
"#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
|
||||||
|
"movq2dq %%mm0,%%xmm0\n\t" \
|
||||||
|
"movq2dq %%mm1,%%xmm1\n\t" \
|
||||||
|
/*xmmA = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
||||||
|
"punpcklwd %%xmm1,%%xmm0\n\t" \
|
||||||
|
"movq2dq %%mm2,%%xmm3\n\t" \
|
||||||
|
"movq2dq %%mm3,%%xmm2\n\t" \
|
||||||
|
/*xmmC = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
||||||
|
"punpcklwd %%xmm2,%%xmm3\n\t" \
|
||||||
|
"movq2dq %%mm4,%%xmm4\n\t" \
|
||||||
|
"movq2dq %%mm5,%%xmm5\n\t" \
|
||||||
|
/*xmmE = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
||||||
|
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
||||||
|
"movq2dq %%mm6,%%xmm7\n\t" \
|
||||||
|
"movq2dq %%mm7,%%xmm6\n\t" \
|
||||||
|
/*xmmG = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
||||||
|
"punpcklwd %%xmm6,%%xmm7\n\t" \
|
||||||
|
"movdqa %%xmm0,%%xmm2\n\t" \
|
||||||
|
/*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
||||||
|
"punpckldq %%xmm3,%%xmm0\n\t" \
|
||||||
|
/*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
||||||
|
"punpckhdq %%xmm3,%%xmm2\n\t" \
|
||||||
|
"movdqa %%xmm4,%%xmm5\n\t" \
|
||||||
|
/*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
||||||
|
"punpckldq %%xmm7,%%xmm4\n\t" \
|
||||||
|
/*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
||||||
|
"punpckhdq %%xmm7,%%xmm5\n\t" \
|
||||||
|
"movdqa %%xmm0,%%xmm1\n\t" \
|
||||||
|
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||||
|
"punpcklqdq %%xmm4,%%xmm0\n\t" \
|
||||||
|
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
||||||
|
"punpckhqdq %%xmm4,%%xmm1\n\t" \
|
||||||
|
"movdqa %%xmm2,%%xmm3\n\t" \
|
||||||
|
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
||||||
|
"punpcklqdq %%xmm5,%%xmm2\n\t" \
|
||||||
|
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
||||||
|
"punpckhqdq %%xmm5,%%xmm3\n\t" \
|
||||||
|
|
||||||
|
#endif
|
182
thirdparty/libtheora/x86/x86cpu.c
vendored
Normal file
182
thirdparty/libtheora/x86/x86cpu.c
vendored
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
CPU capability detection for x86 processors.
|
||||||
|
Originally written by Rudolf Marek.
|
||||||
|
|
||||||
|
function:
|
||||||
|
last mod: $Id$
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
|
#include "x86cpu.h"
|
||||||
|
|
||||||
|
#if !defined(OC_X86_ASM)
|
||||||
|
/*Fallback used when OC_X86_ASM is not defined: no detection is performed and
   an empty capability mask is reported.
  Return: Always 0.*/
ogg_uint32_t oc_cpu_flags_get(void){
  ogg_uint32_t no_flags;
  no_flags=0;
  return no_flags;
}
|
||||||
|
#else
|
||||||
|
# if defined(__amd64__)||defined(__x86_64__)
|
||||||
|
/*Executes the cpuid instruction for function _op and stores the resulting
   eax, ebx, ecx, and edx register values in the four lvalue arguments.
  Only eax is loaded before the instruction executes; ecx is left
   uninitialized on entry.
  On x86-64, gcc seems to be able to figure out how to save %rbx for us when
   compiling with -fPIC, so ebx is named directly as an output.*/
|
||||||
|
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
||||||
|
__asm__ __volatile__( \
|
||||||
|
"cpuid\n\t" \
|
||||||
|
:[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
||||||
|
:"a"(_op) \
|
||||||
|
:"cc" \
|
||||||
|
)
|
||||||
|
# else
|
||||||
|
/*Executes the cpuid instruction for function _op and stores the resulting
   eax, ebx, ecx, and edx register values in the four lvalue arguments.
  On x86-32 we do not rely on gcc to save ebx for us (presumably because it
   may be reserved as the PIC register -- TODO confirm).
  Instead ebx is exchanged with a compiler-chosen register before and after
   the instruction, which restores the original ebx and leaves the value cpuid
   produced in that register.
  Only eax is loaded before the instruction executes; ecx is left
   uninitialized on entry.*/
|
||||||
|
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
||||||
|
__asm__ __volatile__( \
|
||||||
|
"xchgl %%ebx,%[ebx]\n\t" \
|
||||||
|
"cpuid\n\t" \
|
||||||
|
"xchgl %%ebx,%[ebx]\n\t" \
|
||||||
|
:[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
||||||
|
:"a"(_op) \
|
||||||
|
:"cc" \
|
||||||
|
)
|
||||||
|
# endif
|
||||||
|
|
||||||
|
/*Translates the standard (Intel-style) cpuid feature words into
   OC_CPU_X86_* flags.
  _edx: The edx feature word (as returned by cpuid function 1).
  _ecx: The ecx feature word (as returned by cpuid function 1).
  Return: The detected OC_CPU_X86_* flags, or 0 if MMX is not reported.*/
static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t flags;
  /*If there isn't even MMX, give up.*/
  if(!(_edx&0x00800000))return 0;
  flags=OC_CPU_X86_MMX;
  /*A processor that reports SSE also gets the MMX extensions flag.*/
  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
  /*SSSE3 is reported in bit 9 of ecx.
    Bit 8 (0x00000100) is Thermal Monitor 2, which says nothing about SIMD
     support.*/
  if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3;
  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
  return flags;
}
|
||||||
|
|
||||||
|
/*Translates the extended (AMD-style) cpuid feature words into OC_CPU_X86_*
   flags.
  _edx: The edx feature word (as returned by cpuid function 0x80000001).
  _ecx: The ecx feature word (as returned by cpuid function 0x80000001).
  Return: The detected OC_CPU_X86_* flags, or 0 if MMX is not reported.*/
static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t caps;
  /*Nothing else is of interest without MMX.*/
  if((_edx&0x00800000)==0)return 0;
  caps=OC_CPU_X86_MMX;
  /*Extensions reported in edx.*/
  caps|=(_edx&0x00400000)?OC_CPU_X86_MMXEXT:0;
  caps|=(_edx&0x80000000)?OC_CPU_X86_3DNOW:0;
  caps|=(_edx&0x40000000)?OC_CPU_X86_3DNOWEXT:0;
  /*Extensions reported in ecx.*/
  caps|=(_ecx&0x00000040)?OC_CPU_X86_SSE4A:0;
  caps|=(_ecx&0x00000800)?OC_CPU_X86_SSE5:0;
  return caps;
}
|
||||||
|
|
||||||
|
ogg_uint32_t oc_cpu_flags_get(void){
|
||||||
|
ogg_uint32_t flags;
|
||||||
|
ogg_uint32_t eax;
|
||||||
|
ogg_uint32_t ebx;
|
||||||
|
ogg_uint32_t ecx;
|
||||||
|
ogg_uint32_t edx;
|
||||||
|
# if !defined(__amd64__)&&!defined(__x86_64__)
|
||||||
|
/*Not all x86-32 chips support cpuid, so we have to check.*/
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"pushfl\n\t"
|
||||||
|
"pushfl\n\t"
|
||||||
|
"popl %[a]\n\t"
|
||||||
|
"movl %[a],%[b]\n\t"
|
||||||
|
"xorl $0x200000,%[a]\n\t"
|
||||||
|
"pushl %[a]\n\t"
|
||||||
|
"popfl\n\t"
|
||||||
|
"pushfl\n\t"
|
||||||
|
"popl %[a]\n\t"
|
||||||
|
"popfl\n\t"
|
||||||
|
:[a]"=r"(eax),[b]"=r"(ebx)
|
||||||
|
:
|
||||||
|
:"cc"
|
||||||
|
);
|
||||||
|
/*No cpuid.*/
|
||||||
|
if(eax==ebx)return 0;
|
||||||
|
# endif
|
||||||
|
cpuid(0,eax,ebx,ecx,edx);
|
||||||
|
/* l e t n I e n i u n e G*/
|
||||||
|
if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
|
||||||
|
/* 6 8 x M T e n i u n e G*/
|
||||||
|
ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
|
||||||
|
int family;
|
||||||
|
int model;
|
||||||
|
/*Intel, Transmeta (tested with Crusoe TM5800):*/
|
||||||
|
cpuid(1,eax,ebx,ecx,edx);
|
||||||
|
flags=oc_parse_intel_flags(edx,ecx);
|
||||||
|
family=(eax>>8)&0xF;
|
||||||
|
model=(eax>>4)&0xF;
|
||||||
|
/*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
|
||||||
|
unit, so don't use it.*/
|
||||||
|
if(family==6&&(model==9||model==13||model==14)){
|
||||||
|
flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* D M A c i t n e h t u A*/
|
||||||
|
else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
|
||||||
|
/* C S N y b e d o e G*/
|
||||||
|
ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
|
||||||
|
/*AMD, Geode:*/
|
||||||
|
cpuid(0x80000000,eax,ebx,ecx,edx);
|
||||||
|
if(eax<0x80000001)flags=0;
|
||||||
|
else{
|
||||||
|
cpuid(0x80000001,eax,ebx,ecx,edx);
|
||||||
|
flags=oc_parse_amd_flags(edx,ecx);
|
||||||
|
}
|
||||||
|
/*Also check for SSE.*/
|
||||||
|
cpuid(1,eax,ebx,ecx,edx);
|
||||||
|
flags|=oc_parse_intel_flags(edx,ecx);
|
||||||
|
}
|
||||||
|
/*Technically some VIA chips can be configured in the BIOS to return any
|
||||||
|
string here the user wants.
|
||||||
|
There is a special detection method that can be used to identify such
|
||||||
|
processors, but in my opinion, if the user really wants to change it, they
|
||||||
|
deserve what they get.*/
|
||||||
|
/* s l u a H r u a t n e C*/
|
||||||
|
else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
|
||||||
|
/*VIA:*/
|
||||||
|
/*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
|
||||||
|
chips (thanks to the engineers from Centaur Technology who provided it).
|
||||||
|
These chips support Intel-like cpuid info.
|
||||||
|
The C3-2 (Nehemiah) cores appear to, as well.*/
|
||||||
|
cpuid(1,eax,ebx,ecx,edx);
|
||||||
|
flags=oc_parse_intel_flags(edx,ecx);
|
||||||
|
if(eax>=0x80000001){
|
||||||
|
/*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
|
||||||
|
We need to check this even if the Intel test succeeds to pick up 3DNow!
|
||||||
|
support on these processors.
|
||||||
|
Unlike actual AMD processors, we cannot _rely_ on this info, since
|
||||||
|
some cores (e.g., the 693 stepping of the Nehemiah) claim to support
|
||||||
|
this function, yet return edx=0, despite the Intel test indicating
|
||||||
|
MMX support.
|
||||||
|
Therefore the features detected here are strictly added to those
|
||||||
|
detected by the Intel test.*/
|
||||||
|
/*TODO: How about earlier chips?*/
|
||||||
|
cpuid(0x80000001,eax,ebx,ecx,edx);
|
||||||
|
/*Note: As of the C7, this function returns Intel-style extended feature
|
||||||
|
flags, not AMD-style.
|
||||||
|
Currently, this only defines bits 11, 20, and 29 (0x20100800), which
|
||||||
|
do not conflict with any of the AMD flags we inspect.
|
||||||
|
For the remaining bits, Intel tells us, "Do not count on their value",
|
||||||
|
but VIA assures us that they will all be zero (at least on the C7 and
|
||||||
|
Isaiah chips).
|
||||||
|
In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
|
||||||
|
(0xC0C00000) for something else, we will have to add code to detect
|
||||||
|
the model to decide when it is appropriate to inspect them.*/
|
||||||
|
flags|=oc_parse_amd_flags(edx,ecx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
/*Implement me.*/
|
||||||
|
flags=0;
|
||||||
|
}
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
#endif
|
@ -10,13 +10,13 @@
|
|||||||
* *
|
* *
|
||||||
********************************************************************
|
********************************************************************
|
||||||
function:
|
function:
|
||||||
last mod: $Id: cpu.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
#if !defined(_x86_cpu_H)
|
#if !defined(_x86_x86cpu_H)
|
||||||
# define _x86_cpu_H (1)
|
# define _x86_x86cpu_H (1)
|
||||||
#include "internal.h"
|
#include "../internal.h"
|
||||||
|
|
||||||
#define OC_CPU_X86_MMX (1<<0)
|
#define OC_CPU_X86_MMX (1<<0)
|
||||||
#define OC_CPU_X86_3DNOW (1<<1)
|
#define OC_CPU_X86_3DNOW (1<<1)
|
||||||
@ -31,4 +31,6 @@
|
|||||||
#define OC_CPU_X86_SSE4A (1<<10)
|
#define OC_CPU_X86_SSE4A (1<<10)
|
||||||
#define OC_CPU_X86_SSE5 (1<<11)
|
#define OC_CPU_X86_SSE5 (1<<11)
|
||||||
|
|
||||||
|
ogg_uint32_t oc_cpu_flags_get(void);
|
||||||
|
|
||||||
#endif
|
#endif
|
34
thirdparty/libtheora/x86/x86enc.c
vendored
34
thirdparty/libtheora/x86/x86enc.c
vendored
@ -18,32 +18,46 @@
|
|||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
#include "../cpu.c"
|
void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
|
||||||
|
|
||||||
void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
|
|
||||||
ogg_uint32_t cpu_flags;
|
ogg_uint32_t cpu_flags;
|
||||||
cpu_flags=oc_cpu_flags_get();
|
cpu_flags=_enc->state.cpu_flags;
|
||||||
oc_enc_vtable_init_c(_enc);
|
oc_enc_accel_init_c(_enc);
|
||||||
|
# if defined(OC_ENC_USE_VTABLE)
|
||||||
if(cpu_flags&OC_CPU_X86_MMX){
|
if(cpu_flags&OC_CPU_X86_MMX){
|
||||||
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
|
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
|
||||||
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
|
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
|
||||||
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
||||||
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
||||||
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
|
|
||||||
}
|
}
|
||||||
if(cpu_flags&OC_CPU_X86_MMXEXT){
|
if(cpu_flags&OC_CPU_X86_MMXEXT){
|
||||||
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
|
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
|
||||||
_enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
|
_enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
|
||||||
_enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
|
_enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
|
||||||
_enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
|
_enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
|
||||||
_enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
|
_enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
|
||||||
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
|
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
|
||||||
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
|
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
|
||||||
|
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
|
||||||
}
|
}
|
||||||
if(cpu_flags&OC_CPU_X86_SSE2){
|
if(cpu_flags&OC_CPU_X86_SSE2){
|
||||||
# if defined(OC_X86_64_ASM)
|
# if defined(OC_X86_64_ASM)
|
||||||
/*_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;*/
|
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
|
||||||
|
# endif
|
||||||
|
_enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_sse2;
|
||||||
|
_enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_sse2;
|
||||||
|
_enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
|
||||||
|
_enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
|
||||||
|
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
|
||||||
|
_enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
|
||||||
|
_enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86;
|
||||||
|
_enc->opt_vtable.quantize=oc_enc_quantize_sse2;
|
||||||
|
# else
|
||||||
|
(void) cpu_flags;
|
||||||
# endif
|
# endif
|
||||||
|
_enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
|
||||||
|
_enc->opt_data.enquant_table_alignment=16;
|
||||||
|
# if defined(OC_ENC_USE_VTABLE)
|
||||||
}
|
}
|
||||||
|
# endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
93
thirdparty/libtheora/x86/x86enc.h
vendored
93
thirdparty/libtheora/x86/x86enc.h
vendored
@ -17,11 +17,62 @@
|
|||||||
|
|
||||||
#if !defined(_x86_x86enc_H)
|
#if !defined(_x86_x86enc_H)
|
||||||
# define _x86_x86enc_H (1)
|
# define _x86_x86enc_H (1)
|
||||||
# include "../encint.h"
|
|
||||||
# include "x86int.h"
|
# include "x86int.h"
|
||||||
|
|
||||||
void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
|
# if defined(OC_X86_ASM)
|
||||||
|
# define oc_enc_accel_init oc_enc_accel_init_x86
|
||||||
|
# if defined(OC_X86_64_ASM)
|
||||||
|
/*x86-64 guarantees SIMD support up through at least SSE2.
|
||||||
|
If the best routine we have available only needs SSE2 (which at the moment
|
||||||
|
covers all of them), then we can avoid runtime detection and the indirect
|
||||||
|
call.*/
|
||||||
|
# define oc_enc_frag_sub(_enc,_diff,_x,_y,_stride) \
|
||||||
|
oc_enc_frag_sub_mmx(_diff,_x,_y,_stride)
|
||||||
|
# define oc_enc_frag_sub_128(_enc,_diff,_x,_stride) \
|
||||||
|
oc_enc_frag_sub_128_mmx(_diff,_x,_stride)
|
||||||
|
# define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
|
||||||
|
oc_enc_frag_sad_mmxext(_src,_ref,_ystride)
|
||||||
|
# define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
|
||||||
|
oc_enc_frag_sad_thresh_mmxext(_src,_ref,_ystride,_thresh)
|
||||||
|
# define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
|
||||||
|
oc_enc_frag_sad2_thresh_mmxext(_src,_ref1,_ref2,_ystride,_thresh)
|
||||||
|
# define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
|
||||||
|
oc_enc_frag_satd_sse2(_dc,_src,_ref,_ystride)
|
||||||
|
# define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
|
||||||
|
oc_enc_frag_satd2_sse2(_dc,_src,_ref1,_ref2,_ystride)
|
||||||
|
# define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
|
||||||
|
oc_enc_frag_intra_satd_sse2(_dc,_src,_ystride)
|
||||||
|
# define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
|
||||||
|
oc_enc_frag_ssd_sse2(_src,_ref,_ystride)
|
||||||
|
# define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
|
||||||
|
oc_enc_frag_border_ssd_sse2(_src,_ref,_ystride,_mask)
|
||||||
|
# define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
|
||||||
|
oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride)
|
||||||
|
# define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
|
||||||
|
oc_enc_enquant_table_init_x86(_enquant,_dequant)
|
||||||
|
# define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
|
||||||
|
oc_enc_enquant_table_fixup_x86(_enquant,_nqis)
|
||||||
|
# define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
|
||||||
|
oc_enc_quantize_sse2(_qdct,_dct,_dequant,_enquant)
|
||||||
|
# define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
|
||||||
|
oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
|
||||||
|
# define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
|
||||||
|
oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
|
||||||
|
# define oc_enc_fdct8x8(_enc,_y,_x) \
|
||||||
|
oc_enc_fdct8x8_x86_64sse2(_y,_x)
|
||||||
|
# else
|
||||||
|
# define OC_ENC_USE_VTABLE (1)
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
|
||||||
|
# include "../encint.h"
|
||||||
|
|
||||||
|
void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
|
||||||
|
|
||||||
|
void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
|
||||||
|
const unsigned char *_x,const unsigned char *_y,int _stride);
|
||||||
|
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
|
||||||
|
const unsigned char *_x,int _stride);
|
||||||
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride);
|
const unsigned char *_ref,int _ystride);
|
||||||
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
|
||||||
@ -29,19 +80,35 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
|
|||||||
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||||
unsigned _thresh);
|
unsigned _thresh);
|
||||||
unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
const unsigned char *_ref,int _ystride);
|
||||||
unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
const unsigned char *_ref,int _ystride);
|
||||||
unsigned _thresh);
|
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
|
||||||
unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride);
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||||
void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
|
unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_x,const unsigned char *_y,int _stride);
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||||
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
|
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
|
||||||
const unsigned char *_x,int _stride);
|
const unsigned char *_src,int _ystride);
|
||||||
|
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
|
||||||
|
const unsigned char *_src,int _ystride);
|
||||||
|
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride);
|
||||||
|
unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
|
||||||
|
const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
|
||||||
|
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
||||||
|
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
|
||||||
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
|
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
|
||||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||||
void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
void oc_enc_enquant_table_init_x86(void *_enquant,
|
||||||
|
const ogg_uint16_t _dequant[64]);
|
||||||
|
void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
|
||||||
|
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||||
|
const ogg_uint16_t _dequant[64],const void *_enquant);
|
||||||
|
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||||
|
|
||||||
|
# if defined(OC_X86_64_ASM)
|
||||||
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||||
|
# endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
149
thirdparty/libtheora/x86/x86enquant.c
vendored
Normal file
149
thirdparty/libtheora/x86/x86enquant.c
vendored
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
function:
|
||||||
|
last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
|
#include "x86enc.h"
|
||||||
|
|
||||||
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*The default enquant table is not quite suitable for SIMD purposes.
|
||||||
|
First, the m and l parameters need to be separated so that an entire row full
|
||||||
|
of m's or l's can be loaded at a time.
|
||||||
|
Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to
|
||||||
|
emulate one with a multiply.
|
||||||
|
Therefore we translate the shift count into a scale factor.*/
|
||||||
|
void oc_enc_enquant_table_init_x86(void *_enquant,
|
||||||
|
const ogg_uint16_t _dequant[64]){
|
||||||
|
ogg_int16_t *m;
|
||||||
|
ogg_int16_t *l;
|
||||||
|
int zzi;
|
||||||
|
m=(ogg_int16_t *)_enquant;
|
||||||
|
l=m+64;
|
||||||
|
for(zzi=0;zzi<64;zzi++){
|
||||||
|
oc_iquant q;
|
||||||
|
oc_iquant_init(&q,_dequant[zzi]);
|
||||||
|
m[zzi]=q.m;
|
||||||
|
/*q.l must be at least 2 for this to work; fortunately, once all the scale
|
||||||
|
factors are baked in, the minimum quantizer is much larger than that.*/
|
||||||
|
l[zzi]=1<<16-q.l;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
|
||||||
|
int pli;
|
||||||
|
int qii;
|
||||||
|
int qti;
|
||||||
|
for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
|
||||||
|
((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
|
||||||
|
((ogg_int16_t *)_enquant[pli][0][qti])[0];
|
||||||
|
((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
|
||||||
|
((ogg_int16_t *)_enquant[pli][0][qti])[64];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*Quantizes a block of 64 coefficients with SSE2.
  The block is processed 32 bytes (two rows of eight 16-bit values) per loop
   iteration, for four iterations.
  All four buffers are accessed with movdqa, so they must be 16-byte aligned.
  _qdct:    Receives the 64 quantized coefficients.
  _dct:     The 64 input coefficients.
  _dequant: The 64 dequantization values, used to form the rounding factor.
  _enquant: The table built by oc_enc_enquant_table_init_x86(): the multipliers
             are read from offset 0 and the scale factors from offset 0x80.
  Return: The index of the last non-zero value written to _qdct, or 0 if every
           value is zero.
  The registers holding _enquant and _dequant are reused as scratch space once
   the loop is done, which is why they are declared as read-write operands.*/
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||||
|
const ogg_uint16_t _dequant[64],const void *_enquant){
|
||||||
|
ptrdiff_t r;
|
||||||
|
__asm__ __volatile__(
|
||||||
|
"xor %[r],%[r]\n\t"
|
||||||
|
/*Loop through two rows at a time.*/
|
||||||
|
".p2align 4\n\t"
|
||||||
|
"0:\n\t"
|
||||||
|
/*Load the first two rows of the data and the quant matrices.*/
|
||||||
|
"movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
|
||||||
|
"movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
|
||||||
|
"movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
|
||||||
|
"movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
|
||||||
|
"movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
|
||||||
|
"movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
|
||||||
|
/*Double the input and propagate its sign to the rounding factor.
  Using SSSE3's psignw would help here, but we need the mask later anyway.
  After this, xmm0 and xmm1 hold the sign masks of the two input rows.*/
|
||||||
|
"movdqa %%xmm0,%%xmm6\n\t"
|
||||||
|
"psraw $15,%%xmm0\n\t"
|
||||||
|
"movdqa %%xmm1,%%xmm7\n\t"
|
||||||
|
"paddw %%xmm6,%%xmm6\n\t"
|
||||||
|
"psraw $15,%%xmm1\n\t"
|
||||||
|
"paddw %%xmm7,%%xmm7\n\t"
|
||||||
|
"paddw %%xmm0,%%xmm2\n\t"
|
||||||
|
"paddw %%xmm1,%%xmm3\n\t"
|
||||||
|
"pxor %%xmm0,%%xmm2\n\t"
|
||||||
|
"pxor %%xmm1,%%xmm3\n\t"
|
||||||
|
/*Add the rounding factor and perform the first multiply.*/
|
||||||
|
"paddw %%xmm2,%%xmm6\n\t"
|
||||||
|
"paddw %%xmm3,%%xmm7\n\t"
|
||||||
|
"pmulhw %%xmm6,%%xmm4\n\t"
|
||||||
|
"pmulhw %%xmm7,%%xmm5\n\t"
|
||||||
|
"movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
|
||||||
|
"movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
|
||||||
|
"paddw %%xmm4,%%xmm6\n\t"
|
||||||
|
"paddw %%xmm5,%%xmm7\n\t"
|
||||||
|
/*Emulate an element-wise right-shift via a second multiply.*/
|
||||||
|
"pmulhw %%xmm2,%%xmm6\n\t"
|
||||||
|
"pmulhw %%xmm3,%%xmm7\n\t"
|
||||||
|
"add $32,%[r]\n\t"
|
||||||
|
"cmp $96,%[r]\n\t"
|
||||||
|
/*Correct for the sign.*/
|
||||||
|
"psubw %%xmm0,%%xmm6\n\t"
|
||||||
|
"psubw %%xmm1,%%xmm7\n\t"
|
||||||
|
/*Save the result.*/
|
||||||
|
"movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
|
||||||
|
"movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
|
||||||
|
"jle 0b\n\t"
|
||||||
|
/*Now find the location of the last non-zero value.*/
|
||||||
|
"movdqa 0x50(%[qdct]),%%xmm5\n\t"
|
||||||
|
"movdqa 0x40(%[qdct]),%%xmm4\n\t"
|
||||||
|
"packsswb %%xmm7,%%xmm6\n\t"
|
||||||
|
"packsswb %%xmm5,%%xmm4\n\t"
|
||||||
|
"pxor %%xmm0,%%xmm0\n\t"
|
||||||
|
"mov $-1,%k[dq]\n\t"
|
||||||
|
"pcmpeqb %%xmm0,%%xmm6\n\t"
|
||||||
|
"pcmpeqb %%xmm0,%%xmm4\n\t"
|
||||||
|
"pmovmskb %%xmm6,%k[q]\n\t"
|
||||||
|
"pmovmskb %%xmm4,%k[r]\n\t"
|
||||||
|
"shl $16,%k[q]\n\t"
|
||||||
|
"or %k[r],%k[q]\n\t"
|
||||||
|
"mov $32,%[r]\n\t"
|
||||||
|
/*We have to use xor here instead of not in order to set the flags.*/
|
||||||
|
"xor %k[dq],%k[q]\n\t"
|
||||||
|
"jnz 1f\n\t"
|
||||||
|
"movdqa 0x30(%[qdct]),%%xmm7\n\t"
|
||||||
|
"movdqa 0x20(%[qdct]),%%xmm6\n\t"
|
||||||
|
"movdqa 0x10(%[qdct]),%%xmm5\n\t"
|
||||||
|
"movdqa 0x00(%[qdct]),%%xmm4\n\t"
|
||||||
|
"packsswb %%xmm7,%%xmm6\n\t"
|
||||||
|
"packsswb %%xmm5,%%xmm4\n\t"
|
||||||
|
"pcmpeqb %%xmm0,%%xmm6\n\t"
|
||||||
|
"pcmpeqb %%xmm0,%%xmm4\n\t"
|
||||||
|
"pmovmskb %%xmm6,%k[q]\n\t"
|
||||||
|
"pmovmskb %%xmm4,%k[r]\n\t"
|
||||||
|
"shl $16,%k[q]\n\t"
|
||||||
|
"or %k[r],%k[q]\n\t"
|
||||||
|
"xor %[r],%[r]\n\t"
|
||||||
|
"not %k[q]\n\t"
|
||||||
|
"or $1,%k[q]\n\t"
|
||||||
|
"1:\n\t"
|
||||||
|
"bsr %k[q],%k[q]\n\t"
|
||||||
|
"add %k[q],%k[r]\n\t"
|
||||||
|
:[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
|
||||||
|
:[dct]"r"(_dct),[qdct]"r"(_qdct)
|
||||||
|
:"cc","memory"
|
||||||
|
);
|
||||||
|
return (int)r;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
96
thirdparty/libtheora/x86/x86int.h
vendored
96
thirdparty/libtheora/x86/x86int.h
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -19,24 +19,104 @@
|
|||||||
# define _x86_x86int_H (1)
|
# define _x86_x86int_H (1)
|
||||||
# include "../internal.h"
|
# include "../internal.h"
|
||||||
|
|
||||||
void oc_state_vtable_init_x86(oc_theora_state *_state);
|
# if defined(OC_X86_ASM)
|
||||||
|
# define oc_state_accel_init oc_state_accel_init_x86
|
||||||
|
# if defined(OC_X86_64_ASM)
|
||||||
|
/*x86-64 guarantees SIMD support up through at least SSE2.
|
||||||
|
If the best routine we have available only needs SSE2 (which at the moment
|
||||||
|
covers all of them), then we can avoid runtime detection and the indirect
|
||||||
|
call.*/
|
||||||
|
# define oc_frag_copy(_state,_dst,_src,_ystride) \
|
||||||
|
oc_frag_copy_mmx(_dst,_src,_ystride)
|
||||||
|
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
|
||||||
|
_fragis,_nfragis,_frag_buf_offs) \
|
||||||
|
oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
|
||||||
|
_fragis,_nfragis,_frag_buf_offs)
|
||||||
|
# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
|
||||||
|
oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
|
||||||
|
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
|
||||||
|
oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
|
||||||
|
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
|
||||||
|
oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
|
||||||
|
# define oc_idct8x8(_state,_y,_x,_last_zzi) \
|
||||||
|
oc_idct8x8_sse2(_y,_x,_last_zzi)
|
||||||
|
# define oc_state_frag_recon oc_state_frag_recon_mmx
|
||||||
|
# define oc_loop_filter_init(_state,_bv,_flimit) \
|
||||||
|
oc_loop_filter_init_mmxext(_bv,_flimit)
|
||||||
|
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
|
||||||
|
# define oc_restore_fpu(_state) \
|
||||||
|
oc_restore_fpu_mmx()
|
||||||
|
# else
|
||||||
|
# define OC_STATE_USE_VTABLE (1)
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
|
||||||
|
# include "../state.h"
|
||||||
|
# include "x86cpu.h"
|
||||||
|
|
||||||
|
/*Converts the expression in the argument to a string.*/
|
||||||
|
#define OC_M2STR(_s) #_s
|
||||||
|
|
||||||
|
/*Memory operands do not always include an offset.
|
||||||
|
To avoid warnings, we force an offset with %H (which adds 8).*/
|
||||||
|
# if __GNUC_PREREQ(4,0)
|
||||||
|
# define OC_MEM_OFFS(_offs,_name) \
|
||||||
|
OC_M2STR(_offs-8+%H[_name])
|
||||||
|
# endif
|
||||||
|
/*If your gcc version doesn't support %H, then you get to suffer the warnings.
|
||||||
|
Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
|
||||||
|
whole offset, instead of substituting in 0 for the missing operand to +.*/
|
||||||
|
# if !defined(OC_MEM_OFFS)
|
||||||
|
# define OC_MEM_OFFS(_offs,_name) \
|
||||||
|
OC_M2STR(_offs+%[_name])
|
||||||
|
# endif
|
||||||
|
|
||||||
|
/*Declare an array operand with an exact size.
|
||||||
|
This tells gcc we're going to clobber this memory region, without having to
|
||||||
|
clobber all of "memory" and lets us access local buffers directly using the
|
||||||
|
stack pointer, without allocating a separate register to point to them.*/
|
||||||
|
#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
|
||||||
|
(*({ \
|
||||||
|
struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
|
||||||
|
array_addr__; \
|
||||||
|
}))
|
||||||
|
|
||||||
|
/*Declare an array operand with an exact size.
|
||||||
|
This tells gcc we're going to read this memory region, without having to
|
||||||
|
clobber all of "memory" and lets us access local buffers directly using the
|
||||||
|
stack pointer, without allocating a separate register to point to them.*/
|
||||||
|
#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
|
||||||
|
(*({ \
|
||||||
|
const struct{_type array_value__[(_size)];} *array_addr__= \
|
||||||
|
(const void *)(_ptr); \
|
||||||
|
array_addr__; \
|
||||||
|
}))
|
||||||
|
|
||||||
|
extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
|
||||||
|
|
||||||
|
void oc_state_accel_init_x86(oc_theora_state *_state);
|
||||||
|
|
||||||
void oc_frag_copy_mmx(unsigned char *_dst,
|
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||||
const unsigned char *_src,int _ystride);
|
const unsigned char *_src,int _ystride);
|
||||||
|
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
|
||||||
|
const unsigned char *_src_frame,int _ystride,
|
||||||
|
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||||
const ogg_int16_t *_residue);
|
const ogg_int16_t *_residue);
|
||||||
void oc_frag_recon_inter_mmx(unsigned char *_dst,
|
void oc_frag_recon_inter_mmx(unsigned char *_dst,
|
||||||
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
|
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
|
||||||
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
||||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
||||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
|
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||||
|
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||||
void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
|
||||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
|
||||||
int _dst_frame,int _src_frame,int _pli);
|
|
||||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||||
|
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
|
||||||
|
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||||
void oc_restore_fpu_mmx(void);
|
void oc_restore_fpu_mmx(void);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
47
thirdparty/libtheora/x86/x86state.c
vendored
47
thirdparty/libtheora/x86/x86state.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -19,8 +19,7 @@
|
|||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
#include "../cpu.c"
|
#if defined(OC_STATE_USE_VTABLE)
|
||||||
|
|
||||||
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
|
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
|
||||||
each quadrant of the destination.*/
|
each quadrant of the destination.*/
|
||||||
static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
||||||
@ -39,24 +38,60 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
|||||||
64,64,64,64,64,64,64,64,
|
64,64,64,64,64,64,64,64,
|
||||||
64,64,64,64,64,64,64,64,
|
64,64,64,64,64,64,64,64,
|
||||||
64,64,64,64,64,64,64,64,
|
64,64,64,64,64,64,64,64,
|
||||||
|
64,64,64,64,64,64,64,64
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
|
||||||
|
the destination.*/
|
||||||
|
static const unsigned char OC_FZIG_ZAG_SSE2[128]={
|
||||||
|
0, 8, 1, 2, 9,16,24,17,
|
||||||
|
10, 3, 4,11,18,25,32,40,
|
||||||
|
33,26,19,12, 5, 6,13,20,
|
||||||
|
27,34,41,48,56,49,42,35,
|
||||||
|
28,21,14, 7,15,22,29,36,
|
||||||
|
43,50,57,58,51,44,37,30,
|
||||||
|
23,31,38,45,52,59,60,53,
|
||||||
|
46,39,47,54,61,62,55,63,
|
||||||
64,64,64,64,64,64,64,64,
|
64,64,64,64,64,64,64,64,
|
||||||
|
64,64,64,64,64,64,64,64,
|
||||||
|
64,64,64,64,64,64,64,64,
|
||||||
|
64,64,64,64,64,64,64,64,
|
||||||
|
64,64,64,64,64,64,64,64,
|
||||||
|
64,64,64,64,64,64,64,64,
|
||||||
|
64,64,64,64,64,64,64,64,
|
||||||
|
64,64,64,64,64,64,64,64
|
||||||
};
|
};
|
||||||
|
|
||||||
void oc_state_vtable_init_x86(oc_theora_state *_state){
|
void oc_state_accel_init_x86(oc_theora_state *_state){
|
||||||
|
oc_state_accel_init_c(_state);
|
||||||
_state->cpu_flags=oc_cpu_flags_get();
|
_state->cpu_flags=oc_cpu_flags_get();
|
||||||
|
# if defined(OC_STATE_USE_VTABLE)
|
||||||
if(_state->cpu_flags&OC_CPU_X86_MMX){
|
if(_state->cpu_flags&OC_CPU_X86_MMX){
|
||||||
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
|
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
|
||||||
|
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
|
||||||
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
||||||
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
||||||
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
|
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
|
||||||
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
|
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
|
||||||
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
|
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
|
||||||
_state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
|
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
|
||||||
_state->opt_vtable.state_loop_filter_frag_rows=
|
_state->opt_vtable.state_loop_filter_frag_rows=
|
||||||
oc_state_loop_filter_frag_rows_mmx;
|
oc_state_loop_filter_frag_rows_mmx;
|
||||||
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
|
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
|
||||||
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
|
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
|
||||||
}
|
}
|
||||||
else oc_state_vtable_init_c(_state);
|
if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
|
||||||
|
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
|
||||||
|
_state->opt_vtable.state_loop_filter_frag_rows=
|
||||||
|
oc_state_loop_filter_frag_rows_mmxext;
|
||||||
|
}
|
||||||
|
if(_state->cpu_flags&OC_CPU_X86_SSE2){
|
||||||
|
_state->opt_vtable.idct8x8=oc_idct8x8_sse2;
|
||||||
|
# endif
|
||||||
|
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
|
||||||
|
# if defined(OC_STATE_USE_VTABLE)
|
||||||
|
}
|
||||||
|
# endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
244
thirdparty/libtheora/x86/x86zigzag.h
vendored
Normal file
244
thirdparty/libtheora/x86/x86zigzag.h
vendored
Normal file
@ -0,0 +1,244 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
function:
|
||||||
|
last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
|
#if !defined(_x86_x86zigzag_H)
|
||||||
|
# define _x86_x86zigzag_H (1)
|
||||||
|
# include "x86enc.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*Converts DCT coefficients from transposed order into zig-zag scan order and
|
||||||
|
stores them in %[y].
|
||||||
|
This relies on two macros to load the contents of each row:
|
||||||
|
OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load
|
||||||
|
the first four and second four entries of each row into the specified
|
||||||
|
register, respectively.
|
||||||
|
OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
|
||||||
|
(because when the rows are already in SSE2 registers, loading the high half
|
||||||
|
destructively modifies the register).
|
||||||
|
The index of each output element in the original 64-element array should wind
|
||||||
|
up in the following 8x8 matrix (the letters indicate the order we compute
|
||||||
|
each 4-tuple below):
|
||||||
|
A 0 8 1 2 9 16 24 17 B
|
||||||
|
C 10 3 4 11 18 25 32 40 E
|
||||||
|
F 33 26 19 12 5 6 13 20 D
|
||||||
|
G 27 34 41 48 56 49 42 35 I
|
||||||
|
L 28 21 14 7 15 22 29 36 M
|
||||||
|
H 43 50 57 58 51 44 37 30 O
|
||||||
|
N 23 31 38 45 52 59 60 53 J
|
||||||
|
P 46 39 47 54 61 62 55 63 K
|
||||||
|
The order of the coefficients within each tuple is reversed in the comments
|
||||||
|
below to reflect the usual MSB to LSB notation.*/
|
||||||
|
#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(0,"%%mm0") /*mm0=03 02 01 00*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(1,"%%mm1") /*mm1=11 10 09 08*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(2,"%%mm2") /*mm2=19 18 17 16*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(3,"%%mm3") /*mm3=27 26 25 24*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(0,"%%mm4") /*mm4=07 06 05 04*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(1,"%%mm5") /*mm5=15 14 13 12*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(2,"%%mm6") /*mm6=23 22 21 20*/ \
|
||||||
|
"movq %%mm0,%%mm7\n\t" /*mm7=03 02 01 00*/ \
|
||||||
|
"punpckhdq %%mm1,%%mm0\n\t" /*mm0=11 10 03 02*/ \
|
||||||
|
"pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
|
||||||
|
"punpcklwd %%mm0,%%mm1\n\t" /*mm1=03 09 02 08*/ \
|
||||||
|
"pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \
|
||||||
|
"punpcklwd %%mm1,%%mm7\n\t" /*mm7=02 01 08 00 *A*/ \
|
||||||
|
"movq %%mm7,0x00(%[y])\n\t" \
|
||||||
|
"punpckhwd %%mm4,%%mm1\n\t" /*mm1=04 03 07 09*/ \
|
||||||
|
"movq %%mm2,%%mm7\n\t" /*mm7=19 18 17 16*/ \
|
||||||
|
"punpckhdq %%mm1,%%mm0\n\t" /*mm0=04 03 11 10*/ \
|
||||||
|
"punpckhwd %%mm5,%%mm7\n\t" /*mm7=12 19 15 18*/ \
|
||||||
|
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=25 07 24 09*/ \
|
||||||
|
"punpcklwd %%mm6,%%mm5\n\t" /*mm5=21 14 20 13*/ \
|
||||||
|
"punpcklwd %%mm2,%%mm1\n\t" /*mm1=17 24 16 09 *B*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(4,"%%mm2") /*mm2=35 34 33 32*/ \
|
||||||
|
"movq %%mm1,0x08(%[y])\n\t" \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(5,"%%mm1") /*mm1=43 42 41 40*/ \
|
||||||
|
"pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
|
||||||
|
"movq %%mm0,0x10(%[y])\n\t" \
|
||||||
|
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=?? 07 23 22*/ \
|
||||||
|
"punpckldq %%mm5,%%mm4\n\t" /*mm4=20 13 06 05 *D*/ \
|
||||||
|
"movq %%mm4,0x28(%[y])\n\t" \
|
||||||
|
"psrlq $16,%%mm3\n\t" /*mm3=.. 27 26 25*/ \
|
||||||
|
"pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \
|
||||||
|
"movq %%mm7,%%mm4\n\t" /*mm4=12 19 15 18*/ \
|
||||||
|
"punpcklwd %%mm3,%%mm2\n\t" /*mm2=26 33 25 32*/ \
|
||||||
|
"punpcklwd %%mm1,%%mm4\n\t" /*mm4=41 15 40 18*/ \
|
||||||
|
"punpckhwd %%mm1,%%mm3\n\t" /*mm3=43 .. 42 27*/ \
|
||||||
|
"punpckldq %%mm2,%%mm4\n\t" /*mm4=25 32 40 18*/ \
|
||||||
|
"punpcklwd %%mm0,%%mm3\n\t" /*mm3=35 42 34 27*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(6,"%%mm0") /*mm0=51 50 49 48*/ \
|
||||||
|
"pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
|
||||||
|
"movq %%mm4,0x18(%[y])\n\t" \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(7,"%%mm4") /*mm4=59 58 57 56*/ \
|
||||||
|
"punpckhdq %%mm7,%%mm2\n\t" /*mm2=12 19 26 33 *F*/ \
|
||||||
|
"movq %%mm2,0x20(%[y])\n\t" \
|
||||||
|
"pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
|
||||||
|
"pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \
|
||||||
|
"movq %%mm3,%%mm2\n\t" /*mm2=35 42 34 27*/ \
|
||||||
|
"punpckhwd %%mm0,%%mm1\n\t" /*mm1=50 43 48 41*/ \
|
||||||
|
"pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \
|
||||||
|
"punpckldq %%mm1,%%mm3\n\t" /*mm3=48 41 34 27 *G*/ \
|
||||||
|
"movq %%mm3,0x30(%[y])\n\t" \
|
||||||
|
"punpckhdq %%mm4,%%mm1\n\t" /*mm1=58 57 50 43 *H*/ \
|
||||||
|
"movq %%mm1,0x50(%[y])\n\t" \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(7,"%%mm1") /*mm1=63 62 61 60*/ \
|
||||||
|
"punpcklwd %%mm0,%%mm4\n\t" /*mm4=49 56 51 59*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(6,"%%mm0") /*mm0=55 54 53 52*/ \
|
||||||
|
"psllq $16,%%mm6\n\t" /*mm6=07 23 22 ..*/ \
|
||||||
|
"movq %%mm4,%%mm3\n\t" /*mm3=49 56 51 59*/ \
|
||||||
|
"punpckhdq %%mm2,%%mm4\n\t" /*mm4=35 42 49 56 *I*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(3,"%%mm2") /*mm2=31 30 29 28*/ \
|
||||||
|
"movq %%mm4,0x38(%[y])\n\t" \
|
||||||
|
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=61 51 60 59*/ \
|
||||||
|
"punpcklwd %%mm6,%%mm7\n\t" /*mm7=22 15 .. ??*/ \
|
||||||
|
"movq %%mm3,%%mm4\n\t" /*mm4=61 51 60 59*/ \
|
||||||
|
"punpcklwd %%mm0,%%mm3\n\t" /*mm3=53 60 52 59*/ \
|
||||||
|
"punpckhwd %%mm0,%%mm4\n\t" /*mm4=55 61 54 51*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(4,"%%mm0") /*mm0=39 38 37 36*/ \
|
||||||
|
"pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
|
||||||
|
"movq %%mm3,0x68(%[y])\n\t" \
|
||||||
|
"movq %%mm4,%%mm3\n\t" /*mm3=?? ?? 54 51*/ \
|
||||||
|
"pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
|
||||||
|
"punpckhwd %%mm1,%%mm4\n\t" /*mm4=63 55 62 61 *K*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(5,"%%mm1") /*mm1=47 46 45 44*/ \
|
||||||
|
"movq %%mm4,0x78(%[y])\n\t" \
|
||||||
|
"punpckhwd %%mm2,%%mm6\n\t" /*mm6=28 07 31 23*/ \
|
||||||
|
"punpcklwd %%mm0,%%mm2\n\t" /*mm2=37 30 36 29*/ \
|
||||||
|
"punpckhdq %%mm6,%%mm5\n\t" /*mm5=28 07 21 14*/ \
|
||||||
|
"pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \
|
||||||
|
"pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \
|
||||||
|
"movq %%mm5,0x40(%[y])\n\t" \
|
||||||
|
"punpckhdq %%mm2,%%mm7\n\t" /*mm7=36 29 22 15 *M*/ \
|
||||||
|
"movq %%mm7,0x48(%[y])\n\t" \
|
||||||
|
"pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \
|
||||||
|
"punpckhwd %%mm1,%%mm0\n\t" /*mm0=46 39 45 38*/ \
|
||||||
|
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=47 54 44 51*/ \
|
||||||
|
"punpckldq %%mm0,%%mm6\n\t" /*mm6=45 38 31 23 *N*/ \
|
||||||
|
"movq %%mm6,0x60(%[y])\n\t" \
|
||||||
|
"punpckhdq %%mm3,%%mm0\n\t" /*mm0=47 54 46 39*/ \
|
||||||
|
"punpckldq %%mm2,%%mm3\n\t" /*mm3=30 37 44 51 *O*/ \
|
||||||
|
"movq %%mm3,0x58(%[y])\n\t" \
|
||||||
|
"pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \
|
||||||
|
"movq %%mm0,0x70(%[y])\n\t" \
|
||||||
|
|
||||||
|
/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
|
||||||
|
order and stores them in %[qdct].
|
||||||
|
The index of each output element in the original 64-element array should wind
|
||||||
|
up in the following 8x8 matrix (the letters indicate the order we compute
|
||||||
|
each 4-tuple below):
|
||||||
|
A 0 1 8 16 9 2 3 10 B
|
||||||
|
C 17 24 32 25 18 11 4 5 D
|
||||||
|
E 12 19 26 33 40 48 41 34 I
|
||||||
|
H 27 20 13 6 7 14 21 28 G
|
||||||
|
K 35 42 49 56 57 50 43 36 J
|
||||||
|
F 29 22 15 23 30 37 44 51 M
|
||||||
|
P 58 59 52 45 38 31 39 46 L
|
||||||
|
N 53 60 61 54 47 55 62 63 O
|
||||||
|
The order of the coefficients within each tuple is reversed in the comments
|
||||||
|
below to reflect the usual MSB to LSB notation.*/
|
||||||
|
#define OC_ZIG_ZAG_MMXEXT \
|
||||||
|
"movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
|
||||||
|
"movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
|
||||||
|
"movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
|
||||||
|
"movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
|
||||||
|
"movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
|
||||||
|
"movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
|
||||||
|
"movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
|
||||||
|
"punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
|
||||||
|
"movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
|
||||||
|
"punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
|
||||||
|
"movq %%mm0,0x00(%[qdct])\n\t" \
|
||||||
|
"movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
|
||||||
|
"punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
|
||||||
|
"psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
|
||||||
|
"punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
|
||||||
|
"punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
|
||||||
|
"pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
|
||||||
|
"movq %%mm6,0x08(%[qdct])\n\t" \
|
||||||
|
"psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
|
||||||
|
"movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
|
||||||
|
"punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
|
||||||
|
"movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
|
||||||
|
"punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \
|
||||||
|
"por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
|
||||||
|
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \
|
||||||
|
"movq %%mm2,0x10(%[qdct])\n\t" \
|
||||||
|
"movq %%mm3,0x18(%[qdct])\n\t" \
|
||||||
|
"movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
|
||||||
|
"movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
|
||||||
|
"pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
|
||||||
|
"punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
|
||||||
|
"punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
|
||||||
|
"punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
|
||||||
|
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
|
||||||
|
"punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
|
||||||
|
"pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
|
||||||
|
"psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
|
||||||
|
"punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
|
||||||
|
"punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
|
||||||
|
"punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
|
||||||
|
"movq %%mm0,0x20(%[qdct])\n\t" \
|
||||||
|
"movq %%mm3,0x50(%[qdct])\n\t" \
|
||||||
|
"movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
|
||||||
|
"movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
|
||||||
|
"movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
|
||||||
|
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
|
||||||
|
"psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
|
||||||
|
"movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
|
||||||
|
"punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
|
||||||
|
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
|
||||||
|
"movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
|
||||||
|
"pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
|
||||||
|
"movq %%mm2,0x30(%[qdct])\n\t" \
|
||||||
|
"movq %%mm6,0x38(%[qdct])\n\t" \
|
||||||
|
"movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
|
||||||
|
"punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
|
||||||
|
"movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
|
||||||
|
"punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
|
||||||
|
"psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
|
||||||
|
"punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
|
||||||
|
"punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
|
||||||
|
"pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
|
||||||
|
"movq %%mm0,0x28(%[qdct])\n\t" \
|
||||||
|
"punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
|
||||||
|
"punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
|
||||||
|
"punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
|
||||||
|
"pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
|
||||||
|
"movq %%mm4,0x40(%[qdct])\n\t" \
|
||||||
|
"movq %%mm6,0x48(%[qdct])\n\t" \
|
||||||
|
"movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
|
||||||
|
"movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
|
||||||
|
"psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
|
||||||
|
"pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
|
||||||
|
"pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
|
||||||
|
"punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
|
||||||
|
"pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
|
||||||
|
"punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
|
||||||
|
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
|
||||||
|
"movq %%mm2,0x68(%[qdct])\n\t" \
|
||||||
|
"movq %%mm1,0x58(%[qdct])\n\t" \
|
||||||
|
"punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
|
||||||
|
"punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
|
||||||
|
"pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
|
||||||
|
"pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
|
||||||
|
"punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
|
||||||
|
"punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
|
||||||
|
"movq %%mm6,0x70(%[qdct])\n\t" \
|
||||||
|
"movq %%mm5,0x78(%[qdct])\n\t" \
|
||||||
|
"movq %%mm7,0x60(%[qdct])\n\t" \
|
||||||
|
|
||||||
|
#endif
|
122
thirdparty/libtheora/x86_vc/mmxencfrag.c
vendored
122
thirdparty/libtheora/x86_vc/mmxencfrag.c
vendored
@ -266,7 +266,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
|
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
|
||||||
The transform is performed in place, except that outputs 0-3 are swapped with
|
The transform is performed in place, except that outputs 0-3 are swapped with
|
||||||
outputs 4-7.
|
outputs 4-7.
|
||||||
Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
|
Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
|
||||||
perform this stage in place with no temporary registers).*/
|
perform this stage in place with no temporary registers).*/
|
||||||
#define OC_HADAMARD_AB_8x4 __asm{ \
|
#define OC_HADAMARD_AB_8x4 __asm{ \
|
||||||
/*Stage A: \
|
/*Stage A: \
|
||||||
@ -299,7 +299,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
|
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
|
||||||
Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
|
Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
|
||||||
place with no temporary registers).*/
|
place with no temporary registers).*/
|
||||||
#define OC_HADAMARD_C_8x4 __asm{ \
|
#define OC_HADAMARD_C_8x4 __asm{ \
|
||||||
/*Stage C:*/ \
|
/*Stage C:*/ \
|
||||||
@ -468,12 +468,14 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
|||||||
mm7 = d3 c3 b3 a3*/ \
|
mm7 = d3 c3 b3 a3*/ \
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
|
static unsigned oc_int_frag_satd_mmxext(int *_dc,
|
||||||
int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
|
const unsigned char *_src,int _src_ystride,
|
||||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
const unsigned char *_ref,int _ref_ystride){
|
||||||
ogg_int16_t *bufp;
|
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||||
unsigned ret1;
|
ogg_int16_t *bufp;
|
||||||
unsigned ret2;
|
unsigned ret;
|
||||||
|
unsigned ret2;
|
||||||
|
int dc;
|
||||||
bufp=buf;
|
bufp=buf;
|
||||||
__asm{
|
__asm{
|
||||||
#define SRC esi
|
#define SRC esi
|
||||||
@ -481,8 +483,10 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
|
|||||||
#define SRC_YSTRIDE ecx
|
#define SRC_YSTRIDE ecx
|
||||||
#define REF_YSTRIDE edx
|
#define REF_YSTRIDE edx
|
||||||
#define BUF edi
|
#define BUF edi
|
||||||
#define RET eax
|
#define RET edx
|
||||||
#define RET2 edx
|
#define RET2 ecx
|
||||||
|
#define DC eax
|
||||||
|
#define DC_WORD ax
|
||||||
mov SRC,_src
|
mov SRC,_src
|
||||||
mov SRC_YSTRIDE,_src_ystride
|
mov SRC_YSTRIDE,_src_ystride
|
||||||
mov REF,_ref
|
mov REF,_ref
|
||||||
@ -508,14 +512,18 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
|
|||||||
movq mm2,[0x20+BUF]
|
movq mm2,[0x20+BUF]
|
||||||
movq mm3,[0x30+BUF]
|
movq mm3,[0x30+BUF]
|
||||||
movq mm0,[0x00+BUF]
|
movq mm0,[0x00+BUF]
|
||||||
OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
|
/*We split out the stages here so we can save the DC coefficient in the
|
||||||
|
middle.*/
|
||||||
|
OC_HADAMARD_AB_8x4
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
|
||||||
|
movd DC,mm1
|
||||||
|
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
|
||||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||||
for the factor of two we dropped + 3 for the vertical accumulation).
|
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||||
Now we finally have to promote things to dwords.
|
Now we finally have to promote things to dwords.
|
||||||
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
||||||
latency of pmaddwd by starting the next series of loads now.*/
|
latency of pmaddwd by starting the next series of loads now.*/
|
||||||
mov RET2,_thresh
|
|
||||||
pmaddwd mm0,mm7
|
pmaddwd mm0,mm7
|
||||||
movq mm1,[0x50+BUF]
|
movq mm1,[0x50+BUF]
|
||||||
movq mm5,[0x58+BUF]
|
movq mm5,[0x58+BUF]
|
||||||
@ -525,29 +533,28 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
|
|||||||
movq mm6,[0x68+BUF]
|
movq mm6,[0x68+BUF]
|
||||||
paddd mm4,mm0
|
paddd mm4,mm0
|
||||||
movq mm3,[0x70+BUF]
|
movq mm3,[0x70+BUF]
|
||||||
movd RET,mm4
|
movd RET2,mm4
|
||||||
movq mm7,[0x78+BUF]
|
movq mm7,[0x78+BUF]
|
||||||
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
|
|
||||||
added to them, and a factor of two removed; correct the final sum here.*/
|
|
||||||
lea RET,[RET+RET-32]
|
|
||||||
movq mm0,[0x40+BUF]
|
movq mm0,[0x40+BUF]
|
||||||
cmp RET,RET2
|
|
||||||
movq mm4,[0x48+BUF]
|
movq mm4,[0x48+BUF]
|
||||||
jae at_end
|
|
||||||
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
||||||
pmaddwd mm0,mm7
|
pmaddwd mm0,mm7
|
||||||
/*There isn't much to stick in here to hide the latency this time, but the
|
/*Subtract abs(dc) from 2*ret2.*/
|
||||||
alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
|
movsx DC,DC_WORD
|
||||||
latency is even worse.*/
|
cdq
|
||||||
sub RET,32
|
lea RET2,[RET+RET2*2]
|
||||||
movq mm4,mm0
|
movq mm4,mm0
|
||||||
punpckhdq mm0,mm0
|
punpckhdq mm0,mm0
|
||||||
|
xor RET,DC
|
||||||
paddd mm4,mm0
|
paddd mm4,mm0
|
||||||
movd RET2,mm4
|
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
|
||||||
lea RET,[RET+RET2*2]
|
added to them, a factor of two removed, and the DC value included;
|
||||||
align 16
|
correct the final sum here.*/
|
||||||
at_end:
|
sub RET2,RET
|
||||||
mov ret1,RET
|
movd RET,mm4
|
||||||
|
lea RET,[RET2+RET*2-64]
|
||||||
|
mov ret,RET
|
||||||
|
mov dc,DC
|
||||||
#undef SRC
|
#undef SRC
|
||||||
#undef REF
|
#undef REF
|
||||||
#undef SRC_YSTRIDE
|
#undef SRC_YSTRIDE
|
||||||
@ -555,18 +562,21 @@ at_end:
|
|||||||
#undef BUF
|
#undef BUF
|
||||||
#undef RET
|
#undef RET
|
||||||
#undef RET2
|
#undef RET2
|
||||||
|
#undef DC
|
||||||
|
#undef DC_WORD
|
||||||
}
|
}
|
||||||
return ret1;
|
*_dc=dc;
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride,unsigned _thresh){
|
const unsigned char *_ref,int _ystride){
|
||||||
return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
|
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
|
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
|
||||||
we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
|
we can share code with oc_enc_frag_satd2_mmxext().*/
|
||||||
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
||||||
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
|
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
|
||||||
__asm{
|
__asm{
|
||||||
@ -694,30 +704,31 @@ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
|
||||||
unsigned _thresh){
|
|
||||||
OC_ALIGN8(unsigned char ref[64]);
|
OC_ALIGN8(unsigned char ref[64]);
|
||||||
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
|
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
|
||||||
return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
|
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
|
||||||
int _ystride){
|
int _ystride){
|
||||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||||
ogg_int16_t *bufp;
|
ogg_int16_t *bufp;
|
||||||
unsigned ret1;
|
unsigned ret1;
|
||||||
unsigned ret2;
|
unsigned ret2;
|
||||||
|
int dc;
|
||||||
bufp=buf;
|
bufp=buf;
|
||||||
__asm{
|
__asm{
|
||||||
#define SRC eax
|
#define SRC eax
|
||||||
#define SRC4 esi
|
#define SRC4 esi
|
||||||
#define BUF edi
|
#define BUF edi
|
||||||
#define RET eax
|
|
||||||
#define RET_WORD ax
|
|
||||||
#define RET2 ecx
|
|
||||||
#define YSTRIDE edx
|
#define YSTRIDE edx
|
||||||
#define YSTRIDE3 ecx
|
#define YSTRIDE3 ecx
|
||||||
|
#define RET eax
|
||||||
|
#define RET2 ecx
|
||||||
|
#define DC edx
|
||||||
|
#define DC_WORD dx
|
||||||
mov SRC,_src
|
mov SRC,_src
|
||||||
mov BUF,bufp
|
mov BUF,bufp
|
||||||
mov YSTRIDE,_ystride
|
mov YSTRIDE,_ystride
|
||||||
@ -749,7 +760,7 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
|
|||||||
middle.*/
|
middle.*/
|
||||||
OC_HADAMARD_AB_8x4
|
OC_HADAMARD_AB_8x4
|
||||||
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
|
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
|
||||||
movd RET,mm1
|
movd DC,mm1
|
||||||
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
|
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
|
||||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||||
@ -767,31 +778,34 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
|
|||||||
movq mm3,[0x70+BUF]
|
movq mm3,[0x70+BUF]
|
||||||
paddd mm4,mm0
|
paddd mm4,mm0
|
||||||
movq mm7,[0x78+BUF]
|
movq mm7,[0x78+BUF]
|
||||||
movd RET2,mm4
|
movd RET,mm4
|
||||||
movq mm0,[0x40+BUF]
|
movq mm0,[0x40+BUF]
|
||||||
movq mm4,[0x48+BUF]
|
movq mm4,[0x48+BUF]
|
||||||
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
||||||
pmaddwd mm0,mm7
|
pmaddwd mm0,mm7
|
||||||
/*We assume that the DC coefficient is always positive (which is true,
|
/*We assume that the DC coefficient is always positive (which is true,
|
||||||
because the input to the INTRA transform was not a difference).*/
|
because the input to the INTRA transform was not a difference).*/
|
||||||
movzx RET,RET_WORD
|
movzx DC,DC_WORD
|
||||||
add RET2,RET2
|
add RET,RET
|
||||||
sub RET2,RET
|
sub RET,DC
|
||||||
movq mm4,mm0
|
movq mm4,mm0
|
||||||
punpckhdq mm0,mm0
|
punpckhdq mm0,mm0
|
||||||
paddd mm4,mm0
|
paddd mm4,mm0
|
||||||
movd RET,mm4
|
movd RET2,mm4
|
||||||
lea RET,[-64+RET2+RET*2]
|
lea RET,[-64+RET+RET2*2]
|
||||||
|
mov [dc],DC
|
||||||
mov [ret1],RET
|
mov [ret1],RET
|
||||||
#undef SRC
|
#undef SRC
|
||||||
#undef SRC4
|
#undef SRC4
|
||||||
#undef BUF
|
#undef BUF
|
||||||
#undef RET
|
|
||||||
#undef RET_WORD
|
|
||||||
#undef RET2
|
|
||||||
#undef YSTRIDE
|
#undef YSTRIDE
|
||||||
#undef YSTRIDE3
|
#undef YSTRIDE3
|
||||||
|
#undef RET
|
||||||
|
#undef RET2
|
||||||
|
#undef DC
|
||||||
|
#undef DC_WORD
|
||||||
}
|
}
|
||||||
|
*_dc=dc;
|
||||||
return ret1;
|
return ret1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
130
thirdparty/libtheora/x86_vc/mmxfdct.c
vendored
130
thirdparty/libtheora/x86_vc/mmxfdct.c
vendored
@ -12,6 +12,7 @@
|
|||||||
/*MMX fDCT implementation for x86_32*/
|
/*MMX fDCT implementation for x86_32*/
|
||||||
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
|
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
|
||||||
#include "x86enc.h"
|
#include "x86enc.h"
|
||||||
|
#include "x86zigzag.h"
|
||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
@ -462,18 +463,22 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*MMX implementation of the fDCT.*/
|
/*MMX implementation of the fDCT.*/
|
||||||
void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||||
ptrdiff_t a;
|
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||||
|
ogg_int16_t *bufp;
|
||||||
|
bufp=buf;
|
||||||
__asm{
|
__asm{
|
||||||
|
#define X edx
|
||||||
#define Y eax
|
#define Y eax
|
||||||
#define A ecx
|
#define A ecx
|
||||||
#define X edx
|
#define BUF esi
|
||||||
/*Add two extra bits of working precision to improve accuracy; any more and
|
/*Add two extra bits of working precision to improve accuracy; any more and
|
||||||
we could overflow.*/
|
we could overflow.*/
|
||||||
/*We also add biases to correct for some systematic error that remains in
|
/*We also add biases to correct for some systematic error that remains in
|
||||||
the full fDCT->iDCT round trip.*/
|
the full fDCT->iDCT round trip.*/
|
||||||
mov X, _x
|
mov X, _x
|
||||||
mov Y, _y
|
mov Y, _y
|
||||||
|
mov BUF, bufp
|
||||||
movq mm0,[0x00+X]
|
movq mm0,[0x00+X]
|
||||||
movq mm1,[0x10+X]
|
movq mm1,[0x10+X]
|
||||||
movq mm2,[0x20+X]
|
movq mm2,[0x20+X]
|
||||||
@ -591,79 +596,90 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
|||||||
movq mm3,[0x30+Y]
|
movq mm3,[0x30+Y]
|
||||||
OC_FDCT_STAGE1_8x4
|
OC_FDCT_STAGE1_8x4
|
||||||
OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
|
OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
|
||||||
OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
|
|
||||||
/*mm0={-2}x4*/
|
/*mm0={-2}x4*/
|
||||||
pcmpeqw mm0,mm0
|
pcmpeqw mm2,mm2
|
||||||
paddw mm0,mm0
|
paddw mm2,mm2
|
||||||
/*Round the results.*/
|
/*Round and store the results (no transpose).*/
|
||||||
psubw mm1,mm0
|
movq mm7,[Y+0x10]
|
||||||
psubw mm2,mm0
|
psubw mm4,mm2
|
||||||
psraw mm1,2
|
psubw mm6,mm2
|
||||||
psubw mm3,mm0
|
|
||||||
movq [0x18+Y],mm1
|
|
||||||
psraw mm2,2
|
|
||||||
psubw mm4,mm0
|
|
||||||
movq mm1,[0x08+Y]
|
|
||||||
psraw mm3,2
|
|
||||||
psubw mm5,mm0
|
|
||||||
psraw mm4,2
|
psraw mm4,2
|
||||||
psubw mm6,mm0
|
psubw mm0,mm2
|
||||||
psraw mm5,2
|
movq [BUF+0x00],mm4
|
||||||
psubw mm7,mm0
|
movq mm4,[Y+0x30]
|
||||||
psraw mm6,2
|
psraw mm6,2
|
||||||
psubw mm1,mm0
|
psubw mm5,mm2
|
||||||
psraw mm7,2
|
movq [BUF+0x20],mm6
|
||||||
movq mm0,[0x40+Y]
|
psraw mm0,2
|
||||||
|
psubw mm3,mm2
|
||||||
|
movq [BUF+0x40],mm0
|
||||||
|
psraw mm5,2
|
||||||
|
psubw mm1,mm2
|
||||||
|
movq [BUF+0x50],mm5
|
||||||
|
psraw mm3,2
|
||||||
|
psubw mm7,mm2
|
||||||
|
movq [BUF+0x60],mm3
|
||||||
psraw mm1,2
|
psraw mm1,2
|
||||||
movq [0x30+Y],mm7
|
psubw mm4,mm2
|
||||||
|
movq [BUF+0x70],mm1
|
||||||
|
psraw mm7,2
|
||||||
|
movq [BUF+0x10],mm7
|
||||||
|
psraw mm4,2
|
||||||
|
movq [BUF+0x30],mm4
|
||||||
|
/*Load the next block.*/
|
||||||
|
movq mm0,[0x40+Y]
|
||||||
movq mm7,[0x78+Y]
|
movq mm7,[0x78+Y]
|
||||||
movq [0x08+Y],mm1
|
|
||||||
movq mm1,[0x50+Y]
|
movq mm1,[0x50+Y]
|
||||||
movq [0x20+Y],mm6
|
|
||||||
movq mm6,[0x68+Y]
|
movq mm6,[0x68+Y]
|
||||||
movq [0x28+Y],mm2
|
|
||||||
movq mm2,[0x60+Y]
|
movq mm2,[0x60+Y]
|
||||||
movq [0x10+Y],mm5
|
|
||||||
movq mm5,[0x58+Y]
|
movq mm5,[0x58+Y]
|
||||||
movq [0x38+Y],mm3
|
|
||||||
movq mm3,[0x70+Y]
|
movq mm3,[0x70+Y]
|
||||||
movq [0x00+Y],mm4
|
|
||||||
movq mm4,[0x48+Y]
|
movq mm4,[0x48+Y]
|
||||||
OC_FDCT_STAGE1_8x4
|
OC_FDCT_STAGE1_8x4
|
||||||
OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
|
OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
|
||||||
OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
|
|
||||||
/*mm0={-2}x4*/
|
/*mm0={-2}x4*/
|
||||||
pcmpeqw mm0,mm0
|
pcmpeqw mm2,mm2
|
||||||
paddw mm0,mm0
|
paddw mm2,mm2
|
||||||
/*Round the results.*/
|
/*Round and store the results (no transpose).*/
|
||||||
psubw mm1,mm0
|
movq mm7,[Y+0x50]
|
||||||
psubw mm2,mm0
|
psubw mm4,mm2
|
||||||
psraw mm1,2
|
psubw mm6,mm2
|
||||||
psubw mm3,mm0
|
|
||||||
movq [0x58+Y],mm1
|
|
||||||
psraw mm2,2
|
|
||||||
psubw mm4,mm0
|
|
||||||
movq mm1,[0x48+Y]
|
|
||||||
psraw mm3,2
|
|
||||||
psubw mm5,mm0
|
|
||||||
movq [0x68+Y],mm2
|
|
||||||
psraw mm4,2
|
psraw mm4,2
|
||||||
psubw mm6,mm0
|
psubw mm0,mm2
|
||||||
movq [0x78+Y],mm3
|
movq [BUF+0x08],mm4
|
||||||
psraw mm5,2
|
movq mm4,[Y+0x70]
|
||||||
psubw mm7,mm0
|
|
||||||
movq [0x40+Y],mm4
|
|
||||||
psraw mm6,2
|
psraw mm6,2
|
||||||
psubw mm1,mm0
|
psubw mm5,mm2
|
||||||
movq [0x50+Y],mm5
|
movq [BUF+0x28],mm6
|
||||||
psraw mm7,2
|
psraw mm0,2
|
||||||
movq [0x60+Y],mm6
|
psubw mm3,mm2
|
||||||
|
movq [BUF+0x48],mm0
|
||||||
|
psraw mm5,2
|
||||||
|
psubw mm1,mm2
|
||||||
|
movq [BUF+0x58],mm5
|
||||||
|
psraw mm3,2
|
||||||
|
psubw mm7,mm2
|
||||||
|
movq [BUF+0x68],mm3
|
||||||
psraw mm1,2
|
psraw mm1,2
|
||||||
movq [0x70+Y],mm7
|
psubw mm4,mm2
|
||||||
movq [0x48+Y],mm1
|
movq [BUF+0x78],mm1
|
||||||
|
psraw mm7,2
|
||||||
|
movq [BUF+0x18],mm7
|
||||||
|
psraw mm4,2
|
||||||
|
movq [BUF+0x38],mm4
|
||||||
|
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
|
||||||
|
__asm movq _reg,[BUF+16*(_row)] \
|
||||||
|
|
||||||
|
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
|
||||||
|
__asm movq _reg,[BUF+16*(_row)+8] \
|
||||||
|
|
||||||
|
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
|
||||||
|
#undef OC_ZZ_LOAD_ROW_LO
|
||||||
|
#undef OC_ZZ_LOAD_ROW_HI
|
||||||
|
#undef X
|
||||||
#undef Y
|
#undef Y
|
||||||
#undef A
|
#undef A
|
||||||
#undef X
|
#undef BUF
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
83
thirdparty/libtheora/x86_vc/mmxfrag.c
vendored
83
thirdparty/libtheora/x86_vc/mmxfrag.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -22,10 +22,61 @@
|
|||||||
The iteration each instruction belongs to is marked in the comments as #i.*/
|
The iteration each instruction belongs to is marked in the comments as #i.*/
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include "x86int.h"
|
#include "x86int.h"
|
||||||
#include "mmxfrag.h"
|
|
||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
|
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||||
|
between rows.*/
|
||||||
|
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
|
||||||
|
do{ \
|
||||||
|
const unsigned char *src; \
|
||||||
|
unsigned char *dst; \
|
||||||
|
src=(_src); \
|
||||||
|
dst=(_dst); \
|
||||||
|
__asm mov SRC,src \
|
||||||
|
__asm mov DST,dst \
|
||||||
|
__asm mov YSTRIDE,_ystride \
|
||||||
|
/*src+0*ystride*/ \
|
||||||
|
__asm movq mm0,[SRC] \
|
||||||
|
/*src+1*ystride*/ \
|
||||||
|
__asm movq mm1,[SRC+YSTRIDE] \
|
||||||
|
/*ystride3=ystride*3*/ \
|
||||||
|
__asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
|
||||||
|
/*src+2*ystride*/ \
|
||||||
|
__asm movq mm2,[SRC+YSTRIDE*2] \
|
||||||
|
/*src+3*ystride*/ \
|
||||||
|
__asm movq mm3,[SRC+YSTRIDE3] \
|
||||||
|
/*dst+0*ystride*/ \
|
||||||
|
__asm movq [DST],mm0 \
|
||||||
|
/*dst+1*ystride*/ \
|
||||||
|
__asm movq [DST+YSTRIDE],mm1 \
|
||||||
|
/*Pointer to next 4.*/ \
|
||||||
|
__asm lea SRC,[SRC+YSTRIDE*4] \
|
||||||
|
/*dst+2*ystride*/ \
|
||||||
|
__asm movq [DST+YSTRIDE*2],mm2 \
|
||||||
|
/*dst+3*ystride*/ \
|
||||||
|
__asm movq [DST+YSTRIDE3],mm3 \
|
||||||
|
/*Pointer to next 4.*/ \
|
||||||
|
__asm lea DST,[DST+YSTRIDE*4] \
|
||||||
|
/*src+0*ystride*/ \
|
||||||
|
__asm movq mm0,[SRC] \
|
||||||
|
/*src+1*ystride*/ \
|
||||||
|
__asm movq mm1,[SRC+YSTRIDE] \
|
||||||
|
/*src+2*ystride*/ \
|
||||||
|
__asm movq mm2,[SRC+YSTRIDE*2] \
|
||||||
|
/*src+3*ystride*/ \
|
||||||
|
__asm movq mm3,[SRC+YSTRIDE3] \
|
||||||
|
/*dst+0*ystride*/ \
|
||||||
|
__asm movq [DST],mm0 \
|
||||||
|
/*dst+1*ystride*/ \
|
||||||
|
__asm movq [DST+YSTRIDE],mm1 \
|
||||||
|
/*dst+2*ystride*/ \
|
||||||
|
__asm movq [DST+YSTRIDE*2],mm2 \
|
||||||
|
/*dst+3*ystride*/ \
|
||||||
|
__asm movq [DST+YSTRIDE3],mm3 \
|
||||||
|
} \
|
||||||
|
while(0)
|
||||||
|
|
||||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
||||||
between rows.*/
|
between rows.*/
|
||||||
void oc_frag_copy_mmx(unsigned char *_dst,
|
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||||
@ -41,6 +92,34 @@ void oc_frag_copy_mmx(unsigned char *_dst,
|
|||||||
#undef YSTRIDE3
|
#undef YSTRIDE3
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*Copies the fragments specified by the lists of fragment indices from one
|
||||||
|
frame to another.
|
||||||
|
_dst_frame: The reference frame to copy to.
|
||||||
|
_src_frame: The reference frame to copy from.
|
||||||
|
_ystride: The row stride of the reference frames.
|
||||||
|
_fragis: A pointer to a list of fragment indices.
|
||||||
|
_nfragis: The number of fragment indices to copy.
|
||||||
|
_frag_buf_offs: The offsets of fragments in the reference frames.*/
|
||||||
|
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
|
||||||
|
const unsigned char *_src_frame,int _ystride,
|
||||||
|
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
|
||||||
|
ptrdiff_t fragii;
|
||||||
|
for(fragii=0;fragii<_nfragis;fragii++){
|
||||||
|
ptrdiff_t frag_buf_off;
|
||||||
|
frag_buf_off=_frag_buf_offs[_fragis[fragii]];
|
||||||
|
#define SRC edx
|
||||||
|
#define DST eax
|
||||||
|
#define YSTRIDE ecx
|
||||||
|
#define YSTRIDE3 edi
|
||||||
|
OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
|
||||||
|
_src_frame+frag_buf_off,_ystride);
|
||||||
|
#undef SRC
|
||||||
|
#undef DST
|
||||||
|
#undef YSTRIDE
|
||||||
|
#undef YSTRIDE3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||||
const ogg_int16_t *_residue){
|
const ogg_int16_t *_residue){
|
||||||
__asm{
|
__asm{
|
||||||
|
61
thirdparty/libtheora/x86_vc/mmxfrag.h
vendored
61
thirdparty/libtheora/x86_vc/mmxfrag.h
vendored
@ -1,61 +0,0 @@
|
|||||||
#if !defined(_x86_vc_mmxfrag_H)
|
|
||||||
# define _x86_vc_mmxfrag_H (1)
|
|
||||||
# include <stddef.h>
|
|
||||||
# include "x86int.h"
|
|
||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
|
||||||
|
|
||||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
|
|
||||||
between rows.*/
|
|
||||||
#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
|
|
||||||
do{ \
|
|
||||||
const unsigned char *src; \
|
|
||||||
unsigned char *dst; \
|
|
||||||
src=(_src); \
|
|
||||||
dst=(_dst); \
|
|
||||||
__asm mov SRC,src \
|
|
||||||
__asm mov DST,dst \
|
|
||||||
__asm mov YSTRIDE,_ystride \
|
|
||||||
/*src+0*ystride*/ \
|
|
||||||
__asm movq mm0,[SRC] \
|
|
||||||
/*src+1*ystride*/ \
|
|
||||||
__asm movq mm1,[SRC+YSTRIDE] \
|
|
||||||
/*ystride3=ystride*3*/ \
|
|
||||||
__asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
|
|
||||||
/*src+2*ystride*/ \
|
|
||||||
__asm movq mm2,[SRC+YSTRIDE*2] \
|
|
||||||
/*src+3*ystride*/ \
|
|
||||||
__asm movq mm3,[SRC+YSTRIDE3] \
|
|
||||||
/*dst+0*ystride*/ \
|
|
||||||
__asm movq [DST],mm0 \
|
|
||||||
/*dst+1*ystride*/ \
|
|
||||||
__asm movq [DST+YSTRIDE],mm1 \
|
|
||||||
/*Pointer to next 4.*/ \
|
|
||||||
__asm lea SRC,[SRC+YSTRIDE*4] \
|
|
||||||
/*dst+2*ystride*/ \
|
|
||||||
__asm movq [DST+YSTRIDE*2],mm2 \
|
|
||||||
/*dst+3*ystride*/ \
|
|
||||||
__asm movq [DST+YSTRIDE3],mm3 \
|
|
||||||
/*Pointer to next 4.*/ \
|
|
||||||
__asm lea DST,[DST+YSTRIDE*4] \
|
|
||||||
/*src+0*ystride*/ \
|
|
||||||
__asm movq mm0,[SRC] \
|
|
||||||
/*src+1*ystride*/ \
|
|
||||||
__asm movq mm1,[SRC+YSTRIDE] \
|
|
||||||
/*src+2*ystride*/ \
|
|
||||||
__asm movq mm2,[SRC+YSTRIDE*2] \
|
|
||||||
/*src+3*ystride*/ \
|
|
||||||
__asm movq mm3,[SRC+YSTRIDE3] \
|
|
||||||
/*dst+0*ystride*/ \
|
|
||||||
__asm movq [DST],mm0 \
|
|
||||||
/*dst+1*ystride*/ \
|
|
||||||
__asm movq [DST+YSTRIDE],mm1 \
|
|
||||||
/*dst+2*ystride*/ \
|
|
||||||
__asm movq [DST+YSTRIDE*2],mm2 \
|
|
||||||
/*dst+3*ystride*/ \
|
|
||||||
__asm movq [DST+YSTRIDE3],mm3 \
|
|
||||||
} \
|
|
||||||
while(0)
|
|
||||||
|
|
||||||
# endif
|
|
||||||
#endif
|
|
230
thirdparty/libtheora/x86_vc/mmxidct.c
vendored
230
thirdparty/libtheora/x86_vc/mmxidct.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -24,15 +24,15 @@
|
|||||||
|
|
||||||
/*These are offsets into the table of constants below.*/
|
/*These are offsets into the table of constants below.*/
|
||||||
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
|
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
|
||||||
#define OC_COSINE_OFFSET (0)
|
#define OC_COSINE_OFFSET (8)
|
||||||
/*A row of 8's.*/
|
/*A row of 8's.*/
|
||||||
#define OC_EIGHT_OFFSET (56)
|
#define OC_EIGHT_OFFSET (0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*A table of constants used by the MMX routines.*/
|
/*A table of constants used by the MMX routines.*/
|
||||||
static const __declspec(align(16))ogg_uint16_t
|
static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={
|
||||||
OC_IDCT_CONSTS[(7+1)*4]={
|
8, 8, 8, 8,
|
||||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
||||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
||||||
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
|
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
|
||||||
@ -46,28 +46,27 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
||||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
||||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
||||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1
|
||||||
8, 8, 8, 8
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/*38 cycles*/
|
/*38 cycles*/
|
||||||
#define OC_IDCT_BEGIN __asm{ \
|
#define OC_IDCT_BEGIN(_y,_x) __asm{ \
|
||||||
__asm movq mm2,OC_I(3) \
|
__asm movq mm2,OC_I(3,_x) \
|
||||||
__asm movq mm6,OC_C(3) \
|
__asm movq mm6,OC_C(3) \
|
||||||
__asm movq mm4,mm2 \
|
__asm movq mm4,mm2 \
|
||||||
__asm movq mm7,OC_J(5) \
|
__asm movq mm7,OC_J(5,_x) \
|
||||||
__asm pmulhw mm4,mm6 \
|
__asm pmulhw mm4,mm6 \
|
||||||
__asm movq mm1,OC_C(5) \
|
__asm movq mm1,OC_C(5) \
|
||||||
__asm pmulhw mm6,mm7 \
|
__asm pmulhw mm6,mm7 \
|
||||||
__asm movq mm5,mm1 \
|
__asm movq mm5,mm1 \
|
||||||
__asm pmulhw mm1,mm2 \
|
__asm pmulhw mm1,mm2 \
|
||||||
__asm movq mm3,OC_I(1) \
|
__asm movq mm3,OC_I(1,_x) \
|
||||||
__asm pmulhw mm5,mm7 \
|
__asm pmulhw mm5,mm7 \
|
||||||
__asm movq mm0,OC_C(1) \
|
__asm movq mm0,OC_C(1) \
|
||||||
__asm paddw mm4,mm2 \
|
__asm paddw mm4,mm2 \
|
||||||
__asm paddw mm6,mm7 \
|
__asm paddw mm6,mm7 \
|
||||||
__asm paddw mm2,mm1 \
|
__asm paddw mm2,mm1 \
|
||||||
__asm movq mm1,OC_J(7) \
|
__asm movq mm1,OC_J(7,_x) \
|
||||||
__asm paddw mm7,mm5 \
|
__asm paddw mm7,mm5 \
|
||||||
__asm movq mm5,mm0 \
|
__asm movq mm5,mm0 \
|
||||||
__asm pmulhw mm0,mm3 \
|
__asm pmulhw mm0,mm3 \
|
||||||
@ -77,13 +76,13 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
__asm psubw mm6,mm2 \
|
__asm psubw mm6,mm2 \
|
||||||
__asm paddw mm0,mm3 \
|
__asm paddw mm0,mm3 \
|
||||||
__asm pmulhw mm3,mm7 \
|
__asm pmulhw mm3,mm7 \
|
||||||
__asm movq mm2,OC_I(2) \
|
__asm movq mm2,OC_I(2,_x) \
|
||||||
__asm pmulhw mm7,mm1 \
|
__asm pmulhw mm7,mm1 \
|
||||||
__asm paddw mm5,mm1 \
|
__asm paddw mm5,mm1 \
|
||||||
__asm movq mm1,mm2 \
|
__asm movq mm1,mm2 \
|
||||||
__asm pmulhw mm2,OC_C(2) \
|
__asm pmulhw mm2,OC_C(2) \
|
||||||
__asm psubw mm3,mm5 \
|
__asm psubw mm3,mm5 \
|
||||||
__asm movq mm5,OC_J(6) \
|
__asm movq mm5,OC_J(6,_x) \
|
||||||
__asm paddw mm0,mm7 \
|
__asm paddw mm0,mm7 \
|
||||||
__asm movq mm7,mm5 \
|
__asm movq mm7,mm5 \
|
||||||
__asm psubw mm0,mm4 \
|
__asm psubw mm0,mm4 \
|
||||||
@ -97,18 +96,18 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
__asm paddw mm6,mm6 \
|
__asm paddw mm6,mm6 \
|
||||||
__asm pmulhw mm7,OC_C(6) \
|
__asm pmulhw mm7,OC_C(6) \
|
||||||
__asm paddw mm6,mm3 \
|
__asm paddw mm6,mm3 \
|
||||||
__asm movq OC_I(1),mm4 \
|
__asm movq OC_I(1,_y),mm4 \
|
||||||
__asm psubw mm1,mm5 \
|
__asm psubw mm1,mm5 \
|
||||||
__asm movq mm4,OC_C(4) \
|
__asm movq mm4,OC_C(4) \
|
||||||
__asm movq mm5,mm3 \
|
__asm movq mm5,mm3 \
|
||||||
__asm pmulhw mm3,mm4 \
|
__asm pmulhw mm3,mm4 \
|
||||||
__asm paddw mm7,mm2 \
|
__asm paddw mm7,mm2 \
|
||||||
__asm movq OC_I(2),mm6 \
|
__asm movq OC_I(2,_y),mm6 \
|
||||||
__asm movq mm2,mm0 \
|
__asm movq mm2,mm0 \
|
||||||
__asm movq mm6,OC_I(0) \
|
__asm movq mm6,OC_I(0,_x) \
|
||||||
__asm pmulhw mm0,mm4 \
|
__asm pmulhw mm0,mm4 \
|
||||||
__asm paddw mm5,mm3 \
|
__asm paddw mm5,mm3 \
|
||||||
__asm movq mm3,OC_J(4) \
|
__asm movq mm3,OC_J(4,_x) \
|
||||||
__asm psubw mm5,mm1 \
|
__asm psubw mm5,mm1 \
|
||||||
__asm paddw mm2,mm0 \
|
__asm paddw mm2,mm0 \
|
||||||
__asm psubw mm6,mm3 \
|
__asm psubw mm6,mm3 \
|
||||||
@ -122,17 +121,17 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
__asm paddw mm6,mm0 \
|
__asm paddw mm6,mm0 \
|
||||||
__asm psubw mm6,mm2 \
|
__asm psubw mm6,mm2 \
|
||||||
__asm paddw mm2,mm2 \
|
__asm paddw mm2,mm2 \
|
||||||
__asm movq mm0,OC_I(1) \
|
__asm movq mm0,OC_I(1,_y) \
|
||||||
__asm paddw mm2,mm6 \
|
__asm paddw mm2,mm6 \
|
||||||
__asm paddw mm4,mm3 \
|
__asm paddw mm4,mm3 \
|
||||||
__asm psubw mm2,mm1 \
|
__asm psubw mm2,mm1 \
|
||||||
}
|
}
|
||||||
|
|
||||||
/*38+8=46 cycles.*/
|
/*38+8=46 cycles.*/
|
||||||
#define OC_ROW_IDCT __asm{ \
|
#define OC_ROW_IDCT(_y,_x) __asm{ \
|
||||||
OC_IDCT_BEGIN \
|
OC_IDCT_BEGIN(_y,_x) \
|
||||||
/*r3=D'*/ \
|
/*r3=D'*/ \
|
||||||
__asm movq mm3,OC_I(2) \
|
__asm movq mm3,OC_I(2,_y) \
|
||||||
/*r4=E'=E-G*/ \
|
/*r4=E'=E-G*/ \
|
||||||
__asm psubw mm4,mm7 \
|
__asm psubw mm4,mm7 \
|
||||||
/*r1=H'+H'*/ \
|
/*r1=H'+H'*/ \
|
||||||
@ -157,7 +156,7 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
__asm psubw mm7,mm0 \
|
__asm psubw mm7,mm0 \
|
||||||
__asm paddw mm0,mm0 \
|
__asm paddw mm0,mm0 \
|
||||||
/*Save R1.*/ \
|
/*Save R1.*/ \
|
||||||
__asm movq OC_I(1),mm1 \
|
__asm movq OC_I(1,_y),mm1 \
|
||||||
/*r0=R0=G.+C.*/ \
|
/*r0=R0=G.+C.*/ \
|
||||||
__asm paddw mm0,mm7 \
|
__asm paddw mm0,mm7 \
|
||||||
}
|
}
|
||||||
@ -190,10 +189,10 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
|
|
||||||
Since r1 is free at entry, we calculate the Js first.*/
|
Since r1 is free at entry, we calculate the Js first.*/
|
||||||
/*19 cycles.*/
|
/*19 cycles.*/
|
||||||
#define OC_TRANSPOSE __asm{ \
|
#define OC_TRANSPOSE(_y) __asm{ \
|
||||||
__asm movq mm1,mm4 \
|
__asm movq mm1,mm4 \
|
||||||
__asm punpcklwd mm4,mm5 \
|
__asm punpcklwd mm4,mm5 \
|
||||||
__asm movq OC_I(0),mm0 \
|
__asm movq OC_I(0,_y),mm0 \
|
||||||
__asm punpckhwd mm1,mm5 \
|
__asm punpckhwd mm1,mm5 \
|
||||||
__asm movq mm0,mm6 \
|
__asm movq mm0,mm6 \
|
||||||
__asm punpcklwd mm6,mm7 \
|
__asm punpcklwd mm6,mm7 \
|
||||||
@ -201,17 +200,17 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
__asm punpckldq mm4,mm6 \
|
__asm punpckldq mm4,mm6 \
|
||||||
__asm punpckhdq mm5,mm6 \
|
__asm punpckhdq mm5,mm6 \
|
||||||
__asm movq mm6,mm1 \
|
__asm movq mm6,mm1 \
|
||||||
__asm movq OC_J(4),mm4 \
|
__asm movq OC_J(4,_y),mm4 \
|
||||||
__asm punpckhwd mm0,mm7 \
|
__asm punpckhwd mm0,mm7 \
|
||||||
__asm movq OC_J(5),mm5 \
|
__asm movq OC_J(5,_y),mm5 \
|
||||||
__asm punpckhdq mm6,mm0 \
|
__asm punpckhdq mm6,mm0 \
|
||||||
__asm movq mm4,OC_I(0) \
|
__asm movq mm4,OC_I(0,_y) \
|
||||||
__asm punpckldq mm1,mm0 \
|
__asm punpckldq mm1,mm0 \
|
||||||
__asm movq mm5,OC_I(1) \
|
__asm movq mm5,OC_I(1,_y) \
|
||||||
__asm movq mm0,mm4 \
|
__asm movq mm0,mm4 \
|
||||||
__asm movq OC_J(7),mm6 \
|
__asm movq OC_J(7,_y),mm6 \
|
||||||
__asm punpcklwd mm0,mm5 \
|
__asm punpcklwd mm0,mm5 \
|
||||||
__asm movq OC_J(6),mm1 \
|
__asm movq OC_J(6,_y),mm1 \
|
||||||
__asm punpckhwd mm4,mm5 \
|
__asm punpckhwd mm4,mm5 \
|
||||||
__asm movq mm5,mm2 \
|
__asm movq mm5,mm2 \
|
||||||
__asm punpcklwd mm2,mm3 \
|
__asm punpcklwd mm2,mm3 \
|
||||||
@ -219,18 +218,18 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
__asm punpckldq mm0,mm2 \
|
__asm punpckldq mm0,mm2 \
|
||||||
__asm punpckhdq mm1,mm2 \
|
__asm punpckhdq mm1,mm2 \
|
||||||
__asm movq mm2,mm4 \
|
__asm movq mm2,mm4 \
|
||||||
__asm movq OC_I(0),mm0 \
|
__asm movq OC_I(0,_y),mm0 \
|
||||||
__asm punpckhwd mm5,mm3 \
|
__asm punpckhwd mm5,mm3 \
|
||||||
__asm movq OC_I(1),mm1 \
|
__asm movq OC_I(1,_y),mm1 \
|
||||||
__asm punpckhdq mm4,mm5 \
|
__asm punpckhdq mm4,mm5 \
|
||||||
__asm punpckldq mm2,mm5 \
|
__asm punpckldq mm2,mm5 \
|
||||||
__asm movq OC_I(3),mm4 \
|
__asm movq OC_I(3,_y),mm4 \
|
||||||
__asm movq OC_I(2),mm2 \
|
__asm movq OC_I(2,_y),mm2 \
|
||||||
}
|
}
|
||||||
|
|
||||||
/*38+19=57 cycles.*/
|
/*38+19=57 cycles.*/
|
||||||
#define OC_COLUMN_IDCT __asm{ \
|
#define OC_COLUMN_IDCT(_y) __asm{ \
|
||||||
OC_IDCT_BEGIN \
|
OC_IDCT_BEGIN(_y,_y) \
|
||||||
__asm paddw mm2,OC_8 \
|
__asm paddw mm2,OC_8 \
|
||||||
/*r1=H'+H'*/ \
|
/*r1=H'+H'*/ \
|
||||||
__asm paddw mm1,mm1 \
|
__asm paddw mm1,mm1 \
|
||||||
@ -243,15 +242,15 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
/*r1=NR1*/ \
|
/*r1=NR1*/ \
|
||||||
__asm psraw mm1,4 \
|
__asm psraw mm1,4 \
|
||||||
/*r3=D'*/ \
|
/*r3=D'*/ \
|
||||||
__asm movq mm3,OC_I(2) \
|
__asm movq mm3,OC_I(2,_y) \
|
||||||
/*r7=G+G*/ \
|
/*r7=G+G*/ \
|
||||||
__asm paddw mm7,mm7 \
|
__asm paddw mm7,mm7 \
|
||||||
/*Store NR2 at I(2).*/ \
|
/*Store NR2 at I(2).*/ \
|
||||||
__asm movq OC_I(2),mm2 \
|
__asm movq OC_I(2,_y),mm2 \
|
||||||
/*r7=G'=E+G*/ \
|
/*r7=G'=E+G*/ \
|
||||||
__asm paddw mm7,mm4 \
|
__asm paddw mm7,mm4 \
|
||||||
/*Store NR1 at I(1).*/ \
|
/*Store NR1 at I(1).*/ \
|
||||||
__asm movq OC_I(1),mm1 \
|
__asm movq OC_I(1,_y),mm1 \
|
||||||
/*r4=R4=E'-D'*/ \
|
/*r4=R4=E'-D'*/ \
|
||||||
__asm psubw mm4,mm3 \
|
__asm psubw mm4,mm3 \
|
||||||
__asm paddw mm4,OC_8 \
|
__asm paddw mm4,OC_8 \
|
||||||
@ -273,11 +272,11 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
/*r6=NR6*/ \
|
/*r6=NR6*/ \
|
||||||
__asm psraw mm6,4 \
|
__asm psraw mm6,4 \
|
||||||
/*Store NR4 at J(4).*/ \
|
/*Store NR4 at J(4).*/ \
|
||||||
__asm movq OC_J(4),mm4 \
|
__asm movq OC_J(4,_y),mm4 \
|
||||||
/*r5=NR5*/ \
|
/*r5=NR5*/ \
|
||||||
__asm psraw mm5,4 \
|
__asm psraw mm5,4 \
|
||||||
/*Store NR3 at I(3).*/ \
|
/*Store NR3 at I(3).*/ \
|
||||||
__asm movq OC_I(3),mm3 \
|
__asm movq OC_I(3,_y),mm3 \
|
||||||
/*r7=R7=G'-C'*/ \
|
/*r7=R7=G'-C'*/ \
|
||||||
__asm psubw mm7,mm0 \
|
__asm psubw mm7,mm0 \
|
||||||
__asm paddw mm7,OC_8 \
|
__asm paddw mm7,OC_8 \
|
||||||
@ -288,71 +287,89 @@ static const __declspec(align(16))ogg_uint16_t
|
|||||||
/*r7=NR7*/ \
|
/*r7=NR7*/ \
|
||||||
__asm psraw mm7,4 \
|
__asm psraw mm7,4 \
|
||||||
/*Store NR6 at J(6).*/ \
|
/*Store NR6 at J(6).*/ \
|
||||||
__asm movq OC_J(6),mm6 \
|
__asm movq OC_J(6,_y),mm6 \
|
||||||
/*r0=NR0*/ \
|
/*r0=NR0*/ \
|
||||||
__asm psraw mm0,4 \
|
__asm psraw mm0,4 \
|
||||||
/*Store NR5 at J(5).*/ \
|
/*Store NR5 at J(5).*/ \
|
||||||
__asm movq OC_J(5),mm5 \
|
__asm movq OC_J(5,_y),mm5 \
|
||||||
/*Store NR7 at J(7).*/ \
|
/*Store NR7 at J(7).*/ \
|
||||||
__asm movq OC_J(7),mm7 \
|
__asm movq OC_J(7,_y),mm7 \
|
||||||
/*Store NR0 at I(0).*/ \
|
/*Store NR0 at I(0).*/ \
|
||||||
__asm movq OC_I(0),mm0 \
|
__asm movq OC_I(0,_y),mm0 \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
|
#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
|
||||||
#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
|
#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
|
||||||
#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
|
#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
|
||||||
|
|
||||||
static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||||
|
int i;
|
||||||
/*This routine accepts an 8x8 matrix, but in partially transposed form.
|
/*This routine accepts an 8x8 matrix, but in partially transposed form.
|
||||||
Every 4x4 block is transposed.*/
|
Every 4x4 block is transposed.*/
|
||||||
__asm{
|
__asm{
|
||||||
#define CONSTS eax
|
#define CONSTS eax
|
||||||
#define Y edx
|
#define Y edx
|
||||||
|
#define X ecx
|
||||||
mov CONSTS,offset OC_IDCT_CONSTS
|
mov CONSTS,offset OC_IDCT_CONSTS
|
||||||
mov Y,_y
|
mov Y,_y
|
||||||
#define OC_I(_k) [Y+_k*16]
|
mov X,_x
|
||||||
#define OC_J(_k) [Y+(_k-4)*16+8]
|
#define OC_I(_k,_y) [(_y)+(_k)*16]
|
||||||
OC_ROW_IDCT
|
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
|
||||||
OC_TRANSPOSE
|
OC_ROW_IDCT(Y,X)
|
||||||
|
OC_TRANSPOSE(Y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) [Y+(_k*16)+64]
|
#define OC_I(_k,_y) [(_y)+(_k)*16+64]
|
||||||
#define OC_J(_k) [Y+(_k-4)*16+72]
|
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+72]
|
||||||
OC_ROW_IDCT
|
OC_ROW_IDCT(Y,X)
|
||||||
OC_TRANSPOSE
|
OC_TRANSPOSE(Y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) [Y+_k*16]
|
#define OC_I(_k,_y) [(_y)+(_k)*16]
|
||||||
#define OC_J(_k) OC_I(_k)
|
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||||
OC_COLUMN_IDCT
|
OC_COLUMN_IDCT(Y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) [Y+_k*16+8]
|
#define OC_I(_k,_y) [(_y)+(_k)*16+8]
|
||||||
#define OC_J(_k) OC_I(_k)
|
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||||
OC_COLUMN_IDCT
|
OC_COLUMN_IDCT(Y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#undef CONSTS
|
#undef CONSTS
|
||||||
#undef Y
|
#undef Y
|
||||||
|
#undef X
|
||||||
|
}
|
||||||
|
__asm pxor mm0,mm0;
|
||||||
|
for(i=0;i<4;i++){
|
||||||
|
ogg_int16_t *x;
|
||||||
|
x=_x+16*i;
|
||||||
|
#define X ecx
|
||||||
|
__asm{
|
||||||
|
mov X,x
|
||||||
|
movq [X+0x00],mm0
|
||||||
|
movq [X+0x08],mm0
|
||||||
|
movq [X+0x10],mm0
|
||||||
|
movq [X+0x18],mm0
|
||||||
|
}
|
||||||
|
#undef X
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*25 cycles.*/
|
/*25 cycles.*/
|
||||||
#define OC_IDCT_BEGIN_10 __asm{ \
|
#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
|
||||||
__asm movq mm2,OC_I(3) \
|
__asm movq mm2,OC_I(3,_x) \
|
||||||
__asm nop \
|
__asm nop \
|
||||||
__asm movq mm6,OC_C(3) \
|
__asm movq mm6,OC_C(3) \
|
||||||
__asm movq mm4,mm2 \
|
__asm movq mm4,mm2 \
|
||||||
__asm movq mm1,OC_C(5) \
|
__asm movq mm1,OC_C(5) \
|
||||||
__asm pmulhw mm4,mm6 \
|
__asm pmulhw mm4,mm6 \
|
||||||
__asm movq mm3,OC_I(1) \
|
__asm movq mm3,OC_I(1,_x) \
|
||||||
__asm pmulhw mm1,mm2 \
|
__asm pmulhw mm1,mm2 \
|
||||||
__asm movq mm0,OC_C(1) \
|
__asm movq mm0,OC_C(1) \
|
||||||
__asm paddw mm4,mm2 \
|
__asm paddw mm4,mm2 \
|
||||||
__asm pxor mm6,mm6 \
|
__asm pxor mm6,mm6 \
|
||||||
__asm paddw mm2,mm1 \
|
__asm paddw mm2,mm1 \
|
||||||
__asm movq mm5,OC_I(2) \
|
__asm movq mm5,OC_I(2,_x) \
|
||||||
__asm pmulhw mm0,mm3 \
|
__asm pmulhw mm0,mm3 \
|
||||||
__asm movq mm1,mm5 \
|
__asm movq mm1,mm5 \
|
||||||
__asm paddw mm0,mm3 \
|
__asm paddw mm0,mm3 \
|
||||||
@ -360,43 +377,43 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
__asm psubw mm6,mm2 \
|
__asm psubw mm6,mm2 \
|
||||||
__asm pmulhw mm5,OC_C(2) \
|
__asm pmulhw mm5,OC_C(2) \
|
||||||
__asm psubw mm0,mm4 \
|
__asm psubw mm0,mm4 \
|
||||||
__asm movq mm7,OC_I(2) \
|
__asm movq mm7,OC_I(2,_x) \
|
||||||
__asm paddw mm4,mm4 \
|
__asm paddw mm4,mm4 \
|
||||||
__asm paddw mm7,mm5 \
|
__asm paddw mm7,mm5 \
|
||||||
__asm paddw mm4,mm0 \
|
__asm paddw mm4,mm0 \
|
||||||
__asm pmulhw mm1,OC_C(6) \
|
__asm pmulhw mm1,OC_C(6) \
|
||||||
__asm psubw mm3,mm6 \
|
__asm psubw mm3,mm6 \
|
||||||
__asm movq OC_I(1),mm4 \
|
__asm movq OC_I(1,_y),mm4 \
|
||||||
__asm paddw mm6,mm6 \
|
__asm paddw mm6,mm6 \
|
||||||
__asm movq mm4,OC_C(4) \
|
__asm movq mm4,OC_C(4) \
|
||||||
__asm paddw mm6,mm3 \
|
__asm paddw mm6,mm3 \
|
||||||
__asm movq mm5,mm3 \
|
__asm movq mm5,mm3 \
|
||||||
__asm pmulhw mm3,mm4 \
|
__asm pmulhw mm3,mm4 \
|
||||||
__asm movq OC_I(2),mm6 \
|
__asm movq OC_I(2,_y),mm6 \
|
||||||
__asm movq mm2,mm0 \
|
__asm movq mm2,mm0 \
|
||||||
__asm movq mm6,OC_I(0) \
|
__asm movq mm6,OC_I(0,_x) \
|
||||||
__asm pmulhw mm0,mm4 \
|
__asm pmulhw mm0,mm4 \
|
||||||
__asm paddw mm5,mm3 \
|
__asm paddw mm5,mm3 \
|
||||||
__asm paddw mm2,mm0 \
|
__asm paddw mm2,mm0 \
|
||||||
__asm psubw mm5,mm1 \
|
__asm psubw mm5,mm1 \
|
||||||
__asm pmulhw mm6,mm4 \
|
__asm pmulhw mm6,mm4 \
|
||||||
__asm paddw mm6,OC_I(0) \
|
__asm paddw mm6,OC_I(0,_x) \
|
||||||
__asm paddw mm1,mm1 \
|
__asm paddw mm1,mm1 \
|
||||||
__asm movq mm4,mm6 \
|
__asm movq mm4,mm6 \
|
||||||
__asm paddw mm1,mm5 \
|
__asm paddw mm1,mm5 \
|
||||||
__asm psubw mm6,mm2 \
|
__asm psubw mm6,mm2 \
|
||||||
__asm paddw mm2,mm2 \
|
__asm paddw mm2,mm2 \
|
||||||
__asm movq mm0,OC_I(1) \
|
__asm movq mm0,OC_I(1,_y) \
|
||||||
__asm paddw mm2,mm6 \
|
__asm paddw mm2,mm6 \
|
||||||
__asm psubw mm2,mm1 \
|
__asm psubw mm2,mm1 \
|
||||||
__asm nop \
|
__asm nop \
|
||||||
}
|
}
|
||||||
|
|
||||||
/*25+8=33 cycles.*/
|
/*25+8=33 cycles.*/
|
||||||
#define OC_ROW_IDCT_10 __asm{ \
|
#define OC_ROW_IDCT_10(_y,_x) __asm{ \
|
||||||
OC_IDCT_BEGIN_10 \
|
OC_IDCT_BEGIN_10(_y,_x) \
|
||||||
/*r3=D'*/ \
|
/*r3=D'*/ \
|
||||||
__asm movq mm3,OC_I(2) \
|
__asm movq mm3,OC_I(2,_y) \
|
||||||
/*r4=E'=E-G*/ \
|
/*r4=E'=E-G*/ \
|
||||||
__asm psubw mm4,mm7 \
|
__asm psubw mm4,mm7 \
|
||||||
/*r1=H'+H'*/ \
|
/*r1=H'+H'*/ \
|
||||||
@ -421,14 +438,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
__asm psubw mm7,mm0 \
|
__asm psubw mm7,mm0 \
|
||||||
__asm paddw mm0,mm0 \
|
__asm paddw mm0,mm0 \
|
||||||
/*Save R1.*/ \
|
/*Save R1.*/ \
|
||||||
__asm movq OC_I(1),mm1 \
|
__asm movq OC_I(1,_y),mm1 \
|
||||||
/*r0=R0=G'+C'*/ \
|
/*r0=R0=G'+C'*/ \
|
||||||
__asm paddw mm0,mm7 \
|
__asm paddw mm0,mm7 \
|
||||||
}
|
}
|
||||||
|
|
||||||
/*25+19=44 cycles'*/
|
/*25+19=44 cycles'*/
|
||||||
#define OC_COLUMN_IDCT_10 __asm{ \
|
#define OC_COLUMN_IDCT_10(_y) __asm{ \
|
||||||
OC_IDCT_BEGIN_10 \
|
OC_IDCT_BEGIN_10(_y,_y) \
|
||||||
__asm paddw mm2,OC_8 \
|
__asm paddw mm2,OC_8 \
|
||||||
/*r1=H'+H'*/ \
|
/*r1=H'+H'*/ \
|
||||||
__asm paddw mm1,mm1 \
|
__asm paddw mm1,mm1 \
|
||||||
@ -441,15 +458,15 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
/*r1=NR1*/ \
|
/*r1=NR1*/ \
|
||||||
__asm psraw mm1,4 \
|
__asm psraw mm1,4 \
|
||||||
/*r3=D'*/ \
|
/*r3=D'*/ \
|
||||||
__asm movq mm3,OC_I(2) \
|
__asm movq mm3,OC_I(2,_y) \
|
||||||
/*r7=G+G*/ \
|
/*r7=G+G*/ \
|
||||||
__asm paddw mm7,mm7 \
|
__asm paddw mm7,mm7 \
|
||||||
/*Store NR2 at I(2).*/ \
|
/*Store NR2 at I(2).*/ \
|
||||||
__asm movq OC_I(2),mm2 \
|
__asm movq OC_I(2,_y),mm2 \
|
||||||
/*r7=G'=E+G*/ \
|
/*r7=G'=E+G*/ \
|
||||||
__asm paddw mm7,mm4 \
|
__asm paddw mm7,mm4 \
|
||||||
/*Store NR1 at I(1).*/ \
|
/*Store NR1 at I(1).*/ \
|
||||||
__asm movq OC_I(1),mm1 \
|
__asm movq OC_I(1,_y),mm1 \
|
||||||
/*r4=R4=E'-D'*/ \
|
/*r4=R4=E'-D'*/ \
|
||||||
__asm psubw mm4,mm3 \
|
__asm psubw mm4,mm3 \
|
||||||
__asm paddw mm4,OC_8 \
|
__asm paddw mm4,OC_8 \
|
||||||
@ -471,11 +488,11 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
/*r6=NR6*/ \
|
/*r6=NR6*/ \
|
||||||
__asm psraw mm6,4 \
|
__asm psraw mm6,4 \
|
||||||
/*Store NR4 at J(4).*/ \
|
/*Store NR4 at J(4).*/ \
|
||||||
__asm movq OC_J(4),mm4 \
|
__asm movq OC_J(4,_y),mm4 \
|
||||||
/*r5=NR5*/ \
|
/*r5=NR5*/ \
|
||||||
__asm psraw mm5,4 \
|
__asm psraw mm5,4 \
|
||||||
/*Store NR3 at I(3).*/ \
|
/*Store NR3 at I(3).*/ \
|
||||||
__asm movq OC_I(3),mm3 \
|
__asm movq OC_I(3,_y),mm3 \
|
||||||
/*r7=R7=G'-C'*/ \
|
/*r7=R7=G'-C'*/ \
|
||||||
__asm psubw mm7,mm0 \
|
__asm psubw mm7,mm0 \
|
||||||
__asm paddw mm7,OC_8 \
|
__asm paddw mm7,OC_8 \
|
||||||
@ -486,50 +503,63 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
|
|||||||
/*r7=NR7*/ \
|
/*r7=NR7*/ \
|
||||||
__asm psraw mm7,4 \
|
__asm psraw mm7,4 \
|
||||||
/*Store NR6 at J(6).*/ \
|
/*Store NR6 at J(6).*/ \
|
||||||
__asm movq OC_J(6),mm6 \
|
__asm movq OC_J(6,_y),mm6 \
|
||||||
/*r0=NR0*/ \
|
/*r0=NR0*/ \
|
||||||
__asm psraw mm0,4 \
|
__asm psraw mm0,4 \
|
||||||
/*Store NR5 at J(5).*/ \
|
/*Store NR5 at J(5).*/ \
|
||||||
__asm movq OC_J(5),mm5 \
|
__asm movq OC_J(5,_y),mm5 \
|
||||||
/*Store NR7 at J(7).*/ \
|
/*Store NR7 at J(7).*/ \
|
||||||
__asm movq OC_J(7),mm7 \
|
__asm movq OC_J(7,_y),mm7 \
|
||||||
/*Store NR0 at I(0).*/ \
|
/*Store NR0 at I(0).*/ \
|
||||||
__asm movq OC_I(0),mm0 \
|
__asm movq OC_I(0,_y),mm0 \
|
||||||
}
|
}
|
||||||
|
|
||||||
static void oc_idct8x8_10(ogg_int16_t _y[64]){
|
static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||||
__asm{
|
__asm{
|
||||||
#define CONSTS eax
|
#define CONSTS eax
|
||||||
#define Y edx
|
#define Y edx
|
||||||
|
#define X ecx
|
||||||
mov CONSTS,offset OC_IDCT_CONSTS
|
mov CONSTS,offset OC_IDCT_CONSTS
|
||||||
mov Y,_y
|
mov Y,_y
|
||||||
#define OC_I(_k) [Y+_k*16]
|
mov X,_x
|
||||||
#define OC_J(_k) [Y+(_k-4)*16+8]
|
#define OC_I(_k,_y) [(_y)+(_k)*16]
|
||||||
|
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
|
||||||
/*Done with dequant, descramble, and partial transpose.
|
/*Done with dequant, descramble, and partial transpose.
|
||||||
Now do the iDCT itself.*/
|
Now do the iDCT itself.*/
|
||||||
OC_ROW_IDCT_10
|
OC_ROW_IDCT_10(Y,X)
|
||||||
OC_TRANSPOSE
|
OC_TRANSPOSE(Y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) [Y+_k*16]
|
#define OC_I(_k,_y) [(_y)+(_k)*16]
|
||||||
#define OC_J(_k) OC_I(_k)
|
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||||
OC_COLUMN_IDCT_10
|
OC_COLUMN_IDCT_10(Y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#define OC_I(_k) [Y+_k*16+8]
|
#define OC_I(_k,_y) [(_y)+(_k)*16+8]
|
||||||
#define OC_J(_k) OC_I(_k)
|
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||||
OC_COLUMN_IDCT_10
|
OC_COLUMN_IDCT_10(Y)
|
||||||
#undef OC_I
|
#undef OC_I
|
||||||
#undef OC_J
|
#undef OC_J
|
||||||
#undef CONSTS
|
#undef CONSTS
|
||||||
#undef Y
|
#undef Y
|
||||||
|
#undef X
|
||||||
}
|
}
|
||||||
|
#define X ecx
|
||||||
|
__asm{
|
||||||
|
pxor mm0,mm0;
|
||||||
|
mov X,_x
|
||||||
|
movq [X+0x00],mm0
|
||||||
|
movq [X+0x10],mm0
|
||||||
|
movq [X+0x20],mm0
|
||||||
|
movq [X+0x30],mm0
|
||||||
|
}
|
||||||
|
#undef X
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||||
version of the transform.*/
|
version of the transform.*/
|
||||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
|
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||||
/*_last_zzi is subtly different from an actual count of the number of
|
/*_last_zzi is subtly different from an actual count of the number of
|
||||||
coefficients we decoded for this block.
|
coefficients we decoded for this block.
|
||||||
It contains the value of zzi BEFORE the final token in the block was
|
It contains the value of zzi BEFORE the final token in the block was
|
||||||
@ -555,8 +585,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
|
|||||||
gets.
|
gets.
|
||||||
Needless to say we inherited this approach from VP3.*/
|
Needless to say we inherited this approach from VP3.*/
|
||||||
/*Perform the iDCT.*/
|
/*Perform the iDCT.*/
|
||||||
if(_last_zzi<10)oc_idct8x8_10(_y);
|
if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
|
||||||
else oc_idct8x8_slow(_y);
|
else oc_idct8x8_slow(_y,_x);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
75
thirdparty/libtheora/x86_vc/mmxstate.c
vendored
75
thirdparty/libtheora/x86_vc/mmxstate.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -19,17 +19,16 @@
|
|||||||
Originally written by Rudolf Marek.*/
|
Originally written by Rudolf Marek.*/
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "x86int.h"
|
#include "x86int.h"
|
||||||
#include "mmxfrag.h"
|
|
||||||
#include "mmxloop.h"
|
#include "mmxloop.h"
|
||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
|
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
|
||||||
unsigned char *dst;
|
unsigned char *dst;
|
||||||
ptrdiff_t frag_buf_off;
|
ptrdiff_t frag_buf_off;
|
||||||
int ystride;
|
int ystride;
|
||||||
int mb_mode;
|
int refi;
|
||||||
/*Apply the inverse transform.*/
|
/*Apply the inverse transform.*/
|
||||||
/*Special case only having a DC component.*/
|
/*Special case only having a DC component.*/
|
||||||
if(_last_zzi<2){
|
if(_last_zzi<2){
|
||||||
@ -45,6 +44,7 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
|||||||
#define P ecx
|
#define P ecx
|
||||||
mov Y,_dct_coeffs
|
mov Y,_dct_coeffs
|
||||||
movzx P,p
|
movzx P,p
|
||||||
|
lea Y,[Y+128]
|
||||||
/*mm0=0000 0000 0000 AAAA*/
|
/*mm0=0000 0000 0000 AAAA*/
|
||||||
movd mm0,P
|
movd mm0,P
|
||||||
/*mm0=0000 0000 AAAA AAAA*/
|
/*mm0=0000 0000 AAAA AAAA*/
|
||||||
@ -74,65 +74,32 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
|||||||
else{
|
else{
|
||||||
/*Dequantize the DC coefficient.*/
|
/*Dequantize the DC coefficient.*/
|
||||||
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
|
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
|
||||||
oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
|
oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
|
||||||
}
|
}
|
||||||
/*Fill in the target buffer.*/
|
/*Fill in the target buffer.*/
|
||||||
frag_buf_off=_state->frag_buf_offs[_fragi];
|
frag_buf_off=_state->frag_buf_offs[_fragi];
|
||||||
mb_mode=_state->frags[_fragi].mb_mode;
|
refi=_state->frags[_fragi].refi;
|
||||||
ystride=_state->ref_ystride[_pli];
|
ystride=_state->ref_ystride[_pli];
|
||||||
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
|
dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
|
||||||
if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
|
if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
|
||||||
else{
|
else{
|
||||||
const unsigned char *ref;
|
const unsigned char *ref;
|
||||||
int mvoffsets[2];
|
int mvoffsets[2];
|
||||||
ref=
|
ref=_state->ref_frame_data[refi]+frag_buf_off;
|
||||||
_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
|
|
||||||
+frag_buf_off;
|
|
||||||
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
|
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
|
||||||
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
|
_state->frag_mvs[_fragi])>1){
|
||||||
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
|
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
|
||||||
_dct_coeffs);
|
_dct_coeffs+64);
|
||||||
}
|
}
|
||||||
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
|
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*We copy these entire function to inline the actual MMX routines so that we
|
/*We copy these entire function to inline the actual MMX routines so that we
|
||||||
use only a single indirect call.*/
|
use only a single indirect call.*/
|
||||||
|
|
||||||
/*Copies the fragments specified by the lists of fragment indices from one
|
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
|
||||||
frame to another.
|
memset(_bv,~(_flimit<<1),8);
|
||||||
_fragis: A pointer to a list of fragment indices.
|
|
||||||
_nfragis: The number of fragment indices to copy.
|
|
||||||
_dst_frame: The reference frame to copy to.
|
|
||||||
_src_frame: The reference frame to copy from.
|
|
||||||
_pli: The color plane the fragments lie in.*/
|
|
||||||
void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
|
||||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
|
||||||
int _dst_frame,int _src_frame,int _pli){
|
|
||||||
const ptrdiff_t *frag_buf_offs;
|
|
||||||
const unsigned char *src_frame_data;
|
|
||||||
unsigned char *dst_frame_data;
|
|
||||||
ptrdiff_t fragii;
|
|
||||||
int ystride;
|
|
||||||
dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
|
|
||||||
src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
|
|
||||||
ystride=_state->ref_ystride[_pli];
|
|
||||||
frag_buf_offs=_state->frag_buf_offs;
|
|
||||||
for(fragii=0;fragii<_nfragis;fragii++){
|
|
||||||
ptrdiff_t frag_buf_off;
|
|
||||||
frag_buf_off=frag_buf_offs[_fragis[fragii]];
|
|
||||||
#define SRC edx
|
|
||||||
#define DST eax
|
|
||||||
#define YSTRIDE ecx
|
|
||||||
#define YSTRIDE3 edi
|
|
||||||
OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
|
|
||||||
src_frame_data+frag_buf_off,ystride);
|
|
||||||
#undef SRC
|
|
||||||
#undef DST
|
|
||||||
#undef YSTRIDE
|
|
||||||
#undef YSTRIDE3
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||||
@ -144,8 +111,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
|||||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||||
OC_ALIGN8(unsigned char ll[8]);
|
|
||||||
const oc_fragment_plane *fplane;
|
const oc_fragment_plane *fplane;
|
||||||
const oc_fragment *frags;
|
const oc_fragment *frags;
|
||||||
const ptrdiff_t *frag_buf_offs;
|
const ptrdiff_t *frag_buf_offs;
|
||||||
@ -156,13 +122,12 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
|||||||
ptrdiff_t fragi0_end;
|
ptrdiff_t fragi0_end;
|
||||||
int ystride;
|
int ystride;
|
||||||
int nhfrags;
|
int nhfrags;
|
||||||
memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
|
|
||||||
fplane=_state->fplanes+_pli;
|
fplane=_state->fplanes+_pli;
|
||||||
nhfrags=fplane->nhfrags;
|
nhfrags=fplane->nhfrags;
|
||||||
fragi_top=fplane->froffset;
|
fragi_top=fplane->froffset;
|
||||||
fragi_bot=fragi_top+fplane->nfrags;
|
fragi_bot=fragi_top+fplane->nfrags;
|
||||||
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
||||||
fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
|
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
|
||||||
ystride=_state->ref_ystride[_pli];
|
ystride=_state->ref_ystride[_pli];
|
||||||
frags=_state->frags;
|
frags=_state->frags;
|
||||||
frag_buf_offs=_state->frag_buf_offs;
|
frag_buf_offs=_state->frag_buf_offs;
|
||||||
@ -187,13 +152,13 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
|||||||
#define LL edx
|
#define LL edx
|
||||||
#define D esi
|
#define D esi
|
||||||
#define D_WORD si
|
#define D_WORD si
|
||||||
if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
|
if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
|
||||||
if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
|
if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
|
||||||
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
||||||
OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
|
OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
|
||||||
}
|
}
|
||||||
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
||||||
OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
|
OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv);
|
||||||
}
|
}
|
||||||
#undef PIX
|
#undef PIX
|
||||||
#undef YSTRIDE3
|
#undef YSTRIDE3
|
||||||
|
@ -14,41 +14,17 @@
|
|||||||
Originally written by Rudolf Marek.
|
Originally written by Rudolf Marek.
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: cpu.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
#include "cpu.h"
|
#include "x86cpu.h"
|
||||||
|
|
||||||
#if !defined(OC_X86_ASM)
|
#if !defined(OC_X86_ASM)
|
||||||
static ogg_uint32_t oc_cpu_flags_get(void){
|
ogg_uint32_t oc_cpu_flags_get(void){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
# if !defined(_MSC_VER)
|
|
||||||
# if defined(__amd64__)||defined(__x86_64__)
|
|
||||||
/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
|
|
||||||
compiling with -fPIC.*/
|
|
||||||
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
|
||||||
__asm__ __volatile__( \
|
|
||||||
"cpuid\n\t" \
|
|
||||||
:[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
|
||||||
:"a"(_op) \
|
|
||||||
:"cc" \
|
|
||||||
)
|
|
||||||
# else
|
|
||||||
/*On x86-32, not so much.*/
|
|
||||||
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
|
||||||
__asm__ __volatile__( \
|
|
||||||
"xchgl %%ebx,%[ebx]\n\t" \
|
|
||||||
"cpuid\n\t" \
|
|
||||||
"xchgl %%ebx,%[ebx]\n\t" \
|
|
||||||
:[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
|
||||||
:"a"(_op) \
|
|
||||||
:"cc" \
|
|
||||||
)
|
|
||||||
# endif
|
|
||||||
# else
|
|
||||||
/*Why does MSVC need this complicated rigamarole?
|
/*Why does MSVC need this complicated rigamarole?
|
||||||
At this point I honestly do not care.*/
|
At this point I honestly do not care.*/
|
||||||
|
|
||||||
@ -95,7 +71,6 @@ static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
|
|||||||
mov [ecx],ebx
|
mov [ecx],ebx
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
# endif
|
|
||||||
|
|
||||||
static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
|
static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
|
||||||
ogg_uint32_t flags;
|
ogg_uint32_t flags;
|
||||||
@ -124,7 +99,7 @@ static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
|
|||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ogg_uint32_t oc_cpu_flags_get(void){
|
ogg_uint32_t oc_cpu_flags_get(void){
|
||||||
ogg_uint32_t flags;
|
ogg_uint32_t flags;
|
||||||
ogg_uint32_t eax;
|
ogg_uint32_t eax;
|
||||||
ogg_uint32_t ebx;
|
ogg_uint32_t ebx;
|
||||||
@ -132,25 +107,7 @@ static ogg_uint32_t oc_cpu_flags_get(void){
|
|||||||
ogg_uint32_t edx;
|
ogg_uint32_t edx;
|
||||||
# if !defined(__amd64__)&&!defined(__x86_64__)
|
# if !defined(__amd64__)&&!defined(__x86_64__)
|
||||||
/*Not all x86-32 chips support cpuid, so we have to check.*/
|
/*Not all x86-32 chips support cpuid, so we have to check.*/
|
||||||
# if !defined(_MSC_VER)
|
|
||||||
__asm__ __volatile__(
|
|
||||||
"pushfl\n\t"
|
|
||||||
"pushfl\n\t"
|
|
||||||
"popl %[a]\n\t"
|
|
||||||
"movl %[a],%[b]\n\t"
|
|
||||||
"xorl $0x200000,%[a]\n\t"
|
|
||||||
"pushl %[a]\n\t"
|
|
||||||
"popfl\n\t"
|
|
||||||
"pushfl\n\t"
|
|
||||||
"popl %[a]\n\t"
|
|
||||||
"popfl\n\t"
|
|
||||||
:[a]"=r"(eax),[b]"=r"(ebx)
|
|
||||||
:
|
|
||||||
:"cc"
|
|
||||||
);
|
|
||||||
# else
|
|
||||||
oc_detect_cpuid_helper(&eax,&ebx);
|
oc_detect_cpuid_helper(&eax,&ebx);
|
||||||
# endif
|
|
||||||
/*No cpuid.*/
|
/*No cpuid.*/
|
||||||
if(eax==ebx)return 0;
|
if(eax==ebx)return 0;
|
||||||
# endif
|
# endif
|
||||||
@ -159,9 +116,18 @@ static ogg_uint32_t oc_cpu_flags_get(void){
|
|||||||
if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
|
if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
|
||||||
/* 6 8 x M T e n i u n e G*/
|
/* 6 8 x M T e n i u n e G*/
|
||||||
ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
|
ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
|
||||||
|
int family;
|
||||||
|
int model;
|
||||||
/*Intel, Transmeta (tested with Crusoe TM5800):*/
|
/*Intel, Transmeta (tested with Crusoe TM5800):*/
|
||||||
cpuid(1,eax,ebx,ecx,edx);
|
cpuid(1,eax,ebx,ecx,edx);
|
||||||
flags=oc_parse_intel_flags(edx,ecx);
|
flags=oc_parse_intel_flags(edx,ecx);
|
||||||
|
family=(eax>>8)&0xF;
|
||||||
|
model=(eax>>4)&0xF;
|
||||||
|
/*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
|
||||||
|
unit, so don't use it.*/
|
||||||
|
if(family==6&&(model==9||model==13||model==14)){
|
||||||
|
flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/* D M A c i t n e h t u A*/
|
/* D M A c i t n e h t u A*/
|
||||||
else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
|
else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
|
36
thirdparty/libtheora/x86_vc/x86cpu.h
vendored
Normal file
36
thirdparty/libtheora/x86_vc/x86cpu.h
vendored
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
function:
|
||||||
|
last mod: $Id$
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
|
#if !defined(_x86_vc_x86cpu_H)
|
||||||
|
# define _x86_vc_x86cpu_H (1)
|
||||||
|
#include "../internal.h"
|
||||||
|
|
||||||
|
#define OC_CPU_X86_MMX (1<<0)
|
||||||
|
#define OC_CPU_X86_3DNOW (1<<1)
|
||||||
|
#define OC_CPU_X86_3DNOWEXT (1<<2)
|
||||||
|
#define OC_CPU_X86_MMXEXT (1<<3)
|
||||||
|
#define OC_CPU_X86_SSE (1<<4)
|
||||||
|
#define OC_CPU_X86_SSE2 (1<<5)
|
||||||
|
#define OC_CPU_X86_PNI (1<<6)
|
||||||
|
#define OC_CPU_X86_SSSE3 (1<<7)
|
||||||
|
#define OC_CPU_X86_SSE4_1 (1<<8)
|
||||||
|
#define OC_CPU_X86_SSE4_2 (1<<9)
|
||||||
|
#define OC_CPU_X86_SSE4A (1<<10)
|
||||||
|
#define OC_CPU_X86_SSE5 (1<<11)
|
||||||
|
|
||||||
|
ogg_uint32_t oc_cpu_flags_get(void);
|
||||||
|
|
||||||
|
#endif
|
14
thirdparty/libtheora/x86_vc/x86enc.c
vendored
14
thirdparty/libtheora/x86_vc/x86enc.c
vendored
@ -18,27 +18,25 @@
|
|||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
#include "../cpu.c"
|
void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
|
||||||
|
|
||||||
void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
|
|
||||||
ogg_uint32_t cpu_flags;
|
ogg_uint32_t cpu_flags;
|
||||||
cpu_flags=oc_cpu_flags_get();
|
cpu_flags=_enc->state.cpu_flags;
|
||||||
oc_enc_vtable_init_c(_enc);
|
oc_enc_accel_init_c(_enc);
|
||||||
if(cpu_flags&OC_CPU_X86_MMX){
|
if(cpu_flags&OC_CPU_X86_MMX){
|
||||||
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
|
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
|
||||||
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
|
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
|
||||||
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
||||||
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
||||||
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
|
|
||||||
}
|
}
|
||||||
if(cpu_flags&OC_CPU_X86_MMXEXT){
|
if(cpu_flags&OC_CPU_X86_MMXEXT){
|
||||||
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
|
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
|
||||||
_enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
|
_enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
|
||||||
_enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
|
_enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
|
||||||
_enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
|
_enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
|
||||||
_enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
|
_enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
|
||||||
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
|
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
|
||||||
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
|
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
|
||||||
|
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
|
||||||
}
|
}
|
||||||
if(cpu_flags&OC_CPU_X86_SSE2){
|
if(cpu_flags&OC_CPU_X86_SSE2){
|
||||||
# if defined(OC_X86_64_ASM)
|
# if defined(OC_X86_64_ASM)
|
||||||
|
22
thirdparty/libtheora/x86_vc/x86enc.h
vendored
22
thirdparty/libtheora/x86_vc/x86enc.h
vendored
@ -17,10 +17,14 @@
|
|||||||
|
|
||||||
#if !defined(_x86_vc_x86enc_H)
|
#if !defined(_x86_vc_x86enc_H)
|
||||||
# define _x86_vc_x86enc_H (1)
|
# define _x86_vc_x86enc_H (1)
|
||||||
# include "../encint.h"
|
|
||||||
# include "x86int.h"
|
# include "x86int.h"
|
||||||
|
# if defined(OC_X86_ASM)
|
||||||
|
# define oc_enc_accel_init oc_enc_accel_init_x86
|
||||||
|
# define OC_ENC_USE_VTABLE (1)
|
||||||
|
# endif
|
||||||
|
# include "../encint.h"
|
||||||
|
|
||||||
void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
|
void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
|
||||||
|
|
||||||
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride);
|
const unsigned char *_ref,int _ystride);
|
||||||
@ -29,19 +33,19 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
|
|||||||
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||||
unsigned _thresh);
|
unsigned _thresh);
|
||||||
unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
const unsigned char *_ref,int _ystride);
|
||||||
unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
|
unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
|
||||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||||
unsigned _thresh);
|
unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
|
||||||
unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride);
|
const unsigned char *_src,int _ystride);
|
||||||
void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
|
void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
|
||||||
const unsigned char *_x,const unsigned char *_y,int _stride);
|
const unsigned char *_x,const unsigned char *_y,int _stride);
|
||||||
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
|
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
|
||||||
const unsigned char *_x,int _stride);
|
const unsigned char *_x,int _stride);
|
||||||
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
|
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
|
||||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||||
void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||||
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
23
thirdparty/libtheora/x86_vc/x86int.h
vendored
23
thirdparty/libtheora/x86_vc/x86int.h
vendored
@ -11,32 +11,39 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
#if !defined(_x86_vc_x86int_H)
|
#if !defined(_x86_vc_x86int_H)
|
||||||
# define _x86_vc_x86int_H (1)
|
# define _x86_vc_x86int_H (1)
|
||||||
# include "../internal.h"
|
# include "../internal.h"
|
||||||
|
# if defined(OC_X86_ASM)
|
||||||
|
# define oc_state_accel_init oc_state_accel_init_x86
|
||||||
|
# define OC_STATE_USE_VTABLE (1)
|
||||||
|
# endif
|
||||||
|
# include "../state.h"
|
||||||
|
# include "x86cpu.h"
|
||||||
|
|
||||||
void oc_state_vtable_init_x86(oc_theora_state *_state);
|
void oc_state_accel_init_x86(oc_theora_state *_state);
|
||||||
|
|
||||||
void oc_frag_copy_mmx(unsigned char *_dst,
|
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||||
const unsigned char *_src,int _ystride);
|
const unsigned char *_src,int _ystride);
|
||||||
|
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
|
||||||
|
const unsigned char *_src_frame,int _ystride,
|
||||||
|
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||||
const ogg_int16_t *_residue);
|
const ogg_int16_t *_residue);
|
||||||
void oc_frag_recon_inter_mmx(unsigned char *_dst,
|
void oc_frag_recon_inter_mmx(unsigned char *_dst,
|
||||||
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
|
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
|
||||||
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
||||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
||||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
|
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||||
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
|
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||||
void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
|
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
|
||||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
|
|
||||||
int _dst_frame,int _src_frame,int _pli);
|
|
||||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||||
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||||
void oc_restore_fpu_mmx(void);
|
void oc_restore_fpu_mmx(void);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
11
thirdparty/libtheora/x86_vc/x86state.c
vendored
11
thirdparty/libtheora/x86_vc/x86state.c
vendored
@ -11,7 +11,7 @@
|
|||||||
********************************************************************
|
********************************************************************
|
||||||
|
|
||||||
function:
|
function:
|
||||||
last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
|
last mod: $Id$
|
||||||
|
|
||||||
********************************************************************/
|
********************************************************************/
|
||||||
|
|
||||||
@ -19,8 +19,6 @@
|
|||||||
|
|
||||||
#if defined(OC_X86_ASM)
|
#if defined(OC_X86_ASM)
|
||||||
|
|
||||||
#include "../cpu.c"
|
|
||||||
|
|
||||||
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
|
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
|
||||||
each quadrant of the destination.*/
|
each quadrant of the destination.*/
|
||||||
static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
||||||
@ -42,21 +40,22 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
|||||||
64,64,64,64,64,64,64,64,
|
64,64,64,64,64,64,64,64,
|
||||||
};
|
};
|
||||||
|
|
||||||
void oc_state_vtable_init_x86(oc_theora_state *_state){
|
void oc_state_accel_init_x86(oc_theora_state *_state){
|
||||||
_state->cpu_flags=oc_cpu_flags_get();
|
_state->cpu_flags=oc_cpu_flags_get();
|
||||||
if(_state->cpu_flags&OC_CPU_X86_MMX){
|
if(_state->cpu_flags&OC_CPU_X86_MMX){
|
||||||
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
|
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
|
||||||
|
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
|
||||||
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
|
||||||
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
|
||||||
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
|
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
|
||||||
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
|
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
|
||||||
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
|
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
|
||||||
_state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
|
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
|
||||||
_state->opt_vtable.state_loop_filter_frag_rows=
|
_state->opt_vtable.state_loop_filter_frag_rows=
|
||||||
oc_state_loop_filter_frag_rows_mmx;
|
oc_state_loop_filter_frag_rows_mmx;
|
||||||
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
|
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
|
||||||
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
|
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
|
||||||
}
|
}
|
||||||
else oc_state_vtable_init_c(_state);
|
else oc_state_accel_init_c(_state);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
244
thirdparty/libtheora/x86_vc/x86zigzag.h
vendored
Normal file
244
thirdparty/libtheora/x86_vc/x86zigzag.h
vendored
Normal file
@ -0,0 +1,244 @@
|
|||||||
|
/********************************************************************
|
||||||
|
* *
|
||||||
|
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||||
|
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||||
|
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||||
|
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||||
|
* *
|
||||||
|
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||||
|
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||||
|
* *
|
||||||
|
********************************************************************
|
||||||
|
|
||||||
|
function:
|
||||||
|
last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
|
||||||
|
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
|
#if !defined(_x86_vc_x86zigzag_H)
|
||||||
|
# define _x86_vc_x86zigzag_H (1)
|
||||||
|
# include "x86enc.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*Converts DCT coefficients from transposed order into zig-zag scan order and
|
||||||
|
stores them in Y.
|
||||||
|
This relies on two macros to load the contents of each row:
|
||||||
|
OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
|
||||||
|
first four and second four entries of each row into the specified register,
|
||||||
|
respectively.
|
||||||
|
OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
|
||||||
|
(because when the rows are already in SSE2 registers, loading the high half
|
||||||
|
destructively modifies the register).
|
||||||
|
The index of each output element in the original 64-element array should wind
|
||||||
|
up in the following 8x8 matrix (the letters indicate the order we compute
|
||||||
|
each 4-tuple below):
|
||||||
|
A 0 8 1 2 9 16 24 17 B
|
||||||
|
C 10 3 4 11 18 25 32 40 E
|
||||||
|
F 33 26 19 12 5 6 13 20 D
|
||||||
|
G 27 34 41 48 56 49 42 35 I
|
||||||
|
L 28 21 14 7 15 22 29 36 M
|
||||||
|
H 43 50 57 58 51 44 37 30 O
|
||||||
|
N 23 31 38 45 52 59 60 53 J
|
||||||
|
P 46 39 47 54 61 62 55 63 K
|
||||||
|
The order of the coefficients within each tuple is reversed in the comments
|
||||||
|
below to reflect the usual MSB to LSB notation.*/
|
||||||
|
#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(0,mm0) /*mm0=03 02 01 00*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(1,mm1) /*mm1=11 10 09 08*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(2,mm2) /*mm2=19 18 17 16*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(3,mm3) /*mm3=27 26 25 24*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(0,mm4) /*mm4=07 06 05 04*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(1,mm5) /*mm5=15 14 13 12*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(2,mm6) /*mm6=23 22 21 20*/ \
|
||||||
|
__asm movq mm7,mm0 /*mm7=03 02 01 00*/ \
|
||||||
|
__asm punpckhdq mm0,mm1 /*mm0=11 10 03 02*/ \
|
||||||
|
__asm pshufw mm4,mm4,0x39 /*mm4=04 07 06 05*/ \
|
||||||
|
__asm punpcklwd mm1,mm0 /*mm1=03 09 02 08*/ \
|
||||||
|
__asm pshufw mm5,mm5,0x39 /*mm5=12 15 14 13*/ \
|
||||||
|
__asm punpcklwd mm7,mm1 /*mm7=02 01 08 00 *A*/ \
|
||||||
|
__asm movq [Y+0x00],mm7 \
|
||||||
|
__asm punpckhwd mm1,mm4 /*mm1=04 03 07 09*/ \
|
||||||
|
__asm movq mm7,mm2 /*mm7=19 18 17 16*/ \
|
||||||
|
__asm punpckhdq mm0,mm1 /*mm0=04 03 11 10*/ \
|
||||||
|
__asm punpckhwd mm7,mm5 /*mm7=12 19 15 18*/ \
|
||||||
|
__asm punpcklwd mm1,mm3 /*mm1=25 07 24 09*/ \
|
||||||
|
__asm punpcklwd mm5,mm6 /*mm5=21 14 20 13*/ \
|
||||||
|
__asm punpcklwd mm1,mm2 /*mm1=17 24 16 09 *B*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(4,mm2) /*mm2=35 34 33 32*/ \
|
||||||
|
__asm movq [Y+0x08],mm1 \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(5,mm1) /*mm1=43 42 41 40*/ \
|
||||||
|
__asm pshufw mm0,mm0,0x78 /*mm0=11 04 03 10 *C*/ \
|
||||||
|
__asm movq [Y+0x10],mm0 \
|
||||||
|
__asm punpckhdq mm6,mm4 /*mm6=?? 07 23 22*/ \
|
||||||
|
__asm punpckldq mm4,mm5 /*mm4=20 13 06 05 *D*/ \
|
||||||
|
__asm movq [Y+0x28],mm4 \
|
||||||
|
__asm psrlq mm3,16 /*mm3=.. 27 26 25*/ \
|
||||||
|
__asm pshufw mm0,mm2,0x0E /*mm0=?? ?? 35 34*/ \
|
||||||
|
__asm movq mm4,mm7 /*mm4=12 19 15 18*/ \
|
||||||
|
__asm punpcklwd mm2,mm3 /*mm2=26 33 25 32*/ \
|
||||||
|
__asm punpcklwd mm4,mm1 /*mm4=41 15 40 18*/ \
|
||||||
|
__asm punpckhwd mm3,mm1 /*mm3=43 .. 42 27*/ \
|
||||||
|
__asm punpckldq mm4,mm2 /*mm4=25 32 40 18*/ \
|
||||||
|
__asm punpcklwd mm3,mm0 /*mm3=35 42 34 27*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(6,mm0) /*mm0=51 50 49 48*/ \
|
||||||
|
__asm pshufw mm4,mm4,0x6C /*mm4=40 32 25 18 *E*/ \
|
||||||
|
__asm movq [Y+0x18],mm4 \
|
||||||
|
OC_ZZ_LOAD_ROW_LO(7,mm4) /*mm4=59 58 57 56*/ \
|
||||||
|
__asm punpckhdq mm2,mm7 /*mm2=12 19 26 33 *F*/ \
|
||||||
|
__asm movq [Y+0x20],mm2 \
|
||||||
|
__asm pshufw mm1,mm1,0xD0 /*mm1=43 41 ?? ??*/ \
|
||||||
|
__asm pshufw mm0,mm0,0x87 /*mm0=50 48 49 51*/ \
|
||||||
|
__asm movq mm2,mm3 /*mm2=35 42 34 27*/ \
|
||||||
|
__asm punpckhwd mm1,mm0 /*mm1=50 43 48 41*/ \
|
||||||
|
__asm pshufw mm4,mm4,0x93 /*mm4=58 57 56 59*/ \
|
||||||
|
__asm punpckldq mm3,mm1 /*mm3=48 41 34 27 *G*/ \
|
||||||
|
__asm movq [Y+0x30],mm3 \
|
||||||
|
__asm punpckhdq mm1,mm4 /*mm1=58 57 50 43 *H*/ \
|
||||||
|
__asm movq [Y+0x50],mm1 \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(7,mm1) /*mm1=63 62 61 60*/ \
|
||||||
|
__asm punpcklwd mm4,mm0 /*mm4=49 56 51 59*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(6,mm0) /*mm0=55 54 53 52*/ \
|
||||||
|
__asm psllq mm6,16 /*mm6=07 23 22 ..*/ \
|
||||||
|
__asm movq mm3,mm4 /*mm3=49 56 51 59*/ \
|
||||||
|
__asm punpckhdq mm4,mm2 /*mm4=35 42 49 56 *I*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(3,mm2) /*mm2=31 30 29 28*/ \
|
||||||
|
__asm movq [Y+0x38],mm4 \
|
||||||
|
__asm punpcklwd mm3,mm1 /*mm3=61 51 60 59*/ \
|
||||||
|
__asm punpcklwd mm7,mm6 /*mm7=22 15 .. ??*/ \
|
||||||
|
__asm movq mm4,mm3 /*mm4=61 51 60 59*/ \
|
||||||
|
__asm punpcklwd mm3,mm0 /*mm3=53 60 52 59*/ \
|
||||||
|
__asm punpckhwd mm4,mm0 /*mm4=55 61 54 51*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(4,mm0) /*mm0=39 38 37 36*/ \
|
||||||
|
__asm pshufw mm3,mm3,0xE1 /*mm3=53 60 59 52 *J*/ \
|
||||||
|
__asm movq [Y+0x68],mm3 \
|
||||||
|
__asm movq mm3,mm4 /*mm3=?? ?? 54 51*/ \
|
||||||
|
__asm pshufw mm2,mm2,0x39 /*mm2=28 31 30 29*/ \
|
||||||
|
__asm punpckhwd mm4,mm1 /*mm4=63 55 62 61 *K*/ \
|
||||||
|
OC_ZZ_LOAD_ROW_HI(5,mm1) /*mm1=47 46 45 44*/ \
|
||||||
|
__asm movq [Y+0x78],mm4 \
|
||||||
|
__asm punpckhwd mm6,mm2 /*mm6=28 07 31 23*/ \
|
||||||
|
__asm punpcklwd mm2,mm0 /*mm2=37 30 36 29*/ \
|
||||||
|
__asm punpckhdq mm5,mm6 /*mm5=28 07 21 14*/ \
|
||||||
|
__asm pshufw mm2,mm2,0x4B /*mm2=36 29 30 37*/ \
|
||||||
|
__asm pshufw mm5,mm5,0x87 /*mm5=07 14 21 28 *L*/ \
|
||||||
|
__asm movq [Y+0x40],mm5 \
|
||||||
|
__asm punpckhdq mm7,mm2 /*mm7=36 29 22 15 *M*/ \
|
||||||
|
__asm movq [Y+0x48],mm7 \
|
||||||
|
__asm pshufw mm1,mm1,0x9C /*mm1=46 45 47 44*/ \
|
||||||
|
__asm punpckhwd mm0,mm1 /*mm0=46 39 45 38*/ \
|
||||||
|
__asm punpcklwd mm3,mm1 /*mm3=47 54 44 51*/ \
|
||||||
|
__asm punpckldq mm6,mm0 /*mm6=45 38 31 23 *N*/ \
|
||||||
|
__asm movq [Y+0x60],mm6 \
|
||||||
|
__asm punpckhdq mm0,mm3 /*mm0=47 54 46 39*/ \
|
||||||
|
__asm punpckldq mm3,mm2 /*mm3=30 37 44 51 *O*/ \
|
||||||
|
__asm movq [Y+0x58],mm3 \
|
||||||
|
__asm pshufw mm0,mm0,0xB1 /*mm0=54 47 39 46 *P*/ \
|
||||||
|
__asm movq [Y+0x70],mm0 \
|
||||||
|
|
||||||
|
/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
|
||||||
|
order and stores them in %[qdct].
|
||||||
|
The index of each output element in the original 64-element array should wind
|
||||||
|
up in the following 8x8 matrix (the letters indicate the order we compute
|
||||||
|
each 4-tuple below):
|
||||||
|
A 0 1 8 16 9 2 3 10 B
|
||||||
|
C 17 24 32 25 18 11 4 5 D
|
||||||
|
E 12 19 26 33 40 48 41 34 I
|
||||||
|
H 27 20 13 6 7 14 21 28 G
|
||||||
|
K 35 42 49 56 57 50 43 36 J
|
||||||
|
F 29 22 15 23 30 37 44 51 M
|
||||||
|
P 58 59 52 45 38 31 39 46 L
|
||||||
|
N 53 60 61 54 47 55 62 63 O
|
||||||
|
The order of the coefficients within each tuple is reversed in the comments
|
||||||
|
below to reflect the usual MSB to LSB notation.*/
|
||||||
|
#define OC_ZIG_ZAG_MMXEXT \
|
||||||
|
"movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
|
||||||
|
"movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
|
||||||
|
"movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
|
||||||
|
"movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
|
||||||
|
"movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
|
||||||
|
"movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
|
||||||
|
"movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
|
||||||
|
"punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
|
||||||
|
"movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
|
||||||
|
"punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
|
||||||
|
"movq %%mm0,0x00(%[qdct])\n\t" \
|
||||||
|
"movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
|
||||||
|
"punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
|
||||||
|
"psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
|
||||||
|
"punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
|
||||||
|
"punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
|
||||||
|
"pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
|
||||||
|
"movq %%mm6,0x08(%[qdct])\n\t" \
|
||||||
|
"psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
|
||||||
|
"movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
|
||||||
|
"punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
|
||||||
|
"movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
|
||||||
|
"punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *D*/ \
|
||||||
|
"por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
|
||||||
|
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *C*/ \
|
||||||
|
"movq %%mm2,0x10(%[qdct])\n\t" \
|
||||||
|
"movq %%mm3,0x18(%[qdct])\n\t" \
|
||||||
|
"movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
|
||||||
|
"movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
|
||||||
|
"pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
|
||||||
|
"punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
|
||||||
|
"punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
|
||||||
|
"punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
|
||||||
|
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
|
||||||
|
"punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
|
||||||
|
"pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
|
||||||
|
"psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
|
||||||
|
"punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
|
||||||
|
"punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
|
||||||
|
"punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
|
||||||
|
"movq %%mm0,0x20(%[qdct])\n\t" \
|
||||||
|
"movq %%mm3,0x50(%[qdct])\n\t" \
|
||||||
|
"movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
|
||||||
|
"movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
|
||||||
|
"movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
|
||||||
|
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
|
||||||
|
"psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
|
||||||
|
"movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
|
||||||
|
"punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
|
||||||
|
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
|
||||||
|
"movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
|
||||||
|
"pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
|
||||||
|
"movq %%mm2,0x30(%[qdct])\n\t" \
|
||||||
|
"movq %%mm6,0x38(%[qdct])\n\t" \
|
||||||
|
"movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
|
||||||
|
"punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
|
||||||
|
"movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
|
||||||
|
"punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
|
||||||
|
"psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
|
||||||
|
"punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
|
||||||
|
"punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
|
||||||
|
"pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
|
||||||
|
"movq %%mm0,0x28(%[qdct])\n\t" \
|
||||||
|
"punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
|
||||||
|
"punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
|
||||||
|
"punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
|
||||||
|
"pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
|
||||||
|
"movq %%mm4,0x40(%[qdct])\n\t" \
|
||||||
|
"movq %%mm6,0x48(%[qdct])\n\t" \
|
||||||
|
"movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
|
||||||
|
"movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
|
||||||
|
"psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
|
||||||
|
"pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
|
||||||
|
"pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
|
||||||
|
"punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
|
||||||
|
"pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
|
||||||
|
"punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
|
||||||
|
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
|
||||||
|
"movq %%mm2,0x68(%[qdct])\n\t" \
|
||||||
|
"movq %%mm1,0x58(%[qdct])\n\t" \
|
||||||
|
"punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
|
||||||
|
"punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
|
||||||
|
"pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
|
||||||
|
"pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=54 61 60 53 *N*/ \
|
||||||
|
"punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
|
||||||
|
"punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
|
||||||
|
"movq %%mm6,0x70(%[qdct])\n\t" \
|
||||||
|
"movq %%mm5,0x78(%[qdct])\n\t" \
|
||||||
|
"movq %%mm7,0x60(%[qdct])\n\t" \
|
||||||
|
|
||||||
|
#endif
|
Loading…
Reference in New Issue
Block a user