Commit 5e2bf244bf3c9c56957dae9755030c25032d9931

Authored by Arthur HUGEAT
1 parent 0642fff00e
Exists in master

Suppression d'un copier-coller.

Showing 1 changed file with 19 additions and 21 deletions Inline Diff

ifcs2018_journal.tex
% fusionner max rejection a surface donnee v.s minimiser surface a rejection donnee 1 1 % fusionner max rejection a surface donnee v.s minimiser surface a rejection donnee
% demontrer comment la quantification rejette du bruit vers les hautes frequences => 6 dB de 2 2 % demontrer comment la quantification rejette du bruit vers les hautes frequences => 6 dB de
% rejection par bit et perte si moins de bits que rejection/6 3 3 % rejection par bit et perte si moins de bits que rejection/6
% developper programme lineaire en incluant le decalage de bits 4 4 % developper programme lineaire en incluant le decalage de bits
% insister que avant on etait synthetisable mais pas implementable, alors que maintenant on 5 5 % insister que avant on etait synthetisable mais pas implementable, alors que maintenant on
% implemente et on demontre que ca tourne 6 6 % implemente et on demontre que ca tourne
% gwen : pourquoi le FIR est desormais implementable et ne l'etait pas meme sur zedboard->new FIR ? 7 7 % gwen : pourquoi le FIR est desormais implementable et ne l'etait pas meme sur zedboard->new FIR ?
% Gwen : peut-on faire un vrai banc de bruit de phase avec ce FIR, ie ajouter ADC, NCO et mixer 8 8 % Gwen : peut-on faire un vrai banc de bruit de phase avec ce FIR, ie ajouter ADC, NCO et mixer
% (zedboard ou redpit) 9 9 % (zedboard ou redpit)
10 10
% label schema : verifier que "argumenter de la cascade de FIR" est fait 11 11 % label schema : verifier que "argumenter de la cascade de FIR" est fait
12 12
\documentclass[a4paper,conference]{IEEEtran/IEEEtran} 13 13 \documentclass[a4paper,conference]{IEEEtran/IEEEtran}
\usepackage{graphicx,color,hyperref} 14 14 \usepackage{graphicx,color,hyperref}
\usepackage{amsfonts} 15 15 \usepackage{amsfonts}
\usepackage{amsthm} 16 16 \usepackage{amsthm}
\usepackage{amssymb} 17 17 \usepackage{amssymb}
\usepackage{amsmath} 18 18 \usepackage{amsmath}
\usepackage{algorithm2e} 19 19 \usepackage{algorithm2e}
\usepackage{url,balance} 20 20 \usepackage{url,balance}
\usepackage[normalem]{ulem} 21 21 \usepackage[normalem]{ulem}
\usepackage{tikz} 22 22 \usepackage{tikz}
\usetikzlibrary{positioning,fit} 23 23 \usetikzlibrary{positioning,fit}
\usepackage{multirow} 24 24 \usepackage{multirow}
\usepackage{scalefnt} 25 25 \usepackage{scalefnt}
26 26
% correct bad hyphenation here 27 27 % correct bad hyphenation here
\hyphenation{op-tical net-works semi-conduc-tor} 28 28 \hyphenation{op-tical net-works semi-conduc-tor}
\textheight=26cm 29 29 \textheight=26cm
\setlength{\footskip}{30pt} 30 30 \setlength{\footskip}{30pt}
\pagenumbering{gobble} 31 31 \pagenumbering{gobble}
\begin{document} 32 32 \begin{document}
\title{Filter optimization for real time digital processing of radiofrequency signals: application 33 33 \title{Filter optimization for real time digital processing of radiofrequency signals: application
to oscillator metrology} 34 34 to oscillator metrology}
35 35
\author{\IEEEauthorblockN{A. Hugeat\IEEEauthorrefmark{1}\IEEEauthorrefmark{2}, J. Bernard\IEEEauthorrefmark{2}, 36 36 \author{\IEEEauthorblockN{A. Hugeat\IEEEauthorrefmark{1}\IEEEauthorrefmark{2}, J. Bernard\IEEEauthorrefmark{2},
G. Goavec-M\'erou\IEEEauthorrefmark{1}, 37 37 G. Goavec-M\'erou\IEEEauthorrefmark{1},
P.-Y. Bourgeois\IEEEauthorrefmark{1}, J.-M. Friedt\IEEEauthorrefmark{1}} 38 38 P.-Y. Bourgeois\IEEEauthorrefmark{1}, J.-M. Friedt\IEEEauthorrefmark{1}}
\IEEEauthorblockA{\IEEEauthorrefmark{1}FEMTO-ST, Time \& Frequency department, Besan\c con, France } 39 39 \IEEEauthorblockA{\IEEEauthorrefmark{1}FEMTO-ST, Time \& Frequency department, Besan\c con, France }
\IEEEauthorblockA{\IEEEauthorrefmark{2}FEMTO-ST, Computer Science department DISC, Besan\c con, France \\ 40 40 \IEEEauthorblockA{\IEEEauthorrefmark{2}FEMTO-ST, Computer Science department DISC, Besan\c con, France \\
Email: \{pyb2,jmfriedt\}@femto-st.fr} 41 41 Email: \{pyb2,jmfriedt\}@femto-st.fr}
} 42 42 }
\maketitle 43 43 \maketitle
\thispagestyle{plain} 44 44 \thispagestyle{plain}
\pagestyle{plain} 45 45 \pagestyle{plain}
\newtheorem{definition}{Definition} 46 46 \newtheorem{definition}{Definition}
47 47
\begin{abstract} 48 48 \begin{abstract}
Software Defined Radio (SDR) provides stability, flexibility and reconfigurability to 49 49 Software Defined Radio (SDR) provides stability, flexibility and reconfigurability to
radiofrequency signal processing. Applied to oscillator characterization in the context 50 50 radiofrequency signal processing. Applied to oscillator characterization in the context
of ultrastable clocks, stringent filtering requirements are defined by spurious signal or 51 51 of ultrastable clocks, stringent filtering requirements are defined by spurious signal or
noise rejection needs. Since real time radiofrequency processing must be performed in a 52 52 noise rejection needs. Since real time radiofrequency processing must be performed in a
Field Programmable Gate Array to meet timing constraints, we investigate optimization strategies 53 53 Field Programmable Gate Array to meet timing constraints, we investigate optimization strategies
to design filters meeting rejection characteristics while limiting the hardware resources 54 54 to design filters meeting rejection characteristics while limiting the hardware resources
required and keeping timing constraints within the targeted measurement bandwidths. The 55 55 required and keeping timing constraints within the targeted measurement bandwidths. The
presented technique is applicable to scheduling any sequence of processing blocks characterized 56 56 presented technique is applicable to scheduling any sequence of processing blocks characterized
by a throughput, resource occupation and performance tabulated as a function of configuration 57 57 by a throughput, resource occupation and performance tabulated as a function of configuration
characteristics, as is the case for filters with their coefficients and resolution yielding 58 58 characteristics, as is the case for filters with their coefficients and resolution yielding
rejection and number of multipliers. 59 59 rejection and number of multipliers.
\end{abstract} 60 60 \end{abstract}
61 61
\begin{IEEEkeywords} 62 62 \begin{IEEEkeywords}
Software Defined Radio, Mixed-Integer Linear Programming, Finite Impulse Response filter 63 63 Software Defined Radio, Mixed-Integer Linear Programming, Finite Impulse Response filter
\end{IEEEkeywords} 64 64 \end{IEEEkeywords}
65 65
\section{Digital signal processing of ultrastable clock signals} 66 66 \section{Digital signal processing of ultrastable clock signals}
67 67
Analog oscillator phase noise characterization is classically performed by downconverting 68 68 Analog oscillator phase noise characterization is classically performed by downconverting
the radiofrequency signal using a saturated mixer to bring the radiofrequency signal to baseband, 69 69 the radiofrequency signal using a saturated mixer to bring the radiofrequency signal to baseband,
followed by a Fourier analysis of the beat signal to analyze phase fluctuations close to carrier. In 70 70 followed by a Fourier analysis of the beat signal to analyze phase fluctuations close to carrier. In
a fully digital approach, the radiofrequency signal is digitized and numerically downconverted by 71 71 a fully digital approach, the radiofrequency signal is digitized and numerically downconverted by
multiplying the samples with a local numerically controlled oscillator (Fig. \ref{schema}) \cite{rsi}. 72 72 multiplying the samples with a local numerically controlled oscillator (Fig. \ref{schema}) \cite{rsi}.
73 73
\begin{figure}[h!tb] 74 74 \begin{figure}[h!tb]
\begin{center} 75 75 \begin{center}
\includegraphics[width=.8\linewidth]{images/schema} 76 76 \includegraphics[width=.8\linewidth]{images/schema}
\end{center} 77 77 \end{center}
\caption{Fully digital oscillator phase noise characterization: the Device Under Test 78 78 \caption{Fully digital oscillator phase noise characterization: the Device Under Test
(DUT) signal is sampled by the radiofrequency grade Analog to Digital Converter (ADC) and 79 79 (DUT) signal is sampled by the radiofrequency grade Analog to Digital Converter (ADC) and
downconverted by mixing with a Numerically Controlled Oscillator (NCO). Unwanted signals 80 80 downconverted by mixing with a Numerically Controlled Oscillator (NCO). Unwanted signals
and noise aliases are rejected by a Low Pass Filter (LPF) implemented as a cascade of Finite 81 81 and noise aliases are rejected by a Low Pass Filter (LPF) implemented as a cascade of Finite
Impulse Response (FIR) filters. The signal is then decimated before a Fourier analysis displays 82 82 Impulse Response (FIR) filters. The signal is then decimated before a Fourier analysis displays
the spectral characteristics of the phase fluctuations.} 83 83 the spectral characteristics of the phase fluctuations.}
\label{schema} 84 84 \label{schema}
\end{figure} 85 85 \end{figure}
86 86
As with the analog mixer, 87 87 As with the analog mixer,
the non-linear behavior of the downconverter introduces noise or spurious signal aliasing as 88 88 the non-linear behavior of the downconverter introduces noise or spurious signal aliasing as
well as the generation of the frequency sum signal in addition to the frequency difference. 89 89 well as the generation of the frequency sum signal in addition to the frequency difference.
These unwanted spectral characteristics must be rejected before decimating the data stream 90 90 These unwanted spectral characteristics must be rejected before decimating the data stream
for the phase noise spectral characterization \cite{andrich2018high}. The characteristics introduced between the 91 91 for the phase noise spectral characterization \cite{andrich2018high}. The characteristics introduced between the
downconverter 92 92 downconverter
and the decimation processing blocks are core characteristics of an oscillator characterization 93 93 and the decimation processing blocks are core characteristics of an oscillator characterization
system, and must reject out-of-band signals below the targeted phase noise -- typically in the 94 94 system, and must reject out-of-band signals below the targeted phase noise -- typically in the
sub -170~dBc/Hz range for the ultrastable oscillators we aim at characterizing. The filter blocks will 95 95 sub -170~dBc/Hz range for the ultrastable oscillators we aim at characterizing. The filter blocks will
use most resources of the Field Programmable Gate Array (FPGA) used to process the radiofrequency 96 96 use most resources of the Field Programmable Gate Array (FPGA) used to process the radiofrequency
datastream: optimizing the performance of the filter while reducing the needed resources is 97 97 datastream: optimizing the performance of the filter while reducing the needed resources is
hence tackled in a systematic approach using optimization techniques. Most significantly, we 98 98 hence tackled in a systematic approach using optimization techniques. Most significantly, we
tackle the issue by attempting to cascade multiple Finite Impulse Response (FIR) filters with 99 99 tackle the issue by attempting to cascade multiple Finite Impulse Response (FIR) filters with
tunable number of coefficients and tunable number of bits representing the coefficients and the 100 100 tunable number of coefficients and tunable number of bits representing the coefficients and the
data being processed. 101 101 data being processed.
102 102
\section{Finite impulse response filter} 103 103 \section{Finite impulse response filter}
104 104
We select FIR filters for their unconditional stability and ease of design. A FIR filter is defined 105 105 We select FIR filters for their unconditional stability and ease of design. A FIR filter is defined
by a set of weights $b_k$ applied to the inputs $x_k$ through a convolution to generate the 106 106 by a set of weights $b_k$ applied to the inputs $x_k$ through a convolution to generate the
outputs $y_k$ 107 107 outputs $y_k$
\begin{align} 108 108 \begin{align}
y_n=\sum_{k=0}^N b_k x_{n-k} 109 109 y_n=\sum_{k=0}^N b_k x_{n-k}
\label{eq:fir_equation} 110 110 \label{eq:fir_equation}
\end{align} 111 111 \end{align}
112 112
As opposed to an implementation on a general purpose processor in which word size is defined by the 113 113 As opposed to an implementation on a general purpose processor in which word size is defined by the
processor architecture, implementing such a filter on an FPGA offers more degrees of freedom since 114 114 processor architecture, implementing such a filter on an FPGA offers more degrees of freedom since
not only the coefficient values and number of taps must be defined, but also the number of bits 115 115 not only the coefficient values and number of taps must be defined, but also the number of bits
defining the coefficients and the sample size. For this reason, and because we consider pipeline 116 116 defining the coefficients and the sample size. For this reason, and because we consider pipeline
processing (as opposed to First-In, First-Out FIFO memory batch processing) of radiofrequency 117 117 processing (as opposed to First-In, First-Out FIFO memory batch processing) of radiofrequency
signals, High Level Synthesis (HLS) languages \cite{kasbah2008multigrid} are not considered but 118 118 signals, High Level Synthesis (HLS) languages \cite{kasbah2008multigrid} are not considered but
the problem is tackled at the Very-high-speed-integrated-circuit Hardware Description Language 119 119 the problem is tackled at the Very-high-speed-integrated-circuit Hardware Description Language
(VHDL) level. 120 120 (VHDL) level.
Since latency is not an issue in an open-loop phase noise characterization instrument, the large 121 121 Since latency is not an issue in an open-loop phase noise characterization instrument, the large
number of taps in the FIR, as opposed to the shorter Infinite Impulse Response (IIR) filter, 122 122 number of taps in the FIR, as opposed to the shorter Infinite Impulse Response (IIR) filter,
is not considered an issue as it would be in a closed-loop system. 123 123 is not considered an issue as it would be in a closed-loop system.
124 124
The coefficients are classically expressed as floating point values. However, this binary 125 125 The coefficients are classically expressed as floating point values. However, this binary
number representation is not efficient for fast arithmetic computation by an FPGA. Instead, 126 126 number representation is not efficient for fast arithmetic computation by an FPGA. Instead,
we choose to quantize these floating point values into integer values. This quantization 127 127 we choose to quantize these floating point values into integer values. This quantization
will result in some precision loss. 128 128 will result in some precision loss.
129 129
\begin{figure}[h!tb] 130 130 \begin{figure}[h!tb]
\includegraphics[width=\linewidth]{images/zero_values} 131 131 \includegraphics[width=\linewidth]{images/zero_values}
\caption{Impact of the quantization resolution of the coefficients: the quantization is 132 132 \caption{Impact of the quantization resolution of the coefficients: the quantization is
set to 6~bits -- with the horizontal black lines indicating $\pm$1 least significant bit -- setting 133 133 set to 6~bits -- with the horizontal black lines indicating $\pm$1 least significant bit -- setting
the 30~first and 30~last coefficients out of the initial 128~band-pass 134 134 the 30~first and 30~last coefficients out of the initial 128~band-pass
filter coefficients to 0 (red dots).} 135 135 filter coefficients to 0 (red dots).}
\label{float_vs_int} 136 136 \label{float_vs_int}
\end{figure} 137 137 \end{figure}
138 138
The tradeoff between quantization resolution and number of coefficients when considering 139 139 The tradeoff between quantization resolution and number of coefficients when considering
integer operations is not trivial. As an illustration of the issue related to the 140 140 integer operations is not trivial. As an illustration of the issue related to the
relation between number of filter taps and quantization, Fig. \ref{float_vs_int} exhibits 141 141 relation between number of filter taps and quantization, Fig. \ref{float_vs_int} exhibits
a 128-coefficient FIR bandpass filter designed using floating point numbers (blue). Upon 142 142 a 128-coefficient FIR bandpass filter designed using floating point numbers (blue). Upon
quantization on 6~bit integers, 60 of the 128~coefficients in the beginning and end of the 143 143 quantization on 6~bit integers, 60 of the 128~coefficients in the beginning and end of the
taps become null, making the large number of coefficients irrelevant and allowing to save 144 144 taps become null, making the large number of coefficients irrelevant and allowing to save
processing resource by shrinking the filter length. This tradeoff aimed at minimizing resources 145 145 processing resource by shrinking the filter length. This tradeoff aimed at minimizing resources
to reach a given rejection level, or maximizing out of band rejection for a given computational 146 146 to reach a given rejection level, or maximizing out of band rejection for a given computational
resource, will drive the investigation on cascading filters designed with varying tap resolution 147 147 resource, will drive the investigation on cascading filters designed with varying tap resolution
and tap length, as will be shown in the next section. Indeed, our development strategy closely 148 148 and tap length, as will be shown in the next section. Indeed, our development strategy closely
follows the skeleton approach \cite{crookes1998environment, crookes2000design, benkrid2002towards} 149 149 follows the skeleton approach \cite{crookes1998environment, crookes2000design, benkrid2002towards}
in which basic blocks are defined and characterized before being assembled \cite{hide} 150 150 in which basic blocks are defined and characterized before being assembled \cite{hide}
in a complete processing chain. In our case, assembling the filter blocks is a simpler block 151 151 in a complete processing chain. In our case, assembling the filter blocks is a simpler block
combination process since we assume a single value to be processed and a single value to be 152 152 combination process since we assume a single value to be processed and a single value to be
generated at each clock cycle. The FIR filters will not be considered to decimate in the 153 153 generated at each clock cycle. The FIR filters will not be considered to decimate in the
current implementation: the decimation is assumed to be located after the FIR cascade at the 154 154 current implementation: the decimation is assumed to be located after the FIR cascade at the
moment. 155 155 moment.
156 156
\section{Methodology description} 157 157 \section{Methodology description}
158 158
Our objective is to develop a new methodology applicable to any Digital Signal Processing (DSP) 159 159 Our objective is to develop a new methodology applicable to any Digital Signal Processing (DSP)
chain obtained by assembling basic processing blocks, with hardware and manufacturer independence. 160 160 chain obtained by assembling basic processing blocks, with hardware and manufacturer independence.
Achieving such a target requires defining an abstract model to represent some basic properties 161 161 Achieving such a target requires defining an abstract model to represent some basic properties
of DSP blocks such as performance (i.e. rejection or ripples in the bandpass for filters) and 162 162 of DSP blocks such as performance (i.e. rejection or ripples in the bandpass for filters) and
resource occupation. These abstract properties, not necessarily related to the detailed hardware 163 163 resource occupation. These abstract properties, not necessarily related to the detailed hardware
implementation of a given platform, will feed a scheduler solver aimed at assembling the optimum 164 164 implementation of a given platform, will feed a scheduler solver aimed at assembling the optimum
target, whether in terms of maximizing performance for a given arbitrary resource occupation, or 165 165 target, whether in terms of maximizing performance for a given arbitrary resource occupation, or
minimizing resource occupation for a given performance. In our approach, the solution of the 166 166 minimizing resource occupation for a given performance. In our approach, the solution of the
solver is then synthesized using the dedicated tool provided by each platform manufacturer 167 167 solver is then synthesized using the dedicated tool provided by each platform manufacturer
to assess the validity of our abstract resource occupation indicator, and the result of running 168 168 to assess the validity of our abstract resource occupation indicator, and the result of running
the DSP chain on the FPGA allows for assessing the performance of the scheduler. We emphasize 169 169 the DSP chain on the FPGA allows for assessing the performance of the scheduler. We emphasize
that all solutions found by the solver are synthesized and executed on hardware at the end 170 170 that all solutions found by the solver are synthesized and executed on hardware at the end
of the analysis. 171 171 of the analysis.
172 172
In this demonstration, we focus on only two operations: filtering and shifting the number of 173 173 In this demonstration, we focus on only two operations: filtering and shifting the number of
bits needed to represent the data along the processing chain. 174 174 bits needed to represent the data along the processing chain.
We have chosen these basic operations because shifting and the filtering have already been studied 175 175 We have chosen these basic operations because shifting and the filtering have already been studied
in the literature \cite{lim_1996, lim_1988, young_1992, smith_1998} providing a framework for 176 176 in the literature \cite{lim_1996, lim_1988, young_1992, smith_1998} providing a framework for
assessing our results. Furthermore, filtering is a core step in any radiofrequency frontend 177 177 assessing our results. Furthermore, filtering is a core step in any radiofrequency frontend
requiring pipelined processing at full bandwidth for the earliest steps, including for 178 178 requiring pipelined processing at full bandwidth for the earliest steps, including for
time and frequency transfer or characterization \cite{carolina1,carolina2,rsi}. 179 179 time and frequency transfer or characterization \cite{carolina1,carolina2,rsi}.
180 180
Addressing only two operations allows for demonstrating the methodology but should not be 181 181 Addressing only two operations allows for demonstrating the methodology but should not be
considered as a limitation of the framework which can be extended to assembling any number 182 182 considered as a limitation of the framework which can be extended to assembling any number
of skeleton blocks as long as performance and resource occupation can be determined. Hence, 183 183 of skeleton blocks as long as performance and resource occupation can be determined. Hence,
in this paper we will apply our methodology on simple DSP chains: a white noise input signal 184 184 in this paper we will apply our methodology on simple DSP chains: a white noise input signal
is generated using a Pseudo-Random Number (PRN) generator or thanks to a radiofrequency-grade 185 185 is generated using a Pseudo-Random Number (PRN) generator or thanks to a radiofrequency-grade
Analog to Digital Converter (ADC) loaded by a 50~$\Omega$ resistor. Once samples have been 186 186 Analog to Digital Converter (ADC) loaded by a 50~$\Omega$ resistor. Once samples have been
digitized at a rate of 125~MS/s, filtering is applied to qualify the processing block performance -- 187 187 digitized at a rate of 125~MS/s, filtering is applied to qualify the processing block performance --
practically meeting the radiofrequency frontend requirement of noise and bandwidth reduction 188 188 practically meeting the radiofrequency frontend requirement of noise and bandwidth reduction
by filtering and decimating. Finally, bursts of filtered samples are stored for post-processing, 189 189 by filtering and decimating. Finally, bursts of filtered samples are stored for post-processing,
allowing to assess either filter rejection for a given resource usage, or validating the rejection 190 190 allowing to assess either filter rejection for a given resource usage, or validating the rejection
when implementing a solution minimizing resource occupation. 191 191 when implementing a solution minimizing resource occupation.
192 192
The first step of our approach is to model the DSP chain. Since we only optimize 193 193 The first step of our approach is to model the DSP chain. Since we only optimize
the filtering, we do not model the PRN generator or the ADC. The filtering can be 194 194 the filtering, we do not model the PRN generator or the ADC. The filtering can be
done in two ways. In the first one, we use a single FIR filter with many coefficients 195 195 done in two ways. In the first one, we use a single FIR filter with many coefficients
to reject the noise: we call this the monolithic approach. In the second one, 196 196 to reject the noise: we call this the monolithic approach. In the second one,
we select several FIR filters, each with fewer coefficients than the monolithic filter, and we cascade 197 197 we select several FIR filters, each with fewer coefficients than the monolithic filter, and we cascade
them to filter the signal. 198 198 them to filter the signal.
199 199
After each filter we leave the possibility of shifting the filtered data to consume 200 200 After each filter we leave the possibility of shifting the filtered data to consume
less resources. Hence in the case of cascaded filter, we define a stage as a filter 201 201 less resources. Hence in the case of cascaded filter, we define a stage as a filter
and a shifter (the shift could be omitted if we do not need to divide the filtered data). 202 202 and a shifter (the shift could be omitted if we do not need to divide the filtered data).
203 203
\subsection{Model of a FIR filter} 204 204 \subsection{Model of a FIR filter}
205 205
A cascade of filters is composed of $n$ FIR stages. In stage $i$ ($1 \leq i \leq n$) 206 206 A cascade of filters is composed of $n$ FIR stages. In stage $i$ ($1 \leq i \leq n$)
the FIR has $C_i$ coefficients and each coefficient is an integer value with $\pi^C_i$ 207 207 the FIR has $C_i$ coefficients and each coefficient is an integer value with $\pi^C_i$
bits while the filtered data are shifted by $\pi^S_i$ bits. We define also $\pi^-_i$ as 208 208 bits while the filtered data are shifted by $\pi^S_i$ bits. We define also $\pi^-_i$ as
the size of input data and $\pi^+_i$ as the size of output data. The figure~\ref{fig:fir_stage} 209 209 the size of input data and $\pi^+_i$ as the size of output data. The figure~\ref{fig:fir_stage}
shows a filtering stage. 210 210 shows a filtering stage.
211 211
\begin{figure} 212 212 \begin{figure}
\centering 213 213 \centering
\begin{tikzpicture}[node distance=2cm] 214 214 \begin{tikzpicture}[node distance=2cm]
\node[draw,minimum size=1.3cm] (FIR) { $C_i, \pi_i^C$ } ; 215 215 \node[draw,minimum size=1.3cm] (FIR) { $C_i, \pi_i^C$ } ;
\node[draw,minimum size=1.3cm] (Shift) [right of=FIR, ] { $\pi_i^S$ } ; 216 216 \node[draw,minimum size=1.3cm] (Shift) [right of=FIR, ] { $\pi_i^S$ } ;
\node (Start) [left of=FIR] { } ; 217 217 \node (Start) [left of=FIR] { } ;
\node (End) [right of=Shift] { } ; 218 218 \node (End) [right of=Shift] { } ;
219 219
\node[draw,fit=(FIR) (Shift)] (Filter) { } ; 220 220 \node[draw,fit=(FIR) (Shift)] (Filter) { } ;
221 221
\draw[->] (Start) edge node [above] { $\pi_i^-$ } (FIR) ; 222 222 \draw[->] (Start) edge node [above] { $\pi_i^-$ } (FIR) ;
\draw[->] (FIR) -- (Shift) ; 223 223 \draw[->] (FIR) -- (Shift) ;
\draw[->] (Shift) edge node [above] { $\pi_i^+$ } (End) ; 224 224 \draw[->] (Shift) edge node [above] { $\pi_i^+$ } (End) ;
\end{tikzpicture} 225 225 \end{tikzpicture}
\caption{A single filter is composed of a FIR (on the left) and a Shifter (on the right)} 226 226 \caption{A single filter is composed of a FIR (on the left) and a Shifter (on the right)}
\label{fig:fir_stage} 227 227 \label{fig:fir_stage}
\end{figure} 228 228 \end{figure}
229 229
FIR $i$ has been characterized through numerical simulation as able to reject $F(C_i, \pi_i^C)$ dB. 230 230 FIR $i$ has been characterized through numerical simulation as able to reject $F(C_i, \pi_i^C)$ dB.
This rejection has been computed using GNU Octave software FIR coefficient design functions 231 231 This rejection has been computed using GNU Octave software FIR coefficient design functions
(\texttt{firls} and \texttt{fir1}). 232 232 (\texttt{firls} and \texttt{fir1}).
For each configuration $(C_i, \pi_i^C)$, we first create a FIR with floating point coefficients and a given $C_i$ number of coefficients. 233 233 For each configuration $(C_i, \pi_i^C)$, we first create a FIR with floating point coefficients and a given $C_i$ number of coefficients.
Then, the floating point coefficients are discretized into integers. In order to ensure that the coefficients are coded on $\pi_i^C$~bits effectively, 234 234 Then, the floating point coefficients are discretized into integers. In order to ensure that the coefficients are coded on $\pi_i^C$~bits effectively,
the coefficients are normalized by their absolute maximum before being scaled to integer coefficients. 235 235 the coefficients are normalized by their absolute maximum before being scaled to integer coefficients.
At least one coefficient is coded on $\pi_i^C$~bits, and in practice only $b_{C_i/2}$ is coded on $\pi_i^C$~bits while the others are coded on much fewer bits. 236 236 At least one coefficient is coded on $\pi_i^C$~bits, and in practice only $b_{C_i/2}$ is coded on $\pi_i^C$~bits while the others are coded on much fewer bits.
237 237
With these coefficients, the \texttt{freqz} function is used to estimate the magnitude of the filter 238 238 With these coefficients, the \texttt{freqz} function is used to estimate the magnitude of the filter
transfer function. 239 239 transfer function.
Comparing the performance between FIRs requires however defining a unique criterion. As shown in figure~\ref{fig:fir_mag}, 240 240 Comparing the performance between FIRs requires however defining a unique criterion. As shown in figure~\ref{fig:fir_mag},
the FIR magnitude exhibits two parts: we focus here on the transitions width and the rejection rather than on the 241 241 the FIR magnitude exhibits two parts: we focus here on the transitions width and the rejection rather than on the
bandpass ripples as emphasized in \cite{lim_1988,lim_1996}. 242 242 bandpass ripples as emphasized in \cite{lim_1988,lim_1996}.
243 243
\begin{figure} 244 244 \begin{figure}
\begin{center} 245 245 \begin{center}
\scalebox{0.8}{ 246 246 \scalebox{0.8}{
\centering 247 247 \centering
\begin{tikzpicture}[scale=0.3] 248 248 \begin{tikzpicture}[scale=0.3]
\draw[<->] (0,15) -- (0,0) -- (21,0) ; 249 249 \draw[<->] (0,15) -- (0,0) -- (21,0) ;
\draw[thick] (0,12) -- (8,12) -- (20,0) ; 250 250 \draw[thick] (0,12) -- (8,12) -- (20,0) ;
251 251
\draw (0,14) node [left] { $P$ } ; 252 252 \draw (0,14) node [left] { $P$ } ;
\draw (20,0) node [below] { $f$ } ; 253 253 \draw (20,0) node [below] { $f$ } ;
254 254
\draw[>=latex,<->] (0,14) -- (8,14) ; 255 255 \draw[>=latex,<->] (0,14) -- (8,14) ;
\draw (4,14) node [above] { passband } node [below] { $40\%$ } ; 256 256 \draw (4,14) node [above] { passband } node [below] { $40\%$ } ;
257 257
\draw[>=latex,<->] (8,14) -- (12,14) ; 258 258 \draw[>=latex,<->] (8,14) -- (12,14) ;
\draw (10,14) node [above] { transition } node [below] { $20\%$ } ; 259 259 \draw (10,14) node [above] { transition } node [below] { $20\%$ } ;
260 260
\draw[>=latex,<->] (12,14) -- (20,14) ; 261 261 \draw[>=latex,<->] (12,14) -- (20,14) ;
\draw (16,14) node [above] { stopband } node [below] { $40\%$ } ; 262 262 \draw (16,14) node [above] { stopband } node [below] { $40\%$ } ;
263 263
\draw[>=latex,<->] (16,12) -- (16,8) ; 264 264 \draw[>=latex,<->] (16,12) -- (16,8) ;
\draw (16,10) node [right] { rejection } ; 265 265 \draw (16,10) node [right] { rejection } ;
266 266
\draw[dashed] (8,-1) -- (8,14) ; 267 267 \draw[dashed] (8,-1) -- (8,14) ;
\draw[dashed] (12,-1) -- (12,14) ; 268 268 \draw[dashed] (12,-1) -- (12,14) ;
269 269
\draw[dashed] (8,12) -- (16,12) ; 270 270 \draw[dashed] (8,12) -- (16,12) ;
\draw[dashed] (12,8) -- (16,8) ; 271 271 \draw[dashed] (12,8) -- (16,8) ;
272 272
\end{tikzpicture} 273 273 \end{tikzpicture}
} 274 274 }
\end{center} 275 275 \end{center}
\caption{Shape of the filter transmitted power $P$ as a function of frequency $f$: 276 276 \caption{Shape of the filter transmitted power $P$ as a function of frequency $f$:
the passband is considered to occupy the initial 40\% of the Nyquist frequency range, 277 277 the passband is considered to occupy the initial 40\% of the Nyquist frequency range,
the stopband the last 40\%, allowing 20\% transition width.} 278 278 the stopband the last 40\%, allowing 20\% transition width.}
\label{fig:fir_mag} 279 279 \label{fig:fir_mag}
\end{figure} 280 280 \end{figure}
281 281
In the transition band, the behavior of the filter is left free: we only care about the passband and the stopband characteristics. 282 282 In the transition band, the behavior of the filter is left free: we only care about the passband and the stopband characteristics.
Our initial criterion considered the mean value of the stopband rejection, as shown in figure~\ref{fig:mean_criterion}. This criterion 283 283 Our initial criterion considered the mean value of the stopband rejection, as shown in figure~\ref{fig:mean_criterion}. This criterion
yields unacceptable results since notches overestimate the rejection capability of the filter. Furthermore, the losses within 284 284 yields unacceptable results since notches overestimate the rejection capability of the filter. Furthermore, the losses within
the passband are not considered and might be excessive for excessively wide transition widths introduced for filters with few coefficients. 285 285 the passband are not considered and might be excessive for excessively wide transition widths introduced for filters with few coefficients.
Such biases are compensated for by the second considered criterion which is based on computing the maximum rejection within the stopband minus the mean of the absolute value of passband rejection. With this criterion, the results are significantly improved as shown in figure~\ref{fig:custom_criterion} and meet the expected rejection capability of low pass filters. 286 286 Such biases are compensated for by the second considered criterion which is based on computing the maximum rejection within the stopband minus the mean of the absolute value of passband rejection. With this criterion, the results are significantly improved as shown in figure~\ref{fig:custom_criterion} and meet the expected rejection capability of low pass filters.
287 287
\begin{figure} 288 288 \begin{figure}
\centering 289 289 \centering
\includegraphics[width=\linewidth]{images/colored_mean_criterion} 290 290 \includegraphics[width=\linewidth]{images/colored_mean_criterion}
\caption{Mean stopband rejection criterion comparison between monolithic filter and cascaded filters} 291 291 \caption{Mean stopband rejection criterion comparison between monolithic filter and cascaded filters}
\label{fig:mean_criterion} 292 292 \label{fig:mean_criterion}
\end{figure} 293 293 \end{figure}
294 294
\begin{figure} 295 295 \begin{figure}
\centering 296 296 \centering
\includegraphics[width=\linewidth]{images/colored_custom_criterion} 297 297 \includegraphics[width=\linewidth]{images/colored_custom_criterion}
\caption{Custom criterion (maximum rejection in the stopband minus the mean of the absolute value of the passband rejection) 298 298 \caption{Custom criterion (maximum rejection in the stopband minus the mean of the absolute value of the passband rejection)
comparison between monolithic filter and cascaded filters} 299 299 comparison between monolithic filter and cascaded filters}
\label{fig:custom_criterion} 300 300 \label{fig:custom_criterion}
\end{figure} 301 301 \end{figure}
302 302
Thanks to the latter criterion which will be used in the remainder of this paper, we are able to automatically generate multiple FIR taps 303 303 Thanks to the latter criterion which will be used in the remainder of this paper, we are able to automatically generate multiple FIR taps
and estimate their rejection. Figure~\ref{fig:rejection_pyramid} exhibits the 304 304 and estimate their rejection. Figure~\ref{fig:rejection_pyramid} exhibits the
rejection as a function of the number of coefficients and the number of bits representing these coefficients. 305 305 rejection as a function of the number of coefficients and the number of bits representing these coefficients.
The curve shaped as a pyramid exhibits optimum configuration sets at the vertex where both edges meet. 306 306 The curve shaped as a pyramid exhibits optimum configuration sets at the vertex where both edges meet.
Indeed for a given number of coefficients, increasing the number of bits over the edge will not improve the rejection. 307 307 Indeed for a given number of coefficients, increasing the number of bits over the edge will not improve the rejection.
Conversely, when setting a given number of bits, increasing the number of coefficients will not improve 308 308 Conversely, when setting a given number of bits, increasing the number of coefficients will not improve
the rejection. Hence the best coefficient sets are on the vertex of the pyramid. 309 309 the rejection. Hence the best coefficient sets are on the vertex of the pyramid.
310 310
\begin{figure} 311 311 \begin{figure}
\centering 312 312 \centering
\includegraphics[width=\linewidth]{images/rejection_pyramid} 313 313 \includegraphics[width=\linewidth]{images/rejection_pyramid}
\caption{Rejection as a function of number of coefficients and number of bits} 314 314 \caption{Rejection as a function of number of coefficients and number of bits}
\label{fig:rejection_pyramid} 315 315 \label{fig:rejection_pyramid}
\end{figure} 316 316 \end{figure}
317 317
Although we have an efficient criterion to estimate the rejection of one set of coefficients (taps), 318 318 Although we have an efficient criterion to estimate the rejection of one set of coefficients (taps),
we have a problem when we cascade filters and estimate the criterion as a sum of two or more individual criteria. 319 319 we have a problem when we cascade filters and estimate the criterion as a sum of two or more individual criteria.
If the FIR filter coefficients are the same between the stages, we have: 320 320 If the FIR filter coefficients are the same between the stages, we have:
\[F_{\text{total}} = F_1 + F_2\] 321 321 \[F_{\text{total}} = F_1 + F_2\]
But selecting two different sets of coefficients will yield a more complex situation in which 322 322 But selecting two different sets of coefficients will yield a more complex situation in which
the previous relation is no longer valid as illustrated on figure~\ref{fig:sum_rejection}. The red and blue curves 323 323 the previous relation is no longer valid as illustrated on figure~\ref{fig:sum_rejection}. The red and blue curves
are two different filters with maximums and notches not located at the same frequency offsets. 324 324 are two different filters with maximums and notches not located at the same frequency offsets.
Hence when summing the transfer functions, the resulting rejection shown as the dashed yellow line is improved 325 325 Hence when summing the transfer functions, the resulting rejection shown as the dashed yellow line is improved
with respect to a basic sum of the rejection criteria shown as the dotted yellow line. 326 326 with respect to a basic sum of the rejection criteria shown as the dotted yellow line.
Thus, estimating the rejection of filter cascades is more complex than taking the sum of all the rejection 327 327 Thus, estimating the rejection of filter cascades is more complex than taking the sum of all the rejection
criteria of each filter. However, since this sum underestimates the rejection capability of the cascade, 328 328 criteria of each filter. However, since this sum underestimates the rejection capability of the cascade,
this lower bound is considered as a pessimistic and acceptable criterion for deciding on the suitability 329 329 this lower bound is considered as a pessimistic and acceptable criterion for deciding on the suitability
of the filter cascade to meet design criteria. 330 330 of the filter cascade to meet design criteria.
331 331
\begin{figure} 332 332 \begin{figure}
\centering 333 333 \centering
\includegraphics[width=\linewidth]{images/cascaded_criterion} 334 334 \includegraphics[width=\linewidth]{images/cascaded_criterion}
\caption{Rejection of two cascaded filters} 335 335 \caption{Rejection of two cascaded filters}
\label{fig:sum_rejection} 336 336 \label{fig:sum_rejection}
\end{figure} 337 337 \end{figure}
338 338
Based on this analysis, we address the estimate of resource consumption (called 339 339 Based on this analysis, we address the estimate of resource consumption (called
silicon area -- in the case of FPGAs meaning processing cells) as a function of 340 340 silicon area -- in the case of FPGAs meaning processing cells) as a function of
filter characteristics. As a reminder, we do not aim at matching actual hardware 341 341 filter characteristics. As a reminder, we do not aim at matching actual hardware
configuration but consider an arbitrary silicon area occupied by each processing function, 342 342 configuration but consider an arbitrary silicon area occupied by each processing function,
and will assess after synthesis the agreement of this arbitrary unit with actual 343 343 and will assess after synthesis the agreement of this arbitrary unit with actual
hardware resources provided by FPGA manufacturers. The sum of individual processing 344 344 hardware resources provided by FPGA manufacturers. The sum of individual processing
unit areas is constrained by a total silicon area representative of FPGA global resources. 345 345 unit areas is constrained by a total silicon area representative of FPGA global resources.
Formally, variable $a_i$ is the area taken by filter~$i$ 346 346 Formally, variable $a_i$ is the area taken by filter~$i$
(in arbitrary unit). Variable $r_i$ is the rejection of filter~$i$ (in dB). 347 347 (in arbitrary unit). Variable $r_i$ is the rejection of filter~$i$ (in dB).
Constant $\mathcal{A}$ is the total available area. We model our problem as follows: 348 348 Constant $\mathcal{A}$ is the total available area. We model our problem as follows:
349 349
\begin{align} 350 350 \begin{align}
\text{Maximize } & \sum_{i=1}^n r_i \notag \\ 351 351 \text{Maximize } & \sum_{i=1}^n r_i \notag \\
\sum_{i=1}^n a_i & \leq \mathcal{A} & \label{eq:area} \\ 352 352 \sum_{i=1}^n a_i & \leq \mathcal{A} & \label{eq:area} \\
a_i & = C_i \times (\pi_i^C + \pi_i^-), & \forall i \in [1, n] \label{eq:areadef} \\ 353 353 a_i & = C_i \times (\pi_i^C + \pi_i^-), & \forall i \in [1, n] \label{eq:areadef} \\
r_i & = F(C_i, \pi_i^C), & \forall i \in [1, n] \label{eq:rejectiondef} \\ 354 354 r_i & = F(C_i, \pi_i^C), & \forall i \in [1, n] \label{eq:rejectiondef} \\
\pi_i^+ & = \pi_i^- + \pi_i^C - \pi_i^S, & \forall i \in [1, n] \label{eq:bits} \\ 355 355 \pi_i^+ & = \pi_i^- + \pi_i^C - \pi_i^S, & \forall i \in [1, n] \label{eq:bits} \\
\pi_{i - 1}^+ & = \pi_i^-, & \forall i \in [2, n] \label{eq:inout} \\ 356 356 \pi_{i - 1}^+ & = \pi_i^-, & \forall i \in [2, n] \label{eq:inout} \\
\pi_i^+ & \geq 1 + \sum_{k=1}^{i} \left(1 + \frac{r_k}{6}\right), & \forall i \in [1, n] \label{eq:maxshift} \\ 357 357 \pi_i^+ & \geq 1 + \sum_{k=1}^{i} \left(1 + \frac{r_k}{6}\right), & \forall i \in [1, n] \label{eq:maxshift} \\
\pi_1^- &= \Pi^I \label{eq:init} 358 358 \pi_1^- &= \Pi^I \label{eq:init}
\end{align} 359 359 \end{align}
360 360
Equation~\ref{eq:area} states that the total area taken by the filters must be 361 361 Equation~\ref{eq:area} states that the total area taken by the filters must be
less than the available area. Equation~\ref{eq:areadef} gives the definition of 362 362 less than the available area. Equation~\ref{eq:areadef} gives the definition of
the area used by a filter, considered as the area of the FIR since the Shifter is 363 363 the area used by a filter, considered as the area of the FIR since the Shifter is
assumed not to require significant resources. We consider that the FIR needs $C_i$ registers of size 364 364 assumed not to require significant resources. We consider that the FIR needs $C_i$ registers of size
$\pi_i^C + \pi_i^-$~bits to store the results of the multiplications of the 365 365 $\pi_i^C + \pi_i^-$~bits to store the results of the multiplications of the
input data with the coefficients. Equation~\ref{eq:rejectiondef} gives the 366 366 input data with the coefficients. Equation~\ref{eq:rejectiondef} gives the
definition of the rejection of the filter thanks to the tabulated function~$F$ that we defined 367 367 definition of the rejection of the filter thanks to the tabulated function~$F$ that we defined
previously. The Shifter does not introduce negative rejection as we will explain later, 368 368 previously. The Shifter does not introduce negative rejection as we will explain later,
so the rejection only comes from the FIR. Equation~\ref{eq:bits} states the 369 369 so the rejection only comes from the FIR. Equation~\ref{eq:bits} states the
relation between $\pi_i^+$ and $\pi_i^-$. The multiplications in the FIR add 370 370 relation between $\pi_i^+$ and $\pi_i^-$. The multiplications in the FIR add
$\pi_i^C$ bits as most coefficients are close to zero, and the Shifter removes 371 371 $\pi_i^C$ bits as most coefficients are close to zero, and the Shifter removes
$\pi_i^S$ bits. Equation~\ref{eq:inout} states that the output number of bits of 372 372 $\pi_i^S$ bits. Equation~\ref{eq:inout} states that the output number of bits of
a filter is the same as the input number of bits of the next filter. 373 373 a filter is the same as the input number of bits of the next filter.
Equation~\ref{eq:maxshift} ensures that the Shifter does not introduce negative 374 374 Equation~\ref{eq:maxshift} ensures that the Shifter does not introduce negative
rejection. Indeed, the results of the FIR can be right shifted without compromising 375 375 rejection. Indeed, the results of the FIR can be right shifted without compromising
the quality of the rejection until a threshold. Each bit of the output data 376 376 the quality of the rejection until a threshold. Each bit of the output data
increases the maximum rejection level by 6~dB. We add one to take the sign bit 377 377 increases the maximum rejection level by 6~dB. We add one to take the sign bit
into account. If equation~\ref{eq:maxshift} was not present, the Shifter could 378 378 into account. If equation~\ref{eq:maxshift} was not present, the Shifter could
shift too much and introduce some noise in the output data. Each supplementary 379 379 shift too much and introduce some noise in the output data. Each supplementary
shift bit would cause an additional 6~dB rejection rise. A totally equivalent equation is: 380 380 shift bit would cause an additional 6~dB rejection rise. A totally equivalent equation is:
$\pi_i^S \leq \pi_i^- + \pi_i^C - 1 - \sum_{k=1}^{i} \left(1 + \frac{r_k}{6}\right)$. 381 381 $\pi_i^S \leq \pi_i^- + \pi_i^C - 1 - \sum_{k=1}^{i} \left(1 + \frac{r_k}{6}\right)$.
Finally, equation~\ref{eq:init} gives the number of bits of the global input. 382 382 Finally, equation~\ref{eq:init} gives the number of bits of the global input.
383 383
This model is non-linear and even non-quadratic, as $F$ does not have a known 384 384 This model is non-linear and even non-quadratic, as $F$ does not have a known
linear or quadratic expression. We introduce $p$ FIR configurations 385 385 linear or quadratic expression. We introduce $p$ FIR configurations
$(C_{ij}, \pi_{ij}^C), 1 \leq j \leq p$ that are constants. We define binary 386 386 $(C_{ij}, \pi_{ij}^C), 1 \leq j \leq p$ that are constants. We define binary
variable $\delta_{ij}$ that has value 1 if stage~$i$ is in configuration~$j$ 387 387 variable $\delta_{ij}$ that has value 1 if stage~$i$ is in configuration~$j$
and 0 otherwise. The new equations are as follows: 388 388 and 0 otherwise. The new equations are as follows:
389 389
\begin{align} 390 390 \begin{align}
a_i & = \sum_{j=1}^p \delta_{ij} \times C_{ij} \times (\pi_{ij}^C + \pi_i^-), & \forall i \in [1, n] \label{eq:areadef2} \\ 391 391 a_i & = \sum_{j=1}^p \delta_{ij} \times C_{ij} \times (\pi_{ij}^C + \pi_i^-), & \forall i \in [1, n] \label{eq:areadef2} \\
r_i & = \sum_{j=1}^p \delta_{ij} \times F(C_{ij}, \pi_{ij}^C), & \forall i \in [1, n] \label{eq:rejectiondef2} \\ 392 392 r_i & = \sum_{j=1}^p \delta_{ij} \times F(C_{ij}, \pi_{ij}^C), & \forall i \in [1, n] \label{eq:rejectiondef2} \\
\pi_i^+ & = \pi_i^- + \left(\sum_{j=1}^p \delta_{ij} \pi_{ij}^C\right) - \pi_i^S, & \forall i \in [1, n] \label{eq:bits2} \\ 393 393 \pi_i^+ & = \pi_i^- + \left(\sum_{j=1}^p \delta_{ij} \pi_{ij}^C\right) - \pi_i^S, & \forall i \in [1, n] \label{eq:bits2} \\
\sum_{j=1}^p \delta_{ij} & \leq 1, & \forall i \in [1, n] \label{eq:config} 394 394 \sum_{j=1}^p \delta_{ij} & \leq 1, & \forall i \in [1, n] \label{eq:config}
\end{align} 395 395 \end{align}
396 396
Equations \ref{eq:areadef2}, \ref{eq:rejectiondef2} and \ref{eq:bits2} replace 397 397 Equations \ref{eq:areadef2}, \ref{eq:rejectiondef2} and \ref{eq:bits2} replace
respectively equations \ref{eq:areadef}, \ref{eq:rejectiondef} and \ref{eq:bits}. 398 398 respectively equations \ref{eq:areadef}, \ref{eq:rejectiondef} and \ref{eq:bits}.
Equation~\ref{eq:config} states that for each stage, a single configuration is chosen at most. 399 399 Equation~\ref{eq:config} states that for each stage, a single configuration is chosen at most.
400 400
This modified model is quadratic, and it can be linearized if necessary. The Gurobi 401 401 This modified model is quadratic, and it can be linearized if necessary. The Gurobi
(\url{www.gurobi.com}) optimization software is used to solve this quadratic 402 402 (\url{www.gurobi.com}) optimization software is used to solve this quadratic
model, and since Gurobi is able to linearize, the model is left as is. This model 403 403 model, and since Gurobi is able to linearize, the model is left as is. This model
has $O(np)$ variables and $O(n)$ constraints. 404 404 has $O(np)$ variables and $O(n)$ constraints.
405 405
Two problems will be addressed using the workflow described in the next section: on the one 406 406 Two problems will be addressed using the workflow described in the next section: on the one
hand maximizing the rejection capability of a set of cascaded filters occupying a fixed arbitrary 407 407 hand maximizing the rejection capability of a set of cascaded filters occupying a fixed arbitrary
silicon area (section~\ref{sec:fixed_area}) and on the other hand the dual problem of minimizing the silicon area 408 408 silicon area (section~\ref{sec:fixed_area}) and on the other hand the dual problem of minimizing the silicon area
for a fixed rejection criterion (section~\ref{sec:fixed_rej}). In the latter case, the 409 409 for a fixed rejection criterion (section~\ref{sec:fixed_rej}). In the latter case, the
objective function is replaced with: 410 410 objective function is replaced with:
\begin{align} 411 411 \begin{align}
\text{Minimize } & \sum_{i=1}^n a_i \notag 412 412 \text{Minimize } & \sum_{i=1}^n a_i \notag
\end{align} 413 413 \end{align}
We adapt the constraints of our quadratic program by replacing equation~\ref{eq:area} 414 414 We adapt the constraints of our quadratic program by replacing equation~\ref{eq:area}
with equation~\ref{eq:rejection_min} where $\mathcal{R}$ is the minimal 415 415 with equation~\ref{eq:rejection_min} where $\mathcal{R}$ is the minimal
rejection required. 416 416 rejection required.
417 417
\begin{align} 418 418 \begin{align}
\sum_{i=1}^n r_i & \geq \mathcal{R} & \label{eq:rejection_min} 419 419 \sum_{i=1}^n r_i & \geq \mathcal{R} & \label{eq:rejection_min}
\end{align} 420 420 \end{align}
421 421
\section{Design workflow} 422 422 \section{Design workflow}
\label{sec:workflow} 423 423 \label{sec:workflow}
424 424
In this section, we describe the workflow to compute all the results presented in sections~\ref{sec:fixed_area} 425 425 In this section, we describe the workflow to compute all the results presented in sections~\ref{sec:fixed_area}
and \ref{sec:fixed_rej}. Figure~\ref{fig:workflow} shows the global workflow and the different steps involved 426 426 and \ref{sec:fixed_rej}. Figure~\ref{fig:workflow} shows the global workflow and the different steps involved
in the computation of the results. 427 427 in the computation of the results.
428 428
\begin{figure} 429 429 \begin{figure}
\centering 430 430 \centering
\begin{tikzpicture}[node distance=0.75cm and 2cm] 431 431 \begin{tikzpicture}[node distance=0.75cm and 2cm]
\node[draw,minimum size=1cm] (Solver) { Filter Solver } ; 432 432 \node[draw,minimum size=1cm] (Solver) { Filter Solver } ;
\node (Start) [left= 3cm of Solver] { } ; 433 433 \node (Start) [left= 3cm of Solver] { } ;
\node[draw,minimum size=1cm] (TCL) [right= of Solver] { TCL Script } ; 434 434 \node[draw,minimum size=1cm] (TCL) [right= of Solver] { TCL Script } ;
\node (Input) [above= of TCL] { } ; 435 435 \node (Input) [above= of TCL] { } ;
\node[draw,minimum size=1cm] (Deploy) [below= of Solver] { Deploy Script } ; 436 436 \node[draw,minimum size=1cm] (Deploy) [below= of Solver] { Deploy Script } ;
\node[draw,minimum size=1cm] (Bitstream) [below= of TCL] { Bitstream } ; 437 437 \node[draw,minimum size=1cm] (Bitstream) [below= of TCL] { Bitstream } ;
\node[draw,minimum size=1cm,rounded corners] (Board) [below right= of Deploy] { Board } ; 438 438 \node[draw,minimum size=1cm,rounded corners] (Board) [below right= of Deploy] { Board } ;
\node[draw,minimum size=1cm] (Postproc) [below= of Deploy] { Post-Processing } ; 439 439 \node[draw,minimum size=1cm] (Postproc) [below= of Deploy] { Post-Processing } ;
\node (Results) [left= of Postproc] { } ; 440 440 \node (Results) [left= of Postproc] { } ;
441 441
\draw[->] (Start) edge node [above] { $\mathcal{A}, n, \Pi^I$ } node [below] { $(C_{ij}, \pi_{ij}^C), F$ } (Solver) ; 442 442 \draw[->] (Start) edge node [above] { $\mathcal{A}, n, \Pi^I$ } node [below] { $(C_{ij}, \pi_{ij}^C), F$ } (Solver) ;
\draw[->] (Input) edge node [left] { ADC or PRN } (TCL) ; 443 443 \draw[->] (Input) edge node [left] { ADC or PRN } (TCL) ;
\draw[->] (Solver) edge node [below] { (1a) } (TCL) ; 444 444 \draw[->] (Solver) edge node [below] { (1a) } (TCL) ;
\draw[->] (Solver) edge node [right] { (1b) } (Deploy) ; 445 445 \draw[->] (Solver) edge node [right] { (1b) } (Deploy) ;
\draw[->] (TCL) edge node [left] { (2) } (Bitstream) ; 446 446 \draw[->] (TCL) edge node [left] { (2) } (Bitstream) ;
\draw[->,dashed] (Bitstream) -- (Deploy) ; 447 447 \draw[->,dashed] (Bitstream) -- (Deploy) ;
\draw[->] (Deploy) to[out=-30,in=120] node [above] { (3) } (Board) ; 448 448 \draw[->] (Deploy) to[out=-30,in=120] node [above] { (3) } (Board) ;
\draw[->] (Board) to[out=150,in=-60] node [below] { (4) } (Deploy) ; 449 449 \draw[->] (Board) to[out=150,in=-60] node [below] { (4) } (Deploy) ;
\draw[->] (Deploy) edge node [left] { (5) } (Postproc) ; 450 450 \draw[->] (Deploy) edge node [left] { (5) } (Postproc) ;
\draw[->] (Postproc) -- (Results) ; 451 451 \draw[->] (Postproc) -- (Results) ;
\end{tikzpicture} 452 452 \end{tikzpicture}
\caption{Design workflow from the input parameters to the results} 453 453 \caption{Design workflow from the input parameters to the results}
\label{fig:workflow} 454 454 \label{fig:workflow}
\end{figure} 455 455 \end{figure}
456 456
The filter solver is a C++ program that takes as input the maximum area 457 457 The filter solver is a C++ program that takes as input the maximum area
$\mathcal{A}$, the number of stages $n$, the size of the input signal $\Pi^I$, 458 458 $\mathcal{A}$, the number of stages $n$, the size of the input signal $\Pi^I$,
the FIR configurations $(C_{ij}, \pi_{ij}^C)$ and the function $F$. It creates 459 459 the FIR configurations $(C_{ij}, \pi_{ij}^C)$ and the function $F$. It creates
the quadratic programs and uses the Gurobi solver to estimate the optimal results. 460 460 the quadratic programs and uses the Gurobi solver to estimate the optimal results.
Then it produces two scripts: a TCL script ((1a) on figure~\ref{fig:workflow}) 461 461 Then it produces two scripts: a TCL script ((1a) on figure~\ref{fig:workflow})
and a deploy script ((1b) on figure~\ref{fig:workflow}). 462 462 and a deploy script ((1b) on figure~\ref{fig:workflow}).
463 463
The TCL script describes the whole digital processing chain from the beginning 464 464 The TCL script describes the whole digital processing chain from the beginning
(the raw signal data) to the end (the filtered data) in a language compatible 465 465 (the raw signal data) to the end (the filtered data) in a language compatible
with proprietary synthesis software, namely Vivado for Xilinx and Quartus for 466 466 with proprietary synthesis software, namely Vivado for Xilinx and Quartus for
Intel/Altera. The raw input data is generated from a 20-bit Pseudo Random Number (PRN) 467 467 Intel/Altera. The raw input data is generated from a 20-bit Pseudo Random Number (PRN)
generator inside the FPGA and $\Pi^I$ is fixed at 16~bits. 468 468 generator inside the FPGA and $\Pi^I$ is fixed at 16~bits.
Then the script builds each stage of the chain with a generic FIR task that 469 469 Then the script builds each stage of the chain with a generic FIR task that
comes from a skeleton library. The generic FIR is highly configurable 470 470 comes from a skeleton library. The generic FIR is highly configurable
with the number of coefficients and the size of the coefficients. The coefficients 471 471 with the number of coefficients and the size of the coefficients. The coefficients
themselves are not stored in the script. 472 472 themselves are not stored in the script.
As the signal is processed in real-time, the output signal is stored as 473 473 As the signal is processed in real-time, the output signal is stored as
consecutive bursts of data for post-processing, mainly assessing the consistency of the 474 474 consecutive bursts of data for post-processing, mainly assessing the consistency of the
implemented FIR cascade transfer function with the design criteria and the expected 475 475 implemented FIR cascade transfer function with the design criteria and the expected
transfer function. 476 476 transfer function.
477 477
The TCL script is used by Vivado to produce the FPGA bitstream ((2) on figure~\ref{fig:workflow}). 478 478 The TCL script is used by Vivado to produce the FPGA bitstream ((2) on figure~\ref{fig:workflow}).
We use the 2018.2 version of Xilinx Vivado and we execute the synthesized 479 479 We use the 2018.2 version of Xilinx Vivado and we execute the synthesized
bitstream on a Redpitaya board fitted with a Xilinx Zynq-7010 series 480 480 bitstream on a Redpitaya board fitted with a Xilinx Zynq-7010 series
FPGA (xc7z010clg400-1) and two LTC2145 14-bit 125~MS/s ADCs, loaded with 50~$\Omega$ resistors to 481 481 FPGA (xc7z010clg400-1) and two LTC2145 14-bit 125~MS/s ADCs, loaded with 50~$\Omega$ resistors to
provide a broadband noise source. 482 482 provide a broadband noise source.
The board runs the Linux kernel and surrounding environment produced from the 483 483 The board runs the Linux kernel and surrounding environment produced from the
Buildroot framework available at \url{https://github.com/trabucayre/redpitaya/}: configuring 484 484 Buildroot framework available at \url{https://github.com/trabucayre/redpitaya/}: configuring
the Zynq FPGA, feeding the FIR with the set of coefficients, executing the simulation and 485 485 the Zynq FPGA, feeding the FIR with the set of coefficients, executing the simulation and
fetching the results are automated. 486 486 fetching the results are automated.
487 487
The deploy script uploads the bitstream to the board ((3) on 488 488 The deploy script uploads the bitstream to the board ((3) on
figure~\ref{fig:workflow}), flashes the FPGA, loads the different drivers, 489 489 figure~\ref{fig:workflow}), flashes the FPGA, loads the different drivers,
and configures the coefficients of the FIR filters. It then waits for the results 490 490 and configures the coefficients of the FIR filters. It then waits for the results
and retrieves the data to the main computer ((4) on figure~\ref{fig:workflow}). 491 491 and retrieves the data to the main computer ((4) on figure~\ref{fig:workflow}).
492 492
Finally, an Octave post-processing script computes the final results thanks to 493 493 Finally, an Octave post-processing script computes the final results thanks to
the output data ((5) on figure~\ref{fig:workflow}). 494 494 the output data ((5) on figure~\ref{fig:workflow}).
The results are normalized so that the Power Spectral Density (PSD) starts at zero 495 495 The results are normalized so that the Power Spectral Density (PSD) starts at zero
and the different configurations can be compared. 496 496 and the different configurations can be compared.
497 497
\section{Maximizing the rejection at fixed silicon area} 498 498 \section{Maximizing the rejection at fixed silicon area}
\label{sec:fixed_area} 499 499 \label{sec:fixed_area}
This section presents the output of the filter solver {\em i.e.} the computed 500 500 This section presents the output of the filter solver {\em i.e.} the computed
configurations for each stage, the computed rejection and the computed silicon area. 501 501 configurations for each stage, the computed rejection and the computed silicon area.
Such results allow for understanding the choices made by the solver to compute its solutions. 502 502 Such results allow for understanding the choices made by the solver to compute its solutions.
503 503
The experimental setup is composed of three cases. The raw input is generated 504 504 The experimental setup is composed of three cases. The raw input is generated
by a Pseudo Random Number (PRN) generator, which fixes the input data size $\Pi^I$. 505 505 by a Pseudo Random Number (PRN) generator, which fixes the input data size $\Pi^I$.
Then the total silicon area $\mathcal{A}$ has been fixed to either 500, 1000 or 1500 506 506 Then the total silicon area $\mathcal{A}$ has been fixed to either 500, 1000 or 1500
arbitrary units. Hence, the three cases have been named: MAX/500, MAX/1000, MAX/1500. 507 507 arbitrary units. Hence, the three cases have been named: MAX/500, MAX/1000, MAX/1500.
The number of configurations $p$ is 1827, with $C_i$ ranging from 3 to 60 and $\pi^C$ 508 508 The number of configurations $p$ is 1827, with $C_i$ ranging from 3 to 60 and $\pi^C$
ranging from 2 to 22. In each case, the quadratic program has been able to give a 509 509 ranging from 2 to 22. In each case, the quadratic program has been able to give a
result up to five stages ($n = 5$) in the cascaded filter. 510 510 result up to five stages ($n = 5$) in the cascaded filter.
511 511
Table~\ref{tbl:gurobi_max_500} shows the results obtained by the filter solver for MAX/500. 512 512 Table~\ref{tbl:gurobi_max_500} shows the results obtained by the filter solver for MAX/500.
Table~\ref{tbl:gurobi_max_1000} shows the results obtained by the filter solver for MAX/1000. 513 513 Table~\ref{tbl:gurobi_max_1000} shows the results obtained by the filter solver for MAX/1000.
Table~\ref{tbl:gurobi_max_1500} shows the results obtained by the filter solver for MAX/1500. 514 514 Table~\ref{tbl:gurobi_max_1500} shows the results obtained by the filter solver for MAX/1500.
515 515
\renewcommand{\arraystretch}{1.4} 516 516 \renewcommand{\arraystretch}{1.4}
517 517
\begin{table} 518 518 \begin{table}
\caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MAX/500} 519 519 \caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MAX/500}
\label{tbl:gurobi_max_500} 520 520 \label{tbl:gurobi_max_500}
\centering 521 521 \centering
{\scalefont{0.77} 522 522 {\scalefont{0.77}
\begin{tabular}{|c|ccccc|c|c|} 523 523 \begin{tabular}{|c|ccccc|c|c|}
\hline 524 524 \hline
$n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\ 525 525 $n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\
\hline 526 526 \hline
1 & (21, 7, 0) & - & - & - & - & 32~dB & 483 \\ 527 527 1 & (21, 7, 0) & - & - & - & - & 32~dB & 483 \\
2 & (3, 3, 15) & (31, 9, 0) & - & - & - & 58~dB & 460 \\ 528 528 2 & (3, 3, 15) & (31, 9, 0) & - & - & - & 58~dB & 460 \\
3 & (3, 3, 15) & (27, 9, 0) & (5, 3, 0) & - & - & 66~dB & 488 \\ 529 529 3 & (3, 3, 15) & (27, 9, 0) & (5, 3, 0) & - & - & 66~dB & 488 \\
4 & (3, 3, 15) & (19, 7, 0) & (11, 5, 0) & (3, 3, 0) & - & 74~dB & 499 \\ 530 530 4 & (3, 3, 15) & (19, 7, 0) & (11, 5, 0) & (3, 3, 0) & - & 74~dB & 499 \\
5 & (3, 3, 15) & (23, 8, 0) & (3, 3, 1) & (3, 3, 0) & (3, 3, 0) & 78~dB & 489 \\ 531 531 5 & (3, 3, 15) & (23, 8, 0) & (3, 3, 1) & (3, 3, 0) & (3, 3, 0) & 78~dB & 489 \\
\hline 532 532 \hline
\end{tabular} 533 533 \end{tabular}
} 534 534 }
\end{table} 535 535 \end{table}
536 536
\begin{table} 537 537 \begin{table}
\caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MAX/1000} 538 538 \caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MAX/1000}
\label{tbl:gurobi_max_1000} 539 539 \label{tbl:gurobi_max_1000}
\centering 540 540 \centering
{\scalefont{0.77} 541 541 {\scalefont{0.77}
\begin{tabular}{|c|ccccc|c|c|} 542 542 \begin{tabular}{|c|ccccc|c|c|}
\hline 543 543 \hline
$n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\ 544 544 $n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\
\hline 545 545 \hline
1 & (37, 11, 0) & - & - & - & - & 56~dB & 999 \\ 546 546 1 & (37, 11, 0) & - & - & - & - & 56~dB & 999 \\
2 & (3, 3, 15) & (51, 14, 0) & - & - & - & 87~dB & 975 \\ 547 547 2 & (3, 3, 15) & (51, 14, 0) & - & - & - & 87~dB & 975 \\
3 & (3, 3, 15) & (35, 11, 0) & (19, 7, 0) & - & - & 99~dB & 1000 \\ 548 548 3 & (3, 3, 15) & (35, 11, 0) & (19, 7, 0) & - & - & 99~dB & 1000 \\
4 & (3, 4, 16) & (27, 8, 0) & (19, 7, 1) & (11, 5, 0) & - & 103~dB & 998 \\ 549 549 4 & (3, 4, 16) & (27, 8, 0) & (19, 7, 1) & (11, 5, 0) & - & 103~dB & 998 \\
5 & (3, 3, 15) & (31, 9, 0) & (19, 7, 0) & (3, 3, 1) & (3, 3, 0) & 111~dB & 984 \\ 550 550 5 & (3, 3, 15) & (31, 9, 0) & (19, 7, 0) & (3, 3, 1) & (3, 3, 0) & 111~dB & 984 \\
\hline 551 551 \hline
\end{tabular} 552 552 \end{tabular}
} 553 553 }
\end{table} 554 554 \end{table}
555 555
\begin{table} 556 556 \begin{table}
\caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MAX/1500} 557 557 \caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MAX/1500}
\label{tbl:gurobi_max_1500} 558 558 \label{tbl:gurobi_max_1500}
\centering 559 559 \centering
{\scalefont{0.77} 560 560 {\scalefont{0.77}
\begin{tabular}{|c|ccccc|c|c|} 561 561 \begin{tabular}{|c|ccccc|c|c|}
\hline 562 562 \hline
$n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\ 563 563 $n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\
\hline 564 564 \hline
1 & (47, 15, 0) & - & - & - & - & 71~dB & 1457 \\ 565 565 1 & (47, 15, 0) & - & - & - & - & 71~dB & 1457 \\
2 & (19, 6, 15) & (51, 14, 0) & - & - & - & 103~dB & 1489 \\ 566 566 2 & (19, 6, 15) & (51, 14, 0) & - & - & - & 103~dB & 1489 \\
3 & (3, 3, 15) & (35, 11, 0) & (35, 11, 0) & - & - & 122~dB & 1492 \\ 567 567 3 & (3, 3, 15) & (35, 11, 0) & (35, 11, 0) & - & - & 122~dB & 1492 \\
4 & (3, 3, 15) & (27, 8, 0) & (19, 7, 0) & (27, 9, 0) & - & 129~dB & 1498 \\ 568 568 4 & (3, 3, 15) & (27, 8, 0) & (19, 7, 0) & (27, 9, 0) & - & 129~dB & 1498 \\
5 & (3, 3, 15) & (23, 9, 2) & (27, 9, 0) & (19, 7, 0) & (3, 3, 0) & 136~dB & 1499 \\ 569 569 5 & (3, 3, 15) & (23, 9, 2) & (27, 9, 0) & (19, 7, 0) & (3, 3, 0) & 136~dB & 1499 \\
\hline 570 570 \hline
\end{tabular} 571 571 \end{tabular}
} 572 572 }
\end{table} 573 573 \end{table}
574 574
\renewcommand{\arraystretch}{1} 575 575 \renewcommand{\arraystretch}{1}
576 576
From these tables, we can first state that the more stages are used to define 577 577 From these tables, we can first state that the more stages are used to define
the cascaded FIR filters, the better the rejection. It was an expected result as it has 578 578 the cascaded FIR filters, the better the rejection. It was an expected result as it has
been previously observed that many small filters are better than 579 579 been previously observed that many small filters are better than
a single large filter \cite{lim_1988, lim_1996, young_1992}, despite such conclusions 580 580 a single large filter \cite{lim_1988, lim_1996, young_1992}, despite such conclusions
being hardly used in practice due to the lack of tools for identifying individual filter 581 581 being hardly used in practice due to the lack of tools for identifying individual filter
coefficients in the cascaded approach. 582 582 coefficients in the cascaded approach.
583 583
Second, the larger the silicon area, the better the rejection. This was also an 584 584 Second, the larger the silicon area, the better the rejection. This was also an
expected result as more area means a filter of better quality with more coefficients 585 585 expected result as more area means a filter of better quality with more coefficients
or more bits per coefficient. 586 586 or more bits per coefficient.
587 587
Then, we also observe that the first stage can have a larger shift than the other 588 588 Then, we also observe that the first stage can have a larger shift than the other
stages. This is explained by the fact that the solver tries to use just enough 589 589 stages. This is explained by the fact that the solver tries to use just enough
bits for the computed rejection after each stage. In the first stage, a 590 590 bits for the computed rejection after each stage. In the first stage, a
balance between a strong rejection and a low number of bits is targeted. Equation~\ref{eq:maxshift} 591 591 balance between a strong rejection and a low number of bits is targeted. Equation~\ref{eq:maxshift}
gives the relation between both values. 592 592 gives the relation between both values.
593 593
Finally, we note that the solver consumes all the given silicon area. 594 594 Finally, we note that the solver consumes all the given silicon area.
595 595
The following graphs present the rejection for real data on the FPGA. In all the following 596 596 The following graphs present the rejection for real data on the FPGA. In all the following
figures, the solid line represents the actual rejection of the filtered 597 597 figures, the solid line represents the actual rejection of the filtered
data on the FPGA as measured experimentally and the dashed lines are the noise levels 598 598 data on the FPGA as measured experimentally and the dashed lines are the noise levels
given by the quadratic solver. The configurations are those computed in the previous section. 599 599 given by the quadratic solver. The configurations are those computed in the previous section.
600 600
Figure~\ref{fig:max_500_result} shows the rejection of the different configurations in the case of MAX/500. 601 601 Figure~\ref{fig:max_500_result} shows the rejection of the different configurations in the case of MAX/500.
Figure~\ref{fig:max_1000_result} shows the rejection of the different configurations in the case of MAX/1000. 602 602 Figure~\ref{fig:max_1000_result} shows the rejection of the different configurations in the case of MAX/1000.
Figure~\ref{fig:max_1500_result} shows the rejection of the different configurations in the case of MAX/1500. 603 603 Figure~\ref{fig:max_1500_result} shows the rejection of the different configurations in the case of MAX/1500.
604 604
\begin{figure} 605 605 \begin{figure}
\centering 606 606 \centering
\includegraphics[width=\linewidth]{images/max_500} 607 607 \includegraphics[width=\linewidth]{images/max_500}
\caption{Signal spectrum for MAX/500} 608 608 \caption{Signal spectrum for MAX/500}
\label{fig:max_500_result} 609 609 \label{fig:max_500_result}
\end{figure} 610 610 \end{figure}
611 611
\begin{figure} 612 612 \begin{figure}
\centering 613 613 \centering
\includegraphics[width=\linewidth]{images/max_1000} 614 614 \includegraphics[width=\linewidth]{images/max_1000}
\caption{Signal spectrum for MAX/1000} 615 615 \caption{Signal spectrum for MAX/1000}
\label{fig:max_1000_result} 616 616 \label{fig:max_1000_result}
\end{figure} 617 617 \end{figure}
618 618
\begin{figure} 619 619 \begin{figure}
\centering 620 620 \centering
\includegraphics[width=\linewidth]{images/max_1500} 621 621 \includegraphics[width=\linewidth]{images/max_1500}
\caption{Signal spectrum for MAX/1500} 622 622 \caption{Signal spectrum for MAX/1500}
\label{fig:max_1500_result} 623 623 \label{fig:max_1500_result}
\end{figure} 624 624 \end{figure}
625 625
In all cases, we observe that the actual rejection is close to the rejection computed by the solver. 626 626 In all cases, we observe that the actual rejection is close to the rejection computed by the solver.
627 627
We compare the actual silicon resources given by Vivado to the 628 628 We compare the actual silicon resources given by Vivado to the
resources in arbitrary units. 629 629 resources in arbitrary units.
The goal is to check that our arbitrary units of silicon area model well enough 630 630 The goal is to check that our arbitrary units of silicon area model well enough
the real resources on the FPGA. Especially we want to verify that, for a given 631 631 the real resources on the FPGA. Especially we want to verify that, for a given
number of arbitrary units, the actual silicon resources do not depend on the 632 632 number of arbitrary units, the actual silicon resources do not depend on the
number of stages $n$. Most significantly, our approach aims 633 633 number of stages $n$. Most significantly, our approach aims
at remaining far enough from the practical logic gate implementation used by 634 634 at remaining far enough from the practical logic gate implementation used by
various vendors to remain platform independent and be portable from one 635 635 various vendors to remain platform independent and be portable from one
architecture to another. 636 636 architecture to another.
637 637
Table~\ref{tbl:resources_usage} shows the resources usage in the case of MAX/500, MAX/1000 and 638 638 Table~\ref{tbl:resources_usage} shows the resources usage in the case of MAX/500, MAX/1000 and
MAX/1500 \emph{i.e.} when the maximum allowed silicon area is fixed to 500, 1000 639 639 MAX/1500 \emph{i.e.} when the maximum allowed silicon area is fixed to 500, 1000
and 1500 arbitrary units. We have taken care to extract solely the resources used by 640 640 and 1500 arbitrary units. We have taken care to extract solely the resources used by
the FIR filters and remove additional processing blocks including FIFO and Programmable 641 641 the FIR filters and remove additional processing blocks including FIFO and Programmable
Logic (PL -- FPGA) to Processing System (PS -- general purpose processor) communication. 642 642 Logic (PL -- FPGA) to Processing System (PS -- general purpose processor) communication.
643 643
\begin{table}[h!tb] 644 644 \begin{table}[h!tb]
\caption{Resource occupation. The last column refers to available resources on a Zynq-7010 as found on the Redpitaya.} 645 645 \caption{Resource occupation. The last column refers to available resources on a Zynq-7010 as found on the Redpitaya.}
\label{tbl:resources_usage} 646 646 \label{tbl:resources_usage}
\centering 647 647 \centering
\begin{tabular}{|c|c|ccc|c|} 648 648 \begin{tabular}{|c|c|ccc|c|}
\hline 649 649 \hline
$n$ & & MAX/500 & MAX/1000 & MAX/1500 & \emph{Zynq 7010} \\ \hline\hline 650 650 $n$ & & MAX/500 & MAX/1000 & MAX/1500 & \emph{Zynq 7010} \\ \hline\hline
& LUT & 249 & 453 & 627 & \emph{17600} \\ 651 651 & LUT & 249 & 453 & 627 & \emph{17600} \\
1 & BRAM & 1 & 1 & 1 & \emph{120} \\ 652 652 1 & BRAM & 1 & 1 & 1 & \emph{120} \\
& DSP & 21 & 37 & 47 & \emph{80} \\ \hline 653 653 & DSP & 21 & 37 & 47 & \emph{80} \\ \hline
& LUT & 2374 & 5494 & 691 & \emph{17600} \\ 654 654 & LUT & 2374 & 5494 & 691 & \emph{17600} \\
2 & BRAM & 2 & 2 & 2 & \emph{120} \\ 655 655 2 & BRAM & 2 & 2 & 2 & \emph{120} \\
& DSP & 0 & 0 & 70 & \emph{80} \\ \hline 656 656 & DSP & 0 & 0 & 70 & \emph{80} \\ \hline
& LUT & 2443 & 3304 & 3521 & \emph{17600} \\ 657 657 & LUT & 2443 & 3304 & 3521 & \emph{17600} \\
3 & BRAM & 3 & 3 & 3 & \emph{120} \\ 658 658 3 & BRAM & 3 & 3 & 3 & \emph{120} \\
& DSP & 0 & 19 & 35 & \emph{80} \\ \hline 659 659 & DSP & 0 & 19 & 35 & \emph{80} \\ \hline
& LUT & 2634 & 3753 & 2557 & \emph{17600} \\ 660 660 & LUT & 2634 & 3753 & 2557 & \emph{17600} \\
4 & BRAM & 4 & 4 & 4 & \emph{120} \\ 661 661 4 & BRAM & 4 & 4 & 4 & \emph{120} \\
& DSP & 0 & 19 & 46 & \emph{80} \\ \hline 662 662 & DSP & 0 & 19 & 46 & \emph{80} \\ \hline
& LUT & 2423 & 3047 & 2847 & \emph{17600} \\ 663 663 & LUT & 2423 & 3047 & 2847 & \emph{17600} \\
5 & BRAM & 5 & 5 & 5 & \emph{120} \\ 664 664 5 & BRAM & 5 & 5 & 5 & \emph{120} \\
& DSP & 0 & 22 & 46 & \emph{80} \\ \hline 665 665 & DSP & 0 & 22 & 46 & \emph{80} \\ \hline
\end{tabular} 666 666 \end{tabular}
\end{table} 667 667 \end{table}
668 668
In some cases, Vivado replaces the DSPs by Look Up Tables (LUTs). We assume that, 669 669 In some cases, Vivado replaces the DSPs by Look Up Tables (LUTs). We assume that,
when the filter coefficients are small enough, or when the input size is small 670 670 when the filter coefficients are small enough, or when the input size is small
enough, Vivado optimizes resource consumption by selecting multiplexers to 671 671 enough, Vivado optimizes resource consumption by selecting multiplexers to
implement the multiplications instead of a DSP. In this case, it is quite difficult 672 672 implement the multiplications instead of a DSP. In this case, it is quite difficult
to compare the whole silicon budget. 673 673 to compare the whole silicon budget.
674 674
However, a rough estimation can be made with a simple equivalence: looking at 675 675 However, a rough estimation can be made with a simple equivalence: looking at
the first column (MAX/500), where the number of LUTs is quite stable for $n \geq 2$, 676 676 the first column (MAX/500), where the number of LUTs is quite stable for $n \geq 2$,
we can deduce that a DSP is roughly equivalent to 100~LUTs in terms of silicon 677 677 we can deduce that a DSP is roughly equivalent to 100~LUTs in terms of silicon
area use. With this equivalence, our 500 arbitrary units correspond to 2500 LUTs, 678 678 area use. With this equivalence, our 500 arbitrary units correspond to 2500 LUTs,
1000 arbitrary units correspond to 5000 LUTs and 1500 arbitrary units correspond 679 679 1000 arbitrary units correspond to 5000 LUTs and 1500 arbitrary units correspond
to 7300 LUTs. The conclusion is that the orders of magnitude of our arbitrary 680 680 to 7300 LUTs. The conclusion is that the orders of magnitude of our arbitrary
units map well to actual hardware resources. The relatively small differences can probably be explained 681 681 units map well to actual hardware resources. The relatively small differences can probably be explained
by the optimizations done by Vivado based on the detailed map of available processing resources. 682 682 by the optimizations done by Vivado based on the detailed map of available processing resources.
683 683
We now present the computation time needed to solve the quadratic problem. 684 684 We now present the computation time needed to solve the quadratic problem.
For each case, the filter solver software is executed on an Intel(R) Xeon(R) CPU E5606 685 685 For each case, the filter solver software is executed on an Intel(R) Xeon(R) CPU E5606
clocked at 2.13~GHz. The CPU has 8 cores that are used by Gurobi to solve 686 686 clocked at 2.13~GHz. The CPU has 8 cores that are used by Gurobi to solve
the quadratic problem. Table~\ref{tbl:area_time} shows the time needed to solve the quadratic 687 687 the quadratic problem. Table~\ref{tbl:area_time} shows the time needed to solve the quadratic
problem when the maximal area is fixed to 500, 1000 and 1500 arbitrary units. 688 688 problem when the maximal area is fixed to 500, 1000 and 1500 arbitrary units.
689 689
\begin{table}[h!tb] 690 690 \begin{table}[h!tb]
\caption{Time needed to solve the quadratic program with Gurobi} 691 691 \caption{Time needed to solve the quadratic program with Gurobi}
\label{tbl:area_time} 692 692 \label{tbl:area_time}
\centering 693 693 \centering
\begin{tabular}{|c|c|c|c|}\hline 694 694 \begin{tabular}{|c|c|c|c|}\hline
$n$ & Time (MAX/500) & Time (MAX/1000) & Time (MAX/1500) \\\hline\hline 695 695 $n$ & Time (MAX/500) & Time (MAX/1000) & Time (MAX/1500) \\\hline\hline
1 & 0.1~s & 0.1~s & 0.3~s \\ 696 696 1 & 0.1~s & 0.1~s & 0.3~s \\
2 & 1.1~s & 2.2~s & 12~s \\ 697 697 2 & 1.1~s & 2.2~s & 12~s \\
3 & 17~s & 137~s ($\approx$ 2~min) & 275~s ($\approx$ 4~min) \\ 698 698 3 & 17~s & 137~s ($\approx$ 2~min) & 275~s ($\approx$ 4~min) \\
4 & 52~s & 5448~s ($\approx$ 90~min) & 5505~s ($\approx$ 17~h) \\ 699 699 4 & 52~s & 5448~s ($\approx$ 90~min) & 5505~s ($\approx$ 17~h) \\
5 & 286~s ($\approx$ 4~min) & 4119~s ($\approx$ 68~min) & 235479~s ($\approx$ 3~days) \\\hline 700 700 5 & 286~s ($\approx$ 4~min) & 4119~s ($\approx$ 68~min) & 235479~s ($\approx$ 3~days) \\\hline
\end{tabular} 701 701 \end{tabular}
\end{table} 702 702 \end{table}
703 703
As expected, the computation time seems to rise exponentially with the number of stages. % TODO: exponentiel ? 704 704 As expected, the computation time seems to rise exponentially with the number of stages. % TODO: exponentiel ?
When the area is limited, the design exploration space is more limited and the solver is able to 705 705 When the area is limited, the design exploration space is more limited and the solver is able to
find an optimal solution faster. On the contrary, in the case of MAX/1500 with 706 706 find an optimal solution faster.
5~stages, we were not able to obtain a result after 40~hours of computation when the program was 707
manually stopped. 708
709 707
\subsection{Minimizing resource occupation at fixed rejection}\label{sec:fixed_rej} 710 708 \subsection{Minimizing resource occupation at fixed rejection}\label{sec:fixed_rej}
711 709
This section presents the results of the complementary quadratic program aimed at 712 710 This section presents the results of the complementary quadratic program aimed at
minimizing the area occupation for a targeted rejection level. 713 711 minimizing the area occupation for a targeted rejection level.
714 712
The experimental setup is also composed of three cases. The raw input is the same 715 713 The experimental setup is also composed of three cases. The raw input is the same
as in the previous section, from a PRN generator, which fixes the input data size $\Pi^I$. 716 714 as in the previous section, from a PRN generator, which fixes the input data size $\Pi^I$.
Then the targeted rejection $\mathcal{R}$ has been fixed to either 40, 60 or 80~dB. 717 715 Then the targeted rejection $\mathcal{R}$ has been fixed to either 40, 60 or 80~dB.
Hence, the three cases have been named: MIN/40, MIN/60, MIN/80. 718 716 Hence, the three cases have been named: MIN/40, MIN/60, MIN/80.
The number of configurations $p$ is the same as in the previous section. 719 717 The number of configurations $p$ is the same as in the previous section.
720 718
Table~\ref{tbl:gurobi_min_40} shows the results obtained by the filter solver for MIN/40. 721 719 Table~\ref{tbl:gurobi_min_40} shows the results obtained by the filter solver for MIN/40.
Table~\ref{tbl:gurobi_min_60} shows the results obtained by the filter solver for MIN/60. 722 720 Table~\ref{tbl:gurobi_min_60} shows the results obtained by the filter solver for MIN/60.
Table~\ref{tbl:gurobi_min_80} shows the results obtained by the filter solver for MIN/80. 723 721 Table~\ref{tbl:gurobi_min_80} shows the results obtained by the filter solver for MIN/80.
724 722
\renewcommand{\arraystretch}{1.4} 725 723 \renewcommand{\arraystretch}{1.4}
726 724
\begin{table}[h!tb] 727 725 \begin{table}[h!tb]
\caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MIN/40} 728 726 \caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MIN/40}
\label{tbl:gurobi_min_40} 729 727 \label{tbl:gurobi_min_40}
\centering 730 728 \centering
{\scalefont{0.77} 731 729 {\scalefont{0.77}
\begin{tabular}{|c|ccccc|c|c|} 732 730 \begin{tabular}{|c|ccccc|c|c|}
\hline 733 731 \hline
$n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\ 734 732 $n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\
\hline 735 733 \hline
1 & (27, 8, 0) & - & - & - & - & 41~dB & 648 \\ 736 734 1 & (27, 8, 0) & - & - & - & - & 41~dB & 648 \\
2 & (3, 2, 14) & (19, 7, 0) & - & - & - & 40~dB & 263 \\ 737 735 2 & (3, 2, 14) & (19, 7, 0) & - & - & - & 40~dB & 263 \\
3 & (3, 3, 15) & (11, 5, 0) & (3, 3, 0) & - & - & 41~dB & 192 \\ 738 736 3 & (3, 3, 15) & (11, 5, 0) & (3, 3, 0) & - & - & 41~dB & 192 \\
4 & (3, 3, 15) & (3, 3, 0) & (3, 3, 0) & (3, 3, 0) & - & 42~dB & 147 \\ 739 737 4 & (3, 3, 15) & (3, 3, 0) & (3, 3, 0) & (3, 3, 0) & - & 42~dB & 147 \\
\hline 740 738 \hline
\end{tabular} 741 739 \end{tabular}
} 742 740 }
\end{table} 743 741 \end{table}
744 742
\begin{table}[h!tb] 745 743 \begin{table}[h!tb]
\caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MIN/60} 746 744 \caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MIN/60}
\label{tbl:gurobi_min_60} 747 745 \label{tbl:gurobi_min_60}
\centering 748 746 \centering
{\scalefont{0.77} 749 747 {\scalefont{0.77}
\begin{tabular}{|c|ccccc|c|c|} 750 748 \begin{tabular}{|c|ccccc|c|c|}
\hline 751 749 \hline
$n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\ 752 750 $n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\
\hline 753 751 \hline
1 & (39, 13, 0) & - & - & - & - & 60~dB & 1131 \\ 754 752 1 & (39, 13, 0) & - & - & - & - & 60~dB & 1131 \\
2 & (3, 3, 15) & (35, 10, 0) & - & - & - & 60~dB & 547 \\ 755 753 2 & (3, 3, 15) & (35, 10, 0) & - & - & - & 60~dB & 547 \\
3 & (3, 3, 15) & (27, 8, 0) & (3, 3, 0) & - & - & 62~dB & 426 \\ 756 754 3 & (3, 3, 15) & (27, 8, 0) & (3, 3, 0) & - & - & 62~dB & 426 \\
4 & (3, 2, 14) & (11, 5, 1) & (11, 5, 0) & (3, 3, 0) & - & 60~dB & 344 \\ 757 755 4 & (3, 2, 14) & (11, 5, 1) & (11, 5, 0) & (3, 3, 0) & - & 60~dB & 344 \\
5 & (3, 2, 14) & (3, 3, 1) & (3, 3, 0) & (3, 3, 0) & (3, 3, 0) & 60~dB & 279 \\ 758 756 5 & (3, 2, 14) & (3, 3, 1) & (3, 3, 0) & (3, 3, 0) & (3, 3, 0) & 60~dB & 279 \\
\hline 759 757 \hline
\end{tabular} 760 758 \end{tabular}
} 761 759 }
\end{table} 762 760 \end{table}
763 761
\begin{table}[h!tb] 764 762 \begin{table}[h!tb]
\caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MIN/80} 765 763 \caption{Configurations $(C_i, \pi_i^C, \pi_i^S)$, rejections and areas (in arbitrary units) for MIN/80}
\label{tbl:gurobi_min_80} 766 764 \label{tbl:gurobi_min_80}
\centering 767 765 \centering
{\scalefont{0.77} 768 766 {\scalefont{0.77}
\begin{tabular}{|c|ccccc|c|c|} 769 767 \begin{tabular}{|c|ccccc|c|c|}
\hline 770 768 \hline
$n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\ 771 769 $n$ & $i = 1$ & $i = 2$ & $i = 3$ & $i = 4$ & $i = 5$ & Rejection & Area \\
\hline 772 770 \hline
1 & (55, 16, 0) & - & - & - & - & 81~dB & 1760 \\ 773 771 1 & (55, 16, 0) & - & - & - & - & 81~dB & 1760 \\
2 & (3, 3, 15) & (47, 14, 0) & - & - & - & 80~dB & 903 \\ 774 772 2 & (3, 3, 15) & (47, 14, 0) & - & - & - & 80~dB & 903 \\
3 & (3, 3, 15) & (23, 9, 0) & (19, 7, 0) & - & - & 80~dB & 698 \\ 775 773 3 & (3, 3, 15) & (23, 9, 0) & (19, 7, 0) & - & - & 80~dB & 698 \\
4 & (3, 3, 15) & (27, 9, 0) & (7, 7, 4) & (3, 3, 0) & - & 80~dB & 605 \\ 776 774 4 & (3, 3, 15) & (27, 9, 0) & (7, 7, 4) & (3, 3, 0) & - & 80~dB & 605 \\
5 & (3, 2, 14) & (27, 8, 0) & (3, 3, 1) & (3, 3, 0) & (3, 3, 0) & 81~dB & 534 \\ 777 775 5 & (3, 2, 14) & (27, 8, 0) & (3, 3, 1) & (3, 3, 0) & (3, 3, 0) & 81~dB & 534 \\
\hline 778 776 \hline
\end{tabular} 779 777 \end{tabular}
} 780 778 }
\end{table} 781 779 \end{table}
\renewcommand{\arraystretch}{1} 782 780 \renewcommand{\arraystretch}{1}
783 781
% JMF : je croyais que dans un cas le monolithique n'y arrivait juste pas : tu as retire' ce cas ? 784 782 % JMF : je croyais que dans un cas le monolithique n'y arrivait juste pas : tu as retire' ce cas ?
From these tables, we can first state that all configurations reach the targeted rejection 785 783 From these tables, we can first state that all configurations reach the targeted rejection
level or even better thanks to our underestimate of the cascade rejection as the sum of the 786 784 level or even better thanks to our underestimate of the cascade rejection as the sum of the
individual filter rejection. 787 785 individual filter rejection.
% we have stages lesser is the area occupied in arbitrary unit. JMF : je ne comprends pas cette phrase 788 786 % we have stages lesser is the area occupied in arbitrary unit. JMF : je ne comprends pas cette phrase
Furthermore, the area of the monolithic filter is twice as big as the two cascaded filters 789 787 Furthermore, the area of the monolithic filter is twice as big as the two cascaded filters
(1131 and 1760 arbitrary units vs.\ 547 and 903 arbitrary units for 60 and 80~dB rejection 790 788 (1131 and 1760 arbitrary units vs.\ 547 and 903 arbitrary units for 60 and 80~dB rejection
respectively). More generally, the more filters are cascaded, the lower the occupied area. 791 789 respectively). More generally, the more filters are cascaded, the lower the occupied area.
792 790
As in the previous section, the solver always chooses a small filter as the first 793 791 As in the previous section, the solver always chooses a small filter as the first
filter stage and the second one is often the biggest filter. This choice can be explained 794 792 filter stage and the second one is often the biggest filter. This choice can be explained
as in the previous section, with the solver using just enough bits not to degrade the input 795 793 as in the previous section, with the solver using just enough bits not to degrade the input
signal and in the second filter selecting a better filter to improve rejection without 796 794 signal and in the second filter selecting a better filter to improve rejection without
having too many bits in the output data. 797 795 having too many bits in the output data.
798 796
For the specific case of MIN/40 for $n = 5$ the solver has determined that the optimal 799 797 For the specific case of MIN/40 for $n = 5$ the solver has determined that the optimal
number of filters is 4 so it did not choose any configuration for the last filter. Hence this 800 798 number of filters is 4 so it did not choose any configuration for the last filter. Hence this
solution is equivalent to the result for $n = 4$. 801 799 solution is equivalent to the result for $n = 4$.
802 800
The following graphs present the rejection for real data on the FPGA. In all the following 803 801 The following graphs present the rejection for real data on the FPGA. In all the following
figures, the solid line represents the actual rejection of the filtered 804 802 figures, the solid line represents the actual rejection of the filtered
data on the FPGA as measured experimentally and the dashed line is the noise level 805 803 data on the FPGA as measured experimentally and the dashed line is the noise level
given by the quadratic solver. 806 804 given by the quadratic solver.
807 805
Figure~\ref{fig:min_40} shows the rejection of the different configurations in the case of MIN/40. 808 806 Figure~\ref{fig:min_40} shows the rejection of the different configurations in the case of MIN/40.
Figure~\ref{fig:min_60} shows the rejection of the different configurations in the case of MIN/60. 809 807 Figure~\ref{fig:min_60} shows the rejection of the different configurations in the case of MIN/60.
Figure~\ref{fig:min_80} shows the rejection of the different configurations in the case of MIN/80. 810 808 Figure~\ref{fig:min_80} shows the rejection of the different configurations in the case of MIN/80.
811 809
\begin{figure} 812 810 \begin{figure}
\centering 813 811 \centering
\includegraphics[width=\linewidth]{images/min_40} 814 812 \includegraphics[width=\linewidth]{images/min_40}
\caption{Signal spectrum for MIN/40} 815 813 \caption{Signal spectrum for MIN/40}
\label{fig:min_40} 816 814 \label{fig:min_40}
\end{figure} 817 815 \end{figure}
818 816
\begin{figure} 819 817 \begin{figure}
\centering 820 818 \centering
\includegraphics[width=\linewidth]{images/min_60} 821 819 \includegraphics[width=\linewidth]{images/min_60}
\caption{Signal spectrum for MIN/60} 822 820 \caption{Signal spectrum for MIN/60}
\label{fig:min_60} 823 821 \label{fig:min_60}
\end{figure} 824 822 \end{figure}
825 823
\begin{figure} 826 824 \begin{figure}
\centering 827 825 \centering
\includegraphics[width=\linewidth]{images/min_80} 828 826 \includegraphics[width=\linewidth]{images/min_80}
\caption{Signal spectrum for MIN/80} 829 827 \caption{Signal spectrum for MIN/80}
\label{fig:min_80} 830 828 \label{fig:min_80}
\end{figure} 831 829 \end{figure}
832 830