| rfc9628xml2.original.xml | rfc9628.xml | |||
|---|---|---|---|---|
| <?xml version="1.0" encoding="US-ASCII"?> | <?xml version="1.0" encoding="UTF-8"?> | |||
| <!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ | ||||
| <!ENTITY rfc2119 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.2119.xml"> | ||||
| <!ENTITY rfc3264 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.3264.xml"> | ||||
| <!ENTITY rfc3550 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.3550.xml"> | ||||
| <!ENTITY rfc3551 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.3551.xml"> | ||||
| <!ENTITY rfc3711 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.3711.xml"> | ||||
| <!ENTITY rfc3984 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.3984.xml"> | ||||
| <!ENTITY rfc4855 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.4855.xml"> | ||||
| <!ENTITY rfc4585 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.4585.xml"> | ||||
| <!ENTITY rfc5104 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.5104.xml"> | ||||
| <!ENTITY rfc5124 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.5124.xml"> | ||||
| <!ENTITY rfc6386 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.6386.xml"> | ||||
| <!ENTITY rfc6838 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.6838.xml"> | ||||
| <!ENTITY rfc7201 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.7201.xml"> | ||||
| <!ENTITY rfc7202 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.7202.xml"> | ||||
| <!ENTITY rfc7667 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.7667.xml"> | ||||
| <!ENTITY rfc8174 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.8174.xml"> | ||||
| <!ENTITY rfc8866 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
| ce.RFC.8866.xml"> | ||||
| <!ENTITY lrr SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml3/reference. | ||||
| I-D.ietf-avtext-lrr.xml"> | ||||
| <!DOCTYPE rfc [ | ||||
| <!ENTITY nbsp " "> | ||||
| <!ENTITY zwsp "​"> | ||||
| <!ENTITY nbhy "‑"> | ||||
| <!ENTITY wj "⁠"> | ||||
| ]> | ]> | |||
| <rfc category="std" docName="draft-ietf-payload-vp9-16" ipr="trust200902"> | ||||
| <?rfc symrefs="yes" ?> | ||||
| <?rfc sortrefs="yes" ?> | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" docName="draft-ietf-payload-vp9- | |||
| 16" number="9628" ipr="trust200902" obsoletes="" updates="" submissionType="IETF | ||||
| <!-- alphabetize the references --> | " category="std" consensus="true" xml:lang="en" symRefs="true" sortRefs="true" t | |||
| ocInclude="true" version="3"> | ||||
| <?rfc comments="no"?> | ||||
| <!-- show comments --> | ||||
| <?rfc inline="yes" ?> | ||||
| <!-- comments are inline --> | ||||
| <?rfc toc="yes" ?> | ||||
| <!-- generate table of contents --> | ||||
| <front> | <front> | |||
| <title abbrev="RTP Payload Format for VP9">RTP Payload Format for VP9 | <title abbrev="RTP Payload Format for VP9">RTP Payload Format for VP9 | |||
| Video</title> | Video</title> | |||
| <seriesInfo name="RFC" value="9628"/> | ||||
| <author fullname="Justin Uberti" initials="J." surname="Uberti"> | <author fullname="Justin Uberti" initials="J." surname="Uberti"> | |||
| <organization abbrev="Google">Google, Inc.</organization> | <organization abbrev="Google">Google, Inc.</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>747 6th Street South</street> | <street>747 6th Street South</street> | |||
| <city>Kirkland</city> | <city>Kirkland</city> | |||
| <region>WA</region> | <region>WA</region> | |||
| <code>98033</code> | <code>98033</code> | |||
| <country>United States of America</country> | ||||
| <country>USA</country> | ||||
| </postal> | </postal> | |||
| <email>justin@uberti.name</email> | <email>justin@uberti.name</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Stefan Holmer" initials="S." surname="Holmer"> | <author fullname="Stefan Holmer" initials="S." surname="Holmer"> | |||
| <organization abbrev="Google">Google, Inc.</organization> | <organization abbrev="Google">Google, Inc.</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>Kungsbron 2</street> | <street>Kungsbron 2</street> | |||
| <code>111 22</code> | <code>111 22</code> | |||
| <city>Stockholm</city> | <city>Stockholm</city> | |||
| <country>Sweden</country> | <country>Sweden</country> | |||
| </postal> | </postal> | |||
| <email>holmer@google.com</email> | <email>holmer@google.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Magnus Flodman" initials="M." surname="Flodman"> | <author fullname="Magnus Flodman" initials="M." surname="Flodman"> | |||
| <organization abbrev="Google">Google, Inc.</organization> | <organization abbrev="Google">Google, Inc.</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>Kungsbron 2</street> | <street>Kungsbron 2</street> | |||
| <code>111 22</code> | <code>111 22</code> | |||
| <city>Stockholm</city> | <city>Stockholm</city> | |||
| <country>Sweden</country> | <country>Sweden</country> | |||
| </postal> | </postal> | |||
| <email>mflodman@google.com</email> | <email>mflodman@google.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Danny Hong" initials="D." surname="Hong"> | ||||
| <author fullname="Danny Hong" initials="D." surname="Hong"> | ||||
| <organization abbrev="Google">Google, Inc.</organization> | <organization abbrev="Google">Google, Inc.</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street>1585 Charleston Road</street> | <street>1585 Charleston Road</street> | |||
| <city>Mountain View</city> | <city>Mountain View</city> | |||
| <region>CA</region> | <region>CA</region> | |||
| <code>94043</code> | <code>94043</code> | |||
| <country>United States of America</country> | ||||
| <country>US</country> | ||||
| </postal> | </postal> | |||
| <email>dannyhong@google.com</email> | <email>dannyhong@google.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <author fullname="Jonathan Lennox" initials="J." surname="Lennox"> | <author fullname="Jonathan Lennox" initials="J." surname="Lennox"> | |||
| <organization abbrev="8x8 / Jitsi">8x8, Inc. / Jitsi</organization> | <organization abbrev="8x8 / Jitsi">8x8, Inc. / Jitsi</organization> | |||
| <address> | <address> | |||
| <postal> | <postal> | |||
| <street/> | <street/> | |||
| <city>Jersey City</city> | <city>Jersey City</city> | |||
| <region>NJ</region> | <region>NJ</region> | |||
| <code>07302</code> | <code>07302</code> | |||
| <country>United States of America</country> | ||||
| <country>US</country> | ||||
| </postal> | </postal> | |||
| <email>jonathan.lennox@8x8.com</email> | <email>jonathan.lennox@8x8.com</email> | |||
| </address> | </address> | |||
| </author> | </author> | |||
| <date year="2024" month="October" /> | ||||
| <date/> | ||||
| <area>RAI</area> | <area>RAI</area> | |||
| <workgroup>AVTCore Working Group</workgroup> | <workgroup>AVTCore Working Group</workgroup> | |||
| <keyword>RFC</keyword> | ||||
| <keyword>Request for Comments</keyword> | ||||
| <keyword>RTP</keyword> | <keyword>RTP</keyword> | |||
| <keyword>VP9</keyword> | <keyword>VP9</keyword> | |||
| <keyword>WebM</keyword> | <keyword>WebM</keyword> | |||
| <abstract> | <abstract> | |||
| <t>This specification describes an RTP payload format for the VP9 video codec. | <t>This specification describes an RTP payload format for the VP9 video codec. | |||
| The payload format has wide applicability, as it supports applications | The payload format has wide applicability as it supports applications | |||
| from low bit-rate peer-to-peer usage, to high bit-rate video | from low bitrate peer-to-peer usage to high bitrate video | |||
| conferences. It includes provisions for temporal and spatial scalability. </t> | conferences. It includes provisions for temporal and spatial scalability. </t> | |||
| </abstract> | </abstract> | |||
| </front> | </front> | |||
| <middle> | <middle> | |||
| <section anchor="intro" title="Introduction"> | <section anchor="intro" numbered="true" toc="default"> | |||
| <t>This specification describes an <xref target="RFC3550">RTP</xref> paylo | <name>Introduction</name> | |||
| ad specification applicable to the | ||||
| transmission of video streams encoded using the VP9 video codec <xref | ||||
| target="VP9-BITSTREAM"/>. The format described in this document can be use | ||||
| d | ||||
| both in peer-to-peer and video conferencing applications.</t> | ||||
| <t>The VP9 video codec was developed by Google, and is the | <t>This document describes an <xref target="RFC3550" | |||
| successor to its earlier <xref target="RFC6386">VP8</xref> | format="default">RTP</xref> payload specification applicable to the | |||
| codec. Above the compression improvements and other general | transmission of video streams encoded using the VP9 video codec <xref | |||
| enhancements above VP8, VP9 is also designed in a way that | target="VP9-BITSTREAM" format="default"/>. The format described in this | |||
| allows spatially-scalable video encoding.</t> | document can be used both in peer-to-peer and video conferencing | |||
| applications.</t> | ||||
| <t>The VP9 video codec was developed by Google and is the successor to | ||||
| its earlier <xref target="RFC6386" format="default">VP8</xref> codec. | ||||
| Above the compression improvements and other general enhancements to | ||||
| VP8, VP9 is also designed in a way that allows spatially scalable video | ||||
| encoding.</t> | ||||
| </section> | </section> | |||
| <section anchor="conventions" | <section anchor="conventions" numbered="true" toc="default"> | |||
| title="Conventions, Definitions and Acronyms"> | <name>Conventions</name> | |||
| <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", | <t> | |||
| "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT | The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", | |||
| RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be | "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL | |||
| interpreted as described in BCP 14 <xref target="RFC2119"/> | NOT</bcp14>", "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", | |||
| <xref target="RFC8174"/> when, and only when, | "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>", | |||
| they appear in all capitals, as shown here.</t> | "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are | |||
| </section> | to be interpreted as described in BCP 14 <xref target="RFC2119"/> | |||
| <xref target="RFC8174"/> when, and only when, they appear in all capitals, | ||||
| as shown here. | ||||
| </t> | ||||
| <section anchor="mediaFormatDescription" title="Media Format Description"> | </section> | |||
| <section anchor="mediaFormatDescription" numbered="true" toc="default"> | ||||
| <name>Media Format Description</name> | ||||
| <t>The VP9 codec can maintain up to eight reference frames, of | <t>The VP9 codec can maintain up to eight reference frames, of | |||
| which up to three can be referenced by any new frame.</t> | which up to three can be referenced by any new frame.</t> | |||
| <t>VP9 also allows a frame to use another frame of a different | <t>VP9 also allows a frame to use another frame of a different | |||
| resolution as a reference frame. (Specifically, a frame may use | resolution as a reference frame. (Specifically, a frame may use | |||
| any references whose width and height are between 1/16th that of | any references whose width and height are between 1/16th that of | |||
| the current frame and twice that of the current frame, | the current frame and twice that of the current frame, | |||
| inclusive.) This allows internal resolution changes without | inclusive.) This allows internal resolution changes without | |||
| requiring the use of key frames.</t> | requiring the use of keyframes.</t> | |||
| <t>These features together enable an encoder to | <t>These features together enable an encoder to | |||
| implement various forms of coarse-grained scalability, | implement various forms of coarse-grained scalability, | |||
| including temporal, spatial and quality scalability modes, as | including temporal, spatial, and quality scalability modes, as | |||
| well as combinations of these, without the need for explicit | well as combinations of these, without the need for explicit | |||
| scalable coding tools.</t> | scalable coding tools.</t> | |||
| <t>Temporal layers define different frame rates of video; | <t>Temporal layers define different frame rates of video; | |||
| spatial and quality layers define different and possibly dependent | spatial and quality layers define different and possibly dependent | |||
| representations of a single input frame. Spatial layers allow | representations of a single input frame. Spatial layers allow | |||
| a frame to be encoded at different resolutions, whereas | a frame to be encoded at different resolutions, whereas | |||
| quality layers allow a frame to be encoded at the same | quality layers allow a frame to be encoded at the same | |||
| resolution but at different qualities (and thus with different | resolution but at different qualities (and, thus, with different | |||
| amounts of coding error). VP9 supports quality layers as | amounts of coding error). VP9 supports quality layers as | |||
| spatial layers without any resolution changes; hereinafter, | spatial layers without any resolution changes; hereinafter, | |||
| the term "spatial layer" is used to represent both spatial and | the term "spatial layer" is used to represent both spatial and | |||
| quality layers.</t> | quality layers.</t> | |||
| <t>This payload format specification defines how such | <t>This payload format specification defines how such | |||
| temporal and spatial scalability layers can be described and | temporal and spatial scalability layers can be described and | |||
| communicated.</t> | communicated.</t> | |||
| <t>Temporal and spatial scalability layers are associated with | ||||
| <t>Temporal and spatial scalability layers are associated with | ||||
| non-negative integer IDs. The lowest layer of either type has an | non-negative integer IDs. The lowest layer of either type has an | |||
| ID of 0, and is sometimes referred to as the "base" temporal or | ID of 0 and is sometimes referred to as the "base" temporal or | |||
| spatial layer.</t> | spatial layer.</t> | |||
| <t>Layers are designed, and <bcp14>MUST</bcp14> be encoded, such that if | ||||
| <t>Layers are designed, and MUST be encoded, such that if | ||||
| any layer, and all higher layers, are removed from the bitstream | any layer, and all higher layers, are removed from the bitstream | |||
| along either the spatial or temporal dimension, the remaining bitstream is | along either the spatial or temporal dimension, the remaining bitstream is | |||
| still correctly decodable.</t> | still correctly decodable.</t> | |||
| <t>For terminology, this document uses the term "frame" to refer | <t>For terminology, this document uses the term "frame" to refer to a | |||
| to a single encoded VP9 frame for a particular resolution/quality, and | single encoded VP9 frame for a particular resolution and/or quality, and | |||
| "picture" to refer to all the representations (frames) at a single | "picture" to refer to all the representations (frames) at a single | |||
| instant in time. A picture thus consists of one or more frames, | instant in time. Thus, a picture consists of one or more frames, | |||
| encoding different spatial layers.</t> | encoding different spatial layers.</t> | |||
| <t>Within a picture, a frame with spatial layer ID equal to SID, | <t>Within a picture, a frame with | |||
| where SID > 0, can depend on a frame of the same picture with a lower spat | spatial-layer ID equal to S, where S > 0, can depend on a frame | |||
| ial layer ID. This | of the same picture with a lower spatial-layer ID. This "inter-layer" | |||
| "inter-layer" dependency can result in additional coding gain | dependency can result in additional coding gain compared to the case | |||
| compared to the case where only | where only "inter-picture" dependency is used, where a frame | |||
| traditional "inter-picture" dependency is used, where a frame depends on p | depends on a previously coded frame in time. For simplicity, this | |||
| reviously | payload format assumes that, within a picture and if inter-layer | |||
| coded frame in time. For simplicity, this payload format assumes that, | dependency is used, a spatial-layer S frame can depend only on the | |||
| within a picture and if inter-layer dependency is used, a spatial layer SI | immediately previous spatial-layer S-1 frame, when S > 0. | |||
| D frame | Additionally, if inter-picture dependency is used, a spatial-layer S | |||
| can depend only on the immediately previous spatial layer SID-1 frame, whe | frame is assumed to only depend on a previously coded spatial-layer S | |||
| n S > 0. Additionally, if | frame.</t> | |||
| inter-picture dependency is used, a spatial layer SID frame is assumed to | ||||
| only | ||||
| depend on a previously coded spatial layer SID frame.</t> | ||||
| <t>Given above simplifications for inter-layer and inter-picture | ||||
| dependencies, a flag (the D bit described below) is used to indicate wheth | ||||
| er a | ||||
| spatial layer SID frame depends on the spatial layer SID-1 frame. Given t | ||||
| he D bit, a receiver | ||||
| only needs to additionally know the inter-picture dependency structure for | ||||
| a given | ||||
| spatial layer frame in order to determine its decodability. Two modes | ||||
| of describing the inter-picture dependency structure are possible: | ||||
| "flexible mode" and "non-flexible mode". An encoder can only switch | ||||
| between the two on the first packet of a key frame with temporal | ||||
| layer ID equal to 0.</t> | ||||
| <t>In flexible mode, each packet can contain up to 3 reference | ||||
| indices, which identify all frames referenced by the frame | ||||
| transmitted in the current packet for inter-picture prediction. | ||||
| This (along with the D bit) enables a receiver to identify if a frame | ||||
| is decodable or not and helps it understand the temporal layer | ||||
| structure. | ||||
| Since this is signaled in | ||||
| each packet it makes it possible to have very flexible temporal layer | ||||
| hierarchies, and scalability structures which are changing dynamically.</t | ||||
| > | ||||
| <t>Given the above simplifications for inter-layer and inter-picture | ||||
| dependencies, a flag (the D bit described below) is used to indicate | ||||
| whether a spatial-layer SID frame depends on the spatial-layer SID-1 | ||||
| frame. Given the D bit, a receiver only needs to additionally know the | ||||
| inter-picture dependency structure for a given spatial-layer frame in | ||||
| order to determine its decodability. Two modes of describing the | ||||
| inter-picture dependency structure are possible: "flexible mode" and | ||||
| "non-flexible mode". An encoder can only switch between the two on the | ||||
| first packet of a keyframe with a temporal-layer ID equal to 0.</t> | ||||
| <t>In flexible mode, each packet can contain up to three reference indices | ||||
| , | ||||
| which identify all frames referenced by the frame transmitted in the | ||||
| current packet for inter-picture prediction. This (along with the D | ||||
| bit) enables a receiver to identify if a frame is decodable or not and | ||||
| helps it understand the temporal-layer structure. Since this is | ||||
| signaled in each packet, it makes it possible to have very flexible | ||||
| temporal-layer hierarchies and scalability structures, which are | ||||
| changing dynamically.</t> | ||||
| <t>In non-flexible mode, frames are encoded using a fixed, recurring pattern of dependencies; | <t>In non-flexible mode, frames are encoded using a fixed, recurring pattern of dependencies; | |||
| the set of pictures that recur in this pattern is known as a Picture Group (PG). | the set of pictures that recur in this pattern is known as a "Picture Group" (or "PG"). | |||
| In this mode, the inter-picture dependencies (the reference | In this mode, the inter-picture dependencies (the reference | |||
| indices) of the Picture Group MUST be pre-specified as part of the | indices) of the PG <bcp14>MUST</bcp14> be pre-specified as part of the | |||
| scalability structure (SS) data. | Scalability Structure (SS) data. | |||
| Each | Each | |||
| packet has an index to refer to one of the described pictures | packet has an index to refer to one of the described pictures | |||
| in the PG, from which the pictures referenced by the picture transmitted in the current packet | in the PG from which the pictures referenced by the picture transmitted in the current packet | |||
| for inter-picture prediction can be identified.</t> | for inter-picture prediction can be identified.</t> | |||
| <t>(Note: A "Picture Group", as used in this document, | <aside> | |||
| <t>Note: A "Picture Group" or "PG", as used in this document, | ||||
| is not the same thing as the term "Group of Pictures" as | is not the same thing as the term "Group of Pictures" as | |||
| it is traditionally used in video coding, i.e. to mean an | it is commonly used in video coding, i.e., to mean an | |||
| independently-decoadable run of pictures beginning with a | independently decodable run of pictures beginning with a | |||
| keyframe.)</t> | keyframe.</t> | |||
| <t>The SS data can also be used to specify the resolution of each | <t>The SS data can also be used to specify the resolution of each | |||
| spatial layer present in the VP9 stream for both flexible and non-flexible | spatial layer present in the VP9 stream for both flexible and non-flexible | |||
| modes.</t> | modes.</t></aside> | |||
| </section> | </section> | |||
| <section anchor="payloadFormat" numbered="true" toc="default"> | ||||
| <name>Payload Format</name> | ||||
| <section anchor="payloadFormat" title="Payload Format"> | ||||
| <t>This section describes how the encoded VP9 bitstream is encapsulated | <t>This section describes how the encoded VP9 bitstream is encapsulated | |||
| in RTP. To handle network losses usage of RTP/AVPF <xref | in RTP. To handle network losses, usage of RTP/AVPF <xref target="RFC4585" | |||
| target="RFC4585"/> is RECOMMENDED. All integer fields in the | format="default"/> is <bcp14>RECOMMENDED</bcp14>. All integer fields in this | |||
| specifications are encoded as unsigned integers in network octet | specification are encoded as unsigned integers in network octet | |||
| order.</t> | order.</t> | |||
| <section anchor="RTPHeaderUsage" numbered="true" toc="default"> | ||||
| <name>RTP Header Usage</name> | ||||
| <t keepWithNext="true">The general RTP payload format for VP9 is depicte | ||||
| d | ||||
| below.</t> | ||||
| <section anchor="RTPHeaderUsage" title="RTP Header Usage"> | <figure anchor="figureRTPHeader" title="General RTP Payload Format for | |||
| <figure anchor="figureRTPHeader"> | VP9"> | |||
| <preamble>The general RTP payload format for VP9 is depicted | <artwork type="" align="left" alt=""><![CDATA[ | |||
| below.</preamble> | ||||
| <artwork><![CDATA[ | ||||
| 0 1 2 3 | 0 1 2 3 | |||
| 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| |V=2|P|X| CC |M| PT | sequence number | | |V=2|P|X| CC |M| PT | sequence number | | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| | timestamp | | | timestamp | | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| | synchronization source (SSRC) identifier | | | synchronization source (SSRC) identifier | | |||
| +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+ | +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+ | |||
| | contributing source (CSRC) identifiers | | | contributing source (CSRC) identifiers | | |||
| skipping to change at line 322 ¶ | skipping to change at line 253 ¶ | |||
| | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| | : | | | : | | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | |||
| | | | | | | |||
| + | | + | | |||
| : VP9 payload : | : VP9 payload : | |||
| | | | | | | |||
| | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| | : OPTIONAL RTP padding | | | : OPTIONAL RTP padding | | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| ]]></artwork> | ]]></artwork> | |||
| <postamble>The VP9 payload descriptor will be | ||||
| described in <xref target="VP9payloadDescriptor"/>; the VP9 payload is | ||||
| described | ||||
| in <xref target="VP9-BITSTREAM"/>. | ||||
| OPTIONAL RTP padding MUST NOT be included unless the P bi | ||||
| t is set.</postamble> | ||||
| </figure> | </figure> | |||
| <t keepWithPrevious="true">See <xref target="VP9payloadDescriptor" forma | ||||
| t="default"/> for more information on the VP9 payload descriptor; | ||||
| the VP9 payload is described in <xref target="VP9-BITSTREAM" | ||||
| format="default"/>. <bcp14>OPTIONAL</bcp14> RTP padding <bcp14>MUST | ||||
| NOT</bcp14> be included unless the P bit is set.</t> | ||||
| <t><list style="hanging"> | <dl newline="false" spacing="normal"> | |||
| <t hangText="Marker bit (M):">MUST be set to 1 for the final packet | <dt>Marker bit (M):</dt> | |||
| of the highest spatial layer frame (the final packet of the picture) | <dd>This bit <bcp14>MUST</bcp14> be set to 1 for the final packet | |||
| , | of the highest spatial-layer frame (the final packet of the picture) | |||
| and 0 otherwise. Unless spatial scalability is in use for this pict | ; otherwise, it is 0. Unless spatial scalability is in use for this picture, | |||
| ure, | this bit will have the same value as the E bit described in <xref ta | |||
| this will have the same value as the E bit described below. Note th | rget="VP9payloadDescriptor"/>. Note this bit | |||
| is bit | <bcp14>MUST</bcp14> be set to 1 for the target spatial-layer frame | |||
| MUST be set to 1 for the target spatial layer frame | if a stream is being rewritten to remove higher spatial layers.</dd> | |||
| if a stream is being rewritten to remove higher spatial layers.</t> | <dt>Payload Type (PT):</dt> | |||
| <dd>In line with the policy in <xref target="RFC3551" | ||||
| <t hangText="Payload Type (PT):">In line with the policy | sectionFormat="of" section="3" format="default"/>, applications using | |||
| in Section 3 of <xref target='RFC3551'/>, applications | the VP9 RTP payload profile <bcp14>MUST</bcp14> assign a dynamic | |||
| using the VP9 RTP payload | payload type number to be used in each RTP session and provide a | |||
| profile MUST assign a dynamic payload type number to be | mechanism to indicate the mapping. See <xref target="SDPParameters" | |||
| used in each RTP session and provide a mechanism to | format="default"/> for the mechanism to be used with the <xref | |||
| indicate the mapping. See <xref target="SDPParameters" | target="RFC8866" format="default">Session Description Protocol | |||
| /> for the mechanism | (SDP)</xref>.</dd> | |||
| to be used with the <xref target='RFC8866'>Session | <dt>Timestamp:</dt> | |||
| Description Protocol (SDP)</xref>.</t> | <dd>The <xref target="RFC3550" format="default">RTP timestamp</xref> i | |||
| ndicates the time when | ||||
| <t hangText="Timestamp:">The <xref target="RFC3550">RTP timestamp</x | ||||
| ref> indicates the time when | ||||
| the input frame was sampled, at a clock rate of 90 kHz. If the | the input frame was sampled, at a clock rate of 90 kHz. If the | |||
| input picture is encoded with multiple layer frames, all of the | input picture is encoded with multiple-layer frames, all of the | |||
| frames of the picture MUST have the same timestamp.</t> | frames of the picture <bcp14>MUST</bcp14> have the same timestamp.</ | |||
| dd> | ||||
| <t>If a frame has the VP9 show_frame field set to 0 (i.e. | <dt/> | |||
| , it is meant only to | <dd>If a frame has the VP9 show_frame field set to 0 (i.e., it is | |||
| populate a reference buffer, without being output) its | meant only to populate a reference buffer without being output), its | |||
| timestamp MAY alternatively be set | timestamp <bcp14>MAY</bcp14> alternatively be set to be the same as | |||
| to be the same as the subsequent frame with show_frame | the subsequent frame with show_frame equal to 1. (This will be | |||
| equal to 1. (This will | convenient for playing out pre-encoded content packaged with VP9 | |||
| be convenient for playing out pre-encoded content packa | "superframes", which typically bundle show_frame==0 frames with a | |||
| ged with VP9 "superframes", which | subsequent show_frame==1 frame.) Every frame with show_frame==1, | |||
| typically bundle show_frame==0 frames with a subsequent | however, <bcp14>MUST</bcp14> have a unique timestamp modulo the 2<sup> | |||
| show_frame==1 frame.) Every | 32</sup> | |||
| frame with show_frame==1, however, MUST have a unique t | wrap of the field.</dd> | |||
| imestamp modulo the 2^32 wrap of | </dl> | |||
| the field.</t> | <t>The remaining RTP Fixed Header Fields (V, P, X, CC, sequence | |||
| number, SSRC, and CSRC identifiers) are used as specified in <xref | ||||
| </list></t> | target="RFC3550" sectionFormat="of" section="5.1" | |||
| <t>The remaining RTP Fixed Header Fields (V, P, X, CC, | format="default"/>.</t> | |||
| sequence number, SSRC and CSRC identifiers) are used as | ||||
| specified in Section 5.1 of <xref | ||||
| target="RFC3550"/>.</t> | ||||
| </section> | </section> | |||
| <section anchor="VP9payloadDescriptor" numbered="true" toc="default"> | ||||
| <name>VP9 Payload Descriptor</name> | ||||
| <section anchor="VP9payloadDescriptor" title="VP9 Payload Descriptor"> | <!--[rfced] Section 4.2: It seems the descriptions following Figure 3 | |||
| <figure anchor="figureVP9payloadDescriptor"> | apply to both Figures 2 and 3. If that is so, might a note of this appear | |||
| <preamble>In flexible mode (with the F bit below set to 1), the first | somewhere earlier in that section for the ease of the reader?--> | |||
| octets | ||||
| after the RTP header are the VP9 payload descriptor, with the followin | ||||
| g | ||||
| structure.</preamble> | ||||
| <artwork><![CDATA[ | <t keepWithNext="true">In flexible mode (with the F bit below set to 1), | |||
| the first octets | ||||
| after the RTP header are the VP9 payload descriptor, with the followin | ||||
| g | ||||
| structure.</t> | ||||
| <figure anchor="figureVP9payloadDescriptor" title="Flexible Mode Format | ||||
| for VP9 Payload Descriptor"> | ||||
| <artwork name="" type="" align="left" alt=""><![CDATA[ | ||||
| 0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| |I|P|L|F|B|E|V|Z| (REQUIRED) | |I|P|L|F|B|E|V|Z| (REQUIRED) | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| I: |M| PICTURE ID | (REQUIRED) | I: |M| PICTURE ID | (REQUIRED) | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| M: | EXTENDED PID | (RECOMMENDED) | M: | EXTENDED PID | (RECOMMENDED) | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| L: | TID |U| SID |D| (Conditionally RECOMMENDED) | L: | TID |U| SID |D| (Conditionally RECOMMENDED) | |||
| +-+-+-+-+-+-+-+-+ -\ | +-+-+-+-+-+-+-+-+ -\ | |||
| P,F: | P_DIFF |N| (Conditionally REQUIRED) - up to 3 times | P,F: | P_DIFF |N| (Conditionally REQUIRED) - up to 3 times | |||
| +-+-+-+-+-+-+-+-+ -/ | +-+-+-+-+-+-+-+-+ -/ | |||
| V: | SS | | V: | SS | | |||
| | .. | | | .. | | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| ]]></artwork> | ||||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| <t keepWithNext="true">In non-flexible mode (with the F bit below set to | ||||
| <figure anchor="figureVP9payloadDescriptorNonFlexible"> | 0), the first octets | |||
| <preamble>In non-flexible mode (with the F bit below set to 0), the fi | ||||
| rst octets | ||||
| after the RTP header are the VP9 payload descriptor, with the following | after the RTP header are the VP9 payload descriptor, with the following | |||
| structure.</preamble> | structure.</t> | |||
| <figure anchor="figureVP9payloadDescriptorNonFlexible" title="Non-flexib | ||||
| <artwork><![CDATA[ | le Mode Format for VP9 Payload Descriptor"> | |||
| <artwork name="" type="" align="left" alt=""><![CDATA[ | ||||
| 0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| |I|P|L|F|B|E|V|Z| (REQUIRED) | |I|P|L|F|B|E|V|Z| (REQUIRED) | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| I: |M| PICTURE ID | (RECOMMENDED) | I: |M| PICTURE ID | (RECOMMENDED) | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| M: | EXTENDED PID | (RECOMMENDED) | M: | EXTENDED PID | (RECOMMENDED) | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| L: | TID |U| SID |D| (Conditionally RECOMMENDED) | L: | TID |U| SID |D| (Conditionally RECOMMENDED) | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| | TL0PICIDX | (Conditionally REQUIRED) | | TL0PICIDX | (Conditionally REQUIRED) | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| V: | SS | | V: | SS | | |||
| | .. | | | .. | | |||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| ]]></artwork> | ||||
| ]]></artwork> | ||||
| </figure> | </figure> | |||
| <dl newline="false" spacing="normal"> | ||||
| <dt>I:</dt> | ||||
| <dd>Picture ID (PID) present. When set to 1, the | ||||
| <bcp14>OPTIONAL</bcp14> PID <bcp14>MUST</bcp14> be present after the | ||||
| mandatory first octet and specified as below. Otherwise, PID | ||||
| <bcp14>MUST NOT</bcp14> be present. If the V bit was set in the | ||||
| stream's most recent start of a keyframe (i.e., the SS field was | ||||
| present) and the F bit is set to 0 (i.e., non-flexible scalability | ||||
| mode is in use), then this bit <bcp14>MUST</bcp14> be set on every | ||||
| packet.</dd> | ||||
| <dt>P:</dt> | ||||
| <dd>Inter-picture predicted frame. When set to 0, the frame does | ||||
| not utilize inter-picture prediction. In this case, up-switching to | ||||
| a current spatial layer's frame is possible from a directly lower | ||||
| spatial-layer frame. P <bcp14>SHOULD</bcp14> also be set to 0 when | ||||
| encoding a layer synchronization frame in response to a <xref target=" | ||||
| RFC9627" format="default">Layer Refresh Request (LRR)</xref> | ||||
| message (see <xref target="LRR" format="default"/>). When P is set | ||||
| to 0, the TID field (described below) <bcp14>MUST</bcp14> also be | ||||
| set to 0 (if present). Note that the P bit does not forbid | ||||
| intra-picture, inter-layer prediction from earlier frames of the | ||||
| same picture, if any.</dd> | ||||
| <dt>L:</dt> | ||||
| <t><list style="hanging"> | <dd>Layer indices present. When set to 1, the one or two octets | |||
| <t hangText="I:">Picture ID (PID) present. When set to one, the | following the mandatory first octet and the PID (if present) are as | |||
| OPTIONAL PID MUST be present after the mandatory first octet and | described by "Layer indices" below. If the F bit (described below) | |||
| specified as below. Otherwise, PID MUST NOT be present. If the V bit | is set to 1 (indicating flexible mode), then only one octet is | |||
| was set in the | present for the layer indices. Otherwise, if the F bit is set to 0 | |||
| stream's most recent start of a keyframe (i.e. the SS field was presen | (indicating non-flexible mode), then two octets are present for the | |||
| t) and the F bit | layer indices.</dd> | |||
| is set to 0 (i.e. non-flexible scalability mode is in use), | <dt>F:</dt> | |||
| then this bit MUST be set on every packet.</t> | <dd>Flexible mode. When set to 1, this indicates flexible mode; if th | |||
| e | ||||
| <t hangText="P:">Inter-picture predicted frame. When set to zero, the | P bit is also set to 1, then the octets following the mandatory | |||
| frame does not utilize inter-picture prediction. In this case, | first octet, the PID, and layer indices (if present) are as | |||
| up-switching to a current spatial layer's frame is possible from direc | described by "reference indices" below. This bit <bcp14>MUST</bcp14> | |||
| tly | only be set to 1 if the I bit is also set to 1; if the I bit is | |||
| lower spatial layer frame. P SHOULD also be set to zero when | set to 0, then this bit <bcp14>MUST</bcp14> also be set to 0 and | |||
| encoding a layer synchronization frame in response to an <xref target= | ignored by receivers. (Flexible mode's reference indices are defined | |||
| 'I-D.ietf-avtext-lrr'>LRR</xref> message (see <xref target='LRR'/>). | as offsets from the Picture ID field, so they would have no meaning | |||
| When P is set to zero, the TID field (described below) MUST also | if I were not set.) The value of the F bit <bcp14>MUST</bcp14> | |||
| be set to 0 (if present). Note that the P bit does not | only change on the first packet of a key picture. A "key picture" is | |||
| forbid intra-picture, inter-layer prediction from earlier | a picture whose base spatial-layer frame is a keyframe, and thus one w | |||
| frames of the same picture, if any.</t> | hich | |||
| completely resets the encoder state. This packet will have its | ||||
| <t hangText="L:">Layer indices present. When set to one, | P bit equal to 0, SID or L bit (described below) equal to 0, | |||
| the one or two octets following the mandatory first octet and the PID | and B bit (described below) equal to 1.</dd> | |||
| (if present) is as described by "Layer indices" below. If the F bit ( | <dt>B:</dt> | |||
| described below) | <dd>Start of a frame. This bit <bcp14>MUST</bcp14> be set to 1 if | |||
| is set to 1 (indicating flexible mode), then only one octet is present | ||||
| for the | ||||
| layer indices. Otherwise if the F bit is set to 0 (indicating non-flex | ||||
| ible mode), | ||||
| then two octets are present for the layer indices.</t> | ||||
| <t hangText="F:">Flexible mode. F set to one indicates | ||||
| flexible mode and if the P bit is also set to one, then the octets fol | ||||
| lowing | ||||
| the mandatory first octet, the PID, and layer indices (if present) are | ||||
| as described by "Reference indices" below. This MUST only be set to 1 | ||||
| if the I | ||||
| bit is also set to one; if the I bit is set to zero, then this MUST al | ||||
| so be | ||||
| set to zero and ignored by receivers. (Flexible mode's Reference indic | ||||
| es are defined as offsets | ||||
| from the Picture ID field, so they would have no meaning if I were not | ||||
| set.) | ||||
| The value of this F | ||||
| bit MUST only change | ||||
| on the first packet of a key picture. A key picture is a | ||||
| picture whose base spatial layer frame is a key frame, and | ||||
| which thus completely resets the encoder state. This | ||||
| packet will have its P bit | ||||
| equal to zero, SID or L bit (described below) equal to zero, and B bit | ||||
| (described below) | ||||
| equal to 1.</t> | ||||
| <t hangText="B:">Start of a frame. MUST be set to 1 if | ||||
| the first payload octet of the RTP packet is the beginning of a | the first payload octet of the RTP packet is the beginning of a | |||
| new VP9 frame, and MUST NOT be 1 otherwise. Note that this | new VP9 frame; otherwise, it <bcp14>MUST NOT</bcp14> be 1. Note that t | |||
| frame might not be the first frame of a picture.</t> | his | |||
| frame might not be the first frame of a picture.</dd> | ||||
| <t hangText="E:">End of a frame. MUST be set to 1 for the final | <dt>E:</dt> | |||
| RTP packet of a VP9 frame, and 0 otherwise. This enables a | <dd>End of a frame. This bit <bcp14>MUST</bcp14> be set to 1 for the | |||
| final | ||||
| RTP packet of a VP9 frame; otherwise, it is 0. This enables a | ||||
| decoder to finish decoding the frame, where it otherwise may need to | decoder to finish decoding the frame, where it otherwise may need to | |||
| wait for the next packet to explicitly know that the frame is complete. | wait for the next packet to explicitly know that the frame is complete. | |||
| Note that, if spatial scalability is in use, more frames from the | Note that, if spatial scalability is in use, more frames from the | |||
| same picture may follow; see the description of the B bit above.</t> | same picture may follow; see the description of the B bit above.</dd> | |||
| <dt>V:</dt> | ||||
| <t hangText="V:">Scalability structure (SS) data present. When set | <dd>Scalability Structure (SS) data present. When set | |||
| to one, the OPTIONAL SS data MUST be present in the payload descriptor | to 1, the <bcp14>OPTIONAL</bcp14> SS data <bcp14>MUST</bcp14> be prese | |||
| . | nt in the payload descriptor. | |||
| Otherwise, the SS data MUST NOT be present.</t> | Otherwise, the SS data <bcp14>MUST NOT</bcp14> be present.</dd> | |||
| <dt>Z:</dt> | ||||
| <t hangText="Z:">Not a reference frame for upper spatial | <dd>Not a reference frame for upper spatial layers. If set to 1, | |||
| layers. If set to 1, indicates that frames with higher | indicates that frames with higher spatial layers SID+1 and greater | |||
| spatial layers SID+1 and greater of the current and following pictures | of the current and following pictures do not depend on the current | |||
| do not depend on the current spatial layer SID frame. This | spatial-layer SID frame. This enables a decoder that is targeting a | |||
| enables a decoder which is targeting a higher spatial layer | higher spatial layer to know that it can safely discard this | |||
| to know that it can safely discard this packet's frame | packet's frame without processing it, without having to wait for the | |||
| without processing it, without having to wait for the "D" | D bit in the higher-layer frame (see below).</dd> | |||
| bit in the higher-layer frame (see below).</t> | </dl> | |||
| </list></t> | ||||
| <t>The mandatory first octet is followed by the extension data fields that | <t>The mandatory first octet is followed by the extension data fields that | |||
| are enabled:<list style="hanging"> | are enabled:</t> | |||
| <t hangText="M:">The most significant bit of the first octet is an | <dl newline="false" spacing="normal"> | |||
| extension flag. The field MUST be present if the I bit is equal to | <dt>M:</dt> | |||
| one. If M is set, the PID field MUST contain 15 bits; otherwise, it MU | ||||
| ST | ||||
| contain 7 bits. See PID below.</t> | ||||
| <t hangText="Picture ID (PID):">Picture ID represented in 7 or 15 bits | ||||
| , | ||||
| depending on the M bit. This is a running index of the pictures, where | ||||
| the | ||||
| sender increments the value by 1 for each picture it sends. (Note how | ||||
| ever that | ||||
| because a middlebox can discard pictures where permitted by the scalab | ||||
| ility structure, Picture IDs | ||||
| as received by a receiver might not be contiguous.) This | ||||
| field MUST be present if the I bit is equal to one. If M is set to zer | ||||
| o, | ||||
| 7 bits carry the PID; else if M is set to one, 15 bits carry | ||||
| the PID in network byte order. | ||||
| The sender may choose between a 7- or 15-bit index. The PID SHOULD sta | ||||
| rt on a | ||||
| random number, and MUST wrap after reaching the maximum ID (0x7f or 0x | ||||
| 7fff depending on | ||||
| the index size chosen). The receiver | ||||
| MUST NOT assume that the number of bits in PID stay the same through t | ||||
| he | ||||
| session. If this field transitions from 7-bits to 15-bits, the value | ||||
| is zero-extended | ||||
| (i.e. the value after 0x6e is 0x006f); if the field transitions from 1 | ||||
| 5 bits to 7 bits, | ||||
| it is truncated (i.e. the value after 0x1bbe is 0xbf). | ||||
| </t> | ||||
| <t>In the non-flexible mode (when the F bit is set to 0), this PID is | ||||
| used | ||||
| as an index to the picture group (PG) specified in the SS data below. | ||||
| In this mode, the | ||||
| PID of the key frame corresponds to the first specified frame in the | ||||
| PG. Then subsequent PIDs are mapped to subsequently specified frames | ||||
| in | ||||
| the PG (modulo N_G, specified in the SS data below), respectively.</t> | ||||
| <t>All frames of the same picture MUST have the same PID value. | ||||
| </t> | ||||
| <t>Frames (and their corresponding pictures) with the VP9 show_ | ||||
| frame field equal to 0 MUST | ||||
| have distinct PID values from subsequent pictures with sh | ||||
| ow_frame equal to 1. Thus, | ||||
| a Picture as defined in this specification is different t | ||||
| han a VP9 Superframe.</t> | ||||
| <t>All frames of the same picture MUST have the same value for | ||||
| show_frame.</t> | ||||
| <t hangText="Layer indices:">This information is optional but RECOMMEN | ||||
| DED | ||||
| whenever encoding with layers. For both flexible and non-flexible mod | ||||
| es, | ||||
| one octet is used to specify a layer frame's temporal layer ID (TID) a | ||||
| nd spatial layer ID (SID) | ||||
| as shown both in <xref target="figureVP9payloadDescriptor"/> and <xref | ||||
| target="figureVP9payloadDescriptorNonFlexible"/>. | ||||
| Additionally, a bit (U) is used to indicate that the current frame is | ||||
| a | ||||
| "switching up point" frame. Another bit (D) is used to indicate wheth | ||||
| er inter-layer | ||||
| prediction is used for the current frame.</t> | ||||
| <t>In the non-flexible mode (when the F bit is set to 0), another octe | ||||
| t is used | ||||
| to represent temporal layer 0 index (TL0PICIDX), as depicted in <xref | ||||
| target="figureVP9payloadDescriptorNonFlexible"/>. | ||||
| The TL0PICIDX is present so that all minimally required frames - the b | ||||
| ase temporal layer frames - can be tracked.</t> | ||||
| <t>The TID and SID fields indicate the temporal and spatial layers and | ||||
| can help middleboxes and | ||||
| endpoints quickly identify which layer a packet belongs to. | ||||
| <list style="hanging"> | ||||
| <t hangText="TID:">The temporal layer ID of current frame. In the c | ||||
| ase of non-flexible mode, | ||||
| if PID is mapped to a picture in a specified PG, then | ||||
| the value of TID MUST match the corresponding TID value of the mappe | ||||
| d picture in the PG.</t> | ||||
| <t hangText="U:">Switching up point. If this bit is set to 1 for th | ||||
| e current picture with temporal | ||||
| layer ID equal to TID, then "switch up" to a higher frame rate is po | ||||
| ssible as subsequent higher temporal | ||||
| layer pictures will not depend on any picture before the current pic | ||||
| ture (in coding order) with temporal layer | ||||
| ID greater than TID.</t> | ||||
| <t hangText="SID:">The spatial layer ID of current frame. Note that | <dd>The most significant bit of the first octet is an extension | |||
| frames with spatial layer SID > 0 | flag. The field <bcp14>MUST</bcp14> be present if the I bit is equal | |||
| may be dependent on decoded spatial layer SID-1 frame within the sam | to one. If M is set, the PID field <bcp14>MUST</bcp14> contain 15 | |||
| e picture. Different | bits; otherwise, it <bcp14>MUST</bcp14> contain 7 bits. See PID | |||
| frames of the same picture MUST have distinct spatial lay | below.</dd> | |||
| er IDs, and frames' spatial layers | <dt>Picture ID (PID):</dt> | |||
| MUST appear in increasing order within the frame.</t> | <dd>Picture ID represented in 7 or 15 bits, depending on the M | |||
| bit. This is a running index of the pictures, where the sender | ||||
| increments the value by 1 for each picture it sends. (Note, | ||||
| however, that because a middlebox can discard pictures where | ||||
| permitted by the SS, Picture IDs as received by a | ||||
| receiver might not be contiguous.) This field <bcp14>MUST</bcp14> | ||||
| be present if the I bit is equal to 1. If M is set to 0, 7 bits | ||||
| carry the PID; else, if M is set to 1, 15 bits carry the PID in | ||||
| network byte order. The sender may choose between a 7- or 15-bit | ||||
| index. The PID <bcp14>SHOULD</bcp14> start on a random number and | ||||
| <bcp14>MUST</bcp14> wrap after reaching the maximum ID (0x7f or | ||||
| 0x7fff depending on the index size chosen). The receiver <bcp14>MUST | ||||
| NOT</bcp14> assume that the number of bits in the PID stays the same | ||||
| through the session. If this field transitions from 7 bits to 15 | ||||
| bits, the value is zero-extended (i.e., the value after 0x6e is | ||||
| 0x006f); if the field transitions from 15 bits to 7 bits, it is | ||||
| truncated (i.e., the value after 0x1bbe is 0xbf). | ||||
| </dd> | ||||
| <dt/> | ||||
| <dd>In the non-flexible mode (when the F bit is set to 0), this PID | ||||
| is used as an index to the PG specified in the SS | ||||
| data below. In this mode, the PID of the keyframe corresponds to | ||||
| the first specified frame in the PG. Then subsequent PIDs are | ||||
| mapped to subsequently specified frames in the PG (modulo N_G, | ||||
| specified in the SS data below), respectively.</dd> | ||||
| <dt/> | ||||
| <dd>All frames of the same picture <bcp14>MUST</bcp14> have the same | ||||
| PID value.</dd> | ||||
| <dt/> | ||||
| <dd>Frames (and their corresponding pictures) with the VP9 | ||||
| show_frame field equal to 0 <bcp14>MUST</bcp14> have distinct PID | ||||
| values from subsequent pictures with show_frame equal to 1. Thus, a | ||||
| picture (as defined in this specification) is different than a VP9 | ||||
| superframe.</dd> | ||||
| <dt/> | ||||
| <dd>All frames of the same picture <bcp14>MUST</bcp14> have the same | ||||
| value for show_frame.</dd> | ||||
| <t hangText="D:">Inter-layer dependency used. MUST be set to one if | <dt>Layer indices:</dt> | |||
| and only if the current spatial layer SID frame | <dd>This field is optional but <bcp14>RECOMMENDED</bcp14> | |||
| depends on spatial layer SID-1 frame of the same picture, otherwise | whenever encoding with layers. For both flexible and non-flexible | |||
| MUST be set to zero. For the base layer frame | modes, one octet is used to specify a layer frame's temporal-layer | |||
| (with SID equal to 0), this D bit MUST be set to zero.</t> | ID (TID) and spatial-layer ID (SID) as shown both in <xref | |||
| target="figureVP9payloadDescriptor" format="default"/> and <xref | ||||
| target="figureVP9payloadDescriptorNonFlexible" format="default"/>. | ||||
| Additionally, a bit (U) is used to indicate that the current frame | ||||
| is a "switching up point" frame. Another bit (D) is used to | ||||
| indicate whether inter-layer prediction is used for the current | ||||
| frame.</dd> | ||||
| <dt/> | ||||
| <dd>In the non-flexible mode (when the F bit is set to 0), another | ||||
| octet is used to represent Temporal Layer 0 Picture Index (8 bits) (TL | ||||
| 0PICIDX), as | ||||
| depicted in <xref target="figureVP9payloadDescriptorNonFlexible" | ||||
| format="default"/>. The TL0PICIDX is present so that all minimally | ||||
| required frames (the base temporal-layer frames) can be | ||||
| tracked.</dd> | ||||
| <dt/> | ||||
| <dd> | ||||
| <t>The TID and SID fields indicate the temporal and spatial layers | ||||
| and can help middleboxes and endpoints quickly identify which | ||||
| layer a packet belongs to. | ||||
| <t hangText="TL0PICIDX:">8 bits temporal layer zero index. TL0PICIDX | </t> | |||
| is only present | <dl newline="false" spacing="normal"> | |||
| in the non-flexible mode (F = 0). This is a running index for the t | <dt>TID:</dt> | |||
| emporal | <dd>The temporal-layer ID of the current frame. In the case of | |||
| base layer pictures, i.e., the pictures with TID set to 0. If TID i | non-flexible mode, if a PID is mapped to a picture in a specified | |||
| s larger than 0, | PG, then the value of the TID <bcp14>MUST</bcp14> match the | |||
| TL0PICIDX indicates which temporal base layer picture the current pi | corresponding TID value of the mapped picture in the PG.</dd> | |||
| cture depends on. TL0PICIDX MUST be | <dt>U:</dt> | |||
| incremented by 1 when TID is equal to 0. The index SHOULD start on | <dd>Switching up point. If this bit is set to 1 for the current | |||
| a random number, and MUST restart | picture with a temporal-layer ID equal to value T, then "switching | |||
| at 0 after reaching the maximum number 255.</t> | up" | |||
| </list></t> | to a higher frame rate is possible as subsequent higher | |||
| temporal-layer pictures will not depend on any picture before | ||||
| the current picture (in coding order) with a temporal-layer ID | ||||
| value greater than T.</dd> | ||||
| <dt>SID:</dt> | ||||
| <dd>The spatial-layer ID of the current frame. Note that frames | ||||
| with spatial-layer SID > 0 may be dependent on decoded | ||||
| spatial-layer SID-1 frame within the same picture. Different | ||||
| frames of the same picture <bcp14>MUST</bcp14> have distinct | ||||
| spatial-layer IDs, and frames' spatial layers | ||||
| <bcp14>MUST</bcp14> appear in increasing order within the | ||||
| frame.</dd> | ||||
| <dt>D:</dt> | ||||
| <dd>Inter-layer dependency is used. D <bcp14>MUST</bcp14> be | ||||
| set to 1 if and only if the current spatial-layer SID frame | ||||
| depends on spatial-layer SID-1 frame of the same picture; | ||||
| otherwise, it <bcp14>MUST</bcp14> be set to 0. For the | ||||
| base-layer frame (with SID equal to 0), the D bit | ||||
| <bcp14>MUST</bcp14> be set to 0.</dd> | ||||
| <dt>TL0PICIDX:</dt> | ||||
| <dd>Temporal Layer 0 Picture Index (8 bits). TL0PICIDX is only pres | ||||
| ent | ||||
| in the non-flexible mode (F = 0). This is a running index for | ||||
| the temporal base-layer pictures, i.e., the pictures with a TID | ||||
| set to 0. If the TID is larger than 0, TL0PICIDX indicates which | ||||
| temporal base-layer picture the current picture depends on. | ||||
| TL0PICIDX <bcp14>MUST</bcp14> be incremented by 1 when the TID is | ||||
| equal to 0. The index <bcp14>SHOULD</bcp14> start on a random | ||||
| number and <bcp14>MUST</bcp14> restart at 0 after reaching the | ||||
| maximum number 255.</dd> | ||||
| </dl> | ||||
| </dd> | ||||
| <dt>Reference indices:</dt> | ||||
| <dd> | ||||
| <t>When P and F are both set to 1, indicating a non-keyframe in | ||||
| flexible mode, then at least one reference index | ||||
| <bcp14>MUST</bcp14> be specified as below. Additional reference | ||||
| indices (a total of up to three reference indices are allowed) may b | ||||
| e | ||||
| specified using the N bit below. When either P or F is set to 0, | ||||
| then no reference index is specified. | ||||
| </t> | ||||
| <dl newline="false" spacing="normal"> | ||||
| <dt>P_DIFF:</dt> | ||||
| <dd>The reference index (in 7 bits) specified as the relative | ||||
| PID from the current picture. For example, P_DIFF=3 on a | ||||
| packet containing the picture with PID 112 means that the | ||||
| picture refers back to the picture with PID 109. This | ||||
| calculation is done modulo the size of the PID field, i.e., | ||||
| either 7 or 15 bits. A P_DIFF value of 0 is invalid.</dd> | ||||
| <dt>N:</dt> | ||||
| <dd>1 if there is an additional P_DIFF following the current P_DIFF.< | ||||
| /dd> | ||||
| </dl> | ||||
| </dd> | ||||
| </dl> | ||||
| <t hangText="Reference indices:">When P and F are both set to one, ind | <section anchor="VP9payloadDescriptorSS" numbered="true" toc="default"> | |||
| icating a non-key frame in | <name>Scalability Structure (SS)</name> | |||
| flexible mode, then at least | <t>The SS data describes the resolution of | |||
| one reference index MUST be specified as below. Additional reference | each frame within a picture as well as the inter-picture | |||
| indices (total of up to | dependencies for a PG. If the VP9 payload | |||
| 3 reference indices are allowed) may be specified using the N bit belo | descriptor's V bit is set, the SS data is present in the position | |||
| w. When either P or F is | indicated in Figures <xref format="counter" target="figureVP9payloadDe | |||
| set to zero, then no reference index is specified. | scriptor"/> and <xref target="figureVP9payloadDescriptorNonFlexible" format="cou | |||
| <list style="hanging"> | nter"/>.</t> | |||
| <t hangText="P_DIFF:">The reference index (in 7 bits) specified as t | ||||
| he | ||||
| relative PID from the current picture. For example, when P_DIFF=3 | ||||
| on a packet containing the picture with PID 112 means | ||||
| that the picture refers back to the picture with PID | ||||
| 109. This calculation is done modulo the size of the PID field, | ||||
| i.e., either 7 or 15 bits. A P_DIFF value of 0 is invalid.</t> | ||||
| <t hangText="N:">1 if there is additional P_DIFF following the curre | ||||
| nt P_DIFF.</t> | ||||
| </list></t> | ||||
| </list></t> | ||||
| <section anchor="VP9payloadDescriptorSS" title="Scalability Structure (SS) | <figure anchor="figureVP9ScalabilityStructure" title="VP9 Scalability | |||
| :"> | Structure"> | |||
| <t>The scalability structure (SS) data describes the resolution of | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
| each frame within a picture as well as the inter-picture dependencies | ||||
| for a picture group (PG). If the VP9 payload descriptor's "V" | ||||
| bit is set, the SS data is present in the position indicated in | ||||
| <xref target="figureVP9payloadDescriptor"/> and <xref target="figureVP9p | ||||
| ayloadDescriptorNonFlexible"/>.</t> | ||||
| <figure anchor="figureVP9ScalabilityStructure"> | ||||
| <artwork><![CDATA[ | ||||
| +-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| V: | N_S |Y|G|-|-|-| | V: | N_S |Y|G|-|-|-| | |||
| +-+-+-+-+-+-+-+-+ -\ | +-+-+-+-+-+-+-+-+ -\ | |||
| Y: | WIDTH | (OPTIONAL) . | Y: | WIDTH | (OPTIONAL) . | |||
| + + . | + + . | |||
| | | (OPTIONAL) . | | | (OPTIONAL) . | |||
| +-+-+-+-+-+-+-+-+ . - N_S + 1 times | +-+-+-+-+-+-+-+-+ . - N_S + 1 times | |||
| | HEIGHT | (OPTIONAL) . | | HEIGHT | (OPTIONAL) . | |||
| + + . | + + . | |||
| | | (OPTIONAL) . | | | (OPTIONAL) . | |||
| +-+-+-+-+-+-+-+-+ -/ | +-+-+-+-+-+-+-+-+ -/ | |||
| G: | N_G | (OPTIONAL) | G: | N_G | (OPTIONAL) | |||
| +-+-+-+-+-+-+-+-+ -\ | +-+-+-+-+-+-+-+-+ -\ | |||
| N_G: | TID |U| R |-|-| (OPTIONAL) . | N_G: | TID |U| R |-|-| (OPTIONAL) . | |||
| +-+-+-+-+-+-+-+-+ -\ . - N_G times | +-+-+-+-+-+-+-+-+ -\ . - N_G times | |||
| | P_DIFF | (OPTIONAL) . - R times . | | P_DIFF | (OPTIONAL) . - R times . | |||
| +-+-+-+-+-+-+-+-+ -/ -/ | +-+-+-+-+-+-+-+-+ -/ -/ | |||
| ]]></artwork> | ]]></artwork> | |||
| </figure> | </figure> | |||
| <t><list style="hanging"> | ||||
| <t hangText="N_S:">N_S + 1 indicates the number of spatial | ||||
| layers present in the VP9 stream.</t> | ||||
| <t hangText="Y:">Each spatial layer's frame resolution present. | ||||
| When set to one, the OPTIONAL WIDTH (2 octets) and HEIGHT | ||||
| (2 octets) MUST be present for each layer frame. Otherwise, the | ||||
| resolution MUST NOT be present.</t> | ||||
| <t hangText="G:">PG description present flag.</t> | ||||
| <t hangText="-:">Bit reserved for future use. MUST be set to | ||||
| zero and MUST be ignored by the receiver.</t> | ||||
| <t hangText="N_G:">N_G indicates the number of pictures in a | ||||
| Picture Group (PG). | ||||
| If N_G is greater than 0, then the SS data allows | ||||
| the inter-picture dependency structure of the VP9 stream to | ||||
| be pre-declared, rather than indicating it on the fly with | ||||
| every packet. If N_G is greater than 0, then for N_G | ||||
| pictures in the PG, each picture's temporal layer ID (TID), switch up | ||||
| point (U), | ||||
| and the Reference indices (P_DIFFs) are specified.</t> | ||||
| <t>The first picture specified in the PG MUST have TID set to 0.</t> | ||||
| <t>G set to 0 or N_G set to 0 indicates that either there is only one | ||||
| temporal | ||||
| layer (for non-flexible mode) or no fixed inter-picture dependency inf | ||||
| ormation is present | ||||
| (for flexible mode) going forward in the bitstream.</t> | ||||
| <t>Note that for a given picture, all frames follow the | <!--[rfced] We note that not all fields that appear in Figure 4 are | |||
| same inter-picture dependency structure. However, the frame rate | described following it. Please review and let us know if text | |||
| of each spatial layer can be different from each other and this can | (or a pointer to where the reader can get more information on | |||
| be described with the use of the D bit described above. The | these fields) should be added. | |||
| specified dependency structure in the SS data MUST be for the highest | ||||
| frame rate layer.</t> | ||||
| </list></t> | ||||
| <t>In a scalable stream sent with a fixed pattern, the SS data | --> | |||
| SHOULD be included in the first packet of every key frame. This is a pac | <dl newline="false" spacing="normal"> | |||
| ket | <dt>N_S:</dt> | |||
| with P bit equal to zero, SID or L bit equal to zero, and B bit equal to | <dd>Number of Spatial Layers Minus 1. N_S + 1 indicates the number | |||
| 1. | of spatial | |||
| The SS data MUST only be changed on the picture that corresponds to the | layers present in the VP9 stream.</dd> | |||
| first picture specified in the previous SS data's PG | <dt>Y:</dt> | |||
| (if the previous SS data's N_G was greater than 0).</t> | <dd>Each spatial layer's frame resolution is present. | |||
| When set to 1, the <bcp14>OPTIONAL</bcp14> WIDTH (2 octets) and HEIGHT | ||||
| (2 octets) <bcp14>MUST</bcp14> be present for each layer frame. Other | ||||
| wise, the | ||||
| resolution <bcp14>MUST NOT</bcp14> be present.</dd> | ||||
| <dt>G:</dt> | ||||
| <dd>The PG description present flag.</dd> | ||||
| <dt>-:</dt> | ||||
| <dd>A bit reserved for future use. It <bcp14>MUST</bcp14> be set | ||||
| to 0 and <bcp14>MUST</bcp14> be ignored by the receiver.</dd> | ||||
| <dt>N_G:</dt> | ||||
| <dd>N_G indicates the number of pictures in a PG. | ||||
| If N_G is greater than 0, then the SS data allows the | ||||
| inter-picture dependency structure of the VP9 stream to be | ||||
| pre-declared, rather than indicating it on the fly with every | ||||
| packet. If N_G is greater than 0, then for N_G pictures in the | ||||
| PG, each picture's temporal-layer ID (TID), switch up point (U), | ||||
| and reference indices (P_DIFFs) are specified.</dd> | ||||
| <dt/> | ||||
| <dd>The first picture specified in the PG <bcp14>MUST</bcp14> have a | ||||
| TID set to 0.</dd> | ||||
| <dt/> | ||||
| <dd>G set to 0 or N_G set to 0 indicates that either there is only | ||||
| one temporal layer (for non-flexible mode) or no fixed | ||||
| inter-picture dependency information is present (for flexible | ||||
| mode) going forward in the bitstream.</dd> | ||||
| <dt/> | ||||
| <dd>Note that for a given picture, all frames follow the same | ||||
| inter-picture dependency structure. However, the frame rate of | ||||
| each spatial layer can be different from each other; this can | ||||
| be described with the use of the D bit described above. The | ||||
| specified dependency structure in the SS data <bcp14>MUST</bcp14> | ||||
| be for the highest frame rate layer.</dd> | ||||
| </dl> | ||||
| <t>In a scalable stream sent with a fixed pattern, the SS data | ||||
| <bcp14>SHOULD</bcp14> be included in the first packet of every key | ||||
| frame. This is a packet with the P bit equal to 0, SID or L bit equal | ||||
| to 0, and B bit equal to 1. The SS data <bcp14>MUST</bcp14> only | ||||
| be changed on the picture that corresponds to the first picture | ||||
| specified in the previous SS data's PG (if the previous SS data's | ||||
| N_G was greater than 0).</t> | ||||
| </section> | ||||
| </section> | </section> | |||
| </section> | <section numbered="true" toc="default"> | |||
| <name>Frame Fragmentation</name> | ||||
| <section title="Frame Fragmentation"> | <t>VP9 frames are fragmented into packets in RTP sequence number | |||
| <t>VP9 frames are fragmented into packets, in RTP sequence | order: beginning with a packet with the B bit set and ending with a | |||
| number order, beginning with a | packet with the E bit set. There is no mechanism for finer-grained | |||
| packet with the B bit set, and ending with a packet with the | access to parts of a VP9 frame.</t> | |||
| E bit set. There is no mechanism for finer-grained | ||||
| access to parts of a VP9 frame.</t> | ||||
| </section> | </section> | |||
| <section numbered="true" toc="default"> | ||||
| <section title="Scalable encoding considerations"> | <name>Scalable Encoding Considerations</name> | |||
| <t>In addition to the use of reference frames, VP9 has several | ||||
| <t>In addition to the use of reference frames, VP9 has several | ||||
| additional forms of inter-frame dependencies, largely | additional forms of inter-frame dependencies, largely | |||
| involving probability tables for the entropy and tree | involving probability tables for the entropy and tree | |||
| encoders. In VP9 syntax, the syntax element | encoders. In VP9 syntax, the syntax element | |||
| "error_resilient_mode" resets this additional inter-frame | "error_resilient_mode" resets this additional inter-frame | |||
| data, allowing a frame's syntax to be decoded | data, allowing a frame's syntax to be decoded | |||
| independently.</t> | independently.</t> | |||
| <t>Due to the requirements of scalable streams, a VP9 encoder | ||||
| <t>Due to the requirements of scalable streams, a VP9 encoder | ||||
| producing a scalable stream needs to ensure that a frame does | producing a scalable stream needs to ensure that a frame does | |||
| not depend on a previous frame (of the same or a previous | not depend on a previous frame (of the same or a previous | |||
| picture) that can legitimately be removed from the stream. | picture) that can legitimately be removed from the stream. | |||
| Thus, a frame that follows a frame that might be removed (in full decode | Thus, a frame that follows a frame that might be removed (in full decode | |||
| order) MUST be encoded with "error_resilient_mode" set to | order) <bcp14>MUST</bcp14> be encoded with "error_resilient_mode" set to | |||
| true.</t> | true.</t> | |||
| <t>For spatially scalable streams, this means that | ||||
| <t>For spatially-scalable streams, this means that | ||||
| "error_resilient_mode" needs to be turned on for the base | "error_resilient_mode" needs to be turned on for the base | |||
| spatial layer; it can however be turned off for higher spatial | spatial layer; however, it can be turned off for higher spatial | |||
| layers, assuming they are sent with inter-layer dependency | layers, assuming they are sent with inter-layer dependency | |||
| (i.e. with the "D" bit set). For streams that are only | (i.e., with the D bit set). For streams that are only | |||
| temporally-scalable without spatial scalability, | temporally scalable without spatial scalability, | |||
| "error_resilient_mode" can additionally be turned off for any | "error_resilient_mode" can additionally be turned off for any | |||
| picture that immediately follows a temporal layer 0 frame.</t> | picture that immediately follows a temporal-layer 0 frame.</t> | |||
| </section> | ||||
| </section> | <section numbered="true" toc="default"> | |||
| <name>Examples of VP9 RTP Stream</name> | ||||
| <section title="Examples of VP9 RTP Stream"> | <section numbered="true" toc="default"> | |||
| <section title="Reference picture use for scalable structure"> | <name>Reference Picture Use for Scalable Structure</name> | |||
| <t>As discussed in <xref target="mediaFormatDescription" format="defau | ||||
| <t>As discussed in <xref target="mediaFormatDescription"/>, the | lt"/>, the | |||
| VP9 codec can maintain up to eight reference frames, of | VP9 codec can maintain up to eight reference frames, of | |||
| which up to three can be referenced or updated by any new | which up to three can be referenced or updated by any new | |||
| frame. This section illustrates one way that a scalable | frame. This section illustrates one way that a scalable | |||
| structure (with three spatial layers and three temporal | structure (with three spatial layers and three temporal | |||
| layers) can be constructed using these reference | layers) can be constructed using these reference | |||
| frames.</t> | frames.</t> | |||
| <table align="center"> | ||||
| <texttable title="Example scalability structure"> | <name>Example Scalability Structure</name> | |||
| <thead> | ||||
| <ttcol align="center">Temporal</ttcol> | <tr> | |||
| <ttcol align="center">Spatial</ttcol> | <th align="center">Temporal</th> | |||
| <ttcol align="center">References</ttcol> | <th align="center">Spatial</th> | |||
| <ttcol align="center">Updates</ttcol> | <th align="center">References</th> | |||
| <c>0</c><c>0</c><c>0</c><c>0</c> | <th align="center">Updates</th> | |||
| <c>0</c><c>1</c><c>0,1</c><c>1</c> | </tr> | |||
| <c>0</c><c>2</c><c>1,2</c><c>2</c> | </thead> | |||
| <c>2</c><c>0</c><c>0</c><c>6</c> | <tbody> | |||
| <c>2</c><c>1</c><c>1,6</c><c>7</c> | <tr> | |||
| <c>2</c><c>2</c><c>2,7</c><c>-</c> | <td align="center">0</td> | |||
| <c>1</c><c>0</c><c>0</c><c>3</c> | <td align="center">0</td> | |||
| <c>1</c><c>1</c><c>1,3</c><c>4</c> | <td align="center">0</td> | |||
| <c>1</c><c>2</c><c>2,4</c><c>5</c> | <td align="center">0</td> | |||
| <c>2</c><c>0</c><c>3</c><c>6</c> | </tr> | |||
| <c>2</c><c>1</c><c>4,6</c><c>7</c> | <tr> | |||
| <c>2</c><c>2</c><c>5,7</c><c>-</c> | <td align="center">0</td> | |||
| <td align="center">1</td> | ||||
| </texttable> | <td align="center">0,1</td> | |||
| <td align="center">1</td> | ||||
| <t>This structure is constructed such that the "U" bit can | </tr> | |||
| <tr> | ||||
| <td align="center">0</td> | ||||
| <td align="center">2</td> | ||||
| <td align="center">1,2</td> | ||||
| <td align="center">2</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">2</td> | ||||
| <td align="center">0</td> | ||||
| <td align="center">0</td> | ||||
| <td align="center">6</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">2</td> | ||||
| <td align="center">1</td> | ||||
| <td align="center">1,6</td> | ||||
| <td align="center">7</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">2</td> | ||||
| <td align="center">2</td> | ||||
| <td align="center">2,7</td> | ||||
| <td align="center">-</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">1</td> | ||||
| <td align="center">0</td> | ||||
| <td align="center">0</td> | ||||
| <td align="center">3</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">1</td> | ||||
| <td align="center">1</td> | ||||
| <td align="center">1,3</td> | ||||
| <td align="center">4</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">1</td> | ||||
| <td align="center">2</td> | ||||
| <td align="center">2,4</td> | ||||
| <td align="center">5</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">2</td> | ||||
| <td align="center">0</td> | ||||
| <td align="center">3</td> | ||||
| <td align="center">6</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">2</td> | ||||
| <td align="center">1</td> | ||||
| <td align="center">4,6</td> | ||||
| <td align="center">7</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">2</td> | ||||
| <td align="center">2</td> | ||||
| <td align="center">5,7</td> | ||||
| <td align="center">-</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <t>This structure is constructed such that the U bit can | ||||
| always be set.</t> | always be set.</t> | |||
| </section> | ||||
| </section> | </section> | |||
| </section> | ||||
| </section> | </section> | |||
| <section anchor="Feedback" title="Feedback Messages and Header Extensions"> | <section anchor="Feedback" numbered="true" toc="default"> | |||
| <section anchor="RPSI" title="Reference Picture Selection Indication (RPSI | <name>Feedback Messages and Header Extensions</name> | |||
| )"> | <section anchor="RPSI" numbered="true" toc="default"> | |||
| <t>The reference picture selection index is a payload-specific | ||||
| <name>Reference Picture Selection Indication (RPSI)</name> | ||||
| <t>The RPSI is a payload-specific | ||||
| feedback message defined within the RTCP-based feedback format. The | feedback message defined within the RTCP-based feedback format. The | |||
| RPSI message is generated by a receiver and can be used in two ways. | RPSI message is generated by a receiver and can be used in two ways: | |||
| Either it can signal a preferred reference picture when a loss has | either it can signal a preferred reference picture when a loss has | |||
| been detected by the decoder -- preferably then a reference that the | been detected by the decoder (preferably a reference that the decoder | |||
| decoder knows is perfect -- or, it can be used as positive feedback | knows is perfect) or it can be used as positive feedback information | |||
| information to acknowledge correct decoding of certain reference | to acknowledge correct decoding of certain reference pictures. The | |||
| pictures. The positive feedback method is useful for VP9 used for | positive feedback method is useful for VP9 used for point-to-point | |||
| point to point (unicast) communication. The use of RPSI for VP9 is prefe | (unicast) communication. The use of RPSI for VP9 is preferably | |||
| rably combined with a special | combined with a special update pattern of the codec's two special | |||
| update pattern of the codec's two special reference frames -- the | reference frames -- the golden frame and the altref frame -- in which th | |||
| golden frame and the altref frame -- in which they are updated in an | ey | |||
| alternating leapfrog fashion. When a receiver has received and | are updated in an alternating leapfrog fashion. When a receiver has | |||
| correctly decoded a golden or altref frame, and that frame had a | received and correctly decoded a golden or altref frame, and that | |||
| Picture ID in the payload descriptor, the receiver can acknowledge this | frame had a Picture ID in the payload descriptor, the receiver can | |||
| simply by sending an RPSI message back to the sender. The message body | acknowledge this simply by sending an RPSI message back to the | |||
| (i.e., the "native RPSI bit string" in <xref target="RFC4585"/>) is | sender. The message body (i.e., the "native RPSI bit string" in <xref | |||
| simply the (7 or 15 bit) Picture ID of the received frame.</t> | target="RFC4585" format="default"/>) is simply the (7- or 15-bit) | |||
| Picture ID of the received frame.</t> | ||||
| <t>Note: because all frames of the same picture must have the | <aside> | |||
| <t>Note: because all frames of the same picture must have the | ||||
| same inter-picture reference structure, there is no need for a | same inter-picture reference structure, there is no need for a | |||
| message to specify which frame is being selected.</t> | message to specify which frame is being selected.</t></aside> | |||
| </section> | </section> | |||
| <section anchor="FIR" numbered="true" toc="default"> | ||||
| <section title='Full Intra Request (FIR)' anchor="FIR"> | <name>Full Intra Request (FIR)</name> | |||
| <t>The <xref target="RFC5104" format="default">Full Intra Request (FIR)< | ||||
| <t>The <xref target='RFC5104'>Full Intra Request (FIR)</xref> | /xref> | |||
| RTCP feedback message allows a receiver to request a full state refresh of an encoded stream.</t> | RTCP feedback message allows a receiver to request a full state refresh of an encoded stream.</t> | |||
| <t>Upon receipt of a FIR request, a VP9 sender <bcp14>MUST</bcp14> | ||||
| <t>Upon receipt of an FIR request, a VP9 sender MUST send a | send a picture with a keyframe for its spatial-layer 0 layer frame and | |||
| picture with a keyframe for its spatial layer 0 layer | then send frames without inter-picture prediction (P=0) for any | |||
| frame, and then send frames without inter-picture prediction | higher-layer frames.</t> | |||
| (P=0) for any higher layer frames.</t> | </section> | |||
| <section anchor="LRR" numbered="true" toc="default"> | ||||
| </section> | <name>Layer Refresh Request (LRR)</name> | |||
| <t>The <xref target="RFC9627" format="default">Layer Refresh Request | ||||
| <section title="Layer Refresh Request (LRR)" anchor="LRR"> | (LRR)</xref> allows a receiver to request a single layer of a | |||
| <t>The <xref target="I-D.ietf-avtext-lrr">Layer Refresh Request ( | spatially or temporally encoded stream to be refreshed without | |||
| LRR)</xref> | necessarily affecting the stream's other layers.</t> | |||
| allows a receiver to request a single layer of a spatially or | <figure anchor="figureLRRIndexFormat" title="LRR Index Format"> | |||
| temporally encoded stream to be refreshed, without necessarily | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
| affecting the stream's other layers.</t> | ||||
| <figure anchor="figureLRRIndexFormat"> | ||||
| <artwork><![CDATA[ | ||||
| +---------------+---------------+ | +---------------+---------------+ | |||
| |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7| | |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7| | |||
| +---------------+---------+-----+ | +---------------+---------+-----+ | |||
| | RES | TID | RES | SID | | | RES | TID | RES | SID | | |||
| +---------------+---------+-----+ | +---------------+---------+-----+ | |||
| ]]></artwork> | ]]></artwork> | |||
| </figure> | </figure> | |||
| <t><xref target="figureLRRIndexFormat" format="default"/> shows the form | ||||
| <t><xref target="figureLRRIndexFormat"/> shows the format | at | |||
| of LRR's layer index fields for VP9 streams. The two "RES" | of an LRR's layer index fields for VP9 streams. The two "RES" | |||
| fields MUST be set to 0 on transmission and ingnored on | fields <bcp14>MUST</bcp14> be set to 0 on transmission and ignore | |||
| reception. See <xref target="VP9payloadDescriptor"/> for | d on | |||
| reception. See <xref target="VP9payloadDescriptor" format="defau | ||||
| lt"/> for | ||||
| details on the TID and SID fields.</t> | details on the TID and SID fields.</t> | |||
| <t>Identification of a layer refresh frame can be derived from the | <t>Identification of a layer refresh frame can be derived from | |||
| reference IDs of each frame by backtracking the dependency chain | the reference IDs of each frame by backtracking the dependency | |||
| until reaching a point where only decodable frames are being | chain until reaching a point where only decodable frames are | |||
| referenced. Therefore it's recommended for both the | being referenced. Therefore, it's recommended for both the | |||
| flexible and the non-flexible mode that, when switching up points are | flexible and the non-flexible mode that, when switching up | |||
| being encoded in response to a LRR, those packets should contain | points are being encoded in response to an LRR, those packets | |||
| layer indices and the reference field(s) so that the decoder or a | contain layer indices and the reference field or fields so | |||
| <xref target='RFC7667'>selective forwarding | that the decoder or <xref target="RFC7667" | |||
| middleboxes</xref> can make this derivation.</t> | format="default">selective forwarding middleboxes</xref> can | |||
| make this derivation.</t> | ||||
| <t>Example:</t> | <t>Example:</t> | |||
| <t>LRR {1,0}, {2,1} is sent by an MCU when it is currently | <t>LRR {1,0}, {2,1} is sent by a Multipoint Control | |||
| relaying {1,0} to a receiver and which wants to upgrade to | Unit (MCU) when it is currently | |||
| {2,1}. In response the encoder should encode the next frames | relaying {1,0} to a receiver that wants to upgrade to | |||
| {2,1}. In response, the encoder should encode the next frames | ||||
| in layers {1,1} and {2,1} by only referring to frames in | in layers {1,1} and {2,1} by only referring to frames in | |||
| {1,0}, or {0,0}.</t> | {1,0} or {0,0}.</t> | |||
| <t>In the non-flexible mode, periodic upgrade frames can be defined by | ||||
| <t>In the non-flexible mode, periodic upgrade frames can be | the layer structure of the SS; thus, periodic upgrade frames can be | |||
| defined by the layer structure of the SS, thus periodic upgrade | automatically identified by the Picture ID.</t> | |||
| frames can be automatically identified by the picture ID.</t> | </section> | |||
| </section> | ||||
| </section> | </section> | |||
| <section anchor="payloadFormatParameters" | <section anchor="payloadFormatParameters" numbered="true" toc="default"> | |||
| title="Payload Format Parameters"> | ||||
| <t>This payload format has three optional parameters, "max-fr", "max-fs", | ||||
| and "profile-id".</t> | ||||
| <t>The max-fr and max-fs | <name>Payload Format Parameters</name> | |||
| parameters are used to signal the capabilities of a receiver | <t>This payload format has three optional parameters: max-fr, | |||
| implementation. If the implementation is willing to | max-fs, and profile-id.</t> | |||
| receive media, both parameters MUST be provided. These parameters MU | <t>The max-fr and max-fs parameters are used to signal the capabilities | |||
| ST | of a receiver implementation. If the implementation is willing to | |||
| NOT be used for any other purpose. A media sender SHOULD NOT send | receive media, both parameters <bcp14>MUST</bcp14> be provided. These | |||
| media with a frame rate or frame size exceeding the max-fr and max-f | parameters <bcp14>MUST NOT</bcp14> be used for any other purpose. A | |||
| s | media sender <bcp14>SHOULD NOT</bcp14> send media with a frame rate or | |||
| values signaled. (There may be scenarios, such as pre-encoded | frame size exceeding the max-fr and max-fs values signaled. (There may | |||
| media or <xref target='RFC7667'>selective forwarding | be scenarios, such as pre-encoded media or <xref target="RFC7667" | |||
| middleboxes</xref>, where a media sender does not have media availab | format="default">selective forwarding middleboxes</xref>, where a media | |||
| le | sender does not have media available that fits within a receiver's | |||
| that fits within a receivers max-fs and max-fr value; in such | max-fs and max-fr values; in such scenarios, a sender <bcp14>MAY</bcp14> | |||
| scenarios, a sender MAY exceed the signaled values.) | exceed the signaled values.) | |||
| <list style="hanging"> | </t> | |||
| <t hangText="max-fr:">The value of max-fr is an integer | <dl newline="false" spacing="normal"> | |||
| <dt>max-fr:</dt> | ||||
| <dd>The value of max-fr is an integer | ||||
| indicating the maximum frame rate in units of frames per | indicating the maximum frame rate in units of frames per | |||
| second that the decoder is capable of decoding.</t> | second that the decoder is capable of decoding.</dd> | |||
| <dt>max-fs:</dt> | ||||
| <t hangText="max-fs:">The value of max-fs is an integer | <dd>The value of max-fs is an integer | |||
| indicating the maximum frame size in units of macroblocks that | indicating the maximum frame size in units of macroblocks that | |||
| the decoder is capable of decoding.</t> | the decoder is capable of decoding.</dd> | |||
| <dt/> | ||||
| <t>The decoder is capable of decoding this frame size as long | <dd>The decoder is capable of decoding this frame size as long | |||
| as the width and height of the frame in macroblocks are less | as the width and height of the frame in macroblocks are each les | |||
| than int(sqrt(max-fs * 8)) - for instance, a max-fs of 1200 | s | |||
| than int(sqrt(max-fs * 8)); for instance, a max-fs of 1200 | ||||
| (capable of supporting 640x480 resolution) will support widths | (capable of supporting 640x480 resolution) will support widths | |||
| and heights up to 1552 pixels (97 macroblocks).</t> | and heights up to 1552 pixels (97 macroblocks).</dd> | |||
| <dt>profile-id:</dt> | ||||
| <t hangText="profile-id:">The value of profile-id is an integer | <dd>The value of profile-id is an integer indicating the default | |||
| indicating the default coding profile, the subset of coding | coding profile (the subset of coding tools that may have been used to | |||
| tools that may have been used to generate the stream or that the | generate the stream or that the receiver supports). <xref | |||
| receiver supports). <xref target="TableOfProfileIds"/> lists all | target="TableOfProfileIds" format="default"/> lists all of the | |||
| of the profiles defined in section 7.2 of <xref target="VP9-BITST | profiles defined in Section 7.2 of <xref target="VP9-BITSTREAM" | |||
| REAM"/> | format="default"/> and the corresponding integer values to be | |||
| and the corresponding integer values to be used.</t> | used.</dd> | |||
| <dt/> | ||||
| <t>If no profile-id is present, Profile 0 MUST be inferred. (The | <dd>If no profile-id is present, Profile 0 <bcp14>MUST</bcp14> be inferred. (The | |||
| profile-id parameter was added relatively late in the development of this | profile-id parameter was added relatively late in the development of this | |||
| specification, so some existing implementations may not send it.) | specification, so some existing implementations may not send it.) | |||
| </t> | </dd> | |||
| <dt/> | ||||
| <t>Informative note: See <xref target="TableOfProfiles"/> for cap | <dd>Informative note: See <xref target="TableOfProfiles" | |||
| abilities | format="default"/> for capabilities of coding profiles defined in Sectio | |||
| of coding profiles defined in section 7.2 of <xref target="VP9-BI | n 7.2 of | |||
| TSTREAM"/>.</t> | <xref target="VP9-BITSTREAM" format="default"/>.</dd> | |||
| </list></t> | </dl> | |||
| <t>A receiver <bcp14>MUST</bcp14> ignore any parameter unspecified in this | ||||
| <t>A receiver MUST ignore any parameter unspecified in this | specification.</t> | |||
| specification.</t> | ||||
| <texttable anchor="TableOfProfileIds" title="Table of profile-id | ||||
| integer values representing the VP9 profile corresponding to the set of | ||||
| coding tools supported."> | ||||
| <ttcol align="center">Profile</ttcol> | ||||
| <ttcol align="center">profile-id</ttcol> | ||||
| <c>0</c><c>0</c> | ||||
| <c>1</c><c>1</c> | ||||
| <c>2</c><c>2</c> | ||||
| <c>3</c><c>3</c> | ||||
| </texttable> | ||||
| <texttable anchor="TableOfProfiles" title="Table of profile | ||||
| capabilities."> | ||||
| <ttcol align="center">Profile</ttcol> | ||||
| <ttcol align="center">Bit Depth</ttcol> | ||||
| <ttcol align="center">SRGB Colorspace</ttcol> | ||||
| <ttcol align="center">Chroma Subsampling</ttcol> | ||||
| <c>0</c><c>8</c><c>No</c><c>YUV 4:2:0</c> | ||||
| <c>1</c><c>8</c><c>Yes</c><c>YUV 4:2:2,4:4:0 or 4:4:4</c> | ||||
| <c>2</c><c>10 or 12</c><c>No</c><c>YUV 4:2:0</c> | ||||
| <c>3</c><c>10 or 12</c><c>Yes</c><c>YUV 4:2:2,4:4:0 or 4:4:4</c> | ||||
| </texttable> | ||||
| <section anchor="SDPParameters" title="SDP Parameters"> | ||||
| <section title="Mapping of Media Subtype Parameters to SDP"> | ||||
| <t>The media type video/VP9 string is mapped to fields in the | ||||
| Session Description Protocol (SDP) <xref target="RFC8866"/> as | ||||
| follows: <list style="symbols"> | ||||
| <t>The media name in the "m=" line of SDP MUST be video.</t> | ||||
| <t>The encoding name in the "a=rtpmap" line of SDP MUST be VP9 | <table anchor="TableOfProfileIds" align="center"> | |||
| (the media subtype).</t> | <name>Correspondence between profile-id and VP9 Profile Integer</name> | |||
| <thead> | ||||
| <tr> | ||||
| <th align="center">Profile</th> | ||||
| <th align="center">profile-id</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td align="center">0</td> | ||||
| <td align="center">0</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">1</td> | ||||
| <td align="center">1</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">2</td> | ||||
| <td align="center">2</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">3</td> | ||||
| <td align="center">3</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <table anchor="TableOfProfiles" align="center"> | ||||
| <name>Profile Capabilities</name> | ||||
| <thead> | ||||
| <tr> | ||||
| <th align="center">Profile</th> | ||||
| <th align="center">Bit Depth</th> | ||||
| <th align="center">SRGB Colorspace</th> | ||||
| <th align="center">Chroma Subsampling</th> | ||||
| </tr> | ||||
| </thead> | ||||
| <tbody> | ||||
| <tr> | ||||
| <td align="center">0</td> | ||||
| <td align="center">8</td> | ||||
| <td align="center">No</td> | ||||
| <td align="center">YUV 4:2:0</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">1</td> | ||||
| <td align="center">8</td> | ||||
| <td align="center">Yes</td> | ||||
| <td align="center">YUV 4:2:2,4:4:0 or 4:4:4</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">2</td> | ||||
| <td align="center">10 or 12</td> | ||||
| <td align="center">No</td> | ||||
| <td align="center">YUV 4:2:0</td> | ||||
| </tr> | ||||
| <tr> | ||||
| <td align="center">3</td> | ||||
| <td align="center">10 or 12</td> | ||||
| <td align="center">Yes</td> | ||||
| <td align="center">YUV 4:2:2,4:4:0 or 4:4:4</td> | ||||
| </tr> | ||||
| </tbody> | ||||
| </table> | ||||
| <aside><t keepWithPrevious="true">Note: SRGB (often sRGB) = Standard Red-G | ||||
| reen-Blue</t></aside> | ||||
| <t>The clock rate in the "a=rtpmap" line MUST be 90000.</t> | <section anchor="SDPParameters" numbered="true" toc="default"> | |||
| <name>SDP Parameters</name> | ||||
| <section numbered="true" toc="default"> | ||||
| <name>Mapping of Media Subtype Parameters to SDP</name> | ||||
| <t>The parameters "max-fr" and "max-fs" MUST be included in | <t>The media type video/vp9 string is mapped to fields in the | |||
| Session Description Protocol (SDP) <xref target="RFC8866" format="defa | ||||
| ult"/> as | ||||
| follows: </t> | ||||
| <ul spacing="normal"> | ||||
| <li>The media name in the "m=" line of SDP <bcp14>MUST</bcp14> be vi | ||||
| deo.</li> | ||||
| <li>The encoding name in the "a=rtpmap" line of SDP | ||||
| <bcp14>MUST</bcp14> be VP9 (the media subtype).</li> | ||||
| <li>The clock rate in the "a=rtpmap" line <bcp14>MUST</bcp14> be 900 | ||||
| 00.</li> | ||||
| <li>The parameters max-fr and max-fs <bcp14>MUST</bcp14> be included | ||||
| in | ||||
| the "a=fmtp" line of SDP if the receiver wishes to declare its receiver | the "a=fmtp" line of SDP if the receiver wishes to declare its receiver | |||
| capabilities. These parameters are expressed as a media subtype | capabilities. These parameters are expressed as a media subtype | |||
| string, in the form of a semicolon separated list of | string in the form of a semicolon-separated list of | |||
| parameter=value pairs.</t> | parameter=value pairs.</li> | |||
| <li>The <bcp14>OPTIONAL</bcp14> parameter profile-id, when present, | ||||
| <t>The OPTIONAL parameter profile-id, when present, SHOULD be | <bcp14>SHOULD</bcp14> be | |||
| included in the "a=fmtp" line of SDP. This parameter is expressed | included in the "a=fmtp" line of SDP. This parameter is expressed | |||
| as a media subtype string, in the form of a parameter=value | as a media subtype string in the form of a parameter=value | |||
| pair. When the parameter is not present, a value of 0 MUST be | pair. When the parameter is not present, a value of 0 <bcp14>MUST</ | |||
| inferred for profile-id.</t> | bcp14> be | |||
| </list></t> | inferred for profile-id.</li> | |||
| </ul> | ||||
| <section title="Example"> | <section numbered="true" toc="default"> | |||
| <name>Example</name> | ||||
| <t>An example of media representation in SDP is as follows:</t> | <t>An example of media representation in SDP is as follows:</t> | |||
| <sourcecode type="sdp"><![CDATA[m=video 49170 RTP/AVPF 98 | ||||
| <figure> | ||||
| <artwork>m=video 49170 RTP/AVPF 98 | ||||
| a=rtpmap:98 VP9/90000 | a=rtpmap:98 VP9/90000 | |||
| a=fmtp:98 max-fr=30;max-fs=3600;profile-id=0 | a=fmtp:98 max-fr=30;max-fs=3600;profile-id=0 | |||
| </artwork> | ]]></sourcecode> | |||
| </figure> | ||||
| </section> | </section> | |||
| </section> | </section> | |||
| <section numbered="true" toc="default"> | ||||
| <section title="Offer/Answer Considerations"> | <name>Offer/Answer Considerations</name> | |||
| <t>When VP9 is offered over RTP using SDP in an Offer/Answer model | <t>When VP9 is offered over RTP using SDP in an Offer/Answer model | |||
| <xref target="RFC3264"/> for negotiation for unicast usage, the follow | <xref target="RFC3264" format="default"/> for negotiation for unicast | |||
| ing | usage, the following | |||
| limitations and rules apply: <list style="symbols"> | limitations and rules apply: </t> | |||
| <t>The parameter identifying a media format configuration for VP9 i | <ul spacing="normal"> | |||
| s | <li>The parameter identifying a media format configuration for VP9 i | |||
| profile-id. This media format configuration parameter MUST be used | s | |||
| symmetrically; that is, the answerer MUST either maintain this | profile-id. This media format configuration parameter <bcp14>MUST</ | |||
| bcp14> be used | ||||
| symmetrically; that is, the answerer <bcp14>MUST</bcp14> either mai | ||||
| ntain this | ||||
| configuration parameter or remove the media format (payload type) | configuration parameter or remove the media format (payload type) | |||
| completely if it is not supported.</t> | completely if it is not supported.</li> | |||
| <li>The max-fr and max-fs parameters are used declaratively to | ||||
| <t>The max-fr and max-fs parameters are used declaratively to | ||||
| describe receiver capabilities, even in the Offer/Answer model. | describe receiver capabilities, even in the Offer/Answer model. | |||
| The values in an answer are used to describe the answerer's | The values in an answer are used to describe the answerer's | |||
| capabilities, and thus their values are set independently of the | capabilities; thus, their values are set independently of the | |||
| values in the offer.</t> | values in the offer.</li> | |||
| <li>To simplify the handling and matching of these configurations, t | ||||
| <t>To simplify the handling and matching of these configurations, | he | |||
| the | same RTP payload type number used in the offer <bcp14>SHOULD</bcp1 | |||
| same RTP payload type number used in the offer SHOULD also be used | 4> also be used | |||
| in the answer and in a subsequent offer, as specified in <xref | in the answer and in a subsequent offer, as specified in <xref tar | |||
| target="RFC3264"/>. An answer or subsequent offer | get="RFC3264" format="default"/>. An answer or subsequent offer | |||
| MUST NOT contain the payload type number used in the offer unless t | <bcp14>MUST NOT</bcp14> contain the payload type number used in the | |||
| he | offer unless the | |||
| profile-id value is exactly the same as in the original offer. | profile-id value is exactly the same as in the original offer. | |||
| However, max-fr and max-fs parameters MAY be changed in subsequent | However, max-fr and max-fs parameters <bcp14>MAY</bcp14> be changed in subsequent | |||
| offers and answers, with the same payload type number, if an endpoint | offers and answers, with the same payload type number, if an endpoint | |||
| wishes to change its declared receiver capabilities.</t> | wishes to change its declared receiver capabilities.</li> | |||
| </list></t> | </ul> | |||
| </section> | </section> | |||
| </section> | </section> | |||
| </section> | </section> | |||
| <section anchor="mediaTypeRegistration" title="Media Type Definition"> | <section anchor="mediaTypeRegistration" numbered="true" toc="default"> | |||
| <t>This registration is done using the template defined in <xref | <name>Media Type Definition</name> | |||
| target="RFC6838"/> and following <xref target="RFC4855"/>. <list | <t>This registration uses the template defined in <xref target="RFC6838" f | |||
| style="hanging"> | ormat="default"/> and following <xref target="RFC4855" format="default"/>. </t> | |||
| <t hangText="Type name:">video</t> | ||||
| <t hangText="Subtype name:">VP9</t> | ||||
| <t hangText="Required parameters:">N/A.</t> | ||||
| <t hangText="Optional parameters:"><vspace blankLines="0"/> | <!--[rfced] Please note that after AUTH48 concludes, we will | |||
| There are three optional parameters, "max-fr", "max-fs", and "profil | communicate any changes to the media type template in Section 7 | |||
| e-id". | to IANA for corresponding updates to | |||
| See <xref target='payloadFormatParameters' /> for their definition. | https://www.iana.org/assignments/media-types/video/VP9 to be | |||
| </t> | made.--> | |||
| <t hangText="Encoding considerations:"><vspace blankLines="0"/> | <dl newline="false" spacing="normal"> | |||
| <dt>Type name:</dt> | ||||
| <dd>video</dd> | ||||
| <dt>Subtype name:</dt> | ||||
| <dd>VP9</dd> | ||||
| <dt>Required parameters:</dt> | ||||
| <dd>N/A</dd> | ||||
| <dt>Optional parameters:</dt> | ||||
| <dd> | ||||
| There are three optional parameters: max-fr, max-fs, and profile-id. | ||||
| See <xref target="payloadFormatParameters" format="default"/> for th | ||||
| eir definition. | ||||
| </dd> | ||||
| <dt>Encoding considerations:</dt> | ||||
| <dd> | ||||
| This media type is framed in RTP and contains binary data; see | This media type is framed in RTP and contains binary data; see | |||
| Section 4.8 of <xref target="RFC6838"/>.</t> | <xref target="RFC6838" sectionFormat="of" section="4.8" | |||
| format="default"/>.</dd> | ||||
| <t hangText="Security considerations:">See <xref | <dt>Security considerations:</dt> | |||
| target="securityConsiderations"/> of RFC xxxx. <vspace | <dd> | |||
| blankLines="0"/> [RFC Editor: Upon publication as an RFC, please | <t>See <xref target="securityConsiderations" format="default"/> of RFC | |||
| replace "XXXX" with the number assigned to this document and | 9628. </t> | |||
| remove this note.]</t> | ||||
| <t hangText="Interoperability considerations:">None.</t> | ||||
| <t hangText="Published specification:">VP9 bitstream format <xref | ||||
| target="VP9-BITSTREAM"/> and RFC XXXX. <vspace blankLines="0"/> [RFC | ||||
| Editor: Upon publication as an RFC, please replace "XXXX" with the | ||||
| number assigned to this document and remove this note.] <vspace | ||||
| blankLines="0"/></t> | ||||
| <t hangText="Applications which use this media type:"><vspace | ||||
| blankLines="0"/> For example: Video over IP, video | ||||
| conferencing.</t> | ||||
| <t hangText="Fragment identifier considerations:">N/A.</t | ||||
| > | ||||
| <t hangText="Additional information:">None.</t> | ||||
| <t | </dd> | |||
| hangText="Person & email address to contact for further informat | <dt>Interoperability considerations:</dt> | |||
| ion:"><vspace | <dd>None</dd> | |||
| blankLines="0"/> Jonathan Lennox <jonathan.lennox@8x8.com></t> | <dt>Published specification:</dt> | |||
| <dd> | ||||
| <t>VP9 bitstream format <xref target="VP9-BITSTREAM" format="default"/ | ||||
| > and RFC 9628. </t> | ||||
| <t hangText="Intended usage:">COMMON</t> | </dd> | |||
| <dt>Applications that use this media type:</dt> | ||||
| <dd> For example, video over IP, video | ||||
| conferencing.</dd> | ||||
| <dt>Fragment identifier considerations:</dt> | ||||
| <dd>N/A</dd> | ||||
| <dt>Additional information:</dt> | ||||
| <dd>None</dd> | ||||
| <dt>Person & email address to contact for further information:</dt> | ||||
| <dd><t><contact fullname="Jonathan Lennox"/> <jonathan.lennox@8x8.com | ||||
| ></t></dd> | ||||
| <dt>Intended usage:</dt> | ||||
| <dd>COMMON</dd> | ||||
| <dt>Restrictions on usage:</dt> | ||||
| <dd> This media type depends on RTP framing; hence, it is only defined | ||||
| for transfer via RTP <xref target="RFC3550" format="default"/>.</dd> | ||||
| <dt>Author:</dt> | ||||
| <dd><t><contact fullname="Jonathan Lennox"/> <jonathan.lennox@8x8.com | ||||
| ></t></dd> | ||||
| <t hangText="Restrictions on usage:"><vspace blankLines="0"/> This | <!--[rfced] Please review the entry for "Change Controller" in Section | |||
| media type depends on RTP framing, and hence is only defined for | 7. While we see similar text for the vp8 and vc2 entries, we want to | |||
| transfer via RTP <xref target="RFC3550"/>.</t> | confirm that this entry has been reviewed with the following in | |||
| mind from | ||||
| https://www.iana.org/help/protocol-registration: | ||||
| <t hangText="Author:">Jonathan Lennox <jonathan.lennox@8x8.com> | "The IESG shouldn't be listed as a change controller unless the | |||
| ;</t> | RFC that created the registry (e.g. port numbers, XML namespaces | |||
| and schemas) requires it. The IETF should be named instead." | ||||
| <t hangText="Change controller:"><vspace blankLines="0"/> IETF | --> | |||
| AVTCore Working Group delegated from the IESG.</t> | ||||
| </list></t> | ||||
| </section> | ||||
| <section anchor="securityConsiderations" title="Security Considerations" | <dt>Change controller:</dt> | |||
| > | <dd> IETF | |||
| AVTCore Working Group delegated from the IESG.</dd> | ||||
| </dl> | ||||
| </section> | ||||
| <section anchor="securityConsiderations" numbered="true" toc="default"> | ||||
| <name>Security Considerations</name> | ||||
| <t>RTP packets using the payload format defined in this specification | <t>RTP packets using the payload format defined in this specification | |||
| are subject to the security considerations discussed in the RTP | are subject to the security considerations discussed in the RTP | |||
| specification <xref target="RFC3550"/>, and in any applicable RTP | specification <xref target="RFC3550" format="default"/>, and in any | |||
| profile such | applicable RTP profile such as <xref target="RFC3551" | |||
| as <xref target='RFC3551'>RTP/AVP</xref>, <xref target='RFC4585'>RTP/AVPF< | format="default">RTP/AVP</xref>, <xref target="RFC4585" | |||
| /xref>, | format="default">RTP/AVPF</xref>, <xref target="RFC3711" | |||
| <xref target='RFC3711'>RTP/SAVP</xref>, | format="default">RTP/SAVP</xref>, or <xref target="RFC5124" | |||
| or <xref target='RFC5124'>RTP/SAVPF</xref>. | format="default">RTP/SAVPF</xref>. However, as "<xref target="RFC7202" fo | |||
| However, as <xref target='RFC7202'>"Securing the RTP Protocol | rmat="title"/>" <xref target="RFC7202" | |||
| Framework: Why RTP Does Not Mandate a Single Media | format="default"></xref> discusses, it is not an RTP | |||
| Security Solution"</xref> discusses, it is not an RTP payload format's res | payload format's responsibility to discuss or mandate what solutions are | |||
| ponsibility to | used to meet the basic security goals like confidentiality, integrity, | |||
| discuss or mandate what solutions are used to meet the | and source authenticity for RTP in general. This responsibility lies with | |||
| basic security goals like confidentiality, integrity and source | ||||
| authenticity for RTP in general. This responsibility lays on | ||||
| anyone using RTP in an application. They can find guidance on available | anyone using RTP in an application. They can find guidance on available | |||
| security mechanisms in <xref target='RFC7201'>Options for Securing | security mechanisms in "<xref target="RFC7201" format="title"/>" <xref targ | |||
| RTP Sessions</xref>. Applications SHOULD use one or more appropriate | et="RFC7201" format="default"></xref>. Applications <bcp14>SHOULD</bcp14> | |||
| strong security mechanisms. The rest of this security | use one or more appropriate strong security mechanisms.</t> | |||
| consideration section discusses the security impacting properties of the | <t>Implementations of this RTP payload format need to take appropriate | |||
| payload format itself.</t> | security considerations into account. It is extremely important for the | |||
| decoder to be robust against malicious or malformed payloads and ensure | ||||
| <t>Implementations of this RTP payload format need to take appropriate sec | that they do not cause the decoder to overrun its allocated memory or | |||
| urity | otherwise misbehave. An overrun in allocated memory could lead to | |||
| considerations into account. It is extremely important for the decoder to | arbitrary code execution by an attacker. The same applies to the | |||
| be | encoder, even though problems in encoders are (typically) rarer.</t> | |||
| robust against malicious or malformed payloads and ensure that they do not | <t>This RTP payload format and its media decoder do not exhibit any | |||
| cause the decoder | significant non-uniformity in the receiver-side computational complexity | |||
| to overrun its allocated memory or otherwise mis-behave. An overrun in al | for packet processing; thus, they are unlikely to pose a denial-of-service | |||
| located memory could lead to | threat due to the receipt of pathological data. Nor does the RTP payload | |||
| arbitrary code execution by an attacker. The same applies to the encoder, | format contain any active content.</t> | |||
| even | ||||
| though problems in encoders are typically rarer.</t> | ||||
| <t>This RTP payload | ||||
| format and its media decoder do not exhibit any significant | ||||
| non-uniformity in the receiver-side computational complexity for packet | ||||
| processing, and thus are unlikely to pose a denial-of-service threat due | ||||
| to the receipt of pathological data. Nor does the RTP payload format | ||||
| contain any active content.</t> | ||||
| </section> | </section> | |||
| <section anchor="congestionControl" numbered="true" toc="default"> | ||||
| <section anchor="congestionControl" title="Congestion Control"> | <name>Congestion Control</name> | |||
| <t>Congestion control for RTP SHALL be used in accordance with RFC 3550 | <t>Congestion control for RTP <bcp14>SHALL</bcp14> be used in accordance | |||
| <xref target="RFC3550"/>, and with any applicable RTP profile; e.g., RFC | with <xref target="RFC3550" format="default"/>, and with any | |||
| 3551 <xref target="RFC3551"/>. The congestion control mechanism can, in | applicable RTP profile, e.g., <xref target="RFC3551" | |||
| a real-time encoding scenario, adapt the transmission rate by | format="default"/>. The congestion control mechanism can, in a real-time | |||
| instructing the encoder to encode at a certain target rate. Media aware | encoding scenario, adapt the transmission rate by instructing the | |||
| network elements MAY use the information in the VP9 payload descriptor | encoder to encode at a certain target rate. Media-aware network elements | |||
| in <xref target="VP9payloadDescriptor"/> to identify non-reference | <bcp14>MAY</bcp14> use the information in the VP9 payload descriptor in | |||
| frames and discard them in order to reduce network congestion. Note that | <xref target="VP9payloadDescriptor" format="default"/> to identify | |||
| discarding of non-reference frames cannot be done if the stream is | non-reference frames and discard them in order to reduce network | |||
| encrypted (because the non-reference marker is encrypted).</t> | congestion. Note that discarding of non-reference frames cannot be done | |||
| if the stream is encrypted (because the non-reference marker is | ||||
| encrypted).</t> | ||||
| </section> | </section> | |||
| <section anchor="IANAConsiderations" numbered="true" toc="default"> | ||||
| <name>IANA Considerations</name> | ||||
| <section anchor="IANAConsiderations" title="IANA Considerations"> | <t>IANA has registered the media type registration "video/vp9" | |||
| <t>The IANA is requested to register the media type registration | as specified in <xref target="mediaTypeRegistration" format="default"/>. | |||
| "video/vp9" as specified in <xref | The media type has also been added to the | |||
| target="mediaTypeRegistration"/>. The media type is also | "RTP Payload Format Media Types" <eref | |||
| requested to | target="https://www.iana.org/assignments/rtp-parameters" | |||
| be added to the IANA registry for "RTP Payload Format MIME types" | brackets="angle"/> subregistry of the "Real-Time Transport Protocol (RTP) | |||
| <http://www.iana.org/assignments/rtp-parameters>.</t> | Parameters" registry as follows.</t> | |||
| </section> | ||||
| <section title="Acknowledgments"> | <dl spacing="compact"> | |||
| <t>Alex Eleftheriadis, Yuki Ito, Won Kap Jang, Sergio Garcia | <dt>Media Type:</dt><dd>video</dd> | |||
| Murillo, Roi Sasson, Timothy Terriberry, Emircan Uysaler, and | <dt>Subtype:</dt><dd>VP9</dd> | |||
| Thomas Volkert commented on the development of this document and | <dt>Clock Rate (Hz):</dt><dd>90000</dd> | |||
| provided helpful comments and feedback.</t> | <dt>Reference:</dt><dd>RFC 9628</dd> | |||
| </dl> | ||||
| </section> | </section> | |||
| </middle> | </middle> | |||
| <back> | <back> | |||
| <references title='Normative References'> | <references> | |||
| <name>References</name> | ||||
| <reference anchor='VP9-BITSTREAM' target='https://storage.googleapis.co | <references> | |||
| m/downloads.webmproject.org/docs/vp9/vp9-bitstream-specification-v0.6-20160331-d | <name>Normative References</name> | |||
| raft.pdf'> | ||||
| <front> | ||||
| <title>VP9 Bitstream & Decoding Process Specification</titl | ||||
| e> | ||||
| <author initials='A' surname='Grange' fullname='Adrian Grange'> | <reference anchor="VP9-BITSTREAM" target="https://storage.googleapis.com | |||
| <organization>Google</organization> | /downloads.webmproject.org/docs/vp9/vp9-bitstream-specification-v0.6-20160331-dr | |||
| </author> | aft.pdf"> | |||
| <author initials='P' surname='de Rivaz' fullname='Peter de Riva | <front> | |||
| z'> | <title>VP9 Bitstream & Decoding Process Specification</title> | |||
| <organization>Argon Design</organization> | <author initials="A" surname="Grange" fullname="Adrian Grange"> | |||
| </author> | <organization>Google</organization> | |||
| <author initials='J' surname='Hunt' fullname='Jonathan Hunt'> | </author> | |||
| <organization>Argon Design</organization> | <author initials="P" surname="de Rivaz" fullname="Peter de Rivaz"> | |||
| </author> | <organization>Argon Design</organization> | |||
| <date month='March' day='31' year='2016' /> | </author> | |||
| <abstract> | <author initials="J" surname="Hunt" fullname="Jonathan Hunt"> | |||
| <t> | <organization>Argon Design</organization> | |||
| </author> | ||||
| <date month="March" day="31" year="2016"/> | ||||
| <abstract> | ||||
| <t> | ||||
| This document defines the bitstream format and decoding process for the | This document defines the bitstream format and decoding process for the | |||
| Google VP9 video codec. | Google VP9 video codec. | |||
| </t> | </t> | |||
| </abstract> | </abstract> | |||
| </front> | ||||
| </front> | <seriesInfo name="Version" value="0.6"/> | |||
| <seriesInfo name='Version' value='0.6' /> | </reference> | |||
| </reference> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
| FC.2119.xml"/> | ||||
| &rfc2119; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
| FC.8174.xml"/> | ||||
| &rfc8174; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
| FC.4585.xml"/> | ||||
| &rfc4585; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
| FC.3550.xml"/> | ||||
| &rfc3550; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
| FC.8866.xml"/> | ||||
| &rfc8866; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
| FC.6838.xml"/> | ||||
| &rfc6838; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
| FC.4855.xml"/> | ||||
| &rfc4855; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
| FC.5104.xml"/> | ||||
| &rfc5104; | ||||
| &lrr; | <!-- [I-D.ietf-avtext-lrr] companion document RFC 9627 --> | |||
| <reference anchor="RFC9627" target="https://www.rfc-editor.org/info/rfc9627"> | ||||
| <front> | ||||
| <title>The Layer Refresh Request (LRR) RTCP Feedback Message</title> | ||||
| <author initials="J." surname="Lennox" fullname="Jonathan Lennox"> | ||||
| <organization>Vidyo, Inc.</organization> | ||||
| </author> | ||||
| <author initials="D." surname="Hong" fullname="Danny Hong"> | ||||
| <organization>Vidyo, Inc.</organization> | ||||
| </author> | ||||
| <author initials="J." surname="Uberti" fullname="Justin Uberti"> | ||||
| <organization>Google, Inc.</organization> | ||||
| </author> | ||||
| <author initials="S." surname="Holmer" fullname="Stefan Holmer"> | ||||
| <organization>Google, Inc.</organization> | ||||
| </author> | ||||
| <author initials="M." surname="Flodman" fullname="Magnus Flodman"> | ||||
| <organization>Google, Inc.</organization> | ||||
| </author> | ||||
| <date month="August" year="2024" /> | ||||
| </front> | ||||
| <seriesInfo name="RFC" value="9627" /> | ||||
| <seriesInfo name="DOI" value="10.17487/RFC9627"/> | ||||
| &rfc3264; | </reference> | |||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.3264.xml"/> | ||||
| </references> | ||||
| <references> | ||||
| <name>Informative References</name> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.3551.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.5124.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.6386.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.7201.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.7202.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.7667.xml"/> | ||||
| <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
| FC.3711.xml"/> | ||||
| </references> | ||||
| </references> | </references> | |||
| <section numbered="false" toc="default"> | ||||
| <references title='Informative References'> | <name>Acknowledgments</name> | |||
| <t><contact fullname="Alex Eleftheriadis"/>, <contact fullname="Yuki | ||||
| &rfc3551; | Ito"/>, <contact fullname="Won Kap Jang"/>, <contact fullname="Sergio | |||
| Garcia Murillo"/>, <contact fullname="Roi | |||
| &rfc5124; | Sasson"/>, <contact fullname="Timothy Terriberry"/>, <contact | |||
| fullname="Emircan Uysaler"/>, and <contact fullname="Thomas Volkert"/> | ||||
| &rfc6386; | commented on the development of this document and provided helpful | |||
| feedback.</t> | ||||
| &rfc7201; | </section> | |||
| &rfc7202; | ||||
| &rfc7667; | ||||
| &rfc3711; | ||||
| </references> | ||||
| </back> | </back> | |||
| </rfc> | </rfc> | |||
| <!-- LocalWords: PictureID DCT Hadamard WHT SSRC CSRC pyld hdr FI VER RPSI | ||||
| --> | ||||
| <!-- LocalWords: stPartitionSize SDP AVPF SRTP IANA PID PICIDX TID | ||||
| --> | ||||
| End of changes. 161 change blocks. | ||||
| 979 lines changed or deleted | 1034 lines changed or added | |||
This html diff was produced by rfcdiff 1.48. | ||||