Page History

...

Column	Type
`<container>_id`	serial primary key
`publication_id`	integer not null
`modified_load_id`	integer not null
`status`	character(1) (v=valid XML, i=invalid XML)
`content`	XML

The XML is always in the content (PostgreSQL type: XML) column.

...

select content
from xml.t_publication_reference
where publication_id=xml.f_ucid2id('US-5551212-A');
--------
<publication-reference fvid="71329085" ucid="US-5551212-A">
   <document-id>
    <country>US</country>
    <doc-number>5551212</doc-number>
    <kind>A</kind>
    <date>19960903</date>
    <lang>EN</lang>
  </document-id>
</publication-reference>

...

Info
The invaluable utility

...

function xml.f_ucid2id

...

takes a ucid

...

and returns

...

a publication_id.

...

Extracting Pieces of the XML Content

...

Function

Example

string

xml.f_textContent(xpath::text,xml)

Extract text of node:

select xml.f_textContent('./document-id/lang', content)
from xml.t_publication_reference
where publication_id=xml.f_ucid2id('US-5551212-A');
----
EN

string

xml_f_getAttribute(att::text,xml)

Extract single attribute value:

select xml.f_getAttribute('ucid', content)
from xml.t_publication_reference
where publication_id=xml.f_ucid2id('US-5551212-A');
----
US-5551212-A

boolean

xml.f_nodeExists(xpath::text,xml)

Test presence of a particular node:

select xml.f_nodeExists('//date', content)
from xml.t_publication_reference
where publication_id=xml.f_ucid2id('US-5551212-A');
----
t

XML

xml.f_findNode(xpath::text,xml)

Select a node (type XML) satisfying the given xpath expression:

select xml.f_findNode('./main-classification', content)
from xml.t_classification_national
where publication_id=xml.f_ucid2id('US-5551212-A');
----
 <main-classification mxw-id="PCL893513162" load-source="mcf">053397</main-classification>

XML[]

xml.f_findNodes(xpath::text,xml)

Select an array of nodes satisfying the given xpath expression:

select xml.f_findNodes('./classification-ipcr', content)
from xml.t_classifications_ipcr
where publication_id=xml.f_ucid2id('US-5551212-A');
----
{
  "<classification-ipcr ...",
  "<classification-ipcr ...",
  "<classification-ipcr ..."
}

XML[]

xml.f_getElementsByTagName(name::text,xml))

Select an array of nodes based on element name:

select xml.f_getElementsByTagName('classification-ipcr', content)
from xml.t_classifications_ipcr
where publication_id=xml.f_ucid2id('US-5551212-A');
----
{
  "<classification-ipcr ...",
  "<classification-ipcr ...",
  "<classification-ipcr ..."
}

string

xml.f_nodeName(xml)

Select the name of the content node:

select xml.f_nodeName( content )
from xml.t_invention_title
where publication_id=xml.f_ucid2id('US-5551212-A');
-----------------
 invention-title

XML[]

xml.f_childNodes(xml)

Select all child nodes:

select xml.f_childNodes( content )
from xml.t_application_reference
where publication_id=xml.f_ucid2id('US-5551212-A');
-----------------------------------------------------------------------------------
 <document-id mxw-id="PAPP60364944" load-source="docdb" format="epo">
   <country>US</country>
   <doc-number>2582893</doc-number>
   <kind>A</kind>
   <date>19930303</date>
   <lang>EN</lang>
 </document-id>
 <document-id mxw-id="PAPP84571955" load-source="patent-office" format="original">
   <country>US</country>
   <doc-number>08025828</doc-number>
   <date>19930303</date>
   <lang>EN</lang>
 </document-id>

string

xml.f_node2string(xml,indent)

Concatenate all child text nodes:

select xml.f_node2string( content::text, 1 )
from xml.t_abstract
where publication_id=xml.f_ucid2id('US-5551212-A');
-----------------------------------------------------------------------------------
 A package is formed by holding a nonextensible strip taut and wrapping it around the upper and/or lower periphery of the bundle and joining the ends of the strip segments e.g. by adhesive bonding. A projecting margin of the strip is folded to overlie the bundle or underlie the bundle so that an L-configuration is imparted to the frame formed around the bundle after the strip is applied thereto.

...

Now we can create a function that returns a TABLE of these properties. We'll use publication_id as input to the function and create it in a private schema (mySchema).

create or replace function mySchema.f_rdb_citations( integer )
  returns table (
    publication_id integer,
    ref_publication_id integer,
    ref_ucid varchar(64),
    published date,
    country varchar(2),
    kind varchar(2),
    load_source varchar(32),
    format varchar(32),
    source varchar(32)
  ) as
$BODY$
declare v_content xml;
        v_node xml;
        v_doc_node xml;
begin

-- example usage:
-- select * from mySchema.f_rdb_citations( xml.f_ucid2id('US-9240001-B2') );
  select x.publication_id, x.content into publication_id, v_content
    from xml.t_citations as x where x.publication_id=$1;
  if not found then
    return;
  end if;

  -- loop through all patent citations
  for each v_node in array
    ( select xml.f_findNodes('//patcit', v_content) ) loop
    select xml.f_getAttribute( 'ucid', v_node ) into ref_ucid;

    select x.publication_id into ref_publication_id
      from xml.t_patent_document_values as x
      where ucid=xml.f_getAttribute( 'ucid', v_node );

    select xml.f_getAttribute( 'load-source', v_node ) into load_source;
    select xml.f_getAttribute('name', xml.f_findNode('./sources/source', v_node ) ) into source;

    -- loop through all forms of the document-id(s)
    for each v_doc_node in array
      ( select xml.f_findNodes('./document-id', v_node) ) loop
      select xml.f_getAttribute( 'format', v_doc_node ) into format;
      select xml.f_textContent( './country', v_doc_node ) into country;
      select xml.f_textContent( './kind', v_doc_node ) into kind;
      select xml.f_textContent( './date', v_doc_node ) into published;
      return next;
    end loop;
  end loop;
end;
$BODY$
  LANGUAGE plpgsql VOLATILE
  COST 100
  ROWS 1000;

...

Now, we test the results:

select * from mySchema.f_rdb_citations( xml.f_ucid2id('US-9240001-B2') ) limit 5;
publication_id | ref_publication_id |     ref_ucid      | published | country | kind | load_source | format | source
----------------+--------------------+-------------------+------------+---------+------+-------------+--------+--------
      160817649 |            5778519 | CA-2580978-A1     | 2006-03-30 | CA      | A1   | docdb       | epo    | APP
      160817649 |          127402642 | CN-201918032-U    | 2011-08-03 | CN      | U    | docdb       | epo    | APP
      160817649 |           75847915 | US-20020150866-A1 | 2002-10-17 | US      | A1   | docdb       | epo    | APP
      160817649 |           75947470 | US-20030060942-A1 | 2003-03-27 | US      | A1   | docdb       | epo    | APP
      160817649 |           76046244 | US-20030154010-A1 | 2003-08-14 | US      | A1   | docdb       | epo    | APP

...

Of course, GROUP BY and ORDER BY are available on the returned columns. In addition

...

to SELECTing citations for a single document, we can also JOIN to table expressions, e.g., assume we are interested in the top 5 cited countries from US grant publication date 2016-05-03:

...

select count(*) as n, t2.country as ref_country
from xml.t_patent_document_values t1
  inner join mySchema.f_rdb_citations (t1.publication_id) as t2
    on (t1.publication_id=t2.publication_id)
where t1.country='US'
  and t1.published='2016-05-03'
group by t2.country
order by n desc
limit 5;
   n    | ref_country
--------+---------
221800 | US
  14285 | WO
  13313 | JP
   7115 | EP
   2554 | CN

Blog

Versions Compared

Old Version 2

New Version 3

Key

Extracting Pieces of the XML Content