PDBx/mmCIF Dictionary Resources

About This Example:

A more involved and extensive example that uses the PDBX parsing Python library to generate a CIF file for each biological assembly listed in the pdbx_struct_assembly category of a CIF file. This example synthesizes information located in the pdbx_struct_assembly_gen, pdbx_struct_oper_list, and atom_site categories to accomplish this task. For every assembly in pdbx_struct_assembly_gen, the program retrieves the operation expression and applies the operations it specifies (which are stored in pdbx_struct_oper_list) to certain coordinates.

Build Instructions:

Files: Assemblies.py, 2BUK.cif

Save Assemblies.py and the CIF data file. Run python Assemblies.py /path/to/file.cif, which generates a /path/to/2BUK-ASSEMBLY.cif file for each assembly listed in pdbx_struct_assembly

Methods to Note

from pdbx.reader.PdbxContainers import ContainerBase
from pdbx.reader.PdbxContainers import DataCategory

getObj(self, name) Returns the DataCategory object specified by name.
getAttributeList(self) Returns a list of attribute/data item names.
getRowCount(self) Returns the number of rows in the category table.
getValue(self, attributeName=None, rowIndex=None) Returns the value of the attribute attributeName at row index rowIndex.
getFullRow(self, index) Attempts to fetch the full row at index index and returns an empty list if it fails.
setValue(self, value, attributeName=None, rowIndex=None) Sets a value for the attribute specified by attributeName at the row index rowIndex.

Assemblies for 2BUK.cif

Rendered via Chimera

2BUK-1.cif - Complete Icosahedral Assembly

2BUK-2.cif - Icosahedral Asymmetric Unit

2BUK-3.cif - Icosahedral Pentamer

2BUK-4.cif - Icosahedral 23 Hexamer

2BUK-PAU.cif - Icosahedral Asymmetric Unit, Std Point Frame

2BUK-XAU.cif - Crystal Asymmetric Unit, Crystal Frame

Example Source Code

/*************************
 * Assemblies.py
 *
 * For some CIF file, generate a complete CIF file for every assembly
 * listed in the pdbx_struct_assembly category table by performing
 * rotation and translation matrix operations and creating a new atom_site
 * category table for each assembly.
 *
 * Lines with superscriptions contain footnoted references or explanations.
 *************************/

import copy
from os.path import splitext
from pdbx.reader.PdbxReader import PdbxReader
from pdbx.writer.PdbxWriter import PdbxWriter
from pdbx.reader.PdbxContainers import *
from sys import argv

def parseOperationExpression(expression) :
    operations = []
    stops = [ "," , "-" , ")" ]

    currentOp = ""
    i = 1
	
    # Iterate over the operation expression
    while i in range(1, len(expression) - 1):
        pos = i

        # Read an operation
        while expression[pos] not in stops and pos < len(expression) - 1 : 
            pos += 1    
        currentOp = expression[i : pos]

        # Handle single operations
        if expression[pos] != "-" :
            operations.append(currentOp)
            i = pos

        # Handle ranges
        if expression[pos] == "-" :
            pos += 1
            i = pos
			
            # Read in the range's end value
            while expression[pos] not in stops :
                pos += 1
            end = int(expression[i : pos])
			
            # Add all the operations in [currentOp, end]
            for val in range((int(currentOp)), end + 1) :
                operations.append(str(val))
            i = pos
        i += 1
    return operations

def prepareOperation(oper_list, op1index, op2index) :
    # Prepare matrices for operations 1 & 2
    op1 = [[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 1]]
    op2 = [[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 1]]
	
    # Fill the operation matrices for operations 1 & 2
    for i in range(3) :
        op1[i][3] = float(oper_list.getValue("vector[" + str(i + 1) + "]", op1index))
		
        if (op2index != -1) :
            op2[i][3] = float(oper_list.getValue("vector[" + str(i + 1) + "]", op2index))
        for j in range(3) :
            op1[i][j] = float(oper_list.getValue("matrix[" + str(i + 1) + "][" + str(j + 1) + "]", op1index))
            if (op2index != -1) :
                op2[i][j] = float(oper_list.getValue("matrix[" + str(i + 1) + "][" + str(j + 1) + "]", op2index))
    
    # Handles non-Cartesian product expressions
    if (op2index == -1) :
        return op1

    operation = [[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 0],[0, 0, 0, 1]]

    # Handles Cartesian product expressions (4x4 matrix multiplication)
    sum = 0.0
    for row in range(4) :
        for col in range(4) :
            sum = 0.0
            for r in range(4) :
                sum += (op1[row][r] * op2[r][col])
            operation[row][col] = sum
    return operation

# Open the CIF file
cif = open(argv[1])

root, ext = splitext(argv[1])

# A list to be propagated with data blocks
data = []

# Create a PdbxReader object with the CIF file handle
pRd = PdbxReader(cif)

# Read the CIF file, propagating the data list
pRd.read(data)

# Close the CIF file, as it is no longer needed
cif.close()

# Retrieve the first data block
block = data[0]

# Retrieve the atom_site category table, which delineates constituent atoms¹
atom_site = block.getObj("atom_site")

# Make a reference copy of the atom_site category table
atom_site_ref = copy.copy(atom_site)

# Retrieve the pdbx_struct_assembly_gen category table, which details the generation of each macromolecular assembly²
assembly_gen = block.getObj("pdbx_struct_assembly_gen")

# Retrieve the pdbx_struct_oper_list category table, which details translation and rotation 
# operations required to generate/transform assembly coordinates³
oper_list = block.getObj("pdbx_struct_oper_list")

attributes = atom_site_ref.getAttributeList()

# Create a CIF file for every assembly specified in pdbx_struct_assembly_gen
for index in range(assembly_gen.getRowCount()) :

    # Create a new atom_site category table for this assembly
    atom_site = DataCategory("atom_site", attributes)
	
    # Lists to hold the individual operations
    oper = []
    oper2 = []

    # Keep track of the current atom and model number
    atomNum = 1
    modelNum = 0

    # Retrieve the assembly_id attribute value for this assembly
    assemblyId = assembly_gen.getValue("assembly_id", index)

    # Retrieve the operation expression for this assembly from the oper_expression attribute	
    oper_expression = assembly_gen.getValue("oper_expression", index)

    # Count the number of left parentheses in the operation expression
    parenCount = oper_expression.count("(")

    # Handles one operation assemblies (e.g., "1")
    if parenCount == 0 : oper.append(oper_expression)
	
    # Handles multiple operation assemblies, no Cartesian products (e.g., "(1-5)")
    if parenCount == 1 : oper.extend(parseOperationExpression(oper_expression))
	
    # Handles Cartesian product expressions (e.g., "(X0)(1-60)")
    if parenCount == 2 :
        # Break the expression into two parenthesized expressions and parse them
        temp = oper_expression.find(")")
        oper.extend(parseOperationExpression(oper_expression[0:temp+1]))
        oper2.extend(parseOperationExpression(oper_expression[temp+1:]))

    # Retrieve the asym_id_list, which indicates which atoms to apply the operations to
    asym_id_list = assembly_gen.getValue("asym_id_list", index)

    temp = (1 > len(oper2)) and 1 or len(oper2)

    # For every operation in the first parenthesized list
    for op1 in oper :
        # Find the index of the current operation in the oper_list category table
        op1index = 0
        for row in range(oper_list.getRowCount()) : 
            if oper_list.getValue("id", row) == op1 : 
                op1index = row
                break

        # For every operation in the second parenthesized list (if there is one)
        for i in range(temp) :		
            # Find the index of the second operation in the oper_list category table
            op2index = -1
            if (oper2) :
                for row in range(oper_list.getRowCount()) :
                    if oper_list.getValue("id", row) == oper2[i] :
                        op2index = row
                        break

            # Prepare the operation matrix
            operation = prepareOperation(oper_list, op1index, op2index)

            # Iterate over every atom in the atom_site reference table
            for r in range(atom_site_ref.getRowCount()) :
                
                # If the asym_id of the current atom is not in the asym_id list, skip to the next atom
                if (asym_id_list.find(atom_site_ref.getValue("label_asym_id", r)) == -1) :
                    continue
                
                # Retrieve the atom's row from the atom_site reference table
                atom = atom_site_ref.getFullRow(r)

                # Add this row to the atom_site table for this assembly
                for s in range(len(attributes)) :
                    atom_site.setValue(atom[s], attributes[s], atomNum - 1)

                # Update the atom number and model number for this row
                atom_site.setValue(str(atomNum), "id", atomNum - 1)
                atom_site.setValue(str(modelNum), "pdbx_PDB_model_num", atomNum - 1) 

                # Determine and set the new coordinates
                coords = [float(atom[10]), float(atom[11]), float(atom[12]), 1.0]
                sum = 0.0
                xyz = ['x', 'y', 'z']
                for a in range(3) :
                    sum = 0.0
                    for b in range(4) :
                        sum += (operation[a][b] * coords[b])
                    atom_site.setValue("%.3f" % sum, "Cartn_" + xyz[a], atomNum - 1)
                atomNum += 1
            modelNum += 1

    # Write the CIF out file
    block.replace(atom_site)
    out = open(root + "-" + assemblyId + ".cif", "w")
    pWt = PdbxWriter(out)
    pWt.writeContainer(block)
    out.close()

Python Example Assemblies.py