0

I am trying to create unique customer groups which are determined by customer interactivity across transactions.

Here is an example of the data:

Transaction # Primary Customer Cosigner WANT: Customer Group
1 1 2 A
2 1 3 A
3 1 4 A
4 1 2 A
5 2 5 A
6 3 6 A
7 2 1 A
8 3 1 A
9 7 8 B
10 9 C

In this example, customer 1 is connected to customers 2-6 either directly or indirectly, so all transactions associated with customers 1-6 would be a part of an "A" group. Customers 7 and 8 are directly connected and would be labeled as a "B" group. Customer 9 has no connections and is the single member of the "C" group.

Any suggestions are appreciated!

2 Answers 2

4

Your data can be considered the edges of a graph. So your request is to find the connected subgraphs of that graph. That question has an answer on Stackoverflow and SAS Communities. But this question is more on topic than that older SO question. So let's post the subnet SAS macro from the SAS Communities answer here on SO where it will be easier to find.

This simple macro uses repeated PROC SQL queries to build the list of connected subgraphs until all of the original records have been assigned to a subgraph.

The macro is set up to let you pass in the name of the source dataset and the names of the two variables that hold the ids of the nodes.

So first let's convert your printout into an actual SAS dataset.

/* Sample transactions: one row per transaction with its id, the primary  */
/* customer, the cosigner (missing when there is none) and the group      */
/* label that the question expects for that transaction.                  */
data have;
  length id primary cosign 8 want $ 8;
  input id primary cosign want;
datalines;
1 1 2 A
2 1 3 A
3 1 4 A
4 1 2 A
5 2 5 A
6 3 6 A
7 2 1 A
8 3 1 A
9 7 8 B
10 9 . C
;

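Now we can call the macro and tell it that PRIMARY and COSIGN are the variables with the node ids and that SUBNET is the name for the new variable to hold the ids of the connected subgraphs. NOTE: This version treats the graph as directed by default, meaning links are followed only from the FROM variable to the TO variable; pass DIRECTED=0 to treat the graph as undirected so that a link is followed in both directions.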

/* DIRECTED=0 makes the macro follow each link in both directions, which  */
/* is what grouping customers by any shared transaction requires.         */
%subnet(in=have,out=want,from=primary,to=cosign,subnet=subnet,directed=0);

Results:

Obs    id    primary    cosign    want    subnet

  1     1       1          2        A        1
  2     2       1          3        A        1
  3     3       1          4        A        1
  4     4       1          2        A        1
  5     5       2          5        A        1
  6     6       3          6        A        1
  7     7       2          1        A        1
  8     8       3          1        A        1
  9     9       7          8        B        2
 10    10       9          .        C        3

Here is the code of the %SUBNET() macro.

%macro subnet(in=,out=,from=from,to=to,subnet=subnet,directed=1);
/*----------------------------------------------------------------------
SUBNET - Build connected subnets from pairs of nodes.

Parameters:
  IN=       Input table with one FROM/TO pair of node ids per row.
  OUT=      Output table: rows of the input table with the subnet
            variable added.
  FROM=     Name of the variable holding the first node id.
  TO=       Name of the variable holding the second node id.
  SUBNET=   Name of the variable added to the output table. Subnets are
            numbered 1, 2, 3, ... in the order they are built.
  DIRECTED= 1 (default) follows links only from FROM to TO. Any other
            value also follows links from TO back to FROM, which treats
            the graph as undirected.

Work Tables (not dropped by this macro):
  NODES - List of all nodes in input, with the subnet assigned so far.
  NEW   - List of new nodes to assign to current subnet.

Algorithm:
Pick next unassigned node and grow the subnet by adding all connected
nodes. Repeat until all unassigned nodes are put into a subnet.

Notes:
- The node id is substituted unquoted into WHERE NODE=value, so the
  macro is written for numeric node ids.
- The output table is built by matching on the FROM variable only, so
  input rows whose FROM value is missing are not written to it.
----------------------------------------------------------------------*/
%local subnetid next getnext ;
/*----------------------------------------------------------------------
Put code to get next unassigned node into a macro variable. This query
is used in two places in the program.
FORMAT=BEST32. is required: without it PROC SQL stores the numeric value
with BEST8., so an id wider than 8 digits would be rounded, the UPDATE
below would match no row, and the outer loop would never finish.
----------------------------------------------------------------------*/
%let getnext= select node format=best32. into :next from nodes where subnet=.;
/*----------------------------------------------------------------------
Initialize subnet id counter.
----------------------------------------------------------------------*/
%let subnetid=0;
proc sql noprint;
/*----------------------------------------------------------------------
Get list of all nodes. UNION removes duplicates, so each node id appears
once; missing ids are excluded.
----------------------------------------------------------------------*/
  create table nodes as
    select . as subnet, &from as node from &in where &from is not null
    union
    select . as subnet, &to as node from &in where &to is not null
  ;
/*----------------------------------------------------------------------
Get next unassigned node. SQLOBS is zero when none is left.
----------------------------------------------------------------------*/
  &getnext;
%do %while (&sqlobs) ;
/*----------------------------------------------------------------------
Set subnet to next id and seed it with the node just selected.
----------------------------------------------------------------------*/
  %let subnetid=%eval(&subnetid+1);
  update nodes set subnet=&subnetid where node=&next;
/*----------------------------------------------------------------------
Keep growing while the previous UPDATE assigned at least one node.
----------------------------------------------------------------------*/
  %do %while (&sqlobs) ;
/*----------------------------------------------------------------------
Get list of connected nodes for this subnet: unassigned nodes that are
the TO end of a row whose FROM end is already in the subnet.
----------------------------------------------------------------------*/
    create table new as
      select distinct a.&to as node
        from &in a, nodes b, nodes c
        where a.&from= b.node
          and a.&to= c.node
          and b.subnet = &subnetid
          and c.subnet = .
    ;
%if "&directed" ne "1" %then %do;
/*----------------------------------------------------------------------
Undirected: also take unassigned nodes that are the FROM end of a row
whose TO end is already in the subnet.
----------------------------------------------------------------------*/
    insert into new 
      select distinct a.&from as node
        from &in a, nodes b, nodes c
        where a.&to= b.node
          and a.&from= c.node
          and b.subnet = &subnetid
          and c.subnet = .
    ;
%end;
/*----------------------------------------------------------------------
Update subnet for these nodes.
----------------------------------------------------------------------*/
    update nodes set subnet=&subnetid
      where node in (select node from new )
    ;
  %end;
/*----------------------------------------------------------------------
Get next unassigned node.
----------------------------------------------------------------------*/
  &getnext;
%end;
/*----------------------------------------------------------------------
Create output dataset by adding subnet number.
----------------------------------------------------------------------*/
  create table &out as
    select distinct a.*,b.subnet as &subnet
      from &in a , nodes b
      where a.&from = b.node
  ;
quit;
%mend subnet ;
Sign up to request clarification or add additional context in comments.

1 Comment

This is excellent, thank you so much! I wasn't familiar with the technical terms for graph data which made it difficult for me to find similar questions. It was helpful to have the additional logic for the undirected graph as I certainly need to pick up those tangential relationships.
0

You can use Hashes to compute your group identities and their members:

Example:

PROC DS2 is used because it allows a succinct hash declaration and clear code. The final pair Q H bridges two groups that were independent up to that linkage point, so the two groups have to be merged.

/* Sample edge list: each pair of tokens on a data line is one row       */
/* (id1, id2). The double trailing @@ keeps the line held so that        */
/* several pairs can be read from the same line; a lone period is read   */
/* as a missing id.                                                      */
data customer;
  length id1 id2 $ 8;
  input id1 $ id2 $ @@;
cards;
A B  A C  B A  B D  C A  C D  D C  D .
E F  E .  F E  F .
H J  H K  K L  K M
P Q  Q R  R S  S T
Q H
;
run;

/* Remove the VS and GS tables left by an earlier run. These are the     */
/* table names written by the PROC DS2 step later in this program.       */
/* Each table is tested on its own, so one is still removed when the     */
/* other is absent, and PROC DELETE is never pointed at a missing table. */
%if %sysfunc(exist(vs)) %then %do;
  proc delete data=vs;
  run;
%end;
%if %sysfunc(exist(gs)) %then %do;
  proc delete data=gs;
  run;
%end;

/* Stop echoing source statements to the log from here on. */
options nosource;

/*----------------------------------------------------------------------
Assign a group number to every vertex in CUSTOMER, where each row
(id1, id2) with both ids present links two vertices.

Two hash packages are kept:
  vs - keyed by vertex v, data (v, g): the group each vertex belongs to.
  gs - keyed by group g, data (g, v), declared 'multidata' so one group
       key can carry many vertex rows: the members of each group.

Each row is dispatched on whether id1 and id2 are already in vs. At the
end vs is written to table VS and gs to table GS.

Rows where either id is missing are skipped, so a vertex that only ever
appears with a missing partner is not assigned to any group.
----------------------------------------------------------------------*/
proc ds2 ;
  data _null_ ;
    declare char(8) v;
    /* g and v are the hash data variables, so every find() overwrites   */
    /* them. gmax is a separate counter of groups created so far: it is  */
    /* never touched by a lookup, so a new group cannot reuse the id of  */
    /* an existing one.                                                  */
    declare double g gnew gmax;

    declare package hash vs([v], [v g], 0, '', 'ascending');
    declare package hash gs([g], [g v], 0, '', 'ascending', '', '', 'multidata');

    method add11(char(8) x1, char(8) x2); /* neither vertex has been seen before */
      gmax + 1;            /* next unused group id */
      g = gmax;
      v = x1; vs.add();  gs.add();
      /* a row that pairs a vertex with itself must be added only once */
      if x2 ^= x1 then do;
        v = x2; vs.add();  gs.add();
      end;
/*    put 'add11' x1 $char1. x2 $char1. ' ' g; */
    end;

    method add10(char(8) x1, char(8) x2); /* x1 is not in a group, x2 is */
      v = x2; vs.find();  /* get group of x2 */
      v = x1; vs.add();   /* apply that group to x1 */
      gs.add();
/*    put 'add10' x1 $char1. x2 $char1. ' ' g; */
    end;

    method add01(char(8) x1, char(8) x2); /* x1 is in a group, x2 is not */
      v = x1; vs.find();  /* get group of x1 */
      v = x2; vs.add();   /* apply that group to x2 */
      gs.add();
/*    put 'add01' x1 $char1. x2 $char1. ' ' g; */
    end;

    method add00(char(8) x1, char(8) x2); /* both x1 and x2 are in a group */
      declare double g1 g2;

      v = x1; vs.find(); g1 = g; /* get group of x1 */
      v = x2; vs.find(); g2 = g; /* get group of x2 */

      if g1 ^= g2 then do;
        /* merge groups, v of higher group moved to lower group */
        gnew = min(g1,g2);
        g    = max(g1,g2);

        /* walk every member row of the higher group and repoint it in vs */
        gs.find();
        vs.replace([v], [v gnew]);

        do while (gs.has_next() = 0);
          gs.find_next();
          vs.replace([v], [v gnew]);
        end;

        /* drop the member rows of the higher group from gs */
        gs.removeall();
        /* NOTE(review): the moved vertices are not added to gs under    */
        /* gnew here, so the GS output appears to omit them after a      */
        /* merge while VS carries the merged group - confirm and re-add  */
        /* if GS is meant to list every member.                          */
      end;

/*    put 'add00' x1 $char1. x2 $char1. ' ' g g1 g2; */
    end;

    method run();
      declare int e1 e2;

      set customer;
      /* only rows with both ids present describe a link */
      if not missing(id1) and not missing(id2);

      /* a non-zero check() result is treated as: vertex not yet in vs */
      e1 = vs.check([id1]);
      e2 = vs.check([id2]);

      select (cats(e1^=0,e2^=0));
        when ('11') add11(id1,id2);
        when ('10') add10(id1,id2);
        when ('01') add01(id1,id2);
        when ('00') add00(id1,id2);
        otherwise stop;
      end;
    end;

    method term();
      vs.output('vs');
      gs.output('gs');
    end;
  run;
quit;

enter image description here

enter image description here

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.